# Doublecheck v5.py
#
# From Admin Wiki
# Jump to navigation Jump to search
# The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
#!/usr/bin/env python
# Submission Similarity Checker (v5)
# V5: Added a bit of documentation and provenance.
# If reading on the ACL AdminWiki you may have to view the source material to see its proper formatting.
# Originally received by ACL 2017 PC co-chairs from ICML 2017 PC co-chairs Doina Precup and Yee Whye Teh
# Use the global variables immediately after the import to change the .csv files to be checked and to tune the similarity threshold to report for manual review.
# The CSV files should have the format "ID", "Title", "Abstract" and an optional "Author" column.

import argparse
import copy
import csv
import math
from operator import itemgetter

# Tunable parameters

# Default input file names.  NOTE(review): nothing in the code visible here
# reads ABSTR_1 / ABSTR_2 (the files to compare come from the command line);
# they are kept so that any external user of these names keeps working.
ABSTR_1 = 'IROS14.csv'  # First source file
ABSTR_2 = 'RSS14.csv'  # Second source file
# Default for the -t command-line option: a best match is reported only when
# its similarity score is strictly greater than this value.
THR = 0.4  # cosine similarity threshold

#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def distentry(one_entry, two_entry):
    """Return the dot product of two entries' sparse term-weight vectors.

    Each entry is a dict whose ``'bow'`` key maps a term to its numeric
    weight.  Only terms present in both bags contribute to the sum.  When
    both vectors have unit L2 norm the result is their cosine similarity.

    Args:
        one_entry: dict with a ``'bow'`` mapping (term -> weight).
        two_entry: dict with a ``'bow'`` mapping (term -> weight).

    Returns:
        The sum of ``weight_one * weight_two`` over the shared terms;
        ``0`` when the two bags have no term in common.
    """
    small = one_entry['bow']
    large = two_entry['bow']
    # The product is symmetric, so walk the shorter bag and probe the longer
    # one: fewer iterations, same set of shared terms.
    if len(small) > len(large):
        small, large = large, small

    ssum = 0
    for term, weight in small.items():
        if term in large:
            ssum += weight * large[term]
    return ssum


#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def do_tfidf(corpora):
    """Attach term counts and L2-normalised TF-IDF weights to every entry.

    Each entry of *corpora* is a dict with an ``'abstract'`` string.  The
    abstract is split on whitespace and the characters ``, . ) ( ; :`` are
    removed from every token (tokens are otherwise kept as-is, including
    their case).  Two keys are then added to each entry, in place:

    * ``'tf'``  -- term -> number of occurrences in that abstract;
    * ``'bow'`` -- term -> ``tf * idf``, scaled so the vector has unit
      L2 norm, where ``idf = log(N / (1 + df))``, ``N`` is the number of
      entries and ``df`` the number of entries containing the term.

    With this idf a term found in ``N - 1`` entries weighs 0 and a term
    found in every entry gets a negative weight.

    Args:
        corpora: list of dicts, each with an ``'abstract'`` string.

    Returns:
        The same *corpora* list (entries are modified in place).

    Raises:
        SystemExit: if a document frequency exceeds the number of entries
            (internal consistency check; ``'ASSERT FAILED!'`` is printed).
    """
    N = len(corpora)
    strip_chars = ',.)(;:'

    #~ per-entry term counts
    for c in corpora:
        tf = {}
        for w in c['abstract'].split():
            for ch in strip_chars:
                w = w.replace(ch, '')
            tf[w] = tf.get(w, 0) + 1
        c['tf'] = tf

    #~ document frequency: in how many entries does each term occur
    idf = {}
    for c in corpora:
        for w in c['tf']:
            idf[w] = idf.get(w, 0) + 1

    # A term cannot occur in more entries than exist; bail out if the
    # bookkeeping above is ever broken.
    if any(df > N for df in idf.values()):
        print('ASSERT FAILED!')
        raise SystemExit(1)

    for w in idf:
        idf[w] = math.log(float(N) / (1.0 + float(idf[w])))

    #~ TFIDF
    for c in corpora:
        bow = {}
        norm1 = 0
        for w, count in c['tf'].items():
            weight = count * idf[w]
            bow[w] = weight
            norm1 += weight * weight

        # Every weight is 0 when all of the entry's terms have idf == 0
        # (e.g. two entries with disjoint vocabulary).  Leave the zero
        # vector untouched instead of dividing by zero.
        if norm1 > 0:
            norm = math.sqrt(norm1)
            for w in bow:
                bow[w] /= norm

        c['bow'] = bow

    return corpora


#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def loadf_conf(fname):
    """Load submission records from a comma-separated file.

    A row is kept only when it has at least three columns and its first
    column consists solely of digits; everything else (header line, short
    or malformed rows) is skipped silently.

    Args:
        fname: path of the CSV file, with columns ID, Title, Abstract and
            an optional fourth Author column.

    Returns:
        A list of dicts with keys ``'id'`` (int), ``'title'``,
        ``'abstract'`` and ``'authors'`` (``''`` when the row has no
        fourth column), in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    info = []
    # newline='' lets the csv module do its own newline handling; the old
    # 'rU' mode is rejected by Python 3.11+.  The with-block guarantees the
    # handle is closed even if parsing fails.
    with open(fname, 'r', newline='') as f:
        for line in csv.reader(f, delimiter=","):
            if len(line) < 3 or not line[0].isdigit():
                continue
            info.append({
                'id': int(line[0]),
                'title': line[1],
                'abstract': line[2],
                # Any column past the fourth is ignored rather than
                # causing the author to be dropped.
                'authors': line[3] if len(line) >= 4 else '',
            })
    return info


#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


def _find_matches(corpora, split, thr, self_comparison):
    """Find, for each entry before *split*, its best match at or after it.

    Args:
        corpora: list of entries carrying a ``'bow'`` vector; the first
            *split* entries come from the second file, the rest from the
            first file.
        split: number of entries belonging to the second file.
        thr: a best match is kept only if its score is strictly greater.
        self_comparison: when true both halves hold the same file, so the
            pair made of an entry and its own copy is skipped.

    Returns:
        A list of dicts with keys ``'from_id'`` (index below *split*),
        ``'to_id'`` (index at or above *split*) and ``'to_value'`` (score).
    """
    wmatches = []
    for i in range(split):
        bestmatch = -1
        bestmatch_idx = -1
        # The other file's entries start at index `split` itself; starting
        # at split + 1 would never compare against its first entry.
        for j in range(split, len(corpora)):
            if self_comparison and i == j - split:
                continue
            d = distentry(corpora[i], corpora[j])
            if d > bestmatch:
                bestmatch = d
                bestmatch_idx = j

        if bestmatch > thr:
            wmatches.append({
                'from_id': i,
                'to_id': bestmatch_idx,
                'to_value': bestmatch,
            })
    return wmatches


#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def _print_report(swmatches, corpora, f1, f2):
    """Print id, title, authors, score and both abstracts for each match."""
    for wins in swmatches:
        print('====================================================')
        cid1 = wins['to_id']  # entry of the first file (f1)
        cid2 = wins['from_id']  # entry of the second file (f2)

        print(f1 + ' id ' + str(corpora[cid1]['id']) + ', Title: ' + corpora[cid1]['title'] + '\nAuthors: ' + corpora[cid1]['authors'])
        print(f2 + ' id ' + str(corpora[cid2]['id']) + ', Title: ' + corpora[cid2]['title'] + '\nAuthors: ' + corpora[cid2]['authors'])

        print('Cosine dist: ' + str(wins['to_value']))
        print()
        print('abstr ' + f1 + ": " + corpora[cid1]['abstract'])
        print('----')
        print('abstr ' + f2 + ": " + corpora[cid2]['abstract'])
        print()
        print()


def main():
    """Compare two submission CSV files and report likely duplicates.

    Command line: ``c1 c2 [-t THRESHOLD]``.  Both files are loaded, TF-IDF
    vectors are computed over the union of their entries, and for every
    entry of ``c2`` the most similar entry of ``c1`` is printed when its
    score exceeds the threshold, highest score first.  Passing the same
    path twice compares a file against itself.
    """
    parser = argparse.ArgumentParser(description='Doublesubmission checker 0.01')
    parser.add_argument("c1", help="CSV of conference1")
    parser.add_argument("c2", help="CSV of conference2")
    parser.add_argument("-t", help="detection threshold [default: %g]" % THR,
                        type=float, default=THR, required=False)
    args = vars(parser.parse_args())

    f1 = args['c1']
    f2 = args['c2']
    self_comparison = (f1 == f2)
    print('Self comparison: ' + str(self_comparison))
    thr = args['t']

    print('## Double submission checker v 0.02. L. Spinello, G.D. Tipaldi (slight modifications by W. Burgard) 2014')
    print('> Detection threshold set to ' + str(thr))

    print('> Load file ' + f1)
    iros_raw = loadf_conf(f1)
    print('entries: ' + str(len(iros_raw)))

    print('> Load file ' + f2)
    rss_raw = loadf_conf(f2)
    print('entries: ' + str(len(rss_raw)))
    rss_len = len(rss_raw)

    # Joint corpus: entries of the second file first, then the first file,
    # so that idf is computed over both conferences together.
    corpora_t = copy.deepcopy(rss_raw)
    corpora_t.extend(iros_raw)

    print('> Compute TFIDF')
    corpora = do_tfidf(corpora_t)

    wmatches = _find_matches(corpora, rss_len, thr, self_comparison)
    swmatches = sorted(wmatches, key=itemgetter('to_value'), reverse=True)
    _print_report(swmatches, corpora, f1, f2)


if __name__ == "__main__":
    main()