# Doublecheck v5.py
#!/usr/bin/env python
# Submission Similarity Checker (v5)
# V5: Added a bit of documentation and provenance.
# If reading on the ACL AdminWiki you may have to view the source material to see its proper formatting.
# Originally received by ACL 2017 PC co-chairs from ICML 2017 PC co-chairs Doina Precup and Yee Whye Teh.
# Use the global variables immediately after the imports to change the .csv files to be checked and to tune the similarity threshold to report for manual review.
# The CSV files should have the format "ID", "Title", "Abstract" and an optional "Author" column.
import argparse
import copy
import csv
import math
from operator import itemgetter
# Tunable parameters
# NOTE(review): ABSTR_1 / ABSTR_2 are not referenced by the visible script,
# which takes its two input files from the command line - confirm before
# relying on them.
ABSTR_1 = 'IROS14.csv'  # First source file
ABSTR_2 = 'RSS14.csv'  # Second source file
THR = 0.4  # cosine similarity threshold (default value of the -t option)
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def distentry(one_entry, two_entry):
    """Return the dot product of two entries' sparse ``'bow'`` vectors.

    Each entry is a dict whose ``'bow'`` value maps a term to its weight.
    Only terms present in both vectors contribute to the result; when no
    term is shared the result is ``0``.
    """
    other_weights = two_entry['bow']
    total = 0
    for term, weight in one_entry['bow'].items():
        if term in other_weights:
            total += weight * other_weights[term]
    return total
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def do_tfidf(corpora):
    """Attach L2-normalised TF-IDF vectors to every entry of ``corpora``.

    Each entry is a dict with an ``'abstract'`` string.  The abstract is
    split on whitespace and the characters ``, . ) ( ; :`` are removed from
    every token.  Two keys are then added to each entry in place:

    * ``'tf'``  - raw count of every token in the abstract;
    * ``'bow'`` - ``tf * idf`` per token, scaled to unit Euclidean length,
      where ``idf = log(N / (1 + df))``, ``N`` is the number of entries and
      ``df`` the number of entries containing the token.

    An entry whose TF-IDF weights are all zero keeps its all-zero vector
    instead of triggering a division by zero.

    Returns the same ``corpora`` list that was passed in.
    """
    n_docs = len(corpora)
    strip_punct = str.maketrans('', '', ',.)(;:')

    #~ term frequencies of every entry
    for entry in corpora:
        tf = {}
        for token in entry['abstract'].split():
            token = token.translate(strip_punct)
            tf[token] = tf.get(token, 0) + 1
        entry['tf'] = tf

    #~ document frequency: number of entries that contain each term
    doc_freq = {}
    for entry in corpora:
        for term in entry['tf']:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    # Sanity check kept from the original: a term cannot occur in more
    # entries than exist.
    for count in doc_freq.values():
        if count > n_docs:
            print('ASSERT FAILED!')
            raise SystemExit(1)

    idf = {
        term: math.log(float(n_docs) / (1.0 + float(count)))
        for term, count in doc_freq.items()
    }

    #~ TFIDF
    for entry in corpora:
        bow = {}
        squared_norm = 0
        for term, count in entry['tf'].items():
            weight = count * idf[term]
            bow[term] = weight
            squared_norm += weight * weight

        # idf is exactly 0 for a term found in N - 1 entries, so a vector
        # can be all zeros; normalising it would divide by zero.
        norm = math.sqrt(squared_norm)
        if norm > 0:
            for term in bow:
                bow[term] /= norm

        entry['bow'] = bow

    return corpora
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def loadf_conf(fname):
    """Load submissions from the comma-separated file ``fname``.

    Rows are kept only when they have at least three columns and the first
    column consists solely of digits; any other row (for example a header
    line) is skipped.

    Returns a list of dicts with the keys ``'id'`` (int), ``'title'``,
    ``'abstract'`` and ``'authors'``.  ``'authors'`` is the fourth column
    when the row has one and the empty string otherwise.
    """
    info = []
    # newline='' is what the csv module expects from its input file; it
    # replaces the 'rU' mode, which Python 3.11 no longer accepts.
    with open(fname, newline='') as f:
        for line in csv.reader(f, delimiter=","):
            if len(line) < 3 or not line[0].isdigit():
                continue
            info.append({
                'id': int(line[0]),
                'title': line[1],
                'abstract': line[2],
                # >= 4 so that extra trailing columns do not drop the author
                'authors': line[3] if len(line) >= 4 else '',
            })
    return info
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def _find_matches(corpora, split, thr, self_comparison):
    """Find, for each of the first ``split`` entries, its best match in the rest.

    ``corpora[:split]`` holds the entries of the second file and
    ``corpora[split:]`` those of the first file.  A match is recorded only
    when its similarity is strictly greater than ``thr``.  In a self
    comparison an entry is never compared with the entry at the same
    position in the other half.

    Returns a list of dicts with the keys ``'from_id'`` and ``'to_id'``
    (indices into ``corpora``) and ``'to_value'`` (the similarity).
    """
    matches = []
    for i in range(split):
        best_score = -1
        best_idx = -1
        # Start at ``split`` so the first entry of the other file is
        # compared too (the original range skipped it).
        for j in range(split, len(corpora)):
            if self_comparison and i == j - split:
                continue
            score = distentry(corpora[i], corpora[j])
            if score > best_score:
                best_score = score
                best_idx = j
        if best_score > thr:
            matches.append({
                'from_id': i,
                'to_id': best_idx,
                'to_value': best_score,
            })
    return matches


def _report_match(match, corpora, f1, f2):
    """Print ids, titles, authors, similarity and abstracts of one match."""
    first = corpora[match['to_id']]
    second = corpora[match['from_id']]
    print('====================================================')
    print(f1 + ' id ' + str(first['id']) + ', Title: ' + first['title']
          + '\nAuthors: ' + first['authors'])
    print(f2 + ' id ' + str(second['id']) + ', Title: ' + second['title']
          + '\nAuthors: ' + second['authors'])
    print('Cosine dist: ' + str(match['to_value']))
    print()
    print('abstr ' + f1 + ": " + first['abstract'])
    print('----')
    print('abstr ' + f2 + ": " + second['abstract'])
    print()
    print()


def main(argv=None):
    """Compare two submission CSV files and print the likely duplicates.

    ``argv`` is the list of command-line arguments; ``None`` makes argparse
    read ``sys.argv``.  Matches are printed in order of decreasing
    similarity.
    """
    parser = argparse.ArgumentParser(description='Doublesubmission checker 0.01')
    parser.add_argument("c1", help="CSV of conference1")
    parser.add_argument("c2", help="CSV of conference2")
    parser.add_argument("-t", type=float, default=THR, required=False,
                        help="detection threshold [default: %g]" % THR)
    args = parser.parse_args(argv)

    f1 = args.c1
    f2 = args.c2
    self_comparison = (f1 == f2)
    print('Self comparison: ' + str(self_comparison))
    thr = float(args.t)
    print('## Double submission checker v 0.02. L. Spinello, G.D. Tipaldi (slight modifications by W. Burgard) 2014')
    print('> Detection threshold set to ' + str(thr))

    print('> Load file ' + f1)
    iros_raw = loadf_conf(f1)
    print('entries: ' + str(len(iros_raw)))
    print('> Load file ' + f2)
    rss_raw = loadf_conf(f2)
    print('entries: ' + str(len(rss_raw)))
    rss_len = len(rss_raw)

    # One shared corpus so both files use the same IDF statistics:
    # entries of the second file first, then those of the first file.
    print('> Compute TFIDF')
    corpora = do_tfidf(rss_raw + iros_raw)

    wmatches = _find_matches(corpora, rss_len, thr, self_comparison)

    #~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    #~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    swmatches = sorted(wmatches, key=itemgetter('to_value'), reverse=True)
    for wins in swmatches:
        _report_match(wins, corpora, f1, f2)


if __name__ == '__main__':
    main()