# Doublecheck v5.py

#!/usr/bin/env python
# Submission Similarity Checker (v5)
# V5: Added a bit of documentation and provenance.
# If reading on the ACL AdminWiki, you may have to view the source material to see its proper formatting.
# Originally received by the ACL 2017 PC co-chairs from the ICML 2017 PC co-chairs, Doina Precup and Yee Whye Teh.
# Use the global variables immediately after the imports to change the .csv files to be checked and to tune the similarity threshold to report for manual review.
# The CSV files should have the format "ID", "Title", "Abstract" and an optional "Author" column.

import argparse
import copy
import csv
import math
from operator import itemgetter

# Tunable parameters
#
# THR is the default value of the ``-t`` command-line option.  ABSTR_1 and
# ABSTR_2 name example input files; the input files actually compared are
# taken from the positional command-line arguments.

ABSTR_1 = 'IROS14.csv'  # First source file
ABSTR_2 = 'RSS14.csv'   # Second source file
THR = 0.4               # Cosine similarity threshold

#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def distentry(one_entry, two_entry):
    """Return the dot product of two entries' sparse ``'bow'`` vectors.

    Args:
        one_entry: dict whose ``'bow'`` value maps term -> weight.
        two_entry: dict with the same layout.

    Returns:
        The sum, over every term present in both ``'bow'`` dicts, of the
        product of the two weights; ``0`` when no term is shared.  When the
        vectors have unit length (as produced by ``do_tfidf``) this is their
        cosine similarity.
    """
    first_bow = one_entry['bow']
    second_bow = two_entry['bow']

    ssum = 0
    for term in first_bow:
        # Terms missing from either vector contribute nothing.
        if term in second_bow:
            ssum += first_bow[term] * second_bow[term]
    return ssum

#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def do_tfidf(corpora):
    """Attach length-normalised TF-IDF vectors to every entry of *corpora*.

    Each entry is a dict with an ``'abstract'`` string.  Two keys are added
    to every entry in place:

    * ``'tf'``  -- term -> raw occurrence count in the abstract.  Terms are
      the whitespace-separated tokens of the abstract with the characters
      ``, . ) ( ; :`` removed; tokens that become empty are ignored.
    * ``'bow'`` -- term -> ``tf * idf`` weight, scaled so the vector has unit
      Euclidean length, where ``idf = log(N / (1 + df))``, ``N`` is the
      number of entries and ``df`` the number of entries containing the term.
      If every weight of an entry is zero the vector is left unscaled
      (all zeros) instead of dividing by zero.

    Args:
        corpora: list of entry dicts; modified in place.

    Returns:
        The same *corpora* list.
    """
    n_docs = len(corpora)
    punctuation = ',.)(;:'

    # Term frequencies per entry.
    for entry in corpora:
        tf = {}
        for word in entry['abstract'].split():
            for ch in punctuation:
                word = word.replace(ch, '')
            if not word:
                # A token made only of punctuation is not a term.
                continue
            tf[word] = tf.get(word, 0) + 1
        entry['tf'] = tf

    # Document frequencies: number of entries in which each term occurs.
    idf = {}
    for entry in corpora:
        for word in entry['tf']:
            idf[word] = idf.get(word, 0) + 1

    # Sanity check: a term cannot occur in more entries than there are.
    for word in idf:
        if idf[word] > n_docs:
            print('ASSERT FAILED!')
            raise SystemExit(1)

    # Turn the document frequencies into inverse document frequencies.
    for word in idf:
        idf[word] = math.log(float(n_docs) / (1.0 + float(idf[word])))

    # TF-IDF weights, normalised to unit length.
    for entry in corpora:
        bow = {}
        norm_sq = 0
        for word in entry['tf']:
            bow[word] = entry['tf'][word] * idf[word]
            norm_sq += bow[word] * bow[word]

        # norm_sq is zero when all idf values of the entry are zero (or the
        # abstract has no terms); scaling would then divide by zero.
        if norm_sq > 0:
            norm = math.sqrt(norm_sq)
            for word in bow:
                bow[word] /= norm

        entry['bow'] = bow

    return corpora

#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def loadf_conf(fname):
    """Load submission records from a comma-separated file.

    A row is kept only if it has at least three columns and its first column
    consists solely of digits; other rows (for example a header row) are
    skipped.

    Args:
        fname: path of the CSV file to read.

    Returns:
        A list of dicts, one per kept row, with the keys ``'id'`` (first
        column as int), ``'title'`` (second column), ``'abstract'`` (third
        column) and ``'authors'`` (fourth column, or ``''`` when the row has
        no fourth column).
    """
    info = []
    # newline='' lets the csv module handle line endings itself; the former
    # 'rU' mode is rejected by Python 3.11 and later.  The with-statement
    # closes the file even if parsing fails.
    with open(fname, newline='') as f:
        reader = csv.reader(f, delimiter=",")
        for line in reader:
            if len(line) < 3 or not line[0].isdigit():
                continue
            onestr = {
                'id': int(line[0]),
                'title': line[1],
                'abstract': line[2],
                # Accept rows with extra trailing columns as well.
                'authors': line[3] if len(line) >= 4 else '',
            }
            info.append(onestr)

    return info

#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Command line: two CSV files and an optional similarity threshold.
parser = argparse.ArgumentParser(description='Doublesubmission checker 0.01')
parser.add_argument("c1", help="CSV of conference1")
parser.add_argument("c2", help="CSV of conference2")
parser.add_argument("-t", help="detection threshold [default: %g]"%THR, default=THR, required=False)
args = vars(parser.parse_args())

f1 = args['c1']
f2 = args['c2']
# When both arguments name the same file, an entry must not be matched
# against its own copy.
self_comparison = (f1 == f2)
print('Self comparison: ' + str(self_comparison))
thr = float(args['t'])

print('## Double submission checker v 0.02. L. Spinello, G.D. Tipaldi (slight modifications by W. Burgard) 2014')
print('> Detection threshold set to '+str(thr))

print('> Load file ' + f1)
iros_raw = loadf_conf(f1)
print('entries: ' + str(len(iros_raw)))

print('> Load file ' + f2)
rss_raw = loadf_conf(f2)
print('entries: ' + str(len(rss_raw)))

# Joint corpus layout: entries of f2 occupy indices [0, rss_len), entries of
# f1 follow at indices [rss_len, len(corpora)).
rss_len = len(rss_raw)
corpora_t = copy.deepcopy(rss_raw)
for ir in iros_raw:
    corpora_t.append(ir)

print('> Compute TFIDF')
corpora = do_tfidf(corpora_t)

# For every f2 entry find the most similar f1 entry and keep the pair if its
# similarity exceeds the threshold.
wmatches = []
for i in range(rss_len):
    bestmatch = -1
    bestmatch_idx = -1
    # Start at rss_len (not rss_len + 1) so that the first f1 entry is
    # compared as well.
    for j in range(rss_len, len(corpora)):
        # In a self comparison, j - rss_len == i is the same paper.
        if (i != (j - rss_len)) or (not self_comparison):
            d = distentry(corpora[i], corpora[j])
            if d > bestmatch:
                bestmatch = d
                bestmatch_idx = j

    # bestmatch_idx stays -1 when there was no candidate to compare against.
    if bestmatch_idx >= 0 and bestmatch > thr:
        onematch = {}
        onematch['from_id'] = i
        onematch['to_id'] = bestmatch_idx
        onematch['to_value'] = bestmatch
        wmatches.append(onematch)

#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#~ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Report the matches, most similar pair first.
swmatches = sorted(wmatches, key=itemgetter('to_value'), reverse = True)

for wins in swmatches:
    print('====================================================')
    cid1 = wins['to_id']    # index of the f1 entry
    cid2 = wins['from_id']  # index of the f2 entry

    print(f1 + ' id '+ str(corpora[cid1]['id']) + ', Title: ' + corpora[cid1]['title'] + '\nAuthors: ' + corpora[cid1]['authors'] )
    print(f2 + ' id '+ str(corpora[cid2]['id']) + ', Title: ' + corpora[cid2]['title'] + '\nAuthors: ' + corpora[cid2]['authors'] )

    print('Cosine dist: ' + str(wins['to_value']))
    print()
    print('abstr ' + f1 + ": " + corpora[cid1]['abstract'])
    print('----')
    print('abstr ' + f2 + ": " + corpora[cid2]['abstract'])
    print()
    print()

# Doublecheck v5.py

# Navigation menu

# Search