# transcriptionmatcher5.py
# (Source copied from a web code viewer; the viewer's navigation text was removed.)
"""Report how closely repeated transcriptions of the same subject agree.

The script asks for a CSV file and for one of its columns, groups that
column's values by the ``subject_id`` column, fuzzy-matches every pair of
values within each subject (``fuzz.token_sort_ratio`` on lower-cased text)
and prints, per subject, the number of comparisons, the mean score and the
standard deviation. Output goes to STDOUT.

The code runs under Python 2.7 and Python 3.
"""
import codecs  # kept from the original import list; not used below
import csv
import itertools as it
import os
import re  # kept from the original import list; not used below
import sys
from collections import defaultdict

import numpy as np

try:
    read_input = raw_input  # Python 2
except NameError:
    read_input = input  # Python 3

# Column whose value identifies which rows belong together.
SUBJECT_FIELD = 'subject_id'


def open_csv(path):
    """Open *path* the way the ``csv`` module expects on this Python version."""
    if sys.version_info[0] < 3:
        return open(path, 'rb')
    # latin-1 maps every byte to exactly one character, so parse_rows() sees
    # the same byte-for-byte data that Python 2's binary mode provides.
    return open(path, newline='', encoding='latin-1')


def parse_rows(lines):
    """Parse CSV lines into ``(fieldnames, rows)``.

    NUL characters are removed from every line (header included) before
    parsing. The header is parsed by the same ``csv.DictReader`` that builds
    the rows, so the returned names are exactly the keys of each row dict:
    no trailing line terminator on the last name and no stray quotes.

    Args:
        lines: iterable of text lines, e.g. an open file.

    Returns:
        A tuple ``(fieldnames, rows)``: the list of column names (empty when
        there is no header line) and the list of row dicts.
    """
    reader = csv.DictReader(line.replace('\000', '') for line in lines)
    fieldnames = list(reader.fieldnames or [])
    return fieldnames, list(reader)


def load_csv(path):
    """Read the CSV at *path* once and close it; see ``parse_rows``."""
    with open_csv(path) as handle:
        return parse_rows(handle)


def group_field_by_subject(rows, field):
    """Map each ``subject_id`` to the list of its *field* values, in row order."""
    grouped = defaultdict(list)
    for row in rows:
        grouped[row[SUBJECT_FIELD]].append(row[field])
    return grouped


def pairwise_scores(values, scorer):
    """Score every 2-combination of *values* after lower-casing them.

    Args:
        values: the text values recorded for one subject.
        scorer: callable ``scorer(a, b)`` returning a number; the script
            passes ``fuzz.token_sort_ratio``.

    Returns:
        A list of integer scores, one per pair; empty when there are fewer
        than two values.
    """
    return [
        int(scorer(first.lower(), second.lower()))
        for first, second in it.combinations(values, 2)
    ]


def summarize_scores(scores):
    """Return ``(count, mean, standard deviation)`` for a non-empty score list.

    The mean is a true floating-point mean. The original script computed
    ``sum(scores) / len(scores)`` on integers, which Python 2 truncates.
    The deviation is ``np.std`` with its default arguments.
    """
    return len(scores), float(np.mean(scores)), float(np.std(scores))


def main():
    """Prompt for a file and a field, then print the per-subject summary."""
    # Imported before any prompt so a missing dependency is reported
    # immediately, while the helpers above stay importable without it.
    from fuzzywuzzy import fuzz, process  # noqa: F401  (process: original import)

    csvname = read_input('Enter name of your file: ')
    while not os.path.isfile(csvname):
        print('There is no file named {}'.format(csvname))
        csvname = read_input('Enter location and name of your file: ')

    headlist, rows = load_csv(csvname)
    if SUBJECT_FIELD not in headlist:
        # Also covers an empty file, whose header list is empty.
        sys.exit("Your file has no '{}' column, which is needed to group "
                 "the values.".format(SUBJECT_FIELD))

    # List the fields in the CSV so the user can pick one for processing.
    print("Your file has the following fields: ")
    print('\n'.join(headlist))
    userheader = read_input('Get text matching scores for which field?: ')
    while userheader not in headlist:
        print('There is no header field named {}'.format(userheader))
        userheader = read_input('Get text matching scores for which field?: ')

    print('Processing...')
    values_by_subject = group_field_by_subject(rows, userheader)

    print('{:30} {:2} {:7} {:2} {:8} {:2} {:7}'.format(
        'subject_id', '\t', ' #compare', '\t', 'avgmatch', '\t', 'stddev'))
    for subject, values in values_by_subject.items():
        scores = pairwise_scores(values, fuzz.token_sort_ratio)
        if not scores:
            # A subject with a single value has nothing to compare and is
            # left out of the report, as in the original script.
            continue
        count, mean, stddev = summarize_scores(scores)
        print('{:30} {:2} {:7} {:2} {:8.3f} {:2} {:5.3f}'.format(
            str(subject), '\t', count, '\t', mean, '\t', stddev))


if __name__ == '__main__':
    main()