diff --git a/README.md b/README.md index 58e4ab4..b06e89e 100755 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ percent_peptides = np.array([0.18, 0.18, 0.35]) predictions = azimuth.model_comparison.predict(sequences, amino_acid_cut_positions, percent_peptides) for i, prediction in enumerate(predictions): - print sequences[i], prediction + print("%s %f" % (sequences[i], prediction)) ``` Output: @@ -87,5 +87,3 @@ Sometimes the pre-computed .pickle files in the saved_models directory are incom #### Contacting us You can submit bug reports using the GitHub issue tracker. If you have any other questions, please contact us at crispr@lists.research.microsoft.com. - - diff --git a/azimuth/cluster_job.py b/azimuth/cluster_job.py index c799ce8..259f90d 100755 --- a/azimuth/cluster_job.py +++ b/azimuth/cluster_job.py @@ -5,7 +5,7 @@ # just execute this file in python to create the xml file for the cluster (in ./analysis/cluster), which one then can manually submit through the HPC Job Manager def cluster_setup(i, python_path, home, t, work_dir, tempdir): - t.work_directory = work_dir + t.work_directory = work_dir #t.std_out_file_path = r'cluster\log\cluster_out%d.txt' % i #t.std_err_file_path = r'cluster\log\cluster_err%d.txt' % i t.std_out_file_path = tempdir + r'\out%d.txt' % i @@ -14,10 +14,10 @@ def cluster_setup(i, python_path, home, t, work_dir, tempdir): #t.std_err_file_path = r'err%d.txt' % i #if not os.path.exists(t.std_out_file_path): os.makedirs(t.std_out_file_path) #if not os.path.exists(t.std_err_file_path): os.makedirs(t.std_err_file_path) - t.environment_variables['PYTHONPATH'] = python_path + t.environment_variables['PYTHONPATH'] = python_path t.environment_variables['HOME'] = home - print "cluster python_path=%s" % python_path + print("cluster python_path=%s" % python_path) def create(user, models, orders, degrees, GP_likelihoods, adaboost_learning_rates=None, adaboost_num_estimators=None, adaboost_max_depths=None, adaboost_CV=False, exp_name=None, learn_options=None): job = WinHPCJob() @@ -42,18 +42,18 @@ def create(user, models, orders, degrees, GP_likelihoods, adaboost_learning_rate home = r"\\fusi1\CLUSTER_HOME" elif job.username == 'REDMOND\\jennl': remote_dir = r"\\GCR\Scratch\RR1\jennl\CRISPR" - work_dir = r'\\jennl2\D$\Source\CRISPR\analysis' + work_dir = r'\\jennl2\D$\Source\CRISPR\analysis' python = r'\\fusi1\crispr\python.exe' python_path = r'\\fusi1\crispr\lib\site-packages\;\\jennl2\D$\Source\CRISPR\analysis' home = r"\\fusi1\CLUSTER_HOME" - # print "workdir=%s" % work_dir - # print "python=%s" % python - # print "python_path=%s" % python_path + # print("workdir=%s" % work_dir) + # print("python=%s" % python) + # print("python_path=%s" % python_path) - # generate random dir in results directory + # generate random dir in results directory tempdir = tempfile.mkdtemp(prefix='cluster_experiment_', dir=remote_dir) - print "Created directory: %s" % str(tempdir) + print("Created directory: %s" % str(tempdir)) # dump learn_options with open(tempdir+'/learn_options.pickle', 'wb') as f: diff --git a/azimuth/corrstats.py b/azimuth/corrstats.py index 619157a..3d11b8e 100644 --- a/azimuth/corrstats.py +++ b/azimuth/corrstats.py @@ -113,8 +113,8 @@ def independent_corr(xy, ab, n, n2 = None, twotailed=True, conf_level=0.95, meth else: raise Exception('Wrong method!') -#print dependent_corr(.396, .179, .088, 200, method='steiger') -#print independent_corr(.560, .588, 100, 353, method='fisher') +#print(dependent_corr(.396, .179, .088, 200, method='steiger')) +#print(independent_corr(.560, .588, 100, 353, method='fisher')) -#print dependent_corr(.396, .179, .088, 200, method='zou') -#print independent_corr(.560, .588, 100, 353, method='zou') \ No newline at end of file +#print(dependent_corr(.396, .179, .088, 200, method='zou')) +#print(independent_corr(.560, .588, 100, 353, method='zou')) diff --git a/azimuth/features/featurization.py b/azimuth/features/featurization.py index bb88359..603094b 100755 --- a/azimuth/features/featurization.py +++ b/azimuth/features/featurization.py @@ -21,7 +21,7 @@ def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length assert num_lengths == 1, "should only have sequences of a single length, but found %s: %s" % (num_lengths, str(unique_lengths)) if not quiet: - print "Constructing features..." + print("Constructing features...") t0 = time.time() feature_sets = {} @@ -49,7 +49,7 @@ def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length feature_sets["Percent Peptide <50%"]['Percent Peptide <50%'] = feature_sets["Percent Peptide <50%"].pop("Percent Peptide") if learn_options["include_gene_effect"]: - print "including gene effect" + print("including gene effect") gene_names = Y['Target gene'] enc = sklearn.preprocessing.OneHotEncoder() label_encoder = sklearn.preprocessing.LabelEncoder() @@ -95,7 +95,7 @@ def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length t1 = time.time() if not quiet: - print "\t\tElapsed time for constructing features is %.2f seconds" % (t1-t0) + print("\t\tElapsed time for constructing features is %.2f seconds" % (t1-t0)) check_feature_set(feature_sets) @@ -138,8 +138,8 @@ def NGGX_interaction_feature(data, pam_audit=True): for seq in sequence: if pam_audit and seq[25:27] != "GG": raise Exception("expected GG but found %s" % seq[25:27]) - NX = seq[24]+seq[27] - NX_onehot = nucleotide_features(NX,order=2, feature_type='pos_dependent', max_index_to_use=2, prefix="NGGX") + NX = seq[24]+seq[27] + NX_onehot = nucleotide_features(NX,order=2, feature_type='pos_dependent', max_index_to_use=2, prefix="NGGX") # NX_onehot[:] = np.random.rand(NX_onehot.shape[0]) ##TESTING RANDOM FEATURE feat_NX = pandas.concat([feat_NX, NX_onehot], axis=1) return feat_NX.T @@ -148,7 +148,7 @@ def NGGX_interaction_feature(data, pam_audit=True): def get_all_order_nuc_features(data, feature_sets, learn_options, maxorder, max_index_to_use, prefix="", quiet=False): for order in range(1, maxorder+1): if not quiet: - print "\t\tconstructing order %s features" % order + print("\t\tconstructing order %s features" % order) nuc_features_pd, nuc_features_pi = apply_nucleotide_features(data, order, learn_options["num_proc"], include_pos_independent=True, max_index_to_use=max_index_to_use, prefix=prefix) feature_sets['%s_nuc_pd_Order%i' % (prefix, order)] = nuc_features_pd @@ -157,7 +157,7 @@ def get_all_order_nuc_features(data, feature_sets, learn_options, maxorder, max_ check_feature_set(feature_sets) if not quiet: - print "\t\t\t\t\t\t\tdone" + print("\t\t\t\t\t\t\tdone") def countGC(s, length_audit=True): @@ -202,7 +202,7 @@ def organism_feature(data): def get_micro_homology_features(gene_names, learn_options, X): # originally was flipping the guide itself as necessary, but now flipping the gene instead - print "building microhomology features" + print("building microhomology features") feat = pandas.DataFrame(index=X.index) feat["mh_score"] = "" feat["oof_score"] = "" @@ -215,7 +215,7 @@ def get_micro_homology_features(gene_names, learn_options, X): for gene in gene_names.unique(): gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement() guide_inds = np.where(gene_names.values == gene)[0] - print "getting microhomology for all %d guides in gene %s" % (len(guide_inds), gene) + print("getting microhomology for all %d guides in gene %s" % (len(guide_inds), gene)) for j, ps in enumerate(guide_inds): guide_seq = Seq.Seq(X['30mer'][ps]) strand = X['Strand'][ps] @@ -227,18 +227,18 @@ def get_micro_homology_features(gene_names, learn_options, X): gene_seq = gene_seq.reverse_complement() ind = gene_seq.find(guide_seq) #assert ind != -1, "still didn't work" - #print "shouldn't get here" + #print("shouldn't get here") else: - #print "all good" + #print("all good") pass #assert ind != -1, "could not find guide in gene" if ind==-1: - #print "***could not find guide %s for gene %s" % (str(guide_seq), str(gene)) + #print("***could not find guide %s for gene %s" % (str(guide_seq), str(gene))) #if.write(str(gene) + "," + str(guide_seq)) mh_score = 0 oof_score = 0 else: - #print "worked" + #print("worked") assert gene_seq[ind:(ind+len(guide_seq))]==guide_seq, "match not right" left_win = gene_seq[(ind - k_mer_length_left):ind] @@ -258,14 +258,14 @@ def get_micro_homology_features(gene_names, learn_options, X): feat.ix[ps,"mh_score"] = mh_score feat.ix[ps,"oof_score"] = oof_score - print "computed microhomology of %s" % (str(gene)) + print("computed microhomology of %s" % (str(gene))) return pandas.DataFrame(feat, dtype='float') def local_gene_seq_features(gene_names, learn_options, X): - print "building local gene sequence features" + print("building local gene sequence features") feat = pandas.DataFrame(index=X.index) feat["gene_left_win"] = "" feat["gene_right_win"] = "" @@ -300,7 +300,7 @@ def local_gene_seq_features(gene_names, learn_options, X): assert len(left_win)==len(right_win), "k_mer_context, %s, is too large" % k_mer_length feat.ix[ps,"gene_left_win"] = left_win.tostring() feat.ix[ps,"gene_right_win"] = right_win.tostring() - print "featurizing local context of %s" % (gene) + print("featurizing local context of %s" % (gene)) feature_sets = {} get_all_order_nuc_features(feat["gene_left_win"], feature_sets, learn_options, learn_options["order"], max_index_to_use=sys.maxint, prefix="gene_left_win") @@ -341,11 +341,11 @@ def gene_guide_feature(Y, X, learn_options): gene_file = r"..\data\gene_seq_feat_V%s_km%s.ord%s.pickle" % (learn_options['V'], learn_options['include_gene_guide_feature'], learn_options['order']) if False: #os.path.isfile(gene_file): #while debugging, comment out - print "loading local gene seq feats from file %s" % gene_file + print("loading local gene seq feats from file %s" % gene_file) with open(gene_file, "rb") as f: feature_sets = pickle.load(f) else: feature_sets = local_gene_seq_features(Y['Target gene'], learn_options, X) - print "writing local gene seq feats to file %s" % gene_file + print("writing local gene seq feats to file %s" % gene_file) with open(gene_file, "wb") as f: pickle.dump(feature_sets, f) return feature_sets @@ -383,11 +383,11 @@ def Tm_feature(data, pam_audit=True, learn_options=None): featarray[i,2] = Tm.Tm_staluc(seq[segments[1][0]:segments[1][1]], rna=rna) #8-mer featarray[i,3] = Tm.Tm_staluc(seq[segments[2][0]:segments[2][1]], rna=rna) #5-mer - #print "CRISPR" + #print("CRISPR") #for d in range(4): - # print featarray[i,d] + # print(featarray[i,d]) #import ipdb; ipdb.set_trace() - + feat = pandas.DataFrame(featarray, index=data.index, columns=["Tm global_%s" % rna, "5mer_end_%s" %rna, "8mer_middle_%s" %rna, "5mer_start_%s" %rna]) @@ -442,7 +442,7 @@ def nucleotide_features(s, order, max_index_to_use, prefix="", feature_type='all ''' assert feature_type in ['all', 'pos_independent', 'pos_dependent'] if max_index_to_use <= len(s): - #print "WARNING: trimming max_index_to use down to length of string=%s" % len(s) + #print("WARNING: trimming max_index_to use down to length of string=%s" % len(s)) max_index_to_use = len(s) if max_index_to_use is not None: @@ -493,7 +493,7 @@ def nucleotide_features(s, order, max_index_to_use, prefix="", feature_type='all return res res = pandas.Series(features_pos_dependent, index=index_dependent) - assert not np.any(np.isnan(res.values)) + assert not np.any(np.isnan(res.values)) return res def nucleotide_features_dictionary(prefix=''): @@ -537,7 +537,7 @@ def normalize_feature_sets(feature_sets): zero-mean, unit-variance each feature within each set ''' - print "Normalizing features..." + print("Normalizing features...") t1 = time.time() new_feature_sets = {} @@ -547,6 +547,6 @@ def normalize_feature_sets(feature_sets): raise Exception("found Nan feature values in set=%s" % set) assert new_feature_sets[set].shape[1] > 0, "0 columns of features" t2 = time.time() - print "\t\tElapsed time for normalizing features is %.2f seconds" % (t2-t1) + print("\t\tElapsed time for normalizing features is %.2f seconds" % (t2-t1)) return new_feature_sets diff --git a/azimuth/features/microhomology.py b/azimuth/features/microhomology.py index 45de681..3fa27d8 100755 --- a/azimuth/features/microhomology.py +++ b/azimuth/features/microhomology.py @@ -1,102 +1,102 @@ -#Supplementary Figure 3 | Source code for assigning a score to a hypothetical deletion -#pattern associated with microhomology +#Supplementary Figure 3 | Source code for assigning a score to a hypothetical deletion +#pattern associated with microhomology # ------------------------------------------ # comes from the Supplementary Info of the paper, in pdf form, copied here, but refactored to make a function # rather than to write it to file # also see their web server version: http://www.rgenome.net/mich-calculator/ where they say: # Insert one or more query sequences (A, G, T, C only) flanking the same length at a cleavage site (100bp or less, 60~80bp recommended). -from math import exp -from re import findall - +from math import exp +from re import findall + def compute_score(seq, tmpfile1="1.before removing duplication.txt", tmpfile2="2.all microhomology patterns.txt", verbose=False): - length_weight=20.0 - left=30 # Insert the position expected to be broken. - right=len(seq)-int(left) - #print 'length of seq = '+str(len(seq)) - - file_temp=open(tmpfile1, "w") - for k in range(2,left)[::-1]: - for j in range(left,left+right-k+1): - for i in range(0,left-k+1): - if seq[i:i+k]==seq[j:j+k]: - length=j-i - file_temp.write(seq[i:i+k]+'\t'+str(i)+'\t'+str(i+k)+'\t'+str(j)+'\t'+str(j+k)+'\t'+str(length)+'\n') - file_temp.close() - - ### After searching out all microhomology patterns, duplication should be removed!! - f1=open(tmpfile1, "r") - s1=f1.read() - - f2=open(tmpfile2, "w") #After removing duplication - f2.write(seq+'\t'+'microhomology\t'+'deletion length\t'+'score of a pattern\n') - - if s1!="": - list_f1=s1.strip().split('\n') - sum_score_3=0 - sum_score_not_3=0 - - for i in range(len(list_f1)): - n=0 - score_3=0 - score_not_3=0 - line=list_f1[i].split('\t') - scrap=line[0] - left_start=int(line[1]) - left_end=int(line[2]) - right_start=int(line[3]) - right_end=int(line[4]) - length=int(line[5]) - - for j in range(i): - line_ref=list_f1[j].split('\t') - left_start_ref=int(line_ref[1]) - left_end_ref=int(line_ref[2]) - right_start_ref=int(line_ref[3]) - right_end_ref=int(line_ref[4]) - - if (left_start >= left_start_ref) and (left_end <= left_end_ref) and (right_start >= right_start_ref) and (right_end <= right_end_ref): - if (left_start - left_start_ref)==(right_start - right_start_ref) and (left_end - left_end_ref)==(right_end - right_end_ref): - n+=1 - else: pass - - if n == 0: - if (length % 3)==0: - length_factor = round(1/exp((length)/(length_weight)),3) - num_GC=len(findall('G',scrap))+len(findall('C',scrap)) - score_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) - - elif (length % 3)!=0: - length_factor = round(1/exp((length)/(length_weight)),3) - num_GC=len(findall('G',scrap))+len(findall('C',scrap)) - score_not_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) - - f2.write(seq[0:left_end]+'-'*length+seq[right_end:]+'\t'+scrap+'\t'+str(length)+'\t'+str(100*length_factor*((len(scrap)-num_GC)+(num_GC*2)))+'\n') - sum_score_3+=score_3 - sum_score_not_3+=score_not_3 - + length_weight=20.0 + left=30 # Insert the position expected to be broken. + right=len(seq)-int(left) + #print('length of seq = '+str(len(seq))) + + file_temp=open(tmpfile1, "w") + for k in range(2,left)[::-1]: + for j in range(left,left+right-k+1): + for i in range(0,left-k+1): + if seq[i:i+k]==seq[j:j+k]: + length=j-i + file_temp.write(seq[i:i+k]+'\t'+str(i)+'\t'+str(i+k)+'\t'+str(j)+'\t'+str(j+k)+'\t'+str(length)+'\n') + file_temp.close() + + ### After searching out all microhomology patterns, duplication should be removed!! + f1=open(tmpfile1, "r") + s1=f1.read() + + f2=open(tmpfile2, "w") #After removing duplication + f2.write(seq+'\t'+'microhomology\t'+'deletion length\t'+'score of a pattern\n') + + if s1!="": + list_f1=s1.strip().split('\n') + sum_score_3=0 + sum_score_not_3=0 + + for i in range(len(list_f1)): + n=0 + score_3=0 + score_not_3=0 + line=list_f1[i].split('\t') + scrap=line[0] + left_start=int(line[1]) + left_end=int(line[2]) + right_start=int(line[3]) + right_end=int(line[4]) + length=int(line[5]) + + for j in range(i): + line_ref=list_f1[j].split('\t') + left_start_ref=int(line_ref[1]) + left_end_ref=int(line_ref[2]) + right_start_ref=int(line_ref[3]) + right_end_ref=int(line_ref[4]) + + if (left_start >= left_start_ref) and (left_end <= left_end_ref) and (right_start >= right_start_ref) and (right_end <= right_end_ref): + if (left_start - left_start_ref)==(right_start - right_start_ref) and (left_end - left_end_ref)==(right_end - right_end_ref): + n+=1 + else: pass + + if n == 0: + if (length % 3)==0: + length_factor = round(1/exp((length)/(length_weight)),3) + num_GC=len(findall('G',scrap))+len(findall('C',scrap)) + score_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) + + elif (length % 3)!=0: + length_factor = round(1/exp((length)/(length_weight)),3) + num_GC=len(findall('G',scrap))+len(findall('C',scrap)) + score_not_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) + + f2.write(seq[0:left_end]+'-'*length+seq[right_end:]+'\t'+scrap+'\t'+str(length)+'\t'+str(100*length_factor*((len(scrap)-num_GC)+(num_GC*2)))+'\n') + sum_score_3+=score_3 + sum_score_not_3+=score_not_3 + mh_score = sum_score_3+sum_score_not_3 oof_score = (sum_score_not_3)*100/(sum_score_3+sum_score_not_3) if verbose: - print 'Microhomology score = ' + str(mh_score) - print 'Out-of-frame score = ' + str(oof_score) - f1.close() + print('Microhomology score = ' + str(mh_score)) + print('Out-of-frame score = ' + str(oof_score)) + f1.close() f2.close() return mh_score, oof_score if __name__ == '__main__': - seq='GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG' # The length of sequence is recommend within 60~80 bases. + seq='GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG' # The length of sequence is recommend within 60~80 bases. tmpfile1 = "1.before removing duplication.txt" tmpfile2 = "2.all microhomology patterns.txt" - + mh_score, oof_score = compute_score(seq, tmpfile1=tmpfile1, tmpfile2=tmpfile2, verbose=True) - # The row of output file is consist of (full sequence, microhomology scrap, deletion length, score of pattern). + # The row of output file is consist of (full sequence, microhomology scrap, deletion length, score of pattern). #correct output is #Microhomology score = 4662.9 #Out-of-frame score = 50.7473889639 - #GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG - - print seq \ No newline at end of file + #GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG + + print(seq) diff --git a/azimuth/load_data.py b/azimuth/load_data.py index d7245a0..06781c7 100755 --- a/azimuth/load_data.py +++ b/azimuth/load_data.py @@ -10,7 +10,7 @@ def from_custom_file(data_file, learn_options): # use semantics of when we load V2 data - print "Loading inputs to predict from %s" % data_file + print("Loading inputs to predict from %s" % data_file) data = pandas.read_csv(data_file) mandatory_columns = ['30mer', 'Target gene', 'Percent Peptide', 'Amino Acid Cut position'] @@ -37,7 +37,7 @@ def from_custom_file(data_file, learn_options): def from_file(data_file, learn_options, data_file2=None, data_file3=None): if learn_options["V"] == 1: # from Nature Biotech paper - print "loading V%d data" % learn_options["V"] + print("loading V%d data" % learn_options["V"]) assert not learn_options["weighted"] is not None, "not supported for V1 data" annotations, gene_position, target_genes, Xdf, Y = read_V1_data(data_file, learn_options) @@ -152,12 +152,12 @@ def read_V1_data(data_file, learn_options, AML_file=cur_dir + "/data/V1_suppl_da assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)" if learn_options is not None and learn_options["flipV1target"]: - print "************************************************************************" - print "*****************MATCHING DOENCH CODE (DEBUG MODE)**********************" - print "************************************************************************" + print("************************************************************************") + print("*****************MATCHING DOENCH CODE (DEBUG MODE)**********************") + print("************************************************************************") # normally it is: Y['average threshold'] = Y['average rank'] > 0.8, where 1s are good guides, 0s are not Y['average threshold'] = Y['average rank'] < 0.2 # 1s are bad guides - print "press c to continue" + print("press c to continue") import ipdb ipdb.set_trace() @@ -272,7 +272,7 @@ def read_V2_data(data_file, learn_options=None, verbose=True): count = count + Xtmp.shape[0] Xdf = pandas.concat([Xdf, Xtmp], axis=0) if verbose: - print "Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count) + print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count)) # create new index that includes the drug Xdf = Xdf.set_index('drug', append=True) @@ -335,7 +335,7 @@ def read_V2_data(data_file, learn_options=None, verbose=True): gene_position = util.impute_gene_position(gene_position) if learn_options is not None and learn_options["weighted"] == "variance": - print "computing weights from replicate variance..." + print("computing weights from replicate variance...") # compute the variance across replicates so can use it as a weight data = pandas.read_excel(data_file, sheetname="Normalized", skiprows=range(0, 6+1), index_col=[0, 4]) data.index.names = ["Sequence", "Target gene"] @@ -359,7 +359,7 @@ def read_V2_data(data_file, learn_options=None, verbose=True): orig_index = Y.index.copy() Y = pandas.merge(Y, pandas.DataFrame(variance), how="inner", left_index=True, right_index=True) Y = Y.ix[orig_index] - print "done." + print("done.") # Make sure to keep this check last in this function assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)" diff --git a/azimuth/local_multiprocessing.py b/azimuth/local_multiprocessing.py index 3f4aa15..4d3ad10 100755 --- a/azimuth/local_multiprocessing.py +++ b/azimuth/local_multiprocessing.py @@ -21,10 +21,10 @@ def configure(num_jobs=8, TEST=False, subtract=0, num_proc=None, num_thread_per_ try: import mkl - mkl.set_num_threads(num_thread_per_proc) + mkl.set_num_threads(num_thread_per_proc) except ImportError: - print "MKL not available, so I'm not adjusting the number of threads" + print("MKL not available, so I'm not adjusting the number of threads") - print "Launching %d jobs with %d MKL threads each" % (num_jobs, num_thread_per_proc) + print("Launching %d jobs with %d MKL threads each" % (num_jobs, num_thread_per_proc)) return num_jobs diff --git a/azimuth/metrics.py b/azimuth/metrics.py index ef95fba..50e7e31 100755 --- a/azimuth/metrics.py +++ b/azimuth/metrics.py @@ -255,26 +255,26 @@ def ndcg_at_k_ties(labels, predictions, k, method=0, normalize_from_below_too=Fa if isinstance(predictions, list): predictions = np.array(predictions) - + assert len(labels.shape)==1 or np.min(labels.shape)==1, "should be 1D array or equivalent" assert len(predictions.shape)==1 or np.min(predictions.shape)==1, "should be 1D array or equivalent" - + labels = labels.flatten() predictions = predictions.flatten() assert np.all(labels.shape == predictions.shape), "labels and predictions should have the same shape" - + if k is None: k = len(labels) labels = labels.copy() dcg = dcg_at_k_ties(labels, predictions, k, method=method, theta=theta) - + dcg_max = dcg_at_k_ties(labels, labels, k, method, theta=theta) # NOTE: I have checked that dcg_at_k_ties and dcg_at_k match when there are no ties, or ties in the labels - + if normalize_from_below_too: dcg_min = dcg_at_k_ties(np.sort(labels)[::-1], np.sort(predictions), k, method, theta=theta) else: @@ -282,9 +282,9 @@ def ndcg_at_k_ties(labels, predictions, k, method=0, normalize_from_below_too=Fa numerator = (dcg - dcg_min) assert numerator > -1e-5 numerator = np.max((0, numerator)) - ndcg = numerator / (dcg_max - dcg_min) + ndcg = numerator / (dcg_max - dcg_min) assert ndcg <= 1.0 and ndcg >= 0.0, "ndcg=%f should be in [0,1]" % ndcg - if not dcg_max: + if not dcg_max: ndcg = 0. return ndcg @@ -357,7 +357,7 @@ def gain(label, method): dcg = dcg_helper(discount_factors, gain, k, labels, method, predictions) assert not np.isnan(dcg), "found nan dcg" - + return dcg def get_discount_factors(num_labels, discount='log2', theta=None): @@ -411,13 +411,13 @@ def ndcg_bootstrap_test(preds1, preds2, true_labels, num_bootstrap, method, k, n return pv def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, normalize_from_below_too, theta=None, balance_zeros=True): - + # pVal is the probability that we would observe as big an AUC diff as we - # did if the ROC curves were drawn from the null hypothesis (which is that + # did if the ROC curves were drawn from the null hypothesis (which is that # one model does not perform better than the other) # - # null hypothesis is that the prediction ranking are the same, so we exchange a random - # number of them with each other. + # null hypothesis is that the prediction ranking are the same, so we exchange a random + # number of them with each other. # # see ndcg_at_k_ties for all but the first four parameters # @@ -425,7 +425,7 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm # # this is a two-sided test, but since it is a symmetric null distribution, one should # be able to divide the p-value by 2 to get the one-sided version (but think this through before using) - + if isinstance(preds1, list): preds1 = np.array(preds1) else: @@ -453,19 +453,19 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm ranks1 = sp.stats.mstats.rankdata(preds1) ranks2 = sp.stats.mstats.rankdata(preds2) - + ndcg1 = ndcg_at_k_ties(true_labels, ranks1, k=k, method=method, normalize_from_below_too=normalize_from_below_too, theta=theta) ndcg2 = ndcg_at_k_ties(true_labels, ranks2, k=k, method=method, normalize_from_below_too=normalize_from_below_too, theta=theta) real_ndcg_diff = {} perm_ndcg_diff = {} - real_ndcg_diff = np.abs(ndcg1 - ndcg2) + real_ndcg_diff = np.abs(ndcg1 - ndcg2) perm_ndcg_diff = np.nan*np.zeros(nperm) - + if False:#np.all(preds1 == preds2): - pval[theta] = 1.0 - else: + pval[theta] = 1.0 + else: zero_ind = true_labels == 0 assert np.sum(zero_ind) < len(zero_ind), "balancing assumes there are more zeros than ones" @@ -485,8 +485,8 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm tmp_diff = np.abs(ndcg1_perm[theta] - ndcg2_perm[theta]) perm_ndcg_diff[theta][t] = tmp_diff - pval = {} - + pval = {} + num_stat_greater = np.max((((perm_ndcg_diff > real_ndcg_diff).sum() + 1), 1.0)) pval = num_stat_greater / nperm @@ -495,7 +495,7 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm plt.plot(np.sort(perm_ndcg_diff), '.') plt.plot(real_ndcg_diff*np.ones(perm_ndcg_diff.shape), 'k-') plt.show() - + return pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 if __name__ == "__main__": @@ -506,35 +506,35 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm simulated_data = True permute_real_data = True - + T = 1000 allp = np.nan*np.ones(T) nperm = 100 - #method = 4; normalize_from_below_too = True; - + #method = 4; normalize_from_below_too = True; + #theta_range = np.logspace(np.log10(0.01), np.log10(1.0), 3) # Nicolo uses 10, so I grab the extremes and middle #theta_range = np.array([0.01]) - #weights = np.logspace(np.log10(0.0001), np.log10(10), 3); + #weights = np.logspace(np.log10(0.0001), np.log10(10), 3); #weights = np.array([100.0]) weights = np.array([0.001]) theta_range = weights# just to make life easier - + # only for simulated data N = 100 frac_zeros = 0 - + k = None allp = np.nan*np.zeros((len(theta_range) + 1, T)) if not simulated_data: - print "loading up saved data..." # two-fold CV data from CRISPR off-target GUIDE-SEQ + print("loading up saved data...") # two-fold CV data from CRISPR off-target GUIDE-SEQ with open(r'\\nerds5\kevin\from_nicolo\gs.pickle','rb') as f: predictions, truth_all = pickle.load(f) - print "done." + print("done.") N = len(truth_all[0]) - + for t in range(T): # totally simulated @@ -544,17 +544,17 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm truth[zero_ind] = 0 pred1 = np.random.rand(N) pred2 = np.random.rand(N) - else: + else: fold = 0 truth = truth_all[fold] pred1 = predictions["CFD"][fold] pred2 = predictions["product"][fold] - + if permute_real_data: truth = np.random.permutation(truth) - t0 = time.time() - #pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred2, truth, nperm, method, k, normalize_from_below_too, theta_range=theta_range) + t0 = time.time() + #pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred2, truth, nperm, method, k, normalize_from_below_too, theta_range=theta_range) for i, w in enumerate(weights): weights_array = truth.copy() weights_array += w @@ -562,26 +562,26 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm #corr0 = elevation.metrics.spearman_weighted(truth, pred1, w=weights_array) #corr1 = elevation.metrics.spearman_weighted(truth, pred2, w=weights_array) #corr01 = elevation.metrics.spearman_weighted(pred1, pred2, w=weights_array) - #n0 = len(truth) + #n0 = len(truth) #t2, pvaltmp = corrstats.dependent_corr(corr0, corr1, corr01, n0, twotailed=True, method="steiger") pvaltmp, real_corr_diff, perm_corr_diff, corr1, corr2 = elevation.spearman_weighted_swap_perm_test(pred1, pred2, truth, nperm, weights_array) - + allp[i, t] = pvaltmp t1 = time.time() #for i, theta in enumerate(theta_range.tolist() + ["all"]): - # print "%d, theta=%s) ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f, elapsed time=%f minutes, smallest_p=%f" % (t, str(theta), ndcg1[theta], ndcg2[theta], real_ndcg_diff[theta], pval[theta], (t1-t0)/60, 1.0/nperm) + # print("%d, theta=%s) ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f, elapsed time=%f minutes, smallest_p=%f" % (t, str(theta), ndcg1[theta], ndcg2[theta], real_ndcg_diff[theta], pval[theta], (t1-t0)/60, 1.0/nperm)) # allp[i, t] = pval[theta] - #print "---------------" - + #print("---------------") + #for i, theta in enumerate(theta_range.tolist() + ["all"]): for i, theta in enumerate(theta_range.tolist()): #mytitle = "Norm. hist p-values nDCG\n %d null samples, w %d perm and N=%d, theta=%s" % (T, nperm, N, str(theta)) mytitle = "Norm. hist p-values Steiger w weighted Spearman\n %d null samples, N=%d, weight=%s" % (T, N, str(theta)) ut.qqplotp(allp[i,:], dohist=True, numbins=10, figsize=[6,6], title=mytitle, markersize=5) plt.show() - + #save_tmp_results = r'D:\Source\CRISPR\elevation\pickles\tmp.ndcg.stat.calibration.p' #pickle.dump([theta_range, allp, pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2], open(save_tmp_results, "wb" )) #[theta_range, allp, pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2] = pickle.load(open(save_tmp_results, "rb" )) @@ -606,10 +606,10 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm # # using our function # dcg3 = dcg_at_k_ties(labels,predictions,k) - # print "%f, %f, %f" % (dcg1, dcg2, dcg3) + # print("%f, %f, %f" % (dcg1, dcg2, dcg3)) # assert(np.abs(dcg1 - dcg2) < 1e-8) # assert(np.abs(dcg2 - dcg3) < 1e-8) - # print "check out ok for case with all ties in predictions" + # print("check out ok for case with all ties in predictions") truth = np.array([3, 4, 2, 1, 0, 0, 0]) pred1 = np.array([3, 4, 2, 1, 0, 0, 0]) @@ -626,29 +626,29 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm k = len(pred3) pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred2, truth, nperm, method, k, normalize_from_below_too, theta=theta) - print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval) - - pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred1, truth, nperm, method, k, normalize_from_below_too, theta=theta) - print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval) + print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)) + + pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred1, truth, nperm, method, k, normalize_from_below_too, theta=theta) + print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)) - pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred4, truth, nperm, method, k, normalize_from_below_too, theta=theta) - print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval) + pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred4, truth, nperm, method, k, normalize_from_below_too, theta=theta) + print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)) - pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred5, truth, nperm, method, k, normalize_from_below_too, theta=theta) - print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval) + pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred5, truth, nperm, method, k, normalize_from_below_too, theta=theta) + print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)) import ipdb; ipdb.set_trace() - #print ndcg_at_k_ties(truth, truth, k, method=0, normalize_from_below_too=True) - #print ndcg_at_k_ties(truth, pred2, k, method=0, normalize_from_below_too=True) - #print ndcg_at_k_ties(truth, pred3, k, method=0, normalize_from_below_too=True) - #print ndcg_at_k_ties(truth3, pred3, k, method=3, normalize_from_below_too=True) - print ndcg_at_k_ties(truth4, pred2, k, method=3, normalize_from_below_too=True) - - print ndcg_alt(truth[np.argsort(pred2)[::-1]], 5) - print ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=1) - print ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=0) + #print(ndcg_at_k_ties(truth, truth, k, method=0, normalize_from_below_too=True)) + #print(ndcg_at_k_ties(truth, pred2, k, method=0, normalize_from_below_too=True)) + #print(ndcg_at_k_ties(truth, pred3, k, method=0, normalize_from_below_too=True)) + #print(ndcg_at_k_ties(truth3, pred3, k, method=3, normalize_from_below_too=True)) + print(ndcg_at_k_ties(truth4, pred2, k, method=3, normalize_from_below_too=True)) + + print(ndcg_alt(truth[np.argsort(pred2)[::-1]], 5)) + print(ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=1)) + print(ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=0)) - print ndcg_at_k_ties(truth, pred2, 5, method=1) - print ndcg_at_k_ties(truth, pred2, 5, method=0) + print(ndcg_at_k_ties(truth, pred2, 5, method=1)) + print(ndcg_at_k_ties(truth, pred2, 5, method=0)) diff --git a/azimuth/model_comparison.py b/azimuth/model_comparison.py index c5fbf29..d3b0ee8 100755 --- a/azimuth/model_comparison.py +++ b/azimuth/model_comparison.py @@ -266,7 +266,7 @@ def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=Tru learn_options["order"] = 1 if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True: - print "WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)" + print("WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)") for i in range(Xdf.shape[0]): Xdf['30mer'].iloc[i] = azimuth.util.convert_to_thirty_one(Xdf.iloc[i]["30mer"], Xdf.index.values[i][1], Xdf.iloc[i]["Strand"]) # to_keep = Xdf['30mer'].isnull() == False @@ -307,7 +307,7 @@ def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_ "logregL1": "logregL1", "sgrna_from_doench":"sgrna_from_doench", 'SVC': 'SVC', 'xu_et_al': 'xu_et_al'} if not CV: - print "Received option CV=False, so I'm training using all of the data" + print("Received option CV=False, so I'm training using all of the data") assert len(learn_options_set.keys()) == 1, "when CV is False, only 1 set of learn options is allowed" assert len(models) == 1, "when CV is False, only 1 model is allowed" @@ -320,10 +320,10 @@ def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_ # models requiring explicit featurization if model in feat_models_short.keys(): for order in orders: - print "running %s, order %d for %s" % (model, order, learn_options_str) + print("running %s, order %d for %s" % (model, order, learn_options_str)) Y, feature_sets, target_genes, learn_options, num_proc = setup_function(test=test, order=order, learn_options=partial_learn_opt, pam_audit=pam_audit, length_audit=length_audit) # TODO precompute features for all orders, as this is repated for each model - + if model == 'L1': learn_options_model = L1_setup(copy.deepcopy(learn_options), set_target_fn=set_target_fn) elif model == 'L2': @@ -359,7 +359,7 @@ def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_ # if the model doesn't require explicit featurization else: assert setup_fn==setup, "not yet modified to handle this" - print "running %s for %s" % (model, learn_options_str) + print("running %s for %s" % (model, learn_options_str)) Y, feature_sets, target_genes, learn_options, num_proc = setup(test=test, order=1, learn_options=partial_learn_opt, pam_audit=pam_audit, length_audit=length_audit) if model == 'mean': learn_options_model = mean_setup(copy.deepcopy(learn_options)) @@ -392,12 +392,12 @@ def pickle_runner_results(exp_name, results, all_learn_options, relpath="/../" + dname = os.path.dirname(abspath) + relpath if not os.path.exists(dname): os.makedirs(dname) - print "Created directory: %s" % str(dname) + print("Created directory: %s" % str(dname)) if exp_name is None: exp_name = results.keys()[0] myfile = dname+'/'+ exp_name + '.pickle' with open(myfile, 'wb') as f: - print "writing results to %s" % myfile + print("writing results to %s" % myfile) pickle.dump((results, all_learn_options), f, -1) def runner(models, learn_options, GP_likelihoods=None, orders=None, WD_kernel_degrees=None, where='local', cluster_user='fusi', cluster='RR1-N13-09-H44', test=False, exp_name = None, **kwargs): @@ -550,7 +550,7 @@ def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None, model, learn_options = pickle.load(f) else: model, learn_options = model - + learn_options["V"] = 2 learn_options = override_learn_options(learn_options_override, learn_options) @@ -567,12 +567,12 @@ def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None, feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit=pam_audit, length_audit=length_audit) inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets) - - #print "CRISPR" + + #print("CRISPR") #pandas.DataFrame(inputs).to_csv("CRISPR.inputs.test.csv") #import ipdb; ipdb.set_trace() - # call to scikit-learn, returns a vector of predicted values + # call to scikit-learn, returns a vector of predicted values preds = model.predict(inputs) # also check that predictions are not 0/1 from a classifier.predict() (instead of predict_proba() or decision_function()) @@ -609,7 +609,7 @@ def write_results(predictions, file_to_predict): data = pandas.read_csv(file_to_predict) data['predictions'] = predictions data.to_csv(newfile) - print "wrote results to %s" % newfile + print("wrote results to %s" % newfile) return data, newfile if __name__ == '__main__': diff --git a/azimuth/models/DNN.py b/azimuth/models/DNN.py index c01fe1a..eaa44b1 100755 --- a/azimuth/models/DNN.py +++ b/azimuth/models/DNN.py @@ -64,11 +64,11 @@ def DNN_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_optio if best_score is None or accuracies[i, j] > best_score: best_score = accuracies[i, j] best_model = copy.deepcopy(e) - print "DNN with %d hidden layers and %d units, accuracy: %.4f *" % (hl, nu, accuracies[i,j]) + print("DNN with %d hidden layers and %d units, accuracy: %.4f *" % (hl, nu, accuracies[i,j])) else: - print "DNN with %d hidden layers and %d units, accuracy: %.4f" % (hl, nu, accuracies[i,j]) + print("DNN with %d hidden layers and %d units, accuracy: %.4f" % (hl, nu, accuracies[i,j])) best_model.run((X_train, y_train), (X_test, y_test)) y_pred = best_model.network.predict(X[test]) - return y_pred, None \ No newline at end of file + return y_pred, None diff --git a/azimuth/models/GP.py b/azimuth/models/GP.py index 7d53520..69c19c0 100755 --- a/azimuth/models/GP.py +++ b/azimuth/models/GP.py @@ -100,8 +100,8 @@ def gp_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_o plt.figure('kernel') plt.title('kernel') plt.imshow(m.kern.K(X,X)) - print m - print "%.3f variance explained" % (m.Gaussian_noise.variance/y[train].var()) + print(m) + print("%.3f variance explained" % (m.Gaussian_noise.variance/y[train].var())) import ipdb; ipdb.set_trace() plt.close('all') else: diff --git a/azimuth/models/baselines.py b/azimuth/models/baselines.py index 0ad1512..4ee1690 100755 --- a/azimuth/models/baselines.py +++ b/azimuth/models/baselines.py @@ -1,3 +1,4 @@ +from __future__ import print_function import numpy as np import sklearn from sklearn.svm import LinearSVC @@ -62,11 +63,11 @@ def doench_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op assert np.nan not in tpr, "found nan tpr" roc_auc = sklearn.metrics.auc(fpr, tpr) if verbose: - print j, i, roc_auc + print(j, i, roc_auc) cv_results[j][i] = roc_auc best_penalty = penalty[np.argmax(np.mean(cv_results, axis=0))] - print "best AUC for penalty: ", np.median(cv_results, axis=0) + print("best AUC for penalty: ", np.median(cv_results, axis=0)) clf = LinearSVC(penalty='l1', C=best_penalty, dual=False, class_weight=auto_class_weight) clf.fit(X[train], y_bin[train].flatten()) non_zero_coeff = (clf.coef_ != 0.0) @@ -92,6 +93,3 @@ def SVC_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_optio #y_pred = clf.predict(X[test])[:, None] # this returns 0/1 y_pred = clf.decision_function(X[test])[:, None] return y_pred, clf - - - diff --git a/azimuth/models/ensembles.py b/azimuth/models/ensembles.py index 739c4e0..424e623 100755 --- a/azimuth/models/ensembles.py +++ b/azimuth/models/ensembles.py @@ -65,7 +65,7 @@ def adaboost_scoring_bo(params): 'max_features': hp.uniform('max_features', 0.05, 1.0)} best = fmin(adaboost_scoring_bo, space, algo=tpe.suggest, max_evals=50, verbose=1) - print best + print(best) clf = en.GradientBoostingRegressor(n_estimators=learn_options['adaboost_n_estimators'], learning_rate=best['learning_rate'], max_depth=best['max_depth'], @@ -77,7 +77,7 @@ def adaboost_scoring_bo(params): assert not classification, "need to tweak code below to do classificaton, as above" n_jobs = 20 - print "Adaboost with GridSearch" + print("Adaboost with GridSearch") from sklearn.grid_search import GridSearchCV param_grid = {'learning_rate': [0.1, 0.05, 0.01], 'max_depth': [4, 5, 6, 7], @@ -101,7 +101,7 @@ def adaboost_scoring_bo(params): est = en.GradientBoostingRegressor(loss=learn_options['adaboost_loss'], random_state=learn_options['seed'])#, n_estimators=learn_options['adaboost_n_estimators']) clf = GridSearchCV(est, param_grid, n_jobs=n_jobs, verbose=1, cv=cv, scoring=spearman_scoring, iid=False) clf.fit(X[train], y[train].flatten()) - print clf.best_params_ + print(clf.best_params_) else: raise Exception("if using adaboost_CV then need to specify grid (grid search) or bo (bayesian optimization)") diff --git a/azimuth/models/regression.py b/azimuth/models/regression.py index c68e682..df9f2ae 100755 --- a/azimuth/models/regression.py +++ b/azimuth/models/regression.py @@ -44,23 +44,23 @@ def logreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op assert len(np.unique(y)) <= 2, "if using logreg need binary targets" assert learn_options["weighted"] is None, "cannot do weighted Log reg" assert learn_options['feature_select'] is False, "cannot do feature selection yet in logistic regression--see linreg_on_fold to implement" - + cv, n_folds = set_up_inner_folds(learn_options, y_all.iloc[train]) assert learn_options['penalty'] == "L1" or learn_options['penalty'] == "L2", "can only use L1 or L2 with logistic regression" - + tol = 0.00001#0.0001 - + performance = np.zeros((len(learn_options["alpha"]), 1)) # degenerate_pred = np.zeros((len(learn_options["alpha"]))) for train_inner, test_inner in cv: for i, alpha in enumerate(learn_options["alpha"]): clf = sklearn.linear_model.LogisticRegression(penalty=learn_options['penalty'].lower(), dual=False, fit_intercept=learn_options["fit_intercept"], class_weight=learn_options["class_weight"], tol=tol, C=1.0/alpha) - + clf.fit(X[train][train_inner], y[train][train_inner].flatten()) #tmp_pred = clf.predict(X[train][test_inner]) tmp_pred = clf.predict_proba(X[train][test_inner])[:,1] - + if learn_options["training_metric"] == "AUC": fpr, tpr, _ = roc_curve(y_all[learn_options["ground_truth_label"]][train][test_inner], tmp_pred) assert ~np.any(np.isnan(fpr)), "found nan fpr" @@ -85,32 +85,32 @@ def logreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op if not isinstance(best_alpha, numbers.Number): raise Exception("best_alpha must be a number but is %s" % type(best_alpha)) - print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]]) + print("\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])) max_perf = np.nanmax(performance) if max_perf < 0.0: raise Exception("performance is negative") - print "\t\tbest performance is %f" % np.nanmax(performance) + print("\t\tbest performance is %f" % np.nanmax(performance)) clf = sklearn.linear_model.LogisticRegression(penalty=learn_options['penalty'], dual=False, fit_intercept=learn_options["fit_intercept"], class_weight=learn_options["class_weight"], tol=tol, C=1.0/best_alpha) clf.fit(X[train], y[train].flatten()) - # debugging check that get samed paramter estimation when have no regularization and use + # debugging check that get samed paramter estimation when have no regularization and use # either data with only that feature on, or all data), AND WITH NO INTERCEPT - if False: - # grab only feature "GA3" + if False: + # grab only feature "GA3" keep_ind = np.where(feature_sets['mutletpos'].columns=="GA3")[0] - print "%s, %s" % (str(clf.intercept_ ), str(clf.coef_[0, keep_ind])) + print("%s, %s" % (str(clf.intercept_ ), str(clf.coef_[0, keep_ind]))) clf.fit(X[train][:,keep_ind], y[train].flatten()) - print "%s, %s" % (str(clf.intercept_ ), str(clf.coef_)) - import ipdb; ipdb.set_trace() + print("%s, %s" % (str(clf.intercept_ ), str(clf.coef_))) + import ipdb; ipdb.set_trace() + - #y_pred = clf.predict(X[test]) y_pred = clf.predict_proba(X[test])[:,1] - y_pred = y_pred[:, None] + y_pred = y_pred[:, None] #fpr, tpr, _ = roc_curve(y, y_pred); tmp_auc = auc(fpr, tpr) #import ipdb; ipdb.set_trace() return y_pred, clf @@ -124,7 +124,7 @@ def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op if learn_options["weighted"] is not None and (learn_options["penalty"] != "L2" or learn_options["method"] != "linreg"): raise NotImplementedError("weighted prediction not implemented for any methods by L2 at the moment") - + if not learn_options.has_key("fit_intercept"): learn_options["fit_intercept"] = True if not learn_options.has_key('normalize_features'): @@ -194,16 +194,16 @@ def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op best_alpha, best_l1r = learn_options["alpha"][max_score_ind[0]], l1_ratio[max_score_ind[1]] - print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]]) - + print("\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])) + if learn_options['penalty'] == "EN": - print "\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]]) + print("\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]])) max_perf = np.nanmax(performance) if max_perf < 0.0: raise Exception("performance is negative") - print "\t\tbest performance is %f" % max_perf + print("\t\tbest performance is %f" % max_perf) clf = train_linreg_model(best_alpha, l1r, learn_options, train, X, y, y_all) if learn_options["feature_select"]: @@ -214,7 +214,7 @@ def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op if learn_options["penalty"] != "L2": y_pred = y_pred[:, None] - + return y_pred, clf @@ -267,16 +267,16 @@ def get_weights(learn_options, fold, y, y_all): return weights -def set_up_inner_folds(learn_options, y): +def set_up_inner_folds(learn_options, y): label_encoder = sklearn.preprocessing.LabelEncoder() - label_encoder.fit(y['Target gene'].values) + label_encoder.fit(y['Target gene'].values) gene_classes = label_encoder.transform(y['Target gene'].values) - n_genes = len(np.unique(gene_classes)) + n_genes = len(np.unique(gene_classes)) if learn_options['ignore_gene_level_for_inner_loop'] or learn_options["cv"] == "stratified" or n_genes==1: if 'n_folds' not in learn_options.keys(): n_folds = len(np.unique(gene_classes)) else: - n_folds = learn_options['n_folds'] + n_folds = learn_options['n_folds'] cv = sklearn.cross_validation.StratifiedKFold(gene_classes, n_folds=n_folds, shuffle=True) elif learn_options["cv"] == "gene": gene_list = np.unique(y['Target gene'].values) diff --git a/azimuth/predict.py b/azimuth/predict.py index b116983..2978fdc 100755 --- a/azimuth/predict.py +++ b/azimuth/predict.py @@ -65,26 +65,26 @@ def construct_filename(learn_options, TEST): elif learn_options["training_metric"] == 'spearmanr': filename += ".spearman" - print "filename = %s" % filename + print("filename = %s" % filename) return filename def print_summary(global_metric, results, learn_options, feature_sets, flags): - print "\nSummary:" - print learn_options - print "\t\tglobal %s=%.2f" % (learn_options['metric'], global_metric) - print "\t\tmedian %s across folds=%.2f" % (learn_options['metric'], np.median(results[0])) - print "\t\torder=%d" % learn_options["order"] + print("\nSummary:") + print(learn_options) + print("\t\tglobal %s=%.2f" % (learn_options['metric'], global_metric)) + print("\t\tmedian %s across folds=%.2f" % (learn_options['metric'], np.median(results[0]))) + print("\t\torder=%d" % learn_options["order"]) if learn_options.has_key('kerntype'): "\t\tkern type = %s" % learn_options['kerntype'] - if learn_options.has_key('degree'): print "\t\tdegree=%d" % learn_options['degree'] - print "\t\ttarget_name=%s" % learn_options["target_name"] + if learn_options.has_key('degree'): print("\t\tdegree=%d" % learn_options['degree']) + print("\t\ttarget_name=%s" % learn_options["target_name"]) for k in flags.keys(): - print '\t\t' + k + '=' + str(learn_options[k]) + print('\t\t' + k + '=' + str(learn_options[k])) - print "\t\tfeature set:" + print("\t\tfeature set:") for set in feature_sets.keys(): - print "\t\t\t%s" % set - print "\t\ttotal # features=%d" % results[4] + print("\t\t\t%s" % set) + print("\t\ttotal # features=%d" % results[4]) def extract_fpr_tpr_for_fold(aucs, fold, i, predictions, truth, y_binary, test, y_pred): assert len(np.unique(y_binary))<=2, "if using AUC need binary targets" @@ -136,7 +136,7 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge When CV=False, it trains on everything (and tests on everything, just to fit the code) ''' - print "range of y_all is [%f, %f]" % (np.min(y_all[learn_options['target_name']].values), np.max(y_all[learn_options['target_name']].values)) + print("range of y_all is [%f, %f]" % (np.min(y_all[learn_options['target_name']].values), np.max(y_all[learn_options['target_name']].values))) allowed_methods = ["GPy", "linreg", "AdaBoostRegressor", "AdaBoostClassifier", "DecisionTreeRegressor", "RandomForestRegressor", @@ -149,7 +149,7 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge # construct filename from options filename = construct_filename(learn_options, TEST) - print "Cross-validating genes..." + print("Cross-validating genes...") t2 = time.time() y = np.array(y_all[learn_options["target_name"]].values[:,None],dtype=np.float64) @@ -219,8 +219,8 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge if learn_options['num_genes_remove_train']==0: assert np.all(cv_i_orig[0]==cv[i][0]) assert np.all(cv_i_orig[1]==cv[i][1]) - print "# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0])) - print "# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1])) + print("# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0]))) + print("# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1]))) else: raise Exception("invalid cv options given: %s" % learn_options["cv"]) @@ -240,12 +240,12 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge num_proc = learn_options["num_proc"] if num_proc > 1: num_proc = np.min([num_proc,len(cv)]) - print "using multiprocessing with %d procs--one for each fold" % num_proc + print("using multiprocessing with %d procs--one for each fold" % num_proc) jobs = [] pool = multiprocessing.Pool(processes=num_proc) for i,fold in enumerate(cv): train,test = fold - print "working on fold %d of %d, with %d train and %d test" % (i, len(cv), len(train), len(test)) + print("working on fold %d of %d, with %d train and %d test" % (i, len(cv), len(train), len(test))) if learn_options["method"]=="GPy": job = pool.apply_async(azimuth.models.GP.gp_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)) elif learn_options["method"]=="linreg": @@ -351,15 +351,15 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge truth, predictions = fill_in_truth_and_predictions(truth, predictions, fold_labels[i], y_all, y_pred, learn_options, test) - print "\t\tRMSE: ", np.sqrt(((y_pred - y[test])**2).mean()) - print "\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0] - print "\t\tfinished fold/gene %i of %i" % (i+1, len(fold_labels)) + print("\t\tRMSE: ", np.sqrt(((y_pred - y[test])**2).mean())) + print("\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0]) + print("\t\tfinished fold/gene %i of %i" % (i+1, len(fold_labels))) cv_median_metric =[np.median(metrics)] gene_pred = [(truth, predictions)] - print "\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1]) + print("\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1])) t3 = time.time() - print "\t\tElapsed time for cv is %.2f seconds" % (t3-t2) + print("\t\tElapsed time for cv is %.2f seconds" % (t3-t2)) return metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names diff --git a/azimuth/util.py b/azimuth/util.py index 1bd6dcb..b3b252c 100755 --- a/azimuth/util.py +++ b/azimuth/util.py @@ -1,3 +1,4 @@ +from __future__ import print_function import pandas import matplotlib.pylab as plt import pylab as pl # so can just grab qqplotting code from fastlmm directly @@ -26,6 +27,7 @@ import pandas as pd import corrstats + def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None,fixaxes=True,addlambda=True,minpval=1e-20,title=None,h1=None,figsize=[5,5],grid=True, markersize=2): ''' performs a P-value QQ-plot in -log10(P-value) space @@ -46,7 +48,7 @@ def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=N grid boolean: use a grid? (default: True) Returns: fighandle, qnull, qemp ----------------------------------------------------------------------- - ''' + ''' distr = 'log10' import pylab as pl if type(pvals)==list: @@ -57,20 +59,20 @@ def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=N legendlist=legend else: legendlist = [legend] - + if h1 is None: - h1=pl.figure(figsize=figsize) - + h1=pl.figure(figsize=figsize) + pl.grid(b=grid, alpha = 0.5) - + maxval = 0 - for i in xrange(len(pvallist)): + for i in xrange(len(pvallist)): pval =pvallist[i].flatten() M = pval.shape[0] pnull = (0.5 + sp.arange(M))/M # pnull = np.sort(np.random.uniform(size = tests)) - + pval[pval=1]=1 @@ -81,31 +83,31 @@ def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=N yl = '$\chi^2$ quantiles' if distr == 'log10': - qnull = -sp.log10(pnull) + qnull = -sp.log10(pnull) qemp = -sp.log10(sp.sort(pval)) #sorts the object, returns nothing xl = '-log10(P) observed' yl = '-log10(P) expected' if not (sp.isreal(qemp)).all(): raise Exception("imaginary qemp found") if qnull.max>maxval: - maxval = qnull.max() + maxval = qnull.max() pl.plot(qnull, qemp, '.', markersize=markersize) - #pl.plot([0,qemp.max()], [0,qemp.max()],'r') + #pl.plot([0,qemp.max()], [0,qemp.max()],'r') if addlambda: lambda_gc = estimate_lambda(pval) - print "lambda=%1.4f" % lambda_gc - #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2) + print("lambda=%1.4f" % lambda_gc) + #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2) # if there's only one method, just print the lambda if len(pvallist) == 1: - legendlist=["$\lambda_{GC}=$%1.4f" % lambda_gc] + legendlist=["$\lambda_{GC}=$%1.4f" % lambda_gc] # otherwise add it at the end of the name else: legendlist[i] = legendlist[i] + " ($\lambda_{GC}=$%1.4f)" % lambda_gc - addqqplotinfo(qnull,M,xl,yl,xlim,ylim,alphalevel,legendlist,fixaxes) - + addqqplotinfo(qnull,M,xl,yl,xlim,ylim,alphalevel,legendlist,fixaxes) + if title is not None: - pl.title(title) - + pl.title(title) + if fileout is not None: pl.savefig(fileout) @@ -116,20 +118,20 @@ def qqplotp(pv,fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None ''' Read in p-values from filein and make a qqplot adn histogram. If fileout is provided, saves the qqplot only at present. - Searches through p until one is found. ''' - - import pylab as pl - pl.ion() - - fs=8 + Searches through p until one is found. ''' + + import pylab as pl + pl.ion() + + fs=8 h1=qqplot(pv, fileout, alphalevel,legend,xlim,ylim,addlambda=True, figsize=figsize, markersize=markersize) #lambda_gc=estimate_lambda(pv) - #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2) + #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2) pl.title(title,fontsize=fs) - + wm=pl.get_current_fig_manager() #e.g. "652x526+100+10 - xcoord=100 + xcoord=100 #wm.window.wm_geometry(plotsize + "+" + str(xcoord) + "+" + str(ycoord)) if dohist: @@ -144,7 +146,7 @@ def qqplotp(pv,fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None return h1,h2 -def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=None,ylim=None,alphalevel=0.05,legendlist=None,fixaxes=False): +def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=None,ylim=None,alphalevel=0.05,legendlist=None,fixaxes=False): distr='log10' pl.plot([0,qnull.max()], [0,qnull.max()],'k') pl.ylabel(xl) @@ -152,7 +154,7 @@ def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=N if xlim is not None: pl.xlim(xlim) if ylim is not None: - pl.ylim(ylim) + pl.ylim(ylim) if alphalevel is not None: if distr == 'log10': betaUp, betaDown, theoreticalPvals = _qqplot_bar(M=M,alphalevel=alphalevel,distr=distr) @@ -168,7 +170,7 @@ def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=N lo.set_markersize(10) if fixaxes: - fix_axes() + fix_axes() def _qqplot_bar(M=1000000, alphalevel = 0.05,distr = 'log10'): ''' @@ -214,8 +216,8 @@ def _qqplot_bar(M=1000000, alphalevel = 0.05,distr = 'log10'): def fix_axes(buffer=0.1): ''' Makes x and y max the same, and the lower limits 0. - ''' - maxlim=max(pl.xlim()[1],pl.ylim()[1]) + ''' + maxlim=max(pl.xlim()[1],pl.ylim()[1]) pl.xlim([0-buffer,maxlim+buffer]) pl.ylim([0-buffer,maxlim+buffer]) @@ -232,13 +234,13 @@ def estimate_lambda(pv): L = (LOD2/0.456) return L - -def pvalhist(pv,numbins=50,linewidth=3.0,linespec='--r', figsize=[5,5]): + +def pvalhist(pv,numbins=50,linewidth=3.0,linespec='--r', figsize=[5,5]): ''' Plots normalized histogram, plus theoretical null-only line. - ''' - h2=pl.figure(figsize=figsize) - [nn,bins,patches]=pl.hist(pv,numbins,normed=True) + ''' + h2=pl.figure(figsize=figsize) + [nn,bins,patches]=pl.hist(pv,numbins,normed=True) pl.plot([0, 1],[1,1],linespec,linewidth=linewidth) @@ -291,7 +293,7 @@ def guide_positional_features(guide_seq, gene, strand): guide_seq = guide_seq.reverse_complement() ind = gene_seq.find(guide_seq) if ind ==-1: - print "returning None, could not find guide %s in gene %s" % (guide_seq, gene) + print("returning None, could not find guide %s in gene %s" % (guide_seq, gene)) return "" assert gene_seq[ind:(ind+len(guide_seq))]==guide_seq, "match not right" ## now get what we want from this: @@ -310,7 +312,7 @@ def convert_to_thirty_one(guide_seq, gene, strand): guide_seq = guide_seq.reverse_complement() ind = gene_seq.find(guide_seq) if ind ==-1: - print "returning sequence+'A', could not find guide %s in gene %s" % (guide_seq, gene) + print("returning sequence+'A', could not find guide %s in gene %s" % (guide_seq, gene)) return gene_seq + 'A' assert gene_seq[ind:(ind+len(guide_seq))]==guide_seq, "match not right" #new_mer = gene_seq[ind:(ind+len(guide_seq))+1] #looks correct, but is wrong, due to strand frame-of-reference @@ -349,7 +351,7 @@ def concatenate_feature_sets(feature_sets, keys=None): if False: inputs.shape - for j in keys: print j + str(feature_sets[j].shape) + for j in keys: print(j + str(feature_sets[j].shape)) import ipdb; ipdb.set_trace() #print "final size of inputs matrix is (%d, %d)" % inputs.shape @@ -383,7 +385,7 @@ def spearmanr_nonan(x,y): r, p = st.spearmanr(x, y) if np.isnan(p): if len(np.unique(x))==1 or len(np.unique(y))==1: - print "WARNING: spearmanr is nan due to unique values, setting to 0" + print("WARNING: spearmanr is nan due to unique values, setting to 0") p = 0.0 r = 0.0 else: @@ -435,7 +437,7 @@ def get_gene_sequence(gene_name): # records = Entrez.read(search) # if len(records['IdList']) > 1: - # print "warning, multiple hits found for entrez gene search %s" % gene_name + # print("warning, multiple hits found for entrez gene search %s" % gene_name) # elink = Entrez.read(Entrez.elink(dbfrom="gene", db='nucleotide', id=records['IdList'][0])) # nucl_id = elink[0]['LinkSetDb'][3] @@ -446,7 +448,7 @@ def get_gene_sequence(gene_name): # nucl_id = elink[0]['LinkSetDb'][0]['Link'][0]['Id'] # cut = True # else: - # print "sorry not enough information to return sequence" + # print("sorry not enough information to return sequence") # return None # else: # nucl_id = nucl_id['Link'][0]['Id'] @@ -466,7 +468,7 @@ def target_genes_stats(genes=['HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', for gene in genes: seq = get_gene_sequence(gene) if seq != None: - print '%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA')) + print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA'))) def ranktrafo(data): @@ -518,7 +520,7 @@ def get_ranks(y, thresh=0.8, prefix="", flip=False, col_name='score'): # y_quantized = pandas.DataFrame(data=pandas.qcut(y[col_name], 5, labels=np.arange(5.0))) # quantized vector y_quantized = y_threshold.copy() y_quantized.columns = [prefix + "quantized"] - + return y_rank, y_rank_raw, y_threshold, y_quantized def get_data(data, y_names, organism="human", target_gene=None): @@ -1004,7 +1006,7 @@ def plot_all_metrics(metrics, gene_names, all_learn_options, save, plots=None, b plt.bar(ind+(i*width), metrics[method][metric], width, color=plt.cm.Paired(1.*i/len(metrics.keys())), label=method) median_metric = np.median(metrics[method][metric]) - print method, metric, median_metric + print(method, metric, median_metric) assert not np.isnan(median_metric), "found nan for %s, %s" % (method, metric) if metric not in boxplot_arrays.keys(): boxplot_arrays[metric] = np.array(metrics[method][metric])[:, None] @@ -1061,7 +1063,7 @@ def load_results(directory, all_results, all_learn_options, model_filter=None, a if filelist ==[]: raise Exception("found no pickle files in %s" % directory) else: - print "found %d files in %s" % (len(filelist), directory) + print("found %d files in %s" % (len(filelist), directory)) for results_file in filelist: if 'learn_options' in results_file: @@ -1074,7 +1076,7 @@ def load_results(directory, all_results, all_learn_options, model_filter=None, a if m in results_file: in_filt = True if not in_filt: - print "%s not in model_filter" % (results_file)#, model_filter) + print("%s not in model_filter" % (results_file))#, model_filter) continue elif model_filter not in results_file: continue @@ -1094,7 +1096,7 @@ def load_results(directory, all_results, all_learn_options, model_filter=None, a else: k_new = k assert k_new not in all_results.keys(), "found %s already" % k - print "adding key %s (from file %s)" % (k_new, os.path.split(results_file)[-1]) + print("adding key %s (from file %s)" % (k_new, os.path.split(results_file)[-1])) all_results[k_new] = results[k] all_learn_options[k_new] = learn_options[k] num_added = num_added +1 @@ -1205,8 +1207,8 @@ def ensemble_cluster_results(directory=r'\\fusi1\crispr2\analysis\cluster\result # spearmans = [] # for gene in ens_predictions.keys(): # spearmans.append(sp.stats.spearmanr(ens_predictions[gene], ens_truths[gene]['raw'])[0]) - # print gene, spearmans[-1] - # print "median: %.5f" % np.median(spearmans) + # print(gene, spearmans[-1]) + # print("median: %.5f" % np.median(spearmans)) return all_results, all_learn_options @@ -1245,13 +1247,13 @@ def plot_old_vs_new_feat(results, models, fontsize=20, filename=None, print_outp feat_AUC_se.append(np.std(metrics_feat['AUC'])) - print "old features" - print "mean: " + str(base_spearman_means) - print "std: " + str(base_spearman_std) + print("old features") + print("mean: " + str(base_spearman_means)) + print("std: " + str(base_spearman_std)) - print "old + new features" - print "mean: " + str(feat_spearman_means) - print "std: " + str(feat_spearman_std) + print("old + new features") + print("mean: " + str(feat_spearman_means)) + print("std: " + str(feat_spearman_std)) plt.figure() ind = np.arange(len(models)) @@ -1322,7 +1324,7 @@ def remove_top_right_on_plot(ax=None): X, Y = combine_organisms() X.to_pickle('../data/X.pd') #sequence features (i.e. inputs to prediction) Y.to_pickle('../data/Y.pd') #cell-averaged ranks, plus more (i.e. possible targets for prediction) - print "done writing to file" + print("done writing to file") elif V =="2": # this is now all in predict.py pass diff --git a/setup.py b/setup.py index 7fcea42..a0a2bad 100755 --- a/setup.py +++ b/setup.py @@ -1,6 +1,11 @@ # from Cython.Build import cythonize from setuptools import setup +import sys +if sys.version_info[0] >= 3: + requires = ['scipy', 'numpy', 'matplotlib', 'nose', 'scikit-learn', 'pandas', 'biopython'] +else: + requires = ['scipy', 'numpy', 'matplotlib<3.0', 'nose', 'scikit-learn>=0.17.1,<0.18', 'pandas', 'biopython'] setup(name='Azimuth', version='2.0', @@ -9,7 +14,7 @@ description=("Machine Learning-Based Predictive Modelling of CRISPR/Cas9 guide efficiency"), packages=["azimuth", "azimuth.features", "azimuth.models", "azimuth.tests"], package_data={'azimuth': ['saved_models/*.*']}, - install_requires=['scipy', 'numpy', 'matplotlib', 'nose', 'scikit-learn>=0.17.1,<0.18', 'pandas', 'biopython'], + install_requires=requires, license="BSD", # ext_modules=cythonize("ssk_cython.pyx"), )