From 9ea5cfb3758659946d45ad78116837fd749842be Mon Sep 17 00:00:00 2001 From: Thomas Kent Date: Sat, 27 Oct 2018 07:45:14 -0500 Subject: [PATCH 1/2] Python2 compatible version of matplotlib The latest version of matplotlib doesn't support python2, so we need to track the latest 2.x release. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7fcea42..4392e56 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ description=("Machine Learning-Based Predictive Modelling of CRISPR/Cas9 guide efficiency"), packages=["azimuth", "azimuth.features", "azimuth.models", "azimuth.tests"], package_data={'azimuth': ['saved_models/*.*']}, - install_requires=['scipy', 'numpy', 'matplotlib', 'nose', 'scikit-learn>=0.17.1,<0.18', 'pandas', 'biopython'], + install_requires=['scipy', 'numpy', 'matplotlib<3.0', 'nose', 'scikit-learn>=0.17.1,<0.18', 'pandas', 'biopython'], license="BSD", # ext_modules=cythonize("ssk_cython.pyx"), ) From 91efab5c884bfc60d798327172e2a9652112d84d Mon Sep 17 00:00:00 2001 From: Thomas Kent Date: Sat, 27 Oct 2018 09:12:53 -0500 Subject: [PATCH 2/2] A start at python3 support This should be 100% compatible with python2, and should not affect the existing ability to run. It is mostly changing print statements to use the print function. When attempting to package with python3, packaging fails because no version of sklearn is both compatible with python3 *and* still provides the sklearn.cross_validation module (which has since been removed). 
--- README.md | 4 +- azimuth/cluster_job.py | 18 ++-- azimuth/corrstats.py | 8 +- azimuth/features/featurization.py | 50 +++++----- azimuth/features/microhomology.py | 158 +++++++++++++++--------------- azimuth/load_data.py | 18 ++-- azimuth/local_multiprocessing.py | 6 +- azimuth/metrics.py | 122 +++++++++++------------ azimuth/model_comparison.py | 24 ++--- azimuth/models/DNN.py | 6 +- azimuth/models/GP.py | 4 +- azimuth/models/baselines.py | 8 +- azimuth/models/ensembles.py | 6 +- azimuth/models/regression.py | 50 +++++----- azimuth/predict.py | 46 ++++----- azimuth/util.py | 116 +++++++++++----------- setup.py | 7 +- 17 files changed, 327 insertions(+), 324 deletions(-) diff --git a/README.md b/README.md index 58e4ab4..b06e89e 100755 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ percent_peptides = np.array([0.18, 0.18, 0.35]) predictions = azimuth.model_comparison.predict(sequences, amino_acid_cut_positions, percent_peptides) for i, prediction in enumerate(predictions): - print sequences[i], prediction + print("%s %f" % (sequences[i], prediction)) ``` Output: @@ -87,5 +87,3 @@ Sometimes the pre-computed .pickle files in the saved_models directory are incom #### Contacting us You can submit bug reports using the GitHub issue tracker. If you have any other questions, please contact us at crispr@lists.research.microsoft.com. 
- - diff --git a/azimuth/cluster_job.py b/azimuth/cluster_job.py index c799ce8..259f90d 100755 --- a/azimuth/cluster_job.py +++ b/azimuth/cluster_job.py @@ -5,7 +5,7 @@ # just execute this file in python to create the xml file for the cluster (in ./analysis/cluster), which one then can manually submit through the HPC Job Manager def cluster_setup(i, python_path, home, t, work_dir, tempdir): - t.work_directory = work_dir + t.work_directory = work_dir #t.std_out_file_path = r'cluster\log\cluster_out%d.txt' % i #t.std_err_file_path = r'cluster\log\cluster_err%d.txt' % i t.std_out_file_path = tempdir + r'\out%d.txt' % i @@ -14,10 +14,10 @@ def cluster_setup(i, python_path, home, t, work_dir, tempdir): #t.std_err_file_path = r'err%d.txt' % i #if not os.path.exists(t.std_out_file_path): os.makedirs(t.std_out_file_path) #if not os.path.exists(t.std_err_file_path): os.makedirs(t.std_err_file_path) - t.environment_variables['PYTHONPATH'] = python_path + t.environment_variables['PYTHONPATH'] = python_path t.environment_variables['HOME'] = home - print "cluster python_path=%s" % python_path + print("cluster python_path=%s" % python_path) def create(user, models, orders, degrees, GP_likelihoods, adaboost_learning_rates=None, adaboost_num_estimators=None, adaboost_max_depths=None, adaboost_CV=False, exp_name=None, learn_options=None): job = WinHPCJob() @@ -42,18 +42,18 @@ def create(user, models, orders, degrees, GP_likelihoods, adaboost_learning_rate home = r"\\fusi1\CLUSTER_HOME" elif job.username == 'REDMOND\\jennl': remote_dir = r"\\GCR\Scratch\RR1\jennl\CRISPR" - work_dir = r'\\jennl2\D$\Source\CRISPR\analysis' + work_dir = r'\\jennl2\D$\Source\CRISPR\analysis' python = r'\\fusi1\crispr\python.exe' python_path = r'\\fusi1\crispr\lib\site-packages\;\\jennl2\D$\Source\CRISPR\analysis' home = r"\\fusi1\CLUSTER_HOME" - # print "workdir=%s" % work_dir - # print "python=%s" % python - # print "python_path=%s" % python_path + # print("workdir=%s" % work_dir) + # print("python=%s" 
% python) + # print("python_path=%s" % python_path) - # generate random dir in results directory + # generate random dir in results directory tempdir = tempfile.mkdtemp(prefix='cluster_experiment_', dir=remote_dir) - print "Created directory: %s" % str(tempdir) + print("Created directory: %s" % str(tempdir)) # dump learn_options with open(tempdir+'/learn_options.pickle', 'wb') as f: diff --git a/azimuth/corrstats.py b/azimuth/corrstats.py index 619157a..3d11b8e 100644 --- a/azimuth/corrstats.py +++ b/azimuth/corrstats.py @@ -113,8 +113,8 @@ def independent_corr(xy, ab, n, n2 = None, twotailed=True, conf_level=0.95, meth else: raise Exception('Wrong method!') -#print dependent_corr(.396, .179, .088, 200, method='steiger') -#print independent_corr(.560, .588, 100, 353, method='fisher') +#print(dependent_corr(.396, .179, .088, 200, method='steiger')) +#print(independent_corr(.560, .588, 100, 353, method='fisher')) -#print dependent_corr(.396, .179, .088, 200, method='zou') -#print independent_corr(.560, .588, 100, 353, method='zou') \ No newline at end of file +#print(dependent_corr(.396, .179, .088, 200, method='zou')) +#print(independent_corr(.560, .588, 100, 353, method='zou')) diff --git a/azimuth/features/featurization.py b/azimuth/features/featurization.py index bb88359..603094b 100755 --- a/azimuth/features/featurization.py +++ b/azimuth/features/featurization.py @@ -21,7 +21,7 @@ def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length assert num_lengths == 1, "should only have sequences of a single length, but found %s: %s" % (num_lengths, str(unique_lengths)) if not quiet: - print "Constructing features..." 
+ print("Constructing features...") t0 = time.time() feature_sets = {} @@ -49,7 +49,7 @@ def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length feature_sets["Percent Peptide <50%"]['Percent Peptide <50%'] = feature_sets["Percent Peptide <50%"].pop("Percent Peptide") if learn_options["include_gene_effect"]: - print "including gene effect" + print("including gene effect") gene_names = Y['Target gene'] enc = sklearn.preprocessing.OneHotEncoder() label_encoder = sklearn.preprocessing.LabelEncoder() @@ -95,7 +95,7 @@ def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length t1 = time.time() if not quiet: - print "\t\tElapsed time for constructing features is %.2f seconds" % (t1-t0) + print("\t\tElapsed time for constructing features is %.2f seconds" % (t1-t0)) check_feature_set(feature_sets) @@ -138,8 +138,8 @@ def NGGX_interaction_feature(data, pam_audit=True): for seq in sequence: if pam_audit and seq[25:27] != "GG": raise Exception("expected GG but found %s" % seq[25:27]) - NX = seq[24]+seq[27] - NX_onehot = nucleotide_features(NX,order=2, feature_type='pos_dependent', max_index_to_use=2, prefix="NGGX") + NX = seq[24]+seq[27] + NX_onehot = nucleotide_features(NX,order=2, feature_type='pos_dependent', max_index_to_use=2, prefix="NGGX") # NX_onehot[:] = np.random.rand(NX_onehot.shape[0]) ##TESTING RANDOM FEATURE feat_NX = pandas.concat([feat_NX, NX_onehot], axis=1) return feat_NX.T @@ -148,7 +148,7 @@ def NGGX_interaction_feature(data, pam_audit=True): def get_all_order_nuc_features(data, feature_sets, learn_options, maxorder, max_index_to_use, prefix="", quiet=False): for order in range(1, maxorder+1): if not quiet: - print "\t\tconstructing order %s features" % order + print("\t\tconstructing order %s features" % order) nuc_features_pd, nuc_features_pi = apply_nucleotide_features(data, order, learn_options["num_proc"], include_pos_independent=True, max_index_to_use=max_index_to_use, prefix=prefix) 
feature_sets['%s_nuc_pd_Order%i' % (prefix, order)] = nuc_features_pd @@ -157,7 +157,7 @@ def get_all_order_nuc_features(data, feature_sets, learn_options, maxorder, max_ check_feature_set(feature_sets) if not quiet: - print "\t\t\t\t\t\t\tdone" + print("\t\t\t\t\t\t\tdone") def countGC(s, length_audit=True): @@ -202,7 +202,7 @@ def organism_feature(data): def get_micro_homology_features(gene_names, learn_options, X): # originally was flipping the guide itself as necessary, but now flipping the gene instead - print "building microhomology features" + print("building microhomology features") feat = pandas.DataFrame(index=X.index) feat["mh_score"] = "" feat["oof_score"] = "" @@ -215,7 +215,7 @@ def get_micro_homology_features(gene_names, learn_options, X): for gene in gene_names.unique(): gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement() guide_inds = np.where(gene_names.values == gene)[0] - print "getting microhomology for all %d guides in gene %s" % (len(guide_inds), gene) + print("getting microhomology for all %d guides in gene %s" % (len(guide_inds), gene)) for j, ps in enumerate(guide_inds): guide_seq = Seq.Seq(X['30mer'][ps]) strand = X['Strand'][ps] @@ -227,18 +227,18 @@ def get_micro_homology_features(gene_names, learn_options, X): gene_seq = gene_seq.reverse_complement() ind = gene_seq.find(guide_seq) #assert ind != -1, "still didn't work" - #print "shouldn't get here" + #print("shouldn't get here") else: - #print "all good" + #print("all good") pass #assert ind != -1, "could not find guide in gene" if ind==-1: - #print "***could not find guide %s for gene %s" % (str(guide_seq), str(gene)) + #print("***could not find guide %s for gene %s" % (str(guide_seq), str(gene))) #if.write(str(gene) + "," + str(guide_seq)) mh_score = 0 oof_score = 0 else: - #print "worked" + #print("worked") assert gene_seq[ind:(ind+len(guide_seq))]==guide_seq, "match not right" left_win = gene_seq[(ind - k_mer_length_left):ind] @@ -258,14 +258,14 @@ def 
get_micro_homology_features(gene_names, learn_options, X): feat.ix[ps,"mh_score"] = mh_score feat.ix[ps,"oof_score"] = oof_score - print "computed microhomology of %s" % (str(gene)) + print("computed microhomology of %s" % (str(gene))) return pandas.DataFrame(feat, dtype='float') def local_gene_seq_features(gene_names, learn_options, X): - print "building local gene sequence features" + print("building local gene sequence features") feat = pandas.DataFrame(index=X.index) feat["gene_left_win"] = "" feat["gene_right_win"] = "" @@ -300,7 +300,7 @@ def local_gene_seq_features(gene_names, learn_options, X): assert len(left_win)==len(right_win), "k_mer_context, %s, is too large" % k_mer_length feat.ix[ps,"gene_left_win"] = left_win.tostring() feat.ix[ps,"gene_right_win"] = right_win.tostring() - print "featurizing local context of %s" % (gene) + print("featurizing local context of %s" % (gene)) feature_sets = {} get_all_order_nuc_features(feat["gene_left_win"], feature_sets, learn_options, learn_options["order"], max_index_to_use=sys.maxint, prefix="gene_left_win") @@ -341,11 +341,11 @@ def gene_guide_feature(Y, X, learn_options): gene_file = r"..\data\gene_seq_feat_V%s_km%s.ord%s.pickle" % (learn_options['V'], learn_options['include_gene_guide_feature'], learn_options['order']) if False: #os.path.isfile(gene_file): #while debugging, comment out - print "loading local gene seq feats from file %s" % gene_file + print("loading local gene seq feats from file %s" % gene_file) with open(gene_file, "rb") as f: feature_sets = pickle.load(f) else: feature_sets = local_gene_seq_features(Y['Target gene'], learn_options, X) - print "writing local gene seq feats to file %s" % gene_file + print("writing local gene seq feats to file %s" % gene_file) with open(gene_file, "wb") as f: pickle.dump(feature_sets, f) return feature_sets @@ -383,11 +383,11 @@ def Tm_feature(data, pam_audit=True, learn_options=None): featarray[i,2] = Tm.Tm_staluc(seq[segments[1][0]:segments[1][1]], rna=rna) 
#8-mer featarray[i,3] = Tm.Tm_staluc(seq[segments[2][0]:segments[2][1]], rna=rna) #5-mer - #print "CRISPR" + #print("CRISPR") #for d in range(4): - # print featarray[i,d] + # print(featarray[i,d]) #import ipdb; ipdb.set_trace() - + feat = pandas.DataFrame(featarray, index=data.index, columns=["Tm global_%s" % rna, "5mer_end_%s" %rna, "8mer_middle_%s" %rna, "5mer_start_%s" %rna]) @@ -442,7 +442,7 @@ def nucleotide_features(s, order, max_index_to_use, prefix="", feature_type='all ''' assert feature_type in ['all', 'pos_independent', 'pos_dependent'] if max_index_to_use <= len(s): - #print "WARNING: trimming max_index_to use down to length of string=%s" % len(s) + #print("WARNING: trimming max_index_to use down to length of string=%s" % len(s)) max_index_to_use = len(s) if max_index_to_use is not None: @@ -493,7 +493,7 @@ def nucleotide_features(s, order, max_index_to_use, prefix="", feature_type='all return res res = pandas.Series(features_pos_dependent, index=index_dependent) - assert not np.any(np.isnan(res.values)) + assert not np.any(np.isnan(res.values)) return res def nucleotide_features_dictionary(prefix=''): @@ -537,7 +537,7 @@ def normalize_feature_sets(feature_sets): zero-mean, unit-variance each feature within each set ''' - print "Normalizing features..." 
+ print("Normalizing features...") t1 = time.time() new_feature_sets = {} @@ -547,6 +547,6 @@ def normalize_feature_sets(feature_sets): raise Exception("found Nan feature values in set=%s" % set) assert new_feature_sets[set].shape[1] > 0, "0 columns of features" t2 = time.time() - print "\t\tElapsed time for normalizing features is %.2f seconds" % (t2-t1) + print("\t\tElapsed time for normalizing features is %.2f seconds" % (t2-t1)) return new_feature_sets diff --git a/azimuth/features/microhomology.py b/azimuth/features/microhomology.py index 45de681..3fa27d8 100755 --- a/azimuth/features/microhomology.py +++ b/azimuth/features/microhomology.py @@ -1,102 +1,102 @@ -#Supplementary Figure 3 | Source code for assigning a score to a hypothetical deletion -#pattern associated with microhomology +#Supplementary Figure 3 | Source code for assigning a score to a hypothetical deletion +#pattern associated with microhomology # ------------------------------------------ # comes from the Supplementary Info of the paper, in pdf form, copied here, but refactored to make a function # rather than to write it to file # also see their web server version: http://www.rgenome.net/mich-calculator/ where they say: # Insert one or more query sequences (A, G, T, C only) flanking the same length at a cleavage site (100bp or less, 60~80bp recommended). -from math import exp -from re import findall - +from math import exp +from re import findall + def compute_score(seq, tmpfile1="1.before removing duplication.txt", tmpfile2="2.all microhomology patterns.txt", verbose=False): - length_weight=20.0 - left=30 # Insert the position expected to be broken. 
- right=len(seq)-int(left) - #print 'length of seq = '+str(len(seq)) - - file_temp=open(tmpfile1, "w") - for k in range(2,left)[::-1]: - for j in range(left,left+right-k+1): - for i in range(0,left-k+1): - if seq[i:i+k]==seq[j:j+k]: - length=j-i - file_temp.write(seq[i:i+k]+'\t'+str(i)+'\t'+str(i+k)+'\t'+str(j)+'\t'+str(j+k)+'\t'+str(length)+'\n') - file_temp.close() - - ### After searching out all microhomology patterns, duplication should be removed!! - f1=open(tmpfile1, "r") - s1=f1.read() - - f2=open(tmpfile2, "w") #After removing duplication - f2.write(seq+'\t'+'microhomology\t'+'deletion length\t'+'score of a pattern\n') - - if s1!="": - list_f1=s1.strip().split('\n') - sum_score_3=0 - sum_score_not_3=0 - - for i in range(len(list_f1)): - n=0 - score_3=0 - score_not_3=0 - line=list_f1[i].split('\t') - scrap=line[0] - left_start=int(line[1]) - left_end=int(line[2]) - right_start=int(line[3]) - right_end=int(line[4]) - length=int(line[5]) - - for j in range(i): - line_ref=list_f1[j].split('\t') - left_start_ref=int(line_ref[1]) - left_end_ref=int(line_ref[2]) - right_start_ref=int(line_ref[3]) - right_end_ref=int(line_ref[4]) - - if (left_start >= left_start_ref) and (left_end <= left_end_ref) and (right_start >= right_start_ref) and (right_end <= right_end_ref): - if (left_start - left_start_ref)==(right_start - right_start_ref) and (left_end - left_end_ref)==(right_end - right_end_ref): - n+=1 - else: pass - - if n == 0: - if (length % 3)==0: - length_factor = round(1/exp((length)/(length_weight)),3) - num_GC=len(findall('G',scrap))+len(findall('C',scrap)) - score_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) - - elif (length % 3)!=0: - length_factor = round(1/exp((length)/(length_weight)),3) - num_GC=len(findall('G',scrap))+len(findall('C',scrap)) - score_not_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) - - 
f2.write(seq[0:left_end]+'-'*length+seq[right_end:]+'\t'+scrap+'\t'+str(length)+'\t'+str(100*length_factor*((len(scrap)-num_GC)+(num_GC*2)))+'\n') - sum_score_3+=score_3 - sum_score_not_3+=score_not_3 - + length_weight=20.0 + left=30 # Insert the position expected to be broken. + right=len(seq)-int(left) + #print('length of seq = '+str(len(seq))) + + file_temp=open(tmpfile1, "w") + for k in range(2,left)[::-1]: + for j in range(left,left+right-k+1): + for i in range(0,left-k+1): + if seq[i:i+k]==seq[j:j+k]: + length=j-i + file_temp.write(seq[i:i+k]+'\t'+str(i)+'\t'+str(i+k)+'\t'+str(j)+'\t'+str(j+k)+'\t'+str(length)+'\n') + file_temp.close() + + ### After searching out all microhomology patterns, duplication should be removed!! + f1=open(tmpfile1, "r") + s1=f1.read() + + f2=open(tmpfile2, "w") #After removing duplication + f2.write(seq+'\t'+'microhomology\t'+'deletion length\t'+'score of a pattern\n') + + if s1!="": + list_f1=s1.strip().split('\n') + sum_score_3=0 + sum_score_not_3=0 + + for i in range(len(list_f1)): + n=0 + score_3=0 + score_not_3=0 + line=list_f1[i].split('\t') + scrap=line[0] + left_start=int(line[1]) + left_end=int(line[2]) + right_start=int(line[3]) + right_end=int(line[4]) + length=int(line[5]) + + for j in range(i): + line_ref=list_f1[j].split('\t') + left_start_ref=int(line_ref[1]) + left_end_ref=int(line_ref[2]) + right_start_ref=int(line_ref[3]) + right_end_ref=int(line_ref[4]) + + if (left_start >= left_start_ref) and (left_end <= left_end_ref) and (right_start >= right_start_ref) and (right_end <= right_end_ref): + if (left_start - left_start_ref)==(right_start - right_start_ref) and (left_end - left_end_ref)==(right_end - right_end_ref): + n+=1 + else: pass + + if n == 0: + if (length % 3)==0: + length_factor = round(1/exp((length)/(length_weight)),3) + num_GC=len(findall('G',scrap))+len(findall('C',scrap)) + score_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) + + elif (length % 3)!=0: + length_factor = 
round(1/exp((length)/(length_weight)),3) + num_GC=len(findall('G',scrap))+len(findall('C',scrap)) + score_not_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) + + f2.write(seq[0:left_end]+'-'*length+seq[right_end:]+'\t'+scrap+'\t'+str(length)+'\t'+str(100*length_factor*((len(scrap)-num_GC)+(num_GC*2)))+'\n') + sum_score_3+=score_3 + sum_score_not_3+=score_not_3 + mh_score = sum_score_3+sum_score_not_3 oof_score = (sum_score_not_3)*100/(sum_score_3+sum_score_not_3) if verbose: - print 'Microhomology score = ' + str(mh_score) - print 'Out-of-frame score = ' + str(oof_score) - f1.close() + print('Microhomology score = ' + str(mh_score)) + print('Out-of-frame score = ' + str(oof_score)) + f1.close() f2.close() return mh_score, oof_score if __name__ == '__main__': - seq='GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG' # The length of sequence is recommend within 60~80 bases. + seq='GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG' # The length of sequence is recommend within 60~80 bases. tmpfile1 = "1.before removing duplication.txt" tmpfile2 = "2.all microhomology patterns.txt" - + mh_score, oof_score = compute_score(seq, tmpfile1=tmpfile1, tmpfile2=tmpfile2, verbose=True) - # The row of output file is consist of (full sequence, microhomology scrap, deletion length, score of pattern). + # The row of output file is consist of (full sequence, microhomology scrap, deletion length, score of pattern). 
#correct output is #Microhomology score = 4662.9 #Out-of-frame score = 50.7473889639 - #GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG - - print seq \ No newline at end of file + #GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG + + print(seq) diff --git a/azimuth/load_data.py b/azimuth/load_data.py index d7245a0..06781c7 100755 --- a/azimuth/load_data.py +++ b/azimuth/load_data.py @@ -10,7 +10,7 @@ def from_custom_file(data_file, learn_options): # use semantics of when we load V2 data - print "Loading inputs to predict from %s" % data_file + print("Loading inputs to predict from %s" % data_file) data = pandas.read_csv(data_file) mandatory_columns = ['30mer', 'Target gene', 'Percent Peptide', 'Amino Acid Cut position'] @@ -37,7 +37,7 @@ def from_custom_file(data_file, learn_options): def from_file(data_file, learn_options, data_file2=None, data_file3=None): if learn_options["V"] == 1: # from Nature Biotech paper - print "loading V%d data" % learn_options["V"] + print("loading V%d data" % learn_options["V"]) assert not learn_options["weighted"] is not None, "not supported for V1 data" annotations, gene_position, target_genes, Xdf, Y = read_V1_data(data_file, learn_options) @@ -152,12 +152,12 @@ def read_V1_data(data_file, learn_options, AML_file=cur_dir + "/data/V1_suppl_da assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)" if learn_options is not None and learn_options["flipV1target"]: - print "************************************************************************" - print "*****************MATCHING DOENCH CODE (DEBUG MODE)**********************" - print "************************************************************************" + print("************************************************************************") + print("*****************MATCHING DOENCH CODE (DEBUG MODE)**********************") + 
print("************************************************************************") # normally it is: Y['average threshold'] = Y['average rank'] > 0.8, where 1s are good guides, 0s are not Y['average threshold'] = Y['average rank'] < 0.2 # 1s are bad guides - print "press c to continue" + print("press c to continue") import ipdb ipdb.set_trace() @@ -272,7 +272,7 @@ def read_V2_data(data_file, learn_options=None, verbose=True): count = count + Xtmp.shape[0] Xdf = pandas.concat([Xdf, Xtmp], axis=0) if verbose: - print "Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count) + print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count)) # create new index that includes the drug Xdf = Xdf.set_index('drug', append=True) @@ -335,7 +335,7 @@ def read_V2_data(data_file, learn_options=None, verbose=True): gene_position = util.impute_gene_position(gene_position) if learn_options is not None and learn_options["weighted"] == "variance": - print "computing weights from replicate variance..." + print("computing weights from replicate variance...") # compute the variance across replicates so can use it as a weight data = pandas.read_excel(data_file, sheetname="Normalized", skiprows=range(0, 6+1), index_col=[0, 4]) data.index.names = ["Sequence", "Target gene"] @@ -359,7 +359,7 @@ def read_V2_data(data_file, learn_options=None, verbose=True): orig_index = Y.index.copy() Y = pandas.merge(Y, pandas.DataFrame(variance), how="inner", left_index=True, right_index=True) Y = Y.ix[orig_index] - print "done." 
+ print("done.") # Make sure to keep this check last in this function assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)" diff --git a/azimuth/local_multiprocessing.py b/azimuth/local_multiprocessing.py index 3f4aa15..4d3ad10 100755 --- a/azimuth/local_multiprocessing.py +++ b/azimuth/local_multiprocessing.py @@ -21,10 +21,10 @@ def configure(num_jobs=8, TEST=False, subtract=0, num_proc=None, num_thread_per_ try: import mkl - mkl.set_num_threads(num_thread_per_proc) + mkl.set_num_threads(num_thread_per_proc) except ImportError: - print "MKL not available, so I'm not adjusting the number of threads" + print("MKL not available, so I'm not adjusting the number of threads") - print "Launching %d jobs with %d MKL threads each" % (num_jobs, num_thread_per_proc) + print("Launching %d jobs with %d MKL threads each" % (num_jobs, num_thread_per_proc)) return num_jobs diff --git a/azimuth/metrics.py b/azimuth/metrics.py index ef95fba..50e7e31 100755 --- a/azimuth/metrics.py +++ b/azimuth/metrics.py @@ -255,26 +255,26 @@ def ndcg_at_k_ties(labels, predictions, k, method=0, normalize_from_below_too=Fa if isinstance(predictions, list): predictions = np.array(predictions) - + assert len(labels.shape)==1 or np.min(labels.shape)==1, "should be 1D array or equivalent" assert len(predictions.shape)==1 or np.min(predictions.shape)==1, "should be 1D array or equivalent" - + labels = labels.flatten() predictions = predictions.flatten() assert np.all(labels.shape == predictions.shape), "labels and predictions should have the same shape" - + if k is None: k = len(labels) labels = labels.copy() dcg = dcg_at_k_ties(labels, predictions, k, method=method, theta=theta) - + dcg_max = dcg_at_k_ties(labels, labels, k, method, theta=theta) # NOTE: I have checked that dcg_at_k_ties and dcg_at_k match when there are no ties, or ties in the labels - + if normalize_from_below_too: dcg_min = 
dcg_at_k_ties(np.sort(labels)[::-1], np.sort(predictions), k, method, theta=theta) else: @@ -282,9 +282,9 @@ def ndcg_at_k_ties(labels, predictions, k, method=0, normalize_from_below_too=Fa numerator = (dcg - dcg_min) assert numerator > -1e-5 numerator = np.max((0, numerator)) - ndcg = numerator / (dcg_max - dcg_min) + ndcg = numerator / (dcg_max - dcg_min) assert ndcg <= 1.0 and ndcg >= 0.0, "ndcg=%f should be in [0,1]" % ndcg - if not dcg_max: + if not dcg_max: ndcg = 0. return ndcg @@ -357,7 +357,7 @@ def gain(label, method): dcg = dcg_helper(discount_factors, gain, k, labels, method, predictions) assert not np.isnan(dcg), "found nan dcg" - + return dcg def get_discount_factors(num_labels, discount='log2', theta=None): @@ -411,13 +411,13 @@ def ndcg_bootstrap_test(preds1, preds2, true_labels, num_bootstrap, method, k, n return pv def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, normalize_from_below_too, theta=None, balance_zeros=True): - + # pVal is the probability that we would observe as big an AUC diff as we - # did if the ROC curves were drawn from the null hypothesis (which is that + # did if the ROC curves were drawn from the null hypothesis (which is that # one model does not perform better than the other) # - # null hypothesis is that the prediction ranking are the same, so we exchange a random - # number of them with each other. + # null hypothesis is that the prediction ranking are the same, so we exchange a random + # number of them with each other. 
# # see ndcg_at_k_ties for all but the first four parameters # @@ -425,7 +425,7 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm # # this is a two-sided test, but since it is a symmetric null distribution, one should # be able to divide the p-value by 2 to get the one-sided version (but think this through before using) - + if isinstance(preds1, list): preds1 = np.array(preds1) else: @@ -453,19 +453,19 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm ranks1 = sp.stats.mstats.rankdata(preds1) ranks2 = sp.stats.mstats.rankdata(preds2) - + ndcg1 = ndcg_at_k_ties(true_labels, ranks1, k=k, method=method, normalize_from_below_too=normalize_from_below_too, theta=theta) ndcg2 = ndcg_at_k_ties(true_labels, ranks2, k=k, method=method, normalize_from_below_too=normalize_from_below_too, theta=theta) real_ndcg_diff = {} perm_ndcg_diff = {} - real_ndcg_diff = np.abs(ndcg1 - ndcg2) + real_ndcg_diff = np.abs(ndcg1 - ndcg2) perm_ndcg_diff = np.nan*np.zeros(nperm) - + if False:#np.all(preds1 == preds2): - pval[theta] = 1.0 - else: + pval[theta] = 1.0 + else: zero_ind = true_labels == 0 assert np.sum(zero_ind) < len(zero_ind), "balancing assumes there are more zeros than ones" @@ -485,8 +485,8 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm tmp_diff = np.abs(ndcg1_perm[theta] - ndcg2_perm[theta]) perm_ndcg_diff[theta][t] = tmp_diff - pval = {} - + pval = {} + num_stat_greater = np.max((((perm_ndcg_diff > real_ndcg_diff).sum() + 1), 1.0)) pval = num_stat_greater / nperm @@ -495,7 +495,7 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm plt.plot(np.sort(perm_ndcg_diff), '.') plt.plot(real_ndcg_diff*np.ones(perm_ndcg_diff.shape), 'k-') plt.show() - + return pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 if __name__ == "__main__": @@ -506,35 +506,35 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm simulated_data 
= True permute_real_data = True - + T = 1000 allp = np.nan*np.ones(T) nperm = 100 - #method = 4; normalize_from_below_too = True; - + #method = 4; normalize_from_below_too = True; + #theta_range = np.logspace(np.log10(0.01), np.log10(1.0), 3) # Nicolo uses 10, so I grab the extremes and middle #theta_range = np.array([0.01]) - #weights = np.logspace(np.log10(0.0001), np.log10(10), 3); + #weights = np.logspace(np.log10(0.0001), np.log10(10), 3); #weights = np.array([100.0]) weights = np.array([0.001]) theta_range = weights# just to make life easier - + # only for simulated data N = 100 frac_zeros = 0 - + k = None allp = np.nan*np.zeros((len(theta_range) + 1, T)) if not simulated_data: - print "loading up saved data..." # two-fold CV data from CRISPR off-target GUIDE-SEQ + print("loading up saved data...") # two-fold CV data from CRISPR off-target GUIDE-SEQ with open(r'\\nerds5\kevin\from_nicolo\gs.pickle','rb') as f: predictions, truth_all = pickle.load(f) - print "done." + print("done.") N = len(truth_all[0]) - + for t in range(T): # totally simulated @@ -544,17 +544,17 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm truth[zero_ind] = 0 pred1 = np.random.rand(N) pred2 = np.random.rand(N) - else: + else: fold = 0 truth = truth_all[fold] pred1 = predictions["CFD"][fold] pred2 = predictions["product"][fold] - + if permute_real_data: truth = np.random.permutation(truth) - t0 = time.time() - #pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred2, truth, nperm, method, k, normalize_from_below_too, theta_range=theta_range) + t0 = time.time() + #pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred2, truth, nperm, method, k, normalize_from_below_too, theta_range=theta_range) for i, w in enumerate(weights): weights_array = truth.copy() weights_array += w @@ -562,26 +562,26 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm #corr0 = 
elevation.metrics.spearman_weighted(truth, pred1, w=weights_array) #corr1 = elevation.metrics.spearman_weighted(truth, pred2, w=weights_array) #corr01 = elevation.metrics.spearman_weighted(pred1, pred2, w=weights_array) - #n0 = len(truth) + #n0 = len(truth) #t2, pvaltmp = corrstats.dependent_corr(corr0, corr1, corr01, n0, twotailed=True, method="steiger") pvaltmp, real_corr_diff, perm_corr_diff, corr1, corr2 = elevation.spearman_weighted_swap_perm_test(pred1, pred2, truth, nperm, weights_array) - + allp[i, t] = pvaltmp t1 = time.time() #for i, theta in enumerate(theta_range.tolist() + ["all"]): - # print "%d, theta=%s) ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f, elapsed time=%f minutes, smallest_p=%f" % (t, str(theta), ndcg1[theta], ndcg2[theta], real_ndcg_diff[theta], pval[theta], (t1-t0)/60, 1.0/nperm) + # print("%d, theta=%s) ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f, elapsed time=%f minutes, smallest_p=%f" % (t, str(theta), ndcg1[theta], ndcg2[theta], real_ndcg_diff[theta], pval[theta], (t1-t0)/60, 1.0/nperm)) # allp[i, t] = pval[theta] - #print "---------------" - + #print("---------------") + #for i, theta in enumerate(theta_range.tolist() + ["all"]): for i, theta in enumerate(theta_range.tolist()): #mytitle = "Norm. hist p-values nDCG\n %d null samples, w %d perm and N=%d, theta=%s" % (T, nperm, N, str(theta)) mytitle = "Norm. 
hist p-values Steiger w weighted Spearman\n %d null samples, N=%d, weight=%s" % (T, N, str(theta)) ut.qqplotp(allp[i,:], dohist=True, numbins=10, figsize=[6,6], title=mytitle, markersize=5) plt.show() - + #save_tmp_results = r'D:\Source\CRISPR\elevation\pickles\tmp.ndcg.stat.calibration.p' #pickle.dump([theta_range, allp, pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2], open(save_tmp_results, "wb" )) #[theta_range, allp, pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2] = pickle.load(open(save_tmp_results, "rb" )) @@ -606,10 +606,10 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm # # using our function # dcg3 = dcg_at_k_ties(labels,predictions,k) - # print "%f, %f, %f" % (dcg1, dcg2, dcg3) + # print("%f, %f, %f" % (dcg1, dcg2, dcg3)) # assert(np.abs(dcg1 - dcg2) < 1e-8) # assert(np.abs(dcg2 - dcg3) < 1e-8) - # print "check out ok for case with all ties in predictions" + # print("check out ok for case with all ties in predictions") truth = np.array([3, 4, 2, 1, 0, 0, 0]) pred1 = np.array([3, 4, 2, 1, 0, 0, 0]) @@ -626,29 +626,29 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm k = len(pred3) pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred2, truth, nperm, method, k, normalize_from_below_too, theta=theta) - print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval) - - pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred1, truth, nperm, method, k, normalize_from_below_too, theta=theta) - print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval) + print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)) + + pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred1, truth, nperm, method, k, normalize_from_below_too, theta=theta) + print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, 
real_ndcg_diff, pval)) - pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred4, truth, nperm, method, k, normalize_from_below_too, theta=theta) - print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval) + pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred4, truth, nperm, method, k, normalize_from_below_too, theta=theta) + print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)) - pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred5, truth, nperm, method, k, normalize_from_below_too, theta=theta) - print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval) + pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred5, truth, nperm, method, k, normalize_from_below_too, theta=theta) + print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)) import ipdb; ipdb.set_trace() - #print ndcg_at_k_ties(truth, truth, k, method=0, normalize_from_below_too=True) - #print ndcg_at_k_ties(truth, pred2, k, method=0, normalize_from_below_too=True) - #print ndcg_at_k_ties(truth, pred3, k, method=0, normalize_from_below_too=True) - #print ndcg_at_k_ties(truth3, pred3, k, method=3, normalize_from_below_too=True) - print ndcg_at_k_ties(truth4, pred2, k, method=3, normalize_from_below_too=True) - - print ndcg_alt(truth[np.argsort(pred2)[::-1]], 5) - print ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=1) - print ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=0) + #print(ndcg_at_k_ties(truth, truth, k, method=0, normalize_from_below_too=True)) + #print(ndcg_at_k_ties(truth, pred2, k, method=0, normalize_from_below_too=True)) + #print(ndcg_at_k_ties(truth, pred3, k, method=0, normalize_from_below_too=True)) + #print(ndcg_at_k_ties(truth3, pred3, k, method=3, normalize_from_below_too=True)) + print(ndcg_at_k_ties(truth4, 
pred2, k, method=3, normalize_from_below_too=True)) + + print(ndcg_alt(truth[np.argsort(pred2)[::-1]], 5)) + print(ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=1)) + print(ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=0)) - print ndcg_at_k_ties(truth, pred2, 5, method=1) - print ndcg_at_k_ties(truth, pred2, 5, method=0) + print(ndcg_at_k_ties(truth, pred2, 5, method=1)) + print(ndcg_at_k_ties(truth, pred2, 5, method=0)) diff --git a/azimuth/model_comparison.py b/azimuth/model_comparison.py index c5fbf29..d3b0ee8 100755 --- a/azimuth/model_comparison.py +++ b/azimuth/model_comparison.py @@ -266,7 +266,7 @@ def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=Tru learn_options["order"] = 1 if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True: - print "WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)" + print("WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)") for i in range(Xdf.shape[0]): Xdf['30mer'].iloc[i] = azimuth.util.convert_to_thirty_one(Xdf.iloc[i]["30mer"], Xdf.index.values[i][1], Xdf.iloc[i]["Strand"]) # to_keep = Xdf['30mer'].isnull() == False @@ -307,7 +307,7 @@ def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_ "logregL1": "logregL1", "sgrna_from_doench":"sgrna_from_doench", 'SVC': 'SVC', 'xu_et_al': 'xu_et_al'} if not CV: - print "Received option CV=False, so I'm training using all of the data" + print("Received option CV=False, so I'm training using all of the data") assert len(learn_options_set.keys()) == 1, "when CV is False, only 1 set of learn options is allowed" assert len(models) == 1, "when CV is False, only 1 model is allowed" @@ -320,10 +320,10 @@ def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_ # models requiring explicit featurization if model in 
feat_models_short.keys(): for order in orders: - print "running %s, order %d for %s" % (model, order, learn_options_str) + print("running %s, order %d for %s" % (model, order, learn_options_str)) Y, feature_sets, target_genes, learn_options, num_proc = setup_function(test=test, order=order, learn_options=partial_learn_opt, pam_audit=pam_audit, length_audit=length_audit) # TODO precompute features for all orders, as this is repated for each model - + if model == 'L1': learn_options_model = L1_setup(copy.deepcopy(learn_options), set_target_fn=set_target_fn) elif model == 'L2': @@ -359,7 +359,7 @@ def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_ # if the model doesn't require explicit featurization else: assert setup_fn==setup, "not yet modified to handle this" - print "running %s for %s" % (model, learn_options_str) + print("running %s for %s" % (model, learn_options_str)) Y, feature_sets, target_genes, learn_options, num_proc = setup(test=test, order=1, learn_options=partial_learn_opt, pam_audit=pam_audit, length_audit=length_audit) if model == 'mean': learn_options_model = mean_setup(copy.deepcopy(learn_options)) @@ -392,12 +392,12 @@ def pickle_runner_results(exp_name, results, all_learn_options, relpath="/../" + dname = os.path.dirname(abspath) + relpath if not os.path.exists(dname): os.makedirs(dname) - print "Created directory: %s" % str(dname) + print("Created directory: %s" % str(dname)) if exp_name is None: exp_name = results.keys()[0] myfile = dname+'/'+ exp_name + '.pickle' with open(myfile, 'wb') as f: - print "writing results to %s" % myfile + print("writing results to %s" % myfile) pickle.dump((results, all_learn_options), f, -1) def runner(models, learn_options, GP_likelihoods=None, orders=None, WD_kernel_degrees=None, where='local', cluster_user='fusi', cluster='RR1-N13-09-H44', test=False, exp_name = None, **kwargs): @@ -550,7 +550,7 @@ def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None, 
model, learn_options = pickle.load(f) else: model, learn_options = model - + learn_options["V"] = 2 learn_options = override_learn_options(learn_options_override, learn_options) @@ -567,12 +567,12 @@ def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None, feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit=pam_audit, length_audit=length_audit) inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets) - - #print "CRISPR" + + #print("CRISPR") #pandas.DataFrame(inputs).to_csv("CRISPR.inputs.test.csv") #import ipdb; ipdb.set_trace() - # call to scikit-learn, returns a vector of predicted values + # call to scikit-learn, returns a vector of predicted values preds = model.predict(inputs) # also check that predictions are not 0/1 from a classifier.predict() (instead of predict_proba() or decision_function()) @@ -609,7 +609,7 @@ def write_results(predictions, file_to_predict): data = pandas.read_csv(file_to_predict) data['predictions'] = predictions data.to_csv(newfile) - print "wrote results to %s" % newfile + print("wrote results to %s" % newfile) return data, newfile if __name__ == '__main__': diff --git a/azimuth/models/DNN.py b/azimuth/models/DNN.py index c01fe1a..eaa44b1 100755 --- a/azimuth/models/DNN.py +++ b/azimuth/models/DNN.py @@ -64,11 +64,11 @@ def DNN_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_optio if best_score is None or accuracies[i, j] > best_score: best_score = accuracies[i, j] best_model = copy.deepcopy(e) - print "DNN with %d hidden layers and %d units, accuracy: %.4f *" % (hl, nu, accuracies[i,j]) + print("DNN with %d hidden layers and %d units, accuracy: %.4f *" % (hl, nu, accuracies[i,j])) else: - print "DNN with %d hidden layers and %d units, accuracy: %.4f" % (hl, nu, accuracies[i,j]) + print("DNN with %d hidden layers and %d units, accuracy: %.4f" % (hl, nu, accuracies[i,j])) best_model.run((X_train, y_train), (X_test, 
y_test)) y_pred = best_model.network.predict(X[test]) - return y_pred, None \ No newline at end of file + return y_pred, None diff --git a/azimuth/models/GP.py b/azimuth/models/GP.py index 7d53520..69c19c0 100755 --- a/azimuth/models/GP.py +++ b/azimuth/models/GP.py @@ -100,8 +100,8 @@ def gp_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_o plt.figure('kernel') plt.title('kernel') plt.imshow(m.kern.K(X,X)) - print m - print "%.3f variance explained" % (m.Gaussian_noise.variance/y[train].var()) + print(m) + print("%.3f variance explained" % (m.Gaussian_noise.variance/y[train].var())) import ipdb; ipdb.set_trace() plt.close('all') else: diff --git a/azimuth/models/baselines.py b/azimuth/models/baselines.py index 0ad1512..4ee1690 100755 --- a/azimuth/models/baselines.py +++ b/azimuth/models/baselines.py @@ -1,3 +1,4 @@ +from __future__ import print_function import numpy as np import sklearn from sklearn.svm import LinearSVC @@ -62,11 +63,11 @@ def doench_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op assert np.nan not in tpr, "found nan tpr" roc_auc = sklearn.metrics.auc(fpr, tpr) if verbose: - print j, i, roc_auc + print(j, i, roc_auc) cv_results[j][i] = roc_auc best_penalty = penalty[np.argmax(np.mean(cv_results, axis=0))] - print "best AUC for penalty: ", np.median(cv_results, axis=0) + print("best AUC for penalty: ", np.median(cv_results, axis=0)) clf = LinearSVC(penalty='l1', C=best_penalty, dual=False, class_weight=auto_class_weight) clf.fit(X[train], y_bin[train].flatten()) non_zero_coeff = (clf.coef_ != 0.0) @@ -92,6 +93,3 @@ def SVC_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_optio #y_pred = clf.predict(X[test])[:, None] # this returns 0/1 y_pred = clf.decision_function(X[test])[:, None] return y_pred, clf - - - diff --git a/azimuth/models/ensembles.py b/azimuth/models/ensembles.py index 739c4e0..424e623 100755 --- a/azimuth/models/ensembles.py +++ b/azimuth/models/ensembles.py @@ -65,7 
+65,7 @@ def adaboost_scoring_bo(params): 'max_features': hp.uniform('max_features', 0.05, 1.0)} best = fmin(adaboost_scoring_bo, space, algo=tpe.suggest, max_evals=50, verbose=1) - print best + print(best) clf = en.GradientBoostingRegressor(n_estimators=learn_options['adaboost_n_estimators'], learning_rate=best['learning_rate'], max_depth=best['max_depth'], @@ -77,7 +77,7 @@ def adaboost_scoring_bo(params): assert not classification, "need to tweak code below to do classificaton, as above" n_jobs = 20 - print "Adaboost with GridSearch" + print("Adaboost with GridSearch") from sklearn.grid_search import GridSearchCV param_grid = {'learning_rate': [0.1, 0.05, 0.01], 'max_depth': [4, 5, 6, 7], @@ -101,7 +101,7 @@ def adaboost_scoring_bo(params): est = en.GradientBoostingRegressor(loss=learn_options['adaboost_loss'], random_state=learn_options['seed'])#, n_estimators=learn_options['adaboost_n_estimators']) clf = GridSearchCV(est, param_grid, n_jobs=n_jobs, verbose=1, cv=cv, scoring=spearman_scoring, iid=False) clf.fit(X[train], y[train].flatten()) - print clf.best_params_ + print(clf.best_params_) else: raise Exception("if using adaboost_CV then need to specify grid (grid search) or bo (bayesian optimization)") diff --git a/azimuth/models/regression.py b/azimuth/models/regression.py index c68e682..df9f2ae 100755 --- a/azimuth/models/regression.py +++ b/azimuth/models/regression.py @@ -44,23 +44,23 @@ def logreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op assert len(np.unique(y)) <= 2, "if using logreg need binary targets" assert learn_options["weighted"] is None, "cannot do weighted Log reg" assert learn_options['feature_select'] is False, "cannot do feature selection yet in logistic regression--see linreg_on_fold to implement" - + cv, n_folds = set_up_inner_folds(learn_options, y_all.iloc[train]) assert learn_options['penalty'] == "L1" or learn_options['penalty'] == "L2", "can only use L1 or L2 with logistic regression" - + tol = 
0.00001#0.0001 - + performance = np.zeros((len(learn_options["alpha"]), 1)) # degenerate_pred = np.zeros((len(learn_options["alpha"]))) for train_inner, test_inner in cv: for i, alpha in enumerate(learn_options["alpha"]): clf = sklearn.linear_model.LogisticRegression(penalty=learn_options['penalty'].lower(), dual=False, fit_intercept=learn_options["fit_intercept"], class_weight=learn_options["class_weight"], tol=tol, C=1.0/alpha) - + clf.fit(X[train][train_inner], y[train][train_inner].flatten()) #tmp_pred = clf.predict(X[train][test_inner]) tmp_pred = clf.predict_proba(X[train][test_inner])[:,1] - + if learn_options["training_metric"] == "AUC": fpr, tpr, _ = roc_curve(y_all[learn_options["ground_truth_label"]][train][test_inner], tmp_pred) assert ~np.any(np.isnan(fpr)), "found nan fpr" @@ -85,32 +85,32 @@ def logreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op if not isinstance(best_alpha, numbers.Number): raise Exception("best_alpha must be a number but is %s" % type(best_alpha)) - print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]]) + print("\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])) max_perf = np.nanmax(performance) if max_perf < 0.0: raise Exception("performance is negative") - print "\t\tbest performance is %f" % np.nanmax(performance) + print("\t\tbest performance is %f" % np.nanmax(performance)) clf = sklearn.linear_model.LogisticRegression(penalty=learn_options['penalty'], dual=False, fit_intercept=learn_options["fit_intercept"], class_weight=learn_options["class_weight"], tol=tol, C=1.0/best_alpha) clf.fit(X[train], y[train].flatten()) - # debugging check that get samed paramter estimation when have no regularization and use + # debugging check that get samed paramter estimation when have no regularization and use # either data with only that feature on, or all data), AND WITH NO INTERCEPT - if False: - # grab only feature "GA3" + if False: + # grab 
only feature "GA3" keep_ind = np.where(feature_sets['mutletpos'].columns=="GA3")[0] - print "%s, %s" % (str(clf.intercept_ ), str(clf.coef_[0, keep_ind])) + print("%s, %s" % (str(clf.intercept_ ), str(clf.coef_[0, keep_ind]))) clf.fit(X[train][:,keep_ind], y[train].flatten()) - print "%s, %s" % (str(clf.intercept_ ), str(clf.coef_)) - import ipdb; ipdb.set_trace() + print("%s, %s" % (str(clf.intercept_ ), str(clf.coef_))) + import ipdb; ipdb.set_trace() + - #y_pred = clf.predict(X[test]) y_pred = clf.predict_proba(X[test])[:,1] - y_pred = y_pred[:, None] + y_pred = y_pred[:, None] #fpr, tpr, _ = roc_curve(y, y_pred); tmp_auc = auc(fpr, tpr) #import ipdb; ipdb.set_trace() return y_pred, clf @@ -124,7 +124,7 @@ def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op if learn_options["weighted"] is not None and (learn_options["penalty"] != "L2" or learn_options["method"] != "linreg"): raise NotImplementedError("weighted prediction not implemented for any methods by L2 at the moment") - + if not learn_options.has_key("fit_intercept"): learn_options["fit_intercept"] = True if not learn_options.has_key('normalize_features'): @@ -194,16 +194,16 @@ def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op best_alpha, best_l1r = learn_options["alpha"][max_score_ind[0]], l1_ratio[max_score_ind[1]] - print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]]) - + print("\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])) + if learn_options['penalty'] == "EN": - print "\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]]) + print("\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]])) max_perf = np.nanmax(performance) if max_perf < 0.0: raise Exception("performance is negative") - print "\t\tbest performance is %f" % max_perf + print("\t\tbest performance is %f" % max_perf) clf = train_linreg_model(best_alpha, l1r, learn_options, 
train, X, y, y_all) if learn_options["feature_select"]: @@ -214,7 +214,7 @@ def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op if learn_options["penalty"] != "L2": y_pred = y_pred[:, None] - + return y_pred, clf @@ -267,16 +267,16 @@ def get_weights(learn_options, fold, y, y_all): return weights -def set_up_inner_folds(learn_options, y): +def set_up_inner_folds(learn_options, y): label_encoder = sklearn.preprocessing.LabelEncoder() - label_encoder.fit(y['Target gene'].values) + label_encoder.fit(y['Target gene'].values) gene_classes = label_encoder.transform(y['Target gene'].values) - n_genes = len(np.unique(gene_classes)) + n_genes = len(np.unique(gene_classes)) if learn_options['ignore_gene_level_for_inner_loop'] or learn_options["cv"] == "stratified" or n_genes==1: if 'n_folds' not in learn_options.keys(): n_folds = len(np.unique(gene_classes)) else: - n_folds = learn_options['n_folds'] + n_folds = learn_options['n_folds'] cv = sklearn.cross_validation.StratifiedKFold(gene_classes, n_folds=n_folds, shuffle=True) elif learn_options["cv"] == "gene": gene_list = np.unique(y['Target gene'].values) diff --git a/azimuth/predict.py b/azimuth/predict.py index b116983..2978fdc 100755 --- a/azimuth/predict.py +++ b/azimuth/predict.py @@ -65,26 +65,26 @@ def construct_filename(learn_options, TEST): elif learn_options["training_metric"] == 'spearmanr': filename += ".spearman" - print "filename = %s" % filename + print("filename = %s" % filename) return filename def print_summary(global_metric, results, learn_options, feature_sets, flags): - print "\nSummary:" - print learn_options - print "\t\tglobal %s=%.2f" % (learn_options['metric'], global_metric) - print "\t\tmedian %s across folds=%.2f" % (learn_options['metric'], np.median(results[0])) - print "\t\torder=%d" % learn_options["order"] + print("\nSummary:") + print(learn_options) + print("\t\tglobal %s=%.2f" % (learn_options['metric'], global_metric)) + print("\t\tmedian %s across 
folds=%.2f" % (learn_options['metric'], np.median(results[0]))) + print("\t\torder=%d" % learn_options["order"]) if learn_options.has_key('kerntype'): "\t\tkern type = %s" % learn_options['kerntype'] - if learn_options.has_key('degree'): print "\t\tdegree=%d" % learn_options['degree'] - print "\t\ttarget_name=%s" % learn_options["target_name"] + if learn_options.has_key('degree'): print("\t\tdegree=%d" % learn_options['degree']) + print("\t\ttarget_name=%s" % learn_options["target_name"]) for k in flags.keys(): - print '\t\t' + k + '=' + str(learn_options[k]) + print('\t\t' + k + '=' + str(learn_options[k])) - print "\t\tfeature set:" + print("\t\tfeature set:") for set in feature_sets.keys(): - print "\t\t\t%s" % set - print "\t\ttotal # features=%d" % results[4] + print("\t\t\t%s" % set) + print("\t\ttotal # features=%d" % results[4]) def extract_fpr_tpr_for_fold(aucs, fold, i, predictions, truth, y_binary, test, y_pred): assert len(np.unique(y_binary))<=2, "if using AUC need binary targets" @@ -136,7 +136,7 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge When CV=False, it trains on everything (and tests on everything, just to fit the code) ''' - print "range of y_all is [%f, %f]" % (np.min(y_all[learn_options['target_name']].values), np.max(y_all[learn_options['target_name']].values)) + print("range of y_all is [%f, %f]" % (np.min(y_all[learn_options['target_name']].values), np.max(y_all[learn_options['target_name']].values))) allowed_methods = ["GPy", "linreg", "AdaBoostRegressor", "AdaBoostClassifier", "DecisionTreeRegressor", "RandomForestRegressor", @@ -149,7 +149,7 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge # construct filename from options filename = construct_filename(learn_options, TEST) - print "Cross-validating genes..." 
+ print("Cross-validating genes...") t2 = time.time() y = np.array(y_all[learn_options["target_name"]].values[:,None],dtype=np.float64) @@ -219,8 +219,8 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge if learn_options['num_genes_remove_train']==0: assert np.all(cv_i_orig[0]==cv[i][0]) assert np.all(cv_i_orig[1]==cv[i][1]) - print "# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0])) - print "# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1])) + print("# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0]))) + print("# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1]))) else: raise Exception("invalid cv options given: %s" % learn_options["cv"]) @@ -240,12 +240,12 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge num_proc = learn_options["num_proc"] if num_proc > 1: num_proc = np.min([num_proc,len(cv)]) - print "using multiprocessing with %d procs--one for each fold" % num_proc + print("using multiprocessing with %d procs--one for each fold" % num_proc) jobs = [] pool = multiprocessing.Pool(processes=num_proc) for i,fold in enumerate(cv): train,test = fold - print "working on fold %d of %d, with %d train and %d test" % (i, len(cv), len(train), len(test)) + print("working on fold %d of %d, with %d train and %d test" % (i, len(cv), len(train), len(test))) if learn_options["method"]=="GPy": job = pool.apply_async(azimuth.models.GP.gp_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)) elif learn_options["method"]=="linreg": @@ -351,15 +351,15 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge truth, predictions = fill_in_truth_and_predictions(truth, predictions, fold_labels[i], y_all, y_pred, learn_options, test) - print "\t\tRMSE: ", np.sqrt(((y_pred - y[test])**2).mean()) - print "\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0] 
- print "\t\tfinished fold/gene %i of %i" % (i+1, len(fold_labels)) + print("\t\tRMSE: ", np.sqrt(((y_pred - y[test])**2).mean())) + print("\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0]) + print("\t\tfinished fold/gene %i of %i" % (i+1, len(fold_labels))) cv_median_metric =[np.median(metrics)] gene_pred = [(truth, predictions)] - print "\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1]) + print("\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1])) t3 = time.time() - print "\t\tElapsed time for cv is %.2f seconds" % (t3-t2) + print("\t\tElapsed time for cv is %.2f seconds" % (t3-t2)) return metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names diff --git a/azimuth/util.py b/azimuth/util.py index 1bd6dcb..b3b252c 100755 --- a/azimuth/util.py +++ b/azimuth/util.py @@ -1,3 +1,4 @@ +from __future__ import print_function import pandas import matplotlib.pylab as plt import pylab as pl # so can just grab qqplotting code from fastlmm directly @@ -26,6 +27,7 @@ import pandas as pd import corrstats + def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None,fixaxes=True,addlambda=True,minpval=1e-20,title=None,h1=None,figsize=[5,5],grid=True, markersize=2): ''' performs a P-value QQ-plot in -log10(P-value) space @@ -46,7 +48,7 @@ def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=N grid boolean: use a grid? 
(default: True) Returns: fighandle, qnull, qemp ----------------------------------------------------------------------- - ''' + ''' distr = 'log10' import pylab as pl if type(pvals)==list: @@ -57,20 +59,20 @@ def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=N legendlist=legend else: legendlist = [legend] - + if h1 is None: - h1=pl.figure(figsize=figsize) - + h1=pl.figure(figsize=figsize) + pl.grid(b=grid, alpha = 0.5) - + maxval = 0 - for i in xrange(len(pvallist)): + for i in xrange(len(pvallist)): pval =pvallist[i].flatten() M = pval.shape[0] pnull = (0.5 + sp.arange(M))/M # pnull = np.sort(np.random.uniform(size = tests)) - + pval[pval=1]=1 @@ -81,31 +83,31 @@ def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=N yl = '$\chi^2$ quantiles' if distr == 'log10': - qnull = -sp.log10(pnull) + qnull = -sp.log10(pnull) qemp = -sp.log10(sp.sort(pval)) #sorts the object, returns nothing xl = '-log10(P) observed' yl = '-log10(P) expected' if not (sp.isreal(qemp)).all(): raise Exception("imaginary qemp found") if qnull.max>maxval: - maxval = qnull.max() + maxval = qnull.max() pl.plot(qnull, qemp, '.', markersize=markersize) - #pl.plot([0,qemp.max()], [0,qemp.max()],'r') + #pl.plot([0,qemp.max()], [0,qemp.max()],'r') if addlambda: lambda_gc = estimate_lambda(pval) - print "lambda=%1.4f" % lambda_gc - #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2) + print("lambda=%1.4f" % lambda_gc) + #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2) # if there's only one method, just print the lambda if len(pvallist) == 1: - legendlist=["$\lambda_{GC}=$%1.4f" % lambda_gc] + legendlist=["$\lambda_{GC}=$%1.4f" % lambda_gc] # otherwise add it at the end of the name else: legendlist[i] = legendlist[i] + " ($\lambda_{GC}=$%1.4f)" % lambda_gc - addqqplotinfo(qnull,M,xl,yl,xlim,ylim,alphalevel,legendlist,fixaxes) - + addqqplotinfo(qnull,M,xl,yl,xlim,ylim,alphalevel,legendlist,fixaxes) + if title is not None: - pl.title(title) - + 
pl.title(title) + if fileout is not None: pl.savefig(fileout) @@ -116,20 +118,20 @@ def qqplotp(pv,fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None ''' Read in p-values from filein and make a qqplot adn histogram. If fileout is provided, saves the qqplot only at present. - Searches through p until one is found. ''' - - import pylab as pl - pl.ion() - - fs=8 + Searches through p until one is found. ''' + + import pylab as pl + pl.ion() + + fs=8 h1=qqplot(pv, fileout, alphalevel,legend,xlim,ylim,addlambda=True, figsize=figsize, markersize=markersize) #lambda_gc=estimate_lambda(pv) - #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2) + #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2) pl.title(title,fontsize=fs) - + wm=pl.get_current_fig_manager() #e.g. "652x526+100+10 - xcoord=100 + xcoord=100 #wm.window.wm_geometry(plotsize + "+" + str(xcoord) + "+" + str(ycoord)) if dohist: @@ -144,7 +146,7 @@ def qqplotp(pv,fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None return h1,h2 -def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=None,ylim=None,alphalevel=0.05,legendlist=None,fixaxes=False): +def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=None,ylim=None,alphalevel=0.05,legendlist=None,fixaxes=False): distr='log10' pl.plot([0,qnull.max()], [0,qnull.max()],'k') pl.ylabel(xl) @@ -152,7 +154,7 @@ def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=N if xlim is not None: pl.xlim(xlim) if ylim is not None: - pl.ylim(ylim) + pl.ylim(ylim) if alphalevel is not None: if distr == 'log10': betaUp, betaDown, theoreticalPvals = _qqplot_bar(M=M,alphalevel=alphalevel,distr=distr) @@ -168,7 +170,7 @@ def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=N lo.set_markersize(10) if fixaxes: - fix_axes() + fix_axes() def _qqplot_bar(M=1000000, alphalevel = 0.05,distr = 'log10'): ''' @@ -214,8 +216,8 @@ def _qqplot_bar(M=1000000, alphalevel = 0.05,distr = 
'log10'): def fix_axes(buffer=0.1): ''' Makes x and y max the same, and the lower limits 0. - ''' - maxlim=max(pl.xlim()[1],pl.ylim()[1]) + ''' + maxlim=max(pl.xlim()[1],pl.ylim()[1]) pl.xlim([0-buffer,maxlim+buffer]) pl.ylim([0-buffer,maxlim+buffer]) @@ -232,13 +234,13 @@ def estimate_lambda(pv): L = (LOD2/0.456) return L - -def pvalhist(pv,numbins=50,linewidth=3.0,linespec='--r', figsize=[5,5]): + +def pvalhist(pv,numbins=50,linewidth=3.0,linespec='--r', figsize=[5,5]): ''' Plots normalized histogram, plus theoretical null-only line. - ''' - h2=pl.figure(figsize=figsize) - [nn,bins,patches]=pl.hist(pv,numbins,normed=True) + ''' + h2=pl.figure(figsize=figsize) + [nn,bins,patches]=pl.hist(pv,numbins,normed=True) pl.plot([0, 1],[1,1],linespec,linewidth=linewidth) @@ -291,7 +293,7 @@ def guide_positional_features(guide_seq, gene, strand): guide_seq = guide_seq.reverse_complement() ind = gene_seq.find(guide_seq) if ind ==-1: - print "returning None, could not find guide %s in gene %s" % (guide_seq, gene) + print("returning None, could not find guide %s in gene %s" % (guide_seq, gene)) return "" assert gene_seq[ind:(ind+len(guide_seq))]==guide_seq, "match not right" ## now get what we want from this: @@ -310,7 +312,7 @@ def convert_to_thirty_one(guide_seq, gene, strand): guide_seq = guide_seq.reverse_complement() ind = gene_seq.find(guide_seq) if ind ==-1: - print "returning sequence+'A', could not find guide %s in gene %s" % (guide_seq, gene) + print("returning sequence+'A', could not find guide %s in gene %s" % (guide_seq, gene)) return gene_seq + 'A' assert gene_seq[ind:(ind+len(guide_seq))]==guide_seq, "match not right" #new_mer = gene_seq[ind:(ind+len(guide_seq))+1] #looks correct, but is wrong, due to strand frame-of-reference @@ -349,7 +351,7 @@ def concatenate_feature_sets(feature_sets, keys=None): if False: inputs.shape - for j in keys: print j + str(feature_sets[j].shape) + for j in keys: print(j + str(feature_sets[j].shape)) import ipdb; ipdb.set_trace() 
#print "final size of inputs matrix is (%d, %d)" % inputs.shape @@ -383,7 +385,7 @@ def spearmanr_nonan(x,y): r, p = st.spearmanr(x, y) if np.isnan(p): if len(np.unique(x))==1 or len(np.unique(y))==1: - print "WARNING: spearmanr is nan due to unique values, setting to 0" + print("WARNING: spearmanr is nan due to unique values, setting to 0") p = 0.0 r = 0.0 else: @@ -435,7 +437,7 @@ def get_gene_sequence(gene_name): # records = Entrez.read(search) # if len(records['IdList']) > 1: - # print "warning, multiple hits found for entrez gene search %s" % gene_name + # print("warning, multiple hits found for entrez gene search %s" % gene_name) # elink = Entrez.read(Entrez.elink(dbfrom="gene", db='nucleotide', id=records['IdList'][0])) # nucl_id = elink[0]['LinkSetDb'][3] @@ -446,7 +448,7 @@ def get_gene_sequence(gene_name): # nucl_id = elink[0]['LinkSetDb'][0]['Link'][0]['Id'] # cut = True # else: - # print "sorry not enough information to return sequence" + # print("sorry not enough information to return sequence") # return None # else: # nucl_id = nucl_id['Link'][0]['Id'] @@ -466,7 +468,7 @@ def target_genes_stats(genes=['HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', for gene in genes: seq = get_gene_sequence(gene) if seq != None: - print '%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA')) + print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA'))) def ranktrafo(data): @@ -518,7 +520,7 @@ def get_ranks(y, thresh=0.8, prefix="", flip=False, col_name='score'): # y_quantized = pandas.DataFrame(data=pandas.qcut(y[col_name], 5, labels=np.arange(5.0))) # quantized vector y_quantized = y_threshold.copy() y_quantized.columns = [prefix + "quantized"] - + return y_rank, y_rank_raw, y_threshold, y_quantized def get_data(data, y_names, 
organism="human", target_gene=None): @@ -1004,7 +1006,7 @@ def plot_all_metrics(metrics, gene_names, all_learn_options, save, plots=None, b plt.bar(ind+(i*width), metrics[method][metric], width, color=plt.cm.Paired(1.*i/len(metrics.keys())), label=method) median_metric = np.median(metrics[method][metric]) - print method, metric, median_metric + print(method, metric, median_metric) assert not np.isnan(median_metric), "found nan for %s, %s" % (method, metric) if metric not in boxplot_arrays.keys(): boxplot_arrays[metric] = np.array(metrics[method][metric])[:, None] @@ -1061,7 +1063,7 @@ def load_results(directory, all_results, all_learn_options, model_filter=None, a if filelist ==[]: raise Exception("found no pickle files in %s" % directory) else: - print "found %d files in %s" % (len(filelist), directory) + print("found %d files in %s" % (len(filelist), directory)) for results_file in filelist: if 'learn_options' in results_file: @@ -1074,7 +1076,7 @@ def load_results(directory, all_results, all_learn_options, model_filter=None, a if m in results_file: in_filt = True if not in_filt: - print "%s not in model_filter" % (results_file)#, model_filter) + print("%s not in model_filter" % (results_file))#, model_filter) continue elif model_filter not in results_file: continue @@ -1094,7 +1096,7 @@ def load_results(directory, all_results, all_learn_options, model_filter=None, a else: k_new = k assert k_new not in all_results.keys(), "found %s already" % k - print "adding key %s (from file %s)" % (k_new, os.path.split(results_file)[-1]) + print("adding key %s (from file %s)" % (k_new, os.path.split(results_file)[-1])) all_results[k_new] = results[k] all_learn_options[k_new] = learn_options[k] num_added = num_added +1 @@ -1205,8 +1207,8 @@ def ensemble_cluster_results(directory=r'\\fusi1\crispr2\analysis\cluster\result # spearmans = [] # for gene in ens_predictions.keys(): # spearmans.append(sp.stats.spearmanr(ens_predictions[gene], ens_truths[gene]['raw'])[0]) - # print 
gene, spearmans[-1] - # print "median: %.5f" % np.median(spearmans) + # print(gene, spearmans[-1]) + # print("median: %.5f" % np.median(spearmans)) return all_results, all_learn_options @@ -1245,13 +1247,13 @@ def plot_old_vs_new_feat(results, models, fontsize=20, filename=None, print_outp feat_AUC_se.append(np.std(metrics_feat['AUC'])) - print "old features" - print "mean: " + str(base_spearman_means) - print "std: " + str(base_spearman_std) + print("old features") + print("mean: " + str(base_spearman_means)) + print("std: " + str(base_spearman_std)) - print "old + new features" - print "mean: " + str(feat_spearman_means) - print "std: " + str(feat_spearman_std) + print("old + new features") + print("mean: " + str(feat_spearman_means)) + print("std: " + str(feat_spearman_std)) plt.figure() ind = np.arange(len(models)) @@ -1322,7 +1324,7 @@ def remove_top_right_on_plot(ax=None): X, Y = combine_organisms() X.to_pickle('../data/X.pd') #sequence features (i.e. inputs to prediction) Y.to_pickle('../data/Y.pd') #cell-averaged ranks, plus more (i.e. 
possible targets for prediction) - print "done writing to file" + print("done writing to file") elif V =="2": # this is now all in predict.py pass diff --git a/setup.py b/setup.py index 4392e56..a0a2bad 100755 --- a/setup.py +++ b/setup.py @@ -1,6 +1,11 @@ # from Cython.Build import cythonize from setuptools import setup +import sys +if sys.version_info[0] >= 3: + requires = ['scipy', 'numpy', 'matplotlib', 'nose', 'scikit-learn', 'pandas', 'biopython'] +else: + requires = ['scipy', 'numpy', 'matplotlib<3.0', 'nose', 'scikit-learn>=0.17.1,<0.18', 'pandas', 'biopython'] setup(name='Azimuth', version='2.0', @@ -9,7 +14,7 @@ description=("Machine Learning-Based Predictive Modelling of CRISPR/Cas9 guide efficiency"), packages=["azimuth", "azimuth.features", "azimuth.models", "azimuth.tests"], package_data={'azimuth': ['saved_models/*.*']}, - install_requires=['scipy', 'numpy', 'matplotlib<3.0', 'nose', 'scikit-learn>=0.17.1,<0.18', 'pandas', 'biopython'], + install_requires=requires, license="BSD", # ext_modules=cythonize("ssk_cython.pyx"), )