diff --git a/README.md b/README.md
index 58e4ab4..b06e89e 100755
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ percent_peptides = np.array([0.18, 0.18, 0.35])
 predictions = azimuth.model_comparison.predict(sequences, amino_acid_cut_positions, percent_peptides)
 
 for i, prediction in enumerate(predictions):
-    print sequences[i], prediction
+    print("%s %s" % (sequences[i], prediction))
 ```
 
 Output:
@@ -87,5 +87,3 @@ Sometimes the pre-computed .pickle files in the saved_models directory are incom
 #### Contacting us
 
 You can submit bug reports using the GitHub issue tracker. If you have any other questions, please contact us at crispr@lists.research.microsoft.com.
-
-
diff --git a/azimuth/cluster_job.py b/azimuth/cluster_job.py
index c799ce8..259f90d 100755
--- a/azimuth/cluster_job.py
+++ b/azimuth/cluster_job.py
@@ -5,7 +5,7 @@
 # just execute this file in python to create the xml file for the cluster (in ./analysis/cluster), which one then can manually submit through the HPC Job Manager
 
 def cluster_setup(i, python_path, home, t, work_dir, tempdir):
-    t.work_directory = work_dir    
+    t.work_directory = work_dir
     #t.std_out_file_path = r'cluster\log\cluster_out%d.txt' % i
     #t.std_err_file_path = r'cluster\log\cluster_err%d.txt' % i
     t.std_out_file_path = tempdir + r'\out%d.txt' % i
@@ -14,10 +14,10 @@ def cluster_setup(i, python_path, home, t, work_dir, tempdir):
     #t.std_err_file_path = r'err%d.txt' % i
     #if not os.path.exists(t.std_out_file_path): os.makedirs(t.std_out_file_path)
     #if not os.path.exists(t.std_err_file_path): os.makedirs(t.std_err_file_path)
-    t.environment_variables['PYTHONPATH'] = python_path     
+    t.environment_variables['PYTHONPATH'] = python_path
     t.environment_variables['HOME'] = home
 
-    print "cluster python_path=%s" % python_path
+    print("cluster python_path=%s" % python_path)
 
 def create(user, models, orders, degrees, GP_likelihoods, adaboost_learning_rates=None, adaboost_num_estimators=None, adaboost_max_depths=None, adaboost_CV=False, exp_name=None, learn_options=None):
     job = WinHPCJob()
@@ -42,18 +42,18 @@ def create(user, models, orders, degrees, GP_likelihoods, adaboost_learning_rate
         home =  r"\\fusi1\CLUSTER_HOME"
     elif job.username == 'REDMOND\\jennl':
         remote_dir = r"\\GCR\Scratch\RR1\jennl\CRISPR"
-        work_dir = r'\\jennl2\D$\Source\CRISPR\analysis'            
+        work_dir = r'\\jennl2\D$\Source\CRISPR\analysis'
         python = r'\\fusi1\crispr\python.exe'
         python_path = r'\\fusi1\crispr\lib\site-packages\;\\jennl2\D$\Source\CRISPR\analysis'
         home =  r"\\fusi1\CLUSTER_HOME"
 
-    # print "workdir=%s" % work_dir
-    # print "python=%s" % python
-    # print "python_path=%s" % python_path
+    # print("workdir=%s" % work_dir)
+    # print("python=%s" % python)
+    # print("python_path=%s" % python_path)
 
-    # generate random dir in results directory   
+    # generate random dir in results directory
     tempdir = tempfile.mkdtemp(prefix='cluster_experiment_', dir=remote_dir)
-    print "Created directory: %s" % str(tempdir)
+    print("Created directory: %s" % str(tempdir))
 
     # dump learn_options
     with open(tempdir+'/learn_options.pickle', 'wb') as f:
diff --git a/azimuth/corrstats.py b/azimuth/corrstats.py
index 619157a..3d11b8e 100644
--- a/azimuth/corrstats.py
+++ b/azimuth/corrstats.py
@@ -113,8 +113,8 @@ def independent_corr(xy, ab, n, n2 = None, twotailed=True, conf_level=0.95, meth
     else:
         raise Exception('Wrong method!')
 
-#print dependent_corr(.396, .179, .088, 200, method='steiger')
-#print independent_corr(.560, .588, 100, 353, method='fisher')
+#print(dependent_corr(.396, .179, .088, 200, method='steiger'))
+#print(independent_corr(.560, .588, 100, 353, method='fisher'))
 
-#print dependent_corr(.396, .179, .088, 200, method='zou')
-#print independent_corr(.560, .588, 100, 353, method='zou')
\ No newline at end of file
+#print(dependent_corr(.396, .179, .088, 200, method='zou'))
+#print(independent_corr(.560, .588, 100, 353, method='zou'))
diff --git a/azimuth/features/featurization.py b/azimuth/features/featurization.py
index bb88359..603094b 100755
--- a/azimuth/features/featurization.py
+++ b/azimuth/features/featurization.py
@@ -21,7 +21,7 @@ def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length
     assert num_lengths == 1, "should only have sequences of a single length, but found %s: %s" % (num_lengths, str(unique_lengths))
 
     if not quiet:
-        print "Constructing features..."
+        print("Constructing features...")
     t0 = time.time()
 
     feature_sets = {}
@@ -49,7 +49,7 @@ def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length
         feature_sets["Percent Peptide <50%"]['Percent Peptide <50%'] = feature_sets["Percent Peptide <50%"].pop("Percent Peptide")
 
     if learn_options["include_gene_effect"]:
-        print "including gene effect"
+        print("including gene effect")
         gene_names = Y['Target gene']
         enc = sklearn.preprocessing.OneHotEncoder()
         label_encoder = sklearn.preprocessing.LabelEncoder()
@@ -95,7 +95,7 @@ def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length
 
     t1 = time.time()
     if not quiet:
-        print "\t\tElapsed time for constructing features is %.2f seconds" % (t1-t0)
+        print("\t\tElapsed time for constructing features is %.2f seconds" % (t1-t0))
 
     check_feature_set(feature_sets)
 
@@ -138,8 +138,8 @@ def NGGX_interaction_feature(data, pam_audit=True):
     for seq in sequence:
         if pam_audit and seq[25:27] != "GG":
             raise Exception("expected GG but found %s" % seq[25:27])
-        NX = seq[24]+seq[27]        
-        NX_onehot = nucleotide_features(NX,order=2, feature_type='pos_dependent', max_index_to_use=2, prefix="NGGX")        
+        NX = seq[24]+seq[27]
+        NX_onehot = nucleotide_features(NX,order=2, feature_type='pos_dependent', max_index_to_use=2, prefix="NGGX")
         # NX_onehot[:] = np.random.rand(NX_onehot.shape[0]) ##TESTING RANDOM FEATURE
         feat_NX = pandas.concat([feat_NX, NX_onehot], axis=1)
     return feat_NX.T
@@ -148,7 +148,7 @@ def NGGX_interaction_feature(data, pam_audit=True):
 def get_all_order_nuc_features(data, feature_sets, learn_options, maxorder, max_index_to_use, prefix="", quiet=False):
     for order in range(1, maxorder+1):
         if not quiet:
-            print "\t\tconstructing order %s features" % order
+            print("\t\tconstructing order %s features" % order)
         nuc_features_pd, nuc_features_pi = apply_nucleotide_features(data, order, learn_options["num_proc"],
                                                                      include_pos_independent=True, max_index_to_use=max_index_to_use, prefix=prefix)
         feature_sets['%s_nuc_pd_Order%i' % (prefix, order)] = nuc_features_pd
@@ -157,7 +157,7 @@ def get_all_order_nuc_features(data, feature_sets, learn_options, maxorder, max_
         check_feature_set(feature_sets)
 
         if not quiet:
-            print "\t\t\t\t\t\t\tdone"
+            print("\t\t\t\t\t\t\tdone")
 
 
 def countGC(s, length_audit=True):
@@ -202,7 +202,7 @@ def organism_feature(data):
 def get_micro_homology_features(gene_names, learn_options, X):
     # originally was flipping the guide itself as necessary, but now flipping the gene instead
 
-    print "building microhomology features"
+    print("building microhomology features")
     feat = pandas.DataFrame(index=X.index)
     feat["mh_score"] = ""
     feat["oof_score"] = ""
@@ -215,7 +215,7 @@ def get_micro_homology_features(gene_names, learn_options, X):
         for gene in gene_names.unique():
             gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
             guide_inds = np.where(gene_names.values == gene)[0]
-            print "getting microhomology for all %d guides in gene %s" % (len(guide_inds), gene)
+            print("getting microhomology for all %d guides in gene %s" % (len(guide_inds), gene))
             for j, ps in enumerate(guide_inds):
                 guide_seq = Seq.Seq(X['30mer'][ps])
                 strand = X['Strand'][ps]
@@ -227,18 +227,18 @@ def get_micro_homology_features(gene_names, learn_options, X):
                     gene_seq = gene_seq.reverse_complement()
                     ind = gene_seq.find(guide_seq)
                     #assert ind != -1, "still didn't work"
-                    #print "shouldn't get here"
+                    #print("shouldn't get here")
                 else:
-                    #print "all good"
+                    #print("all good")
                     pass
                 #assert ind != -1, "could not find guide in gene"
                 if ind==-1:
-                    #print "***could not find guide %s for gene %s" % (str(guide_seq), str(gene))
+                    #print("***could not find guide %s for gene %s" % (str(guide_seq), str(gene)))
                     #if.write(str(gene) + "," + str(guide_seq))
                     mh_score = 0
                     oof_score = 0
                 else:
-                    #print "worked"
+                    #print("worked")
 
                     assert gene_seq[ind:(ind+len(guide_seq))]==guide_seq, "match not right"
                     left_win = gene_seq[(ind - k_mer_length_left):ind]
@@ -258,14 +258,14 @@ def get_micro_homology_features(gene_names, learn_options, X):
 
                 feat.ix[ps,"mh_score"] = mh_score
                 feat.ix[ps,"oof_score"] = oof_score
-            print "computed microhomology of %s" % (str(gene))
+            print("computed microhomology of %s" % (str(gene)))
 
     return pandas.DataFrame(feat, dtype='float')
 
 
 def local_gene_seq_features(gene_names, learn_options, X):
 
-    print "building local gene sequence features"
+    print("building local gene sequence features")
     feat = pandas.DataFrame(index=X.index)
     feat["gene_left_win"] = ""
     feat["gene_right_win"] = ""
@@ -300,7 +300,7 @@ def local_gene_seq_features(gene_names, learn_options, X):
             assert len(left_win)==len(right_win), "k_mer_context, %s, is too large" % k_mer_length
             feat.ix[ps,"gene_left_win"] = left_win.tostring()
             feat.ix[ps,"gene_right_win"] = right_win.tostring()
-        print "featurizing local context of %s" % (gene)
+        print("featurizing local context of %s" % (gene))
 
     feature_sets = {}
     get_all_order_nuc_features(feat["gene_left_win"], feature_sets, learn_options, learn_options["order"], max_index_to_use=sys.maxint, prefix="gene_left_win")
@@ -341,11 +341,11 @@ def gene_guide_feature(Y, X, learn_options):
     gene_file = r"..\data\gene_seq_feat_V%s_km%s.ord%s.pickle" % (learn_options['V'], learn_options['include_gene_guide_feature'], learn_options['order'])
 
     if False: #os.path.isfile(gene_file): #while debugging, comment out
-        print "loading local gene seq feats from file %s" % gene_file
+        print("loading local gene seq feats from file %s" % gene_file)
         with open(gene_file, "rb") as f: feature_sets = pickle.load(f)
     else:
         feature_sets = local_gene_seq_features(Y['Target gene'], learn_options, X)
-        print "writing local gene seq feats to file %s" % gene_file
+        print("writing local gene seq feats to file %s" % gene_file)
         with open(gene_file, "wb") as f: pickle.dump(feature_sets, f)
 
     return feature_sets
@@ -383,11 +383,11 @@ def Tm_feature(data, pam_audit=True, learn_options=None):
         featarray[i,2] = Tm.Tm_staluc(seq[segments[1][0]:segments[1][1]], rna=rna)   #8-mer
         featarray[i,3] = Tm.Tm_staluc(seq[segments[2][0]:segments[2][1]], rna=rna)      #5-mer
 
-        #print "CRISPR"
+        #print("CRISPR")
         #for d in range(4):
-        #    print featarray[i,d]
+        #    print(featarray[i,d])
         #import ipdb; ipdb.set_trace()
-    
+
 
     feat = pandas.DataFrame(featarray, index=data.index, columns=["Tm global_%s" % rna, "5mer_end_%s" %rna, "8mer_middle_%s" %rna, "5mer_start_%s" %rna])
 
@@ -442,7 +442,7 @@ def nucleotide_features(s, order, max_index_to_use, prefix="", feature_type='all
     '''
     assert feature_type in ['all', 'pos_independent', 'pos_dependent']
     if max_index_to_use <= len(s):
-        #print "WARNING: trimming max_index_to use down to length of string=%s" % len(s)
+        #print("WARNING: trimming max_index_to use down to length of string=%s" % len(s))
         max_index_to_use = len(s)
 
     if max_index_to_use is not None:
@@ -493,7 +493,7 @@ def nucleotide_features(s, order, max_index_to_use, prefix="", feature_type='all
             return res
 
     res = pandas.Series(features_pos_dependent, index=index_dependent)
-    assert not np.any(np.isnan(res.values))    
+    assert not np.any(np.isnan(res.values))
     return res
 
 def nucleotide_features_dictionary(prefix=''):
@@ -537,7 +537,7 @@ def normalize_feature_sets(feature_sets):
     zero-mean, unit-variance each feature within each set
     '''
 
-    print "Normalizing features..."
+    print("Normalizing features...")
     t1 = time.time()
 
     new_feature_sets = {}
@@ -547,6 +547,6 @@ def normalize_feature_sets(feature_sets):
              raise Exception("found Nan feature values in set=%s" % set)
          assert new_feature_sets[set].shape[1] > 0, "0 columns of features"
     t2 = time.time()
-    print "\t\tElapsed time for normalizing features is %.2f seconds" % (t2-t1)
+    print("\t\tElapsed time for normalizing features is %.2f seconds" % (t2-t1))
 
     return new_feature_sets
diff --git a/azimuth/features/microhomology.py b/azimuth/features/microhomology.py
index 45de681..3fa27d8 100755
--- a/azimuth/features/microhomology.py
+++ b/azimuth/features/microhomology.py
@@ -1,102 +1,102 @@
-#Supplementary Figure 3  |  Source code for assigning a score to a hypothetical deletion 
-#pattern associated with microhomology 
+#Supplementary Figure 3  |  Source code for assigning a score to a hypothetical deletion
+#pattern associated with microhomology
 # ------------------------------------------
 # comes from the Supplementary Info of the paper, in pdf form, copied here, but refactored to make a function
 #    rather than to write it to file
 # also see their web server version: http://www.rgenome.net/mich-calculator/ where they say:
 # Insert one or more query sequences (A, G, T, C only) flanking the same length at a cleavage site (100bp or less, 60~80bp recommended).
 
-from math import exp       
-from re import findall 
- 
+from math import exp
+from re import findall
+
 def compute_score(seq, tmpfile1="1.before removing duplication.txt", tmpfile2="2.all microhomology patterns.txt", verbose=False):
-    length_weight=20.0 
-    left=30        # Insert the position expected to be broken. 
-    right=len(seq)-int(left) 
-    #print 'length of seq = '+str(len(seq)) 
-     
-    file_temp=open(tmpfile1, "w") 
-    for k in range(2,left)[::-1]: 
-            for j in range(left,left+right-k+1): 
-                    for i in range(0,left-k+1): 
-                            if seq[i:i+k]==seq[j:j+k]: 
-                                    length=j-i 
-                                    file_temp.write(seq[i:i+k]+'\t'+str(i)+'\t'+str(i+k)+'\t'+str(j)+'\t'+str(j+k)+'\t'+str(length)+'\n') 
-    file_temp.close() 
-     
-    ### After searching out all microhomology patterns, duplication should be removed!! 
-    f1=open(tmpfile1, "r") 
-    s1=f1.read() 
-     
-    f2=open(tmpfile2, "w") #After removing duplication 
-    f2.write(seq+'\t'+'microhomology\t'+'deletion length\t'+'score of a pattern\n') 
-     
-    if s1!="": 
-            list_f1=s1.strip().split('\n') 
-            sum_score_3=0 
-            sum_score_not_3=0 
-     
-            for i in range(len(list_f1)): 
-                    n=0 
-                    score_3=0 
-                    score_not_3=0 
-                    line=list_f1[i].split('\t') 
-                    scrap=line[0] 
-                    left_start=int(line[1]) 
-                    left_end=int(line[2]) 
-                    right_start=int(line[3]) 
-                    right_end=int(line[4]) 
-                    length=int(line[5]) 
-     
-                    for j in range(i): 
-                            line_ref=list_f1[j].split('\t') 
-                            left_start_ref=int(line_ref[1]) 
-                            left_end_ref=int(line_ref[2]) 
-                            right_start_ref=int(line_ref[3]) 
-                            right_end_ref=int(line_ref[4]) 
-     
-                            if (left_start >= left_start_ref) and (left_end <= left_end_ref) and (right_start >= right_start_ref) and (right_end <= right_end_ref): 
-                                    if (left_start - left_start_ref)==(right_start - right_start_ref) and (left_end - left_end_ref)==(right_end - right_end_ref): 
-                                            n+=1 
-                            else: pass 
-                           
-                    if n == 0: 
-                            if (length % 3)==0: 
-                                    length_factor = round(1/exp((length)/(length_weight)),3) 
-                                    num_GC=len(findall('G',scrap))+len(findall('C',scrap)) 
-                                    score_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) 
-                                     
-                            elif (length % 3)!=0: 
-                                    length_factor = round(1/exp((length)/(length_weight)),3) 
-                                    num_GC=len(findall('G',scrap))+len(findall('C',scrap)) 
-                                    score_not_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2)) 
-     
-                            f2.write(seq[0:left_end]+'-'*length+seq[right_end:]+'\t'+scrap+'\t'+str(length)+'\t'+str(100*length_factor*((len(scrap)-num_GC)+(num_GC*2)))+'\n') 
-                    sum_score_3+=score_3 
-                    sum_score_not_3+=score_not_3 
-     
+    length_weight=20.0
+    left=30        # Insert the position expected to be broken.
+    right=len(seq)-int(left)
+    #print('length of seq = '+str(len(seq)))
+
+    file_temp=open(tmpfile1, "w")
+    for k in range(2,left)[::-1]:
+            for j in range(left,left+right-k+1):
+                    for i in range(0,left-k+1):
+                            if seq[i:i+k]==seq[j:j+k]:
+                                    length=j-i
+                                    file_temp.write(seq[i:i+k]+'\t'+str(i)+'\t'+str(i+k)+'\t'+str(j)+'\t'+str(j+k)+'\t'+str(length)+'\n')
+    file_temp.close()
+
+    ### After finding all microhomology patterns, duplicate patterns must be removed.
+    f1=open(tmpfile1, "r")
+    s1=f1.read()
+
+    f2=open(tmpfile2, "w") #After removing duplication
+    f2.write(seq+'\t'+'microhomology\t'+'deletion length\t'+'score of a pattern\n')
+
+    if s1!="":
+            list_f1=s1.strip().split('\n')
+            sum_score_3=0
+            sum_score_not_3=0
+
+            for i in range(len(list_f1)):
+                    n=0
+                    score_3=0
+                    score_not_3=0
+                    line=list_f1[i].split('\t')
+                    scrap=line[0]
+                    left_start=int(line[1])
+                    left_end=int(line[2])
+                    right_start=int(line[3])
+                    right_end=int(line[4])
+                    length=int(line[5])
+
+                    for j in range(i):
+                            line_ref=list_f1[j].split('\t')
+                            left_start_ref=int(line_ref[1])
+                            left_end_ref=int(line_ref[2])
+                            right_start_ref=int(line_ref[3])
+                            right_end_ref=int(line_ref[4])
+
+                            if (left_start >= left_start_ref) and (left_end <= left_end_ref) and (right_start >= right_start_ref) and (right_end <= right_end_ref):
+                                    if (left_start - left_start_ref)==(right_start - right_start_ref) and (left_end - left_end_ref)==(right_end - right_end_ref):
+                                            n+=1
+                            else: pass
+
+                    if n == 0:
+                            if (length % 3)==0:
+                                    length_factor = round(1/exp((length)/(length_weight)),3)
+                                    num_GC=len(findall('G',scrap))+len(findall('C',scrap))
+                                    score_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2))
+
+                            elif (length % 3)!=0:
+                                    length_factor = round(1/exp((length)/(length_weight)),3)
+                                    num_GC=len(findall('G',scrap))+len(findall('C',scrap))
+                                    score_not_3=100*length_factor*((len(scrap)-num_GC)+(num_GC*2))
+
+                            f2.write(seq[0:left_end]+'-'*length+seq[right_end:]+'\t'+scrap+'\t'+str(length)+'\t'+str(100*length_factor*((len(scrap)-num_GC)+(num_GC*2)))+'\n')
+                    sum_score_3+=score_3
+                    sum_score_not_3+=score_not_3
+
             mh_score = sum_score_3+sum_score_not_3
             oof_score = (sum_score_not_3)*100/(sum_score_3+sum_score_not_3)
             if verbose:
-                print 'Microhomology score = ' + str(mh_score) 
-                print 'Out-of-frame score = ' + str(oof_score) 
-    f1.close() 
+                print('Microhomology score = ' + str(mh_score))
+                print('Out-of-frame score = ' + str(oof_score))
+    f1.close()
     f2.close()
     return mh_score, oof_score
 
 if __name__ == '__main__':
-    seq='GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG'    # The length of sequence is recommend within 60~80 bases. 
+    seq='GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG'    # The length of sequence is recommend within 60~80 bases.
 
     tmpfile1 = "1.before removing duplication.txt"
     tmpfile2 = "2.all microhomology patterns.txt"
-    
+
     mh_score, oof_score = compute_score(seq, tmpfile1=tmpfile1, tmpfile2=tmpfile2, verbose=True)
 
-    # The row of output file is consist of (full sequence, microhomology scrap, deletion length, score of pattern). 
+    # Each row of the output file consists of (full sequence, microhomology scrap, deletion length, score of pattern).
 
     #correct output is
     #Microhomology score = 4662.9
     #Out-of-frame score = 50.7473889639
-    #GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG    
-    
-    print seq  
\ No newline at end of file
+    #GGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCG
+
+    print(seq)
diff --git a/azimuth/load_data.py b/azimuth/load_data.py
index d7245a0..06781c7 100755
--- a/azimuth/load_data.py
+++ b/azimuth/load_data.py
@@ -10,7 +10,7 @@
 
 def from_custom_file(data_file, learn_options):
     # use semantics of when we load V2 data
-    print "Loading inputs to predict from %s" % data_file
+    print("Loading inputs to predict from %s" % data_file)
     data = pandas.read_csv(data_file)
 
     mandatory_columns = ['30mer', 'Target gene', 'Percent Peptide', 'Amino Acid Cut position']
@@ -37,7 +37,7 @@ def from_custom_file(data_file, learn_options):
 def from_file(data_file, learn_options, data_file2=None, data_file3=None):
     if learn_options["V"] == 1:  # from Nature Biotech paper
 
-        print "loading V%d data" % learn_options["V"]
+        print("loading V%d data" % learn_options["V"])
 
         assert not learn_options["weighted"] is not None, "not supported for V1 data"
         annotations, gene_position, target_genes, Xdf, Y = read_V1_data(data_file, learn_options)
@@ -152,12 +152,12 @@ def read_V1_data(data_file, learn_options, AML_file=cur_dir + "/data/V1_suppl_da
     assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"
 
     if learn_options is not None and learn_options["flipV1target"]:
-        print "************************************************************************"
-        print "*****************MATCHING DOENCH CODE (DEBUG MODE)**********************"
-        print "************************************************************************"
+        print("************************************************************************")
+        print("*****************MATCHING DOENCH CODE (DEBUG MODE)**********************")
+        print("************************************************************************")
         # normally it is: Y['average threshold'] = Y['average rank'] > 0.8, where 1s are good guides, 0s are not
         Y['average threshold'] = Y['average rank'] < 0.2  # 1s are bad guides
-        print "press c to continue"
+        print("press c to continue")
         import ipdb
         ipdb.set_trace()
 
@@ -272,7 +272,7 @@ def read_V2_data(data_file, learn_options=None, verbose=True):
             count = count + Xtmp.shape[0]
             Xdf = pandas.concat([Xdf, Xtmp], axis=0)
             if verbose:
-                print "Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count)
+                print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count))
 
     # create new index that includes the drug
     Xdf = Xdf.set_index('drug', append=True)
@@ -335,7 +335,7 @@ def read_V2_data(data_file, learn_options=None, verbose=True):
     gene_position = util.impute_gene_position(gene_position)
 
     if learn_options is not None and learn_options["weighted"] == "variance":
-        print "computing weights from replicate variance..."
+        print("computing weights from replicate variance...")
         # compute the variance across replicates so can use it as a weight
         data = pandas.read_excel(data_file, sheetname="Normalized", skiprows=range(0, 6+1), index_col=[0, 4])
         data.index.names = ["Sequence", "Target gene"]
@@ -359,7 +359,7 @@ def read_V2_data(data_file, learn_options=None, verbose=True):
         orig_index = Y.index.copy()
         Y = pandas.merge(Y, pandas.DataFrame(variance), how="inner", left_index=True, right_index=True)
         Y = Y.ix[orig_index]
-        print "done."
+        print("done.")
 
     # Make sure to keep this check last in this function
     assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"
diff --git a/azimuth/local_multiprocessing.py b/azimuth/local_multiprocessing.py
index 3f4aa15..4d3ad10 100755
--- a/azimuth/local_multiprocessing.py
+++ b/azimuth/local_multiprocessing.py
@@ -21,10 +21,10 @@ def configure(num_jobs=8, TEST=False, subtract=0, num_proc=None, num_thread_per_
 
     try:
         import mkl
-        mkl.set_num_threads(num_thread_per_proc)    
+        mkl.set_num_threads(num_thread_per_proc)
     except ImportError:
-        print "MKL not available, so I'm not adjusting the number of threads"
+        print("MKL not available, so I'm not adjusting the number of threads")
 
-    print "Launching %d jobs with %d MKL threads each" % (num_jobs, num_thread_per_proc)
+    print("Launching %d jobs with %d MKL threads each" % (num_jobs, num_thread_per_proc))
 
     return num_jobs
diff --git a/azimuth/metrics.py b/azimuth/metrics.py
index ef95fba..50e7e31 100755
--- a/azimuth/metrics.py
+++ b/azimuth/metrics.py
@@ -255,26 +255,26 @@ def ndcg_at_k_ties(labels, predictions, k, method=0, normalize_from_below_too=Fa
     if isinstance(predictions, list):
         predictions = np.array(predictions)
 
-            
+
     assert len(labels.shape)==1 or np.min(labels.shape)==1, "should be 1D array or equivalent"
     assert len(predictions.shape)==1 or np.min(predictions.shape)==1, "should be 1D array or equivalent"
-        
+
     labels = labels.flatten()
     predictions = predictions.flatten()
 
     assert np.all(labels.shape == predictions.shape), "labels and predictions should have the same shape"
-        
+
     if k is None:
         k = len(labels)
 
     labels = labels.copy()
 
     dcg = dcg_at_k_ties(labels, predictions, k, method=method, theta=theta)
-        
+
     dcg_max = dcg_at_k_ties(labels, labels, k, method, theta=theta)
     # NOTE: I have checked that dcg_at_k_ties and dcg_at_k match when there are no ties, or ties in the labels
 
-    
+
     if normalize_from_below_too:
         dcg_min = dcg_at_k_ties(np.sort(labels)[::-1], np.sort(predictions), k, method, theta=theta)
     else:
@@ -282,9 +282,9 @@ def ndcg_at_k_ties(labels, predictions, k, method=0, normalize_from_below_too=Fa
     numerator = (dcg - dcg_min)
     assert numerator > -1e-5
     numerator = np.max((0, numerator))
-    ndcg = numerator / (dcg_max - dcg_min)            
+    ndcg = numerator / (dcg_max - dcg_min)
     assert ndcg <= 1.0 and ndcg >= 0.0, "ndcg=%f should be in [0,1]" % ndcg
-    if not dcg_max: 
+    if not dcg_max:
         ndcg = 0.
     return ndcg
 
@@ -357,7 +357,7 @@ def gain(label, method):
 
     dcg = dcg_helper(discount_factors, gain, k, labels, method, predictions)
     assert not np.isnan(dcg), "found nan dcg"
-    
+
     return dcg
 
 def get_discount_factors(num_labels, discount='log2', theta=None):
@@ -411,13 +411,13 @@ def ndcg_bootstrap_test(preds1, preds2, true_labels, num_bootstrap, method, k, n
     return pv
 
 def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, normalize_from_below_too, theta=None, balance_zeros=True):
-            
+
         # pVal is the probability that we would observe as big an AUC diff as we
-        # did if the ROC curves were drawn from the null hypothesis (which is that 
+        # did if the ROC curves were drawn from the null hypothesis (which is that
         # one model does not perform better than the other)
         #
-        # null hypothesis is that the prediction ranking are the same, so we exchange a random 
-        # number of them with each other. 
+        # null hypothesis is that the prediction rankings are the same, so we exchange a random
+        # number of them with each other.
         #
         # see ndcg_at_k_ties for all but the first four parameters
         #
@@ -425,7 +425,7 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm
         #
         # this is a two-sided test, but since it is a symmetric null distribution, one should
         # be able to divide the p-value by 2 to get the one-sided version (but think this through before using)
-        
+
         if isinstance(preds1, list):
             preds1 = np.array(preds1)
         else:
@@ -453,19 +453,19 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm
 
         ranks1 = sp.stats.mstats.rankdata(preds1)
         ranks2 = sp.stats.mstats.rankdata(preds2)
-        
+
         ndcg1 = ndcg_at_k_ties(true_labels, ranks1, k=k, method=method, normalize_from_below_too=normalize_from_below_too, theta=theta)
         ndcg2 = ndcg_at_k_ties(true_labels, ranks2, k=k, method=method, normalize_from_below_too=normalize_from_below_too, theta=theta)
 
         real_ndcg_diff = {}
         perm_ndcg_diff = {}
 
-        real_ndcg_diff = np.abs(ndcg1 - ndcg2)                
+        real_ndcg_diff = np.abs(ndcg1 - ndcg2)
         perm_ndcg_diff = np.nan*np.zeros(nperm)
-            
+
         if False:#np.all(preds1 == preds2):
-            pval[theta] = 1.0            
-        else:                    
+            pval[theta] = 1.0
+        else:
             zero_ind = true_labels == 0
             assert np.sum(zero_ind) < len(zero_ind), "balancing assumes there are more zeros than ones"
 
@@ -485,8 +485,8 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm
                     tmp_diff = np.abs(ndcg1_perm[theta] - ndcg2_perm[theta])
                     perm_ndcg_diff[theta][t] = tmp_diff
 
-            pval = {}            
-            
+            pval = {}
+
             num_stat_greater = np.max((((perm_ndcg_diff > real_ndcg_diff).sum() + 1), 1.0))
             pval = num_stat_greater / nperm
 
@@ -495,7 +495,7 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm
             plt.plot(np.sort(perm_ndcg_diff), '.')
             plt.plot(real_ndcg_diff*np.ones(perm_ndcg_diff.shape), 'k-')
             plt.show()
-                        
+
         return pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2
 
 if __name__ == "__main__":
@@ -506,35 +506,35 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm
 
     simulated_data = True
     permute_real_data = True
-        
+
     T = 1000
     allp = np.nan*np.ones(T)
 
     nperm = 100
-    #method = 4; normalize_from_below_too = True; 
-    
+    #method = 4; normalize_from_below_too = True;
+
     #theta_range = np.logspace(np.log10(0.01), np.log10(1.0), 3)  # Nicolo uses 10, so I grab the extremes and middle
     #theta_range = np.array([0.01])
-    #weights = np.logspace(np.log10(0.0001), np.log10(10), 3); 
+    #weights = np.logspace(np.log10(0.0001), np.log10(10), 3);
     #weights = np.array([100.0])
     weights = np.array([0.001])
     theta_range = weights# just to make life easier
 
-    
+
     # only for simulated data
     N = 100
     frac_zeros = 0
-    
+
     k = None
 
     allp = np.nan*np.zeros((len(theta_range) + 1, T))
 
     if not simulated_data:
-        print "loading up saved data..." # two-fold CV data from CRISPR off-target GUIDE-SEQ
+        print("loading up saved data...") # two-fold CV data from CRISPR off-target GUIDE-SEQ
         with open(r'\\nerds5\kevin\from_nicolo\gs.pickle','rb') as f:  predictions, truth_all = pickle.load(f)
-        print "done."
+        print("done.")
         N = len(truth_all[0])
-            
+
     for t in range(T):
 
         # totally simulated
@@ -544,17 +544,17 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm
             truth[zero_ind] = 0
             pred1 = np.random.rand(N)
             pred2 = np.random.rand(N)
-        else:                                
+        else:
             fold = 0
             truth = truth_all[fold]
             pred1 = predictions["CFD"][fold]
             pred2 = predictions["product"][fold]
-                        
+
             if permute_real_data:
                 truth = np.random.permutation(truth)
 
-        t0 = time.time()        
-        #pval, real_ndcg_diff,  perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred2, truth, nperm, method, k, normalize_from_below_too, theta_range=theta_range)        
+        t0 = time.time()
+        #pval, real_ndcg_diff,  perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred2, truth, nperm, method, k, normalize_from_below_too, theta_range=theta_range)
         for i, w in enumerate(weights):
             weights_array = truth.copy()
             weights_array += w
@@ -562,26 +562,26 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm
             #corr0 = elevation.metrics.spearman_weighted(truth, pred1, w=weights_array)
             #corr1 = elevation.metrics.spearman_weighted(truth, pred2, w=weights_array)
             #corr01 = elevation.metrics.spearman_weighted(pred1, pred2, w=weights_array)
-            #n0 = len(truth)        
+            #n0 = len(truth)
             #t2, pvaltmp = corrstats.dependent_corr(corr0, corr1, corr01, n0, twotailed=True, method="steiger")
 
             pvaltmp, real_corr_diff, perm_corr_diff, corr1, corr2 = elevation.spearman_weighted_swap_perm_test(pred1, pred2, truth, nperm, weights_array)
-                                                        
+
             allp[i, t] = pvaltmp
             t1 = time.time()
 
         #for i, theta in enumerate(theta_range.tolist() + ["all"]):
-        #    print "%d, theta=%s) ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f, elapsed time=%f minutes, smallest_p=%f" % (t, str(theta), ndcg1[theta], ndcg2[theta], real_ndcg_diff[theta], pval[theta], (t1-t0)/60, 1.0/nperm)        
+        #    print("%d, theta=%s) ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f, elapsed time=%f minutes, smallest_p=%f" % (t, str(theta), ndcg1[theta], ndcg2[theta], real_ndcg_diff[theta], pval[theta], (t1-t0)/60, 1.0/nperm))
         #    allp[i, t] = pval[theta]
-        #print "---------------"
-        
+        #print("---------------")
+
     #for i, theta in enumerate(theta_range.tolist() + ["all"]):
     for i, theta in enumerate(theta_range.tolist()):
         #mytitle = "Norm. hist p-values nDCG\n %d null samples, w %d perm and N=%d, theta=%s" % (T, nperm, N, str(theta))
         mytitle = "Norm. hist p-values Steiger w weighted Spearman\n %d null samples, N=%d, weight=%s" % (T, N, str(theta))
         ut.qqplotp(allp[i,:], dohist=True, numbins=10, figsize=[6,6], title=mytitle, markersize=5)
         plt.show()
-    
+
     #save_tmp_results = r'D:\Source\CRISPR\elevation\pickles\tmp.ndcg.stat.calibration.p'
     #pickle.dump([theta_range, allp, pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2], open(save_tmp_results, "wb" ))
     #[theta_range, allp, pval, real_ndcg_diff, perm_ndcg_diff, ndcg1, ndcg2] = pickle.load(open(save_tmp_results, "rb" ))
@@ -606,10 +606,10 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm
     #     # using our function
     #     dcg3 = dcg_at_k_ties(labels,predictions,k)
 
-    #     print "%f, %f, %f" % (dcg1, dcg2, dcg3)
+    #     print("%f, %f, %f" % (dcg1, dcg2, dcg3))
     #     assert(np.abs(dcg1 - dcg2) < 1e-8)
     #     assert(np.abs(dcg2 - dcg3) < 1e-8)
-    # print "check out ok for case with all ties in predictions"
+    # print("check out ok for case with all ties in predictions")
 
     truth = np.array([3, 4, 2, 1, 0, 0, 0])
     pred1 = np.array([3, 4, 2, 1, 0, 0, 0])
@@ -626,29 +626,29 @@ def ndcg_at_k_swap_perm_test(preds1, preds2, true_labels, nperm, method, k, norm
     k = len(pred3)
 
     pval, real_ndcg_diff,  perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred2, truth, nperm, method, k, normalize_from_below_too, theta=theta)
-    print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)
-    
-    pval, real_ndcg_diff,  perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred1, truth, nperm, method, k, normalize_from_below_too, theta=theta)    
-    print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)
+    print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval))
+
+    pval, real_ndcg_diff,  perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred1, truth, nperm, method, k, normalize_from_below_too, theta=theta)
+    print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval))
 
-    pval, real_ndcg_diff,  perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred4, truth, nperm, method, k, normalize_from_below_too, theta=theta)    
-    print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)
+    pval, real_ndcg_diff,  perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred4, truth, nperm, method, k, normalize_from_below_too, theta=theta)
+    print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval))
 
-    pval, real_ndcg_diff,  perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred5, truth, nperm, method, k, normalize_from_below_too, theta=theta)    
-    print "ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval)
+    pval, real_ndcg_diff,  perm_ndcg_diff, ndcg1, ndcg2 = ndcg_at_k_swap_perm_test(pred1, pred5, truth, nperm, method, k, normalize_from_below_too, theta=theta)
+    print("ndcg1=%f, ndcg2=%f, ndcg_diff=%f, p=%f" % (ndcg1, ndcg2, real_ndcg_diff, pval))
 
     import ipdb; ipdb.set_trace()
 
 
-    #print ndcg_at_k_ties(truth, truth, k, method=0, normalize_from_below_too=True)
-    #print ndcg_at_k_ties(truth, pred2, k, method=0, normalize_from_below_too=True)
-    #print ndcg_at_k_ties(truth, pred3, k, method=0, normalize_from_below_too=True)
-    #print ndcg_at_k_ties(truth3, pred3, k, method=3, normalize_from_below_too=True)
-    print ndcg_at_k_ties(truth4, pred2, k, method=3, normalize_from_below_too=True)
-        
-    print ndcg_alt(truth[np.argsort(pred2)[::-1]], 5)
-    print ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=1)
-    print ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=0)
+    #print(ndcg_at_k_ties(truth, truth, k, method=0, normalize_from_below_too=True))
+    #print(ndcg_at_k_ties(truth, pred2, k, method=0, normalize_from_below_too=True))
+    #print(ndcg_at_k_ties(truth, pred3, k, method=0, normalize_from_below_too=True))
+    #print(ndcg_at_k_ties(truth3, pred3, k, method=3, normalize_from_below_too=True))
+    print(ndcg_at_k_ties(truth4, pred2, k, method=3, normalize_from_below_too=True))
+
+    print(ndcg_alt(truth[np.argsort(pred2)[::-1]], 5))
+    print(ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=1))
+    print(ndcg_at_k(truth[np.argsort(pred2)[::-1]], 5, method=0))
 
-    print ndcg_at_k_ties(truth, pred2, 5, method=1)
-    print ndcg_at_k_ties(truth, pred2, 5, method=0)
+    print(ndcg_at_k_ties(truth, pred2, 5, method=1))
+    print(ndcg_at_k_ties(truth, pred2, 5, method=0))
diff --git a/azimuth/model_comparison.py b/azimuth/model_comparison.py
index c5fbf29..d3b0ee8 100755
--- a/azimuth/model_comparison.py
+++ b/azimuth/model_comparison.py
@@ -266,7 +266,7 @@ def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=Tru
         learn_options["order"] = 1
 
     if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True:
-        print "WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)"
+        print("WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)")
         for i in range(Xdf.shape[0]):
             Xdf['30mer'].iloc[i] = azimuth.util.convert_to_thirty_one(Xdf.iloc[i]["30mer"], Xdf.index.values[i][1], Xdf.iloc[i]["Strand"])
         # to_keep = Xdf['30mer'].isnull() == False
@@ -307,7 +307,7 @@ def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_
                          "logregL1": "logregL1", "sgrna_from_doench":"sgrna_from_doench", 'SVC': 'SVC', 'xu_et_al': 'xu_et_al'}
 
     if not CV:
-        print "Received option CV=False, so I'm training using all of the data"
+        print("Received option CV=False, so I'm training using all of the data")
         assert len(learn_options_set.keys()) == 1, "when CV is False, only 1 set of learn options is allowed"
         assert len(models) == 1, "when CV is False, only 1 model is allowed"
 
@@ -320,10 +320,10 @@ def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_
             # models requiring explicit featurization
             if model in feat_models_short.keys():
                 for order in orders:
-                    print "running %s, order %d for %s" % (model, order, learn_options_str)
+                    print("running %s, order %d for %s" % (model, order, learn_options_str))
 
                     Y, feature_sets, target_genes, learn_options, num_proc = setup_function(test=test, order=order, learn_options=partial_learn_opt, pam_audit=pam_audit, length_audit=length_audit) # TODO precompute features for all orders, as this is repated for each model
-                    
+
                     if model == 'L1':
                         learn_options_model = L1_setup(copy.deepcopy(learn_options), set_target_fn=set_target_fn)
                     elif model == 'L2':
@@ -359,7 +359,7 @@ def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_
             # if the model doesn't require explicit featurization
             else:
                 assert setup_fn==setup, "not yet modified to handle this"
-                print "running %s for %s" % (model, learn_options_str)
+                print("running %s for %s" % (model, learn_options_str))
                 Y, feature_sets, target_genes, learn_options, num_proc = setup(test=test, order=1, learn_options=partial_learn_opt, pam_audit=pam_audit, length_audit=length_audit)
                 if model == 'mean':
                     learn_options_model = mean_setup(copy.deepcopy(learn_options))
@@ -392,12 +392,12 @@ def pickle_runner_results(exp_name, results, all_learn_options, relpath="/../" +
     dname = os.path.dirname(abspath) + relpath
     if not os.path.exists(dname):
         os.makedirs(dname)
-        print "Created directory: %s" % str(dname)
+        print("Created directory: %s" % str(dname))
     if exp_name is None:
         exp_name = results.keys()[0]
     myfile = dname+'/'+ exp_name + '.pickle'
     with open(myfile, 'wb') as f:
-        print "writing results to %s" % myfile
+        print("writing results to %s" % myfile)
         pickle.dump((results, all_learn_options), f, -1)
 
 def runner(models, learn_options, GP_likelihoods=None, orders=None, WD_kernel_degrees=None, where='local', cluster_user='fusi', cluster='RR1-N13-09-H44', test=False, exp_name = None, **kwargs):
@@ -550,7 +550,7 @@ def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None,
             model, learn_options = pickle.load(f)
     else:
         model, learn_options = model
-        
+
     learn_options["V"] = 2
 
     learn_options = override_learn_options(learn_options_override, learn_options)
@@ -567,12 +567,12 @@ def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None,
 
     feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit=pam_audit, length_audit=length_audit)
     inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets)
-    
-    #print "CRISPR"
+
+    #print("CRISPR")
     #pandas.DataFrame(inputs).to_csv("CRISPR.inputs.test.csv")
     #import ipdb; ipdb.set_trace()
 
-    # call to scikit-learn, returns a vector of predicted values    
+    # call to scikit-learn, returns a vector of predicted values
     preds = model.predict(inputs)
 
     # also check that predictions are not 0/1 from a classifier.predict() (instead of predict_proba() or decision_function())
@@ -609,7 +609,7 @@ def write_results(predictions, file_to_predict):
     data = pandas.read_csv(file_to_predict)
     data['predictions'] = predictions
     data.to_csv(newfile)
-    print "wrote results to %s" % newfile
+    print("wrote results to %s" % newfile)
     return data, newfile
 
 if __name__ == '__main__':
diff --git a/azimuth/models/DNN.py b/azimuth/models/DNN.py
index c01fe1a..eaa44b1 100755
--- a/azimuth/models/DNN.py
+++ b/azimuth/models/DNN.py
@@ -64,11 +64,11 @@ def DNN_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_optio
             if best_score is None or accuracies[i, j] > best_score:
                 best_score = accuracies[i, j]
                 best_model = copy.deepcopy(e)
-                print "DNN with %d hidden layers and %d units, accuracy: %.4f   *" % (hl, nu, accuracies[i,j])
+                print("DNN with %d hidden layers and %d units, accuracy: %.4f   *" % (hl, nu, accuracies[i,j]))
             else:
-                print "DNN with %d hidden layers and %d units, accuracy: %.4f" % (hl, nu, accuracies[i,j])
+                print("DNN with %d hidden layers and %d units, accuracy: %.4f" % (hl, nu, accuracies[i,j]))
 
     best_model.run((X_train, y_train), (X_test, y_test))
     y_pred = best_model.network.predict(X[test])
 
-    return y_pred, None
\ No newline at end of file
+    return y_pred, None
diff --git a/azimuth/models/GP.py b/azimuth/models/GP.py
index 7d53520..69c19c0 100755
--- a/azimuth/models/GP.py
+++ b/azimuth/models/GP.py
@@ -100,8 +100,8 @@ def gp_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_o
         plt.figure('kernel')
         plt.title('kernel')
         plt.imshow(m.kern.K(X,X))
-        print m
-        print "%.3f variance explained" % (m.Gaussian_noise.variance/y[train].var())
+        print(m)
+        print("%.3f variance explained" % (m.Gaussian_noise.variance/y[train].var()))
         import ipdb; ipdb.set_trace()
         plt.close('all')
     else:
diff --git a/azimuth/models/baselines.py b/azimuth/models/baselines.py
index 0ad1512..4ee1690 100755
--- a/azimuth/models/baselines.py
+++ b/azimuth/models/baselines.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import numpy as np
 import sklearn
 from sklearn.svm import LinearSVC
@@ -62,11 +63,11 @@ def doench_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op
             assert np.nan not in tpr, "found nan tpr"
             roc_auc = sklearn.metrics.auc(fpr, tpr)
             if verbose:
-                print j, i, roc_auc
+                print(j, i, roc_auc)
             cv_results[j][i] = roc_auc
 
     best_penalty = penalty[np.argmax(np.mean(cv_results, axis=0))]
-    print "best AUC for penalty: ", np.median(cv_results, axis=0)
+    print("best AUC for penalty: ", np.median(cv_results, axis=0))
     clf = LinearSVC(penalty='l1', C=best_penalty, dual=False, class_weight=auto_class_weight)
     clf.fit(X[train], y_bin[train].flatten())
     non_zero_coeff = (clf.coef_ != 0.0)
@@ -92,6 +93,3 @@ def SVC_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_optio
     #y_pred = clf.predict(X[test])[:, None] # this returns 0/1
     y_pred = clf.decision_function(X[test])[:, None]
     return y_pred, clf
-
-
-    
diff --git a/azimuth/models/ensembles.py b/azimuth/models/ensembles.py
index 739c4e0..424e623 100755
--- a/azimuth/models/ensembles.py
+++ b/azimuth/models/ensembles.py
@@ -65,7 +65,7 @@ def adaboost_scoring_bo(params):
                          'max_features': hp.uniform('max_features', 0.05, 1.0)}
 
                 best = fmin(adaboost_scoring_bo, space, algo=tpe.suggest, max_evals=50, verbose=1)
-                print best
+                print(best)
                 clf = en.GradientBoostingRegressor(n_estimators=learn_options['adaboost_n_estimators'],
                                                    learning_rate=best['learning_rate'],
                                                    max_depth=best['max_depth'],
@@ -77,7 +77,7 @@ def adaboost_scoring_bo(params):
                  assert not classification, "need to tweak code below to do classificaton, as above"
                  n_jobs = 20
 
-                 print "Adaboost with GridSearch"
+                 print("Adaboost with GridSearch")
                  from sklearn.grid_search import GridSearchCV
                  param_grid = {'learning_rate': [0.1, 0.05, 0.01],
                               'max_depth': [4, 5, 6, 7],
@@ -101,7 +101,7 @@ def adaboost_scoring_bo(params):
                  est = en.GradientBoostingRegressor(loss=learn_options['adaboost_loss'], random_state=learn_options['seed'])#, n_estimators=learn_options['adaboost_n_estimators'])
                  clf = GridSearchCV(est, param_grid, n_jobs=n_jobs, verbose=1, cv=cv, scoring=spearman_scoring, iid=False)
                  clf.fit(X[train], y[train].flatten())
-                 print clf.best_params_
+                 print(clf.best_params_)
             else:
                 raise Exception("if using adaboost_CV then need to specify grid (grid search) or bo (bayesian optimization)")
 
diff --git a/azimuth/models/regression.py b/azimuth/models/regression.py
index c68e682..df9f2ae 100755
--- a/azimuth/models/regression.py
+++ b/azimuth/models/regression.py
@@ -44,23 +44,23 @@ def logreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op
     assert len(np.unique(y)) <= 2, "if using logreg need binary targets"
     assert learn_options["weighted"] is None, "cannot do weighted Log reg"
     assert learn_options['feature_select'] is False, "cannot do feature selection yet in logistic regression--see linreg_on_fold to implement"
-    
+
     cv, n_folds = set_up_inner_folds(learn_options, y_all.iloc[train])
 
     assert learn_options['penalty'] == "L1" or learn_options['penalty'] == "L2", "can only use L1 or L2 with logistic regression"
-    
+
     tol = 0.00001#0.0001
-    
+
     performance = np.zeros((len(learn_options["alpha"]), 1))
     # degenerate_pred = np.zeros((len(learn_options["alpha"])))
     for train_inner, test_inner in cv:
         for i, alpha in enumerate(learn_options["alpha"]):
             clf = sklearn.linear_model.LogisticRegression(penalty=learn_options['penalty'].lower(), dual=False, fit_intercept=learn_options["fit_intercept"], class_weight=learn_options["class_weight"], tol=tol, C=1.0/alpha)
-            
+
             clf.fit(X[train][train_inner], y[train][train_inner].flatten())
             #tmp_pred = clf.predict(X[train][test_inner])
             tmp_pred = clf.predict_proba(X[train][test_inner])[:,1]
-            
+
             if learn_options["training_metric"] == "AUC":
                 fpr, tpr, _ = roc_curve(y_all[learn_options["ground_truth_label"]][train][test_inner], tmp_pred)
                 assert ~np.any(np.isnan(fpr)), "found nan fpr"
@@ -85,32 +85,32 @@ def logreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op
     if not isinstance(best_alpha, numbers.Number):
         raise Exception("best_alpha must be a number but is %s" % type(best_alpha))
 
-    print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])
+    print("\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]]))
     max_perf = np.nanmax(performance)
 
     if max_perf < 0.0:
         raise Exception("performance is negative")
 
-    print "\t\tbest performance is %f" % np.nanmax(performance)
+    print("\t\tbest performance is %f" % np.nanmax(performance))
 
     clf = sklearn.linear_model.LogisticRegression(penalty=learn_options['penalty'],
                                                   dual=False, fit_intercept=learn_options["fit_intercept"],             class_weight=learn_options["class_weight"], tol=tol, C=1.0/best_alpha)
     clf.fit(X[train], y[train].flatten())
 
-    # debugging check that get samed paramter estimation when have no regularization and use 
+    # debugging check that we get the same parameter estimates when we have no regularization and use
     # either data with only that feature on, or all data), AND WITH NO INTERCEPT
-    if False:        
-        # grab only feature "GA3"        
+    if False:
+        # grab only feature "GA3"
         keep_ind = np.where(feature_sets['mutletpos'].columns=="GA3")[0]
-        print "%s, %s" % (str(clf.intercept_ ), str(clf.coef_[0, keep_ind]))
+        print("%s, %s" % (str(clf.intercept_ ), str(clf.coef_[0, keep_ind])))
         clf.fit(X[train][:,keep_ind], y[train].flatten())
-        print "%s, %s" % (str(clf.intercept_ ), str(clf.coef_))
-        import ipdb; ipdb.set_trace()               
+        print("%s, %s" % (str(clf.intercept_ ), str(clf.coef_)))
+        import ipdb; ipdb.set_trace()
+
 
-    
     #y_pred = clf.predict(X[test])
     y_pred = clf.predict_proba(X[test])[:,1]
-    y_pred = y_pred[:, None]    
+    y_pred = y_pred[:, None]
     #fpr, tpr, _ = roc_curve(y, y_pred); tmp_auc = auc(fpr, tpr)
     #import ipdb; ipdb.set_trace()
     return y_pred, clf
@@ -124,7 +124,7 @@ def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op
 
     if learn_options["weighted"] is not None and (learn_options["penalty"] != "L2" or learn_options["method"] != "linreg"):
         raise NotImplementedError("weighted prediction not implemented for any methods by L2 at the moment")
-        
+
     if not learn_options.has_key("fit_intercept"):
         learn_options["fit_intercept"] = True
     if not learn_options.has_key('normalize_features'):
@@ -194,16 +194,16 @@ def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op
 
     best_alpha, best_l1r = learn_options["alpha"][max_score_ind[0]], l1_ratio[max_score_ind[1]]
 
-    print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])
-    
+    print("\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]]))
+
     if learn_options['penalty'] == "EN":
-        print "\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]])
+        print("\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]]))
     max_perf = np.nanmax(performance)
 
     if max_perf < 0.0:
         raise Exception("performance is negative")
 
-    print "\t\tbest performance is %f" % max_perf
+    print("\t\tbest performance is %f" % max_perf)
 
     clf = train_linreg_model(best_alpha, l1r, learn_options, train, X, y, y_all)
     if learn_options["feature_select"]:
@@ -214,7 +214,7 @@ def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_op
 
     if learn_options["penalty"] != "L2":
         y_pred = y_pred[:, None]
-            
+
     return y_pred, clf
 
 
@@ -267,16 +267,16 @@ def get_weights(learn_options, fold, y, y_all):
     return weights
 
 
-def set_up_inner_folds(learn_options, y):            
+def set_up_inner_folds(learn_options, y):
     label_encoder = sklearn.preprocessing.LabelEncoder()
-    label_encoder.fit(y['Target gene'].values)    
+    label_encoder.fit(y['Target gene'].values)
     gene_classes = label_encoder.transform(y['Target gene'].values)
-    n_genes = len(np.unique(gene_classes))    
+    n_genes = len(np.unique(gene_classes))
     if learn_options['ignore_gene_level_for_inner_loop'] or learn_options["cv"] == "stratified" or n_genes==1:
         if 'n_folds' not in learn_options.keys():
             n_folds = len(np.unique(gene_classes))
         else:
-            n_folds = learn_options['n_folds']        
+            n_folds = learn_options['n_folds']
         cv = sklearn.cross_validation.StratifiedKFold(gene_classes, n_folds=n_folds, shuffle=True)
     elif learn_options["cv"] == "gene":
         gene_list = np.unique(y['Target gene'].values)
diff --git a/azimuth/predict.py b/azimuth/predict.py
index b116983..2978fdc 100755
--- a/azimuth/predict.py
+++ b/azimuth/predict.py
@@ -65,26 +65,26 @@ def construct_filename(learn_options, TEST):
     elif learn_options["training_metric"] == 'spearmanr':
         filename += ".spearman"
 
-    print "filename = %s" % filename
+    print("filename = %s" % filename)
     return filename
 
 def print_summary(global_metric, results, learn_options, feature_sets, flags):
-    print "\nSummary:"
-    print learn_options
-    print "\t\tglobal %s=%.2f" % (learn_options['metric'], global_metric)
-    print "\t\tmedian %s across folds=%.2f" % (learn_options['metric'], np.median(results[0]))
-    print "\t\torder=%d" % learn_options["order"]
+    print("\nSummary:")
+    print(learn_options)
+    print("\t\tglobal %s=%.2f" % (learn_options['metric'], global_metric))
+    print("\t\tmedian %s across folds=%.2f" % (learn_options['metric'], np.median(results[0])))
+    print("\t\torder=%d" % learn_options["order"])
     if learn_options.has_key('kerntype'): "\t\tkern type = %s" % learn_options['kerntype']
-    if learn_options.has_key('degree'): print "\t\tdegree=%d" % learn_options['degree']
-    print "\t\ttarget_name=%s" % learn_options["target_name"]
+    if learn_options.has_key('degree'): print("\t\tdegree=%d" % learn_options['degree'])
+    print("\t\ttarget_name=%s" % learn_options["target_name"])
 
     for k in flags.keys():
-        print '\t\t' + k + '=' + str(learn_options[k])
+        print('\t\t' + k + '=' + str(learn_options[k]))
 
-    print "\t\tfeature set:"
+    print("\t\tfeature set:")
     for set in feature_sets.keys():
-        print "\t\t\t%s" % set
-    print "\t\ttotal # features=%d" % results[4]
+        print("\t\t\t%s" % set)
+    print("\t\ttotal # features=%d" % results[4])
 
 def extract_fpr_tpr_for_fold(aucs, fold, i, predictions, truth, y_binary, test, y_pred):
     assert len(np.unique(y_binary))<=2, "if using AUC need binary targets"
@@ -136,7 +136,7 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge
     When CV=False, it trains on everything (and tests on everything, just to fit the code)
     '''
 
-    print "range of y_all is [%f, %f]" % (np.min(y_all[learn_options['target_name']].values), np.max(y_all[learn_options['target_name']].values))
+    print("range of y_all is [%f, %f]" % (np.min(y_all[learn_options['target_name']].values), np.max(y_all[learn_options['target_name']].values)))
 
     allowed_methods = ["GPy", "linreg", "AdaBoostRegressor", "AdaBoostClassifier",
                        "DecisionTreeRegressor", "RandomForestRegressor",
@@ -149,7 +149,7 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge
     # construct filename from options
     filename = construct_filename(learn_options, TEST)
 
-    print "Cross-validating genes..."
+    print("Cross-validating genes...")
     t2 = time.time()
 
     y = np.array(y_all[learn_options["target_name"]].values[:,None],dtype=np.float64)
@@ -219,8 +219,8 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge
                 if learn_options['num_genes_remove_train']==0:
                     assert np.all(cv_i_orig[0]==cv[i][0])
                     assert np.all(cv_i_orig[1]==cv[i][1])
-                print "# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0]))
-                print "# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1]))
+                print("# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0])))
+                print("# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1])))
     else:
         raise Exception("invalid cv options given: %s" % learn_options["cv"])
 
@@ -240,12 +240,12 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge
     num_proc = learn_options["num_proc"]
     if num_proc > 1:
         num_proc = np.min([num_proc,len(cv)])
-        print "using multiprocessing with %d procs--one for each fold" % num_proc
+        print("using multiprocessing with %d procs--one for each fold" % num_proc)
         jobs = []
         pool = multiprocessing.Pool(processes=num_proc)
         for i,fold in enumerate(cv):
             train,test = fold
-            print "working on fold %d of %d, with %d train and %d test" % (i, len(cv), len(train), len(test))
+            print("working on fold %d of %d, with %d train and %d test" % (i, len(cv), len(train), len(test)))
             if learn_options["method"]=="GPy":
                 job = pool.apply_async(azimuth.models.GP.gp_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
             elif learn_options["method"]=="linreg":
@@ -351,15 +351,15 @@ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_ge
 
             truth, predictions = fill_in_truth_and_predictions(truth, predictions, fold_labels[i], y_all, y_pred, learn_options, test)
 
-            print "\t\tRMSE: ", np.sqrt(((y_pred - y[test])**2).mean())
-            print "\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0]
-            print "\t\tfinished fold/gene %i of %i" % (i+1, len(fold_labels))
+            print("\t\tRMSE: ", np.sqrt(((y_pred - y[test])**2).mean()))
+            print("\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0])
+            print("\t\tfinished fold/gene %i of %i" % (i+1, len(fold_labels)))
 
 
     cv_median_metric =[np.median(metrics)]
     gene_pred = [(truth, predictions)]
-    print "\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1])
+    print("\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1]))
 
     t3 = time.time()
-    print "\t\tElapsed time for cv is %.2f seconds" % (t3-t2)
+    print("\t\tElapsed time for cv is %.2f seconds" % (t3-t2))
     return metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names
diff --git a/azimuth/util.py b/azimuth/util.py
index 1bd6dcb..b3b252c 100755
--- a/azimuth/util.py
+++ b/azimuth/util.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import pandas
 import matplotlib.pylab as plt
 import pylab as pl # so can just grab qqplotting code from fastlmm directly
@@ -26,6 +27,7 @@
 import pandas as pd
 import corrstats
 
+
 def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None,fixaxes=True,addlambda=True,minpval=1e-20,title=None,h1=None,figsize=[5,5],grid=True, markersize=2):
     '''
     performs a P-value QQ-plot in -log10(P-value) space
@@ -46,7 +48,7 @@ def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=N
         grid        boolean: use a grid? (default: True)
     Returns:   fighandle, qnull, qemp
     -----------------------------------------------------------------------
-    '''    
+    '''
     distr = 'log10'
     import pylab as pl
     if type(pvals)==list:
@@ -57,20 +59,20 @@ def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=N
         legendlist=legend
     else:
         legendlist = [legend]
-    
+
     if h1 is None:
-        h1=pl.figure(figsize=figsize) 
-    
+        h1=pl.figure(figsize=figsize)
+
     pl.grid(b=grid, alpha = 0.5)
-         
+
     maxval = 0
 
-    for i in xrange(len(pvallist)):        
+    for i in xrange(len(pvallist)):
         pval =pvallist[i].flatten()
         M = pval.shape[0]
         pnull = (0.5 + sp.arange(M))/M
         # pnull = np.sort(np.random.uniform(size = tests))
-                
+
         pval[pval<minpval]=minpval
         pval[pval>=1]=1
 
@@ -81,31 +83,31 @@ def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=N
             yl = '$\chi^2$ quantiles'
 
         if distr == 'log10':
-            qnull = -sp.log10(pnull)            
+            qnull = -sp.log10(pnull)
             qemp = -sp.log10(sp.sort(pval)) #sorts the object, returns nothing
             xl = '-log10(P) observed'
             yl = '-log10(P) expected'
         if not (sp.isreal(qemp)).all(): raise Exception("imaginary qemp found")
         if qnull.max>maxval:
-            maxval = qnull.max()                
+            maxval = qnull.max()
         pl.plot(qnull, qemp, '.', markersize=markersize)
-        #pl.plot([0,qemp.max()], [0,qemp.max()],'r')        
+        #pl.plot([0,qemp.max()], [0,qemp.max()],'r')
         if addlambda:
             lambda_gc = estimate_lambda(pval)
-            print "lambda=%1.4f" % lambda_gc
-            #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2)   
+            print("lambda=%1.4f" % lambda_gc)
+            #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2)
             # if there's only one method, just print the lambda
             if len(pvallist) == 1:
-                legendlist=["$\lambda_{GC}=$%1.4f" % lambda_gc]   
+                legendlist=["$\lambda_{GC}=$%1.4f" % lambda_gc]
             # otherwise add it at the end of the name
             else:
                 legendlist[i] = legendlist[i] + " ($\lambda_{GC}=$%1.4f)" % lambda_gc
 
-    addqqplotinfo(qnull,M,xl,yl,xlim,ylim,alphalevel,legendlist,fixaxes)  
-    
+    addqqplotinfo(qnull,M,xl,yl,xlim,ylim,alphalevel,legendlist,fixaxes)
+
     if title is not None:
-        pl.title(title)            
-    
+        pl.title(title)
+
     if fileout is not None:
         pl.savefig(fileout)
 
@@ -116,20 +118,20 @@ def qqplotp(pv,fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None
      '''
      Read in p-values from filein and make a qqplot adn histogram.
      If fileout is provided, saves the qqplot only at present.
-     Searches through p until one is found.   '''       
-     
-     import pylab as pl     
-     pl.ion()     
-     
-     fs=8     
+     Searches through p until one is found.   '''
+
+     import pylab as pl
+     pl.ion()
+
+     fs=8
      h1=qqplot(pv, fileout, alphalevel,legend,xlim,ylim,addlambda=True, figsize=figsize, markersize=markersize)
      #lambda_gc=estimate_lambda(pv)
-     #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2)     
+     #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2)
      pl.title(title,fontsize=fs)
-     
+
      wm=pl.get_current_fig_manager()
      #e.g. "652x526+100+10
-     xcoord=100     
+     xcoord=100
      #wm.window.wm_geometry(plotsize + "+" + str(xcoord) + "+" + str(ycoord))
 
      if dohist:
@@ -144,7 +146,7 @@ def qqplotp(pv,fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None
 
      return h1,h2
 
-def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=None,ylim=None,alphalevel=0.05,legendlist=None,fixaxes=False):    
+def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=None,ylim=None,alphalevel=0.05,legendlist=None,fixaxes=False):
     distr='log10'
     pl.plot([0,qnull.max()], [0,qnull.max()],'k')
     pl.ylabel(xl)
@@ -152,7 +154,7 @@ def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=N
     if xlim is not None:
         pl.xlim(xlim)
     if ylim is not None:
-        pl.ylim(ylim)        
+        pl.ylim(ylim)
     if alphalevel is not None:
         if distr == 'log10':
             betaUp, betaDown, theoreticalPvals = _qqplot_bar(M=M,alphalevel=alphalevel,distr=distr)
@@ -168,7 +170,7 @@ def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=N
             lo.set_markersize(10)
 
     if fixaxes:
-        fix_axes()        
+        fix_axes()
 
 def _qqplot_bar(M=1000000, alphalevel = 0.05,distr = 'log10'):
     '''
@@ -214,8 +216,8 @@ def _qqplot_bar(M=1000000, alphalevel = 0.05,distr = 'log10'):
 def fix_axes(buffer=0.1):
     '''
     Makes x and y max the same, and the lower limits 0.
-    '''    
-    maxlim=max(pl.xlim()[1],pl.ylim()[1])    
+    '''
+    maxlim=max(pl.xlim()[1],pl.ylim()[1])
     pl.xlim([0-buffer,maxlim+buffer])
     pl.ylim([0-buffer,maxlim+buffer])
 
@@ -232,13 +234,13 @@ def estimate_lambda(pv):
     L = (LOD2/0.456)
     return L
 
-     
-def pvalhist(pv,numbins=50,linewidth=3.0,linespec='--r', figsize=[5,5]):    
+
+def pvalhist(pv,numbins=50,linewidth=3.0,linespec='--r', figsize=[5,5]):
     '''
     Plots normalized histogram, plus theoretical null-only line.
-    '''    
-    h2=pl.figure(figsize=figsize)      
-    [nn,bins,patches]=pl.hist(pv,numbins,normed=True)    
+    '''
+    h2=pl.figure(figsize=figsize)
+    [nn,bins,patches]=pl.hist(pv,numbins,normed=True)
     pl.plot([0, 1],[1,1],linespec,linewidth=linewidth)
 
 
@@ -291,7 +293,7 @@ def guide_positional_features(guide_seq, gene, strand):
         guide_seq = guide_seq.reverse_complement()
     ind = gene_seq.find(guide_seq)
     if ind ==-1:
-        print "returning None, could not find guide %s in gene %s" % (guide_seq, gene)
+        print("returning None, could not find guide %s in gene %s" % (guide_seq, gene))
         return ""
     assert gene_seq[ind:(ind+len(guide_seq))]==guide_seq, "match not right"
     ## now get what we want from this:
@@ -310,7 +312,7 @@ def convert_to_thirty_one(guide_seq, gene, strand):
         guide_seq = guide_seq.reverse_complement()
     ind = gene_seq.find(guide_seq)
     if ind ==-1:
-        print "returning sequence+'A', could not find guide %s in gene %s" % (guide_seq, gene)
+        print("returning sequence+'A', could not find guide %s in gene %s" % (guide_seq, gene))
         return gene_seq + 'A'
     assert gene_seq[ind:(ind+len(guide_seq))]==guide_seq, "match not right"
     #new_mer = gene_seq[ind:(ind+len(guide_seq))+1] #looks correct, but is wrong, due to strand frame-of-reference
@@ -349,7 +351,7 @@ def concatenate_feature_sets(feature_sets, keys=None):
 
     if False:
         inputs.shape
-        for j in keys: print j + str(feature_sets[j].shape)
+        for j in keys: print(j + str(feature_sets[j].shape))
         import ipdb; ipdb.set_trace()
 
     #print "final size of inputs matrix is (%d, %d)" % inputs.shape
@@ -383,7 +385,7 @@ def spearmanr_nonan(x,y):
     r, p = st.spearmanr(x, y)
     if np.isnan(p):
         if len(np.unique(x))==1 or len(np.unique(y))==1:
-            print "WARNING: spearmanr is nan due to unique values, setting to 0"
+            print("WARNING: spearmanr is nan due to unique values, setting to 0")
             p = 0.0
             r = 0.0
         else:
@@ -435,7 +437,7 @@ def get_gene_sequence(gene_name):
     # records = Entrez.read(search)
 
     # if len(records['IdList']) > 1:
-    #     print "warning, multiple hits found for entrez gene search %s" % gene_name
+    #     print("warning, multiple hits found for entrez gene search %s" % gene_name)
 
     # elink = Entrez.read(Entrez.elink(dbfrom="gene", db='nucleotide', id=records['IdList'][0]))
     # nucl_id = elink[0]['LinkSetDb'][3]
@@ -446,7 +448,7 @@ def get_gene_sequence(gene_name):
     #         nucl_id = elink[0]['LinkSetDb'][0]['Link'][0]['Id']
     #         cut = True
     #     else:
-    #         print "sorry not enough information to return sequence"
+    #         print("sorry not enough information to return sequence")
     #         return None
     # else:
     #     nucl_id = nucl_id['Link'][0]['Id']
@@ -466,7 +468,7 @@ def target_genes_stats(genes=['HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3',
     for gene in genes:
         seq = get_gene_sequence(gene)
         if seq != None:
-            print '%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA'))
+            print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA')))
 
 
 def ranktrafo(data):
@@ -518,7 +520,7 @@ def get_ranks(y, thresh=0.8, prefix="", flip=False, col_name='score'):
     # y_quantized = pandas.DataFrame(data=pandas.qcut(y[col_name], 5, labels=np.arange(5.0))) # quantized vector
     y_quantized = y_threshold.copy()
     y_quantized.columns = [prefix + "quantized"]
-    
+
     return y_rank, y_rank_raw, y_threshold, y_quantized
 
 def get_data(data, y_names, organism="human", target_gene=None):
@@ -1004,7 +1006,7 @@ def plot_all_metrics(metrics, gene_names, all_learn_options, save, plots=None, b
                     plt.bar(ind+(i*width), metrics[method][metric], width, color=plt.cm.Paired(1.*i/len(metrics.keys())), label=method)
 
                 median_metric = np.median(metrics[method][metric])
-                print method, metric, median_metric
+                print(method, metric, median_metric)
                 assert not np.isnan(median_metric), "found nan for %s, %s" % (method, metric)
                 if metric not in boxplot_arrays.keys():
                     boxplot_arrays[metric] = np.array(metrics[method][metric])[:, None]
@@ -1061,7 +1063,7 @@ def load_results(directory, all_results, all_learn_options, model_filter=None, a
     if filelist ==[]:
         raise Exception("found no pickle files in %s" % directory)
     else:
-        print "found %d files in %s" % (len(filelist), directory)
+        print("found %d files in %s" % (len(filelist), directory))
 
     for results_file in filelist:
         if 'learn_options' in results_file:
@@ -1074,7 +1076,7 @@ def load_results(directory, all_results, all_learn_options, model_filter=None, a
                     if m in results_file:
                         in_filt = True
                 if not in_filt:
-                    print "%s not in model_filter" % (results_file)#, model_filter)
+                    print("%s not in model_filter" % (results_file))#, model_filter)
                     continue
             elif model_filter not in results_file:
                 continue
@@ -1094,7 +1096,7 @@ def load_results(directory, all_results, all_learn_options, model_filter=None, a
             else:
                 k_new = k
             assert k_new not in all_results.keys(), "found %s already" % k
-            print "adding key %s (from file %s)" % (k_new, os.path.split(results_file)[-1])
+            print("adding key %s (from file %s)" % (k_new, os.path.split(results_file)[-1]))
             all_results[k_new] = results[k]
             all_learn_options[k_new] = learn_options[k]
             num_added = num_added +1
@@ -1205,8 +1207,8 @@ def ensemble_cluster_results(directory=r'\\fusi1\crispr2\analysis\cluster\result
     # spearmans = []
     # for gene in ens_predictions.keys():
     #     spearmans.append(sp.stats.spearmanr(ens_predictions[gene], ens_truths[gene]['raw'])[0])
-    #     print gene, spearmans[-1]
-    # print "median: %.5f" % np.median(spearmans)
+    #     print(gene, spearmans[-1])
+    # print("median: %.5f" % np.median(spearmans))
 
     return all_results, all_learn_options
 
@@ -1245,13 +1247,13 @@ def plot_old_vs_new_feat(results, models, fontsize=20, filename=None, print_outp
         feat_AUC_se.append(np.std(metrics_feat['AUC']))
 
 
-    print "old features"
-    print "mean: " + str(base_spearman_means)
-    print "std: " + str(base_spearman_std)
+    print("old features")
+    print("mean: " + str(base_spearman_means))
+    print("std: " + str(base_spearman_std))
 
-    print "old + new features"
-    print "mean: " + str(feat_spearman_means)
-    print "std: " + str(feat_spearman_std)
+    print("old + new features")
+    print("mean: " + str(feat_spearman_means))
+    print("std: " + str(feat_spearman_std))
 
     plt.figure()
     ind = np.arange(len(models))
@@ -1322,7 +1324,7 @@ def remove_top_right_on_plot(ax=None):
         X, Y = combine_organisms()
         X.to_pickle('../data/X.pd') #sequence features (i.e. inputs to prediction)
         Y.to_pickle('../data/Y.pd') #cell-averaged ranks, plus more (i.e. possible targets for prediction)
-        print "done writing to file"
+        print("done writing to file")
     elif V =="2":
         # this is now all in predict.py
         pass
diff --git a/setup.py b/setup.py
index 7fcea42..a0a2bad 100755
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,11 @@
 # from Cython.Build import cythonize
 from setuptools import setup
+import sys
 
+if sys.version_info[0] >= 3:
+    requires = ['scipy', 'numpy', 'matplotlib', 'nose', 'scikit-learn', 'pandas', 'biopython']
+else:
+    requires = ['scipy', 'numpy', 'matplotlib<3.0', 'nose', 'scikit-learn>=0.17.1,<0.18', 'pandas', 'biopython']
 
 setup(name='Azimuth',
       version='2.0',
@@ -9,7 +14,7 @@
       description=("Machine Learning-Based Predictive Modelling of CRISPR/Cas9 guide efficiency"),
       packages=["azimuth", "azimuth.features", "azimuth.models", "azimuth.tests"],
       package_data={'azimuth': ['saved_models/*.*']},
-      install_requires=['scipy', 'numpy', 'matplotlib', 'nose', 'scikit-learn>=0.17.1,<0.18', 'pandas', 'biopython'],
+      install_requires=requires,
       license="BSD",
       # ext_modules=cythonize("ssk_cython.pyx"),
       )