Commit

final push!

kenminsoo committed Aug 19, 2023
1 parent a5d63d1 commit 224a55c
Showing 38 changed files with 10,408 additions and 5,820 deletions.
Binary file modified .DS_Store
1 change: 0 additions & 1 deletion .gitignore
@@ -12,7 +12,6 @@ SPRMT_merge_031323_seq.gtf
*.fasta
.Rdata
.Rhistory
-*.bt2
mirna_to_snorna/
mirna_to_snorna_mi/
piRNAbank_to_pirnadb/
Binary file modified __pycache__/conversion_tools.cpython-310.pyc
Binary file modified __pycache__/fragmentation.cpython-310.pyc
Binary file modified __pycache__/gtf_groundtruth.cpython-310.pyc
Binary file modified __pycache__/gtf_modifiers.cpython-310.pyc
5 changes: 3 additions & 2 deletions conversion_tools.py
@@ -350,7 +350,7 @@ def gtf_attribute_to_fasta(gtf, output, attribute, primary_key, pipeline = False
names[extracted_sequence].add(primary)
# generate a deduplicated gtf
else:
new.write(">" + primary + "\n" + extracted_sequence + "\n")
new.write(">" + primary.replace('"', "") + "\n" + extracted_sequence.replace('"', "") + "\n")
new_gtf.write(line)

names[extracted_sequence].add(primary)
@@ -370,6 +370,7 @@ def gtf_attribute_to_fasta(gtf, output, attribute, primary_key, pipeline = False

seq_index = attributes.index(attribute)
extracted_sequence = attributes[seq_index + 1]
+extracted_sequence = extracted_sequence.upper()

primary_index = attributes.index(primary_key)
primary = attributes[primary_index + 1]
@@ -378,7 +379,7 @@ def gtf_attribute_to_fasta(gtf, output, attribute, primary_key, pipeline = False
names[extracted_sequence].add(primary)
# generate a deduplicated gtf
else:
new.write(">" + primary + "\n" + extracted_sequence + "\n")
new.write(">" + primary.strip('"') + "\n" + extracted_sequence.strip('"') + "\n")

names[extracted_sequence].add(primary)

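Both hunks fix the same defect: attribute values copied straight out of a GTF keep their surrounding double quotes, which then leak into the FASTA headers and sequences this function writes. A minimal before/after sketch, with illustrative values:

```python
# GTF attribute values typically arrive quoted, e.g. '"SNORD3A"'.
primary = '"SNORD3A"'
extracted_sequence = '"acgtgca"'

print(">" + primary + "\n" + extracted_sequence)
# >"SNORD3A"      <- quotes leak into the record
# "acgtgca"

print(">" + primary.strip('"') + "\n" + extracted_sequence.strip('"').upper())
# >SNORD3A
# ACGTGCA
```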
161 changes: 161 additions & 0 deletions fragmentation.py
@@ -1,6 +1,7 @@
from basics import *
from MINTplates import *
import numpy as np
+from collections import defaultdict

# takes in a gtf with sequences

@@ -23,6 +24,8 @@ def create_lookup(gtf_file, seq_key, source_key,start, end, output, id_pre = "sd

seq = attributes[seq_index + 1]
seq = seq.replace('"', "")
seq = seq.strip(" ")
seq = seq.upper()

source_index = attributes.index(source_key)

@@ -131,4 +134,162 @@ def create_lookup(gtf_file, seq_key, source_key,start, end, output, id_pre = "sd

lookup_table["ID"] = lookup_table["sequence"].apply(lambda x: encode_sequence(x, id_pre))

lookup_table.to_csv(output, index = False)

def create_lookup_selective(gtf_file, seq_key, source_key,min_length, max_length, output, lookup_builder, id_pre = "sdr_snoRNAdb2023_", flank = 5):
lookup_dict = my_dictionary()
selector = defaultdict(set)

constructor_headers = ["sequence", "sources", "num", "5p_end", "n_5p", "avg_start", "std", "flanking5p", "all_same5p","flanking3p", "all_same3p"]
df_constructor = []

with open(lookup_builder, "r") as lookup_build:
for line in lookup_build:
splitted_line = line.split(sep = "\t")

the_number = splitted_line[2]

the_number = the_number.replace("M", "")

the_number = int(the_number)

loci = splitted_line[1]

loci = int(loci)

selector[splitted_line[0].replace('"', "")].add((loci, the_number))

with open(gtf_file, "r") as gtf, open(output, "w") as new:
for line in gtf:
sepped = separate_gtf_line(line)

main_cols = sepped[0]
attributes = sepped[1]

strand = main_cols[6]

seq_index = attributes.index(seq_key)

seq = attributes[seq_index + 1]
seq = seq.replace('"', "")
seq = seq.strip(" ")

source_index = attributes.index(source_key)

source_base = attributes[source_index + 1]
source_base = source_base.replace('"', "")

selector_key_in_gtf = source_base

source_base = source_base + "__" + strand

seq_length = len(seq)

# make it s.t. we extract each info
query_set_list = selector[selector_key_in_gtf]


for item in query_set_list:
start = item[0]
start = start - 1
length = item[1]
end = start + length

segment = seq[start:end]
segment = segment.strip(" ")
segment = segment.upper()

if len(segment) < min_length:
continue
elif len(segment) > max_length:
continue

source = str(start + 1) + ";" + str(end + 1) + "__" + source_base + "__.__" + str(start/seq_length)

p5_flank = seq[(start - flank):start]
p3_flank = seq[end:end + flank]

if segment in lookup_dict:

lookup_dict[segment][0].append(source)

lookup_dict[segment][1] = lookup_dict[segment][1] + 1

lookup_dict[segment][3].append(start/seq_length)

lookup_dict[segment][4].append(p5_flank)

lookup_dict[segment][6].append(p3_flank)

if start == 0:

lookup_dict[segment][2] = lookup_dict[segment][2] + 1

else:

if start == 0:
lookup_dict.add(segment, [[source], 1, 1, [start/seq_length], [p5_flank], 1, [p3_flank], 1])

else:
lookup_dict.add(segment, [[source], 1, 0, [start/seq_length], [p5_flank], 1, [p3_flank], 1])

for key in lookup_dict:

# Skip if there are more than 300 sources
# Likely represents sequence of low complexity
if lookup_dict[key][1] > 300:
continue

entry = []

entry.append(key)

entry.append(">".join(lookup_dict[key][0]))
entry.append(str(lookup_dict[key][1]))

avg = np.mean(lookup_dict[key][3])
std = np.std(lookup_dict[key][3])

if lookup_dict[key][2] > 0:
entry.append(True)
else:
entry.append(False)

entry.append(str(lookup_dict[key][2]))

entry.append(str(avg))
entry.append(str(std))

# check if all 5p flanks are the same
if all(element == lookup_dict[key][4][0] for element in lookup_dict[key][4]):
entry.append(max(lookup_dict[key][4]))
entry.append(str(1))
else:
entry.append(max(lookup_dict[key][4]))
entry.append(str(0))

# check if all 3p flanks are the same
if all(element == lookup_dict[key][6][0] for element in lookup_dict[key][6]):
entry.append(max(lookup_dict[key][6]))
entry.append(str(1))
else:
entry.append(max(lookup_dict[key][6]))
entry.append(str(0))

df_constructor.append(entry)

lookup_table = pd.DataFrame(df_constructor, columns = constructor_headers)

lookup_table["sequence"] = lookup_table["sequence"].str.upper()

lookup_table["flanking5p"] = lookup_table["flanking5p"].str.upper()

lookup_table["flanking3p"] = lookup_table["flanking3p"].str.upper()

lookup_table = lookup_table.sort_values(by = ["sequence"])

lookup_table = lookup_table.loc[~lookup_table["sequence"].str.contains("N"),:]

lookup_table["ID"] = lookup_table["sequence"].apply(lambda x: encode_sequence(x, id_pre))

lookup_table.to_csv(output, index = False)
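For reference, `create_lookup_selective` expects `lookup_builder` to be a tab-separated file whose first three columns are a source name, a 1-based start position, and a CIGAR-style match length such as `18M`. A minimal sketch of how one line becomes a `selector` entry (the values are illustrative, not from the repo):

```python
from collections import defaultdict

selector = defaultdict(set)

# one illustrative lookup_builder line: name, 1-based start, match length
line = 'SNORD3A\t4\t18M\n'
splitted_line = line.split(sep="\t")

loci = int(splitted_line[1])                         # 4
the_number = int(splitted_line[2].replace("M", ""))  # 18

selector[splitted_line[0].replace('"', "")].add((loci, the_number))
print(selector["SNORD3A"])  # {(4, 18)}
```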
19 changes: 19 additions & 0 deletions gtf_descriptors.py
@@ -6,6 +6,25 @@
# - count # of times value in column appears
# - count # of times an attribute appears

# count the possible fragments of length k..m across all features
def possible_fragments(gtf_file, m, k):
    total = 0

    with open(gtf_file, "r") as gtf:
        for line in gtf:
            sep = separate_gtf_line(line)
            attributes = sep[1]
            cols = sep[0]

            length = int(cols[4]) - int(cols[3]) + 1

            # a feature of this length yields (length - l + 1) fragments of
            # length l; summing over l = k..m gives the closed form below
            total_frags = (m - k + 1) * (length + 1) - (m * (m + 1) - k * (k - 1)) // 2

            total += total_frags

    return total
# Input: a gtf file. If a sequence attribute exists, compare it against the extracted sequence (via bedtools) to catch annotation errors
def compare_sequence(gtf_file, output_name, sequence_feature1, sequence_feature2):
# store data in here
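A quick brute-force check of the closed form used in `possible_fragments` (the numbers are illustrative):

```python
# Count substrings with length between k and m in a feature of a given length,
# by brute force and by the closed form used above.
def brute_force(length, m, k):
    return sum(length - l + 1 for l in range(k, m + 1))

def closed_form(length, m, k):
    return (m - k + 1) * (length + 1) - (m * (m + 1) - k * (k - 1)) // 2

# e.g. a 70 nt feature, fragment lengths 15-50 nt:
assert brute_force(70, 50, 15) == closed_form(70, 50, 15) == 1386
```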
2 changes: 1 addition & 1 deletion gtf_groundtruth.py
@@ -293,7 +293,7 @@ def bowtie_align(new_dir, num_mismatch = 2, input_fasta = "sequences.fasta ", p

def bowtie_align_pipeline(index_name, working_dir, fasta):
    os.system('cd ' + working_dir + '; \
-bowtie -f -x ' + index_name + ' ' + fasta +' -k 101 --best --strata -v 0 -S lookup_filtered.sam --reorder')
+bowtie -f -x ' + index_name + ' ' + fasta +' -k 101 --best --strata -v 0 -S lookup_filtered.sam --reorder --norc')


# Parse SAM File
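The only functional change here is the added `--norc` flag, which tells bowtie not to align reads against the reverse-complement of the reference — appropriate once the lookup fragments are strand-resolved. A sketch of an equivalent call via `subprocess` rather than `os.system` (an illustrative alternative, not the repo's code):

```python
import subprocess

def bowtie_align_pipeline_sketch(index_name, working_dir, fasta):
    # --norc skips reverse-complement alignments; fragments are already stranded
    cmd = ["bowtie", "-f", "-x", index_name, fasta,
           "-k", "101", "--best", "--strata", "-v", "0",
           "-S", "lookup_filtered.sam", "--reorder", "--norc"]
    subprocess.run(cmd, cwd=working_dir, check=True)
```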
21 changes: 21 additions & 0 deletions gtf_modifiers.py
@@ -69,6 +69,27 @@ def add_sequence_gtf(gtf_file, ref_genome, output_name, attribute_name = "bed_se

iter += 2

def add_sequence_gtf_trna(gtf_file, ref_genome, output_name, attribute_name = "bed_sequence"):

    seq_list = add_sequence_fast(gtf_file, ref_genome)

    with open(gtf_file, "r") as bed, open(output_name, "w") as new:

        iter = 1

        for line in bed:
            extracted_sequence = seq_list[iter]
            extracted_sequence = extracted_sequence.upper()
            extracted_sequence = extracted_sequence + "CCA"

            temp_line = line.strip()

            new.write(temp_line + '; ' + attribute_name + ' "' + extracted_sequence + '"\n')

            iter += 2

# we define a new file type, bedseq format
def add_sequence_bed(bed_file, ref_genome, output_name):

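Context for the new `add_sequence_gtf_trna` helper: mature tRNAs carry a non-templated CCA tail added to their 3′ end after transcription, so reads from mature tRNAs will not match the plain genomic sequence unless CCA is appended. A minimal illustration (the sequence below is made up):

```python
# Illustrative genomic tRNA fragment; the mature transcript gains a 3' CCA tail.
genomic = "gcattggtggttcagtggtagaattctcgcc"
mature = genomic.upper() + "CCA"  # mirrors the helper's normalization
print(mature.endswith("CCA"))  # True
```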
Binary file modified install/.DS_Store
3 changes: 2 additions & 1 deletion install/requirements_pip.txt
@@ -21,4 +21,5 @@ pybedtools==0.9.0
fire==0.5.0
biopython==1.81
matplotlib==3.7.1
-seaborn==0.12.2
+seaborn==0.12.2
+pymsaviz==0.4.0
8 changes: 2 additions & 6 deletions install/sRNA_frag_config_setup_template.yaml
@@ -13,8 +13,6 @@ module_options:
prefix: hg38

# Remove UMIs, Default is QIAgen UMI with regex removal option
-# for our cell line samples, UMIs and Adapters are already removed.
-# We add adapter trimming time from 9 core run.
umi_removal:
bool: False
regex: ".+(?P<discard_1>AACTGTAGGCACCATCAAT){s<=2}(?P<umi_1>.{12}).+"
@@ -24,7 +22,7 @@ module_options:
bool: False
sequence: AGATCGGAAGAG

-# An annotation file to generate fragments from => Should be filtered with sequences
+# An annotation file to generate fragments from => Should be filtered and have sequences
annotation_options:
location: "/Volumes/Extreme_SSD/h_sapiens/h.sapiens_snoRNA_final.gtf"

@@ -49,12 +47,10 @@ module_options:
bool: True

# P2: Second Processing Module. Dependent on P1 and S1
-# Ensure that an index location is given in the P1 module before running P2
P2:
bool: True

-# from ITAS publication
-# merged with snoDB
+# Annotation file for outside maps
annotation_file: ""

# Find out of space maps
2 changes: 1 addition & 1 deletion readme.md
@@ -55,7 +55,7 @@ This creates a conda environment that has all the required package. This makes i
Please check the `install/test_results/out/tables` directory after the install. There should be 7 tables.

#### Note about R!
-Sometimes, your package folder will not be in R lib path. If this occurs:
+Sometimes, your package folder will not be in R lib path. If this occurs, enter this:
```
cd install
cwd=$(pwd)