Commit

final push!

kenminsoo committed Aug 19, 2023
1 parent a5d63d1 commit 224a55c
Showing 38 changed files with 10,408 additions and 5,820 deletions.
Binary file modified .DS_Store
1 change: 0 additions & 1 deletion .gitignore
@@ -12,7 +12,6 @@ SPRMT_merge_031323_seq.gtf
*.fasta
.Rdata
.Rhistory
-*.bt2
mirna_to_snorna/
mirna_to_snorna_mi/
piRNAbank_to_pirnadb/
Binary file modified __pycache__/conversion_tools.cpython-310.pyc
Binary file modified __pycache__/fragmentation.cpython-310.pyc
Binary file modified __pycache__/gtf_groundtruth.cpython-310.pyc
Binary file modified __pycache__/gtf_modifiers.cpython-310.pyc
5 changes: 3 additions & 2 deletions conversion_tools.py
@@ -350,7 +350,7 @@ def gtf_attribute_to_fasta(gtf, output, attribute, primary_key, pipeline = False
names[extracted_sequence].add(primary)
# generate a deduplicated gtf
else:
new.write(">" + primary + "\n" + extracted_sequence + "\n")
new.write(">" + primary.replace('"', "") + "\n" + extracted_sequence.replace('"', "") + "\n")
new_gtf.write(line)

names[extracted_sequence].add(primary)
@@ -370,6 +370,7 @@ def gtf_attribute_to_fasta(gtf, output, attribute, primary_key, pipeline = False

seq_index = attributes.index(attribute)
extracted_sequence = attributes[seq_index + 1]
+extracted_sequence = extracted_sequence.upper()

primary_index = attributes.index(primary_key)
primary = attributes[primary_index + 1]
@@ -378,7 +379,7 @@ def gtf_attribute_to_fasta(gtf, output, attribute, primary_key, pipeline = False
names[extracted_sequence].add(primary)
# generate a deduplicated gtf
else:
new.write(">" + primary + "\n" + extracted_sequence + "\n")
new.write(">" + primary.strip('"') + "\n" + extracted_sequence.strip('"') + "\n")

names[extracted_sequence].add(primary)

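Both hunks fix the same defect: attribute values copied straight out of a GTF keep their surrounding double quotes, which then leak into the FASTA headers and sequences this function writes. A minimal before/after sketch, with illustrative values:

```python
# GTF attribute values typically arrive quoted, e.g. '"SNORD3A"'.
primary = '"SNORD3A"'
extracted_sequence = '"acgtgca"'

print(">" + primary + "\n" + extracted_sequence)
# >"SNORD3A"      <- quotes leak into the record
# "acgtgca"

print(">" + primary.strip('"') + "\n" + extracted_sequence.strip('"').upper())
# >SNORD3A
# ACGTGCA
```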
161 changes: 161 additions & 0 deletions fragmentation.py
@@ -1,6 +1,7 @@
from basics import *
from MINTplates import *
import numpy as np
+from collections import defaultdict

# takes in a gtf with sequences

@@ -23,6 +24,8 @@ def create_lookup(gtf_file, seq_key, source_key,start, end, output, id_pre = "sd

seq = attributes[seq_index + 1]
seq = seq.replace('"', "")
seq = seq.strip(" ")
seq = seq.upper()

source_index = attributes.index(source_key)

@@ -131,4 +134,162 @@ def create_lookup(gtf_file, seq_key, source_key,start, end, output, id_pre = "sd

lookup_table["ID"] = lookup_table["sequence"].apply(lambda x: encode_sequence(x, id_pre))

lookup_table.to_csv(output, index = False)

def create_lookup_selective(gtf_file, seq_key, source_key,min_length, max_length, output, lookup_builder, id_pre = "sdr_snoRNAdb2023_", flank = 5):
lookup_dict = my_dictionary()
selector = defaultdict(set)

constructor_headers = ["sequence", "sources", "num", "5p_end", "n_5p", "avg_start", "std", "flanking5p", "all_same5p","flanking3p", "all_same3p"]
df_constructor = []

with open(lookup_builder, "r") as lookup_build:
for line in lookup_build:
splitted_line = line.split(sep = "\t")

the_number = splitted_line[2]

the_number = the_number.replace("M", "")

the_number = int(the_number)

loci = splitted_line[1]

loci = int(loci)

selector[splitted_line[0].replace('"', "")].add((loci, the_number))

with open(gtf_file, "r") as gtf, open(output, "w") as new:
for line in gtf:
sepped = separate_gtf_line(line)

main_cols = sepped[0]
attributes = sepped[1]

strand = main_cols[6]

seq_index = attributes.index(seq_key)

seq = attributes[seq_index + 1]
seq = seq.replace('"', "")
seq = seq.strip(" ")

source_index = attributes.index(source_key)

source_base = attributes[source_index + 1]
source_base = source_base.replace('"', "")

selector_key_in_gtf = source_base

source_base = source_base + "__" + strand

seq_length = len(seq)

# make it s.t. we extract each info
query_set_list = selector[selector_key_in_gtf]


for item in query_set_list:
start = item[0]
start = start - 1
length = item[1]
end = start + length

segment = seq[start:end]
segment = segment.strip(" ")
segment = segment.upper()

if len(segment) < min_length:
continue
elif len(segment) > max_length:
continue

source = str(start + 1) + ";" + str(end + 1) + "__" + source_base + "__.__" + str(start/seq_length)

p5_flank = seq[(start - flank):start]
p3_flank = seq[end:end + flank]

if segment in lookup_dict:

lookup_dict[segment][0].append(source)

lookup_dict[segment][1] = lookup_dict[segment][1] + 1

lookup_dict[segment][3].append(start/seq_length)

lookup_dict[segment][4].append(p5_flank)

lookup_dict[segment][6].append(p3_flank)

if start == 0:

lookup_dict[segment][2] = lookup_dict[segment][2] + 1

else:

if start == 0:
lookup_dict.add(segment, [[source], 1, 1, [start/seq_length], [p5_flank], 1, [p3_flank], 1])

else:
lookup_dict.add(segment, [[source], 1, 0, [start/seq_length], [p5_flank], 1, [p3_flank], 1])

for key in lookup_dict:

# Skip if there are more than 300 sources
# Likely represents sequence of low complexity
if lookup_dict[key][1] > 300:
continue

entry = []

entry.append(key)

entry.append(">".join(lookup_dict[key][0]))
entry.append(str(lookup_dict[key][1]))

avg = np.mean(lookup_dict[key][3])
std = np.std(lookup_dict[key][3])

if lookup_dict[key][2] > 0:
entry.append(True)
else:
entry.append(False)

entry.append(str(lookup_dict[key][2]))

entry.append(str(avg))
entry.append(str(std))

# check if all 5p flanks are the same
if all(element == lookup_dict[key][4][0] for element in lookup_dict[key][4]):
entry.append(max(lookup_dict[key][4]))
entry.append(str(1))
else:
entry.append(max(lookup_dict[key][4]))
entry.append(str(0))

# check if all 3p flanks are the same
if all(element == lookup_dict[key][6][0] for element in lookup_dict[key][6]):
entry.append(max(lookup_dict[key][6]))
entry.append(str(1))
else:
entry.append(max(lookup_dict[key][6]))
entry.append(str(0))

df_constructor.append(entry)

lookup_table = pd.DataFrame(df_constructor, columns = constructor_headers)

lookup_table["sequence"] = lookup_table["sequence"].str.upper()

lookup_table["flanking5p"] = lookup_table["flanking5p"].str.upper()

lookup_table["flanking3p"] = lookup_table["flanking3p"].str.upper()

lookup_table = lookup_table.sort_values(by = ["sequence"])

lookup_table = lookup_table.loc[~lookup_table["sequence"].str.contains("N"),:]

lookup_table["ID"] = lookup_table["sequence"].apply(lambda x: encode_sequence(x, id_pre))

lookup_table.to_csv(output, index = False)
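For reference, `create_lookup_selective` expects `lookup_builder` to be a tab-separated file whose first three columns are a source name, a 1-based start position, and a CIGAR-style match length such as `18M`. A minimal sketch of how one line becomes a `selector` entry (the values are illustrative, not from the repo):

```python
from collections import defaultdict

selector = defaultdict(set)

# one illustrative lookup_builder line: name, 1-based start, match length
line = 'SNORD3A\t4\t18M\n'
splitted_line = line.split(sep="\t")

loci = int(splitted_line[1])                         # 4
the_number = int(splitted_line[2].replace("M", ""))  # 18

selector[splitted_line[0].replace('"', "")].add((loci, the_number))
print(selector["SNORD3A"])  # {(4, 18)}
```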
19 changes: 19 additions & 0 deletions gtf_descriptors.py
@@ -6,6 +6,25 @@
# - count # of times value in column appears
# - count # of times an attribute appears

# count the possible fragments of length k..m across all features
def possible_fragments(gtf_file, m, k):
    total = 0

    with open(gtf_file, "r") as gtf:
        for line in gtf:
            sep = separate_gtf_line(line)
            attributes = sep[1]
            cols = sep[0]

            length = int(cols[4]) - int(cols[3]) + 1

            # a feature of this length yields (length - l + 1) fragments of
            # length l; summing over l = k..m gives the closed form below
            total_frags = (m - k + 1) * (length + 1) - (m * (m + 1) - k * (k - 1)) // 2

            total += total_frags

    return total
# Input: a gtf file. If a sequence attribute exists, compare it against the extracted sequence (via bedtools) to catch annotation errors
def compare_sequence(gtf_file, output_name, sequence_feature1, sequence_feature2):
# store data in here
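A quick brute-force check of the closed form used in `possible_fragments` (the numbers are illustrative):

```python
# Count substrings with length between k and m in a feature of a given length,
# by brute force and by the closed form used above.
def brute_force(length, m, k):
    return sum(length - l + 1 for l in range(k, m + 1))

def closed_form(length, m, k):
    return (m - k + 1) * (length + 1) - (m * (m + 1) - k * (k - 1)) // 2

# e.g. a 70 nt feature, fragment lengths 15-50 nt:
assert brute_force(70, 50, 15) == closed_form(70, 50, 15) == 1386
```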
2 changes: 1 addition & 1 deletion gtf_groundtruth.py
@@ -293,7 +293,7 @@ def bowtie_align(new_dir, num_mismatch = 2, input_fasta = "sequences.fasta ", p

def bowtie_align_pipeline(index_name, working_dir, fasta):
    os.system('cd ' + working_dir + '; \
-bowtie -f -x ' + index_name + ' ' + fasta +' -k 101 --best --strata -v 0 -S lookup_filtered.sam --reorder')
+bowtie -f -x ' + index_name + ' ' + fasta +' -k 101 --best --strata -v 0 -S lookup_filtered.sam --reorder --norc')


# Parse SAM File
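The only functional change here is the added `--norc` flag, which tells bowtie not to align reads against the reverse-complement of the reference — appropriate once the lookup fragments are strand-resolved. A sketch of an equivalent call via `subprocess` rather than `os.system` (an illustrative alternative, not the repo's code):

```python
import subprocess

def bowtie_align_pipeline_sketch(index_name, working_dir, fasta):
    # --norc skips reverse-complement alignments; fragments are already stranded
    cmd = ["bowtie", "-f", "-x", index_name, fasta,
           "-k", "101", "--best", "--strata", "-v", "0",
           "-S", "lookup_filtered.sam", "--reorder", "--norc"]
    subprocess.run(cmd, cwd=working_dir, check=True)
```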
21 changes: 21 additions & 0 deletions gtf_modifiers.py
@@ -69,6 +69,27 @@ def add_sequence_gtf(gtf_file, ref_genome, output_name, attribute_name = "bed_se

iter += 2

def add_sequence_gtf_trna(gtf_file, ref_genome, output_name, attribute_name = "bed_sequence"):

    seq_list = add_sequence_fast(gtf_file, ref_genome)

    with open(gtf_file, "r") as bed, open(output_name, "w") as new:

        iter = 1

        for line in bed:
            extracted_sequence = seq_list[iter]
            extracted_sequence = extracted_sequence.upper()
            extracted_sequence = extracted_sequence + "CCA"

            temp_line = line.strip()

            new.write(temp_line + '; ' + attribute_name + ' "' + extracted_sequence + '"\n')

            iter += 2

# we define a new file type, bedseq format
def add_sequence_bed(bed_file, ref_genome, output_name):

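Context for the new `add_sequence_gtf_trna` helper: mature tRNAs carry a non-templated CCA tail added to their 3′ end after transcription, so reads from mature tRNAs will not match the plain genomic sequence unless CCA is appended. A minimal illustration (the sequence below is made up):

```python
# Illustrative genomic tRNA fragment; the mature transcript gains a 3' CCA tail.
genomic = "gcattggtggttcagtggtagaattctcgcc"
mature = genomic.upper() + "CCA"  # mirrors the helper's normalization
print(mature.endswith("CCA"))  # True
```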
Binary file modified install/.DS_Store
3 changes: 2 additions & 1 deletion install/requirements_pip.txt
@@ -21,4 +21,5 @@ pybedtools==0.9.0
fire==0.5.0
biopython==1.81
matplotlib==3.7.1
-seaborn==0.12.2
+seaborn==0.12.2
+pymsaviz==0.4.0
8 changes: 2 additions & 6 deletions install/sRNA_frag_config_setup_template.yaml
@@ -13,8 +13,6 @@ module_options:
prefix: hg38

# Remove UMIs, Default is QIAgen UMI with regex removal option
-# for our cell line samples, UMIs and Adapters are already removed.
-# We add adapter trimming time from 9 core run.
umi_removal:
bool: False
regex: ".+(?P<discard_1>AACTGTAGGCACCATCAAT){s<=2}(?P<umi_1>.{12}).+"
@@ -24,7 +22,7 @@ module_options:
bool: False
sequence: AGATCGGAAGAG

-# An annotation file to generate fragments from => Should be filtered with sequences
+# An annotation file to generate fragments from => Should be filtered and have sequences
annotation_options:
location: "/Volumes/Extreme_SSD/h_sapiens/h.sapiens_snoRNA_final.gtf"

@@ -49,12 +47,10 @@ module_options:
bool: True

# P2: Second Processing Module. Dependent on P1 and S1
-# Ensure that an index location is given in the P1 module before running P2
P2:
bool: True

-# from ITAS publication
-# merged with snoDB
+# Annotation file for outside maps
annotation_file: ""

# Find out of space maps
2 changes: 1 addition & 1 deletion readme.md
@@ -55,7 +55,7 @@ This creates a conda environment that has all the required package. This makes i
Please check the `install/test_results/out/tables` directory after the install. There should be 7 tables.

#### Note about R!
-Sometimes, your package folder will not be in R lib path. If this occurs:
+Sometimes, your package folder will not be in R lib path. If this occurs, enter this:
```
cd install
cwd=$(pwd)