Skip to content

Commit

Permalink
Code move
Browse files Browse the repository at this point in the history
  • Loading branch information
ntfargo committed Mar 19, 2024
1 parent 97f1418 commit 1f24d70
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 58 deletions.
2 changes: 1 addition & 1 deletion genet/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from genet.analysis.functional import(
loadseq,
SortByBarcodes,

)

from genet.analysis.SGE_analysis import *
from genet.analysis.UMItools import *
from genet.analysis._dev_UMI import *
50 changes: 31 additions & 19 deletions genet/analysis/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,9 @@ def sort_barcode(list_sParameters):
SeqIO.write(seq_rec, '%s/%s.%s' % (temp_dir, barcode, output_format), output_format)

# def END: sort_barcode



def combine_files(list_combine_param):
"""Combine files by name
"""
"""Combine files by name"""

# parameters
splits_dir = list_combine_param[0]
Expand Down Expand Up @@ -278,17 +275,32 @@ def sort_barcode_and_combine(list_sParameters):
if silence == False: print('Make temp sorted %s file: %s' % (output_format, subsplit_name))

for barcode, seq_rec in dict_barcode.items():
SeqIO.write(seq_rec, '%s/%s.%s' % (temp_dir, barcode, output_format), output_format)

# def END: sort_barcode


def loadseq():
    '''
    Placeholder function written for testing purposes.

    Takes no arguments and only prints a fixed message.
    '''

    print('For testing')


SeqIO.write(seq_rec, '%s/%s.%s' % (temp_dir, barcode, output_format), output_format)

""" Codon usage analysis "temporary" """

def calculate_codon_composition(seq):
    """Calculate the relative frequency of each codon in a DNA sequence.

    The sequence is read from index 0 as consecutive, non-overlapping
    triplets; a trailing partial codon (1-2 leftover bases) is ignored.

    Args:
        seq: DNA sequence to analyse. Codons are compared exactly as they
            appear, so upper- and lower-case codons are counted separately.

    Returns:
        dict mapping each observed codon to its share of all complete codons,
        in order of first appearance. Empty when ``seq`` holds no complete
        codon.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    codon_counts = Counter(seq[i:i + 3] for i in range(0, len(seq) - 2, 3))
    total_count = sum(codon_counts.values())

    # Build a fresh mapping instead of overwriting the counts during iteration.
    return {codon: count / total_count for codon, count in codon_counts.items()}

def find_orfs(seq):
    """Identify potential open reading frames (ORFs) in a DNA sequence.

    All three forward reading frames are scanned. Every in-frame ``ATG``
    opens an ORF, so ORFs nested in a longer one are reported as well.

    Args:
        seq: DNA sequence; codons are matched against upper-case literals.

    Returns:
        list of ``(start, end, '+')`` tuples ordered by frame, then position.
        ``start`` is the index of the ``A`` of the start codon. ``end`` is the
        index of the first base of the first in-frame stop codon
        (TAA/TAG/TGA), or the index just past the last complete in-frame
        codon when no stop codon is found. ``end`` never exceeds ``len(seq)``.
    """
    stop_codons = ('TAA', 'TAG', 'TGA')
    seq_len = len(seq)
    orfs = []
    for frame in range(3):
        # Only positions with a complete codon can hold a start codon.
        for start in range(frame, seq_len - 2, 3):
            if seq[start:start + 3] != 'ATG':
                continue
            end = start + 3
            # Advance over complete codons only, so a trailing partial codon
            # cannot push `end` beyond the end of the sequence.
            while end + 3 <= seq_len and seq[end:end + 3] not in stop_codons:
                end += 3
            orfs.append((start, end, '+'))
    return orfs
51 changes: 13 additions & 38 deletions genet/predict/PredUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
def preprocess_masked_seq(data, seq_length):
"""더 이상 쓰이지 않는 함수. 테스트 해보고 문제 없으면 없앨 예정.
"""

"""
seq_onehot = np.zeros((len(data), 1, seq_length, 4), dtype=float)

for l in range(len(data)):
Expand All @@ -27,7 +26,7 @@ def preprocess_masked_seq(data, seq_length):

def one_hot_encode(seq):
    """One-hot encode a nucleotide sequence over the channels A, C, G, T.

    Args:
        seq: iterable of the characters "A", "C", "G", "T" or "X".

    Returns:
        Float array of shape ``(len(seq), 4)``. "X" has no channel of its
        own and is encoded as an all-zero row.

    Raises:
        KeyError: if ``seq`` contains a character outside the mapping.
    """
    mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "X": 4}
    # A plain list of indices is enough for fancy indexing; no intermediate
    # array needs to be built first.
    map_seq = [mapping[i] for i in seq]
    arr_seq = np.eye(5)[map_seq]
    # Drop the fifth column (the "X" channel) so that "X" rows are all zeros.
    return np.delete(arr_seq, -1, axis=1)

Expand All @@ -36,43 +35,19 @@ def preprocess_seq(data, length:int):
return np.stack(encoded_seq, axis=0).reshape(len(data), 1, length, 4)

def reverse_complement(sSeq):
    """Return the reverse complement of a nucleotide string.

    No longer in use: fully replaced by Biopython's ``reverse_complement`` /
    ``reverse_complement_rna``. Planned for removal once testing shows no
    problems.

    Args:
        sSeq: nucleotide sequence as ``str``.

    Returns:
        str: the complemented sequence in reverse order. A/C/G/T are swapped
        with their partners in either case; ``N``, ``U`` and ``.`` map to
        themselves, and any other character is passed through unchanged.
    """
    # NOTE(review): 'U' is kept as 'U' rather than complemented to 'A';
    # this mirrors the existing mapping and is left unchanged here.
    complement_table = str.maketrans('ACGTNU.acgt', 'TGCANU.tgca')
    return sSeq.translate(complement_table)[::-1]

# def END: reverse_complement

def calculate_codon_composition(seq):
"""Calculates the frequency of each codon in a DNA sequence."""
codon_counts = {}
for i in range(0, len(seq) - 2, 3):
codon = seq[i:i+3]
codon_counts[codon] = codon_counts.get(codon, 0) + 1
"""
Replace with Biopython's reverse_complement or reverse_complement_rna.
A function no longer in use.
Plan to remove it if no issues are found upon testing.
"""
dict_sBases = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'U': 'U', 'n': '',
'.': '.', '*': '*', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}
list_sSeq = list(sSeq) # Turns the sequence in to a gigantic list
list_sSeq = [dict_sBases[sBase] for sBase in list_sSeq]
return ''.join(list_sSeq)[::-1]

total_count = sum(codon_counts.values())
for codon, count in codon_counts.items():
codon_counts[codon] = count / total_count

return codon_counts

def find_orfs(seq):
    """Identify potential open reading frames (ORFs) in a DNA sequence.

    All three forward reading frames are scanned. Every in-frame ``ATG``
    opens an ORF, so ORFs nested in a longer one are reported as well.

    Args:
        seq: DNA sequence; codons are matched against upper-case literals.

    Returns:
        list of ``(start, end, '+')`` tuples ordered by frame, then position.
        ``start`` is the index of the ``A`` of the start codon. ``end`` is the
        index of the first base of the first in-frame stop codon
        (TAA/TAG/TGA), or the index just past the last complete in-frame
        codon when no stop codon is found. ``end`` never exceeds ``len(seq)``.
    """
    stop_codons = ('TAA', 'TAG', 'TGA')
    seq_len = len(seq)
    orfs = []
    for frame in range(3):
        # Only positions with a complete codon can hold a start codon.
        for start in range(frame, seq_len - 2, 3):
            if seq[start:start + 3] != 'ATG':  # Not a potential start codon
                continue
            end = start + 3
            # Advance over complete codons only, so a trailing partial codon
            # cannot push `end` beyond the end of the sequence.
            while end + 3 <= seq_len and seq[end:end + 3] not in stop_codons:
                end += 3
            orfs.append((start, end, '+'))
    # TODO: also consider ORFs on the reverse complementary strand of the DNA
    return orfs

def padding(arr, max_length):
""" Padding sequences to the same length """
str_arr = []
c = arr[0]
if max_length > len(c):
Expand Down

2 comments on commit 1f24d70

@ntfargo
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Goosang-Yu Please review the code; if it looks good, I will open a pull request to the main tree. 🙏

@Goosang-Yu
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks good, thanks @ntfargo !

Please sign in to comment.