Code move

Goosang-Yu · Mar 19, 2024 · 1f24d70 · 1f24d70 · ntfargo · Apr 20, 2024
1 parent 97f1418
commit 1f24d70
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 58 deletions.
diff --git a/genet/analysis/__init__.py b/genet/analysis/__init__.py
@@ -1,8 +1,8 @@
 from genet.analysis.functional import(
     loadseq,
     SortByBarcodes,
-
 )
+
 from genet.analysis.SGE_analysis import *
 from genet.analysis.UMItools import *
 from genet.analysis._dev_UMI import *
diff --git a/genet/analysis/functional.py b/genet/analysis/functional.py
@@ -189,12 +189,9 @@ def sort_barcode(list_sParameters):
         SeqIO.write(seq_rec, '%s/%s.%s' % (temp_dir, barcode, output_format), output_format)
 
 # def END: sort_barcode
-
-
+
 def combine_files(list_combine_param):
-    """Combine files by name
-
-    """
+    """Combine files by name"""
 
     # parameters
     splits_dir    = list_combine_param[0]
@@ -278,17 +275,32 @@ def sort_barcode_and_combine(list_sParameters):
     if silence == False: print('Make temp sorted %s file: %s' % (output_format, subsplit_name))
 
     for barcode, seq_rec in dict_barcode.items():
-        SeqIO.write(seq_rec, '%s/%s.%s' % (temp_dir, barcode, output_format), output_format)
-
-# def END: sort_barcode
-
-
-def loadseq():
-    '''
-    테스트용으로 만든 코드
-    
-    '''
-
-    print('For testing')
-
-
+        SeqIO.write(seq_rec, '%s/%s.%s' % (temp_dir, barcode, output_format), output_format) 
+
+# --- Codon usage analysis (temporary location; moved here from genet/predict/PredUtils.py) ---
+
+def calculate_codon_composition(seq):
+    """Return the relative frequency of every in-frame codon in ``seq``.
+
+    Codons are read from index 0 in steps of three; a trailing partial
+    codon is ignored. Each value is that codon's count divided by the
+    number of codons read; a sequence shorter than one codon gives ``{}``.
+    """
+    codons = [seq[pos:pos + 3] for pos in range(0, len(seq) - 2, 3)]
+    tally = {}
+    for triplet in codons:
+        tally[triplet] = tally.get(triplet, 0) + 1
+    return {triplet: hits / len(codons) for triplet, hits in tally.items()}
+
+def find_orfs(seq):
+    """Return (start, end, '+') for each forward-strand ATG, frame by frame.
+
+    ``end`` indexes the first in-frame stop codon (excluded), else ``len(seq)``.
+    """
+    stop_codons, seq_len, orfs = ('TAA', 'TAG', 'TGA'), len(seq), []
+    for frame in range(3):
+        for start in range(frame, seq_len, 3):
+            if seq[start:start + 3] == 'ATG':
+                stops = (i for i in range(start + 3, seq_len, 3) if seq[i:i + 3] in stop_codons)
+                orfs.append((start, next(stops, seq_len), '+'))  # no stop: cap at seq end
+    return orfs
diff --git a/genet/predict/PredUtils.py b/genet/predict/PredUtils.py
@@ -3,8 +3,7 @@
 def preprocess_masked_seq(data, seq_length):
     """더 이상 쓰이지 않는 함수. 테스트 해보고 문제 없으면 없앨 예정.
 
-    """    
-
+    """  
     seq_onehot = np.zeros((len(data), 1, seq_length, 4), dtype=float)
 
     for l in range(len(data)):
@@ -27,7 +26,7 @@ def preprocess_masked_seq(data, seq_length):
 
 def one_hot_encode(seq):
     mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "X": 4}
-    map_seq = np.array([mapping[i] for i in seq])
+    map_seq = [mapping[i] for i in seq]  # row indices into np.eye(5); "X" -> all-zero row once the last column is dropped
     arr_seq = np.eye(5)[map_seq]
     return np.delete(arr_seq, -1, axis=1)
 
@@ -36,43 +35,19 @@ def preprocess_seq(data, length:int):
     return np.stack(encoded_seq, axis=0).reshape(len(data), 1, length, 4)
 
 def reverse_complement(sSeq):
-    """Biopython의 reverse_complement 또는 reverse_complement_rna로 모두 대체함. 
-    더 이상 쓰이지 않는 함수.
-    테스트 해보고 문제 없으면 없앨 예정.
-    """ 
-    dict_sBases = {k: v for k, v in zip('ACGTNU.acgt', 'TGCANU.tgca')}
-    return sSeq.translate(str.maketrans(dict_sBases))[::-1]
-
-    # def END: reverse_complement
-
-def calculate_codon_composition(seq):
-    """Calculates the frequency of each codon in a DNA sequence."""
-    codon_counts = {}
-    for i in range(0, len(seq) - 2, 3):
-        codon = seq[i:i+3]
-        codon_counts[codon] = codon_counts.get(codon, 0) + 1
+    """Return the reverse complement of ``sSeq``.
+
+    Deprecated: superseded by Biopython's reverse_complement /
+    reverse_complement_rna; to be removed once testing shows no issues.
+    """
+    dict_sBases = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'U': 'U', 'n': 'n',
+                   '.': '.', '*': '*', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}
+    # Complement base by base (an unmapped symbol raises KeyError), then reverse.
+    list_sComp = [dict_sBases[sBase] for sBase in sSeq]
+    return ''.join(list_sComp)[::-1]
 
-    total_count = sum(codon_counts.values())
-    for codon, count in codon_counts.items():
-        codon_counts[codon] = count / total_count
-
-    return codon_counts
-
-def find_orfs(seq):
-    """Identifies potential open reading frames (ORFs) in a DNA sequence."""
-    orfs = []
-    for frame in range(3):
-        for start in range(frame, len(seq), 3):
-            codon = seq[start:start + 3]
-            if codon == 'ATG':  # Potential start codon
-                end = start + 3
-                while end < len(seq) and seq[end:end + 3] not in ['TAA', 'TAG', 'TGA']:
-                    end += 3
-                orfs.append((start, end, '+'))
-    # Also consider ORFs on the reverse complementary strand of the DNA
-    return orfs
-
 def padding(arr, max_length):
+    """ Padding sequences to the same length """
     str_arr = []
     c = arr[0]
     if max_length > len(c):