1
1
import sys
2
2
import subprocess
3
- import re
4
3
from Bio import SeqIO
5
4
6
5
"""
7
6
Replaces gene sequences set in config:fusion_genes
8
7
in fasta file with Ns.
9
8
9
+ TODO: instead of the whole gene sequence, mask only exonic regions
10
+
10
11
Reason: plasmid expressing Dam fusion
11
12
genes can be methylated at very high levels
12
13
"""
13
14
14
15
# Load Snakemake variables
15
16
gtf = snakemake .input ["gtf" ]
16
17
genes2mask = snakemake .params ["g2m" ]
17
- genome = snakemake .params ["genome " ]
18
+ feature2mask = snakemake .params ["f2m " ]
18
19
fasta = snakemake .input ["fa" ]
19
20
masked_fasta = snakemake .output ["out" ]
20
21
@@ -39,29 +40,35 @@ def write_dict2fasta(d, out):
39
40
write_dict2fasta (chr_seq , masked_fasta )
40
41
else :
41
42
for gene in genes2mask .split ("_" ):
42
- print (f"Masking { gene } sequence from { fasta } ..." )
43
-
44
- # Get genomic coordinates of genes to mask from GTF file
45
- cmd = f"""sed '1,4d' { gtf } | awk '{{if ($3 == "gene ") {{print $0}} }}' | grep { gene } """
43
+ print (f"Masking { gene } sequence from { fasta } (feature { feature2mask } ) ..." )
44
+
45
+ # Get genomic coordinates of every record of the selected feature type for the gene to mask from the GTF file
46
+ cmd = f"""sed '1,4d' { gtf } | awk '{{if ($3 == "{ feature2mask } ") {{print $0}} }}' | grep { gene } """
46
47
try :
47
- line = subprocess .check_output (cmd , shell = True ).decode ()
48
+ lines = subprocess .check_output (cmd , shell = True ).decode (). split ( " \n " )
48
49
except subprocess .CalledProcessError :
49
50
print (f"Gene { gene } not found in { gtf } ..." )
50
51
sys .exit (1 )
51
- chr , db , t , start , end , * args = line .split ("\t " )
52
-
53
- # Load chromosome sequence where gene is located
54
- seq = chr_seq [chr ]
55
-
56
- # Correct start and end positions for 0-based indexing
57
- start = int (start ) - 1
58
- end = int (end ) - 1
59
-
60
- # Mask gene sequence with Ns
61
- seq_masked = seq [:start ] + "N" * (end - start ) + seq [end :]
62
52
63
- # Replace sequence in dict
64
- chr_seq [chr ] = seq_masked
53
+ # Mask each feature of gene with Ns
54
+ for line in lines :
55
+ try :
56
+ chr , db , t , start , end , * args = line .split ("\t " )
57
+ except ValueError :
58
+ continue # Skip empty line (last one)
59
+
60
+ # Load chromosome sequence where the feature is located
61
+ seq = chr_seq [chr ]
62
+
63
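+ # Correct start and end positions for 0-based indexing (NOTE(review): GTF end is 1-based inclusive, so using end - 1 with the half-open slice below appears to leave the last base of the feature unmasked -- confirm)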
64
+ start = int (start ) - 1
65
+ end = int (end ) - 1
66
+
67
+ # Mask feature sequence with Ns
68
+ seq_masked = seq [:start ] + "N" * (end - start ) + seq [end :]
69
+
70
+ # Replace sequence in dict
71
+ chr_seq [chr ] = seq_masked
65
72
66
73
# Write masked fasta to file
67
74
print (f"Writing masked sequence(s) to { masked_fasta } ..." )
0 commit comments