Skip to content

Commit ec48628

Browse files
committed
Fasta masking with either whole gene seq or exons
1 parent aa872d4 commit ec48628

File tree

5 files changed

+44
-28
lines changed

5 files changed

+44
-28
lines changed

config/config.yaml

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
genome: dm6
22
ensembl_genome_build: 110
3-
plasmid_fasta: none
4-
fusion_genes: FBgn0038542,FBgn0085506 # Genes from these proteins will be removed from the analysis
5-
bowtie2:
6-
extra: ""
3+
plasmid_fasta: none # Path to plasmid fasta file with sequences to be removed
4+
fusion_genes:
5+
genes: FBgn0038542,FBgn0085506 # Comma-separated gene IDs whose sequences will be masked in the fasta file
6+
feature_to_mask: "exon" # Gene feature to mask from the fasta file (exon or gene)
77
damidseq_pipeline:
88
normalization: kde # kde, rpm or rawbins
99
binsize: 300
@@ -67,6 +67,7 @@ resources: # computing resources
6767
damid:
6868
cpu: 24
6969
time: 720
70+
tmpdir: /tmp
7071
index:
7172
cpu: 40
7273
time: 60

workflow/rules/resources.smk

+1
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ rule masked_fasta:
8080
params:
8181
g2m=maskedgenes,
8282
genome=resources.genome,
83+
f2m=config["fusion_genes"]["feature_to_mask"]
8384
log:
8485
"logs/resources/masked_fasta.log"
8586
threads: config["resources"]["plotting"]["cpu"]

workflow/schemas/config.schema.yaml

+10-3
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,16 @@ properties:
1616
description: Ensembl genome build
1717

1818
fusion_genes:
19-
type: string
20-
description: Genes from these proteins will be removed from the analysis
21-
19+
type: object
20+
properties:
21+
genes:
22+
type: string
23+
description: Genes from these proteins will be masked from the fasta file
24+
feature_to_mask:
25+
type: string
26+
enum: ["exon", "gene"]
27+
description: Feature to mask from the fasta file (exon or gene)
28+
2229
damidseq_pipeline:
2330
type: object
2431
properties:

workflow/scripts/general_functions.smk

+1-1
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ def masked_genes():
362362
"""
363363
Returns string with genes that were masked in fasta file
364364
"""
365-
genes = config["fusion_genes"]
365+
genes = config["fusion_genes"]["genes"]
366366

367367
# If no genes are given, return no_genes
368368
if genes == "":

workflow/scripts/mask_fasta.py

+27-20
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
import sys
22
import subprocess
3-
import re
43
from Bio import SeqIO
54

65
"""
76
Replaces gene sequences set in config:fusion_genes
87
in fasta file with Ns.
98
9+
The feature to mask (whole gene sequence or only its exons) is set in config:fusion_genes:feature_to_mask
10+
1011
Reason: plasmid expressing Dam fusion
1112
genes can be methylated at very high levels
1213
"""
1314

1415
# Load Snakemake variables
1516
gtf = snakemake.input["gtf"]
1617
genes2mask = snakemake.params["g2m"]
17-
genome = snakemake.params["genome"]
18+
feature2mask = snakemake.params["f2m"]
1819
fasta = snakemake.input["fa"]
1920
masked_fasta = snakemake.output["out"]
2021

@@ -39,29 +40,35 @@ def write_dict2fasta(d, out):
3940
write_dict2fasta(chr_seq, masked_fasta)
4041
else:
4142
for gene in genes2mask.split("_"):
42-
print(f"Masking {gene} sequence from {fasta}...")
43-
44-
# Get genomic coordinates of genes to mask from GTF file
45-
cmd = f"""sed '1,4d' {gtf} | awk '{{if ($3 == "gene") {{print $0}} }}' | grep {gene}"""
43+
print(f"Masking {gene} sequence from {fasta} (feature {feature2mask})...")
44+
45+
# Get genomic coordinates of selected feature of gene to mask from GTF file
46+
cmd = f"""sed '1,4d' {gtf} | awk '{{if ($3 == "{feature2mask}") {{print $0}} }}' | grep {gene}"""
4647
try:
47-
line = subprocess.check_output(cmd, shell=True).decode()
48+
lines = subprocess.check_output(cmd, shell=True).decode().split("\n")
4849
except subprocess.CalledProcessError:
4950
print(f"Gene {gene} not found in {gtf}...")
5051
sys.exit(1)
51-
chr, db, t, start, end, *args = line.split("\t")
52-
53-
# Load chromosome sequence where gene is located
54-
seq = chr_seq[chr]
55-
56-
# Correct start and end positions for 0-based indexing
57-
start = int(start) - 1
58-
end = int(end) - 1
59-
60-
# Mask gene sequence with Ns
61-
seq_masked = seq[:start] + "N" * (end - start) + seq[end:]
6252

63-
# Replace sequence in dict
64-
chr_seq[chr] = seq_masked
53+
# Mask each feature of gene with Ns
54+
for line in lines:
55+
try:
56+
chr, db, t, start, end, *args = line.split("\t")
57+
except ValueError:
58+
continue # Skip empty line (last one)
59+
60+
# Load chromosome sequence where gene is located
61+
seq = chr_seq[chr]
62+
63+
# Correct start and end positions for 0-based indexing
64+
start = int(start) - 1
65+
end = int(end) - 1
66+
67+
# Mask gene sequence with Ns
68+
seq_masked = seq[:start] + "N" * (end - start) + seq[end:]
69+
70+
# Replace sequence in dict
71+
chr_seq[chr] = seq_masked
6572

6673
# Write masked fasta to file
6774
print(f"Writing masked sequence(s) to {masked_fasta}...")

0 commit comments

Comments
 (0)