Skip to content

Commit 7863452

Browse files
committed
updated regex list
1 parent 9c2d7a1 commit 7863452

File tree

1 file changed

+14
-18
lines changed

1 file changed

+14
-18
lines changed

workflow/scripts/gatc.track.maker.py

+14 -18
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,20 @@
11
#!/usr/bin/env python3
22

33
import argparse
4-
import gzip
54
import re
65
from concurrent.futures import ProcessPoolExecutor, as_completed
76
from Bio import SeqIO
87

98
# Command line argument processing
109
def process_cli():
11-
parser = argparse.ArgumentParser(description='Generates a GFF file containing the locations of all GATC sites in the genome sequence')
10+
parser = argparse.ArgumentParser(
11+
description="""
12+
Generates a GFF file containing the locations of all motif sites in the genome sequence
13+
"""
14+
)
1215
parser.add_argument("--fasta", "-f",
1316
type=str,
14-
help='Input genome FASTA file')
17+
help="Input genome FASTA file")
1518
parser.add_argument("--outfile", "-o",
1619
type=str,
1720
required=True,
@@ -22,7 +25,7 @@ def process_cli():
2225
choices=["hg38", "mm38", "dm6",
2326
"dm3", "hg19", "mm39"],
2427
required=True,
25-
help='Genome build (default: hg38)')
28+
help="Genome build (default: hg38)")
2629
parser.add_argument("--threads", "-t",
2730
type=int,
2831
default=1,
@@ -43,16 +46,13 @@ def process_cli():
4346
def regex_patterns(genome):
4447
# Dictionary to store scaffold and mitochondrial regex patterns
4548
patterns = {
46-
"hg38": [r"",""],
47-
"hg19": [r"",""],
48-
"mm38": [r"",""],
49-
"mm39": [r"",""],
49+
"hg38": [r"^KI|^GL","MT"],
50+
"hg19": [r"^GL","MT"],
51+
"mm39": [r"^JH|^GL|^MU","MT"],
5052
"dm6": [r"Scaffold|\d{15}","mitochondrion_genome"],
51-
"dm3": [r"",""],
5253
}
5354
return patterns[genome]
5455

55-
5656
def load_fasta(fasta, args):
5757
"""
5858
Load fasta file and return dict with chr as key and sequence as value.
@@ -71,11 +71,10 @@ def load_fasta(fasta, args):
7171
chr_seq = {k: v for k, v in chr_seq.items() if not re.search(mito_pattern, k, re.IGNORECASE)}
7272

7373
return chr_seq
74-
75-
74+
7675
# Generate track function
7776
def generate_track(args):
78-
motif = "GATC"
77+
motif = args.motif
7978
motif_len = len(motif)
8079

8180
fasta = args.fasta
@@ -101,16 +100,11 @@ def generate_track(args):
101100
for result in results:
102101
track.write(result)
103102

104-
print("All done!")
105-
106-
107103
# Process function
108104
def process(chr_name, seq, motif, motif_len):
109-
#print(f"Processing {chr_name} ...", file=sys.stderr)
110105
results = motif_hash(seq, chr_name, motif, motif_len)
111106
return chr_name, results
112107

113-
114108
# Motif hash function
115109
def motif_hash(seq, chr_name, motif, motif_len):
116110
results = []
@@ -125,7 +119,9 @@ def motif_hash(seq, chr_name, motif, motif_len):
125119
# Main function
126120
def main():
127121
args = process_cli()
122+
print(f"Mapping {args.motif} sites in {args.fasta}")
128123
generate_track(args)
124+
print("All done!")
129125

130126
if __name__ == "__main__":
131127
main()

0 commit comments

Comments
 (0)