forked from chenmy33/IntronGetting
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFormats.py
98 lines (78 loc) · 3.54 KB
/
Formats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
'''This module reformats the headers of seqkit-extracted fasta files.'''
from collections import defaultdict
from Bio import SeqIO
import re
import os
from Path import ExonPath
import sys
# Matches the "<start>-<end>" coordinate pair embedded in a record ID.
_POSITION_PATTERN = re.compile(r'([0-9]+)-([0-9]+)')

# Strand symbol found in the record ID -> value written to the new header.
# Any other symbol is written as ".".
_STRAND_CODES = {"+": "1", "-": "-1"}


def _parse_exon_id(record_id):
    '''Split a record ID into (chromosome, start, end, strand code).'''
    match = _POSITION_PATTERN.search(record_id)
    if match is None:
        raise ValueError(
            f"no 'start-end' coordinates found in record ID {record_id!r}")
    start, end = match.groups()
    # Drop the "_<start>-<end>" part; what remains is "<chromosome>:<strand>".
    fields = record_id.replace(f'_{start}-{end}', '').split(':')
    if len(fields) < 2:
        raise ValueError(
            f"no ':<strand>' field found in record ID {record_id!r}")
    return fields[0], start, end, _STRAND_CODES.get(fields[1], ".")


def RFHeader(inputpath, outputpath, filename, sortedfilename):
    '''Reformat the sequence IDs of one exon fasta and write it grouped by gene.

    Reads ``inputpath/filename``. For every record the gene ID is the last
    space-separated field of the description, and the record ID must contain
    ``_<start>-<end>`` plus a ``:<strand>`` field. Exons are grouped by gene
    (genes keep their first-seen order), sorted by start position within each
    gene, numbered from 1, and written to ``outputpath/sortedfilename`` with
    the header::

        >geneid|geneid|chromosome|geneid.N|start|end|strand

    where strand is "1" for "+", "-1" for "-" and "." otherwise.

    Raises:
        ValueError: if a record ID lacks the coordinate pair or strand field.
    '''
    # The whole input is read before the output is opened, so a failure while
    # parsing never leaves a partially written output file behind.
    gene_dict = defaultdict(list)
    with open(os.path.join(inputpath, filename), 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            geneid = record.description.split(' ')[-1]
            chrom, start, end, strand = _parse_exon_id(record.id)
            gene_dict[geneid].append((chrom, start, end, strand, record.seq))

    with open(os.path.join(outputpath, sortedfilename), 'w') as output:
        for geneid, exons in gene_dict.items():
            # sorted() is stable, so exons sharing a start keep input order.
            ordered = sorted(exons, key=lambda exon: int(exon[1]))
            for number, exon in enumerate(ordered, start=1):
                chrom, start, end, strand, seq = exon
                header = (f"{geneid}|{geneid}|{chrom}|{geneid}.{number}"
                          f"|{start}|{end}|{strand}")
                output.write(">" + header + '\n' + str(seq) + '\n')
def main():
    '''Reformat every exon fasta that ExonPath reports for the input path.

    Command line: <script> <inputpath> <outputpath>

    Each input name ``fa`` is written to ``outputpath`` under the name
    ``fa.replace('.fa', '_exon')``; the input/output name pair is printed
    tab-separated before each file is processed.
    '''
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <inputpath> <outputpath>")
    inputpath, outputpath = sys.argv[1], sys.argv[2]

    # ExonPath is imported from the project's Path module; presumably it
    # returns the exon fasta file names found under inputpath -- TODO confirm.
    for fa in ExonPath(inputpath):
        # Compute the output name once so the log line always matches the
        # file that is actually written.
        # NOTE(review): if the intended output name is the input name with
        # '.singlecopy.exon.fa' replaced by '_exon', change it here.
        sortedfilename = fa.replace('.fa', '_exon')
        print(f"{fa}\t{sortedfilename}")
        RFHeader(inputpath, outputpath, fa, sortedfilename)
if __name__ == "__main__":
    # Run the command-line entry point only when executed as a script.
    # For a quick manual check, call RFHeader directly instead, e.g.
    # RFHeader('tmp_data', 'tmp_data', 'test.fa', 'test.rfmt.fa')
    main()
'''
inputpath is the directory containing the seqkit-extracted exon fasta files
outputpath is the directory where the reformatted exon fasta files are written
Note: the input filename pattern expected by IntronGetting is 'Species_exon'
'''