-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbed_count_generic.py
95 lines (79 loc) · 4.39 KB
/
bed_count_generic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import argparse
import pandas as pd
import re
import os
import subprocess
####MAIN function
def main():
##Use the get_args function
BED = get_args()
BASEPREFIX = os.path.basename(BED)
# print(BASEPREFIX)
PREFIX = re.split("[.]", BASEPREFIX)[0]
# print(PREFIX)
WORD = BASEPREFIX.replace('_young_rm', '|')
# print(WORD)
ANIMAL = re.split("[|]", WORD)[0]
print(ANIMAL)
IN_BED = pd.read_table(BED, sep='\t', names=['chrom', 'start', 'stop', 'name', 'size', 'strand', 'class', 'family', 'diverge'])
CLASSES = ['Unknown', 'DNA', 'SINE', 'LINE', 'LTR', 'RC']
def DO_COUNT(CLASSTOCOUNT):
CLASS_IN_BED = IN_BED['class'] == CLASSTOCOUNT
IN_BED_CLASS = IN_BED[CLASS_IN_BED]
IN_UNIQUE = IN_BED_CLASS['name'].unique()
COUNTS = []
for UNIQUE in IN_UNIQUE:
COUNT = len(IN_BED_CLASS[IN_BED_CLASS['name'] == UNIQUE])
COUNTS.append(COUNT)
OUTFRAME = pd.DataFrame({'NAME': IN_UNIQUE, 'COUNT': COUNTS})
OUTFRAME.sort_values('COUNT', ascending=False, inplace=True)
OUTFRAME.to_csv(PREFIX + '_' + CLASSTOCOUNT + '_counts.txt', sep='\t', index=False)
DIR = r'/lustre/scratch/daray/200mammals/SINEandLINE/'
if os.path.isdir(DIR + '/' + ANIMAL + '_young/'):
# print(PREFIX + ' exists.')
OUTFRAME1000 = OUTFRAME[OUTFRAME['COUNT'] >= 1000]
OUTFRAME1000.to_csv(DIR + '/' + ANIMAL + '_young/' + ANIMAL + '_' + CLASSTOCOUNT + '_outframe1000.txt', sep='\t', index=False)
# print(OUTFRAME1000)
TELIST = OUTFRAME1000['NAME'].tolist()
# print(TELIST)
for TENAME in TELIST:
subprocess.check_call('cp /lustre/scratch/daray/200mammals/analyses/{}/extract_align/cons_comb/{}*_cons_comb.fas /lustre/scratch/daray/200mammals/SINEandLINE/{}/' .format (ANIMAL, TENAME + '_', ANIMAL + '_young'), shell=True)
subprocess.check_call('cp /lustre/scratch/daray/200mammals/analyses/{}/extract_align/muscle/{}*.muscle.fas /lustre/scratch/daray/200mammals/SINEandLINE/{}/' .format (ANIMAL, TENAME + '_', ANIMAL + '_young'), shell=True)
else:
# print(PREFIX + ' does not exist.')
os.mkdir(DIR + '/' + ANIMAL + '_young/')
OUTFRAME1000 = OUTFRAME[OUTFRAME['COUNT'] >= 1000]
OUTFRAME1000.to_csv(DIR + '/' + ANIMAL + '_young/' + ANIMAL + '_' + CLASSTOCOUNT + '_outframe1000.txt', sep='\t', index=False)
# print(OUTFRAME1000)
TELIST = OUTFRAME1000['NAME'].tolist()
# print(TELIST)
for TENAME in TELIST:
subprocess.check_call('cp /lustre/scratch/daray/200mammals/analyses/{}/extract_align/cons_comb/{}*_cons_comb.fas /lustre/scratch/daray/200mammals/SINEandLINE/{}/' .format (ANIMAL, TENAME + '_', ANIMAL + '_young'), shell=True)
subprocess.check_call('cp /lustre/scratch/daray/200mammals/analyses/{}/extract_align/muscle/{}*.muscle.fas /lustre/scratch/daray/200mammals/SINEandLINE/{}/' .format (ANIMAL, TENAME + '_', ANIMAL + '_young'), shell=True)
for CLASS in CLASSES:
DO_COUNT(CLASS)
##Get arguments function
def get_args():
parser = argparse.ArgumentParser(description="Will process a .bed file, identify putative families categorized as 'Unknown' and count the number of instances in the genome of that family.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-b', '--bed', type=str, help='Name of the .bed file to be parsed', required=True)
# parser.add_argument('-m', '--minsize', type=int, help='Minimum size of hit to include in sorted file', default = 100)
# parser.add_argument('-s', '--sortcriterion', type=str, help='Sort criterion, i.e. size, name, family, class, size, or divergence (diverge), etc.')
# parser.add_argument('-p', '--prefix', type=str, help='Prefix to use for output file - default is first field of input filename')
# parser.add_argument('-sp', '--split', type=str, help='Split into files based on name, family, class? This is optional.')
# parser.add_argument('-n', '--minhitnum', type=int, help='Minimum number of hits in file before being created. Only implemented if --split option is invoked. Optional.')
# parser.add_argument('-d', '--maxdiverge', type=float, help='Maximum divergence allowed in output file.')
# parser.add_argument('-dmin', '--mindiverge', type=float, help='Minimum divergence allowed in output file.')
args = parser.parse_args()
BED = args.bed
# MINSIZE = args.minsize
# PREFIX = args.prefix
# CRITERION = args.sortcriterion
# SPLIT = args.split
# HITS = args.minhitnum
# MAX = args.maxdiverge
# MINDIV = args.mindiverge
return BED
if __name__ =="__main__":main()
#gzip $ABBREV".align" &
#gzip $ABBREV"_div.out" &
#gzip $ABBREV"_wCpG_div.out" &