-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd_sample_name.py
executable file
·63 lines (51 loc) · 2.93 KB
/
add_sample_name.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
import sys
import os
import re
import argparse
def run(args):
    """Rename FASTA records streamed on stdin using a sample name parsed from a file name.

    The sample name is the first capture group of ``args["regexp"]`` matched
    against the base name of the sample file, with every underscore replaced
    by a hyphen. Each header line (a line starting with ``>``) read from stdin
    is rewritten as ``>{sample}_{NNNNNNN} {original first token}``, where
    ``NNNNNNN`` is a 1-based, zero-padded running sequence number. Any text
    after the first whitespace on the original header line is dropped. All
    other lines are copied to stdout unchanged.

    Args:
        args: dict with the keys
            ``sample_file_name`` - one-element list holding the sample file
                path (the file is never opened, only its name is parsed),
            ``regexp`` - regular expression whose first group captures the
                sample id,
            ``dry_run`` - if true, only report the parsed sample name and
                return without reading stdin.

    Raises:
        Exception: if the regexp does not match the file's base name, or if
            it matches without capturing a sample id in its first group.
    """
    sample_file_name = args['sample_file_name'][0]
    base_name = os.path.basename(sample_file_name)

    match = re.match(args["regexp"], base_name)
    if match is None:
        raise Exception("unable to parse sample id from %s"%base_name)

    # A user-supplied regexp may have no capturing group, or an optional group
    # that did not take part in the match; fail with a clear message instead
    # of an opaque IndexError / TypeError.
    if not match.groups() or match.group(1) is None:
        raise Exception(
            "regexp %s did not capture a sample id from %s" % (args["regexp"], base_name))

    # Underscores in the sample id become hyphens - presumably so that the
    # underscore written before the sequence number below is the only one in
    # the new sequence name (TODO confirm against downstream consumers).
    sample = match.group(1).replace("_", "-")

    if args["dry_run"]:
        # sys.stdout.write rather than the print statement, so the script
        # parses under both Python 2 and Python 3.
        sys.stdout.write(
            "(parsed sample name %s from filename %s)\n" % (sample, sample_file_name))
        return

    # Compiled once, outside the per-line loop.
    header_pattern = re.compile(r"^>(\S+)")
    seq_number = 1
    for record in sys.stdin:
        header = header_pattern.match(record)
        if header is None:
            # Not a header line: pass it through untouched.
            sys.stdout.write(record)
            continue
        sys.stdout.write(">%s_%07d %s\n" % (sample, seq_number, header.group(1)))
        seq_number += 1
def get_options():
    """Parse the command line and return the options as a plain dict.

    Returns:
        dict with the keys
            ``sample_file_name`` - one-element list holding the positional
                sample file path,
            ``regexp`` - regular expression used to parse the sample id
                (default ``^[^_]+_(\\S+)_``),
            ``dry_run`` - True when ``-n`` / ``--dry_run`` was given,
                otherwise False.
    """
    description = """
"""
    # Raw string: the example commands below contain shell backslash escapes
    # (\| \" \>) that must be shown literally in --help. In a non-raw string
    # \" silently collapses to " and the others are invalid escape sequences.
    long_description = r"""
# fasta text is streamed in , and each sequence name is edited using the
# command line argument, which is a sample filename like
# /dataset/Kittelmann_Buccal_Ill/scratch/nzgl01005/buildR1/processed_S398Buccal-SD-75MG_S96_L001_R2_001.fastq.trimmed.combined
# or
# /dataset/public_invermay_scratch/scratch/SL_Vac13/joined/VAC13-003_S3_L001_R1_001.fastq.combined.fasta
# example :
# cat /dataset/Kittelmann_Buccal_Ill/scratch/nzgl01005/buildR1/R1_combined.fa | ./add_sample_name.py /dataset/Kittelmann_Buccal_Ill/scratch/nzgl01005/buildR1/processed_S398Buccal-SD-75MG_S96_L001_R2_001.fastq.trimmed.combined
#
# example specifying an alt regexp
# tardis -d /dataset/gseq_processing/scratch/batch_qiime/afm_test/qiime_analysis -c 999999999 cat _condition_fastq2fasta_input_/dataset/Rumen_Livestock/scratch/PRJ0255318_FILES/39_samples_16S_Data/scratch/J293_2.fq.gz \| /dataset/gseq_processing/active/bin/batch_qiime_prism/add_sample_name_new.py -r "\"^([^_]+)_\"" /dataset/Rumen_Livestock/scratch/PRJ0255318_FILES/39_samples_16S_Data/scratch/J293_2.fq.gz \> _condition_uncompressedtext_output_/dataset/gseq_processing/scratch/batch_qiime/afm_test/qiime_analysis/J293_2.fq.gz.combined.fasta
#
# dry run to test parsing
# ./add_sample_name.py -n -r "^([^_]+)_" /dataset/Rumen_Livestock/scratch/PRJ0255318_FILES/39_samples_16S_Data/scratch/J293_2.fq.gz
"""

    # RawDescriptionHelpFormatter keeps the line breaks of the epilog as written.
    parser = argparse.ArgumentParser(
        description=description,
        epilog=long_description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # The sample file is not opened - only its name is used.
    parser.add_argument(
        'sample_file_name', type=str, nargs=1, help='sample_file_name')
    # Raw string so that \S reaches the regex engine without relying on
    # Python's handling of unknown string escapes.
    parser.add_argument(
        '-r', '--regexp', dest='regexp', type=str,
        metavar='regexp to parse sampleid',
        default=r"^[^_]+_(\S+)_",
        help="regexp to parse sampleid")
    parser.add_argument(
        '-n', '--dry_run', dest='dry_run', action='store_true', default=False,
        help='dry run only - just to test parsing ')

    return vars(parser.parse_args())
if __name__ == "__main__":
    # Script entry point: parse the command line, then process stdin with
    # the resulting options.
    run(get_options())