-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmining-d.smk
108 lines (102 loc) · 4.6 KB
/
mining-d.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Rules for running MINING-D using deduplicated CDRH3 sequences from SONAR's clustered IgM+ reads.
"""
import gzip
from collections import defaultdict
# The MINING-D authors actually clustered the CDR3 sequences *themselves*
# whereas I'm just deduplicating the CDR3s from SONAR's antibody cluster
# centroids. Possibly clustering their way would be even better; not sure.
def cdrh3s_from_airr(tsvgzs_in, fasta_out):
    """Write deduplicated heavy-chain CDR3 sequences from AIRR tables as FASTA.

    Args:
        tsvgzs_in: iterable of paths to tab-separated AIRR tables.  A path
            ending in ".gz" is read through gzip; anything else is read as
            plain text.
        fasta_out: path of the FASTA file to write.

    Rows are used when their "locus" is "IGH" and a sequence can be taken
    from either the "cdr3" column or, failing that, the "junction" column
    with three characters trimmed from each end.  In tables that have a
    "cluster_count" column, rows where it is empty are skipped.

    Each distinct sequence is written once, most frequent first (ties keep
    first-seen order), as ">cdr3_<rank> count=<occurrences>".

    Raises:
        KeyError: if a table has no "locus" column.
    """
    # Imported locally so the function does not rely on some other part of
    # the workflow having already brought DictReader into scope.
    from csv import DictReader

    counts = defaultdict(int)
    for path in tsvgzs_in:
        opener = gzip.open if str(path).endswith(".gz") else open
        with opener(path, "rt") as f_in:
            for row in DictReader(f_in, delimiter="\t"):
                # If this is SONAR AIRR only use the cluster centroids
                if "cluster_count" in row and not row["cluster_count"]:
                    continue
                if row["locus"] != "IGH":
                    continue
                # IgBLAST puts both cdr3 and junction cols in its AIRR TSV
                # output while SONAR only provides junction.  Prefer cdr3,
                # but fall back to the trimmed junction when cdr3 is missing
                # *or empty* so a blank cdr3 never hides a usable junction.
                seq = row.get("cdr3") or (row.get("junction") or "")[3:-3]
                if not seq:
                    # Nothing usable (no value, or a junction too short to
                    # have anything left after trimming); an empty FASTA
                    # record would be meaningless downstream.
                    continue
                counts[seq] += 1
    # Write deduplicated sequences sorted by abundance.  The sort is stable,
    # so equally abundant sequences stay in the order they were first seen.
    ranked = sorted(counts.items(), key=lambda pair: -pair[1])
    with open(fasta_out, "wt") as f_out:
        for idx, (seq, count) in enumerate(ranked):
            f_out.write(f">cdr3_{idx} count={count}\n{seq}\n")
def miningd_sonar_input(w):
    """Input function listing the SONAR rearrangement tables for one subject.

    Scans the workflow-level SPECIMENS mapping for entries whose "Subject"
    equals the ``subject`` wildcard and whose "CellType" contains "IgM",
    and expands the rearrangements-table path pattern once per matching
    specimen name.
    """
    igm_specimens = [
        attrs["Specimen"]
        for attrs in SPECIMENS.values()
        if attrs["Subject"] == w.subject and "IgM" in attrs["CellType"]
    ]
    return expand(
        "analysis/sonar/{subject}.mu/{specimen}/output/tables/{specimen}_rearrangements.tsv",
        subject=w.subject,
        specimen=igm_specimens)
# Collect one subject's CDRH3 sequences into a single FASTA file, using
# whatever tables miningd_sonar_input returns for that subject.
rule miningd_get_cdrh3s:
    input: miningd_sonar_input
    output: "analysis/mining-d/{subject}.cdr3.fasta"
    run:
        cdrh3s_from_airr(tsvgzs_in=input, fasta_out=output[0])
# Note that the order and naming of sequences changes from run to run, but the
# sequences themselves do appear to typically be the same for any given input.
# The miningd_final rule below should give consistent output.
# Note that the sequences are not *always* the same, though-- there's very
# slight randomness in the output in some cases.
# Run MINING_D.py on one subject's CDR3 FASTA at the P value threshold named
# by the {pval} wildcard.  The explicit conda package list goes to log.conda;
# start/end timestamps, settings, and input/output sequence counts are
# appended to log.main.
rule miningd_run:
    output: "analysis/mining-d/{subject}.output.{pval}.fasta"
    input: "analysis/mining-d/{subject}.cdr3.fasta"
    log:
        main="analysis/mining-d/{subject}.{pval}.log.txt",
        conda="analysis/mining-d/{subject}.{pval}.conda_build.txt"
    conda: "envs/mining-d.yaml"
    threads: 8
    params:
        # The authors thought 600 was a good choice for rhesus macaque and
        # human
        num_k_mers=600,
        # 4.5e-36 is the default setting.  Will tend to leave off a few
        # nucleotides at the edges, but will avoid false positives.
        # (If a wildcard value is given that's not "default" or "sensitive",
        # take that text itself as the P value threshold.)
        p_val_th=lambda w: {"default": "4.5e-36", "sensitive": "4.5e-16"}.get(w.pval, w.pval)
    # grep -c already prints "0" when nothing matches, but exits with status 1
    # in that case.  "|| true" absorbs that exit status (which would abort a
    # strict-mode shell) without adding a second "0" to the captured count.
    shell:
        """
        conda list --explicit > {log.conda}
        numseqs_in=$(grep -c "^>" {input} || true)
        echo "start: $(date +'%Y-%m-%d %H:%M:%S %Z')" >> {log.main}
        echo "threads: {threads}" >> {log.main}
        echo "input: {input}" >> {log.main}
        echo "input seqs: $numseqs_in" >> {log.main}
        echo "num_k_mers: {params.num_k_mers}" >> {log.main}
        echo "p_val_th: {params.p_val_th}" >> {log.main}
        MINING_D.py -t {threads} \
            -n {params.num_k_mers} \
            -p {params.p_val_th} \
            -i {input} -o {output}
        numseqs_out=$(grep -c "^>" {output} || true)
        echo "output seqs: $numseqs_out" >> {log.main}
        echo "end: $(date +'%Y-%m-%d %H:%M:%S %Z')" >> {log.main}
        """
# Sort the sequences and write as plain text, one sequence per line, to get the
# same output for the same input.
# Reduce a MINING-D FASTA to its sequences alone: one per line, in sorted
# order, with the record names dropped.
rule miningd_final:
    input: "analysis/mining-d/{subject}.output.{pval}.fasta"
    output: "analysis/mining-d/{subject}.output.{pval}.txt"
    run:
        with open(input[0]) as fasta_handle, open(output[0], "w") as txt_handle:
            # Sorting makes the text file independent of record order in
            # the FASTA.
            ordered = sorted(
                str(rec.seq) for rec in SeqIO.parse(fasta_handle, "fasta"))
            txt_handle.writelines(f"{seq}\n" for seq in ordered)