Skip to content

Commit

Permalink
Merge pull request #14 from cov-lineages/dev
Browse files Browse the repository at this point in the history
Merge in dev
  • Loading branch information
rmcolq authored Jun 23, 2021
2 parents 443fe86 + 59f8945 commit 1685579
Show file tree
Hide file tree
Showing 11 changed files with 589 additions and 90 deletions.
2 changes: 1 addition & 1 deletion scorpio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
_program = "scorpio"
__version__ = "0.3.1"
__version__ = "0.3.2"
11 changes: 9 additions & 2 deletions scorpio/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ def main(sysargs = sys.argv[1:]):
help="One or more JSON (or CSV) files specifying variants")
constellation_group.add_argument("-n", "--names", dest="names", required=False, nargs='+',
help="Names of constellations to include")
constellation_group.add_argument("-l", "--label", dest="label", required=False,
help="Use alternative label specified in JSON where possible")
constellation_group.add_argument('--pangolin', dest='pangolin', action='store_true',
help='Uses `mrca_lineage` as label and excludes constellations/data directory for'
' robustness')

misc_group = common.add_argument_group('Misc options')
misc_group.add_argument('--tempdir', action="store",
Expand Down Expand Up @@ -191,9 +196,11 @@ def main(sysargs = sys.argv[1:]):
for fn in f:
if fn == "SARS-CoV-2.json":
reference_json = os.path.join(r, fn)
elif fn.endswith(".json"):
elif args.pangolin and r.endswith('definitions') and fn.endswith(".json"):
list_constellation_files.append(os.path.join(r, fn))
elif fn.endswith(".csv"):
elif not args.pangolin and fn.endswith(".json"):
list_constellation_files.append(os.path.join(r, fn))
elif not args.pangolin and fn.endswith(".csv"):
list_constellation_files.append(os.path.join(r, fn))
if (not args.reference_json and reference_json == "") or (not args.constellations and list_constellation_files == []):
print(sfunk.cyan(
Expand Down
134 changes: 92 additions & 42 deletions scorpio/scripts/type_constellations.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,24 +150,25 @@ def variant_to_variant_record(l, refseq, features_dict):
info["name"] = l
print("Warning: found variant of type insertion, which will be ignored during typing")
elif lsplit[0] in ["snp", "nuc"]:
type = "snp"
ref_allele = lsplit[1][0]
ref_start = int(lsplit[1][1:-1])
alt_allele = lsplit[1][-1]
ref_allele_check = refseq[ref_start - 1]
info = {"name": l, "type": "snp"}
m = re.match(r'(?P<ref_allele>[ACGTUN]+)(?P<ref_start>\d+)(?P<alt_allele>[AGCTUN]*)', l[4:])
if not m:
sys.stderr.write("Warning: couldn't parse the following string: %s - ignoring\n" % l)
sys.exit(1)
info.update(m.groupdict())
info["ref_start"] = int(info["ref_start"])
ref_allele_check = refseq[info["ref_start"] - 1]

if ref_allele != '?' and ref_allele != ref_allele_check:
if info["ref_allele"] != '?' and info["ref_allele"] != ref_allele_check:
sys.stderr.write(
"variants file says reference nucleotide at position %d is %s, but reference sequence has %s, "
"context %s\n" % (ref_start, ref_allele, ref_allele_check, refseq[ref_start - 4:ref_start + 3]))
"context %s\n" % (info["ref_start"], info["ref_allele"], ref_allele_check, refseq[info["ref_start"] - 4:info["ref_start"] + 3]))
sys.exit(1)

info = {"name": l, "type": type, "ref_start": ref_start, "ref_allele": ref_allele, "alt_allele": alt_allele}

elif lsplit[0] == "del":
length = int(lsplit[2])
info = {"name": l, "type": lsplit[0], "ref_start": int(lsplit[1]), "length": length,
"ref_allele": refseq[int(lsplit[1]) - 1:int(lsplit[1]) + length - 1]}
"ref_allele": refseq[int(lsplit[1]) - 1:int(lsplit[1]) + length - 1], "space": "nuc"}

else:
m = re.match(r'[aa:]*(?P<cds>\w+):(?P<ref_allele>[a-zA-Z-*]+)(?P<aa_pos>\d+)(?P<alt_allele>[a-zA-Z-*]*)', l)
Expand Down Expand Up @@ -196,8 +197,10 @@ def variant_to_variant_record(l, refseq, features_dict):
info["ref_start"] = ref_start
if info["alt_allele"] in ['del', '-']:
info["type"] = "del"
info["length"] = 3*len(info["ref_allele"])
info["ref_allele"] = str(ref_allele)
info["space"] = "aa"
info["length"] = len(info["ref_allele"])
info["before"] = str(Seq(refseq[ref_start - 4:ref_start - 1]).translate())
info["after"] = str(Seq(refseq[ref_start - 1 + 3*info["length"]:ref_start - 1 + 3*info["length"]+3]).translate())
elif info["alt_allele"] == '':
info["fuzzy"] = True
else:
Expand All @@ -213,25 +216,40 @@ def parse_name_from_file(constellation_file):
return name


def parse_json_in(refseq, features_dict, variants_file, constellation_names=None, include_ancestral=False):
def parse_json_in(refseq, features_dict, variants_file, constellation_names=None, include_ancestral=False, label=None):
"""
returns variant_list name and rules
"""
variant_list = []
name = None
rules = None
mrca_lineage = ""

in_json = open(variants_file, 'r')
json_dict = json.load(in_json, strict=False)

if "label" in json_dict:
if "type" in json_dict and json_dict["type"] in json_dict and "mrca_lineage" in json_dict[json_dict["type"]]:
m = re.match(r'[A-Z0-9.]*', json_dict[json_dict["type"]]["mrca_lineage"])
if not m:
sys.stderr.write("Warning: mrca_lineage %s not in acceptable format - ignoring\n" % json_dict[json_dict["type"]]["mrca_lineage"])
else:
mrca_lineage = json_dict[json_dict["type"]]["mrca_lineage"]


if label:
if "type" in json_dict and json_dict["type"] in json_dict and label in json_dict[json_dict["type"]]:
name = json_dict[json_dict["type"]][label]
elif "label" in json_dict:
name = json_dict["label"]
elif "name" in json_dict:
name = json_dict["name"]
else:
name = parse_name_from_file(variants_file)

if not name:
return variant_list, name, rules, mrca_lineage
if constellation_names and name not in constellation_names:
return variant_list, name, rules
return variant_list, name, rules, mrca_lineage

print("\nParsing constellation JSON file %s" % variants_file)

Expand All @@ -251,7 +269,7 @@ def parse_json_in(refseq, features_dict, variants_file, constellation_names=None

in_json.close()

return variant_list, name, rules
return variant_list, name, rules, mrca_lineage


def parse_csv_in(refseq, features_dict, variants_file, constellation_names=None):
Expand Down Expand Up @@ -323,7 +341,7 @@ def parse_textfile_in(refseq, features_dict, variants_file, constellation_names=
return variant_list, name


def parse_variants_in(refseq, features_dict, variants_file, constellation_names=None, include_ancestral=False):
def parse_variants_in(refseq, features_dict, variants_file, constellation_names=None, include_ancestral=False, label=None):
"""
read in a variants file and parse its contents and
return something sensible.
Expand All @@ -343,19 +361,20 @@ def parse_variants_in(refseq, features_dict, variants_file, constellation_names=
"""
variant_list = []
rule_dict = None
mrca_lineage = ""

if variants_file.endswith(".json"):
variant_list, name, rule_dict = parse_json_in(refseq, features_dict, variants_file, constellation_names, include_ancestral=include_ancestral)
variant_list, name, rule_dict, mrca_lineage = parse_json_in(refseq, features_dict, variants_file, constellation_names, include_ancestral=include_ancestral,label=label)
elif variants_file.endswith(".csv"):
variant_list, name, rule_dict = parse_csv_in(refseq, features_dict, variants_file, constellation_names)

if len(variant_list) == 0 and not variants_file.endswith(".json"):
variant_list, name = parse_textfile_in(refseq, features_dict, variants_file, constellation_names)

return name, variant_list, rule_dict
return name, variant_list, rule_dict, mrca_lineage


def call_variant_from_fasta(record_seq, var, ins_char="?", oth_char=None):
def call_variant_from_fasta(record_seq, var, ins_char="?", oth_char=None, codon=False):
#print("Call variant for ", var)
call = None
query_allele = None
Expand All @@ -373,15 +392,16 @@ def call_variant_from_fasta(record_seq, var, ins_char="?", oth_char=None):
call = 'ref'
elif query_allele == "N":
call = 'ambig'
elif query_allele == var["alt_allele"]:
elif query_allele == var["alt_allele"] or var["alt_allele"] == "":
call = 'alt'
else:
call = 'oth'
#print(call, query_allele)

elif var["type"] == "aa":
try:
query_allele = record_seq.upper()[var["ref_start"] - 1:var["ref_start"] + 2].translate()
query = record_seq.upper()[var["ref_start"] - 1:var["ref_start"] - 1 + 3 * len(var["ref_allele"])]
query_allele = query.translate()
#query_allele_minus = record_seq.upper()[var["ref_start"] - 2:var["ref_start"] + 1].translate()
#query_allele_plus = record_seq.upper()[var["ref_start"]:var["ref_start"] + 3].translate()
#print("Found", query_allele, query_allele_minus, query_allele_plus)
Expand All @@ -400,14 +420,37 @@ def call_variant_from_fasta(record_seq, var, ins_char="?", oth_char=None):
call = 'oth'
#print(call, query_allele)

elif var["type"] == "del":
elif var["type"] == "del" and var["space"] == "aa":
query_allele = record_seq.upper()[var["ref_start"] - 4:var["ref_start"] + 3*var["length"] + 2]
query = query_allele.replace("-","")
if len(query) % 3 != 0:
query = query.replace("N","")
if len(query) % 3 != 0:
query = query_allele.replace("-","N")
if len(query) % 3 != 0:
print("Warning: while typing variant %s (before,ref,after) = (%s,%s,%s) found sequence with query allele %s treated as %s. Handling by adding Ns which will result in ambiguous calls" %(var["name"], var["before"], var["ref_allele"], var["after"], query_allele, query))
query_allele = query
while len(query_allele) % 3 != 0:
query_allele += "N"
query_allele = query_allele.translate()
#print("call for del in aa space with before %s, ref %s, after %s, length %d and query_allele %s" %(var["before"], var["ref_allele"], var["after"], var["length"], query_allele))
if query_allele == var["before"] + var["ref_allele"] + var["after"]:
call = 'ref'
query_allele = 0
elif query_allele == var["before"] + var["after"]:
call = 'alt'
query_allele = var["length"]
elif "X" in query_allele:
call = 'ambig'
query_allele = "X"
else:
call = 'oth'
if not oth_char:
query_allele = "X"
elif var["type"] == "del" and var["space"] == "nuc":
query_allele = record_seq.upper()[var["ref_start"] - 1:var["ref_start"] + var["length"] - 1]
#print("del allele", query_allele)
#query_allele_minus = record_seq.upper()[var["ref_start"] - 2:var["ref_start"] + var["length"] - 2]
#query_allele_plus = record_seq.upper()[var["ref_start"]:var["ref_start"] + var["length"]]
#print("Found", query_allele, query_allele_minus, query_allele_plus)
#print(var["ref_allele"], query_allele == var["ref_allele"], var["alt_allele"],
# query_allele == var["alt_allele"])
#print("call for del in nuc space with ref %s, length %d and query_allele %s" %(var["ref_allele"], var["length"], query_allele))

if query_allele == var["ref_allele"]:
call = 'ref'
query_allele = 0
Expand All @@ -419,7 +462,7 @@ def call_variant_from_fasta(record_seq, var, ins_char="?", oth_char=None):
query_allele = "N"
elif query_allele == "-" * var["length"]:
call = 'alt'
query_allele = int(var["length"]/3)
query_allele = int(var["length"] / 3)
else:
call = 'oth'
if not oth_char:
Expand Down Expand Up @@ -508,12 +551,12 @@ def generate_barcode(record_seq, variant_list, ref_char=None, ins_char="?", oth_


def type_constellations(in_fasta, list_constellation_files, constellation_names, out_csv, reference_json, ref_char=None,
output_counts=False):
output_counts=False, label=None):
reference_seq, features_dict = load_feature_coordinates(reference_json)

constellation_dict = {}
for constellation_file in list_constellation_files:
constellation, variants, ignore = parse_variants_in(reference_seq, features_dict, constellation_file)
constellation, variants, ignore, mrca_lineage = parse_variants_in(reference_seq, features_dict, constellation_file, label=label)
if not constellation:
continue
if len(variants) > 0:
Expand All @@ -529,7 +572,8 @@ def type_constellations(in_fasta, list_constellation_files, constellation_names,
counts_out = {}
if output_counts:
for constellation in constellation_dict:
counts_out[constellation] = open("%s.%s_counts.csv" % (out_csv.replace(".csv", ""), constellation), "w")
clean_name = re.sub("[^a-zA-Z0-9_\-.]", "_", constellation)
counts_out[constellation] = open("%s.%s_counts.csv" % (out_csv.replace(".csv", ""), clean_name), "w")
counts_out[constellation].write("query,ref_count,alt_count,ambig_count,other_count,support,conflict\n")

with open(in_fasta, "r") as f:
Expand All @@ -556,15 +600,16 @@ def type_constellations(in_fasta, list_constellation_files, constellation_names,


def classify_constellations(in_fasta, list_constellation_files, constellation_names, out_csv, reference_json,
output_counts=False, call_all=False, long=False):
output_counts=False, call_all=False, long=False, label=None):

reference_seq, features_dict = load_feature_coordinates(reference_json)

constellation_dict = {}
rule_dict = {}
mrca_lineage_dict = {}
for constellation_file in list_constellation_files:
constellation, variants, rules = parse_variants_in(reference_seq, features_dict, constellation_file,
constellation_names, include_ancestral=True)
constellation, variants, rules, mrca_lineage = parse_variants_in(reference_seq, features_dict, constellation_file,
constellation_names, include_ancestral=True, label=label)
if constellation_names and constellation not in constellation_names:
continue
if not rules:
Expand All @@ -577,15 +622,16 @@ def classify_constellations(in_fasta, list_constellation_files, constellation_na
print("Found file %s for constellation %s containing %i variants" % (
constellation_file, constellation, len([v["name"] for v in variants])))
print("Rules", rule_dict[constellation])
mrca_lineage_dict[constellation] = mrca_lineage
else:
print("Warning: %s is not a valid constellation file - ignoring" % constellation_file)

variants_out = open(out_csv, "w")
if long and not call_all:
variants_out.write("query,constellations,ref_count,alt_count,ambig_count,other_count,rule_count,support,"
variants_out.write("query,constellations,mrca_lineage,ref_count,alt_count,ambig_count,other_count,rule_count,support,"
"conflict\n")
else:
variants_out.write("query,constellations\n")
variants_out.write("query,constellations,mrca_lineage\n")

counts_out = {}
if output_counts:
Expand Down Expand Up @@ -633,14 +679,18 @@ def classify_constellations(in_fasta, list_constellation_files, constellation_na
if not call_all and best_constellation:
lineages.append(best_constellation)
if long and best_counts is not None:
variants_out.write("%s,%s,%i,%i,%i,%i,%i,%f,%f\n" % (record.id, "|".join(lineages), best_counts['ref'],
variants_out.write("%s,%s,%s,%i,%i,%i,%i,%i,%f,%f\n" % (record.id, "|".join(lineages),
"|".join([mrca_lineage_dict[l] for l in lineages]),
best_counts['ref'],
best_counts['alt'], best_counts['ambig'],
best_counts['oth'], best_counts['rules'],
best_counts['support'], best_counts['conflict']))
elif long:
variants_out.write("%s,%s,,,,,,,\n" % (record.id, "|".join(lineages)))
elif long and not call_all:
variants_out.write("%s,%s,%s,,,,,,,\n" % (record.id, "|".join(lineages),
"|".join([mrca_lineage_dict[l] for l in lineages])))
else:
variants_out.write("%s,%s\n" % (record.id, "|".join(lineages)))
variants_out.write("%s,%s,%s\n" % (record.id, "|".join(lineages),
"|".join([mrca_lineage_dict[l] for l in lineages])))

variants_out.close()
for constellation in counts_out:
Expand Down
3 changes: 2 additions & 1 deletion scorpio/subcommands/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ def run(options):
options.reference_json,
options.output_counts,
options.call_all,
options.long)
options.long,
options.label)
3 changes: 2 additions & 1 deletion scorpio/subcommands/haplotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ def run(options):
options.output,
options.reference_json,
options.ref_char,
options.output_counts)
options.output_counts,
options.label)
16 changes: 8 additions & 8 deletions scorpio/tests/data/type_constellations/expect.classified.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
query,constellations
Flatland01,Lineage_X
Flatland02,Lineage_X
Flatland03,Lineage_X
Flatland04,
Flatland05,
Flatland06,
Reference,
query,constellations,mrca_lineage
Flatland01,Lineage_X,B.1.1.7
Flatland02,Lineage_X,B.1.1.7
Flatland03,Lineage_X,B.1.1.7
Flatland04,,
Flatland05,,
Flatland06,,
Reference,,
16 changes: 8 additions & 8 deletions scorpio/tests/data/type_constellations/expected.classified.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
query,constellations
Flatland01,Lineage_X
Flatland02,Lineage_X
Flatland03,Lineage_X
Flatland04,
Flatland05,
Flatland06,
Reference,
query,constellations,mrca_lineage
Flatland01,Lineage_X,B.1.1.7
Flatland02,Lineage_X,B.1.1.7
Flatland03,Lineage_X,B.1.1.7
Flatland04,,
Flatland05,,
Flatland06,,
Reference,,
8 changes: 4 additions & 4 deletions scorpio/tests/data/type_constellations/expected.typed.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
query,Lineage_X
Flatland01,-IDTT3TT-XXYDHIAH-*ICLFK
Flatland02,-IDTT3TT-XXYDHIAH-*ICLFK
Flatland03,-IDTT3TT-XXYDHIAH-*ICLFK
Flatland04,-IDTT3TT-XNYDHIAH-*ICLF-
Flatland01,-IDTT3TT-21YDHIAH-*ICLFK
Flatland02,-IDTT3TT-21YDHIAH-*ICLFK
Flatland03,-IDTT3TT-21YDHIAH-*ICLFK
Flatland04,-IDTT3TT-2XYDHIAH-*ICLF-
Flatland05,-----3-----Y------XX---K
Flatland06,-----3-----X-----------X
Reference,------------------------
4 changes: 4 additions & 0 deletions scorpio/tests/data/type_constellations/lineage_X.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
{
"name": "Lineage_X",
"type": "variant",
"variant": {
"mrca_lineage": "B.1.1.7"
},
"sites": [
"nuc:C912T",
"1ab:T1001I",
Expand Down
429 changes: 428 additions & 1 deletion scorpio/tests/data/type_constellations/test.fa

Large diffs are not rendered by default.

Loading

0 comments on commit 1685579

Please sign in to comment.