Skip to content

Commit

Permalink
Merge pull request #31 from cov-lineages/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
rmcolq authored Oct 19, 2021
2 parents 1278e79 + f4d1fa7 commit e12eb0e
Show file tree
Hide file tree
Showing 11 changed files with 473 additions and 177 deletions.
2 changes: 1 addition & 1 deletion scorpio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
_program = "scorpio"
__version__ = "0.3.12"
__version__ = "0.3.13"
10 changes: 9 additions & 1 deletion scorpio/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ def main(sysargs = sys.argv[1:]):
"--append-genotypes", dest="append_genotypes", action="store_true",
help="Output a column per variant with the call"
)
subparser_haplotype.add_argument(
"--combination", dest="combination", action="store_true",
help="Combines the mutations for the specified constellations, and outputs a string across them all, with counts per found constellation"
)
subparser_haplotype.set_defaults(func=scorpio.subcommands.haplotype.run)

# _______________________________ report __________________________________#
Expand Down Expand Up @@ -154,6 +158,10 @@ def main(sysargs = sys.argv[1:]):
'--outgroups', dest='outgroups', required=False,
help='Two column CSV with group, and pipe separated list of outgroup sequence_names for that list. '
'Assumes outgroups will be in main input CSV')
subparser_define.add_argument(
"--protein", dest="protein", action="store_true",
help="Translates definition coordinates to proteins where possible"
)

subparser_define.set_defaults(func=scorpio.subcommands.define.run)

Expand Down Expand Up @@ -243,7 +251,7 @@ def main(sysargs = sys.argv[1:]):
if not args.reference_json:
args.reference_json = reference_json
logging.info("Found reference %s" %args.reference_json)
if not args.constellations:
if not args.constellations and args.command in ['haplotype', 'classify']:
args.constellations = list_constellation_files
logging.info("Found constellations:")
for c in args.constellations:
Expand Down
46 changes: 34 additions & 12 deletions scorpio/scripts/extract_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from Bio.Seq import Seq
from operator import itemgetter

from .type_constellations import load_feature_coordinates
from .type_constellations import load_feature_coordinates, resolve_ambiguous_cds

def parse_args():
parser = argparse.ArgumentParser(description="""Pick a representative sample for each unique sequence""",
Expand Down Expand Up @@ -90,6 +90,15 @@ def update_var_dict(var_dict, group, variants):
return


def update_feature_dict(feature_dict):
for feature in feature_dict:
if len(feature_dict[feature]) > 2:
cds, aa_pos = resolve_ambiguous_cds(feature_dict[feature][2], feature_dict[feature][0], feature_dict)
if aa_pos:
feature_dict[feature] = (aa_pos, feature_dict[feature][1] + feature_dict[feature][0] - aa_pos, cds)
return feature_dict


def get_common_mutations(var_dict, min_occurance=3, threshold_common=0.98, threshold_intermediate=0.25):
sorted_tuples = sorted(var_dict.items(), key=operator.itemgetter(1))
var_dict = {k: v for k, v in sorted_tuples}
Expand All @@ -110,7 +119,7 @@ def get_common_mutations(var_dict, min_occurance=3, threshold_common=0.98, thres
return common, intermediate


def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_seq):
def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_seq, include_protein=False):
nuc_end = nuc_start + len(nuc_ref)
nuc_start = int(nuc_start)
nuc_end = int(nuc_end)
Expand Down Expand Up @@ -138,12 +147,26 @@ def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_s
if ref_allele == query_allele:
return "nuc:%s%i%s" % (nuc_ref, nuc_start, nuc_alt)
aa_pos = int((start - feature_dict[feature][0]) / 3) + 1
if include_protein:
feature, aa_pos = translate_to_protein_if_possible(feature, aa_pos, feature_dict)
#print(start, end, ref_allele, query_allele, aa_pos, feature)
return "%s:%s%i%s" % (feature, ref_allele, aa_pos, query_allele)
return "nuc:%s%i%s" % (nuc_ref, nuc_start, nuc_alt)


def define_mutations(list_variants, feature_dict, reference_seq):
def translate_to_protein_if_possible(cds, aa_start, feature_dict):
if not cds.startswith("orf"):
return cds, aa_start

for feature in feature_dict:
if len(feature_dict[feature]) < 3:
continue # only want nsp definitions
if feature_dict[feature][2] == cds:
if feature_dict[feature][0] <= aa_start <= feature_dict[feature][1]:
return feature, aa_start-feature_dict[feature][0]+1
return cds, aa_start

def define_mutations(list_variants, feature_dict, reference_seq, include_protein=False):
merged_list = []
if not list_variants:
return merged_list
Expand Down Expand Up @@ -184,7 +207,7 @@ def define_mutations(list_variants, feature_dict, reference_seq):
elif new[3]:
current[3] = new[3]
elif current[0] != "":
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq)
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq, include_protein)
if freq:
merged_list.append("%s:%s" % (var, freq))
else:
Expand All @@ -193,7 +216,7 @@ def define_mutations(list_variants, feature_dict, reference_seq):
else:
current = new
if current[0] != "":
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq)
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq, include_protein)
if freq:
merged_list.append("%s:%s" % (var, freq))
else:
Expand All @@ -214,17 +237,15 @@ def subtract_outgroup(common, outgroup_common):

def write_constellation(prefix, group, list_variants, list_intermediates, list_ancestral):
group_dict = {"name": group, "sites": list_variants, "intermediate": list_intermediates,
"rules": {"min_alt": int((len(list_variants) + 1) / 4), "max_ref": int((len(list_variants) - 1) / 4)}}
"rules": {"min_alt": max(len(list_variants) - 3, min(len(list_variants), 3)), "max_ref": 3}}
if list_ancestral:
group_dict["ancestral"] = list_ancestral
group_dict["rules"]["min_alt"] += int((len(list_ancestral)+1)/4)
group_dict["rules"]["max_ref"] += int((len(list_ancestral)-1)/4)
with open('%s/%s.json' % (prefix, group), 'w') as outfile:
json.dump(group_dict, outfile, indent=4)


def extract_definitions(in_variants, in_groups, group_column, index_column, reference_json, prefix, subset,
threshold_common, threshold_intermediate, outgroup_file):
threshold_common, threshold_intermediate, outgroup_file, include_protein):
if not in_groups:
in_groups = in_variants

Expand All @@ -239,6 +260,7 @@ def extract_definitions(in_variants, in_groups, group_column, index_column, refe
group_dict = get_group_dict(in_groups, group_column, index_column, groups_to_get)

reference_seq, feature_dict = load_feature_coordinates(reference_json)
feature_dict = update_feature_dict(feature_dict)

var_dict = {}
outgroup_var_dict = {}
Expand Down Expand Up @@ -283,9 +305,9 @@ def extract_definitions(in_variants, in_groups, group_column, index_column, refe
if group in outgroup_var_dict:
outgroup_common, outgroup_intermediate = get_common_mutations(outgroup_var_dict[group], min_occurance=1, threshold_common=threshold_common, threshold_intermediate=threshold_intermediate)
common, ancestral = subtract_outgroup(common, outgroup_common)
nice_common = define_mutations(common, feature_dict, reference_seq)
nice_intermediate = define_mutations(intermediate, feature_dict, reference_seq)
nice_ancestral = define_mutations(ancestral, feature_dict, reference_seq)
nice_common = define_mutations(common, feature_dict, reference_seq, include_protein)
nice_intermediate = define_mutations(intermediate, feature_dict, reference_seq, include_protein)
nice_ancestral = define_mutations(ancestral, feature_dict, reference_seq, include_protein)
write_constellation(prefix, group, nice_common, nice_intermediate, nice_ancestral)


Expand Down
Loading

0 comments on commit e12eb0e

Please sign in to comment.