diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index e486f47..c07a948 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -17,6 +17,8 @@ - `Assay` attribute `sequencer` changed to `sequence_protocol` - `Assay` function `get_modality` changed to `get_libspec` - `Region` function `update_attr` uses the `max_len` to generate `random` and `onlist` sequence lengths instead of `min_len` +- `get_region_by_type` changed to `get_region_by_region_type` to disambiguate between `region_type` and `sequence_type` +- `seqspec onlist` (by default) searches for onlists in the `Region`s intersected by the `Read` passed to `-r`. ### Added @@ -25,7 +27,6 @@ - Add `sequence_spec` to the seqspec json schema - Add `Read` object to specification document - Add `Read` generator to website GUI -- Add prior version seqspec schema to seqspec/schema (note to self, this must be done for every release) - Add pattern matching to `date` in `Assay` (expected date format: DAY MONTH YEAR, where day is one or two numbers, month is the full named month starting with a Capital letter and year is the full year) - Add `library_kit` to `Assay` object (kit that adds seq adapters) - Add `library_protocol` to `Assay` object (library that generates insert) @@ -45,6 +46,8 @@ - check that the min len is less than or equal to the max len - check that the length of the sequence is between min and max len - Note a strong assumption in `seqspec print` is that the sequence have a length equal to the `max_len` for visualization purposes +- Add `RegionCoordinate` object that maps `Region` min/max lengths to 0-indexed positions +- `seqspec onlist` searches for onlists in a `Region` based on `--region` flag ### Removed diff --git a/examples/seqspec-dev.ipynb b/examples/seqspec-dev.ipynb index 8ce92ef..a7bbc8e 100644 --- a/examples/seqspec-dev.ipynb +++ b/examples/seqspec-dev.ipynb @@ -294,7 +294,7 @@ "from datetime import datetime\n", "import matplotlib.dates as mdates\n", "\n", - "from seqspec.utils import 
load_spec, get_cuts\n", + "from seqspec.utils import load_spec, project_regions_to_coordinates\n", "from seqspec.seqspec_index import run_index\n", "from seqspec.seqspec_find import run_find\n", "import os\n", @@ -331,7 +331,7 @@ { "cell_type": "code", "source": [ - "from seqspec.utils import get_cuts\n", + "from seqspec.utils import project_regions_to_coordinates\n", "\n", "def complement_nucleotide(nucleotide):\n", " complements = {\n", @@ -353,7 +353,7 @@ " p = []\n", " n = []\n", " leaves = libspec.get_leaves()\n", - " cuts = get_cuts(leaves)\n", + " cuts = project_regions_to_coordinates(leaves)\n", " for idx, read in enumerate(seqspec, 1):\n", " read_len = read.max_len\n", " read_id = read.read_id\n", @@ -450,4 +450,4 @@ "outputs": [] } ] -} \ No newline at end of file +} diff --git a/examples/seqspec_dev2.ipynb b/examples/seqspec_dev2.ipynb index b6ee20a..f699213 100644 --- a/examples/seqspec_dev2.ipynb +++ b/examples/seqspec_dev2.ipynb @@ -97,7 +97,7 @@ { "cell_type": "code", "source": [ - "def get_cuts(regions, cuts=[]):\n", + "def project_regions_to_coordinates(regions, cuts=[]):\n", " if not cuts:\n", " cuts = []\n", " prev = 0\n", @@ -156,7 +156,7 @@ " rgn = leaves[primer_idx + 1:]\n", "\n", "# get the cuts for all of the atomic elements (tuples of 0-indexed start stop)\n", - "cuts = get_cuts(rgn)\n", + "cuts = project_regions_to_coordinates(rgn)\n", "\n", "# associate each cut with its region type\n", "for idx, r in enumerate(rgn):\n", diff --git a/seqspec/Region.py b/seqspec/Region.py index 060c9fb..db736f2 100644 --- a/seqspec/Region.py +++ b/seqspec/Region.py @@ -129,14 +129,14 @@ def get_region_by_id(self, region_id, found=[]): found = r.get_region_by_id(region_id, found) return found - def get_region_by_type(self, region_type, found=[]): + def get_region_by_region_type(self, region_type, found=[]): if not found: found = [] if self.region_type == region_type: found.append(self) if self.regions: for r in self.regions: - found = 
class RegionCoordinate(Region):
    """A ``Region`` annotated with a 0-indexed ``(start, stop)`` span.

    Every descriptive field is copied from ``region``; ``start`` and ``stop``
    record where the region sits in the list it was projected from
    (``stop - start`` is the region's length).
    """

    def __init__(
        self,
        region: Region,
        start: int = 0,
        stop: int = 0,
    ):
        super().__init__(
            region.region_id,
            region.region_type,
            region.name,
            region.sequence_type,
            region.sequence,
            region.min_len,
            region.max_len,
            region.onlist,
            region.regions,
        )
        self.start = start
        self.stop = stop

    def __repr__(self):
        return f"RegionCoordinate {self.name} [{self.region_type}]: ({self.start}, {self.stop})"

    def __str__(self):
        # Same text as __repr__; delegate so the format lives in one place.
        return self.__repr__()

    def __eq__(self, other):
        # Only coordinates are comparable. Returning NotImplemented lets
        # Python fall back to its default (False) for tuples, None, etc.
        # instead of raising AttributeError on a missing ``.start``.
        if not isinstance(other, RegionCoordinate):
            return NotImplemented
        return self.start == other.start and self.stop == other.stop

    # Equality depends on mutable fields, so instances stay unhashable.
    __hash__ = None


def project_regions_to_coordinates(
    regions: List[Region], rcs: List[RegionCoordinate] = None
) -> List[RegionCoordinate]:
    """Lay ``regions`` end to end and return their coordinates.

    The first region starts at 0 and each region spans ``max_len`` positions,
    so one region's ``stop`` is the next region's ``start``.

    Args:
        regions: regions to project, in order.
        rcs: optional list to append to; a new list is used when it is
            omitted or empty.

    Returns:
        The list of ``RegionCoordinate`` objects, one per input region
        (appended after any entries already present in ``rcs``).
    """
    if not rcs:
        rcs = []
    prev = 0
    for r in regions:
        nxt = prev + r.max_len
        rcs.append(RegionCoordinate(r, prev, nxt))
        prev = nxt
    return rcs
def itx_read(
    region_coordinates: List["RegionCoordinate"], read_start: int, read_stop: int
) -> List["RegionCoordinate"]:
    """Return the coordinates that overlap the read, clipped to the read.

    A coordinate is kept when the span ``read_start``..``read_stop`` overlaps
    it by at least one position; its ``start``/``stop`` are then trimmed so
    they do not extend past the read.

    The input objects are never modified: each kept coordinate is a shallow
    copy, so the same ``region_coordinates`` list can be intersected with
    several reads.

    Args:
        region_coordinates: objects exposing ``start`` and ``stop``.
        read_start: first position covered by the read.
        read_stop: position just past the end of the read.

    Returns:
        A new list of clipped copies, in input order (empty if nothing
        overlaps).
    """
    import copy  # local import: only this function needs it

    new_rcs = []
    for rc in region_coordinates:
        # Read lies entirely after or entirely before this region: skip.
        if read_start >= rc.stop or read_stop <= rc.start:
            continue

        clipped = copy.copy(rc)
        clipped.start = max(rc.start, read_start)
        clipped.stop = min(rc.stop, read_stop)
        new_rcs.append(clipped)

    return new_rcs
a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py index 615dd81..5aded7c 100644 --- a/seqspec/seqspec_index.py +++ b/seqspec/seqspec_index.py @@ -1,9 +1,10 @@ -from seqspec.utils import load_spec, get_cuts +from seqspec.utils import load_spec, map_read_id_to_regions from seqspec.seqspec_find import run_find from collections import defaultdict from typing import Dict, List, Tuple from argparse import SUPPRESS import os +from seqspec.Region import RegionCoordinate, project_regions_to_coordinates, itx_read def setup_index_args(parser): @@ -43,6 +44,12 @@ def setup_index_args(parser): "--rev", help="Returns 3'->5' region order", action="store_true" ) + # boolean to indicate specifying a region or a read + subparser.add_argument( + "--region", + help="Specify a region", + action="store_true", + ) subparser_required.add_argument( "-m", metavar="MODALITY", @@ -60,13 +67,6 @@ def setup_index_args(parser): required=True, ) - # boolean to indicate specifying a region or a read - subparser.add_argument( - "--region", - help="Specify a region", - action="store_true", - ) - return subparser @@ -118,11 +118,12 @@ def run_index( if region: index = get_index(spec, modality, r, rev=rev) else: - index = get_index_by_primer(spec, modality, r, rev=rev) - indices.append({r: index}) + index = get_index_by_primer(spec, modality, r) + indices.append(index) return FORMAT[fmt](indices, subregion_type) +# TODO: modify to use RegionCoordinate object def get_index_by_type( spec, modality, region_id, rev=False ) -> Dict[str, List[Tuple[int, int]]]: @@ -133,7 +134,11 @@ def get_index_by_type( leaves = regions[0].get_leaves() if rev: leaves.reverse() - cuts = get_cuts(leaves) + cuts = project_regions_to_coordinates(leaves) + + # index is a legacy data structure, todo fix + for c in cuts: + index[c.start, c.stop] = c.region_type # groupby requested region for idx, l in enumerate(leaves): @@ -144,72 +149,31 @@ def get_index_by_type( return index -def get_index(spec, modality, region_id, 
def get_index(
    spec, modality, region_id, rev=False
) -> Dict[str, List[RegionCoordinate]]:
    """Index the atomic sub-regions of ``region_id``.

    Looks the region up with ``run_find``, takes the leaves of the first
    match (reversed in place when ``rev`` is true) and projects them to
    coordinates.

    Returns:
        A one-entry dict mapping ``region_id`` to its list of coordinates.
    """
    matches = run_find(spec, modality, region_id)
    atomic = matches[0].get_leaves()
    if rev:
        atomic.reverse()
    return {region_id: project_regions_to_coordinates(atomic)}


def get_index_by_primer(
    spec, modality: str, read_id: str
) -> Dict[str, List[RegionCoordinate]]:  # noqa
    """Index the regions covered by the read ``read_id``.

    ``map_read_id_to_regions`` supplies the read and the ordered regions it
    sequences (strand handling happens there); those regions are projected
    to coordinates and intersected with the span ``0``..``read.max_len``.

    Returns:
        A one-entry dict mapping ``read_id`` to the intersected coordinates.
    """
    read, sequenced = map_read_id_to_regions(spec, modality, read_id)
    coordinates = project_regions_to_coordinates(sequenced)
    return {read_id: itx_read(coordinates, 0, read.max_len)}
def format_starsolo(indices, subregion_type=None):
    """Build the STARsolo ``CB_UMI_Simple`` geometry flags.

    Args:
        indices: list of ``{name: [cut, ...]}`` dicts, where every cut
            exposes ``region_type``, ``start`` and ``stop``.
        subregion_type: unused; accepted so all formatters share a signature.

    Returns:
        ``--soloType CB_UMI_Simple`` followed by the start/length flags of
        the first barcode and the first UMI found.

    Raises:
        IndexError: if no barcode or no UMI cut is present.
    """
    bcs = []
    umi = []
    for region in indices:
        for cuts in region.values():
            for cut in cuts:
                kind = cut.region_type.upper()
                # STARsolo takes a 1-based start and a *length*, so the
                # length is stop - start, not the stop coordinate itself.
                length = cut.stop - cut.start
                if kind == "BARCODE":
                    bcs.append(f"--soloCBstart {cut.start + 1} --soloCBlen {length}")
                elif kind == "UMI":
                    umi.append(f"--soloUMIstart {cut.start + 1} --soloUMIlen {length}")
    return f"--soloType CB_UMI_Simple {bcs[0]} {umi[0]}"
format_zumis(indices, subregion_type=None): xl = [] for idx, region in enumerate(indices): x = "" - for rgn, index in region.items(): - for k, v in index.items(): - if v.upper() == "BARCODE": - x += f"- BCS({k[0] + 1}-{k[1]})\n" - elif v.upper() == "UMI": - x += f"- UMI({k[0] + 1}-{k[1]})\n" - elif v.upper() == "CDNA": - x += f"- cDNA({k[0] + 1}-{k[1]})\n" + for rgn, cuts in region.items(): + for cut in cuts: + if cut.region_type.upper() == "BARCODE": + x += f"- BCS({cut.start + 1}-{cut.stop})\n" + elif cut.region_type.upper() == "UMI": + x += f"- UMI({cut.start + 1}-{cut.stop})\n" + elif cut.region_type.upper() == "CDNA": + x += f"- cDNA({cut.start + 1}-{cut.stop})\n" xl.append(x) return "\n".join(xl)[:-1] @@ -324,19 +291,19 @@ def format_chromap(indices, subregion_type=None): gdna_fqs = [] gdna_str = [] for idx, region in enumerate(indices): - for rgn, index in region.items(): - for k, v in index.items(): - if v.upper() == "BARCODE": + for rgn, cuts in region.items(): + for cut in cuts: + if cut.region_type.upper() == "BARCODE": bc_fqs.append(rgn) - bc_str.append(f"bc:{k[0]}:{k[1]}") + bc_str.append(f"bc:{cut.start}:{cut.stop}") pass - elif v.upper() == "GDNA": + elif cut.region_type.upper() == "GDNA": gdna_fqs.append(rgn) - gdna_str.append(f"{k[0]}:{k[1]}") + gdna_str.append(f"{cut.start}:{cut.stop}") if len(set(bc_fqs)) > 1: - raise "chromap only supports barcodes from one fastq" + raise Exception("chromap only supports barcodes from one fastq") if len(set(gdna_fqs)) > 2: - raise "chromap only supports genomic dna from two fastqs" + raise Exception("chromap only supports genomic dna from two fastqs") barcode_fq = bc_fqs[0] read1_fq = list(set(gdna_fqs))[0] diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index 5a15a17..46cac2f 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -1,5 +1,6 @@ from seqspec.Assay import Assay -from seqspec.utils import load_spec +from seqspec.Region import project_regions_to_coordinates, 
def run_onlist_read(spec: Assay, modality: str, read_id: str):
    """Return the joined onlist for the regions covered by ``read_id``.

    The regions sequenced by the read are projected to coordinates and
    intersected with the span ``0``..``read.max_len``; the onlist filename of
    every surviving region that has an onlist is collected and handed to
    ``join_onlists``.

    Raises:
        ValueError: if none of the covered regions has an onlist.
    """
    read, sequenced = map_read_id_to_regions(spec, modality, read_id)
    covered = itx_read(project_regions_to_coordinates(sequenced), 0, read.max_len)

    filenames = []
    for rc in covered:
        onlist = rc.get_onlist()
        if onlist:
            filenames.append(onlist.filename)

    if not filenames:
        raise ValueError(f"No onlist found for read {read_id}")
    return join_onlists(filenames)
def map_read_id_to_regions(spec, modality, region_id):
    """Return a read and the atomic regions it sequences, in read order.

    Despite its name, ``region_id`` is matched against ``read_id`` of the
    entries in ``spec.sequence_spec``. The read's primer is located among the
    leaves of the modality's library spec (the primer is assumed to be an
    atomic element listed 5'->3').

    Args:
        spec: object exposing ``get_libspec(modality)`` and ``sequence_spec``.
        modality: modality whose library spec is searched.
        region_id: the ``read_id`` to look up.

    Returns:
        ``(read, regions)``: for a ``"neg"`` strand read, the leaves before
        the primer in reverse order; otherwise the leaves after the primer.

    Raises:
        IndexError: if no read has that id, or its primer is not a leaf.
    """
    leaves = spec.get_libspec(modality).get_leaves()

    read = next((r for r in spec.sequence_spec if r.read_id == region_id), None)
    if read is None:
        raise IndexError(f"No read with read_id {region_id!r} in sequence_spec")

    primer_id = read.primer_id
    primer_idx = next(
        (i for i, leaf in enumerate(leaves) if leaf.region_id == primer_id), None
    )
    if primer_idx is None:
        raise IndexError(
            f"Primer {primer_id!r} of read {region_id!r} is not an atomic "
            f"region of modality {modality!r}"
        )

    # A negative-strand read walks away from the primer in the other direction.
    if read.strand == "neg":
        rgns = leaves[:primer_idx][::-1]
    else:
        rgns = leaves[primer_idx + 1 :]

    return (read, rgns)
r_expected.get_region_by_type(r_umi_dict["region_type"]), [r_umi] + r_expected.get_region_by_region_type(r_umi_dict["region_type"]), [r_umi] ) self.assertEqual( r_expected.get_region_by_id(r_linker_dict["region_id"]), [r_linker] ) self.assertEqual( - r_expected.get_region_by_type(r_linker_dict["region_type"]), [r_linker] + r_expected.get_region_by_region_type(r_linker_dict["region_type"]), + [r_linker], ) self.assertEqual(r_expected.get_leaves(), [r_umi, r_linker]) self.assertEqual( @@ -127,7 +128,7 @@ def test_retrieving_subregions_with_same_types(self): r1_dict = region_rna_joined_dict("region-1", [r2, r3, r4]) r1 = Region(**r1_dict) - self.assertEqual(r1.get_region_by_type(r2_dict["region_type"]), [r2, r4]) + self.assertEqual(r1.get_region_by_region_type(r2_dict["region_type"]), [r2, r4]) def test_set_parent_id(self): r2_dict = region_rna_umi_dict("region-2") diff --git a/tests/test_utils.py b/tests/test_utils.py index 0f8ac2f..0bebd80 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,7 +3,7 @@ from unittest import TestCase from seqspec.Region import Region -from seqspec.utils import load_spec_stream, get_cuts, write_read +from seqspec.utils import load_spec_stream, project_regions_to_coordinates, write_read from .test_region import ( region_rna_joined_dict, @@ -99,7 +99,7 @@ def test_load_spec_stream(self): head = spec.get_libspec("RNA") self.assertEqual(len(head.regions), 5) - def test_get_cuts(self): + def test_project_regions_to_coordinates(self): r_umi_dict = region_rna_umi_dict("region-2") r_umi = Region(**r_umi_dict) r_linker_dict = region_rna_linker_dict("region-3") @@ -112,7 +112,7 @@ def test_get_cuts(self): r_linker_min, r_linker_max = r_linker.get_len() r_linker_min += r_umi_max r_linker_max += r_linker_max - cuts = get_cuts(r_expected.regions) + cuts = project_regions_to_coordinates(r_expected.regions) self.assertEqual(cuts, [(r_umi_min, r_umi_max), (r_linker_min, r_linker_max)]) def test_write_header(self):