Skip to content

Commit

Permalink
fixed seqspec index and seqspec onlist to use the RegionCoordinate class
Browse files Browse the repository at this point in the history
  • Loading branch information
sbooeshaghi committed Mar 11, 2024
1 parent c22574c commit f2ad2dd
Show file tree
Hide file tree
Showing 12 changed files with 301 additions and 152 deletions.
5 changes: 4 additions & 1 deletion docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
- `Assay` attribute `sequencer` changed to `sequence_protocol`
- `Assay` function `get_modality` changed to `get_libspec`
- `Region` function `update_attr` uses the `max_len` to generate `random` and `onlist` sequence lengths instead of `min_len`
- `get_region_by_type` changed to `get_region_by_region_type` to disambiguate between `region_type` and `sequence_type`
- `seqspec onlist` (by default) searches for onlists in the `Region`s intersected by the `Read` passed to `-r`.

### Added

Expand All @@ -25,7 +27,6 @@
- Add `sequence_spec` to the seqspec json schema
- Add `Read` object to specification document
- Add `Read` generator to website GUI
- Add prior version seqspec schema to seqspec/schema (note to self, this must be done for every release)
- Add pattern matching to `date` in `Assay` (expected date format: DAY MONTH YEAR, where day is one or two digits, month is the full month name starting with a capital letter, and year is the full year)
- Add `library_kit` to `Assay` object (kit that adds seq adapters)
- Add `library_protocol` to `Assay` object (library that generates insert)
Expand All @@ -45,6 +46,8 @@
- check that the min len is less than or equal to the max len
- check that the length of the sequence is between min and max len
- Note: `seqspec print` makes the strong assumption that each sequence has a length equal to its `max_len`, for visualization purposes
- Add `RegionCoordinate` object that maps `Region` min/max lengths to 0-indexed positions
- `seqspec onlist` searches for onlists in a `Region` based on `--region` flag

### Removed

Expand Down
8 changes: 4 additions & 4 deletions examples/seqspec-dev.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@
"from datetime import datetime\n",
"import matplotlib.dates as mdates\n",
"\n",
"from seqspec.utils import load_spec, get_cuts\n",
"from seqspec.utils import load_spec, project_regions_to_coordinates\n",
"from seqspec.seqspec_index import run_index\n",
"from seqspec.seqspec_find import run_find\n",
"import os\n",
Expand Down Expand Up @@ -331,7 +331,7 @@
{
"cell_type": "code",
"source": [
"from seqspec.utils import get_cuts\n",
"from seqspec.utils import project_regions_to_coordinates\n",
"\n",
"def complement_nucleotide(nucleotide):\n",
" complements = {\n",
Expand All @@ -353,7 +353,7 @@
" p = []\n",
" n = []\n",
" leaves = libspec.get_leaves()\n",
" cuts = get_cuts(leaves)\n",
" cuts = project_regions_to_coordinates(leaves)\n",
" for idx, read in enumerate(seqspec, 1):\n",
" read_len = read.max_len\n",
" read_id = read.read_id\n",
Expand Down Expand Up @@ -450,4 +450,4 @@
"outputs": []
}
]
}
}
4 changes: 2 additions & 2 deletions examples/seqspec_dev2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
{
"cell_type": "code",
"source": [
"def get_cuts(regions, cuts=[]):\n",
"def project_regions_to_coordinates(regions, cuts=[]):\n",
" if not cuts:\n",
" cuts = []\n",
" prev = 0\n",
Expand Down Expand Up @@ -156,7 +156,7 @@
" rgn = leaves[primer_idx + 1:]\n",
"\n",
"# get the cuts for all of the atomic elements (tuples of 0-indexed start stop)\n",
"cuts = get_cuts(rgn)\n",
"cuts = project_regions_to_coordinates(rgn)\n",
"\n",
"# associate each cut with its region type\n",
"for idx, r in enumerate(rgn):\n",
Expand Down
121 changes: 119 additions & 2 deletions seqspec/Region.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,14 @@ def get_region_by_id(self, region_id, found=[]):
found = r.get_region_by_id(region_id, found)
return found

def get_region_by_type(self, region_type, found=[]):
def get_region_by_region_type(self, region_type, found=[]):
if not found:
found = []
if self.region_type == region_type:
found.append(self)
if self.regions:
for r in self.regions:
found = r.get_region_by_type(region_type, found)
found = r.get_region_by_region_type(region_type, found)
return found

def get_onlist_regions(self, found=[]):
Expand Down Expand Up @@ -222,6 +222,123 @@ def update_region_by_id(
target_region.max_len = max_len
return

def reverse(self):
    """Reverse, in place, the sequence of every leaf region under this one.

    A region that has sub-regions delegates to each of them, visiting them
    from last to first. A region without sub-regions replaces its own
    ``sequence`` with the reversed sequence. Always returns ``None``.

    NOTE(review): only the *iteration* order over ``self.regions`` is
    reversed; the list itself keeps its order and a parent's own
    ``sequence`` is not modified. Confirm that this is the intended
    meaning of "reverse" for a nested region.
    """
    if not self.regions:
        # leaf region: flip the sequence itself
        self.sequence = self.sequence[::-1]
        return

    # nested region: walk the children back to front and recurse
    for sub_region in reversed(self.regions):
        sub_region.reverse()
    return

def complement(self):
    """Complement, in place, the sequence of every leaf region under this one.

    A region that has sub-regions delegates to each of them in order. A
    region without sub-regions replaces its own ``sequence`` with the result
    of ``complement_sequence``. Always returns ``None``.
    """
    if not self.regions:
        # leaf region: complement the sequence itself
        self.sequence = complement_sequence(self.sequence)
        return

    for sub_region in self.regions:
        sub_region.complement()


# Upper-case nucleotide code -> complementary code (A/C/G/T plus ambiguity
# codes such as R/Y, K/M, B/V, D/H). Built once at import time instead of on
# every call, because complement_nucleotide runs once per base of a sequence.
_NUCLEOTIDE_COMPLEMENTS = {
    "A": "T",
    "T": "A",
    "G": "C",
    "C": "G",
    "R": "Y",
    "Y": "R",
    "S": "S",
    "W": "W",
    "K": "M",
    "M": "K",
    "B": "V",
    "D": "H",
    "V": "B",
    "H": "D",
    "N": "N",
    "X": "X",
}


def complement_nucleotide(nucleotide):
    """Return the complement of a single upper-case nucleotide code.

    Args:
        nucleotide: A one-character, upper-case nucleotide code.

    Returns:
        The complementary code, or ``"N"`` when ``nucleotide`` is not a key
        of the lookup table (this includes lower-case letters, the empty
        string and multi-character strings).
    """
    # Default to 'N' if nucleotide is not recognized
    return _NUCLEOTIDE_COMPLEMENTS.get(nucleotide, "N")


def complement_sequence(sequence):
    """Return the base-by-base complement of ``sequence``.

    The input is upper-cased first, then every character is mapped through
    ``complement_nucleotide`` and the results are joined back into one
    string of the same length. The order of the bases is not reversed.
    """
    upper_cased = sequence.upper()
    complemented = [complement_nucleotide(base) for base in upper_cased]
    return "".join(complemented)


class RegionCoordinate(Region):
    """A ``Region`` annotated with ``start`` and ``stop`` positions.

    The descriptive fields of the wrapped ``region`` are forwarded to
    ``Region.__init__`` and two coordinates are stored alongside them.
    The helpers in this module build the coordinates starting from 0 and
    treat ``stop`` as exclusive.

    Equality compares coordinates only (``start`` and ``stop``), not the
    region's identity or contents. Because ``__eq__`` is defined without
    ``__hash__``, instances are unhashable.

    NOTE(review): ``region.onlist`` and ``region.regions`` are passed through
    as-is, so they are presumably shared with the wrapped region rather than
    copied; confirm against ``Region.__init__``.
    """

    def __init__(
        self,
        region: Region,
        start: int = 0,
        stop: int = 0,
    ):
        """Copy the fields of ``region`` and attach ``start``/``stop``.

        Args:
            region: The region whose attributes are forwarded to the base class.
            start: Start position of the region.
            stop: Stop position of the region.
        """
        super().__init__(
            region.region_id,
            region.region_type,
            region.name,
            region.sequence_type,
            region.sequence,
            region.min_len,
            region.max_len,
            region.onlist,
            region.regions,
        )
        self.start = start
        self.stop = stop

    def __repr__(self):
        return f"RegionCoordinate {self.name} [{self.region_type}]: ({self.start}, {self.stop})"

    def __str__(self):
        # Same text as __repr__; delegate so the format lives in one place.
        return repr(self)

    def __eq__(self, other):
        """Return True when ``other`` has the same ``start`` and ``stop``.

        Objects that do not expose both attributes (``None``, tuples, ...)
        yield ``NotImplemented`` so that ``==`` evaluates to ``False`` and
        ``!=`` to ``True`` instead of raising ``AttributeError``.
        """
        if not (hasattr(other, "start") and hasattr(other, "stop")):
            return NotImplemented
        return self.start == other.start and self.stop == other.stop


def project_regions_to_coordinates(
    regions: List[Region], rcs: List[RegionCoordinate] = []
) -> List[RegionCoordinate]:
    """Lay ``regions`` end to end and return one coordinate per region.

    The first region starts at position 0. Each region occupies ``max_len``
    positions, and the next region starts where the previous one stops.

    Args:
        regions: Regions to place, in order.
        rcs: Optional list to append to. A non-empty list is extended in
            place and returned; an empty or omitted list is replaced by a
            new one. Positions restart at 0 regardless of what ``rcs``
            already contains.

    Returns:
        The list of ``RegionCoordinate`` objects.
    """
    # An empty (or omitted) ``rcs`` is swapped for a fresh list, so the shared
    # default list is never appended to.
    coordinates = rcs if rcs else []

    offset = 0
    for region in regions:
        end = offset + region.max_len
        coordinates.append(RegionCoordinate(region, offset, end))
        offset = end
    return coordinates


def itx_read(
    region_coordinates: List[RegionCoordinate], read_start: int, read_stop: int
):
    """Return the region coordinates that intersect a read, clipped to it.

    A coordinate is skipped when the read starts at or after its ``stop`` or
    ends at or before its ``start``. Every remaining coordinate is returned
    with ``start``/``stop`` narrowed to the part covered by the read.

    The input objects are not modified: each hit is shallow-copied before it
    is clipped, so the same ``region_coordinates`` list can be intersected
    with several reads in turn.

    Args:
        region_coordinates: Coordinates to test, each exposing ``start`` and
            ``stop``.
        read_start: Start position of the read.
        read_stop: Stop position of the read.

    Returns:
        A new list with one clipped copy per intersecting coordinate, in
        input order.
    """
    # Function-scope import keeps this block self-contained.
    import copy

    clipped = []
    for rc in region_coordinates:
        # read lies entirely after or entirely before this region: no overlap
        if read_start >= rc.stop or read_stop <= rc.start:
            continue

        # Clip a copy; mutating ``rc`` itself would corrupt the caller's
        # coordinates for any later intersection.
        piece = copy.copy(rc)
        piece.start = max(rc.start, read_start)
        piece.stop = min(rc.stop, read_stop)
        clipped.append(piece)

    return clipped


class Onlist(yaml.YAMLObject):
yaml_tag = "!Onlist"
Expand Down
6 changes: 4 additions & 2 deletions seqspec/seqspec_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ def run_check(schema, spec, spec_fn):
# get all of the regions with type fastq in the spec and check that those files exist relative to the path of the spec
fqrgns = []
for m in modes:
fqrgns += [i for i in spec.get_libspec(m).get_region_by_type("fastq")]
fqrgns += [i for i in spec.get_libspec(m).get_region_by_type("fastq_link")]
fqrgns += [i for i in spec.get_libspec(m).get_region_by_region_type("fastq")]
fqrgns += [
i for i in spec.get_libspec(m).get_region_by_region_type("fastq_link")
]
for fqrgn in fqrgns:
if fqrgn.region_type == "fastq":
check = path.join(path.dirname(spec_fn), fqrgn.region_id)
Expand Down
2 changes: 1 addition & 1 deletion seqspec/seqspec_find.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,5 @@ def run_find(spec: Assay, modality: str, region_id: str):

def run_find_by_type(spec: Assay, modality: str, region_type: str):
m = spec.get_libspec(modality)
regions = m.get_region_by_type(region_type)
regions = m.get_region_by_region_type(region_type)
return regions
Loading

0 comments on commit f2ad2dd

Please sign in to comment.