Skip to content

Commit

Permalink
added multiple checks to seqspec check, added libseq format for seqsp…
Browse files Browse the repository at this point in the history
…ec print
  • Loading branch information
sbooeshaghi committed Mar 6, 2024
1 parent 1284270 commit 0371352
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 10 deletions.
13 changes: 13 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,19 @@
- Add `sequence_kit` to `Assay` object
- Add website to view example `seqspec` objects
- Add `get_seqspec` to assay returns sequence structure for a given modality
- Add multiple checks to `seqspec check`
- check read modalities exist in assay modalities
- check primer ids from seqspec are unique and exist as region ids in libspec
- check that the primer id exists as an atomic region (currently a strong assumption that may be relaxed in the future)
- check properties of multiple sequence types
- `fixed` sequence type is incompatible with non-null `regions`
- `joined` sequence type is incompatible with null `regions`
- `random` sequence type is incompatible with non-null `regions`
- `random` must have `sequence` of all X's
- `onlist` sequence type is incompatible with a null `onlist` property
- check that the min len is less than or equal to the max len
- check that the length of the sequence is between min and max len
- Note: a strong assumption in `seqspec print` is that the sequence has a length equal to the `max_len` for visualization purposes

### Removed

Expand Down
84 changes: 74 additions & 10 deletions seqspec/seqspec_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,16 +193,80 @@ def run_check(schema, spec, spec_fn):
# leaves = mode.get_leaves()
# idx = [i.region_id for i in leaves].index(read.primer_id)

# check that sequence length is the same as min_length
# if a region has a sequence type "fixed" then it should not contain subregions
# if a region has a sequence type "joined" then it should contain subregions
# if a region has a sequence type "random" then it should not contain subregions and should be all X's
# if a region has a sequence type "onlist" then it should have an onlist object
def seqtype_check(rgn, errors, idx):
    """Recursively check that each region's sequence_type matches its contents.

    The rules are applied to ``rgn`` first and then, depth-first, to every
    nested subregion:

    - ``fixed``: must not contain subregions.
    - ``joined``: must contain subregions.
    - ``random``: must not contain subregions, and its sequence must be
      non-empty and made up solely of ``X`` characters.
    - ``onlist``: must have an onlist object.

    Args:
        rgn: Region-like object exposing ``region_id``, ``sequence_type``,
            ``sequence``, ``regions`` and ``onlist``.
        errors: List of error messages; new messages are appended in place.
        idx: Number to use for the next reported error.

    Returns:
        Tuple ``(errors, idx)``: the same list that was passed in and the
        number to use for the next error.
    """
    seq_type = rgn.sequence_type
    has_subregions = bool(rgn.regions)

    problems = []
    if seq_type == "fixed" and has_subregions:
        problems.append("contains subregions")
    elif seq_type == "joined" and not has_subregions:
        problems.append("does not contain subregions")
    elif seq_type == "random":
        if has_subregions:
            problems.append("contains subregions")
        # Only the content is tested here. Comparing against "X" * max_len
        # also rejected all-X sequences whose length is validly below
        # max_len (reporting them as "not all X's") and raised TypeError
        # when max_len was unset; length bounds are reported by
        # seq_len_check instead.
        sequence = rgn.sequence or ""
        if not sequence or sequence.strip("X"):
            problems.append("sequence is not all X's")
    elif seq_type == "onlist" and not rgn.onlist:
        problems.append("does not have an onlist object")

    for problem in problems:
        errors.append(
            f"[error {idx}] '{rgn.region_id}' sequence_type is '{seq_type}' and {problem}"
        )
        idx += 1

    # Descend into subregions (if any) after the region itself is checked.
    for child in rgn.regions or ():
        errors, idx = seqtype_check(child, errors, idx)
    return (errors, idx)

for m in modes:
    # Apply the sequence_type rules to the whole region tree of this modality,
    # starting from the root region returned by get_libspec.
    #
    # No leaf-only sequence-length pass is done here: seq_len_check walks every
    # region (leaves included) with the same condition and message, so running
    # both reported each leaf length violation twice.
    errors, idx = seqtype_check(spec.get_libspec(m), errors, idx)

# check that every region's max_len is not less than its min_len, using a recursive function
def len_check(rgn, errors, idx):
    """Recursively report regions whose max_len is smaller than their min_len.

    Subregions are visited before the region itself, so messages for nested
    regions come before the message for their parent.

    Args:
        rgn: Region-like object exposing ``region_id``, ``min_len``,
            ``max_len`` and ``regions``.
        errors: List of error messages; new messages are appended in place.
        idx: Number to use for the next reported error.

    Returns:
        Tuple ``(errors, idx)``: the same list that was passed in and the
        number to use for the next error.
    """
    for child in rgn.regions or ():
        errors, idx = len_check(child, errors, idx)

    bounds_inverted = rgn.max_len < rgn.min_len
    if not bounds_inverted:
        return (errors, idx)

    errors.append(f"[error {idx}] '{rgn.region_id}' max_len is less than min_len")
    return (errors, idx + 1)

for modality in modes:
    # The root region of each modality is the single entry point for the
    # recursive min_len/max_len check.
    errors, idx = len_check(spec.get_libspec(modality), errors, idx)

# check that the length of each region's sequence lies between min_len and max_len, using a recursive function
# an assumption in the code and spec is that the displayed sequence is equal to the max_len
def seq_len_check(rgn, errors, idx):
    """Recursively report regions whose sequence length is outside its bounds.

    Subregions are visited before the region itself. A region with an empty
    or missing sequence is not checked.

    Args:
        rgn: Region-like object exposing ``region_id``, ``sequence``,
            ``min_len``, ``max_len`` and ``regions``.
        errors: List of error messages; new messages are appended in place.
        idx: Number to use for the next reported error.

    Returns:
        Tuple ``(errors, idx)``: the same list that was passed in and the
        number to use for the next error.
    """
    for child in rgn.regions or ():
        errors, idx = seq_len_check(child, errors, idx)

    if not rgn.sequence:
        return (errors, idx)

    seq_len = len(rgn.sequence)
    if rgn.min_len <= seq_len <= rgn.max_len:
        return (errors, idx)

    errors.append(
        f"[error {idx}] '{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})"
    )
    return (errors, idx + 1)

for modality in modes:
    # The root region of each modality is the single entry point for the
    # recursive sequence-length check.
    errors, idx = seq_len_check(spec.get_libspec(modality), errors, idx)

return errors
2 changes: 2 additions & 0 deletions seqspec/seqspec_print.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ def libseq(spec, modality):

s = "\n".join(
[
modality,
"---",
"\n".join(p),
libspec.sequence,
complement_sequence(libspec.sequence),
Expand Down

0 comments on commit 0371352

Please sign in to comment.