Skip to content

Commit

Permalink
added ability to check onlist files (under updated schema) and modifi…
Browse files Browse the repository at this point in the history
…ed the seqspec load to add properties to onlist regions if the version is older than 0.3.0. Updated 10xv1/2/3 templates to the most recent version and added their onlists
  • Loading branch information
sbooeshaghi committed Aug 17, 2024
1 parent 3438ffb commit aeac13f
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 13 deletions.
137 changes: 137 additions & 0 deletions examples/specs/template/10xv1-template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
!Assay
seqspec_version: 0.3.0
assay_id: 10xv2
name: 10xv2
doi: https://doi.org/10.1126/science.aam8999
date: 15 March 2018
description: 10x Genomics v2 single-cell rnaseq
modalities:
- rna
lib_struct: https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium3.html
sequence_protocol: Not-specified
sequence_kit: Not-specified
library_protocol: 10xv2 RNA
library_kit: Not-specified
sequence_spec:
- !Read
read_id: R1.fastq.gz
name: Read 1
modality: rna
primer_id: custom_primer1
min_len: 98
max_len: 98
strand: pos
files:
- !File
filename: R1.fastq.gz
filetype: fastq
filesize: 0
url: "./fastq/R1.fastq.gz"
urltype: local
md5: 0
- !Read
read_id: I1.fastq.gz
name: Read 2
modality: rna
primer_id: custom_primer2
min_len: 14
max_len: 14
strand: pos
files:
- !File
filename: I1.fastq.gz
filetype: fastq
filesize: 0
url: "./fastq/I1.fastq.gz"
urltype: local
md5: 0
- !Read
read_id: R2.fastq.gz
name: Read 2
modality: rna
primer_id: custom_primer2
min_len: 10
max_len: 10
strand: neg
files:
- !File
filename: R2.fastq.gz
filetype: fastq
filesize: 0
url: "./fastq/R2.fastq.gz"
urltype: local
md5: 0
library_spec:
- !Region
parent_id: null
region_id: rna
region_type: rna
name: rna
sequence_type: joined
sequence: AAAAAAAAAAAAAAAANNNNNNNNNNNNNNNNNNNNNNNNNNXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXAAAAAAAAAAAAAAAA
min_len: 59
max_len: 208
onlist: null
regions:
- !Region
parent_id: rna
region_id: custom_primer1
region_type: custom_primer
name: custom_primer1
sequence_type: fixed
sequence: ''
min_len: 0
max_len: 0
onlist: null
regions: null
- !Region
parent_id: rna
region_id: cdna
region_type: cdna
name: cdna
sequence_type: random
sequence: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
min_len: 1
max_len: 150
onlist: null
regions: null
- !Region
parent_id: rna
region_id: umi
region_type: umi
name: umi
sequence_type: random
sequence: XXXXXXXXXX
min_len: 10
max_len: 10
onlist: null
regions: null
- !Region
parent_id: rna
region_id: custom_primer2
region_type: custom_primer
name: custom_primer2
sequence_type: fixed
sequence: ''
min_len: 0
max_len: 0
onlist: null
regions: null
- !Region
parent_id: rna
region_id: barcode
region_type: barcode
name: barcode
sequence_type: onlist
sequence: NNNNNNNNNNNNNNNN
min_len: 16
max_len: 16
onlist: !Onlist
location: remote
filename: 737K-april-2014.txt.gz
filetype: txt
filesize: 11059200
url: https://github.com/pachterlab/qcbc/raw/main/tests/10xRNAv1/737K-april-2014.txt.gz
urltype: http
md5: 9911f5f3fbab451d79b6b38068a001f0
regions: null
16 changes: 8 additions & 8 deletions seqspec/Region.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,10 +368,10 @@ def __init__(
def __repr__(self) -> str:
d = {
"filename": self.filename,
# "filetype": self.filetype,
# "filesize": self.filesize,
# "url": self.url,
# "urltype": self.urltype,
"filetype": self.filetype,
"filesize": self.filesize,
"url": self.url,
"urltype": self.urltype,
"md5": self.md5,
"location": self.location,
}
Expand All @@ -380,10 +380,10 @@ def __repr__(self) -> str:
def to_dict(self):
d = {
"filename": self.filename,
# "filetype": self.filetype,
# "filesize": self.filesize,
# "url": self.url,
# "urltype": self.urltype,
"filetype": self.filetype,
"filesize": self.filesize,
"url": self.url,
"urltype": self.urltype,
"md5": self.md5,
"location": self.location,
}
Expand Down
11 changes: 8 additions & 3 deletions seqspec/seqspec_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,14 @@ def run_check(schema, spec, spec_fn):
idx += 1
elif ol.location == "remote":
# ping the link with a simple http request to check if the file exists at that URI
if not file_exists(ol.filename):
errors.append(f"[error {idx}] {ol.filename} does not exist")
idx += 1
if spec.seqspec_version == "0.3.0":
if not file_exists(ol.url):
errors.append(f"[error {idx}] {ol.filename} does not exist")
idx += 1
else:
if not file_exists(ol.filename):
errors.append(f"[error {idx}] {ol.filename} does not exist")
idx += 1

# get all of the regions with type fastq in the spec and check that those files exist relative to the path of the spec
fqrgns = []
Expand Down
21 changes: 19 additions & 2 deletions seqspec/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,27 @@ def load_spec_stream(spec_stream: io.IOBase):
r.set_parent_id(None)

# for backwards compatibility, for specs < v0.3.0 set the files to empty
for r in data.sequence_spec:
if version.parse(data.seqspec_version) < version.parse("0.3.0"):
# TODO for backwards compatibility of the specs < v0.3.0, set the onlist regions with missing properties
if version.parse(data.seqspec_version) < version.parse("0.3.0"):
for r in data.sequence_spec:
r.set_files([])

for r in data.library_spec:
for lf in r.get_leaves():
if lf.onlist is not None:
filename = lf.onlist.filename
location = lf.onlist.location
md5 = lf.onlist.md5
lf.onlist = Onlist(
filename,
filetype="",
filesize=0,
url="",
urltype="",
md5=md5,
location=location,
)

return data


Expand Down

0 comments on commit aeac13f

Please sign in to comment.