From aeac13f8192b7357d029c05fddfbc3c23de127d0 Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Sat, 17 Aug 2024 12:24:00 -0700 Subject: [PATCH] added ability to check onlist files (under updated schema) and modified the seqspec load to add properties to onlist regions if the version is older than 0.3.0. Updated 10xv1/2/3 templates to the most recent version and added their onlists --- examples/specs/template/10xv1-template.yaml | 137 ++++++++++++++++++++ seqspec/Region.py | 16 +-- seqspec/seqspec_check.py | 11 +- seqspec/utils.py | 21 ++- 4 files changed, 172 insertions(+), 13 deletions(-) create mode 100644 examples/specs/template/10xv1-template.yaml diff --git a/examples/specs/template/10xv1-template.yaml b/examples/specs/template/10xv1-template.yaml new file mode 100644 index 0000000..690b157 --- /dev/null +++ b/examples/specs/template/10xv1-template.yaml @@ -0,0 +1,137 @@ +!Assay +seqspec_version: 0.3.0 +assay_id: 10xv2 +name: 10xv2 +doi: https://doi.org/10.1126/science.aam8999 +date: 15 March 2018 +description: 10x Genomics v2 single-cell rnaseq +modalities: +- rna +lib_struct: https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium3.html +sequence_protocol: Not-specified +sequence_kit: Not-specified +library_protocol: 10xv2 RNA +library_kit: Not-specified +sequence_spec: +- !Read + read_id: R1.fastq.gz + name: Read 1 + modality: rna + primer_id: custom_primer1 + min_len: 98 + max_len: 98 + strand: pos + files: + - !File + filename: R1.fastq.gz + filetype: fastq + filesize: 0 + url: "./fastq/R1.fastq.gz" + urltype: local + md5: 0 +- !Read + read_id: I1.fastq.gz + name: Read 2 + modality: rna + primer_id: custom_primer2 + min_len: 14 + max_len: 14 + strand: pos + files: + - !File + filename: I1.fastq.gz + filetype: fastq + filesize: 0 + url: "./fastq/I1.fastq.gz" + urltype: local + md5: 0 +- !Read + read_id: R2.fastq.gz + name: Read 2 + modality: rna + primer_id: custom_primer2 + min_len: 10 + max_len: 10 + strand: neg + files: + - !File + 
filename: R2.fastq.gz + filetype: fastq + filesize: 0 + url: "./fastq/R2.fastq.gz" + urltype: local + md5: 0 +library_spec: +- !Region + parent_id: null + region_id: rna + region_type: rna + name: rna + sequence_type: joined + sequence: AAAAAAAAAAAAAAAANNNNNNNNNNNNNNNNNNNNNNNNNNXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXAAAAAAAAAAAAAAAA + min_len: 59 + max_len: 208 + onlist: null + regions: + - !Region + parent_id: rna + region_id: custom_primer1 + region_type: custom_primer + name: custom_primer1 + sequence_type: fixed + sequence: '' + min_len: 0 + max_len: 0 + onlist: null + regions: null + - !Region + parent_id: rna + region_id: cdna + region_type: cdna + name: cdna + sequence_type: random + sequence: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + min_len: 1 + max_len: 150 + onlist: null + regions: null + - !Region + parent_id: rna + region_id: umi + region_type: umi + name: umi + sequence_type: random + sequence: XXXXXXXXXX + min_len: 10 + max_len: 10 + onlist: null + regions: null + - !Region + parent_id: rna + region_id: custom_primer2 + region_type: custom_primer + name: custom_primer2 + sequence_type: fixed + sequence: '' + min_len: 0 + max_len: 0 + onlist: null + regions: null + - !Region + parent_id: rna + region_id: barcode + region_type: barcode + name: barcode + sequence_type: onlist + sequence: NNNNNNNNNNNNNNNN + min_len: 16 + max_len: 16 + onlist: !Onlist + location: remote + filename: 737K-april-2014.txt.gz + filetype: txt + filesize: 11059200 + url: https://github.com/pachterlab/qcbc/raw/main/tests/10xRNAv1/737K-april-2014.txt.gz + urltype: http + md5: 9911f5f3fbab451d79b6b38068a001f0 + regions: null diff --git a/seqspec/Region.py b/seqspec/Region.py index a58fe3f..b57199d 100644 --- a/seqspec/Region.py +++ b/seqspec/Region.py @@ -368,10 
+368,10 @@ def __init__( def __repr__(self) -> str: d = { "filename": self.filename, - # "filetype": self.filetype, - # "filesize": self.filesize, - # "url": self.url, - # "urltype": self.urltype, + "filetype": self.filetype, + "filesize": self.filesize, + "url": self.url, + "urltype": self.urltype, "md5": self.md5, "location": self.location, } @@ -380,10 +380,10 @@ def __repr__(self) -> str: def to_dict(self): d = { "filename": self.filename, - # "filetype": self.filetype, - # "filesize": self.filesize, - # "url": self.url, - # "urltype": self.urltype, + "filetype": self.filetype, + "filesize": self.filesize, + "url": self.url, + "urltype": self.urltype, "md5": self.md5, "location": self.location, } diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index c39226a..b660230 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -99,9 +99,14 @@ def run_check(schema, spec, spec_fn): idx += 1 elif ol.location == "remote": # ping the link with a simple http request to check if the file exists at that URI - if not file_exists(ol.filename): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 + if spec.seqspec_version == "0.3.0": + if not file_exists(ol.url): + errors.append(f"[error {idx}] {ol.filename} does not exist") + idx += 1 + else: + if not file_exists(ol.filename): + errors.append(f"[error {idx}] {ol.filename} does not exist") + idx += 1 # get all of the regions with type fastq in the spec and check that those files exist relative to the path of the spec fqrgns = [] diff --git a/seqspec/utils.py b/seqspec/utils.py index 6f6a2f5..d4908b6 100644 --- a/seqspec/utils.py +++ b/seqspec/utils.py @@ -22,10 +22,27 @@ def load_spec_stream(spec_stream: io.IOBase): r.set_parent_id(None) # for backwards compatibilty, for specs < v0.3.0 set the files to empty - for r in data.sequence_spec: - if version.parse(data.seqspec_version) < version.parse("0.3.0"): + # TODO for backwards compatibility of the specs < v0.3.0, set the 
onlist regions with missing properties + if version.parse(data.seqspec_version) < version.parse("0.3.0"): + for r in data.sequence_spec: r.set_files([]) + for r in data.library_spec: + for lf in r.get_leaves(): + if lf.onlist is not None: + filename = lf.onlist.filename + location = lf.onlist.location + md5 = lf.onlist.md5 + lf.onlist = Onlist( + filename, + filetype="", + filesize=0, + url="", + urltype="", + md5=md5, + location=location, + ) + return data