Skip to content

Commit

Permalink
remote onlist download with the kallisto multiple lists onlist format (
Browse files Browse the repository at this point in the history
…#31)

* Support older versions of matplotlib

The spines[["top", "bottom"...]] indexing syntax is a relatively recent
addition; avoiding it allows working with older versions of matplotlib.

* Get the test of seqspec check working again.

The refactoring of repositories to split out the example specification
yaml files means we didn't have any local files to try validating.

So I had to use the stub I had added for other tests, however it
needed some updates to be compatible with the library spec version of
the schema.

Also I did some mocking to avoid needing to create test fastq and
barcode files.

* Increase the number of Xs in the random region

The validator now checks that the length of the sequence string is
"X" * max_len characters.

* Update minimal Region tests and add minimal Read tests.

* Make some minimal tests for the seqspec print functions

* update print command to use the replacement assay_id attribute

previously it was assay

* My test assay used custom_primer which didn't have a color.

I randomly picked sea green.

* Implement downloading lists via urls

Also, to work with barcode lists hosted by the DACC, transparently
decompress gzip files.

The old read_list function took a filename, but I changed it to take
the onlist object so it would have access to the location attribute
to know if it should be reading locally or remotely instead of just
guessing if the filename string started with a scheme url.

* Only return the onlist filename if it is a local file

Even if there's one list but it's remote we need to download it and
put it into a local file.

* Add onlist argument to specify the combined barcode list file format.

Kallisto has a format where multiple barcode lists are in one file
separated by whitespace. That's different from the more common
Cartesian product format where all the lists are crossed with each
other.

This adds the kallisto format as -f multi, and adds an argument for
the current version -f product, but treats it as a default.

* Fix test for project_regions_to_coordinates

* Minimally test RegionCoordinate and  project_regions_to_coordinates

* test run_onlist_region and run_onlist_read

A new accessor function was added to get onlists for the new read
objects in addition to the older by region type.

I also added some type annotations to be more clear that join_onlists
needs a list of Onlist objects to work. (Since we need the full
information to know if we need to download files)
  • Loading branch information
detrout authored Mar 12, 2024
1 parent f2ad2dd commit 4e2bd0a
Show file tree
Hide file tree
Showing 11 changed files with 398 additions and 69 deletions.
2 changes: 1 addition & 1 deletion seqspec/seqspec_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def validate_check_args(parser, args):
else:
print("\n".join(errors))

return
return len(errors)


def run_check(schema, spec, spec_fn):
Expand Down
74 changes: 59 additions & 15 deletions seqspec/seqspec_onlist.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from seqspec.Assay import Assay
from seqspec.Region import project_regions_to_coordinates, itx_read
from seqspec.Region import project_regions_to_coordinates, itx_read, Onlist
from seqspec.utils import load_spec, map_read_id_to_regions
from seqspec.seqspec_find import run_find_by_type
import os
Expand Down Expand Up @@ -43,6 +43,14 @@ def setup_onlist_args(parser):
default=None,
required=False,
)
subparser.add_argument(
"-f",
metavar="FORMAT",
type=str,
default="product",
choices=["product", "multi"],
help="select between cartesian product 'product' or multiple barcode lists in file 'multi'"
)
subparser.add_argument("--list", action="store_true", help=("List onlists"))
return subparser

Expand All @@ -52,6 +60,7 @@ def validate_onlist_args(parser, args):
fn = args.yaml
m = args.m
r = args.r
f = args.f
# TODO: if onlist is a link, download. also fix output path
# o = args.o
# load spec
Expand All @@ -65,27 +74,27 @@ def validate_onlist_args(parser, args):
print(f"{ol['region_id']}\t{ol['filename']}\t{ol['location']}\t{ol['md5']}")
return
if args.region:
olist = run_onlist_region(spec, m, r)
olist = run_onlist_region(spec, m, r, f)
else:
olist = run_onlist_read(spec, m, r)
olist = run_onlist_read(spec, m, r, f)
print(os.path.join(os.path.dirname(os.path.abspath(fn)), olist))
return


def run_onlist_region(spec: Assay, modality: str, region_id: str, fmt: str):
    """Return the path to the onlist file for a modality/region pair.

    Every region found by ``run_find_by_type`` contributes the object
    returned by its ``get_onlist()``; the collected onlists are then handed
    to ``join_onlists`` together with ``fmt``.

    Raises:
        ValueError: if no region was found, so there is no onlist to return.
    """
    matching_regions = run_find_by_type(spec, modality, region_id)
    onlists = [region.get_onlist() for region in matching_regions]
    if not onlists:
        raise ValueError(f"No onlist found for region {region_id}")
    return join_onlists(onlists, fmt)


def run_onlist_read(spec: Assay, modality: str, read_id: str):
def run_onlist_read(spec: Assay, modality: str, read_id: str, fmt: str):
# for now return the path to the onlist file for the modality/region pair

# run function
Expand All @@ -99,12 +108,12 @@ def run_onlist_read(spec: Assay, modality: str, read_id: str):
for r in new_rcs:
ol = r.get_onlist()
if ol:
onlists.append(ol.filename)
onlists.append(ol)

if len(onlists) == 0:
raise ValueError(f"No onlist found for read {read_id}")

return join_onlists(onlists)
return join_onlists(onlists, fmt)


def run_list_onlists(spec: Assay, modality: str):
Expand All @@ -122,15 +131,50 @@ def run_list_onlists(spec: Assay, modality: str):
return olsts


def join_onlists(onlists):
base_path = os.path.dirname(os.path.abspath(onlists[0]))
if len(onlists) == 1:
return onlists[0]
def find_list_target_dir(onlists):
    """Choose the directory where a combined onlist file should be written.

    Returns the directory of the first onlist whose ``location`` is
    ``"local"`` and whose directory is writable. If no such onlist exists,
    the current working directory is returned instead.
    """
    for onlist in onlists:
        if onlist.location != "local":
            continue
        # Inspect the onlist being visited, not always the first entry;
        # otherwise a remote first entry hides every local candidate.
        base_path = os.path.dirname(os.path.abspath(onlist.filename))
        if os.access(base_path, os.W_OK):
            return base_path

    return os.getcwd()


def join_onlists(onlists: "list[Onlist]", fmt: str):
    """Given a list of onlist objects return a file containing the combined list.

    Args:
        onlists: onlist objects; each needs ``location`` and ``filename``
            attributes so it can be read locally or downloaded.
        fmt: ``"product"`` to cross all lists with each other, or ``"multi"``
            to write the lists side by side, one column per list.

    Returns:
        The filename of the single onlist when there is exactly one and it is
        local; otherwise the path of the newly written ``onlist_joined.txt``.
        ``None`` is returned (after printing a notice) when ``onlists`` is
        empty.

    Raises:
        ValueError: if ``fmt`` is not a recognized format name.
    """
    if len(onlists) == 0:
        print("No lists present")
        return None
    if len(onlists) == 1 and onlists[0].location.lower() == "local":
        return onlists[0].filename

    formatter_functions = {
        "product": join_product_onlist,
        "multi": join_multi_onlist,
    }
    # Validate the format before reading (and possibly downloading) any list,
    # so a typo fails fast instead of after the expensive work.
    formatter = formatter_functions.get(fmt)
    if formatter is None:
        raise ValueError(
            "Unrecognized format type {}. Expected one of {}".format(
                fmt, list(formatter_functions.keys())
            )
        )

    base_path = find_list_target_dir(onlists)
    # join the onlists
    lsts = [read_list(o) for o in onlists]
    joined_path = os.path.join(base_path, "onlist_joined.txt")

    with open(joined_path, "w") as f:
        for line in formatter(lsts):
            f.write(line)

    return joined_path


def join_product_onlist(lsts):
    """Yield one newline-terminated line per combination of the input lists.

    Each line is the concatenation of one entry taken from every list, in the
    order produced by ``itertools.product``.
    """
    for combination in itertools.product(*lsts):
        joined = "".join(combination)
        yield joined + "\n"


def join_multi_onlist(lsts):
    """Yield newline-terminated rows with one space-separated column per list.

    Lists are walked in parallel; once a shorter list is exhausted its column
    is filled with ``-`` until the longest list ends.
    """
    for row in itertools.zip_longest(*lsts, fillvalue='-'):
        columns = [str(entry) for entry in row]
        yield " ".join(columns) + "\n"
9 changes: 5 additions & 4 deletions seqspec/seqspec_print.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,9 @@ def run_print_png(spec):
modalities = [modalities[i] for i in asort]
lengths = [lengths[i] for i in asort]
modes = [modes[i] for i in asort]
assay = spec.assay
assay_id = spec.assay_id

fig, _ = plot_png(assay, modalities, modes, nmodes, lengths)
fig, _ = plot_png(assay_id, modalities, modes, nmodes, lengths)
return fig


Expand Down Expand Up @@ -216,7 +216,8 @@ def plot_png(assay, modalities, modes, nmodes, lengths):
ax.set(**{"xlim": (0, max(lengths))})

# hide the spines
ax.spines[["right", "top", "left", "bottom"]].set_visible(False)
for spine in ["right", "top", "left", "bottom"]:
ax.spines[spine].set_visible(False)
# Hide the axis and ticks and labels
ax.xaxis.set_visible(False)
ax.set_yticklabels([])
Expand All @@ -227,7 +228,7 @@ def plot_png(assay, modalities, modes, nmodes, lengths):

# adjust the xaxis for the last modality to show the length
ax.xaxis.set_visible(True)
ax.spines[["bottom"]].set_visible(True)
ax.spines["bottom"].set_visible(True)
ax.minorticks_on()

ax.set(
Expand Down
33 changes: 29 additions & 4 deletions seqspec/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import io
import gzip
from seqspec.Assay import Assay
from seqspec.Region import Onlist
import yaml
import requests
from Bio import GenBank
Expand Down Expand Up @@ -49,10 +51,32 @@ def write_read(header, seq, qual, f):
f.write(f"{header}\n{seq}\n+\n{qual}\n")


def read_list(fname):
with open(fname, "r") as f:
# just get the first column
return [line.strip().split()[0] for line in f.readlines()]
def yield_onlist_contents(stream):
    """Yield the first whitespace-separated column of each line in ``stream``.

    ``stream`` is any iterable of lines. Byte lines (for example from a raw
    HTTP response body) are decoded as UTF-8 so callers always receive ``str``.
    Blank or whitespace-only lines are skipped rather than raising
    ``IndexError``.
    """
    for line in stream:
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        fields = line.split()
        if not fields:
            # e.g. a trailing empty line at the end of the file
            continue
        yield fields[0]


def read_list(onlist: "Onlist"):
    """Given an onlist object read the local or remote data.

    Args:
        onlist: object with a ``location`` attribute (``"remote"`` or
            ``"local"``) and a ``filename`` attribute (a URL when remote, a
            path when local). Names ending in ``.gz`` are decompressed
            transparently.

    Returns:
        A list holding the first column of every line of the onlist.

    Raises:
        ValueError: if ``location`` is neither ``"remote"`` nor ``"local"``.
        requests.HTTPError: if a remote download returns an error status
            (raised by ``raise_for_status``).
    """
    location = onlist.location
    filename = onlist.filename

    if location == "remote":
        # The context manager releases the connection once the body is read.
        with requests.get(filename, stream=True) as response:
            response.raise_for_status()
            # TODO: instead of just looking at the filename, should we check the content-type?
            if filename.endswith(".gz"):
                raw = gzip.GzipFile(fileobj=response.raw)
            else:
                raw = response.raw
            # Always wrap in a text layer so both branches yield str, not bytes.
            stream = io.TextIOWrapper(raw)
            return list(yield_onlist_contents(stream))
    elif location == "local" and filename.endswith(".gz"):
        with gzip.open(filename, "rt") as f:
            return list(yield_onlist_contents(f))
    elif location == "local":
        with open(filename, "rt") as f:
            # just get the first column
            return list(yield_onlist_contents(f))
    else:
        raise ValueError("Unsupported location {}. Expected remote or local".format(onlist.location))


def region_ids_in_spec(seqspec, modality, region_ids):
Expand All @@ -75,6 +99,7 @@ def file_exists(uri):
REGION_TYPE_COLORS = {
"barcode": "#2980B9",
"cdna": "#8E44AD",
"custom_primer": "#3CB371",
"fastq": "#F1C40F",
"gdna": "#E67E22",
"illumina_p5": "#E17A47",
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ commands = pytest --cov=seqspec {posargs:tests}
deps =
pytest
pytest-cov
matplotlib >= 3.4.0
18 changes: 12 additions & 6 deletions tests/test_assay.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,30 @@
from seqspec.Region import Region
from seqspec.Assay import Assay

from seqspec import __version__

from .test_region import (
region_rna_joined_dict,
region_rna_umi_dict,
region_rna_linker_dict,
)


def assay_dict(regions=[]):
def assay_dict(reads=[], regions=[]):
return {
"seqspec_version": "0.0.0",
"assay": "My assay",
"sequencer": "My sequencing machine",
"seqspec_version": __version__,
"assay_id": "My assay",
"name": "A machine-readable specification for genomics assays",
"doi": "https://doi.org/10.1101/2023.03.17.533215",
"publication_date": "20230317",
"date": "20230317",
"description": "description",
"modalities": ["RNA", "cDNA"],
"lib_struct": "lib_struct",
"sequence_protocol": "illumina protocol",
"sequence_kit": "illumina kit",
"library_protocol": "10x protocol",
"library_kit": "10x v3 kit",
"sequence_spec": reads,
"library_spec": regions,
}

Expand All @@ -42,7 +48,7 @@ def test_assay_with_regions(self):
r_expected_dict = region_rna_joined_dict("region-1", [r_umi, r_linker])
r_expected = Region(**r_expected_dict)

expected = assay_dict(regions=[r_expected])
expected = assay_dict(reads=[], regions=[r_expected])

a = Assay(**expected)

Expand Down
56 changes: 54 additions & 2 deletions tests/test_region.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from unittest import TestCase

from seqspec.Region import Region, Onlist

from seqspec.Region import (
project_regions_to_coordinates,
Read,
Region,
RegionCoordinate,
Onlist,
)

def region_rna_joined_dict(region_id, regions=[]):
expected = {
Expand Down Expand Up @@ -38,6 +43,19 @@ def region_rna_linker_dict(region_id, regions=[]):
return expected


def read_rna_dict(read_id, min_len=0, max_len=100):
    """Build the keyword arguments for a minimal RNA read used in tests.

    The name and primer id are derived from ``read_id``; the modality is
    always ``"RNA"`` and the strand is always ``"pos"``.
    """
    return dict(
        read_id=read_id,
        name=f"{read_id}-name",
        modality="RNA",
        primer_id=f"{read_id}-primer",
        min_len=min_len,
        max_len=max_len,
        strand="pos",
    )


class TestOnlist(TestCase):
def test_simple_onlist(self):
name = "barcodes.txt"
Expand Down Expand Up @@ -201,3 +219,37 @@ def test_onlists(self):
# and region: None for repr()
expected["regions"] = None
self.assertEqual(repr(r), repr(expected))


class TestRegionCoordinates(TestCase):
    """Minimal checks for project_regions_to_coordinates."""

    def test_project_regions_to_coordinates(self):
        # Alternate UMI and linker regions, numbered region-1 .. region-4.
        regions = [
            Region(**region_rna_umi_dict("region-1")),
            Region(**region_rna_linker_dict("region-2")),
            Region(**region_rna_umi_dict("region-3")),
            Region(**region_rna_linker_dict("region-4")),
        ]
        coords = project_regions_to_coordinates(regions)

        # Each coordinate should begin where the previous one stopped and
        # span the region's max_len.
        expected_start = 0
        for region, coord in zip(regions, coords):
            expected_stop = expected_start + region.max_len
            self.assertEqual(coord.start, expected_start)
            self.assertEqual(coord.stop, expected_stop)
            expected_start = expected_stop


class TestRead(TestCase):
    """Minimal checks for constructing a Read from keyword arguments."""

    def test_minimal_read(self):
        expected = read_rna_dict("read-1")
        read = Read(**expected)

        # Every constructor argument should be exposed as an attribute.
        for attr_name in expected:
            self.assertEqual(getattr(read, attr_name), expected[attr_name])

        # repr() and to_dict() should both round-trip the input dictionary.
        self.assertEqual(repr(read), repr(expected))
        self.assertEqual(read.to_dict(), expected)
Loading

0 comments on commit 4e2bd0a

Please sign in to comment.