Skip to content

Commit

Permalink
added relative indexing with seqspec index, added splitcode indexing,…
Browse files Browse the repository at this point in the history
… updated docs
  • Loading branch information
sbooeshaghi committed Oct 8, 2024
1 parent 7accfa4 commit be14ae9
Show file tree
Hide file tree
Showing 7 changed files with 430 additions and 82 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
![python versions](https://img.shields.io/pypi/pyversions/seqspec)
[![license](https://img.shields.io/pypi/l/seqspec)](LICENSE)

`seqspec` is a file format that describes data generated from genomics experiments. Both the file format and `seqspec` tool [enable uniform processing](./docs/UNIFORM.md) of genomics data.
`seqspec`, short for "sequence specification" (pronounced "seek-speck"), is a file format that describes data generated from genomics experiments. Both the file format and `seqspec` tool [enable uniform processing](./docs/UNIFORM.md) of genomics data.

![alt text](docs/images/simple_file_structure.png)
**Figure 1**: Anatomy of a `seqspec` file.
Expand Down
2 changes: 1 addition & 1 deletion docs/SEQSPEC_FILE.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ authors:

# Overview

`seqspec` is a machine-readable specification for annotating sequencing libraries produced by genomics assays. Genomic library structure depends on both the assay and sequencer (and kits) used to generate and bind the assay-specific construct to the sequencing adapters to generate a sequencing library. `seqspec` is specific to both a genomics assay and sequencer and provides a standardized format for describing the structure of sequencing libraries and the resulting sequencing reads. Specifically, a `seqspec` file is a machine-readable YAML file that annotates the content of molecules in genomic libraries, the structure of reads generated from them, and how those are stored in files.
`seqspec`, short for "sequence specification" (pronounced "seek-speck"), is a machine-readable specification for annotating sequencing libraries produced by genomics assays. Genomic library structure depends on both the assay and sequencer (and kits) used to generate and bind the assay-specific construct to the sequencing adapters to generate a sequencing library. `seqspec` is specific to both a genomics assay and sequencer and provides a standardized format for describing the structure of sequencing libraries and the resulting sequencing reads. Specifically, a `seqspec` file is a machine-readable YAML file that annotates the content of molecules in genomic libraries, the structure of reads generated from them, and how those are stored in files.

The primary goal of a `seqspec` file is to:

Expand Down
7 changes: 6 additions & 1 deletion docs/SEQ_PRIMER.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,9 @@ Paper describing nanopore sequencing: https://www.nature.com/articles/nbt.1495

### Strandedness

Unlike Illumina, nanopore sequencing produces the direct nucleotide sequence in linear order starting at the beginning of the molecule.
Unlike Illumina, nanopore sequencing produces the direct nucleotide sequence in linear order starting at the beginning of the molecule. [However, the read generated from a molecule can start from either end and from either strand](https://nanoporetech.com/document/genomic-dna-by-ligation-sqk-lsk114): the strand that is read can be either the top or the bottom strand. A read is therefore produced in one of four "orientations":

1. forward, top strand
2. forward, bottom strand
3. backward, top strand
4. backward, bottom strand
153 changes: 122 additions & 31 deletions seqspec/Region.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,34 +242,6 @@ def complement(self):
self.sequence = complement_sequence(self.sequence)


# Complement of every supported single-letter nucleotide code: the four bases,
# the ambiguity codes (R/Y, K/M, B/V, D/H swap; S, W, N map to themselves) and
# the "X" placeholder. Built once at import time rather than on every call.
_NUCLEOTIDE_COMPLEMENTS = {
    "A": "T",
    "T": "A",
    "G": "C",
    "C": "G",
    "R": "Y",
    "Y": "R",
    "S": "S",
    "W": "W",
    "K": "M",
    "M": "K",
    "B": "V",
    "D": "H",
    "V": "B",
    "H": "D",
    "N": "N",
    "X": "X",
}


def complement_nucleotide(nucleotide):
    """Return the complement of a single nucleotide code.

    Args:
        nucleotide: One upper-case nucleotide letter. The lookup is
            case-sensitive; callers are expected to upper-case first.

    Returns:
        The complementary letter, or "N" when `nucleotide` is not one of the
        supported codes (this includes lower-case letters).
    """
    # Default to 'N' if nucleotide is not recognized
    return _NUCLEOTIDE_COMPLEMENTS.get(nucleotide, "N")


def complement_sequence(sequence):
    """Return the complement of `sequence`, letter by letter.

    The input is upper-cased before lookup, so the result is always
    upper-case. Letter order is preserved (this is a complement, not a
    reverse complement), and any letter `complement_nucleotide` does not
    recognise comes back as "N".
    """
    uppercased = sequence.upper()
    return "".join(map(complement_nucleotide, uppercased))


class RegionCoordinate(Region):
def __init__(
self,
Expand All @@ -291,15 +263,106 @@ def __init__(
self.start = start
self.stop = stop

def __repr__(self) -> str:
    """Return a dict-style summary of the coordinate.

    The text is the repr of a dict holding the region id under "region"
    and the "start" / "stop" positions.
    """
    # Only one __repr__ is kept: an earlier definition placed directly above
    # this one would be shadowed and never run.
    return repr(
        {
            "region": self.region_id,
            "start": self.start,
            "stop": self.stop,
        }
    )

# def __repr__(self):
# return f"RegionCoordinate {self.name} [{self.region_type}]: [{self.start}, {self.stop})"

def __str__(self):
    """Return a readable label: name, region type and the span.

    The span is printed in `[start, stop)` notation.
    """
    # A single return: a second return statement after the first can never
    # execute, so only the `[start, stop)` form is kept.
    return f"RegionCoordinate {self.name} [{self.region_type}]: [{self.start}, {self.stop})"

def __eq__(self, other):
    """Compare two coordinates by position only.

    Two objects are equal when both `start` and `stop` match; names, region
    types and all other metadata are ignored.

    Returns:
        True/False for objects exposing `start` and `stop`, or
        `NotImplemented` for anything else (e.g. `None`), so that Python
        falls back to its default comparison instead of raising.
    """
    # NOTE(review): defining __eq__ without __hash__ makes instances
    # unhashable unless __hash__ is defined elsewhere in the class; confirm.
    try:
        return self.start == other.start and self.stop == other.stop
    except AttributeError:
        return NotImplemented

def __sub__(self, other):
    """
    Return the gap between two RegionCoordinate objects as a new one.

    The result does not depend on operand order; it spans the interval
    lying between the two regions:

    - self left of other  (self.stop <= other.start): start=self.stop, stop=other.start
    - self right of other (other.stop <= self.start): start=other.stop, stop=self.start
    - same start and stop: the span of the element itself

    The returned object wraps a synthetic Region of region_type
    "difference" whose id and name are "<self> - <other>". Its min_len and
    max_len are both the length of the span, and its sequence is "X"
    repeated that many times.

    Raises:
        TypeError: if `other` is not a RegionCoordinate.
        ValueError: if the regions overlap without having identical
            start and stop.
    """
    if not isinstance(other, RegionCoordinate):
        raise TypeError(
            f"Unsupported operand type(s) for -: 'RegionCoordinate' and '{type(other).__name__}'"
        )

    if self.stop <= other.start:
        # self lies to the left of other (touching regions give an empty gap)
        new_start, new_stop = self.stop, other.start
    elif other.stop <= self.start:
        # self lies to the right of other
        new_start, new_stop = other.stop, self.start
    elif self.start == other.start and self.stop == other.stop:
        # Same element on both sides: not obvious what the difference should
        # be, so the element's own span is returned.
        new_start, new_stop = self.start, self.stop
    else:
        raise ValueError("Subtraction is not defined.")

    # The Region metadata is synthesized for the difference; the length
    # fields and sequence are placeholders that are overwritten below.
    new_region = RegionCoordinate(
        region=Region(
            region_id=f"{self.region_id} - {other.region_id}",
            region_type="difference",
            name=f"{self.name} - {other.name}",
            sequence_type="diff",
            sequence="X",
            min_len=0,
            max_len=0,
            onlist=None,
            regions=None,
        ),
        start=new_start,
        stop=new_stop,
    )

    # Set min_len and max_len, and pad the sequence to the same length
    span_length = abs(new_region.stop - new_region.start)
    new_region.min_len = span_length
    new_region.max_len = span_length
    new_region.sequence = "X" * span_length

    return new_region


class RegionCoordinateDifference:
    """Group a coordinate, a reference coordinate and their difference.

    Attributes:
        obj: The coordinate being located.
        fixed: The coordinate it is located against.
        rgncdiff: The difference coordinate, stored as given (presumably the
            result of subtracting the two; confirm against callers).
        loc: "-" when `obj` ends at or before the start of `fixed`, "+" when
            `obj` starts at or after the end of `fixed`; otherwise the `loc`
            value passed to the constructor (default "").
    """

    def __init__(
        self,
        obj: RegionCoordinate,
        fixed: RegionCoordinate,
        rgncdiff: RegionCoordinate,
        loc: str = "",
    ):
        self.obj = obj
        self.fixed = fixed
        self.rgncdiff = rgncdiff

        # The caller's `loc` only survives when the two spans overlap.
        if obj.stop <= fixed.start:
            side = "-"
        elif obj.start >= fixed.stop:
            side = "+"
        else:
            side = loc
        self.loc = side

    def __repr__(self) -> str:
        """Return the repr of a dict holding all four attributes."""
        return repr(
            {
                "obj": self.obj,
                "fixed": self.fixed,
                "rgncdiff": self.rgncdiff,
                "loc": self.loc,
            }
        )


def project_regions_to_coordinates(
regions: List[Region], rcs: List[RegionCoordinate] = []
Expand Down Expand Up @@ -395,3 +458,31 @@ def to_dict(self):

def update_file_id(self, file_id):
    """Replace this object's `file_id` with the given value."""
    self.file_id = file_id


# Complement of every supported single-letter nucleotide code: the four bases,
# the ambiguity codes (R/Y, K/M, B/V, D/H swap; S, W, N map to themselves) and
# the "X" placeholder. Built once at import time rather than on every call.
_NUCLEOTIDE_COMPLEMENTS = {
    "A": "T",
    "T": "A",
    "G": "C",
    "C": "G",
    "R": "Y",
    "Y": "R",
    "S": "S",
    "W": "W",
    "K": "M",
    "M": "K",
    "B": "V",
    "D": "H",
    "V": "B",
    "H": "D",
    "N": "N",
    "X": "X",
}


def complement_nucleotide(nucleotide):
    """Return the complement of a single nucleotide code.

    Args:
        nucleotide: One upper-case nucleotide letter. The lookup is
            case-sensitive; callers are expected to upper-case first.

    Returns:
        The complementary letter, or "N" when `nucleotide` is not one of the
        supported codes (this includes lower-case letters).
    """
    # Default to 'N' if nucleotide is not recognized
    return _NUCLEOTIDE_COMPLEMENTS.get(nucleotide, "N")


def complement_sequence(sequence):
    """Return the complement of `sequence`, letter by letter.

    The input is upper-cased before lookup, so the result is always
    upper-case. Letter order is preserved (this is a complement, not a
    reverse complement), and any letter `complement_nucleotide` does not
    recognise comes back as "N".
    """
    uppercased = sequence.upper()
    return "".join(map(complement_nucleotide, uppercased))
Loading

0 comments on commit be14ae9

Please sign in to comment.