added relative indexing with seqspec index, added splitcode indexing,…

… updated docs
pachterlab · Oct 8, 2024 · be14ae9 · be14ae9
1 parent 7accfa4
commit be14ae9
Show file tree

Hide file tree

Showing 7 changed files with 430 additions and 82 deletions.
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 ![python versions](https://img.shields.io/pypi/pyversions/seqspec)
 [![license](https://img.shields.io/pypi/l/seqspec)](LICENSE)
 
-`seqspec` is a file format that describes data generated from genomics experiments. Both the file format and `seqspec` tool [enable uniform processing](./docs/UNIFORM.md) of genomics data.
+`seqspec`, short for "sequence specification" (pronounced "seek-speck"), is a file format that describes data generated from genomics experiments. Both the file format and `seqspec` tool [enable uniform processing](./docs/UNIFORM.md) of genomics data.
 
 ![alt text](docs/images/simple_file_structure.png)
 **Figure 1**: Anatomy of a `seqspec` file.

diff --git a/docs/SEQSPEC_FILE.md b/docs/SEQSPEC_FILE.md
@@ -7,7 +7,7 @@ authors:
 
 # Overview
 
-`seqspec` is a machine-readable specification for annotating sequencing libraries produced by genomics assays. Genomic library structure depends on both the assay and sequencer (and kits) used to generate and bind the assay-specific construct to the sequencing adapters to generate a sequencing library. `seqspec` is specific to both a genomics assay and sequencer and provides a standardized format for describing the structure of sequencing libraries and the resulting sequencing reads. Specifically, a `seqspec` file is a machine-readable YAML file that annotates the content of molecules in genomic libraries, the structure of reads generated from them, and how those are stored in files.
+`seqspec`, short for "sequence specification" (pronounced "seek-speck"), is a machine-readable specification for annotating sequencing libraries produced by genomics assays. Genomic library structure depends on both the assay and sequencer (and kits) used to generate and bind the assay-specific construct to the sequencing adapters to generate a sequencing library. `seqspec` is specific to both a genomics assay and sequencer and provides a standardized format for describing the structure of sequencing libraries and the resulting sequencing reads. Specifically, a `seqspec` file is a machine-readable YAML file that annotates the content of molecules in genomic libraries, the structure of reads generated from them, and how those are stored in files.
 
 The primary goal of a `seqspec` file is to:
 

diff --git a/docs/SEQ_PRIMER.md b/docs/SEQ_PRIMER.md
@@ -51,4 +51,9 @@ Paper describing nanopore sequencing: https://www.nature.com/articles/nbt.1495
 
 ### Strandedness
 
-Unlike Illumina, nanopore sequencing produces the direct nucleotide sequence in linear order starting at the beginning of the molecule.
+Unlike Illumina, nanopore sequencing produces the direct nucleotide sequence in linear order starting at the beginning of the molecule. [However, the read generated from a molecule can start from either end and from either strand](https://nanoporetech.com/document/genomic-dna-by-ligation-sqk-lsk114), where the strand is either the top strand or the bottom (reverse) strand. A read is therefore produced in one of four "orientations":
+
+1. forward, top strand
+2. forward, reverse strand
+3. backward, top strand
+4. backward, reverse strand
diff --git a/seqspec/Region.py b/seqspec/Region.py
@@ -242,34 +242,6 @@ def complement(self):
             self.sequence = complement_sequence(self.sequence)
 
 
-def complement_nucleotide(nucleotide):
-    complements = {
-        "A": "T",
-        "T": "A",
-        "G": "C",
-        "C": "G",
-        "R": "Y",
-        "Y": "R",
-        "S": "S",
-        "W": "W",
-        "K": "M",
-        "M": "K",
-        "B": "V",
-        "D": "H",
-        "V": "B",
-        "H": "D",
-        "N": "N",
-        "X": "X",
-    }
-    return complements.get(
-        nucleotide, "N"
-    )  # Default to 'N' if nucleotide is not recognized
-
-
-def complement_sequence(sequence):
-    return "".join(complement_nucleotide(n) for n in sequence.upper())
-
-
 class RegionCoordinate(Region):
     def __init__(
         self,
@@ -291,15 +263,106 @@ def __init__(
         self.start = start
         self.stop = stop
 
-    def __repr__(self):
-        return f"RegionCoordinate {self.name} [{self.region_type}]: ({self.start}, {self.stop})"
+    def __repr__(self) -> str:
+        d = {
+            "region": self.region_id,
+            "start": self.start,
+            "stop": self.stop,
+        }
+        return f"{d}"
+
+    # def __repr__(self):
+    #     return f"RegionCoordinate {self.name} [{self.region_type}]: [{self.start}, {self.stop})"
 
     def __str__(self):
-        return f"RegionCoordinate {self.name} [{self.region_type}]: ({self.start}, {self.stop})"
+        return f"RegionCoordinate {self.name} [{self.region_type}]: [{self.start}, {self.stop})"
 
     def __eq__(self, other):
         return self.start == other.start and self.stop == other.stop
 
+    def __sub__(self, other):
+        """
+        Define the subtraction between two RegionCoordinate objects.
+        self - other returns a new RegionCoordinate covering the gap between the two regions:
+        - start is the stop of whichever region lies to the left
+        - stop is the start of whichever region lies to the right
+        - min_len and max_len are both abs(stop - start) of the new coordinate
+        """
+        if not isinstance(other, RegionCoordinate):
+            raise TypeError(
+                f"Unsupported operand type(s) for -: 'RegionCoordinate' and '{type(other).__name__}'"
+            )
+
+        # Case 1: self is to the left of other
+        # need to handle the case where the diff is zero but the position of other is left or right of self
+        # self - other
+        if self.stop <= other.start:
+            new_start = self.stop
+            new_stop = other.start
+
+        # Case 2: self is to the right of other
+        elif other.stop <= self.start:
+            new_start = other.stop
+            new_stop = self.start
+        else:
+            # not so obvious what the start and stop should be when both operands are the same element
+            if self.start == other.start and self.stop == other.stop:
+                new_start = self.start
+                new_stop = self.stop
+            else:
+                raise ValueError("Subtraction is not defined.")
+
+        # Create a new RegionCoordinate object with the updated start and stop values
+        new_region = RegionCoordinate(
+            region=Region(
+                region_id=f"{self.region_id} - {other.region_id}",
+                region_type="difference",
+                name=f"{self.name} - {other.name}",
+                sequence_type="diff",
+                sequence="X",
+                min_len=0,
+                max_len=0,
+                onlist=None,
+                regions=None,
+            ),  # Assuming the region meta-data stays the same
+            start=new_start,
+            stop=new_stop,
+        )
+
+        # Set min_len and max_len
+        new_region.min_len = abs(new_region.stop - new_region.start)
+        new_region.max_len = abs(new_region.stop - new_region.start)
+        new_region.sequence = "X" * new_region.min_len
+
+        return new_region
+
+
+class RegionCoordinateDifference:
+    def __init__(
+        self,
+        obj: RegionCoordinate,
+        fixed: RegionCoordinate,
+        rgncdiff: RegionCoordinate,
+        loc: str = "",
+    ):
+        self.obj = obj
+        self.fixed = fixed
+        self.rgncdiff = rgncdiff
+        self.loc = loc
+        if obj.stop <= fixed.start:
+            self.loc = "-"
+        elif obj.start >= fixed.stop:
+            self.loc = "+"
+
+    def __repr__(self) -> str:
+        d = {
+            "obj": self.obj,
+            "fixed": self.fixed,
+            "rgncdiff": self.rgncdiff,
+            "loc": self.loc,
+        }
+        return f"{d}"
+
 
 def project_regions_to_coordinates(
     regions: List[Region], rcs: List[RegionCoordinate] = []
@@ -395,3 +458,31 @@ def to_dict(self):
 
     def update_file_id(self, file_id):
         self.file_id = file_id
+
+
+def complement_nucleotide(nucleotide):
+    complements = {
+        "A": "T",
+        "T": "A",
+        "G": "C",
+        "C": "G",
+        "R": "Y",
+        "Y": "R",
+        "S": "S",
+        "W": "W",
+        "K": "M",
+        "M": "K",
+        "B": "V",
+        "D": "H",
+        "V": "B",
+        "H": "D",
+        "N": "N",
+        "X": "X",
+    }
+    return complements.get(
+        nucleotide, "N"
+    )  # Default to 'N' if nucleotide is not recognized
+
+
+def complement_sequence(sequence):
+    return "".join(complement_nucleotide(n) for n in sequence.upper())