diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml index 4e38c08b..4fa534e1 100644 --- a/.github/workflows/code-quality.yml +++ b/.github/workflows/code-quality.yml @@ -13,4 +13,4 @@ jobs: - run: pip install --upgrade pip - run: pip install "black<24" mypy==v1.3 - run: black --diff --check $(git ls-files '*.py') - - run: MYPYPATH=mygfa mypy --disallow-untyped-defs mygfa slow_odgi pollen_data_gen + - run: MYPYPATH=mygfa mypy --no-namespace-packages --disallow-untyped-defs mygfa slow_odgi pollen_data_gen diff --git a/.gitignore b/.gitignore index 68db9c6c..be091d89 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ __pycache__/ **/*.validate **/*.og **/*.out +**/*.flatgfa *.json og_to_gfa.py compute_maxes.py diff --git a/Makefile b/Makefile index 5ccbbce2..2a2a2958 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,6 @@ TEST_FILES := t k note5 overlap q.chop LPA DRB1-3123 chr6.C4 BASIC_TESTS := ex1 ex2 GFA_URL := https://raw.githubusercontent.com/pangenome/odgi/ebc493f2622f49f1e67c63c1935d68967cd16d85/test -GFA_ZIP_URL := https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/scratch/2021_05_06_pggb/gfas/chr8.pan.gfa.gz # A smaller set of test inputs for faster runs. ifdef SMALL @@ -41,7 +40,7 @@ test-slow-odgi: slow-odgi-setup slow-odgi-oracles slow-odgi-tests # Produce some input files that are necessary for the slow_odgi tests. slow-odgi-setup: og -turnt -j --save --env depth_setup --env inject_setup \ - --env overlap_setup --env validate_setup tests/*.gfa + --env overlap_setup --env validate_setup $(TEST_FILES:%=tests/%.og) # Produce the oracle output (from "real" odgi) for each test input. Run this # once, noisily, to obtain the expected outputs. Then run `slow-odgi-tests` to @@ -52,7 +51,7 @@ ORACLES := chop_oracle crush_oracle degree_oracle depth_oracle \ flip_oracle flatten_oracle inject_oracle matrix_oracle overlap_oracle \ paths_oracle validate_oracle slow-odgi-oracles: og - -turnt -j --save $(ORACLES:%=--env %) tests/*.og + -turnt -j --save $(ORACLES:%=--env %) $(TEST_FILES:%=tests/%.og) -turnt -j --save --env validate_oracle_err tests/invalid/*.gfa -turnt -j --save --env crush_oracle tests/handmade/crush*.gfa -turnt -j --save --env flip_oracle tests/handmade/flip*.gfa @@ -63,7 +62,7 @@ slow-odgi-oracles: og TEST_ENVS := chop_test crush_test degree_test depth_test flip_test \ flatten_test inject_test matrix_test overlap_test paths_test validate_test slow-odgi-tests: - -turnt -j $(TEST_ENVS:%=--env %) tests/*.gfa + -turnt -j $(TEST_ENVS:%=--env %) $(TEST_FILES:%=tests/%.gfa) -turnt -j --env validate_test tests/invalid/*.gfa -turnt -j --env crush_test tests/handmade/crush*.gfa -turnt -j --env flip_test tests/handmade/flip*.gfa @@ -81,10 +80,6 @@ clean: rm -rf tests/handmade/*.flip rm -rf tests/invalid/*.* -tests/chr8.pan.gfa: - curl -Lo ./tests/chr8.pan.gfa.gz $(GFA_ZIP_URL) - gunzip ./tests/chr8.pan.gfa.gz - tests/%.gfa: curl -Lo ./$@ $(GFA_URL)/$*.gfa diff --git a/mygfa/example.py b/mygfa/example.py new file mode 100644 index 00000000..2ee090e4 --- /dev/null +++ b/mygfa/example.py @@ -0,0 +1,19 @@ +import sys +import mygfa + + +def print_depth(graph: mygfa.Graph) -> None: + # Count the number of times that any path passes through a segment. + seg_depths = {name: 0 for name in graph.segments} + for path in graph.paths.values(): + for step in path.segments: + seg_depths[step.name] += 1 + + # Print the counts. + print("seg\tdepth") + for name, depth in seg_depths.items(): + print(f"{name}\t{depth}") + + +if __name__ == "__main__": + print_depth(mygfa.Graph.parse(sys.stdin)) diff --git a/mygfa/mygfa/__init__.py b/mygfa/mygfa/__init__.py index 8e506ac3..b6ff3e6b 100644 --- a/mygfa/mygfa/__init__.py +++ b/mygfa/mygfa/__init__.py @@ -1,3 +1,5 @@ -"""GFA parsing and pre-processing in Python.""" +"""Simple GFA parsing, printing, and pre-processing in Python.""" + +from .gfa import * # noqa __version__ = "0.1" diff --git a/mygfa/mygfa/__main__.py b/mygfa/mygfa/__main__.py new file mode 100644 index 00000000..545be45c --- /dev/null +++ b/mygfa/mygfa/__main__.py @@ -0,0 +1,10 @@ +import sys +from .gfa import Graph + + +if __name__ == "__main__": + mygraph = Graph.parse(sys.stdin) + if len(sys.argv) > 1 and sys.argv[1] == "--nl": + mygraph.emit(sys.stdout, False) + else: + mygraph.emit(sys.stdout) diff --git a/mygfa/mygfa/mygfa.py b/mygfa/mygfa/gfa.py similarity index 96% rename from mygfa/mygfa/mygfa.py rename to mygfa/mygfa/gfa.py index efc1a8d4..bb2b4e09 100644 --- a/mygfa/mygfa/mygfa.py +++ b/mygfa/mygfa/gfa.py @@ -1,6 +1,4 @@ import re -import sys - from collections import OrderedDict from dataclasses import dataclass from enum import Enum @@ -153,7 +151,7 @@ def linkstr(self) -> str: @dataclass(eq=True, order=True) class Link: - """A GFA link is an edge connecting two sequences.""" + """A GFA link is an edge connecting two segments.""" from_: Handle to_: Handle @@ -197,7 +195,7 @@ def __str__(self) -> str: @dataclass class Path: - """A GFA path is an ordered series of links.""" + """A GFA path is a walk through the graph.""" name: str segments: List[Handle] # Segment names and orientations. @@ -306,11 +304,3 @@ def emit(self, outfile: TextIO, showlinks: bool = True) -> None: if showlinks: for link in sorted(self.links): print(str(link), file=outfile) - - -if __name__ == "__main__": - mygraph = Graph.parse(sys.stdin) - if len(sys.argv) > 1 and sys.argv[1] == "--nl": - mygraph.emit(sys.stdout, False) - else: - mygraph.emit(sys.stdout) diff --git a/mygfa/mygfa/preprocess.py b/mygfa/mygfa/preprocess.py index 5331855a..c41fdb4f 100644 --- a/mygfa/mygfa/preprocess.py +++ b/mygfa/mygfa/preprocess.py @@ -1,5 +1,5 @@ from typing import List, Tuple, Dict -from . import mygfa +from . import gfa as mygfa def node_steps(graph: mygfa.Graph) -> Dict[str, List[Tuple[str, int, bool]]]: diff --git a/pollen_data_gen/pollen_data_gen/__main__.py b/pollen_data_gen/pollen_data_gen/__main__.py index 32eb60f1..0276a28e 100644 --- a/pollen_data_gen/pollen_data_gen/__main__.py +++ b/pollen_data_gen/pollen_data_gen/__main__.py @@ -1,6 +1,6 @@ import sys import argparse -from mygfa import mygfa +import mygfa from typing import List from . import depth, simple diff --git a/pollen_data_gen/pollen_data_gen/depth.py b/pollen_data_gen/pollen_data_gen/depth.py index ece7bd2c..7d698e7c 100644 --- a/pollen_data_gen/pollen_data_gen/depth.py +++ b/pollen_data_gen/pollen_data_gen/depth.py @@ -1,8 +1,9 @@ import sys -from typing import Any, Collection, Dict, OrderedDict, Union, Optional, List +from typing import Any, Collection, Dict, Union, Optional, List import json from json import JSONEncoder -from mygfa import mygfa, preprocess +import mygfa +import mygfa.preprocess FormatType = Dict[str, Union[bool, str, int]] @@ -24,7 +25,7 @@ def paths_viewed_from_nodes( output = {} json_format = format_gen(max_p.bit_length()) # segment name, (path name, index on path, direction) list - for seg, crossings in preprocess.node_steps(graph).items(): + for seg, crossings in mygfa.preprocess.node_steps(graph).items(): data = list(path2id[c[0]] for c in crossings) data = data + [0] * (max_e - len(data)) output[f"path_ids{seg}"] = {"data": data, "format": json_format} @@ -115,7 +116,7 @@ def depth_json( """Returns a JSON representation of `graph` that is specific to the exine command `depth`. """ - n_tight, e_tight, p_tight = preprocess.get_maxes(graph) + n_tight, e_tight, p_tight = mygfa.preprocess.get_maxes(graph) # These values have been calculated automatically, and are likely optimal. # However, they are only to be used when the user-does not supply them via CLI. if not max_n: diff --git a/pollen_data_gen/pollen_data_gen/simple.py b/pollen_data_gen/pollen_data_gen/simple.py index 017f8fd4..0b2909fb 100644 --- a/pollen_data_gen/pollen_data_gen/simple.py +++ b/pollen_data_gen/pollen_data_gen/simple.py @@ -1,9 +1,8 @@ -# import sys import json from typing import Dict, Union, Optional, Any, List, Sequence, TextIO from io import TextIOWrapper from json import JSONEncoder -from mygfa import mygfa +import mygfa from . import depth diff --git a/slow_odgi/slow_odgi/__main__.py b/slow_odgi/slow_odgi/__main__.py index eb83ae45..0d16b625 100644 --- a/slow_odgi/slow_odgi/__main__.py +++ b/slow_odgi/slow_odgi/__main__.py @@ -3,7 +3,7 @@ import io from typing import Dict, Tuple, List, Optional from collections.abc import Callable -from mygfa import mygfa +import mygfa from . import ( chop, diff --git a/slow_odgi/slow_odgi/chop.py b/slow_odgi/slow_odgi/chop.py index 1462b9e1..06fe3f14 100644 --- a/slow_odgi/slow_odgi/chop.py +++ b/slow_odgi/slow_odgi/chop.py @@ -1,5 +1,5 @@ from typing import Dict, Tuple -from mygfa import mygfa +import mygfa def chop_segs( diff --git a/slow_odgi/slow_odgi/crush.py b/slow_odgi/slow_odgi/crush.py index 7b363fc7..061663c4 100644 --- a/slow_odgi/slow_odgi/crush.py +++ b/slow_odgi/slow_odgi/crush.py @@ -1,4 +1,5 @@ -from mygfa import mygfa, preprocess +import mygfa +import mygfa.preprocess def crush_seg(seg: mygfa.Segment) -> mygfa.Segment: @@ -23,6 +24,6 @@ def crush(graph: mygfa.Graph) -> mygfa.Graph: graph.headers, crushed_segs, graph.links, - preprocess.drop_all_overlaps(graph.paths), + mygfa.preprocess.drop_all_overlaps(graph.paths), # odgi drops overlaps, so we do too. ) diff --git a/slow_odgi/slow_odgi/degree.py b/slow_odgi/slow_odgi/degree.py index f806f73e..510b3761 100644 --- a/slow_odgi/slow_odgi/degree.py +++ b/slow_odgi/slow_odgi/degree.py @@ -1,10 +1,11 @@ -from mygfa import mygfa, preprocess +import mygfa +import mygfa.preprocess def degree(graph: mygfa.Graph) -> mygfa.Graph: """The degree of a node is just the cardinality of adjlist for that node.""" print("\t".join(["#node.id", "node.degree"])) - ins, outs = preprocess.adjlist(graph) + ins, outs = mygfa.preprocess.adjlist(graph) for seg in graph.segments.values(): segname = seg.name out_degree = len(outs[mygfa.Handle(segname, True)]) + len( diff --git a/slow_odgi/slow_odgi/depth.py b/slow_odgi/slow_odgi/depth.py index 15a7061f..b9e3a7ae 100644 --- a/slow_odgi/slow_odgi/depth.py +++ b/slow_odgi/slow_odgi/depth.py @@ -1,11 +1,12 @@ from typing import List, Optional -from mygfa import mygfa, preprocess +import mygfa +import mygfa.preprocess def depth(graph: mygfa.Graph, inputpaths: Optional[List[str]]) -> mygfa.Graph: """The depth of a node is the cardinality of node_step for that node.""" print("\t".join(["#node.id", "depth", "depth.uniq"])) - for seg, crossings in preprocess.node_steps(graph).items(): + for seg, crossings in mygfa.preprocess.node_steps(graph).items(): # Each crossing is a (path name, index on path, direction) tuple. # We only want to count crossings that are on input paths. crossings = [c for c in crossings if inputpaths is None or c[0] in inputpaths] diff --git a/slow_odgi/slow_odgi/flatten.py b/slow_odgi/slow_odgi/flatten.py index 4ce4af22..c4bcd489 100644 --- a/slow_odgi/slow_odgi/flatten.py +++ b/slow_odgi/slow_odgi/flatten.py @@ -1,5 +1,5 @@ from typing import Tuple -from mygfa import mygfa +import mygfa def get_fasta_legend(graph: mygfa.Graph) -> Tuple[str, mygfa.LegendType]: diff --git a/slow_odgi/slow_odgi/flip.py b/slow_odgi/slow_odgi/flip.py index 382698e9..87093520 100644 --- a/slow_odgi/slow_odgi/flip.py +++ b/slow_odgi/slow_odgi/flip.py @@ -1,6 +1,6 @@ from typing import List, Tuple, Dict from collections.abc import Callable -from mygfa import mygfa +import mygfa def path_is_rev(path: mygfa.Path, graph: mygfa.Graph) -> bool: diff --git a/slow_odgi/slow_odgi/inject.py b/slow_odgi/slow_odgi/inject.py index c1a66aea..673e6d04 100644 --- a/slow_odgi/slow_odgi/inject.py +++ b/slow_odgi/slow_odgi/inject.py @@ -1,5 +1,5 @@ from typing import List, Optional, Tuple -from mygfa import mygfa +import mygfa from . import chop diff --git a/slow_odgi/slow_odgi/inject_setup.py b/slow_odgi/slow_odgi/inject_setup.py index 43ae5518..36ff2dc7 100644 --- a/slow_odgi/slow_odgi/inject_setup.py +++ b/slow_odgi/slow_odgi/inject_setup.py @@ -1,6 +1,7 @@ import sys import random -from mygfa import mygfa, preprocess +import mygfa +import mygfa.preprocess def print_bed(graph: mygfa.Graph) -> None: @@ -12,7 +13,7 @@ def print_bed(graph: mygfa.Graph) -> None: """ random.seed(4) for path in graph.paths.values(): - length = len(preprocess.pathseq(graph)[path.name]) + length = len(mygfa.preprocess.pathseq(graph)[path.name]) for i in range(random.randint(0, 5)): low = random.randint(0, length - 1) high = random.randint(low + 1, length) diff --git a/slow_odgi/slow_odgi/matrix.py b/slow_odgi/slow_odgi/matrix.py index 0711fc9a..a062adb3 100644 --- a/slow_odgi/slow_odgi/matrix.py +++ b/slow_odgi/slow_odgi/matrix.py @@ -1,4 +1,5 @@ -from mygfa import mygfa, preprocess +import mygfa +import mygfa.preprocess def matrix(graph: mygfa.Graph) -> mygfa.Graph: @@ -8,7 +9,7 @@ def matrix(graph: mygfa.Graph) -> mygfa.Graph: topseg = max([int(i) for i in graph.segments.keys()]) print(" ".join(str(i) for i in [topseg, topseg, 2 * len(graph.links)])) - _, outs = preprocess.adjlist(graph) + _, outs = mygfa.preprocess.adjlist(graph) for seg, neighbors in outs.items(): for neighbor in neighbors: print(" ".join([seg.name, neighbor.name, "1"])) diff --git a/slow_odgi/slow_odgi/norm.py b/slow_odgi/slow_odgi/norm.py index 3ac64cab..150a8bd9 100644 --- a/slow_odgi/slow_odgi/norm.py +++ b/slow_odgi/slow_odgi/norm.py @@ -1,5 +1,5 @@ import sys -from mygfa import mygfa +import mygfa def norm(graph: mygfa.Graph) -> mygfa.Graph: diff --git a/slow_odgi/slow_odgi/overlap.py b/slow_odgi/slow_odgi/overlap.py index 828d8611..c1076fca 100644 --- a/slow_odgi/slow_odgi/overlap.py +++ b/slow_odgi/slow_odgi/overlap.py @@ -1,5 +1,6 @@ -from typing import List, Optional -from mygfa import mygfa, preprocess +from typing import List +import mygfa +import mygfa.preprocess def touches(path1: str, path2: str, graph: mygfa.Graph) -> bool: @@ -24,6 +25,8 @@ def overlap(graph: mygfa.Graph, inputpaths: List[str]) -> mygfa.Graph: print("\t".join(["#path", "start", "end", "path.touched"])) header_printed = True print( - "\t".join([ip, "0", str(len(preprocess.pathseq(graph)[ip])), path]) + "\t".join( + [ip, "0", str(len(mygfa.preprocess.pathseq(graph)[ip])), path] + ) ) return graph diff --git a/slow_odgi/slow_odgi/paths.py b/slow_odgi/slow_odgi/paths.py index 0ec4b5b2..688ff0ab 100644 --- a/slow_odgi/slow_odgi/paths.py +++ b/slow_odgi/slow_odgi/paths.py @@ -1,6 +1,6 @@ import sys import random -from mygfa import mygfa +import mygfa def paths(graph: mygfa.Graph, droprate: int = 0) -> mygfa.Graph: diff --git a/slow_odgi/slow_odgi/proofs.py b/slow_odgi/slow_odgi/proofs.py index 1d16a44e..6f199041 100644 --- a/slow_odgi/slow_odgi/proofs.py +++ b/slow_odgi/slow_odgi/proofs.py @@ -1,4 +1,5 @@ -from mygfa import mygfa, preprocess +import mygfa +import mygfa.preprocess def paths_logically_le(g1: mygfa.Graph, g2: mygfa.Graph) -> bool: @@ -6,8 +7,8 @@ def paths_logically_le(g1: mygfa.Graph, g2: mygfa.Graph) -> bool: That is, for all paths p in g1, does the sequence charted by p in g1 match the sequence charted by p in g2? """ - pathseqs_g1 = preprocess.pathseq(g1) - pathseqs_g2 = preprocess.pathseq(g2) + pathseqs_g1 = mygfa.preprocess.pathseq(g1) + pathseqs_g2 = mygfa.preprocess.pathseq(g2) for p in g1.paths.keys(): if p not in g2.paths.keys() or pathseqs_g1[p] != pathseqs_g2[p]: return False diff --git a/slow_odgi/slow_odgi/validate.py b/slow_odgi/slow_odgi/validate.py index 7727e693..ddbdc63f 100644 --- a/slow_odgi/slow_odgi/validate.py +++ b/slow_odgi/slow_odgi/validate.py @@ -1,9 +1,10 @@ -from mygfa import mygfa, preprocess +import mygfa +import mygfa.preprocess def validate(graph: mygfa.Graph) -> mygfa.Graph: """Does the underlying set of Links support the paths that the graph has?""" - _, outs = preprocess.adjlist(graph) + _, outs = mygfa.preprocess.adjlist(graph) for path in graph.paths.values(): length = len(path.segments) diff --git a/slow_odgi/slow_odgi/validate_setup.py b/slow_odgi/slow_odgi/validate_setup.py index 4f6c87ea..242b425b 100644 --- a/slow_odgi/slow_odgi/validate_setup.py +++ b/slow_odgi/slow_odgi/validate_setup.py @@ -1,6 +1,6 @@ import sys import random -from mygfa import mygfa +import mygfa def drop_some_links(graph: mygfa.Graph) -> mygfa.Graph: