Skip to content

Commit

Permalink
mygfa: A little bit of cleanup (#161)
Browse files Browse the repository at this point in the history
  • Loading branch information
sampsyo authored Mar 23, 2024
2 parents 3b54da3 + df3a70e commit 5e10abc
Show file tree
Hide file tree
Showing 27 changed files with 83 additions and 56 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-quality.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ jobs:
- run: pip install --upgrade pip
- run: pip install "black<24" mypy==v1.3
- run: black --diff --check $(git ls-files '*.py')
- run: MYPYPATH=mygfa mypy --disallow-untyped-defs mygfa slow_odgi pollen_data_gen
- run: MYPYPATH=mygfa mypy --no-namespace-packages --disallow-untyped-defs mygfa slow_odgi pollen_data_gen
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ __pycache__/
**/*.validate
**/*.og
**/*.out
**/*.flatgfa
*.json
og_to_gfa.py
compute_maxes.py
Expand Down
11 changes: 3 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
TEST_FILES := t k note5 overlap q.chop LPA DRB1-3123 chr6.C4
BASIC_TESTS := ex1 ex2
GFA_URL := https://raw.githubusercontent.com/pangenome/odgi/ebc493f2622f49f1e67c63c1935d68967cd16d85/test
GFA_ZIP_URL := https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/scratch/2021_05_06_pggb/gfas/chr8.pan.gfa.gz

# A smaller set of test inputs for faster runs.
ifdef SMALL
Expand Down Expand Up @@ -41,7 +40,7 @@ test-slow-odgi: slow-odgi-setup slow-odgi-oracles slow-odgi-tests
# Produce some input files that are necessary for the slow_odgi tests.
slow-odgi-setup: og
-turnt -j --save --env depth_setup --env inject_setup \
--env overlap_setup --env validate_setup tests/*.gfa
--env overlap_setup --env validate_setup $(TEST_FILES:%=tests/%.og)

# Produce the oracle output (from "real" odgi) for each test input. Run this
# once, noisily, to obtain the expected outputs. Then run `slow-odgi-tests` to
Expand All @@ -52,7 +51,7 @@ ORACLES := chop_oracle crush_oracle degree_oracle depth_oracle \
flip_oracle flatten_oracle inject_oracle matrix_oracle overlap_oracle \
paths_oracle validate_oracle
slow-odgi-oracles: og
-turnt -j --save $(ORACLES:%=--env %) tests/*.og
-turnt -j --save $(ORACLES:%=--env %) $(TEST_FILES:%=tests/%.og)
-turnt -j --save --env validate_oracle_err tests/invalid/*.gfa
-turnt -j --save --env crush_oracle tests/handmade/crush*.gfa
-turnt -j --save --env flip_oracle tests/handmade/flip*.gfa
Expand All @@ -63,7 +62,7 @@ slow-odgi-oracles: og
TEST_ENVS := chop_test crush_test degree_test depth_test flip_test \
flatten_test inject_test matrix_test overlap_test paths_test validate_test
slow-odgi-tests:
-turnt -j $(TEST_ENVS:%=--env %) tests/*.gfa
-turnt -j $(TEST_ENVS:%=--env %) $(TEST_FILES:%=tests/%.gfa)
-turnt -j --env validate_test tests/invalid/*.gfa
-turnt -j --env crush_test tests/handmade/crush*.gfa
-turnt -j --env flip_test tests/handmade/flip*.gfa
Expand All @@ -81,10 +80,6 @@ clean:
rm -rf tests/handmade/*.flip
rm -rf tests/invalid/*.*

tests/chr8.pan.gfa:
curl -Lo ./tests/chr8.pan.gfa.gz $(GFA_ZIP_URL)
gunzip ./tests/chr8.pan.gfa.gz

tests/%.gfa:
curl -Lo ./$@ $(GFA_URL)/$*.gfa

Expand Down
19 changes: 19 additions & 0 deletions mygfa/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import sys
import mygfa


def print_depth(graph: mygfa.Graph) -> None:
    """Print a tab-separated table of per-segment path depth.

    The depth of a segment is the total number of steps, across every
    path in `graph.paths`, whose `name` matches that segment. Segments
    that no path visits are still listed, with a depth of 0.

    Output goes to standard output: a `seg`/`depth` header row followed
    by one row per entry of `graph.segments`, in its iteration order.
    """
    # Seed every known segment with zero so unvisited ones still get a row.
    visits = dict.fromkeys(graph.segments, 0)

    # Tally one visit per step; a step naming a segment that is absent
    # from `graph.segments` is not tolerated here and fails the lookup.
    for walk in graph.paths.values():
        for handle in walk.segments:
            visits[handle.name] += 1

    # Emit the header, then one row per segment.
    print("seg\tdepth")
    for seg_name in visits:
        print(f"{seg_name}\t{visits[seg_name]}")


if __name__ == "__main__":
    # Script entry point: parse a GFA graph from standard input and
    # print the depth table for it.
    input_graph = mygfa.Graph.parse(sys.stdin)
    print_depth(input_graph)
4 changes: 3 additions & 1 deletion mygfa/mygfa/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""GFA parsing and pre-processing in Python."""
"""Simple GFA parsing, printing, and pre-processing in Python."""

from .gfa import * # noqa

__version__ = "0.1"
10 changes: 10 additions & 0 deletions mygfa/mygfa/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import sys
from .gfa import Graph


if __name__ == "__main__":
    # Round-trip a GFA graph from standard input to standard output.
    mygraph = Graph.parse(sys.stdin)

    # When the first command-line argument is exactly `--nl`, pass False as
    # the second argument to `emit`; otherwise pass True, which is what the
    # one-argument call relies on as the default.
    no_links_requested = sys.argv[1:2] == ["--nl"]
    mygraph.emit(sys.stdout, not no_links_requested)
14 changes: 2 additions & 12 deletions mygfa/mygfa/mygfa.py → mygfa/mygfa/gfa.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import re
import sys

from collections import OrderedDict
from dataclasses import dataclass
from enum import Enum
Expand Down Expand Up @@ -153,7 +151,7 @@ def linkstr(self) -> str:

@dataclass(eq=True, order=True)
class Link:
"""A GFA link is an edge connecting two sequences."""
"""A GFA link is an edge connecting two segments."""

from_: Handle
to_: Handle
Expand Down Expand Up @@ -197,7 +195,7 @@ def __str__(self) -> str:

@dataclass
class Path:
"""A GFA path is an ordered series of links."""
"""A GFA path is a walk through the graph."""

name: str
segments: List[Handle] # Segment names and orientations.
Expand Down Expand Up @@ -306,11 +304,3 @@ def emit(self, outfile: TextIO, showlinks: bool = True) -> None:
if showlinks:
for link in sorted(self.links):
print(str(link), file=outfile)


if __name__ == "__main__":
mygraph = Graph.parse(sys.stdin)
if len(sys.argv) > 1 and sys.argv[1] == "--nl":
mygraph.emit(sys.stdout, False)
else:
mygraph.emit(sys.stdout)
2 changes: 1 addition & 1 deletion mygfa/mygfa/preprocess.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import List, Tuple, Dict
from . import mygfa
from . import gfa as mygfa


def node_steps(graph: mygfa.Graph) -> Dict[str, List[Tuple[str, int, bool]]]:
Expand Down
2 changes: 1 addition & 1 deletion pollen_data_gen/pollen_data_gen/__main__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys
import argparse
from mygfa import mygfa
import mygfa
from typing import List

from . import depth, simple
Expand Down
9 changes: 5 additions & 4 deletions pollen_data_gen/pollen_data_gen/depth.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import sys
from typing import Any, Collection, Dict, OrderedDict, Union, Optional, List
from typing import Any, Collection, Dict, Union, Optional, List
import json
from json import JSONEncoder
from mygfa import mygfa, preprocess
import mygfa
import mygfa.preprocess


FormatType = Dict[str, Union[bool, str, int]]
Expand All @@ -24,7 +25,7 @@ def paths_viewed_from_nodes(
output = {}
json_format = format_gen(max_p.bit_length())
# segment name, (path name, index on path, direction) list
for seg, crossings in preprocess.node_steps(graph).items():
for seg, crossings in mygfa.preprocess.node_steps(graph).items():
data = list(path2id[c[0]] for c in crossings)
data = data + [0] * (max_e - len(data))
output[f"path_ids{seg}"] = {"data": data, "format": json_format}
Expand Down Expand Up @@ -115,7 +116,7 @@ def depth_json(
"""Returns a JSON representation of `graph`
that is specific to the exine command `depth`.
"""
n_tight, e_tight, p_tight = preprocess.get_maxes(graph)
n_tight, e_tight, p_tight = mygfa.preprocess.get_maxes(graph)
# These values have been calculated automatically, and are likely optimal.
# However, they are only to be used when the user does not supply them via CLI.
if not max_n:
Expand Down
3 changes: 1 addition & 2 deletions pollen_data_gen/pollen_data_gen/simple.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# import sys
import json
from typing import Dict, Union, Optional, Any, List, Sequence, TextIO
from io import TextIOWrapper
from json import JSONEncoder
from mygfa import mygfa
import mygfa
from . import depth


Expand Down
2 changes: 1 addition & 1 deletion slow_odgi/slow_odgi/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import io
from typing import Dict, Tuple, List, Optional
from collections.abc import Callable
from mygfa import mygfa
import mygfa

from . import (
chop,
Expand Down
2 changes: 1 addition & 1 deletion slow_odgi/slow_odgi/chop.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict, Tuple
from mygfa import mygfa
import mygfa


def chop_segs(
Expand Down
5 changes: 3 additions & 2 deletions slow_odgi/slow_odgi/crush.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from mygfa import mygfa, preprocess
import mygfa
import mygfa.preprocess


def crush_seg(seg: mygfa.Segment) -> mygfa.Segment:
Expand All @@ -23,6 +24,6 @@ def crush(graph: mygfa.Graph) -> mygfa.Graph:
graph.headers,
crushed_segs,
graph.links,
preprocess.drop_all_overlaps(graph.paths),
mygfa.preprocess.drop_all_overlaps(graph.paths),
# odgi drops overlaps, so we do too.
)
5 changes: 3 additions & 2 deletions slow_odgi/slow_odgi/degree.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from mygfa import mygfa, preprocess
import mygfa
import mygfa.preprocess


def degree(graph: mygfa.Graph) -> mygfa.Graph:
"""The degree of a node is just the cardinality of adjlist for that node."""
print("\t".join(["#node.id", "node.degree"]))
ins, outs = preprocess.adjlist(graph)
ins, outs = mygfa.preprocess.adjlist(graph)
for seg in graph.segments.values():
segname = seg.name
out_degree = len(outs[mygfa.Handle(segname, True)]) + len(
Expand Down
5 changes: 3 additions & 2 deletions slow_odgi/slow_odgi/depth.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from typing import List, Optional
from mygfa import mygfa, preprocess
import mygfa
import mygfa.preprocess


def depth(graph: mygfa.Graph, inputpaths: Optional[List[str]]) -> mygfa.Graph:
"""The depth of a node is the cardinality of node_step for that node."""
print("\t".join(["#node.id", "depth", "depth.uniq"]))
for seg, crossings in preprocess.node_steps(graph).items():
for seg, crossings in mygfa.preprocess.node_steps(graph).items():
# Each crossing is a (path name, index on path, direction) tuple.
# We only want to count crossings that are on input paths.
crossings = [c for c in crossings if inputpaths is None or c[0] in inputpaths]
Expand Down
2 changes: 1 addition & 1 deletion slow_odgi/slow_odgi/flatten.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Tuple
from mygfa import mygfa
import mygfa


def get_fasta_legend(graph: mygfa.Graph) -> Tuple[str, mygfa.LegendType]:
Expand Down
2 changes: 1 addition & 1 deletion slow_odgi/slow_odgi/flip.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Tuple, Dict
from collections.abc import Callable
from mygfa import mygfa
import mygfa


def path_is_rev(path: mygfa.Path, graph: mygfa.Graph) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion slow_odgi/slow_odgi/inject.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import List, Optional, Tuple
from mygfa import mygfa
import mygfa
from . import chop


Expand Down
5 changes: 3 additions & 2 deletions slow_odgi/slow_odgi/inject_setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
import random
from mygfa import mygfa, preprocess
import mygfa
import mygfa.preprocess


def print_bed(graph: mygfa.Graph) -> None:
Expand All @@ -12,7 +13,7 @@ def print_bed(graph: mygfa.Graph) -> None:
"""
random.seed(4)
for path in graph.paths.values():
length = len(preprocess.pathseq(graph)[path.name])
length = len(mygfa.preprocess.pathseq(graph)[path.name])
for i in range(random.randint(0, 5)):
low = random.randint(0, length - 1)
high = random.randint(low + 1, length)
Expand Down
5 changes: 3 additions & 2 deletions slow_odgi/slow_odgi/matrix.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from mygfa import mygfa, preprocess
import mygfa
import mygfa.preprocess


def matrix(graph: mygfa.Graph) -> mygfa.Graph:
Expand All @@ -8,7 +9,7 @@ def matrix(graph: mygfa.Graph) -> mygfa.Graph:
topseg = max([int(i) for i in graph.segments.keys()])
print(" ".join(str(i) for i in [topseg, topseg, 2 * len(graph.links)]))

_, outs = preprocess.adjlist(graph)
_, outs = mygfa.preprocess.adjlist(graph)
for seg, neighbors in outs.items():
for neighbor in neighbors:
print(" ".join([seg.name, neighbor.name, "1"]))
Expand Down
2 changes: 1 addition & 1 deletion slow_odgi/slow_odgi/norm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import sys
from mygfa import mygfa
import mygfa


def norm(graph: mygfa.Graph) -> mygfa.Graph:
Expand Down
9 changes: 6 additions & 3 deletions slow_odgi/slow_odgi/overlap.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import List, Optional
from mygfa import mygfa, preprocess
from typing import List
import mygfa
import mygfa.preprocess


def touches(path1: str, path2: str, graph: mygfa.Graph) -> bool:
Expand All @@ -24,6 +25,8 @@ def overlap(graph: mygfa.Graph, inputpaths: List[str]) -> mygfa.Graph:
print("\t".join(["#path", "start", "end", "path.touched"]))
header_printed = True
print(
"\t".join([ip, "0", str(len(preprocess.pathseq(graph)[ip])), path])
"\t".join(
[ip, "0", str(len(mygfa.preprocess.pathseq(graph)[ip])), path]
)
)
return graph
2 changes: 1 addition & 1 deletion slow_odgi/slow_odgi/paths.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys
import random
from mygfa import mygfa
import mygfa


def paths(graph: mygfa.Graph, droprate: int = 0) -> mygfa.Graph:
Expand Down
7 changes: 4 additions & 3 deletions slow_odgi/slow_odgi/proofs.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from mygfa import mygfa, preprocess
import mygfa
import mygfa.preprocess


def paths_logically_le(g1: mygfa.Graph, g2: mygfa.Graph) -> bool:
"""Are the paths in g1 logically "less than or equal to" those in g2?
That is, for all paths p in g1, does the sequence charted by
p in g1 match the sequence charted by p in g2?
"""
pathseqs_g1 = preprocess.pathseq(g1)
pathseqs_g2 = preprocess.pathseq(g2)
pathseqs_g1 = mygfa.preprocess.pathseq(g1)
pathseqs_g2 = mygfa.preprocess.pathseq(g2)
for p in g1.paths.keys():
if p not in g2.paths.keys() or pathseqs_g1[p] != pathseqs_g2[p]:
return False
Expand Down
5 changes: 3 additions & 2 deletions slow_odgi/slow_odgi/validate.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from mygfa import mygfa, preprocess
import mygfa
import mygfa.preprocess


def validate(graph: mygfa.Graph) -> mygfa.Graph:
"""Does the underlying set of Links support the paths that the graph has?"""
_, outs = preprocess.adjlist(graph)
_, outs = mygfa.preprocess.adjlist(graph)

for path in graph.paths.values():
length = len(path.segments)
Expand Down
2 changes: 1 addition & 1 deletion slow_odgi/slow_odgi/validate_setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys
import random
from mygfa import mygfa
import mygfa


def drop_some_links(graph: mygfa.Graph) -> mygfa.Graph:
Expand Down

0 comments on commit 5e10abc

Please sign in to comment.