Skip to content

Commit

Permalink
Merge pull request #6 from alea-institute/v0.1.4
Browse files Browse the repository at this point in the history
merge 0.1.4
  • Loading branch information
mjbommar authored Sep 4, 2024
2 parents 6fe8538 + 013790f commit 203b9b8
Show file tree
Hide file tree
Showing 10 changed files with 249 additions and 19 deletions.
6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
Version 0.1.4 (2024-09-04)
---------------------------
* Add prefix search for typeahead/search bars (with optional trie-based search)
* Enhanced sort order for _basic_search (search_by_label, search_by_definition)


Version 0.1.3 (2024-09-03)
---------------------------
* Separate rapidfuzz dependency into optional [search] extra
Expand Down
15 changes: 14 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,13 @@ from soli import SOLI
# Initialize the SOLI client
soli = SOLI()

# Search by prefix
results = soli.search_by_prefix("Mich")
for owl_class in results:
print(f"Class: {owl_class.label}")

# Search for a class by label
results = soli.search_by_label("Contract")
results = soli.search_by_label("Mich")
for owl_class, score in results:
print(f"Class: {owl_class.label}, Score: {score}")

Expand All @@ -69,6 +74,14 @@ We welcome contributions to the SOLI Python library! If you'd like to contribute

For more information, see our [contribution guidelines](CONTRIBUTING.md).

## SOLI API
A public, freely accessible API is available for the SOLI ontology.

The API is hosted at [https://soli.openlegalstandard.org/](https://soli.openlegalstandard.org/).

The source code for the API is available on GitHub at [https://github.com/alea-institute/soli-api](https://github.com/alea-institute/soli-api).


## License

The SOLI Python library is released under the MIT License. See the [LICENSE](LICENSE) file for details.
Expand Down
2 changes: 1 addition & 1 deletion docker/ubuntu2204-install/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM ubuntu:22.04

# define package version
ARG SOLI_VERSION=0.1.3
ARG SOLI_VERSION=0.1.4

# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive
Expand Down
2 changes: 1 addition & 1 deletion docker/ubuntu2404-install/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM ubuntu:24.04

# define package version
ARG SOLI_VERSION=0.1.3
ARG SOLI_VERSION=0.1.4

# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
project = "soli-python"
copyright = "2024, ALEA Institute"
author = "ALEA Institute (https://aleainstitute.ai)"
release = "0.1.3"
version = "0.1.3"
release = "0.1.4"
version = "0.1.4"
master_doc = "index"
language = "en"

Expand Down
127 changes: 125 additions & 2 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "soli-python"
version = "0.1.3"
version = "0.1.4"
description = "Python library for SOLI, the Standard for Open Legal Information"
authors = ["ALEA Institute <[email protected]>"]
license = "MIT"
Expand Down Expand Up @@ -40,6 +40,8 @@ pydantic = "^2.8.2"
lxml = "^5.2.2"
httpx = "^0.27.2"
rapidfuzz = {version = "^3.9.7", optional = true}
marisa-trie = {version = "^1.2.0", optional = true}


[tool.poetry.group.dev.dependencies]
types-lxml = "^2024.8.7"
Expand All @@ -59,10 +61,11 @@ sphinx-plausible = "^0.1.2"

[tool.poetry.group.search.dependencies]
rapidfuzz = "^3.9.7"
marisa-trie = "^1.2.0"

# extras
[tool.poetry.extras]
search = ["rapidfuzz"]
search = ["rapidfuzz", "marisa-trie"]


[build-system]
Expand Down
2 changes: 1 addition & 1 deletion soli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# SPDX-License-Identifier: MIT
# (c) 2024 ALEA Institute.

__version__ = "0.1.3"
__version__ = "0.1.4"
__author__ = "ALEA Institute"
__license__ = "MIT"
__description__ = "Python library for SOLI, the Standard for Open Legal Information"
Expand Down
90 changes: 81 additions & 9 deletions soli/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,20 +110,30 @@ class SOLITypes(Enum):
# IRI max generation attempt for safety.
MAX_IRI_ATTEMPTS: int = 16

# minimum length for prefix search
MIN_PREFIX_LENGTH: int = 3

# Set up logger
LOGGER = get_logger(__name__)


# try to import rapidfuzz with importlib; log if not able to.
# try to import rapidfuzz and marisa_trie with importlib; log if not able to.
try:
    # rapidfuzz is optional; when it is missing the module-level name is
    # bound to None so callers can test for it.
    if importlib.util.find_spec("rapidfuzz") is not None:
        import rapidfuzz
    else:
        LOGGER.warning("Disabling search functionality: rapidfuzz not found.")
        rapidfuzz = None
except ImportError as e:
    LOGGER.warning("Failed to check for search functionality: %s", e)
    rapidfuzz = None

# marisa_trie is probed in its own try block so that a failure while probing
# rapidfuzz does not also disable the trie (and vice versa).
try:
    if importlib.util.find_spec("marisa_trie") is not None:
        import marisa_trie
    else:
        # only the trie is lost: prefix search has a pure-Python fallback
        LOGGER.warning(
            "Disabling trie-based prefix search: marisa_trie not found; "
            "falling back to pure-Python prefix matching."
        )
        marisa_trie = None
except ImportError as e:
    LOGGER.warning("Failed to check for trie-based prefix search: %s", e)
    marisa_trie = None


# pylint: disable=too-many-instance-attributes
Expand Down Expand Up @@ -178,6 +188,8 @@ def __init__(
self.alt_label_to_index: Dict[str, List[int]] = {}
self.class_edges: Dict[str, List[str]] = {}
self._cached_triples: Tuple[Tuple[str, str, str], ...] = ()
self._label_trie: Optional[marisa_trie.Trie] = None
self._prefix_cache: Dict[str, List[OWLClass]] = {}
self.triples: List[Tuple[str, str, str]] = []

# load the ontology
Expand Down Expand Up @@ -770,6 +782,16 @@ def parse_owl(self, buffer: str) -> None:
# freeze triple tuples
self._cached_triples = tuple(self.triples)

# now create the Trie for the labels in label_to_index and alt_label_to_index
if marisa_trie is not None:
all_labels = [
label
for label in list(self.label_to_index.keys())
+ list(self.alt_label_to_index.keys())
if len(label) >= MIN_PREFIX_LENGTH
]
self._label_trie = marisa_trie.Trie(all_labels)

def get_subgraph(
self, iri: str, max_depth: int = DEFAULT_MAX_DEPTH
) -> List[OWLClass]:
Expand Down Expand Up @@ -994,6 +1016,52 @@ def refresh(self) -> None:
end_time = time.time()
LOGGER.info("Parsed SOLI ontology in %.2f seconds", end_time - start_time)

def search_by_prefix(self, prefix: str) -> List[OWLClass]:
    """
    Search for classes whose label or alternative label starts with a prefix.

    Matching labels are ordered by length (shortest first), with ties broken
    alphabetically so the order is deterministic. Results are memoized per
    prefix in ``self._prefix_cache``; the cached list itself is returned, so
    callers should not mutate it.

    Args:
        prefix (str): The prefix to search for. Matching is case-sensitive.

    Returns:
        List[OWLClass]: The OWL classes with a matching label or alternative
            label, each listed once, in match order.
    """
    # serve repeated lookups from the per-prefix cache
    if prefix in self._prefix_cache:
        return self._prefix_cache[prefix]

    if self._label_trie is not None:
        # Guard on the trie itself rather than on the marisa_trie module:
        # the trie is only built while parsing, so it can still be None
        # even when the package is installed.
        # NOTE: the trie only holds labels of at least MIN_PREFIX_LENGTH
        # characters, so shorter labels are not returned on this path.
        matches = self._label_trie.keys(prefix)
    else:
        # pure-Python fallback; a set drops labels present in both indexes
        matches = {
            label
            for index in (self.label_to_index, self.alt_label_to_index)
            for label in index
            if label.startswith(prefix)
        }

    # shortest labels first; alphabetical tie-break keeps the order stable
    keys = sorted(matches, key=lambda label: (len(label), label))

    # collect the class indices behind every matching label
    indices = []
    for key in keys:
        indices.extend(self.label_to_index.get(key, ()))
        indices.extend(self.alt_label_to_index.get(key, ()))

    # A class reachable through several matching labels is listed once;
    # dict.fromkeys drops repeats while keeping first-seen order.
    classes = [self[index] for index in dict.fromkeys(indices)]
    self._prefix_cache[prefix] = classes

    return classes

@staticmethod
@cache
def _basic_search(
Expand All @@ -1015,14 +1083,18 @@ def _basic_search(
List[Tuple[str, int | float, int]]: The list of search results with
the string, the search score, and the index.
"""
return rapidfuzz.process.extract( # type: ignore
query,
search_list,
scorer=rapidfuzz.fuzz.WRatio
if search_type == "string"
else rapidfuzz.fuzz.partial_token_set_ratio,
processor=rapidfuzz.utils.default_process,
limit=limit,
return sorted(
rapidfuzz.process.extract( # type: ignore
query,
search_list,
scorer=rapidfuzz.fuzz.WRatio
if search_type == "string"
else rapidfuzz.fuzz.partial_token_set_ratio,
processor=rapidfuzz.utils.default_process,
limit=limit,
),
# sort first by score, then by length of text
key=lambda x: (-x[1], len(x[0])),
)

def search_by_label(
Expand Down
13 changes: 13 additions & 0 deletions tests/test_soli.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,13 @@ def test_all_formatters(soli_graph):
assert c.to_owl_xml() is not None


def test_search_prefix(soli_graph):
    """Prefix search for "Mich" should rank the Michigan class first."""
    results = soli_graph.search_by_prefix("Mich")

    # fail loudly on an empty result instead of silently skipping the checks
    assert results, "expected at least one match for prefix 'Mich'"

    first = results[0]
    assert first.label == "Michigan"
    assert "US+MI" in first.alternative_labels


def test_search_label(soli_graph):
for c, score in soli_graph.search_by_label("Georgia"):
assert "Georgia" in c.label
Expand Down Expand Up @@ -349,6 +356,12 @@ def get_parents():
return soli_graph.get_parents(pb_iri)


def test_benchmark_search_prefix(benchmark, soli_graph):
    """Benchmark a prefix lookup for "Mich" against the loaded graph."""

    def _lookup():
        return soli_graph.search_by_prefix("Mich")

    # equivalent to decorating the helper with @benchmark
    benchmark(_lookup)


def test_benchmark_search_labels(benchmark, soli_graph):
@benchmark
def search_labels():
Expand Down

0 comments on commit 203b9b8

Please sign in to comment.