From 559ab5acd9f51f9176463098c4db9ef3a9d91f93 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 24 Jul 2023 16:05:36 -0400 Subject: [PATCH 01/74] add beginnings of a package --- .gitignore | 160 +++++++++++++++++++++++ compressors.py | 13 +- data.py | 12 +- experiments.py | 43 +++++-- main_text.py | 21 +-- npc_gzip/__init__.py | 3 + npc_gzip/compressors/__init__.py | 0 npc_gzip/compressors/base.py | 78 +++++++++++ npc_gzip/compressors/gzip_compressor.py | 16 +++ npc_gzip/compressors/lzma_compressor.py | 29 +++++ npc_gzip/exceptions.py | 35 +++++ poetry.lock | 164 ++++++++++++++++++++++++ pyproject.toml | 19 +++ utils.py | 31 +++-- 14 files changed, 571 insertions(+), 53 deletions(-) create mode 100644 .gitignore create mode 100644 npc_gzip/__init__.py create mode 100644 npc_gzip/compressors/__init__.py create mode 100644 npc_gzip/compressors/base.py create mode 100644 npc_gzip/compressors/gzip_compressor.py create mode 100644 npc_gzip/compressors/lzma_compressor.py create mode 100644 npc_gzip/exceptions.py create mode 100644 poetry.lock create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..6769e21d --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/compressors.py b/compressors.py index d4f2e125..35e13dfc 100644 --- a/compressors.py +++ b/compressors.py @@ -1,14 +1,13 @@ # Compressor Framework +import io import sys from importlib import import_module -from tqdm import tqdm -import torch.nn.functional as F -import io class DefaultCompressor: """for non-neural-based compressor""" - def __init__(self, compressor, typ='text'): + + def __init__(self, compressor, typ="text"): try: self.compressor = import_module(compressor) except ModuleNotFoundError: @@ -38,6 +37,6 @@ def get_bits_per_char(self, original_fn: str) -> float: """Test Compressors""" -if __name__ == '__main__': - comp = DefaultCompressor('gzip') - print(comp.get_compressed_len('Hello world')) +if __name__ == "__main__": + comp = DefaultCompressor("gzip") + print(comp.get_compressed_len("Hello world")) diff --git a/data.py b/data.py index d6a7d12b..9cce1747 100644 --- a/data.py +++ b/data.py @@ -321,6 +321,7 @@ def load_filipino(): Returns: tuple: Tuple of lists containing the training and testing datasets respectively. """ + def process(dataset: list) -> list: label_dict = OrderedDict() d = {"absent": 0, "dengue": 1, "health": 2, "mosquito": 3, "sick": 4} @@ -526,15 +527,16 @@ def pick_n_sample_from_each_class_img( return result, labels, recorded_idx -def load_custom_dataset(di, delimiter='\t'): +def load_custom_dataset(di, delimiter="\t"): def process(fn): l = [] - text_list = open(fn).read().strip().split('\n') + text_list = open(fn).read().strip().split("\n") for t in text_list: label, text = t.split(delimiter) - l.append((label,text)) + l.append((label, text)) return l - test_fn = os.path.join(di, 'test.txt') - train_fn = os.path.join(di, 'train.txt') + + test_fn = os.path.join(di, "test.txt") + train_fn = os.path.join(di, "train.txt") train_ds, test_ds = process(train_fn), process(test_fn) return train_ds, test_ds diff --git a/experiments.py b/experiments.py index d45de132..c2a132bc 100644 --- a/experiments.py +++ b/experiments.py @@ -9,17 +9,24 @@ from functools import partial from itertools import repeat from statistics import mode +from typing import Callable import numpy as np import torch -from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score +from sklearn.metrics.cluster import (adjusted_rand_score, + normalized_mutual_info_score) from tqdm import tqdm -from typing import Callable from compressors import DefaultCompressor + class KnnExpText: - def __init__(self, aggregation_function: Callable, compressor: DefaultCompressor, distance_function: Callable): + def __init__( + self, + aggregation_function: Callable, + compressor: DefaultCompressor, + distance_function: Callable, + ): self.aggregation_func = aggregation_function self.compressor = compressor self.distance_func = distance_function @@ -81,7 +88,7 @@ def calc_dis_with_single_compressed_given( Returns: None: None """ - + data_to_compare = data if train_data is not None: data_to_compare = train_data @@ -106,7 +113,7 @@ def calc_dis_with_single_compressed_given( def calc_dis_single(self, t1: str, t2: str) -> float: """ - Calculates the distance between `t1` and `t2` and returns + Calculates the distance between `t1` and `t2` and returns that distance value as a float-like object. Arguments: @@ -127,7 +134,7 @@ def calc_dis_single(self, t1: str, t2: str) -> float: def calc_dis_single_multi(self, train_data: list, datum: str) -> list: """ - Calculates the distance between `train_data` and `datum` and returns + Calculates the distance between `train_data` and `datum` and returns that distance value as a float-like object. Arguments: @@ -151,7 +158,7 @@ def calc_dis_single_multi(self, train_data: list, datum: str) -> list: def calc_dis_with_vector(self, data: list, train_data: list = None): """ - Calculates the distance between `train_data` and `data` and returns + Calculates the distance between `train_data` and `data` and returns that distance value as a float-like object. Arguments: @@ -174,7 +181,12 @@ def calc_dis_with_vector(self, data: list, train_data: list = None): self.distance_matrix.append(distance4i) def calc_acc( - self, k: int, label: str, train_label: str = None, provided_distance_matrix: list = None, rand: bool = False + self, + k: int, + label: str, + train_label: str = None, + provided_distance_matrix: list = None, + rand: bool = False, ) -> tuple: """ Calculates the accuracy of the algorithm. @@ -187,7 +199,7 @@ def calc_acc( rand (bool): TODO Returns: - tuple: predictions, and list of bools indicating prediction correctness. + tuple: predictions, and list of bools indicating prediction correctness. """ if provided_distance_matrix is not None: @@ -233,7 +245,14 @@ def calc_acc( print("Accuracy is {}".format(sum(correct) / len(correct))) return pred, correct - def combine_dis_acc(self, k: int, data: list, label: str, train_data: list = None, train_label: str = None) -> tuple: + def combine_dis_acc( + self, + k: int, + data: list, + label: str, + train_data: list = None, + train_label: str = None, + ) -> tuple: correct = [] pred = [] if train_label is not None: @@ -272,7 +291,9 @@ def combine_dis_acc(self, k: int, data: list, label: str, train_data: list = Non print("Accuracy is {}".format(sum(correct) / len(correct))) return pred, correct - def combine_dis_acc_single(self, k: int, train_data: list, train_label: str, datum: list, label: str): + def combine_dis_acc_single( + self, k: int, train_data: list, train_label: str, datum: list, label: str + ): # Support multi processing - must provide train data and train label distance4i = self.calc_dis_single_multi(train_data, datum) sorted_idx = np.argpartition(np.array(distance4i), range(k)) diff --git a/main_text.py b/main_text.py index 5f467181..27ec38a8 100644 --- a/main_text.py +++ b/main_text.py @@ -1,23 +1,16 @@ import argparse import time from functools import partial +from typing import Callable from pathos.multiprocessing import ProcessingPool as Pool -from torchtext.datasets import ( - AG_NEWS, - IMDB, - AmazonReviewPolarity, - DBpedia, - SogouNews, - YahooAnswers, - YelpReviewPolarity, -) +from torchtext.datasets import (AG_NEWS, IMDB, AmazonReviewPolarity, DBpedia, + SogouNews, YahooAnswers, YelpReviewPolarity) from compressors import * from data import * from experiments import * from utils import * -from typing import Callable # np.random.seed(6) @@ -146,7 +139,7 @@ def non_neurl_knn_exp_given_dis(dis_matrix, k, test_label, train_label): "swahili": 6, "filipino": 5, "kirnews": 14, - "custom": args.class_num + "custom": args.class_num, } # load dataset data_dir = os.path.join(args.data_dir, args.dataset) @@ -244,11 +237,7 @@ def non_neurl_knn_exp_given_dis(dis_matrix, k, test_label, train_label): else: start_idx = args.test_idx_start for i in range(0, len(test_data), 100): - print( - "from {} to {}".format( - start_idx + i, start_idx + i + 100 - ) - ) + print("from {} to {}".format(start_idx + i, start_idx + i + 100)) output_rel_fn = "test_dis_idx_from_{}_to_{}".format( start_idx + i, start_idx + i + 100 ) diff --git a/npc_gzip/__init__.py b/npc_gzip/__init__.py new file mode 100644 index 00000000..e2fb446d --- /dev/null +++ b/npc_gzip/__init__.py @@ -0,0 +1,3 @@ +from .exceptions import * +from . import compressors + diff --git a/npc_gzip/compressors/__init__.py b/npc_gzip/compressors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/npc_gzip/compressors/base.py b/npc_gzip/compressors/base.py new file mode 100644 index 00000000..cd95bb04 --- /dev/null +++ b/npc_gzip/compressors/base.py @@ -0,0 +1,78 @@ +from types import ModuleType +from npc_gzip.exceptions import InvalidCompressorException + +class BaseCompressor: + """ + Default compressor class that other compressors inherit + from. + + """ + + def __init__(self, compressor: ModuleType): + self.compressor = compressor + + def _compress(self, x: str) -> bytes: + """ + Applies the compression algorithm to `x` and + returns the results as bytes. + + Arguments: + x (str): The string you want to compress. + + Returns: + bytes: The compressed bytes representation of `x`. + """ + + x = str(x) + x = x.encode('utf-8') + compressed = self.compressor.compress(x) + return compressed + + def get_compressed_length(self, x: str) -> int: + """ + Calculates the size of `x` once compressed. + + Arguments: + x (str): String you want to compute the compressed length of. + + Returns: + int: Length of `x` once compressed. + """ + + compressed_value: bytes = self._compress(x) + compressed_length: int = len(compressed_value) + return compressed_length + + def get_bits_per_char(self, x: str) -> float: + """ + Returns the compressed size of `x` relative to + the number of characters in string `x`. + + Arguments: + x (str): String you want to compute the compressed length per + character in. + + Returns: + float: Length of `x` once compressed divided by the number of + characters in `x`. + """ + + compressed_value: bytes = self._compress(x) + compressed_length: int = len(compressed_value) + compressed_length_in_bits: int = compressed_length * 8 + number_of_characters: int = len(x) + + compressed_length_per_number_of_characters: float = compressed_length_in_bits / number_of_characters + return compressed_length_per_number_of_characters + + +if __name__ == "__main__": + + import gzip + compressor = BaseCompressor(compressor=gzip) + + example = 'Hello there!' + compressed_length: int = compressor.get_compressed_length(example) + bits_per_character: float = compressor.get_bits_per_char(example) + + print(compressed_length, bits_per_character) \ No newline at end of file diff --git a/npc_gzip/compressors/gzip_compressor.py b/npc_gzip/compressors/gzip_compressor.py new file mode 100644 index 00000000..cd825217 --- /dev/null +++ b/npc_gzip/compressors/gzip_compressor.py @@ -0,0 +1,16 @@ +import gzip +from npc_gzip.compressors.base import BaseCompressor + +class GZipCompressor(BaseCompressor): + def __init__(self): + super().__init__(self) + + self.compressor = gzip + + +if __name__ == '__main__': + + compressor = GZipCompressor() + example: str = 'Hello there!' + compressed_length: int = compressor.get_compressed_length(example) + bits_per_character: float = compressor.get_bits_per_char(example) diff --git a/npc_gzip/compressors/lzma_compressor.py b/npc_gzip/compressors/lzma_compressor.py new file mode 100644 index 00000000..af0bbd13 --- /dev/null +++ b/npc_gzip/compressors/lzma_compressor.py @@ -0,0 +1,29 @@ +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.exceptions import MissingDependencyException + +class LzmaCompressor(BaseCompressor): + def __init__(self): + super().__init__(self) + + try: + import lzma + except ModuleNotFoundError as e: + import platform + major, minor, patch = platform.python_version_tuple() + if int(major) >= 3 and int(minor) >= 11: + raise ExceptionGroup([ + MissingDependencyException('lzma'), + e + ]) + else: + raise MissingDependencyException('lzma') + + self.compressor = lzma + + +if __name__ == '__main__': + + compressor = LzmaCompressor() + example: str = 'Hello there!' + compressed_length: int = compressor.get_compressed_length(example) + bits_per_character: float = compressor.get_bits_per_char(example) diff --git a/npc_gzip/exceptions.py b/npc_gzip/exceptions.py new file mode 100644 index 00000000..872104ea --- /dev/null +++ b/npc_gzip/exceptions.py @@ -0,0 +1,35 @@ +class InvalidCompressorException(Exception): + """ + Is raised when a user is trying to use a compression + library that is not supported. + """ + def __init__(self, compression_library: str): + self.message = f''' + Compression Library ({compression_library}) + is not currently supported. + ''' + super().__init__(self.message) + +class MissingDependencyException(Exception): + """ + Is raised when an underlying dependency is not + found when loading a library. + """ + def __init__(self, compression_library: str): + self.message = f''' + Compression Library ({compression_library}) + is missing an underlying dependency. Try + installing those missing dependencies and + load this again. + + Common missing dependencies for: + + * lzma: + * brew install xz + * sudo apt-get install lzma liblzma-dev libbz2-dev + + * bz2: + * sudo apt-get install lzma liblzma-dev libbz2-dev + + ''' + super().__init__(self.message) \ No newline at end of file diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 00000000..f4de8dbd --- /dev/null +++ b/poetry.lock @@ -0,0 +1,164 @@ +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. + +[[package]] +name = "black" +version = "23.7.0" +description = "The uncompromising code formatter." +optional = false +python-versions = ">=3.8" +files = [ + {file = "black-23.7.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:5c4bc552ab52f6c1c506ccae05681fab58c3f72d59ae6e6639e8885e94fe2587"}, + {file = "black-23.7.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:552513d5cd5694590d7ef6f46e1767a4df9af168d449ff767b13b084c020e63f"}, + {file = "black-23.7.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:86cee259349b4448adb4ef9b204bb4467aae74a386bce85d56ba4f5dc0da27be"}, + {file = "black-23.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:501387a9edcb75d7ae8a4412bb8749900386eaef258f1aefab18adddea1936bc"}, + {file = "black-23.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb074d8b213749fa1d077d630db0d5f8cc3b2ae63587ad4116e8a436e9bbe995"}, + {file = "black-23.7.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:b5b0ee6d96b345a8b420100b7d71ebfdd19fab5e8301aff48ec270042cd40ac2"}, + {file = "black-23.7.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:893695a76b140881531062d48476ebe4a48f5d1e9388177e175d76234ca247cd"}, + {file = "black-23.7.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:c333286dc3ddca6fdff74670b911cccedacb4ef0a60b34e491b8a67c833b343a"}, + {file = "black-23.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831d8f54c3a8c8cf55f64d0422ee875eecac26f5f649fb6c1df65316b67c8926"}, + {file = "black-23.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:7f3bf2dec7d541b4619b8ce526bda74a6b0bffc480a163fed32eb8b3c9aed8ad"}, + {file = "black-23.7.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:f9062af71c59c004cd519e2fb8f5d25d39e46d3af011b41ab43b9c74e27e236f"}, + {file = "black-23.7.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:01ede61aac8c154b55f35301fac3e730baf0c9cf8120f65a9cd61a81cfb4a0c3"}, + {file = "black-23.7.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:327a8c2550ddc573b51e2c352adb88143464bb9d92c10416feb86b0f5aee5ff6"}, + {file = "black-23.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d1c6022b86f83b632d06f2b02774134def5d4d4f1dac8bef16d90cda18ba28a"}, + {file = "black-23.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:27eb7a0c71604d5de083757fbdb245b1a4fae60e9596514c6ec497eb63f95320"}, + {file = "black-23.7.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:8417dbd2f57b5701492cd46edcecc4f9208dc75529bcf76c514864e48da867d9"}, + {file = "black-23.7.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:47e56d83aad53ca140da0af87678fb38e44fd6bc0af71eebab2d1f59b1acf1d3"}, + {file = "black-23.7.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:25cc308838fe71f7065df53aedd20327969d05671bac95b38fdf37ebe70ac087"}, + {file = "black-23.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:642496b675095d423f9b8448243336f8ec71c9d4d57ec17bf795b67f08132a91"}, + {file = "black-23.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:ad0014efc7acf0bd745792bd0d8857413652979200ab924fbf239062adc12491"}, + {file = "black-23.7.0-py3-none-any.whl", hash = "sha256:9fd59d418c60c0348505f2ddf9609c1e1de8e7493eab96198fc89d9f865e7a96"}, + {file = "black-23.7.0.tar.gz", hash = "sha256:022a582720b0d9480ed82576c920a8c1dde97cc38ff11d8d8859b3bd6ca9eedb"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +packaging = ">=22.0" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + +[[package]] +name = "click" +version = "8.1.6" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.6-py3-none-any.whl", hash = "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5"}, + {file = "click-8.1.6.tar.gz", hash = "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "isort" +version = "5.12.0" +description = "A Python utility / library to sort Python imports." +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "isort-5.12.0-py3-none-any.whl", hash = "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6"}, + {file = "isort-5.12.0.tar.gz", hash = "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504"}, +] + +[package.extras] +colors = ["colorama (>=0.4.3)"] +pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib"] +plugins = ["setuptools"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + +[[package]] +name = "packaging" +version = "23.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, +] + +[[package]] +name = "pathspec" +version = "0.11.1" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, + {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, +] + +[[package]] +name = "platformdirs" +version = "3.9.1" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +optional = false +python-versions = ">=3.7" +files = [ + {file = "platformdirs-3.9.1-py3-none-any.whl", hash = "sha256:ad8291ae0ae5072f66c16945166cb11c63394c7a3ad1b1bc9828ca3162da8c2f"}, + {file = "platformdirs-3.9.1.tar.gz", hash = "sha256:1b42b450ad933e981d56e59f1b97495428c9bd60698baab9f3eb3d00d5822421"}, +] + +[package.extras] +docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "typing-extensions" +version = "4.7.1" +description = "Backported and Experimental Type Hints for Python 3.7+" +optional = false +python-versions = ">=3.7" +files = [ + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.9" +content-hash = "8911ec5744aae9b13dd8874953578099aaa7c68b8791606e24ac7be62cc2d3cb" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..89870345 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[tool.poetry] +name = "npc-gzip" +version = "0.0.1" +description = "Code for Paper: “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors" +authors = ["Zach Bloss "] +license = "MIT" +readme = "README.md" +packages = [{include = "npc_gzip"}] + +[tool.poetry.dependencies] +python = "^3.9" + +[tool.poetry.group.dev.dependencies] +black = "^23.7.0" +isort = "^5.12.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/utils.py b/utils.py index 448b90a6..3afcd59e 100644 --- a/utils.py +++ b/utils.py @@ -27,11 +27,11 @@ def CDM(c1: float, c2: float, c12: float) -> float: return dis -def MSE(v1: float, v2: float) ->float: +def MSE(v1: float, v2: float) -> float: """ Calculates Mean Squared Error. """ - + return np.sum((v1 - v2) ** 2) / len(v1) @@ -59,7 +59,7 @@ def agg_by_jag_word(t1: str, t2: str) -> str: t2 (str): Second item. Returns: - str: + str: """ t1_list = t1.split(" ") @@ -85,7 +85,7 @@ def agg_by_jag_char(t1: str, t2: str): t2 (str): Second item. Returns: - str: + str: """ t1_list = list(t1) @@ -109,7 +109,7 @@ def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> stringa (str): First item. stringb (str): Second item. by_character (bool): True if you want to join the combined string by character, - Else combines by word + Else combines by word Returns: str: combination of stringa and stringb @@ -129,6 +129,7 @@ def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> return "".join(combined) return " ".join(combined) + def agg_by_avg(i1: list, i2: list) -> torch.Tensor: """ Calculates the average of i1 and i2 rounding to the @@ -137,7 +138,7 @@ def agg_by_avg(i1: list, i2: list) -> torch.Tensor: Arguments: i1 (list): List 1. i2 (list): List 2. - + Returns: torch.Tensor: Average of the two lists. """ @@ -145,7 +146,9 @@ def agg_by_avg(i1: list, i2: list) -> torch.Tensor: return torch.div(i1 + i2, 2, rounding_mode="trunc") -def agg_by_min_or_max(i1: list, i2: list, aggregate_by_minimum: bool = False) -> torch.Tensor: +def agg_by_min_or_max( + i1: list, i2: list, aggregate_by_minimum: bool = False +) -> torch.Tensor: """ Calculates the average of i1 and i2 rounding to the shortest list. @@ -155,37 +158,37 @@ def agg_by_min_or_max(i1: list, i2: list, aggregate_by_minimum: bool = False) -> i2 (list): List 2. aggregate_by_minimum (bool): True if you want to take the minimum of the two lists. False if you want to take the maximum instead. - + Returns: torch.Tensor: Average of the two lists. """ - + stacked = torch.stack([i1, i2], axis=0) if aggregate_by_minimum: return torch.min(stacked, axis=0)[0] - + return torch.max(stacked, axis=0)[0] def agg_by_stack(i1: list, i2: list) -> torch.Tensor: """ Combines `i1` and `i2` via `torch.stack`. - + Arguments: i1 (list): List 1. i2 (list): List 2. - + Returns: torch.Tensor: Stack of the two lists. """ - + return torch.stack([i1, i2]) def mean_confidence_interval(data: list, confidence: float = 0.95) -> tuple: """ Computes the mean confidence interval of `data` with `confidence` - + Arguments: data (list): List of data to compute a confidence interval over. confidence (float): Level to compute confidence. From 014a25d73ba9230d7ca408113119de9ae8bf2aff Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Tue, 25 Jul 2023 13:51:05 -0400 Subject: [PATCH 02/74] add bz2 compressor refactor gzip_compressor to match the others --- npc_gzip/compressors/bz2_compressor.py | 29 +++++++++++++++++++++++++ npc_gzip/compressors/gzip_compressor.py | 15 ++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 npc_gzip/compressors/bz2_compressor.py diff --git a/npc_gzip/compressors/bz2_compressor.py b/npc_gzip/compressors/bz2_compressor.py new file mode 100644 index 00000000..297bfb5b --- /dev/null +++ b/npc_gzip/compressors/bz2_compressor.py @@ -0,0 +1,29 @@ +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.exceptions import MissingDependencyException + +class LzmaCompressor(BaseCompressor): + def __init__(self): + super().__init__(self) + + try: + import lzma + except ModuleNotFoundError as e: + import platform + major, minor, patch = platform.python_version_tuple() + if int(major) >= 3 and int(minor) >= 11: + raise ExceptionGroup([ + MissingDependencyException('bz2'), + e + ]) + else: + raise MissingDependencyException('bz2') + + self.compressor = lzma + + +if __name__ == '__main__': + + compressor = LzmaCompressor() + example: str = 'Hello there!' + compressed_length: int = compressor.get_compressed_length(example) + bits_per_character: float = compressor.get_bits_per_char(example) diff --git a/npc_gzip/compressors/gzip_compressor.py b/npc_gzip/compressors/gzip_compressor.py index cd825217..13a4f496 100644 --- a/npc_gzip/compressors/gzip_compressor.py +++ b/npc_gzip/compressors/gzip_compressor.py @@ -1,10 +1,23 @@ -import gzip from npc_gzip.compressors.base import BaseCompressor class GZipCompressor(BaseCompressor): def __init__(self): super().__init__(self) + + try: + import gzip + except ModuleNotFoundError as e: + import platform + major, minor, patch = platform.python_version_tuple() + if int(major) >= 3 and int(minor) >= 11: + raise ExceptionGroup([ + MissingDependencyException('gzip'), + e + ]) + else: + raise MissingDependencyException('gzip') + self.compressor = gzip From bddbd4eb7770ad4d322cdfafd0d450df2f40b51f Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Tue, 25 Jul 2023 14:05:02 -0400 Subject: [PATCH 03/74] refactor get_bits_per_char method to be more verbose --- compressors.py | 2 +- npc_gzip/compressors/base.py | 4 ++-- npc_gzip/compressors/bz2_compressor.py | 2 +- npc_gzip/compressors/gzip_compressor.py | 2 +- npc_gzip/compressors/lzma_compressor.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/compressors.py b/compressors.py index 2114fedc..d3381e6b 100644 --- a/compressors.py +++ b/compressors.py @@ -25,7 +25,7 @@ def get_compressed_len(self, x: str) -> int: else: return len(self.compressor.compress(np.array(x).tobytes())) - def get_bits_per_char(self, original_fn: str) -> float: + def get_bits_per_character(self, original_fn: str) -> float: """ Returns the compressed size of the original function in bits. diff --git a/npc_gzip/compressors/base.py b/npc_gzip/compressors/base.py index cd95bb04..cee98acd 100644 --- a/npc_gzip/compressors/base.py +++ b/npc_gzip/compressors/base.py @@ -43,7 +43,7 @@ def get_compressed_length(self, x: str) -> int: compressed_length: int = len(compressed_value) return compressed_length - def get_bits_per_char(self, x: str) -> float: + def get_bits_per_character(self, x: str) -> float: """ Returns the compressed size of `x` relative to the number of characters in string `x`. @@ -73,6 +73,6 @@ def get_bits_per_char(self, x: str) -> float: example = 'Hello there!' compressed_length: int = compressor.get_compressed_length(example) - bits_per_character: float = compressor.get_bits_per_char(example) + bits_per_character: float = compressor.get_bits_per_character(example) print(compressed_length, bits_per_character) \ No newline at end of file diff --git a/npc_gzip/compressors/bz2_compressor.py b/npc_gzip/compressors/bz2_compressor.py index 297bfb5b..4d403677 100644 --- a/npc_gzip/compressors/bz2_compressor.py +++ b/npc_gzip/compressors/bz2_compressor.py @@ -26,4 +26,4 @@ def __init__(self): compressor = LzmaCompressor() example: str = 'Hello there!' compressed_length: int = compressor.get_compressed_length(example) - bits_per_character: float = compressor.get_bits_per_char(example) + bits_per_character: float = compressor.get_bits_per_character(example) diff --git a/npc_gzip/compressors/gzip_compressor.py b/npc_gzip/compressors/gzip_compressor.py index 13a4f496..61c02dd0 100644 --- a/npc_gzip/compressors/gzip_compressor.py +++ b/npc_gzip/compressors/gzip_compressor.py @@ -26,4 +26,4 @@ def __init__(self): compressor = GZipCompressor() example: str = 'Hello there!' compressed_length: int = compressor.get_compressed_length(example) - bits_per_character: float = compressor.get_bits_per_char(example) + bits_per_character: float = compressor.get_bits_per_character(example) diff --git a/npc_gzip/compressors/lzma_compressor.py b/npc_gzip/compressors/lzma_compressor.py index af0bbd13..4842f817 100644 --- a/npc_gzip/compressors/lzma_compressor.py +++ b/npc_gzip/compressors/lzma_compressor.py @@ -26,4 +26,4 @@ def __init__(self): compressor = LzmaCompressor() example: str = 'Hello there!' compressed_length: int = compressor.get_compressed_length(example) - bits_per_character: float = compressor.get_bits_per_char(example) + bits_per_character: float = compressor.get_bits_per_character(example) From 005edc82d9b9ecf327072c5cb799e611dfa4e578 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Tue, 25 Jul 2023 16:12:59 -0400 Subject: [PATCH 04/74] add stringtooshortexception --- .gitignore | 3 +- npc_gzip/aggregations.py | 151 +++++++++++++++++++++++++++++++++++++++ npc_gzip/exceptions.py | 10 +++ 3 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 npc_gzip/aggregations.py diff --git a/.gitignore b/.gitignore index 6769e21d..3736894d 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,5 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ \ No newline at end of file +#.idea/ +notebooks/ \ No newline at end of file diff --git a/npc_gzip/aggregations.py b/npc_gzip/aggregations.py new file mode 100644 index 00000000..157f6d16 --- /dev/null +++ b/npc_gzip/aggregations.py @@ -0,0 +1,151 @@ +import torch +from npc_gzip.exceptions import StringTooShortException + +def agg_by_concat_space(t1: str, t2: str) -> str: + """ + Combines `t1` and `t2` with a space. + + Arguments: + t1 (str): First item. + t2 (str): Second item. + + Returns: + str: `{t1} {t2}` + """ + + return t1 + " " + t2 + + +def agg_by_jag_word(t1: str, t2: str) -> str: + """ + # TODO: Better description + + Arguments: + t1 (str): First item. + t2 (str): Second item. + + Returns: + str: + """ + + t1_list = t1.split(" ") + t2_list = t2.split(" ") + + i = None + combined = [] + minimum_list_size = min([len(t1_list), len(t2_list)]) + for i in range(0, minimum_list_size - 1, 2): + combined.append(t1_list[i]) + combined.append(t2_list[i + 1]) + if i is None: + raise StringTooShortException(t1, t2, 'agg_by_jag_word') + if len(t1_list) > len(t2_list): + combined += t1_list[i:] + return " ".join(combined) + + +def agg_by_jag_char(t1: str, t2: str): + """ + # TODO: Better description + + Arguments: + t1 (str): First item. + t2 (str): Second item. + + Returns: + str: + """ + + t1_list = list(t1) + t2_list = list(t2) + combined = [] + minimum_list_size = min([len(t1_list), len(t2_list)]) + for i in range(0, minimum_list_size - 1, 2): + combined.append(t1_list[i]) + combined.append(t2_list[i + 1]) + if len(t1_list) > len(t2_list): + combined += t1_list[i:] + + return "".join(combined) + + +def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> str: + """ + Aggregates strings. + + Arguments: + stringa (str): First item. + stringb (str): Second item. + by_character (bool): True if you want to join the combined string by character, + Else combines by word + + Returns: + str: combination of stringa and stringb + """ + + lista = list(stringa) + listb = list(stringb) + combined = [] + minimum_list_size = min([len(lista), len(listb)]) + for i in range(0, minimum_list_size - 1, 2): + combined.append(lista[i]) + combined.append(listb[i + 1]) + if len(lista) > len(listb): + combined += lista[i:] + + if by_character: + return "".join(combined) + return " ".join(combined) + + +def agg_by_avg(i1: torch.Tensor, i2: torch.Tensor) -> torch.Tensor: + """ + Calculates the average of i1 and i2, rounding to the shortest. + + Arguments: + i1 (torch.Tensor): First series of numbers. + i2 (torch.Tensor): Second series of numbers. + + Returns: + torch.Tensor: Average of the two series of numbers. + """ + + return torch.div(i1 + i2, 2, rounding_mode="trunc") + + +def agg_by_min_or_max( + i1: torch.Tensor, i2: torch.Tensor, aggregate_by_minimum: bool = False +) -> torch.Tensor: + """ + Calculates the average of i1 and i2, rounding to the shortest. + + Arguments: + i1 (torch.Tensor): First series of numbers. + i2 (torch.Tensor): Second series of numbers. + aggregate_by_minimum (bool): True if you want to take the minimum of the two series. + False if you want to take the maximum instead. + + Returns: + torch.Tensor: Average of the two series. + """ + + stacked = torch.stack([i1, i2], axis=0) + if aggregate_by_minimum: + return torch.min(stacked, axis=0)[0] + + return torch.max(stacked, axis=0)[0] + + +def agg_by_stack(i1: torch.Tensor, i2: torch.Tensor) -> torch.Tensor: + """ + Combines `i1` and `i2` via `torch.stack`. + + Arguments: + i1 (torch.Tensor): First series of numbers. + i2 (torch.Tensor): Second series of numbers. + + Returns: + torch.Tensor: Stack of the two series. + """ + + return torch.stack([i1, i2]) diff --git a/npc_gzip/exceptions.py b/npc_gzip/exceptions.py index 872104ea..2751a00d 100644 --- a/npc_gzip/exceptions.py +++ b/npc_gzip/exceptions.py @@ -31,5 +31,15 @@ def __init__(self, compression_library: str): * bz2: * sudo apt-get install lzma liblzma-dev libbz2-dev + ''' + super().__init__(self.message) + +class StringTooShortException(Exception): + def __init__(self, stringa: str, stringb: str, function_name: str): + self.message = f''' + Unable to aggregate ({stringa}) and ({stringb}) using + function: ({function_name}). One or both of the two + strings are too short to concatenate. + ''' super().__init__(self.message) \ No newline at end of file From 4625c561c4b716115794f7d67d2101cdfed0ccb3 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Tue, 25 Jul 2023 21:20:01 -0400 Subject: [PATCH 05/74] add distance calculation class --- experiments.py | 3 +- main_text.py | 11 +- npc_gzip/__init__.py | 4 +- npc_gzip/aggregations.py | 122 +++++--------- npc_gzip/compressors/base.py | 17 +- npc_gzip/compressors/bz2_compressor.py | 19 ++- npc_gzip/compressors/gzip_compressor.py | 18 +-- npc_gzip/compressors/lzma_compressor.py | 19 ++- npc_gzip/distance.py | 206 ++++++++++++++++++++++++ npc_gzip/exceptions.py | 95 +++++++++-- poetry.lock | 36 ++++- pyproject.toml | 1 + 12 files changed, 419 insertions(+), 132 deletions(-) create mode 100644 npc_gzip/distance.py diff --git a/experiments.py b/experiments.py index ca3c5b55..c49a59b1 100644 --- a/experiments.py +++ b/experiments.py @@ -13,8 +13,7 @@ import numpy as np import torch -from sklearn.metrics.cluster import (adjusted_rand_score, - normalized_mutual_info_score) +from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score from tqdm import tqdm from compressors import DefaultCompressor diff --git a/main_text.py b/main_text.py index f8d0aad4..08cbb728 100644 --- a/main_text.py +++ b/main_text.py @@ -4,8 +4,15 @@ from typing import Callable from pathos.multiprocessing import ProcessingPool as Pool -from torchtext.datasets import (AG_NEWS, IMDB, AmazonReviewPolarity, DBpedia, - SogouNews, YahooAnswers, YelpReviewPolarity) +from torchtext.datasets import ( + AG_NEWS, + IMDB, + AmazonReviewPolarity, + DBpedia, + SogouNews, + YahooAnswers, + YelpReviewPolarity, +) from compressors import * from data import * diff --git a/npc_gzip/__init__.py b/npc_gzip/__init__.py index e2fb446d..08b4077f 100644 --- a/npc_gzip/__init__.py +++ b/npc_gzip/__init__.py @@ -1,3 +1,3 @@ -from .exceptions import * from . import compressors - +from .distance import * +from .exceptions import * diff --git a/npc_gzip/aggregations.py b/npc_gzip/aggregations.py index 157f6d16..272f8547 100644 --- a/npc_gzip/aggregations.py +++ b/npc_gzip/aggregations.py @@ -1,10 +1,14 @@ -import torch +import numpy as np + from npc_gzip.exceptions import StringTooShortException -def agg_by_concat_space(t1: str, t2: str) -> str: + +def concatenate_with_space(t1: str, t2: str) -> str: """ Combines `t1` and `t2` with a space. + (formerly agg_by_concat_space) + Arguments: t1 (str): First item. t2 (str): Second item. @@ -16,63 +20,12 @@ def agg_by_concat_space(t1: str, t2: str) -> str: return t1 + " " + t2 -def agg_by_jag_word(t1: str, t2: str) -> str: - """ - # TODO: Better description - - Arguments: - t1 (str): First item. - t2 (str): Second item. - - Returns: - str: - """ - - t1_list = t1.split(" ") - t2_list = t2.split(" ") - - i = None - combined = [] - minimum_list_size = min([len(t1_list), len(t2_list)]) - for i in range(0, minimum_list_size - 1, 2): - combined.append(t1_list[i]) - combined.append(t2_list[i + 1]) - if i is None: - raise StringTooShortException(t1, t2, 'agg_by_jag_word') - if len(t1_list) > len(t2_list): - combined += t1_list[i:] - return " ".join(combined) - - -def agg_by_jag_char(t1: str, t2: str): - """ - # TODO: Better description - - Arguments: - t1 (str): First item. - t2 (str): Second item. - - Returns: - str: - """ - - t1_list = list(t1) - t2_list = list(t2) - combined = [] - minimum_list_size = min([len(t1_list), len(t2_list)]) - for i in range(0, minimum_list_size - 1, 2): - combined.append(t1_list[i]) - combined.append(t2_list[i + 1]) - if len(t1_list) > len(t2_list): - combined += t1_list[i:] - - return "".join(combined) - - def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> str: """ Aggregates strings. + (replaces agg_by_jag_char, agg_by_jag_word) + Arguments: stringa (str): First item. stringb (str): Second item. @@ -87,65 +40,78 @@ def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> listb = list(stringb) combined = [] minimum_list_size = min([len(lista), len(listb)]) + i = None for i in range(0, minimum_list_size - 1, 2): combined.append(lista[i]) combined.append(listb[i + 1]) + + if i is None: + raise StringTooShortException(t1, t2, "aggregate_strings") + if len(lista) > len(listb): combined += lista[i:] if by_character: return "".join(combined) + return " ".join(combined) -def agg_by_avg(i1: torch.Tensor, i2: torch.Tensor) -> torch.Tensor: +def average(array_a: np.ndarray, array_b: np.ndarray) -> np.ndarray: """ - Calculates the average of i1 and i2, rounding to the shortest. + Calculates the average of `array_a` and `array_b`, + rounding down. + + (replaces agg_by_avg) Arguments: - i1 (torch.Tensor): First series of numbers. - i2 (torch.Tensor): Second series of numbers. + array_a (np.ndarray): First series of numbers. + array_b (np.ndarray): Second series of numbers. Returns: - torch.Tensor: Average of the two series of numbers. + np.ndarray: Average of the two series of numbers rounded + towards zero. """ - return torch.div(i1 + i2, 2, rounding_mode="trunc") + average_array = (a + b) / 2 + average_array = average_array.astype(int) + return average_array -def agg_by_min_or_max( - i1: torch.Tensor, i2: torch.Tensor, aggregate_by_minimum: bool = False -) -> torch.Tensor: + +def min_max_aggregation( + array_a: np.ndarray, array_b: np.ndarray, aggregate_by_minimum: bool = False +) -> np.ndarray: """ - Calculates the average of i1 and i2, rounding to the shortest. + Stacks `array_a` and `array_b` then returns the minimum value if + `aggregate_by_minimum`, else returns the maximum value. Arguments: - i1 (torch.Tensor): First series of numbers. - i2 (torch.Tensor): Second series of numbers. + array_a (np.ndarray): First series of numbers. + array_b (np.ndarray): Second series of numbers. aggregate_by_minimum (bool): True if you want to take the minimum of the two series. False if you want to take the maximum instead. Returns: - torch.Tensor: Average of the two series. + np.ndarray: 1-D Numpy array of the min/max value from `array_a` & `array_b`. """ - stacked = torch.stack([i1, i2], axis=0) + stacked = np.stack([array_a, array_b], axis=0) if aggregate_by_minimum: - return torch.min(stacked, axis=0)[0] - - return torch.max(stacked, axis=0)[0] + return np.min(stacked, axis=0).reshape(-1) + return torch.max(stacked, axis=0).reshape(-1) -def agg_by_stack(i1: torch.Tensor, i2: torch.Tensor) -> torch.Tensor: +def stack(array_a: np.ndarray, array_b: np.ndarray) -> np.ndarray: """ - Combines `i1` and `i2` via `torch.stack`. + Combines `array_a` and `array_b` via `np.stack`. Arguments: - i1 (torch.Tensor): First series of numbers. - i2 (torch.Tensor): Second series of numbers. + array_a (np.ndarray): First series of numbers. + array_b (np.ndarray): Second series of numbers. Returns: - torch.Tensor: Stack of the two series. + np.ndarray: Stack of the two series. """ - return torch.stack([i1, i2]) + return np.stack([array_a, array_b], axis=0) diff --git a/npc_gzip/compressors/base.py b/npc_gzip/compressors/base.py index cee98acd..3e3b4ea4 100644 --- a/npc_gzip/compressors/base.py +++ b/npc_gzip/compressors/base.py @@ -1,6 +1,8 @@ from types import ModuleType + from npc_gzip.exceptions import InvalidCompressorException + class BaseCompressor: """ Default compressor class that other compressors inherit @@ -24,7 +26,7 @@ def _compress(self, x: str) -> bytes: """ x = str(x) - x = x.encode('utf-8') + x = x.encode("utf-8") compressed = self.compressor.compress(x) return compressed @@ -45,7 +47,7 @@ def get_compressed_length(self, x: str) -> int: def get_bits_per_character(self, x: str) -> float: """ - Returns the compressed size of `x` relative to + Returns the compressed size of `x` relative to the number of characters in string `x`. Arguments: @@ -53,7 +55,7 @@ def get_bits_per_character(self, x: str) -> float: character in. Returns: - float: Length of `x` once compressed divided by the number of + float: Length of `x` once compressed divided by the number of characters in `x`. """ @@ -62,17 +64,20 @@ def get_bits_per_character(self, x: str) -> float: compressed_length_in_bits: int = compressed_length * 8 number_of_characters: int = len(x) - compressed_length_per_number_of_characters: float = compressed_length_in_bits / number_of_characters + compressed_length_per_number_of_characters: float = ( + compressed_length_in_bits / number_of_characters + ) return compressed_length_per_number_of_characters if __name__ == "__main__": import gzip + compressor = BaseCompressor(compressor=gzip) - example = 'Hello there!' + example = "Hello there!" compressed_length: int = compressor.get_compressed_length(example) bits_per_character: float = compressor.get_bits_per_character(example) - print(compressed_length, bits_per_character) \ No newline at end of file + print(compressed_length, bits_per_character) diff --git a/npc_gzip/compressors/bz2_compressor.py b/npc_gzip/compressors/bz2_compressor.py index 4d403677..e489a44b 100644 --- a/npc_gzip/compressors/bz2_compressor.py +++ b/npc_gzip/compressors/bz2_compressor.py @@ -1,29 +1,28 @@ from npc_gzip.compressors.base import BaseCompressor from npc_gzip.exceptions import MissingDependencyException + class LzmaCompressor(BaseCompressor): def __init__(self): super().__init__(self) - + try: import lzma except ModuleNotFoundError as e: import platform + major, minor, patch = platform.python_version_tuple() if int(major) >= 3 and int(minor) >= 11: - raise ExceptionGroup([ - MissingDependencyException('bz2'), - e - ]) + raise ExceptionGroup([MissingDependencyException("bz2"), e]) else: - raise MissingDependencyException('bz2') + raise MissingDependencyException("bz2") self.compressor = lzma - - -if __name__ == '__main__': + + +if __name__ == "__main__": compressor = LzmaCompressor() - example: str = 'Hello there!' + example: str = "Hello there!" compressed_length: int = compressor.get_compressed_length(example) bits_per_character: float = compressor.get_bits_per_character(example) diff --git a/npc_gzip/compressors/gzip_compressor.py b/npc_gzip/compressors/gzip_compressor.py index 61c02dd0..b885853a 100644 --- a/npc_gzip/compressors/gzip_compressor.py +++ b/npc_gzip/compressors/gzip_compressor.py @@ -1,29 +1,27 @@ from npc_gzip.compressors.base import BaseCompressor + class GZipCompressor(BaseCompressor): def __init__(self): super().__init__(self) - try: import gzip except ModuleNotFoundError as e: import platform + major, minor, patch = platform.python_version_tuple() if int(major) >= 3 and int(minor) >= 11: - raise ExceptionGroup([ - MissingDependencyException('gzip'), - e - ]) + raise ExceptionGroup([MissingDependencyException("gzip"), e]) else: - raise MissingDependencyException('gzip') + raise MissingDependencyException("gzip") self.compressor = gzip - - -if __name__ == '__main__': + + +if __name__ == "__main__": compressor = GZipCompressor() - example: str = 'Hello there!' + example: str = "Hello there!" compressed_length: int = compressor.get_compressed_length(example) bits_per_character: float = compressor.get_bits_per_character(example) diff --git a/npc_gzip/compressors/lzma_compressor.py b/npc_gzip/compressors/lzma_compressor.py index 4842f817..dd01b735 100644 --- a/npc_gzip/compressors/lzma_compressor.py +++ b/npc_gzip/compressors/lzma_compressor.py @@ -1,29 +1,28 @@ from npc_gzip.compressors.base import BaseCompressor from npc_gzip.exceptions import MissingDependencyException + class LzmaCompressor(BaseCompressor): def __init__(self): super().__init__(self) - + try: import lzma except ModuleNotFoundError as e: import platform + major, minor, patch = platform.python_version_tuple() if int(major) >= 3 and int(minor) >= 11: - raise ExceptionGroup([ - MissingDependencyException('lzma'), - e - ]) + raise ExceptionGroup([MissingDependencyException("lzma"), e]) else: - raise MissingDependencyException('lzma') + raise MissingDependencyException("lzma") self.compressor = lzma - - -if __name__ == '__main__': + + +if __name__ == "__main__": compressor = LzmaCompressor() - example: str = 'Hello there!' + example: str = "Hello there!" compressed_length: int = compressor.get_compressed_length(example) bits_per_character: float = compressor.get_bits_per_character(example) diff --git a/npc_gzip/distance.py b/npc_gzip/distance.py new file mode 100644 index 00000000..211bec86 --- /dev/null +++ b/npc_gzip/distance.py @@ -0,0 +1,206 @@ +from typing import Union + +import numpy as np + +from npc_gzip.exceptions import ( + AllOrNoneException, + CompressedValuesEqualZero, + InvalidShapeException, +) + + +class Distance: + def __init__( + self, + compressed_values_a: Union[list, np.ndarray], + compressed_values_b: Union[list, np.ndarray], + compressed_values_ab: Union[list, np.ndarray], + ): + + if not isinstance(compressed_values_a, np.ndarray): + compressed_values_a = np.array(compressed_values_a) + + if not isinstance(compressed_values_b, np.ndarray): + compressed_values_b = np.array(compressed_values_b) + + if not isinstance(compressed_values_ab, np.ndarray): + compressed_values_ab = np.array(compressed_values_ab) + + compressed_values_a = compressed_values_a.reshape(-1) + compressed_values_b = compressed_values_b.reshape(-1) + compressed_values_ab = compressed_values_ab.reshape(-1) + + if ( + compressed_values_a.shape + != compressed_values_b.shape + != compressed_values_ab.shape + ): + raise InvalidShapeException( + compressed_values_a, + compressed_values_b, + compressed_values_ab, + function_name="Distance.__init__", + ) + + self.compressed_values_a = compressed_values_a + self.compressed_values_b = compressed_values_b + self.compressed_values_ab = compressed_values_ab + + def _ncd( + self, + compressed_value_a: float, + compressed_value_b: float, + compressed_value_ab: float, + ) -> float: + + denominator = max(compressed_value_a, compressed_value_b) + if denominator == 0: + raise CompressedValuesEqualZero( + compressed_value_a, compressed_value_b, function_name="Distance._ncd" + ) + + numerator = compressed_value_ab - min(compressed_value_a, compressed_value_b) + distance = numerator / denominator + return distance + + def _cdm( + self, + compressed_value_a: float, + compressed_value_b: float, + compressed_value_ab: float, + ) -> float: + + denominator = compressed_value_a + compressed_value_b + if denominator == 0: + raise CompressedValuesEqualZero( + compressed_value_a, compressed_value_b, function_name="Distance._cdm" + ) + + numerator = compressed_value_ab + + distance = numerator / denominator + return distance + + def _clm( + self, + compressed_value_a: float, + compressed_value_b: float, + compressed_value_ab: float, + ): + denominator = compressed_value_ab + if denominator == 0: + raise CompressedValuesEqualZero( + compressed_value_ab, function_name="Distance._clm" + ) + + numerator = 1 - (compressed_value_a + compressed_value_b - compressed_value_ab) + + distance = numerator / denominator + return distance + + def _mse( + self, + compressed_value_a: Union[list, np.ndarray], + compressed_value_b: Union[list, np.ndarray], + ) -> np.ndarray: + """ + Computes the mean squared error between two + values. + + """ + + if not isinstance(compressed_value_a, np.ndarray): + compressed_value_a = np.array(compressed_value_a) + + if not isinstance(compressed_value_b, np.ndarray): + compressed_value_b = np.array(compressed_value_b) + + compressed_value_a = compressed_value_a.reshape(-1) + compressed_value_b = compressed_value_b.reshape(-1) + + numerator = np.sum((compressed_value_a - compressed_value_b) ** 2) + denominator = compressed_value_a.shape[0] + + if denominator == 0: + raise CompressedValuesEqualZero( + compressed_value_a, compressed_value_b, function_name="Distance._mse" + ) + + mse = numerator / denominator + return mse + + @property + def ncd(self) -> np.ndarray: + """ + A numpy vectorized form of self._ncd. + + + """ + return np.vectorize(self._ncd)( + self.compressed_values_a, + self.compressed_values_b, + self.compressed_values_ab, + ) + + @property + def cdm(self) -> np.ndarray: + """ + A numpy vectorized form of self._cdm. + + + """ + return np.vectorize(self._cdm)( + self.compressed_values_a, + self.compressed_values_b, + self.compressed_values_ab, + ) + + @property + def clm(self) -> np.ndarray: + """ + A numpy vectorized form of self._clm. + + + """ + return np.vectorize(self._clm)( + self.compressed_values_a, + self.compressed_values_b, + self.compressed_values_ab, + ) + + @property + def mse(self) -> np.ndarray: + """ + A numpy vectorized form of self._mse. + + + """ + return np.vectorize(self._mse)( + self.compressed_values_a, + self.compressed_values_b, + ) + + +if __name__ == "__main__": + import random + + a = random.random() + b = random.random() + ab = random.random() + + distance = Distance(a, b, ab) + ncd = distance._ncd(a, b, ab) + cdm = distance._cdm(a, b, ab) + clm = distance._clm(a, b, ab) + mse = distance._mse(a, b) + + a = np.random.rand(3, 10) + b = np.random.rand(3, 10) + ab = np.random.rand(3, 10) + + distance = Distance(a, b, ab) + + ncd = distance.ncd + cdm = distance.cdm + clm = distance.clm + mse = distance.mse diff --git a/npc_gzip/exceptions.py b/npc_gzip/exceptions.py index 2751a00d..17a8b67d 100644 --- a/npc_gzip/exceptions.py +++ b/npc_gzip/exceptions.py @@ -1,22 +1,30 @@ +from typing import Any + +import numpy as np + + class InvalidCompressorException(Exception): """ Is raised when a user is trying to use a compression library that is not supported. """ + def __init__(self, compression_library: str): - self.message = f''' + self.message = f""" Compression Library ({compression_library}) is not currently supported. - ''' + """ super().__init__(self.message) + class MissingDependencyException(Exception): """ Is raised when an underlying dependency is not found when loading a library. """ + def __init__(self, compression_library: str): - self.message = f''' + self.message = f""" Compression Library ({compression_library}) is missing an underlying dependency. Try installing those missing dependencies and @@ -31,15 +39,80 @@ def __init__(self, compression_library: str): * bz2: * sudo apt-get install lzma liblzma-dev libbz2-dev - ''' + """ super().__init__(self.message) + class StringTooShortException(Exception): - def __init__(self, stringa: str, stringb: str, function_name: str): - self.message = f''' - Unable to aggregate ({stringa}) and ({stringb}) using - function: ({function_name}). One or both of the two - strings are too short to concatenate. + def __init__(self, stringa: str, stringb: str, function_name: str = None): + self.message = f""" + Unable to aggregate ({stringa}) and ({stringb}). + One or both of the two strings are too short to concatenate. + + """ + + if function_name is not None: + self.message += f"function_name: {function_name}" + + super().__init__(self.message) + + +class CompressedValuesEqualZero(Exception): + def __init__( + self, + compressed_value_a: float, + compressed_value_b: float = None, + function_name: str = None, + ): + self.message = f""" + The combination of compressed values passed equal zero. + This will result in a divide by zero error. + - ''' - super().__init__(self.message) \ No newline at end of file + """ + + if function_name is not None: + self.message += f"function_name: {function_name}" + super().__init__(self.message) + + +class AllOrNoneException(Exception): + def __init__( + self, + a: Any, + b: Any, + c: Any, + function_name: str = None, + ): + self.message = f""" + The passed values must either all be None or not None. + arg1: {type(a)} + arg2: {type(b)} + arg3: {type(c)} + + """ + + if function_name is not None: + self.message += f"function_name: {function_name}" + super().__init__(self.message) + + +class InvalidShapeException(Exception): + def __init__( + self, + array_a: np.ndarray, + array_b: np.ndarray, + array_c: np.ndarray, + function_name: str = None, + ): + self.message = f""" + The passed values must either all of the same shape. + arg1: {array_a.shape} + arg2: {array_b.shape} + arg3: {array_c.shape} + + """ + + if function_name is not None: + self.message += f"function_name: {function_name}" + super().__init__(self.message) diff --git a/poetry.lock b/poetry.lock index f4de8dbd..b89f8cd2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -99,6 +99,40 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "numpy" +version = "1.25.1" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.25.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d339465dff3eb33c701430bcb9c325b60354698340229e1dff97745e6b3efa"}, + {file = "numpy-1.25.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d736b75c3f2cb96843a5c7f8d8ccc414768d34b0a75f466c05f3a739b406f10b"}, + {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a90725800caeaa160732d6b31f3f843ebd45d6b5f3eec9e8cc287e30f2805bf"}, + {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6c9261d21e617c6dc5eacba35cb68ec36bb72adcff0dee63f8fbc899362588"}, + {file = "numpy-1.25.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0def91f8af6ec4bb94c370e38c575855bf1d0be8a8fbfba42ef9c073faf2cf19"}, + {file = "numpy-1.25.1-cp310-cp310-win32.whl", hash = "sha256:fd67b306320dcadea700a8f79b9e671e607f8696e98ec255915c0c6d6b818503"}, + {file = "numpy-1.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:c1516db588987450b85595586605742879e50dcce923e8973f79529651545b57"}, + {file = "numpy-1.25.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6b82655dd8efeea69dbf85d00fca40013d7f503212bc5259056244961268b66e"}, + {file = "numpy-1.25.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e8f6049c4878cb16960fbbfb22105e49d13d752d4d8371b55110941fb3b17800"}, + {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41a56b70e8139884eccb2f733c2f7378af06c82304959e174f8e7370af112e09"}, + {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5154b1a25ec796b1aee12ac1b22f414f94752c5f94832f14d8d6c9ac40bcca6"}, + {file = "numpy-1.25.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38eb6548bb91c421261b4805dc44def9ca1a6eef6444ce35ad1669c0f1a3fc5d"}, + {file = "numpy-1.25.1-cp311-cp311-win32.whl", hash = "sha256:791f409064d0a69dd20579345d852c59822c6aa087f23b07b1b4e28ff5880fcb"}, + {file = "numpy-1.25.1-cp311-cp311-win_amd64.whl", hash = "sha256:c40571fe966393b212689aa17e32ed905924120737194b5d5c1b20b9ed0fb171"}, + {file = "numpy-1.25.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3d7abcdd85aea3e6cdddb59af2350c7ab1ed764397f8eec97a038ad244d2d105"}, + {file = "numpy-1.25.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1a180429394f81c7933634ae49b37b472d343cccb5bb0c4a575ac8bbc433722f"}, + {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d412c1697c3853c6fc3cb9751b4915859c7afe6a277c2bf00acf287d56c4e625"}, + {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e1266411120a4f16fad8efa8e0454d21d00b8c7cee5b5ccad7565d95eb42dd"}, + {file = "numpy-1.25.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f76aebc3358ade9eacf9bc2bb8ae589863a4f911611694103af05346637df1b7"}, + {file = "numpy-1.25.1-cp39-cp39-win32.whl", hash = "sha256:247d3ffdd7775bdf191f848be8d49100495114c82c2bd134e8d5d075fb386a1c"}, + {file = "numpy-1.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:1d5d3c68e443c90b38fdf8ef40e60e2538a27548b39b12b73132456847f4b631"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:35a9527c977b924042170a0887de727cd84ff179e478481404c5dc66b4170009"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d3fe3dd0506a28493d82dc3cf254be8cd0d26f4008a417385cbf1ae95b54004"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:012097b5b0d00a11070e8f2e261128c44157a8689f7dedcf35576e525893f4fe"}, + {file = "numpy-1.25.1.tar.gz", hash = "sha256:9a3a9f3a61480cc086117b426a8bd86869c213fc4072e606f01c4e4b66eb92bf"}, +] + [[package]] name = "packaging" version = "23.1" @@ -161,4 +195,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "8911ec5744aae9b13dd8874953578099aaa7c68b8791606e24ac7be62cc2d3cb" +content-hash = "ad0dc1856f0e0fa52fd0a780469aa28df52143d481b56fad901c8042552a2f81" diff --git a/pyproject.toml b/pyproject.toml index 89870345..fa450e4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ packages = [{include = "npc_gzip"}] [tool.poetry.dependencies] python = "^3.9" +numpy = "^1.25.1" [tool.poetry.group.dev.dependencies] black = "^23.7.0" From 332723d00f4e4dec9aeecbab4279bebfb98a3266 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Tue, 25 Jul 2023 21:27:36 -0400 Subject: [PATCH 06/74] add github actions --- .github/workflows/publish-package.yml | 44 +++++++++++++++++++++++++++ .github/workflows/test-package.yml | 37 ++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 .github/workflows/publish-package.yml create mode 100644 .github/workflows/test-package.yml diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml new file mode 100644 index 00000000..850c4242 --- /dev/null +++ b/.github/workflows/publish-package.yml @@ -0,0 +1,44 @@ +name: Publish npc_gzip package + +on: + release: + types: [published] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + poetry --version + poetry export --with test --without-hashes --format=requirements.txt > requirements.txt + + - name: Run Tests + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + poetry install + python -m pytest --doctest-modules --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + + - name: Upload pytest test results + uses: actions/upload-artifact@v3 + with: + name: pytest-results-${{ matrix.python-version }} + path: junit/test-results-${{ matrix.python-version }}.xml + + - name: Poetry Build & Publish + run: | + poetry build + poetry publish \ No newline at end of file diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml new file mode 100644 index 00000000..aa733859 --- /dev/null +++ b/.github/workflows/test-package.yml @@ -0,0 +1,37 @@ +name: Run npc_gzip tests + +on: [ push ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + poetry --version + poetry export --with test --without-hashes --format=requirements.txt > requirements.txt + + - name: Run Tests + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + poetry install + python -m pytest --doctest-modules --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + + - name: Upload pytest test results + uses: actions/upload-artifact@v3 + with: + name: pytest-results-${{ matrix.python-version }} + path: junit/test-results-${{ matrix.python-version }}.xml \ No newline at end of file From 013ffb35a43d66c7e68b72ed5cb50f1205dff039 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Wed, 26 Jul 2023 17:03:58 -0400 Subject: [PATCH 07/74] add pytest to test dependencies --- poetry.lock | 64 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 4 ++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index b89f8cd2..6c67272c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -71,6 +71,31 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "exceptiongroup" +version = "1.1.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"}, + {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + [[package]] name = "isort" version = "5.12.0" @@ -170,6 +195,43 @@ files = [ docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] +[[package]] +name = "pluggy" +version = "1.2.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, + {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pytest" +version = "7.4.0" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + [[package]] name = "tomli" version = "2.0.1" @@ -195,4 +257,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "ad0dc1856f0e0fa52fd0a780469aa28df52143d481b56fad901c8042552a2f81" +content-hash = "a4d722b628042e77d96061368f7fe7885a1307a42f56fba0e440e512d3e30d65" diff --git a/pyproject.toml b/pyproject.toml index fa450e4e..4c012fa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,10 @@ numpy = "^1.25.1" black = "^23.7.0" isort = "^5.12.0" + +[tool.poetry.group.test.dependencies] +pytest = "^7.4.0" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" From d81ef2cd2a0eb42c54966d059c9b772051137f1f Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Wed, 26 Jul 2023 17:04:32 -0400 Subject: [PATCH 08/74] remove unused exception refactor logic to propery trigger InvalidShapeException --- npc_gzip/distance.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/npc_gzip/distance.py b/npc_gzip/distance.py index 211bec86..d5c2784a 100644 --- a/npc_gzip/distance.py +++ b/npc_gzip/distance.py @@ -3,7 +3,6 @@ import numpy as np from npc_gzip.exceptions import ( - AllOrNoneException, CompressedValuesEqualZero, InvalidShapeException, ) @@ -32,9 +31,15 @@ def __init__( if ( compressed_values_a.shape - != compressed_values_b.shape - != compressed_values_ab.shape + == compressed_values_b.shape + == compressed_values_ab.shape ): + + + self.compressed_values_a = compressed_values_a + self.compressed_values_b = compressed_values_b + self.compressed_values_ab = compressed_values_ab + else: raise InvalidShapeException( compressed_values_a, compressed_values_b, @@ -42,10 +47,6 @@ def __init__( function_name="Distance.__init__", ) - self.compressed_values_a = compressed_values_a - self.compressed_values_b = compressed_values_b - self.compressed_values_ab = compressed_values_ab - def _ncd( self, compressed_value_a: float, @@ -86,7 +87,7 @@ def _clm( compressed_value_a: float, compressed_value_b: float, compressed_value_ab: float, - ): + ) -> float: denominator = compressed_value_ab if denominator == 0: raise CompressedValuesEqualZero( @@ -197,7 +198,7 @@ def mse(self) -> np.ndarray: a = np.random.rand(3, 10) b = np.random.rand(3, 10) ab = np.random.rand(3, 10) - + distance = Distance(a, b, ab) ncd = distance.ncd From 5b404968105e5341b1700a33f7b6d39b5a9962a6 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Wed, 26 Jul 2023 17:05:07 -0400 Subject: [PATCH 09/74] add helper function to open files add BaseCompressor as acceptable compressor type --- npc_gzip/compressors/base.py | 41 ++++++++++++++++++++++++++++++++---- tests/__init__.py | 0 2 files changed, 37 insertions(+), 4 deletions(-) create mode 100644 tests/__init__.py diff --git a/npc_gzip/compressors/base.py b/npc_gzip/compressors/base.py index 3e3b4ea4..c0564f7e 100644 --- a/npc_gzip/compressors/base.py +++ b/npc_gzip/compressors/base.py @@ -1,3 +1,4 @@ +import os from types import ModuleType from npc_gzip.exceptions import InvalidCompressorException @@ -11,8 +12,41 @@ class BaseCompressor: """ def __init__(self, compressor: ModuleType): + + if not isinstance(compressor, (ModuleType, BaseCompressor)): + raise InvalidCompressorException(f"Not a function passed: {type(compressor)}") self.compressor = compressor + def _open_file(self, filepath: str, as_bytes: bool = False): + """ + Helper function that loads and returns the contents + of a file at `filepath`. Optional `as_bytes` parameter + will load the file contents with `open(filepath, 'rb')` + if True. + + Arguments: + filepath (str): Path to the file to read contents from. + as_bytes (bool): [Optional] If true, opens the file as bytes. + + Returns: + str: File contents as a string. + """ + + assert os.path.isfile(filepath), f"Filepath ({filepath}) does not exist." + file_contents = None + open_mode = "r" + if as_bytes: + open_mode = "rb" + + with open(filepath, open_mode) as f: + file_contents = f.read() + f.close() + + if as_bytes: + file_contents = file_contents.decode("utf-8") + + return file_contents + def _compress(self, x: str) -> bytes: """ Applies the compression algorithm to `x` and @@ -26,8 +60,8 @@ def _compress(self, x: str) -> bytes: """ x = str(x) - x = x.encode("utf-8") - compressed = self.compressor.compress(x) + x: bytes = x.encode("utf-8") + compressed: bytes = self.compressor.compress(x) return compressed def get_compressed_length(self, x: str) -> int: @@ -59,6 +93,7 @@ def get_bits_per_character(self, x: str) -> float: characters in `x`. """ + x = str(x) compressed_value: bytes = self._compress(x) compressed_length: int = len(compressed_value) compressed_length_in_bits: int = compressed_length * 8 @@ -79,5 +114,3 @@ def get_bits_per_character(self, x: str) -> float: example = "Hello there!" compressed_length: int = compressor.get_compressed_length(example) bits_per_character: float = compressor.get_bits_per_character(example) - - print(compressed_length, bits_per_character) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b From c073764792a8d62d1b0e9c470333fc664bf9b545 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Wed, 26 Jul 2023 17:05:25 -0400 Subject: [PATCH 10/74] fix bz2 compressor from being a duplicate lzma compressor --- npc_gzip/compressors/bz2_compressor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/npc_gzip/compressors/bz2_compressor.py b/npc_gzip/compressors/bz2_compressor.py index e489a44b..bd84fee2 100644 --- a/npc_gzip/compressors/bz2_compressor.py +++ b/npc_gzip/compressors/bz2_compressor.py @@ -2,12 +2,12 @@ from npc_gzip.exceptions import MissingDependencyException -class LzmaCompressor(BaseCompressor): +class Bz2Compressor(BaseCompressor): def __init__(self): super().__init__(self) try: - import lzma + import bz2 except ModuleNotFoundError as e: import platform @@ -17,12 +17,12 @@ def __init__(self): else: raise MissingDependencyException("bz2") - self.compressor = lzma + self.compressor = bz2 if __name__ == "__main__": - compressor = LzmaCompressor() + compressor = Bz2Compressor() example: str = "Hello there!" compressed_length: int = compressor.get_compressed_length(example) bits_per_character: float = compressor.get_bits_per_character(example) From 74b5d8527dc199ac65d419c28fcd06550ffe545e Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Wed, 26 Jul 2023 17:05:31 -0400 Subject: [PATCH 11/74] add initial tests --- tests/test_base_compressor.py | 73 ++++++++++++++ tests/test_bz2_compressor.py | 52 ++++++++++ tests/test_distance.py | 183 ++++++++++++++++++++++++++++++++++ tests/test_gzip_compressor.py | 52 ++++++++++ tests/test_lzma_compressor.py | 52 ++++++++++ 5 files changed, 412 insertions(+) create mode 100644 tests/test_base_compressor.py create mode 100644 tests/test_bz2_compressor.py create mode 100644 tests/test_distance.py create mode 100644 tests/test_gzip_compressor.py create mode 100644 tests/test_lzma_compressor.py diff --git a/tests/test_base_compressor.py b/tests/test_base_compressor.py new file mode 100644 index 00000000..6c0f315e --- /dev/null +++ b/tests/test_base_compressor.py @@ -0,0 +1,73 @@ +import gzip +from types import ModuleType + +import pytest + +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.exceptions import InvalidCompressorException + + +class TestBaseCompressor: + + compressor = BaseCompressor(compressor=gzip) + example_input = "hello there!" + + def test_types(self): + invalid_compressors = ["test", 0.1, 123, ("hello", "there"), {"hey": "hi"}] + + for compressor in invalid_compressors: + with pytest.raises(InvalidCompressorException): + BaseCompressor(compressor) + + valid_compressor = gzip + BaseCompressor(valid_compressor) + + def test__open_file(self): + valid_filepath = "tests/test_base_compressor.py" + invalid_filepath = "tests/test_base_compressor.py.xyz" + + with pytest.raises(AssertionError): + self.compressor._open_file(invalid_filepath) + self.compressor._open_file(invalid_filepath, as_bytes=True) + + file_contents = self.compressor._open_file(valid_filepath, as_bytes=False) + assert isinstance(file_contents, str) + + file_contents = self.compressor._open_file(valid_filepath, as_bytes=True) + assert isinstance(file_contents, str) + + def test__compress(self): + compressed_bytes = self.compressor._compress(self.example_input) + assert isinstance(compressed_bytes, bytes) + + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] + + for input_ in example_inputs: + out = self.compressor._compress(input_) + assert isinstance(out, bytes) + + def test_get_compressed_length(self): + + example_input_length = self.compressor.get_compressed_length(self.example_input) + assert isinstance(example_input_length, int) + assert example_input_length > 0 + + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] + + for input_ in example_inputs: + out = self.compressor.get_compressed_length(input_) + assert isinstance(out, int) + assert out > 0 + + def test_get_bits_per_character(self): + + example_bits_per_character = self.compressor.get_bits_per_character( + self.example_input + ) + assert isinstance(example_bits_per_character, float) + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}, -1] + + for input_ in example_inputs: + out = self.compressor.get_bits_per_character(input_) + assert isinstance(out, float) + assert out > 0 diff --git a/tests/test_bz2_compressor.py b/tests/test_bz2_compressor.py new file mode 100644 index 00000000..b75f8575 --- /dev/null +++ b/tests/test_bz2_compressor.py @@ -0,0 +1,52 @@ +from types import ModuleType + +import pytest + +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.exceptions import InvalidCompressorException + +import bz2 +from npc_gzip.compressors.bz2_compressor import Bz2Compressor + +class TestBz2Compressor: + + base_compressor = BaseCompressor(bz2) + compressor = Bz2Compressor() + example_input = "hello there!" + + def test__compress(self): + compressed_bytes = self.compressor._compress(self.example_input) + base_compressed_bytes = self.base_compressor._compress(self.example_input) + assert compressed_bytes == base_compressed_bytes + + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] + + for input_ in example_inputs: + out = self.base_compressor._compress(input_) + assert isinstance(out, bytes) + + def test_get_compressed_length(self): + + example_input_length = self.compressor.get_compressed_length(self.example_input) + assert isinstance(example_input_length, int) + assert example_input_length > 0 + + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] + + for input_ in example_inputs: + out = self.compressor.get_compressed_length(input_) + assert isinstance(out, int) + assert out > 0 + + def test_get_bits_per_character(self): + + example_bits_per_character = self.compressor.get_bits_per_character( + self.example_input + ) + assert isinstance(example_bits_per_character, float) + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}, -1] + + for input_ in example_inputs: + out = self.compressor.get_bits_per_character(input_) + assert isinstance(out, float) + assert out > 0 diff --git a/tests/test_distance.py b/tests/test_distance.py new file mode 100644 index 00000000..8875e932 --- /dev/null +++ b/tests/test_distance.py @@ -0,0 +1,183 @@ +import pytest + +import random +import numpy as np +from npc_gzip.distance import Distance +from npc_gzip.exceptions import InvalidShapeException, CompressedValuesEqualZero + +class TestDistance: + + array_a = np.random.rand(3, 10) + array_b = np.random.rand(3, 10) + array_ab = np.random.rand(3, 10) + + float_a = random.random() + float_b = random.random() + float_ab = random.random() + + def test_types(self): + distance = Distance( + self.array_a, + self.array_b, + self.array_ab + ) + + assert isinstance(distance.compressed_values_a, np.ndarray) + assert isinstance(distance.compressed_values_b, np.ndarray) + assert isinstance(distance.compressed_values_ab, np.ndarray) + + distance = Distance( + self.float_a, + self.float_b, + self.float_ab + ) + assert isinstance(distance.compressed_values_a, np.ndarray) + assert isinstance(distance.compressed_values_b, np.ndarray) + assert isinstance(distance.compressed_values_ab, np.ndarray) + + def test_invalid_shapes(self): + + array_a_shape = self.array_a.shape + invalid_shape_array = np.random.rand(array_a_shape[0] + 1, array_a_shape[1] + 1) + assert invalid_shape_array.shape != self.array_a.shape + + with pytest.raises(InvalidShapeException): + Distance( + self.array_a, + self.array_b, + invalid_shape_array + ) + + def test__ncd(self): + distance = Distance( + self.float_a, + self.float_b, + self.float_ab + ) + out = distance._ncd(self.float_a, self.float_b, self.float_ab) + assert isinstance(out, float) + + a = b = ab = 0 + with pytest.raises(CompressedValuesEqualZero): + distance._ncd(a, b, ab) + + with pytest.raises(ValueError): + out = distance._ncd(self.array_a, self.array_b, self.array_ab) + + def test__cdm(self): + distance = Distance( + self.float_a, + self.float_b, + self.float_ab + ) + out = distance._cdm(self.float_a, self.float_b, self.float_ab) + assert isinstance(out, float) + + a = b = ab = 0 + with pytest.raises(CompressedValuesEqualZero): + distance._cdm(a, b, ab) + + with pytest.raises(ValueError): + out = distance._ncd(self.array_a, self.array_b, self.array_ab) + + def test__clm(self): + distance = Distance( + self.float_a, + self.float_b, + self.float_ab + ) + out = distance._clm(self.float_a, self.float_b, self.float_ab) + assert isinstance(out, float) + + a = b = ab = 0 + with pytest.raises(CompressedValuesEqualZero): + distance._clm(a, b, ab) + + with pytest.raises(ValueError): + out = distance._ncd(self.array_a, self.array_b, self.array_ab) + + def test__mse(self): + distance = Distance( + self.float_a, + self.float_b, + self.float_ab + ) + out = distance._mse(self.float_a, self.float_b) + assert isinstance(out, float) + + a = b = 0 + out = distance._mse(a, b) + assert isinstance(out, float) + + def test_ncd(self): + distance = Distance( + self.float_a, + self.float_b, + self.float_ab + ) + out = distance.ncd + assert isinstance(out, np.ndarray) + + distance = Distance( + self.array_a, + self.array_b, + self.array_ab + ) + out = distance.ncd + assert isinstance(out, np.ndarray) + + def test_cdm(self): + distance = Distance( + self.float_a, + self.float_b, + self.float_ab + ) + out = distance.cdm + assert isinstance(out, np.ndarray) + + distance = Distance( + self.array_a, + self.array_b, + self.array_ab + ) + out = distance.cdm + assert isinstance(out, np.ndarray) + + def test_clm(self): + distance = Distance( + self.float_a, + self.float_b, + self.float_ab + ) + out = distance.clm + assert isinstance(out, np.ndarray) + + distance = Distance( + self.array_a, + self.array_b, + self.array_ab + ) + out = distance.clm + assert isinstance(out, np.ndarray) + + def test_mse(self): + distance = Distance( + self.float_a, + self.float_b, + self.float_ab + ) + out = distance.mse + assert isinstance(out, np.ndarray) + + distance = Distance( + self.array_a, + self.array_b, + self.array_ab + ) + out = distance.mse + assert isinstance(out, np.ndarray) + + + + + diff --git a/tests/test_gzip_compressor.py b/tests/test_gzip_compressor.py new file mode 100644 index 00000000..c4b7c754 --- /dev/null +++ b/tests/test_gzip_compressor.py @@ -0,0 +1,52 @@ +from types import ModuleType + +import pytest + +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.exceptions import InvalidCompressorException + +import gzip +from npc_gzip.compressors.gzip_compressor import GZipCompressor + +class TestBz2Compressor: + + base_compressor = BaseCompressor(gzip) + compressor = GZipCompressor() + example_input = "hello there!" + + def test__compress(self): + compressed_bytes = self.compressor._compress(self.example_input) + base_compressed_bytes = self.base_compressor._compress(self.example_input) + assert compressed_bytes == base_compressed_bytes + + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] + + for input_ in example_inputs: + out = self.base_compressor._compress(input_) + assert isinstance(out, bytes) + + def test_get_compressed_length(self): + + example_input_length = self.compressor.get_compressed_length(self.example_input) + assert isinstance(example_input_length, int) + assert example_input_length > 0 + + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] + + for input_ in example_inputs: + out = self.compressor.get_compressed_length(input_) + assert isinstance(out, int) + assert out > 0 + + def test_get_bits_per_character(self): + + example_bits_per_character = self.compressor.get_bits_per_character( + self.example_input + ) + assert isinstance(example_bits_per_character, float) + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}, -1] + + for input_ in example_inputs: + out = self.compressor.get_bits_per_character(input_) + assert isinstance(out, float) + assert out > 0 diff --git a/tests/test_lzma_compressor.py b/tests/test_lzma_compressor.py new file mode 100644 index 00000000..97926ff3 --- /dev/null +++ b/tests/test_lzma_compressor.py @@ -0,0 +1,52 @@ +from types import ModuleType + +import pytest + +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.exceptions import InvalidCompressorException + +import lzma +from npc_gzip.compressors.lzma_compressor import LzmaCompressor + +class TestBz2Compressor: + + base_compressor = BaseCompressor(lzma) + compressor = LzmaCompressor() + example_input = "hello there!" + + def test__compress(self): + compressed_bytes = self.compressor._compress(self.example_input) + base_compressed_bytes = self.base_compressor._compress(self.example_input) + assert compressed_bytes == base_compressed_bytes + + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] + + for input_ in example_inputs: + out = self.base_compressor._compress(input_) + assert isinstance(out, bytes) + + def test_get_compressed_length(self): + + example_input_length = self.compressor.get_compressed_length(self.example_input) + assert isinstance(example_input_length, int) + assert example_input_length > 0 + + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] + + for input_ in example_inputs: + out = self.compressor.get_compressed_length(input_) + assert isinstance(out, int) + assert out > 0 + + def test_get_bits_per_character(self): + + example_bits_per_character = self.compressor.get_bits_per_character( + self.example_input + ) + assert isinstance(example_bits_per_character, float) + example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}, -1] + + for input_ in example_inputs: + out = self.compressor.get_bits_per_character(input_) + assert isinstance(out, float) + assert out > 0 From b17e0b1777ad00ca2aa06a702a58f02265513c06 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Wed, 26 Jul 2023 17:07:57 -0400 Subject: [PATCH 12/74] modify test-package.yml to only run tests/ --- .github/workflows/test-package.yml | 2 +- .gitignore | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index aa733859..96428e75 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -28,7 +28,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt poetry install - python -m pytest --doctest-modules --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + python -m pytest tests/ --doctest-modules --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 diff --git a/.gitignore b/.gitignore index 3736894d..99638bcf 100644 --- a/.gitignore +++ b/.gitignore @@ -158,4 +158,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -notebooks/ \ No newline at end of file +notebooks/ +junit/ \ No newline at end of file From c27597f6c2ffbe7e7519cb8d43f8e40b656717ea Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Thu, 27 Jul 2023 19:51:24 -0400 Subject: [PATCH 13/74] simplify aggregate_strings logic with itertools --- npc_gzip/aggregations.py | 53 +++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/npc_gzip/aggregations.py b/npc_gzip/aggregations.py index 272f8547..efb92b30 100644 --- a/npc_gzip/aggregations.py +++ b/npc_gzip/aggregations.py @@ -1,23 +1,24 @@ import numpy as np - +import itertools from npc_gzip.exceptions import StringTooShortException -def concatenate_with_space(t1: str, t2: str) -> str: +def concatenate_with_space(stringa: str, stringb: str) -> str: """ - Combines `t1` and `t2` with a space. - - (formerly agg_by_concat_space) + Combines `stringa` and `stringb` with a space. Arguments: - t1 (str): First item. - t2 (str): Second item. + stringa (str): First item. + stringb (str): Second item. Returns: - str: `{t1} {t2}` + str: `{stringa} {stringb}` """ - return t1 + " " + t2 + stringa = str(stringa) + stringb = str(stringb) + + return stringa + " " + stringb def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> str: @@ -36,26 +37,22 @@ def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> str: combination of stringa and stringb """ - lista = list(stringa) - listb = list(stringb) - combined = [] - minimum_list_size = min([len(lista), len(listb)]) - i = None - for i in range(0, minimum_list_size - 1, 2): - combined.append(lista[i]) - combined.append(listb[i + 1]) - - if i is None: - raise StringTooShortException(t1, t2, "aggregate_strings") - - if len(lista) > len(listb): - combined += lista[i:] - + stringa = str(stringa) + stringb = str(stringb) + + stringa_list: list = stringa.split() + stringb_list: list = stringb.split() + + zipped_lists: list = list(zip(stringa_list, stringb_list)) + out: list = list( + itertools.chain(*zipped_lists) + ) + + aggregated: str = ' '.join(out) if by_character: - return "".join(combined) - - return " ".join(combined) - + aggregated: str = ''.join(out) + + return aggregated def average(array_a: np.ndarray, array_b: np.ndarray) -> np.ndarray: """ From 419d54536e4e2a170c1e6b230e4711d6d471724c Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Thu, 27 Jul 2023 19:51:32 -0400 Subject: [PATCH 14/74] add test cases for aggregations --- tests/test_aggregations.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tests/test_aggregations.py diff --git a/tests/test_aggregations.py b/tests/test_aggregations.py new file mode 100644 index 00000000..d1b8ca04 --- /dev/null +++ b/tests/test_aggregations.py @@ -0,0 +1,34 @@ +import pytest +from npc_gzip.exceptions import StringTooShortException +from npc_gzip.aggregations import ( + concatenate_with_space, + aggregate_strings, + average, + min_max_aggregation, + stack +) + +class TestAggregations: + + stringa: str = 'hey there how are you?' + stringb: str = 'I am just hanging out!' + + def test_concatenate_with_space(self): + out = concatenate_with_space(self.stringa, self.stringb) + + assert len(out) == len(self.stringa) + len(self.stringb) + 1 + assert out == f'{self.stringa} {self.stringb}' + + def test_aggregate_strings(self): + out = aggregate_strings( + self.stringa, self.stringb, by_character=False + ) + assert len(out) == len(self.stringa) + len(self.stringb) + 1 + + def test_aggregate_strings_by_character(self): + out = aggregate_strings( + self.stringa, self.stringb, by_character=True + ) + total_length = len(''.join(self.stringa.split())) + total_length += len(''.join(self.stringb.split())) + assert len(out) == total_length From 14b5c226e54da12ea414386e4120096596375c15 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Thu, 27 Jul 2023 20:33:10 -0400 Subject: [PATCH 15/74] remove unused StringTooShortException --- npc_gzip/aggregations.py | 1 - 1 file changed, 1 deletion(-) diff --git a/npc_gzip/aggregations.py b/npc_gzip/aggregations.py index efb92b30..94b4ebef 100644 --- a/npc_gzip/aggregations.py +++ b/npc_gzip/aggregations.py @@ -1,6 +1,5 @@ import numpy as np import itertools -from npc_gzip.exceptions import StringTooShortException def concatenate_with_space(stringa: str, stringb: str) -> str: From f7d4d54d577a8a24a307778979ec2ff5438d00a8 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 02:02:19 -0400 Subject: [PATCH 16/74] formatting --- npc_gzip/aggregations.py | 17 +++--- npc_gzip/compressors/base.py | 4 +- npc_gzip/distance.py | 35 ++++++++----- tests/test_aggregations.py | 25 ++++----- tests/test_bz2_compressor.py | 1 + tests/test_distance.py | 98 ++++++----------------------------- tests/test_gzip_compressor.py | 1 + tests/test_lzma_compressor.py | 1 + 8 files changed, 65 insertions(+), 117 deletions(-) diff --git a/npc_gzip/aggregations.py b/npc_gzip/aggregations.py index 94b4ebef..83c79cd7 100644 --- a/npc_gzip/aggregations.py +++ b/npc_gzip/aggregations.py @@ -16,7 +16,7 @@ def concatenate_with_space(stringa: str, stringb: str) -> str: stringa = str(stringa) stringb = str(stringb) - + return stringa + " " + stringb @@ -38,21 +38,20 @@ def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> stringa = str(stringa) stringb = str(stringb) - + stringa_list: list = stringa.split() stringb_list: list = stringb.split() zipped_lists: list = list(zip(stringa_list, stringb_list)) - out: list = list( - itertools.chain(*zipped_lists) - ) - - aggregated: str = ' '.join(out) + out: list = list(itertools.chain(*zipped_lists)) + + aggregated: str = " ".join(out) if by_character: - aggregated: str = ''.join(out) - + aggregated: str = "".join(out) + return aggregated + def average(array_a: np.ndarray, array_b: np.ndarray) -> np.ndarray: """ Calculates the average of `array_a` and `array_b`, diff --git a/npc_gzip/compressors/base.py b/npc_gzip/compressors/base.py index c0564f7e..a4993888 100644 --- a/npc_gzip/compressors/base.py +++ b/npc_gzip/compressors/base.py @@ -14,7 +14,9 @@ class BaseCompressor: def __init__(self, compressor: ModuleType): if not isinstance(compressor, (ModuleType, BaseCompressor)): - raise InvalidCompressorException(f"Not a function passed: {type(compressor)}") + raise InvalidCompressorException( + f"Not a function passed: {type(compressor)}" + ) self.compressor = compressor def _open_file(self, filepath: str, as_bytes: bool = False): diff --git a/npc_gzip/distance.py b/npc_gzip/distance.py index d5c2784a..2bc5938d 100644 --- a/npc_gzip/distance.py +++ b/npc_gzip/distance.py @@ -25,16 +25,15 @@ def __init__( if not isinstance(compressed_values_ab, np.ndarray): compressed_values_ab = np.array(compressed_values_ab) - compressed_values_a = compressed_values_a.reshape(-1) - compressed_values_b = compressed_values_b.reshape(-1) - compressed_values_ab = compressed_values_ab.reshape(-1) + compressed_values_a = compressed_values_a + compressed_values_b = compressed_values_b + compressed_values_ab = compressed_values_ab if ( compressed_values_a.shape == compressed_values_b.shape == compressed_values_ab.shape ): - self.compressed_values_a = compressed_values_a self.compressed_values_b = compressed_values_b @@ -135,51 +134,63 @@ def ncd(self) -> np.ndarray: """ A numpy vectorized form of self._ncd. - """ - return np.vectorize(self._ncd)( + + out = np.vectorize(self._ncd)( self.compressed_values_a, self.compressed_values_b, self.compressed_values_ab, ) + out = out.reshape(self.compressed_values_a.shape) + + return out @property def cdm(self) -> np.ndarray: """ A numpy vectorized form of self._cdm. - """ - return np.vectorize(self._cdm)( + + out = np.vectorize(self._cdm)( self.compressed_values_a, self.compressed_values_b, self.compressed_values_ab, ) + out = out.reshape(self.compressed_values_a.shape) + + return out @property def clm(self) -> np.ndarray: """ A numpy vectorized form of self._clm. - """ - return np.vectorize(self._clm)( + + out = np.vectorize(self._clm)( self.compressed_values_a, self.compressed_values_b, self.compressed_values_ab, ) + out = out.reshape(self.compressed_values_a.shape) + + return out @property def mse(self) -> np.ndarray: """ A numpy vectorized form of self._mse. - """ - return np.vectorize(self._mse)( + + out = np.vectorize(self._mse)( self.compressed_values_a, self.compressed_values_b, ) + out = out.reshape(self.compressed_values_a.shape) + + return out if __name__ == "__main__": diff --git a/tests/test_aggregations.py b/tests/test_aggregations.py index d1b8ca04..db38fa8d 100644 --- a/tests/test_aggregations.py +++ b/tests/test_aggregations.py @@ -5,30 +5,27 @@ aggregate_strings, average, min_max_aggregation, - stack + stack, ) + class TestAggregations: - stringa: str = 'hey there how are you?' - stringb: str = 'I am just hanging out!' + stringa: str = "hey there how are you?" + stringb: str = "I am just hanging out!" def test_concatenate_with_space(self): out = concatenate_with_space(self.stringa, self.stringb) - + assert len(out) == len(self.stringa) + len(self.stringb) + 1 - assert out == f'{self.stringa} {self.stringb}' - + assert out == f"{self.stringa} {self.stringb}" + def test_aggregate_strings(self): - out = aggregate_strings( - self.stringa, self.stringb, by_character=False - ) + out = aggregate_strings(self.stringa, self.stringb, by_character=False) assert len(out) == len(self.stringa) + len(self.stringb) + 1 def test_aggregate_strings_by_character(self): - out = aggregate_strings( - self.stringa, self.stringb, by_character=True - ) - total_length = len(''.join(self.stringa.split())) - total_length += len(''.join(self.stringb.split())) + out = aggregate_strings(self.stringa, self.stringb, by_character=True) + total_length = len("".join(self.stringa.split())) + total_length += len("".join(self.stringb.split())) assert len(out) == total_length diff --git a/tests/test_bz2_compressor.py b/tests/test_bz2_compressor.py index b75f8575..cbb7af2d 100644 --- a/tests/test_bz2_compressor.py +++ b/tests/test_bz2_compressor.py @@ -8,6 +8,7 @@ import bz2 from npc_gzip.compressors.bz2_compressor import Bz2Compressor + class TestBz2Compressor: base_compressor = BaseCompressor(bz2) diff --git a/tests/test_distance.py b/tests/test_distance.py index 8875e932..96eb7481 100644 --- a/tests/test_distance.py +++ b/tests/test_distance.py @@ -5,6 +5,7 @@ from npc_gzip.distance import Distance from npc_gzip.exceptions import InvalidShapeException, CompressedValuesEqualZero + class TestDistance: array_a = np.random.rand(3, 10) @@ -16,21 +17,13 @@ class TestDistance: float_ab = random.random() def test_types(self): - distance = Distance( - self.array_a, - self.array_b, - self.array_ab - ) + distance = Distance(self.array_a, self.array_b, self.array_ab) assert isinstance(distance.compressed_values_a, np.ndarray) assert isinstance(distance.compressed_values_b, np.ndarray) assert isinstance(distance.compressed_values_ab, np.ndarray) - distance = Distance( - self.float_a, - self.float_b, - self.float_ab - ) + distance = Distance(self.float_a, self.float_b, self.float_ab) assert isinstance(distance.compressed_values_a, np.ndarray) assert isinstance(distance.compressed_values_b, np.ndarray) assert isinstance(distance.compressed_values_ab, np.ndarray) @@ -42,18 +35,10 @@ def test_invalid_shapes(self): assert invalid_shape_array.shape != self.array_a.shape with pytest.raises(InvalidShapeException): - Distance( - self.array_a, - self.array_b, - invalid_shape_array - ) + Distance(self.array_a, self.array_b, invalid_shape_array) def test__ncd(self): - distance = Distance( - self.float_a, - self.float_b, - self.float_ab - ) + distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance._ncd(self.float_a, self.float_b, self.float_ab) assert isinstance(out, float) @@ -63,13 +48,9 @@ def test__ncd(self): with pytest.raises(ValueError): out = distance._ncd(self.array_a, self.array_b, self.array_ab) - + def test__cdm(self): - distance = Distance( - self.float_a, - self.float_b, - self.float_ab - ) + distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance._cdm(self.float_a, self.float_b, self.float_ab) assert isinstance(out, float) @@ -81,11 +62,7 @@ def test__cdm(self): out = distance._ncd(self.array_a, self.array_b, self.array_ab) def test__clm(self): - distance = Distance( - self.float_a, - self.float_b, - self.float_ab - ) + distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance._clm(self.float_a, self.float_b, self.float_ab) assert isinstance(out, float) @@ -97,11 +74,7 @@ def test__clm(self): out = distance._ncd(self.array_a, self.array_b, self.array_ab) def test__mse(self): - distance = Distance( - self.float_a, - self.float_b, - self.float_ab - ) + distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance._mse(self.float_a, self.float_b) assert isinstance(out, float) @@ -110,74 +83,37 @@ def test__mse(self): assert isinstance(out, float) def test_ncd(self): - distance = Distance( - self.float_a, - self.float_b, - self.float_ab - ) + distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance.ncd assert isinstance(out, np.ndarray) - distance = Distance( - self.array_a, - self.array_b, - self.array_ab - ) + distance = Distance(self.array_a, self.array_b, self.array_ab) out = distance.ncd assert isinstance(out, np.ndarray) def test_cdm(self): - distance = Distance( - self.float_a, - self.float_b, - self.float_ab - ) + distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance.cdm assert isinstance(out, np.ndarray) - distance = Distance( - self.array_a, - self.array_b, - self.array_ab - ) + distance = Distance(self.array_a, self.array_b, self.array_ab) out = distance.cdm assert isinstance(out, np.ndarray) def test_clm(self): - distance = Distance( - self.float_a, - self.float_b, - self.float_ab - ) + distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance.clm assert isinstance(out, np.ndarray) - distance = Distance( - self.array_a, - self.array_b, - self.array_ab - ) + distance = Distance(self.array_a, self.array_b, self.array_ab) out = distance.clm assert isinstance(out, np.ndarray) def test_mse(self): - distance = Distance( - self.float_a, - self.float_b, - self.float_ab - ) + distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance.mse assert isinstance(out, np.ndarray) - distance = Distance( - self.array_a, - self.array_b, - self.array_ab - ) + distance = Distance(self.array_a, self.array_b, self.array_ab) out = distance.mse assert isinstance(out, np.ndarray) - - - - - diff --git a/tests/test_gzip_compressor.py b/tests/test_gzip_compressor.py index c4b7c754..f3068a35 100644 --- a/tests/test_gzip_compressor.py +++ b/tests/test_gzip_compressor.py @@ -8,6 +8,7 @@ import gzip from npc_gzip.compressors.gzip_compressor import GZipCompressor + class TestBz2Compressor: base_compressor = BaseCompressor(gzip) diff --git a/tests/test_lzma_compressor.py b/tests/test_lzma_compressor.py index 97926ff3..8d7431e4 100644 --- a/tests/test_lzma_compressor.py +++ b/tests/test_lzma_compressor.py @@ -8,6 +8,7 @@ import lzma from npc_gzip.compressors.lzma_compressor import LzmaCompressor + class TestBz2Compressor: base_compressor = BaseCompressor(lzma) From 83c2eef8a6cf5000c9f0e8daabd2d1cce661e970 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 02:02:31 -0400 Subject: [PATCH 17/74] add new exceptions for knn_compressor --- npc_gzip/exceptions.py | 93 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/npc_gzip/exceptions.py b/npc_gzip/exceptions.py index 17a8b67d..d78697fe 100644 --- a/npc_gzip/exceptions.py +++ b/npc_gzip/exceptions.py @@ -116,3 +116,96 @@ def __init__( if function_name is not None: self.message += f"function_name: {function_name}" super().__init__(self.message) + + +class UnsupportedDistanceMetricException(Exception): + def __init__( + self, + distance_metric: str, + supported_distance_metrics: list = None, + function_name: str = None, + ): + self.message = f""" + The `distance_metric` ({distance_metric}) provided is not + currently supported. Please submit an Issue and/or + Pull Request here to add support: + https://github.com/bazingagin/npc_gzip + + """ + + if supported_distance_metrics is not None: + self.message += ( + f"supported_distance_metrics: {supported_distance_metrics}\n" + ) + + if function_name is not None: + self.message += f"function_name: {function_name}" + super().__init__(self.message) + + +class InvalidObjectTypeException(Exception): + def __init__( + self, + passed_type: str, + supported_types: list = None, + function_name: str = None, + ): + self.message = f""" + The type passed ({passed_type}) provided is not + currently supported. + + """ + + if supported_types is not None: + self.message += f"supported types: {supported_types}\n" + + if function_name is not None: + self.message += f"function_name: {function_name}" + super().__init__(self.message) + + +class InputLabelEqualLengthException(Exception): + def __init__( + self, + training_samples: int, + label_samples: int, + function_name: str = None, + ): + self.message = f""" + If training labels are passed, the number + of training data samples must equal the + number of training label samples + + training_samples: {training_samples} + label_samples: {label_samples} + + """ + + if function_name is not None: + self.message += f"function_name: {function_name}" + super().__init__(self.message) + + +class UnsupportedDistanceMetricException(Exception): + def __init__( + self, + distance_metric: str, + supported_distance_metrics: list = None, + function_name: str = None, + ): + self.message = f""" + The `distance_metric` ({distance_metric}) provided is not + currently supported. Please submit an Issue and/or + Pull Request here to add support: + https://github.com/bazingagin/npc_gzip + + """ + + if supported_distance_metrics is not None: + self.message += ( + f"supported_distance_metrics: {supported_distance_metrics}\n" + ) + + if function_name is not None: + self.message += f"function_name: {function_name}" + super().__init__(self.message) From 41fd76d0fce8a7163d3b96c016704ffaa02cbb62 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 02:02:42 -0400 Subject: [PATCH 18/74] add core KNN Compressor --- npc_gzip/knn_compressor.py | 290 +++++++++++++++++++++++++++++++++++++ 1 file changed, 290 insertions(+) create mode 100644 npc_gzip/knn_compressor.py diff --git a/npc_gzip/knn_compressor.py b/npc_gzip/knn_compressor.py new file mode 100644 index 00000000..2ab09c9b --- /dev/null +++ b/npc_gzip/knn_compressor.py @@ -0,0 +1,290 @@ +from npc_gzip.compressors.base import BaseCompressor +from typing import Union, Sequence +import numpy as np +from npc_gzip.exceptions import ( + InputLabelEqualLengthException, + InvalidObjectTypeException, + UnsupportedDistanceMetricException, +) +from npc_gzip.distance import Distance + + +class KnnCompressor: + """ + Given the training input and optional + training labels data, this class stores + the data in memory + + """ + + def __init__( + self, + compressor: BaseCompressor, + training_inputs: Union[np.ndarray, Sequence[str]], + training_labels: Union[np.ndarray, Sequence[Union[str, int]]] = None, + distance_metric: str = "ncd", + ): + self.compressor = compressor + + if isinstance(training_inputs, list) and isinstance(training_labels, list): + if training_labels is not None: + if len(training_inputs) != len(training_labels): + raise InputLabelEqualLengthException( + len(training_inputs), + len(training_labels), + function_name="KnnGzip.__init__", + ) + + self.training_inputs: np.ndarray = np.array(training_inputs).reshape(-1) + self.training_labels: np.ndarray = np.array(training_labels).reshape(-1) + + elif isinstance(training_inputs, np.ndarray) or isinstance( + training_labels, np.ndarray + ): + + self.training_inputs: np.ndarray = ( + np.array(training_inputs) + if isinstance(training_inputs, list) + else training_inputs + ) + self.training_labels: np.ndarray = ( + np.array(training_labels) + if isinstance(training_labels, list) + else training_labels + ) + + self.training_inputs = self.training_inputs.reshape(-1) + self.training_labels = self.training_labels.reshape(-1) + + else: + raise InvalidObjectTypeException( + type(training_inputs), + supported_types=[list, np.ndarray], + function_name="KnnGzip.__init__", + ) + + assert ( + self.training_inputs.shape == self.training_labels.shape + ), f""" + Training Inputs and Labels did not maintain their + shape during the conversion from lists to numpy arrays. + This is most likely a bug in the numpy package: + + self.training_inputs.shape: {self.training_inputs.shape} + self.training_labels.shape: {self.training_labels.shape} + """ + + self.supported_distance_metrics: list = ["ncd", "clm", "cdm", "mse"] + + if distance_metric not in self.supported_distance_metrics: + raise UnsupportedDistanceMetricException( + distance_metric, + self.supported_distance_metrics, + function_name="KnnGzip.__init__", + ) + + self.distance_metric = distance_metric + + self.compressed_training_inputs: list = [ + self.compressor.get_compressed_length(data) for data in self.training_inputs + ] + self.compressed_training_inputs: np.ndarray = np.array( + self.compressed_training_inputs + ).reshape(-1) + + def _calculate_distance( + self, compressed_x: np.ndarray, compressed_combined: np.ndarray + ) -> np.ndarray: + """ + Helper function that converts the string representation + of `self.distance_metric` to the actual + `npc_gzip.distance.Distance.[distance_metric]`. Then + that distance metric is calculated using + `self.compressed_training_inputs`, `compressed_x` and + `compressed_combined`. + + Arguments: + compressed_x (np.ndarray): Numpy array representing the + compressed lengths of the input + data to be scored. + compressed_combined (np.ndarray): Numpy array representing the + compressed lengths of the input + data combined with + each training sample to be scored. + Returns: + np.ndarray: Numpy array of dtype=np.float32 containing the distance + metric. + """ + + distance = Distance( + np.resize(self.compressed_training_inputs, compressed_x.shape), + compressed_x, + compressed_combined, + ) + + if self.distance_metric == "ncd": + return distance.ncd + elif self.distance_metric == "clm": + return distance.clm + elif self.distance_metric == "cdm": + return distance.cdm + elif self.distance_metric == "mse": + return distance.mse + else: + return "Invalid Distance Metric" + + def _compress_sample(self, x: str) -> tuple: + """ + Helper method that compresses `x` against each + item in `self.training_inputs` and returns the + distance between each sample using the + self.distance_metric from the + `npc_gzip.distance.Distance` object. + + Arguments: + x (np.ndarray): The sample data to compare against + the `training_inputs`. + + Returns: + np.ndarray: Compressed length of `x` as an array of shape + [self.compressed_training_inputs.shape[0]]. + np.ndarray: Compressed length of the combination of `x` and + each training sample as an array of shape + [self.compressed_training_inputs.shape[0]]. + """ + + assert isinstance( + x, str + ), f"Non-string was passed to self._compress_sample: {x}" + x_compressed = self.compressor.get_compressed_length(x) + compressed_x: list = [ + x_compressed for _ in range(self.training_inputs.shape[0]) + ] + compressed_x: np.ndarray = np.array(compressed_x).reshape(-1) + assert compressed_x.shape == self.training_inputs.shape + + combined: list = [] + for training_sample in self.training_inputs: + + train_and_x: str = self.concatenate_with_space(training_sample, x) + combined_compressed: int = self.compressor.get_compressed_length( + train_and_x + ) + combined.append(combined_compressed) + + combined: np.ndarray = np.array(combined).reshape(-1) + assert self.training_inputs.shape == compressed_x.shape == combined.shape + + return (compressed_x, combined) + + @staticmethod + def concatenate_with_space(stringa: str, stringb: str) -> str: + """ + Combines `stringa` and `stringb` with a space. + + Arguments: + stringa (str): First item. + stringb (str): Second item. + + Returns: + str: `{stringa} {stringb}` + """ + + stringa = str(stringa) + stringb = str(stringb) + + return stringa + " " + stringb + + def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1): + """ + Given a test sample `x`, this method will + compare `x` against the training data provided + and will return the best matching label if + `training_labels` was passed during instantiation. + If `training_labels` was not passed during + instantiation, or if `top_k` is not None, the most + similar samples from `training_inputs` will be + returned. + + resulting array is [batch_size, self.training_inputs.shape[0] ] + + Arguments: + x (Union[np.ndarray, Sequence[str]]): The sample data to compare against + the `training_inputs`. + top_k (int): [Optional] If not None, the most similar + `k` number of samples will be returned. + `top_k` must be greater than zero. + [default: top_k=1] + + Returns: + np.ndarray: The `top_k` most similar `self.training_labels`. + np.ndarray: The `top_k` best matching samples + from `self.training_inputs`. + """ + + if not isinstance(x, np.ndarray): + x: np.ndarray = np.array(x) + + assert top_k > 0, f"top_k ({top_k}) must be greater than zero." + + x = x.reshape(-1) + assert ( + top_k <= x.shape[0] + ), f""" + top_k ({top_k}) must be less or equal to than the number of + samples provided to be predicted on ({x.shape[0]}) + + """ + + compressed_samples = [] + compressed_combined = [] + for sample in x: + compressed_sample, combined = self._compress_sample(sample) + compressed_samples.append(compressed_sample) + compressed_combined.append(combined) + + compressed_samples: np.ndarray = np.array(compressed_samples) # .reshape(-1) + compressed_combined: np.ndarray = np.array(compressed_combined) # .reshape(-1) + + distances: np.ndarray = self._calculate_distance( + compressed_samples, compressed_combined + ) + + # top matching training samples and labels by + # minimum distance. + + # get indicies of minimum top_k distances. + minimum_distance_indices = np.argpartition(distances, top_k)[:, :top_k] + + similar_samples = self.training_inputs[minimum_distance_indices] + labels = [] + if self.training_labels is not None: + labels = self.training_labels[minimum_distance_indices] + + return distances, labels, similar_samples + + +if __name__ == "__main__": + + import random + from npc_gzip.compressors.gzip_compressor import GZipCompressor + + training_data = ["hey", "hi", "how are you?", "not too bad"] + + training_labels = [random.randint(0, 1) for _ in range(len(training_data))] + assert len(training_data) == len(training_labels) + + model = KnnCompressor( + compressor=GZipCompressor(), + training_inputs=training_data, + training_labels=training_labels, + distance_metric='ncd' + ) + + test = np.array([ + 'hey', + 'you are a real pain in my ass', + 'go away please' + ]) + + distances, labels, similar_samples = model.predict(test, top_k=1) From 671544d22ff6cfba06d7d4a2f2edf20b2327fa66 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 16:23:26 -0400 Subject: [PATCH 19/74] add utils for test cases --- npc_gzip/__init__.py | 1 + npc_gzip/utils.py | 87 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 npc_gzip/utils.py diff --git a/npc_gzip/__init__.py b/npc_gzip/__init__.py index 08b4077f..e29214f7 100644 --- a/npc_gzip/__init__.py +++ b/npc_gzip/__init__.py @@ -1,3 +1,4 @@ from . import compressors from .distance import * from .exceptions import * +from .utils import * diff --git a/npc_gzip/utils.py b/npc_gzip/utils.py new file mode 100644 index 00000000..84cf5679 --- /dev/null +++ b/npc_gzip/utils.py @@ -0,0 +1,87 @@ +import random +import string + +from npc_gzip.exceptions import InvalidObjectTypeException + + +def generate_sentence(number_of_words: int = 10) -> str: + """ + Generates a sentence of random + numbers and letters, with + `number_of_words` words in the + sentence such that len(out.split()) \ + == `number_of_words`. + + Arguments: + number_of_words (int): The number of words you + want in the sentence. + + Returns: + str: Sentence of random numbers and letters. + """ + + if not isinstance(number_of_words, int): + if isinstance(number_of_words, float): + number_of_words = int(number_of_words) + else: + raise InvalidObjectTypeException( + type(number_of_words), + supported_types=[int, float], + function_name="generate_sentence", + ) + + assert number_of_words > 0, f"`number_of_words` must be greater than zero." + + words = [] + for word in range(number_of_words): + # initializing size of string + N = random.randint(1, 50) + + words.append( + "".join(random.choices(string.ascii_uppercase + string.digits, k=N)) + ) + + out = " ".join(words) + return out + + +def generate_dataset(number_of_sentences: int) -> list: + """ + Loops over `range(number_of_sentences)` that + utilizes `generate_sentence()` to generate + a dataset of randomly sized sentences. + + Arguments: + number_of_sentences (int): The number of + sentences you + want in your + dataset. + + Returns: + list: List of sentences (str). + """ + + if not isinstance(number_of_sentences, int): + if isinstance(number_of_sentences, float): + number_of_sentences = int(number_of_sentences) + else: + raise InvalidObjectTypeException( + type(number_of_sentences), + supported_types=[int, float], + function_name="generate_dataset", + ) + + assert number_of_sentences > 0, f"`number_of_sentences` must be greater than zero." + + dataset = [] + for sentence in range(number_of_sentences): + number_of_words = random.randint(1, 100) + dataset.append(generate_sentence(number_of_words)) + + return dataset + + +if __name__ == "__main__": + number_of_sentences = random.randint(1, 100) + dataset = generate_dataset(number_of_sentences) + assert len(dataset) == number_of_sentences From 2cf38b4d19de348660e9dd9305319bedda90442f Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 16:23:51 -0400 Subject: [PATCH 20/74] formatting --- npc_gzip/aggregations.py | 3 ++- npc_gzip/compressors/base.py | 2 -- npc_gzip/compressors/bz2_compressor.py | 1 - npc_gzip/compressors/gzip_compressor.py | 1 - npc_gzip/compressors/lzma_compressor.py | 1 - npc_gzip/distance.py | 9 +------ tests/test_aggregations.py | 6 ++--- tests/test_base_compressor.py | 3 --- tests/test_bz2_compressor.py | 8 ++----- tests/test_distance.py | 9 ++++--- tests/test_gzip_compressor.py | 8 ++----- tests/test_lzma_compressor.py | 8 ++----- tests/test_utils.py | 32 +++++++++++++++++++++++++ 13 files changed, 48 insertions(+), 43 deletions(-) create mode 100644 tests/test_utils.py diff --git a/npc_gzip/aggregations.py b/npc_gzip/aggregations.py index 83c79cd7..400f3df6 100644 --- a/npc_gzip/aggregations.py +++ b/npc_gzip/aggregations.py @@ -1,6 +1,7 @@ -import numpy as np import itertools +import numpy as np + def concatenate_with_space(stringa: str, stringb: str) -> str: """ diff --git a/npc_gzip/compressors/base.py b/npc_gzip/compressors/base.py index a4993888..095ff6e0 100644 --- a/npc_gzip/compressors/base.py +++ b/npc_gzip/compressors/base.py @@ -12,7 +12,6 @@ class BaseCompressor: """ def __init__(self, compressor: ModuleType): - if not isinstance(compressor, (ModuleType, BaseCompressor)): raise InvalidCompressorException( f"Not a function passed: {type(compressor)}" @@ -108,7 +107,6 @@ def get_bits_per_character(self, x: str) -> float: if __name__ == "__main__": - import gzip compressor = BaseCompressor(compressor=gzip) diff --git a/npc_gzip/compressors/bz2_compressor.py b/npc_gzip/compressors/bz2_compressor.py index bd84fee2..8a802772 100644 --- a/npc_gzip/compressors/bz2_compressor.py +++ b/npc_gzip/compressors/bz2_compressor.py @@ -21,7 +21,6 @@ def __init__(self): if __name__ == "__main__": - compressor = Bz2Compressor() example: str = "Hello there!" compressed_length: int = compressor.get_compressed_length(example) diff --git a/npc_gzip/compressors/gzip_compressor.py b/npc_gzip/compressors/gzip_compressor.py index b885853a..7b8f946d 100644 --- a/npc_gzip/compressors/gzip_compressor.py +++ b/npc_gzip/compressors/gzip_compressor.py @@ -20,7 +20,6 @@ def __init__(self): if __name__ == "__main__": - compressor = GZipCompressor() example: str = "Hello there!" compressed_length: int = compressor.get_compressed_length(example) diff --git a/npc_gzip/compressors/lzma_compressor.py b/npc_gzip/compressors/lzma_compressor.py index dd01b735..62883024 100644 --- a/npc_gzip/compressors/lzma_compressor.py +++ b/npc_gzip/compressors/lzma_compressor.py @@ -21,7 +21,6 @@ def __init__(self): if __name__ == "__main__": - compressor = LzmaCompressor() example: str = "Hello there!" compressed_length: int = compressor.get_compressed_length(example) diff --git a/npc_gzip/distance.py b/npc_gzip/distance.py index 2bc5938d..07efc1f0 100644 --- a/npc_gzip/distance.py +++ b/npc_gzip/distance.py @@ -2,10 +2,7 @@ import numpy as np -from npc_gzip.exceptions import ( - CompressedValuesEqualZero, - InvalidShapeException, -) +from npc_gzip.exceptions import CompressedValuesEqualZero, InvalidShapeException class Distance: @@ -15,7 +12,6 @@ def __init__( compressed_values_b: Union[list, np.ndarray], compressed_values_ab: Union[list, np.ndarray], ): - if not isinstance(compressed_values_a, np.ndarray): compressed_values_a = np.array(compressed_values_a) @@ -34,7 +30,6 @@ def __init__( == compressed_values_b.shape == compressed_values_ab.shape ): - self.compressed_values_a = compressed_values_a self.compressed_values_b = compressed_values_b self.compressed_values_ab = compressed_values_ab @@ -52,7 +47,6 @@ def _ncd( compressed_value_b: float, compressed_value_ab: float, ) -> float: - denominator = max(compressed_value_a, compressed_value_b) if denominator == 0: raise CompressedValuesEqualZero( @@ -69,7 +63,6 @@ def _cdm( compressed_value_b: float, compressed_value_ab: float, ) -> float: - denominator = compressed_value_a + compressed_value_b if denominator == 0: raise CompressedValuesEqualZero( diff --git a/tests/test_aggregations.py b/tests/test_aggregations.py index db38fa8d..ba5edfb3 100644 --- a/tests/test_aggregations.py +++ b/tests/test_aggregations.py @@ -1,16 +1,16 @@ import pytest -from npc_gzip.exceptions import StringTooShortException + from npc_gzip.aggregations import ( - concatenate_with_space, aggregate_strings, average, + concatenate_with_space, min_max_aggregation, stack, ) +from npc_gzip.exceptions import StringTooShortException class TestAggregations: - stringa: str = "hey there how are you?" stringb: str = "I am just hanging out!" diff --git a/tests/test_base_compressor.py b/tests/test_base_compressor.py index 6c0f315e..38670b7f 100644 --- a/tests/test_base_compressor.py +++ b/tests/test_base_compressor.py @@ -8,7 +8,6 @@ class TestBaseCompressor: - compressor = BaseCompressor(compressor=gzip) example_input = "hello there!" @@ -47,7 +46,6 @@ def test__compress(self): assert isinstance(out, bytes) def test_get_compressed_length(self): - example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 @@ -60,7 +58,6 @@ def test_get_compressed_length(self): assert out > 0 def test_get_bits_per_character(self): - example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) diff --git a/tests/test_bz2_compressor.py b/tests/test_bz2_compressor.py index cbb7af2d..30eb41d0 100644 --- a/tests/test_bz2_compressor.py +++ b/tests/test_bz2_compressor.py @@ -1,16 +1,14 @@ +import bz2 from types import ModuleType import pytest from npc_gzip.compressors.base import BaseCompressor -from npc_gzip.exceptions import InvalidCompressorException - -import bz2 from npc_gzip.compressors.bz2_compressor import Bz2Compressor +from npc_gzip.exceptions import InvalidCompressorException class TestBz2Compressor: - base_compressor = BaseCompressor(bz2) compressor = Bz2Compressor() example_input = "hello there!" @@ -27,7 +25,6 @@ def test__compress(self): assert isinstance(out, bytes) def test_get_compressed_length(self): - example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 @@ -40,7 +37,6 @@ def test_get_compressed_length(self): assert out > 0 def test_get_bits_per_character(self): - example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) diff --git a/tests/test_distance.py b/tests/test_distance.py index 96eb7481..53b18c79 100644 --- a/tests/test_distance.py +++ b/tests/test_distance.py @@ -1,13 +1,13 @@ -import pytest - import random + import numpy as np +import pytest + from npc_gzip.distance import Distance -from npc_gzip.exceptions import InvalidShapeException, CompressedValuesEqualZero +from npc_gzip.exceptions import CompressedValuesEqualZero, InvalidShapeException class TestDistance: - array_a = np.random.rand(3, 10) array_b = np.random.rand(3, 10) array_ab = np.random.rand(3, 10) @@ -29,7 +29,6 @@ def test_types(self): assert isinstance(distance.compressed_values_ab, np.ndarray) def test_invalid_shapes(self): - array_a_shape = self.array_a.shape invalid_shape_array = np.random.rand(array_a_shape[0] + 1, array_a_shape[1] + 1) assert invalid_shape_array.shape != self.array_a.shape diff --git a/tests/test_gzip_compressor.py b/tests/test_gzip_compressor.py index f3068a35..d10f8394 100644 --- a/tests/test_gzip_compressor.py +++ b/tests/test_gzip_compressor.py @@ -1,16 +1,14 @@ +import gzip from types import ModuleType import pytest from npc_gzip.compressors.base import BaseCompressor -from npc_gzip.exceptions import InvalidCompressorException - -import gzip from npc_gzip.compressors.gzip_compressor import GZipCompressor +from npc_gzip.exceptions import InvalidCompressorException class TestBz2Compressor: - base_compressor = BaseCompressor(gzip) compressor = GZipCompressor() example_input = "hello there!" @@ -27,7 +25,6 @@ def test__compress(self): assert isinstance(out, bytes) def test_get_compressed_length(self): - example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 @@ -40,7 +37,6 @@ def test_get_compressed_length(self): assert out > 0 def test_get_bits_per_character(self): - example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) diff --git a/tests/test_lzma_compressor.py b/tests/test_lzma_compressor.py index 8d7431e4..3fe1e8ba 100644 --- a/tests/test_lzma_compressor.py +++ b/tests/test_lzma_compressor.py @@ -1,16 +1,14 @@ +import lzma from types import ModuleType import pytest from npc_gzip.compressors.base import BaseCompressor -from npc_gzip.exceptions import InvalidCompressorException - -import lzma from npc_gzip.compressors.lzma_compressor import LzmaCompressor +from npc_gzip.exceptions import InvalidCompressorException class TestBz2Compressor: - base_compressor = BaseCompressor(lzma) compressor = LzmaCompressor() example_input = "hello there!" @@ -27,7 +25,6 @@ def test__compress(self): assert isinstance(out, bytes) def test_get_compressed_length(self): - example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 @@ -40,7 +37,6 @@ def test_get_compressed_length(self): assert out > 0 def test_get_bits_per_character(self): - example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..96bef109 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,32 @@ +import random + +import pytest + +from npc_gzip.exceptions import InvalidObjectTypeException +from npc_gzip.utils import generate_dataset, generate_sentence + + +class TestUtils: + def test_generate_sentence(self): + for _ in range(100): + number_of_words = random.randint(1, 100) + sentence = generate_sentence(number_of_words) + assert len(sentence.split()) == number_of_words + + with pytest.raises(InvalidObjectTypeException): + generate_sentence("hey there") + + with pytest.raises(AssertionError): + generate_sentence(-1) + + def test_generate_dataset(self): + for _ in range(100): + number_of_sentences = random.randint(1, 100) + dataset = generate_dataset(number_of_sentences) + assert len(dataset) == number_of_sentences + + with pytest.raises(InvalidObjectTypeException): + generate_dataset("hey there") + + with pytest.raises(AssertionError): + generate_dataset(-1) From 9939ed416ef150262df8599a7a5c682d8a7ab7f5 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 16:24:03 -0400 Subject: [PATCH 21/74] add knn compressor and tests --- npc_gzip/knn_compressor.py | 33 ++++---- tests/test_knn_compressor.py | 149 +++++++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+), 17 deletions(-) create mode 100644 tests/test_knn_compressor.py diff --git a/npc_gzip/knn_compressor.py b/npc_gzip/knn_compressor.py index 2ab09c9b..4a2ecdb5 100644 --- a/npc_gzip/knn_compressor.py +++ b/npc_gzip/knn_compressor.py @@ -1,12 +1,14 @@ -from npc_gzip.compressors.base import BaseCompressor -from typing import Union, Sequence +from typing import Sequence, Union + import numpy as np + +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.distance import Distance from npc_gzip.exceptions import ( InputLabelEqualLengthException, InvalidObjectTypeException, UnsupportedDistanceMetricException, ) -from npc_gzip.distance import Distance class KnnCompressor: @@ -41,7 +43,6 @@ def __init__( elif isinstance(training_inputs, np.ndarray) or isinstance( training_labels, np.ndarray ): - self.training_inputs: np.ndarray = ( np.array(training_inputs) if isinstance(training_inputs, list) @@ -112,8 +113,7 @@ def _calculate_distance( data combined with each training sample to be scored. Returns: - np.ndarray: Numpy array of dtype=np.float32 containing the distance - metric. + np.ndarray: Numpy array containing the distance metric. """ distance = Distance( @@ -165,7 +165,6 @@ def _compress_sample(self, x: str) -> tuple: combined: list = [] for training_sample in self.training_inputs: - train_and_x: str = self.concatenate_with_space(training_sample, x) combined_compressed: int = self.compressor.get_compressed_length( train_and_x @@ -243,8 +242,8 @@ def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1): compressed_samples.append(compressed_sample) compressed_combined.append(combined) - compressed_samples: np.ndarray = np.array(compressed_samples) # .reshape(-1) - compressed_combined: np.ndarray = np.array(compressed_combined) # .reshape(-1) + compressed_samples: np.ndarray = np.array(compressed_samples) + compressed_combined: np.ndarray = np.array(compressed_combined) distances: np.ndarray = self._calculate_distance( compressed_samples, compressed_combined @@ -265,8 +264,8 @@ def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1): if __name__ == "__main__": - import random + from npc_gzip.compressors.gzip_compressor import GZipCompressor training_data = ["hey", "hi", "how are you?", "not too bad"] @@ -278,13 +277,13 @@ def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1): compressor=GZipCompressor(), training_inputs=training_data, training_labels=training_labels, - distance_metric='ncd' + distance_metric="ncd", ) - test = np.array([ - 'hey', - 'you are a real pain in my ass', - 'go away please' - ]) + test = np.array(["hey", "you are a real pain in my ass", "go away please"]) - distances, labels, similar_samples = model.predict(test, top_k=1) + top_k = 1 + distances, labels, similar_samples = model.predict(test, top_k=top_k) + assert distances.shape == (test.shape[0], len(training_data)) + assert labels.shape == similar_samples.shape + assert distances.shape[0] == labels.shape[0] == similar_samples.shape[0] diff --git a/tests/test_knn_compressor.py b/tests/test_knn_compressor.py new file mode 100644 index 00000000..a9e5994c --- /dev/null +++ b/tests/test_knn_compressor.py @@ -0,0 +1,149 @@ +import random + +import numpy as np +import pytest + +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.compressors.bz2_compressor import Bz2Compressor +from npc_gzip.compressors.gzip_compressor import GZipCompressor +from npc_gzip.compressors.lzma_compressor import LzmaCompressor +from npc_gzip.distance import Distance +from npc_gzip.exceptions import ( + InputLabelEqualLengthException, + InvalidObjectTypeException, + UnsupportedDistanceMetricException, +) +from npc_gzip.knn_compressor import KnnCompressor +from npc_gzip.utils import generate_dataset + + +class TestKnnCompressor: + gzip_compressor: BaseCompressor = GZipCompressor() + bz2_compressor: BaseCompressor = Bz2Compressor() + lzma_compressor: BaseCompressor = LzmaCompressor() + + dataset_size: int = 100 + training_dataset: list = generate_dataset(dataset_size) + training_labels: list = [random.randint(0, 10) for _ in range(dataset_size)] + + model = KnnCompressor( + compressor=gzip_compressor, + training_inputs=training_dataset, + training_labels=training_labels, + distance_metric="ncd", + ) + + sample_input: str = "hello there" + + def test_init(self): + assert self.model.distance_metric == "ncd" + assert isinstance(self.model.training_inputs, np.ndarray) + assert isinstance(self.model.compressed_training_inputs, np.ndarray) + + assert ( + self.model.training_inputs.shape[0] + == self.model.training_labels.shape[0] + == self.model.compressed_training_inputs.shape[0] + ) + + assert self.model.supported_distance_metrics == ["ncd", "clm", "cdm", "mse"] + + model = KnnCompressor( + compressor=self.gzip_compressor, + training_inputs=np.array(self.training_dataset), + training_labels=np.array(self.training_labels), + distance_metric="ncd", + ) + + assert isinstance(model.training_inputs, np.ndarray) + assert isinstance(model.compressed_training_inputs, np.ndarray) + assert ( + model.training_inputs.shape[0] + == model.training_labels.shape[0] + == model.compressed_training_inputs.shape[0] + ) + + def test_invalid_metric(self): + with pytest.raises(UnsupportedDistanceMetricException): + model = KnnCompressor( + compressor=self.gzip_compressor, + training_inputs=np.array(self.training_dataset), + training_labels=np.array(self.training_labels), + distance_metric="hoopla", + ) + + def test_invalid_data_and_label_size(self): + training_data_size = 10 + training_label_size = training_data_size - 1 + + dataset = generate_dataset(training_data_size) + labels = [random.randint(0, 10) for _ in range(training_label_size)] + + with pytest.raises(InputLabelEqualLengthException): + model = KnnCompressor( + compressor=self.gzip_compressor, + training_inputs=dataset, + training_labels=labels, + distance_metric="ncd", + ) + + def test__compress_sample(self): + compressed_x, compressed_combined = self.model._compress_sample( + self.sample_input + ) + + assert isinstance(compressed_x, np.ndarray) + assert isinstance(compressed_combined, np.ndarray) + assert compressed_x.shape == compressed_combined.shape + + def test__calculate_distance(self): + compressed_x, compressed_combined = self.model._compress_sample( + self.sample_input + ) + distance = self.model._calculate_distance(compressed_x, compressed_combined) + + assert isinstance(distance, np.ndarray) + assert distance.shape == compressed_x.shape == compressed_combined.shape + + def test_concatenate_with_space(self): + a = "hello there" + b = "who goes there?" + assert self.model.concatenate_with_space(a, b) == "hello there who goes there?" + + def test_predict(self): + top_k = 1 + (distance, labels, similar_samples) = self.model.predict( + self.sample_input, top_k + ) + + assert distance.shape == ( + len([self.sample_input]), + self.model.training_inputs.shape[0], + ) + assert labels.shape == (top_k, len([self.sample_input])) + assert similar_samples.shape == (top_k, len([self.sample_input])) + + test_set_size = random.randint(1, 50) + test_set = [self.sample_input for _ in range(test_set_size)] + top_k = 2 + (distance, labels, similar_samples) = self.model.predict(test_set, top_k) + + assert distance.shape == (test_set_size, self.model.training_inputs.shape[0]) + assert labels.shape == (test_set_size, top_k) + assert similar_samples.shape == (test_set_size, top_k) + + def test_negative_top_k(self): + test_set_size = random.randint(1, 50) + test_set = [self.sample_input for _ in range(test_set_size)] + top_k = -1 + + with pytest.raises(AssertionError): + (distance, labels, similar_samples) = self.model.predict(test_set, top_k) + + def test_top_k_bigger_than_test_set(self): + + test_set_size = random.randint(1, 10) + test_set = [self.sample_input for _ in range(test_set_size)] + top_k = test_set_size + 1 + with pytest.raises(AssertionError): + (distance, labels, similar_samples) = self.model.predict(test_set, top_k) From 3241ef6edef48b36fe045a0c0a48d6ee39a1896d Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 18:15:33 -0400 Subject: [PATCH 22/74] formatting --- npc_gzip/knn_compressor.py | 2 ++ tests/test_knn_compressor.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/npc_gzip/knn_compressor.py b/npc_gzip/knn_compressor.py index 4a2ecdb5..18cce689 100644 --- a/npc_gzip/knn_compressor.py +++ b/npc_gzip/knn_compressor.py @@ -216,6 +216,8 @@ def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1): [default: top_k=1] Returns: + np.ndarray: The distance-metrics matrix computed on the test + set. np.ndarray: The `top_k` most similar `self.training_labels`. np.ndarray: The `top_k` best matching samples from `self.training_inputs`. diff --git a/tests/test_knn_compressor.py b/tests/test_knn_compressor.py index a9e5994c..a0060207 100644 --- a/tests/test_knn_compressor.py +++ b/tests/test_knn_compressor.py @@ -141,7 +141,6 @@ def test_negative_top_k(self): (distance, labels, similar_samples) = self.model.predict(test_set, top_k) def test_top_k_bigger_than_test_set(self): - test_set_size = random.randint(1, 10) test_set = [self.sample_input for _ in range(test_set_size)] top_k = test_set_size + 1 From e27c92719f431aea625639018d7a8e6818248db0 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 18:15:42 -0400 Subject: [PATCH 23/74] don't commit .npy files --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 99638bcf..213eba84 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,5 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ notebooks/ -junit/ \ No newline at end of file +junit/ +*.npy \ No newline at end of file From 7721f3dd7765134a44433f747f5dd09833adaafb Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 18:49:14 -0400 Subject: [PATCH 24/74] add torchtext to dev dependencies for examples --- poetry.lock | 454 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 + 2 files changed, 455 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 6c67272c..b35a8ad4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -46,6 +46,101 @@ d = ["aiohttp (>=3.7.4)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "certifi" +version = "2023.7.22" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, + {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.2.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.2.0.tar.gz", hash = "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-win32.whl", hash = "sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-win32.whl", hash = "sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-win32.whl", hash = "sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-win32.whl", hash = "sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-win32.whl", hash = "sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80"}, + {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, +] + [[package]] name = "click" version = "8.1.6" @@ -85,6 +180,32 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "filelock" +version = "3.12.2" +description = "A platform independent file lock." +optional = false +python-versions = ">=3.7" +files = [ + {file = "filelock-3.12.2-py3-none-any.whl", hash = "sha256:cbb791cdea2a72f23da6ac5b5269ab0a0d161e9ef0100e653b69049a7706d1ec"}, + {file = "filelock-3.12.2.tar.gz", hash = "sha256:002740518d8aa59a26b0c76e10fb8c6e15eae825d34b6fdf670333fd7b938d81"}, +] + +[package.extras] +docs = ["furo (>=2023.5.20)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "diff-cover (>=7.5)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"] + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -113,6 +234,99 @@ pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib" plugins = ["setuptools"] requirements-deprecated-finder = ["pip-api", "pipreqs"] +[[package]] +name = "jinja2" +version = "3.1.2" +description = "A very fast and expressive template engine." +optional = false +python-versions = ">=3.7" +files = [ + {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, + {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "markupsafe" +version = "2.1.3" +description = "Safely add untrusted strings to HTML/XML markup." +optional = false +python-versions = ">=3.7" +files = [ + {file = "MarkupSafe-2.1.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-win32.whl", hash = "sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431"}, + {file = "MarkupSafe-2.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, + {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-win32.whl", hash = "sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0"}, + {file = "MarkupSafe-2.1.3-cp37-cp37m-win_amd64.whl", hash = "sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-win32.whl", hash = "sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5"}, + {file = "MarkupSafe-2.1.3-cp38-cp38-win_amd64.whl", hash = "sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-win32.whl", hash = "sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2"}, + {file = "MarkupSafe-2.1.3-cp39-cp39-win_amd64.whl", hash = "sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba"}, + {file = "MarkupSafe-2.1.3.tar.gz", hash = "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad"}, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = false +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + [[package]] name = "mypy-extensions" version = "1.0.0" @@ -124,6 +338,24 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "numpy" version = "1.25.1" @@ -210,6 +442,25 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "portalocker" +version = "2.7.0" +description = "Wraps the portalocker recipe for easy usage" +optional = false +python-versions = ">=3.5" +files = [ + {file = "portalocker-2.7.0-py2.py3-none-any.whl", hash = "sha256:a07c5b4f3985c3cf4798369631fb7011adb498e2a46d8440efc75a8f29a0f983"}, + {file = "portalocker-2.7.0.tar.gz", hash = "sha256:032e81d534a88ec1736d03f780ba073f047a06c478b06e2937486f334e955c51"}, +] + +[package.dependencies] +pywin32 = {version = ">=226", markers = "platform_system == \"Windows\""} + +[package.extras] +docs = ["sphinx (>=1.7.1)"] +redis = ["redis"] +tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "pytest-timeout (>=2.1.0)", "redis", "sphinx (>=6.0.0)"] + [[package]] name = "pytest" version = "7.4.0" @@ -232,6 +483,64 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pywin32" +version = "306" +description = "Python for Window Extensions" +optional = false +python-versions = "*" +files = [ + {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, + {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, + {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, + {file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"}, + {file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"}, + {file = "pywin32-306-cp312-cp312-win32.whl", hash = "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"}, + {file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"}, + {file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"}, + {file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"}, + {file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"}, + {file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"}, + {file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"}, + {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, + {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, +] + +[[package]] +name = "requests" +version = "2.31.0" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.7" +files = [ + {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, + {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, + {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, +] + +[package.dependencies] +mpmath = ">=0.19" + [[package]] name = "tomli" version = "2.0.1" @@ -243,6 +552,132 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "torch" +version = "2.0.1" +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:8ced00b3ba471856b993822508f77c98f48a458623596a4c43136158781e306a"}, + {file = "torch-2.0.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:359bfaad94d1cda02ab775dc1cc386d585712329bb47b8741607ef6ef4950747"}, + {file = "torch-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:7c84e44d9002182edd859f3400deaa7410f5ec948a519cc7ef512c2f9b34d2c4"}, + {file = "torch-2.0.1-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:567f84d657edc5582d716900543e6e62353dbe275e61cdc36eda4929e46df9e7"}, + {file = "torch-2.0.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:787b5a78aa7917465e9b96399b883920c88a08f4eb63b5a5d2d1a16e27d2f89b"}, + {file = "torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e617b1d0abaf6ced02dbb9486803abfef0d581609b09641b34fa315c9c40766d"}, + {file = "torch-2.0.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:b6019b1de4978e96daa21d6a3ebb41e88a0b474898fe251fd96189587408873e"}, + {file = "torch-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:dbd68cbd1cd9da32fe5d294dd3411509b3d841baecb780b38b3b7b06c7754434"}, + {file = "torch-2.0.1-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:ef654427d91600129864644e35deea761fb1fe131710180b952a6f2e2207075e"}, + {file = "torch-2.0.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:25aa43ca80dcdf32f13da04c503ec7afdf8e77e3a0183dd85cd3e53b2842e527"}, + {file = "torch-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5ef3ea3d25441d3957348f7e99c7824d33798258a2bf5f0f0277cbcadad2e20d"}, + {file = "torch-2.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0882243755ff28895e8e6dc6bc26ebcf5aa0911ed81b2a12f241fc4b09075b13"}, + {file = "torch-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:f66aa6b9580a22b04d0af54fcd042f52406a8479e2b6a550e3d9f95963e168c8"}, + {file = "torch-2.0.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:1adb60d369f2650cac8e9a95b1d5758e25d526a34808f7448d0bd599e4ae9072"}, + {file = "torch-2.0.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:1bcffc16b89e296826b33b98db5166f990e3b72654a2b90673e817b16c50e32b"}, + {file = "torch-2.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:e10e1597f2175365285db1b24019eb6f04d53dcd626c735fc502f1e8b6be9875"}, + {file = "torch-2.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:423e0ae257b756bb45a4b49072046772d1ad0c592265c5080070e0767da4e490"}, + {file = "torch-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:8742bdc62946c93f75ff92da00e3803216c6cce9b132fbca69664ca38cfb3e18"}, + {file = "torch-2.0.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:c62df99352bd6ee5a5a8d1832452110435d178b5164de450831a3a8cc14dc680"}, + {file = "torch-2.0.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:671a2565e3f63b8fe8e42ae3e36ad249fe5e567435ea27b94edaa672a7d0c416"}, +] + +[package.dependencies] +filelock = "*" +jinja2 = "*" +networkx = "*" +sympy = "*" +typing-extensions = "*" + +[package.extras] +opt-einsum = ["opt-einsum (>=3.3)"] + +[[package]] +name = "torchdata" +version = "0.6.1" +description = "Composable data loading modules for PyTorch" +optional = false +python-versions = ">=3.8" +files = [ + {file = "torchdata-0.6.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:b97f60619764d5f2f62ebcf18bff8a7cfc5c84d30d04d30724b753ec95864a70"}, + {file = "torchdata-0.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ff46aa549c0da9aa2bcba95767034a81c950dc7ccc0e24f38cf6f7878ca1826d"}, + {file = "torchdata-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cae1b4a516124c00ef47db12bab0cc6898bd2d7e749ffb14b6ebf1fe610a6b46"}, + {file = "torchdata-0.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:f2a8b0a388753502d49bd5bd186cd8d41dc2d91121246617804578371591b5e6"}, + {file = "torchdata-0.6.1-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:6474ed86ed7e060698888fa8ece9155dc94bdb9dfcc1874fa6d7707823551adc"}, + {file = "torchdata-0.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:05db07d2793d181867578be99c211b3c71f83f3fda0bf33a70e5c93794513692"}, + {file = "torchdata-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a9d4c7d86b0aab2f70cdf0eae50f6eea997051889b0c04ae2f843df048ba9ea"}, + {file = "torchdata-0.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:3501325411063b0da6d85c93222234973780ca3df7d110ee1795723f9b2e3405"}, + {file = "torchdata-0.6.1-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:129d5b2d35717862b194b31a27b61b5faa9efd4ffc36c1a07d85b320dea8b4c6"}, + {file = "torchdata-0.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2752fd8452d42aeaf480985f5d3d82506ab1ce51ca42cdb61c981145cd2890a8"}, + {file = "torchdata-0.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08b745c14adea7e4a36de4241485f9ea058e3ad3959c3e6dd1526c2f278006e7"}, + {file = "torchdata-0.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:080c1c54128b4eb8e83cf4de74fa38e892cd6bf3114a557a853969543285f2d9"}, + {file = "torchdata-0.6.1-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:7e11108140adaba55491b6124f07b61cce161f53f8604e70961da025ccd014bd"}, + {file = "torchdata-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:42961c40bf9c3fb66aaf4189f9154cb4f2eb3862359e64267708e3d273452d68"}, + {file = "torchdata-0.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87d012adf90cd15d7164a886512b6746788477172394fe56971799d276f8809e"}, + {file = "torchdata-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:24d7fb2fcb0af43660fc44d8583cb8f7d25762bb759e8fde003db29c5036d940"}, + {file = "torchdata-0.6.1-py3-none-any.whl", hash = "sha256:6a6d51cfbb63efe65788ad71e84c2be23a0c6520869e075774e7fc2ee535b9ed"}, +] + +[package.dependencies] +requests = "*" +torch = "2.0.1" +urllib3 = ">=1.25" + +[[package]] +name = "torchtext" +version = "0.15.2" +description = "Text utilities and datasets for PyTorch" +optional = false +python-versions = ">=3.8" +files = [ + {file = "torchtext-0.15.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c60c2ec240d86c437478821891c0b167aa0b327f30047196f25da55b82f6785b"}, + {file = "torchtext-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fe1794f9b2a9f4923f87488783b06d6c683f332a1db60d17cf65ba3f7c724c56"}, + {file = "torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:d265a648722164043bd76e59a8d49a44a96a2f4f74e5e1f84801c047db9f20ce"}, + {file = "torchtext-0.15.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:8fe27a3e2c0ae42f07ff5cae0ea19c49b23d05aa3666bdbc1a262d7b26b8da0d"}, + {file = "torchtext-0.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:a2dfd26a35e35e85d934c5fb58bd867bf61e73ca086f1736f36dff3a22c6083a"}, + {file = "torchtext-0.15.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cbb03114513fa44cd6155f46c5341d4674b42cd86ac1e5123b3565a5295047c2"}, + {file = "torchtext-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fa6f9115e71471cd6060abf4899663719d268d5662950a65357894234fad74a7"}, + {file = "torchtext-0.15.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:9650a0184528cdce9201cc656575219b4539536a991d1bf86c8be8b424e58f53"}, + {file = "torchtext-0.15.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:f44fa2bdeaba751eca9ff77c28b374234b5281c7b1d40cea38f67a871e64bfb9"}, + {file = "torchtext-0.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:3818c39032150d2c787a07692d92163af7a52b90dab16f985bb45273728a4207"}, + {file = "torchtext-0.15.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:91edf72b1819432d3524bcd1ab8d9b649afa4d0ed1dd4df594252fa5d073078c"}, + {file = "torchtext-0.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1f58b4a94e17aada5e2712f2d48437fa8617a09d18160064b0df0c094ad2a3f2"}, + {file = "torchtext-0.15.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:826ea4726e421d2c9309b35f485e46e10cd0e9a7f2b26275be98ccf73c2691be"}, + {file = "torchtext-0.15.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:7e57d7b7b05d56d8d80be5fc190f68c428fe26f98893f77783f9a1e3876e36e1"}, + {file = "torchtext-0.15.2-cp38-cp38-win_amd64.whl", hash = "sha256:6935af64b2af54bd3f4be71ab0a3776f2074834adc2f44e132a9f0da5f69ba47"}, + {file = "torchtext-0.15.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d8065c4032bfe3558c56e9090ad0cf9c7a01f8800235a938feb41244226b9d4"}, + {file = "torchtext-0.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e8d4e81bfad9ee7e4bc66ad41f96878cf5e3c4e9e829de824081bd3a77bc7aa7"}, + {file = "torchtext-0.15.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:a87c582bddf809a62fd6ecc3bc92a20cb78dba055fcf43ddfce668c4be8352b4"}, + {file = "torchtext-0.15.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:f8d8487945b0ea3df2fedf8a9cb3e7c2165bdc2107900e82c8725d96312addd3"}, + {file = "torchtext-0.15.2-cp39-cp39-win_amd64.whl", hash = "sha256:0c7b4671df132751323d09fee91dcc59da380d5172449ba903f04d4646a906aa"}, +] + +[package.dependencies] +numpy = "*" +requests = "*" +torch = "2.0.1" +torchdata = "0.6.1" +tqdm = "*" + +[[package]] +name = "tqdm" +version = "4.65.0" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, + {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "typing-extensions" version = "4.7.1" @@ -254,7 +689,24 @@ files = [ {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] +[[package]] +name = "urllib3" +version = "2.0.4" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=3.7" +files = [ + {file = "urllib3-2.0.4-py3-none-any.whl", hash = "sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4"}, + {file = "urllib3-2.0.4.tar.gz", hash = "sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a4d722b628042e77d96061368f7fe7885a1307a42f56fba0e440e512d3e30d65" +content-hash = "a5574320c004bf2346fd5479b855fd7a956f03b2023e33fb5612d581747fe228" diff --git a/pyproject.toml b/pyproject.toml index 4c012fa2..c6f6c528 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,8 @@ numpy = "^1.25.1" [tool.poetry.group.dev.dependencies] black = "^23.7.0" isort = "^5.12.0" +torchtext = "^0.15.2" +portalocker = "^2.7.0" [tool.poetry.group.test.dependencies] From 97f3dc6bc84f0b243ea07f4a8832d84ee44766bd Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 18:49:22 -0400 Subject: [PATCH 25/74] add tqdm --- poetry.lock | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index b35a8ad4..99c9b05d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -709,4 +709,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a5574320c004bf2346fd5479b855fd7a956f03b2023e33fb5612d581747fe228" +content-hash = "2f9ca2a22605f7e818169ada7afac4da7523745e5725ad93e86263d829f2109e" diff --git a/pyproject.toml b/pyproject.toml index c6f6c528..e5443782 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ packages = [{include = "npc_gzip"}] [tool.poetry.dependencies] python = "^3.9" numpy = "^1.25.1" +tqdm = "^4.65.0" [tool.poetry.group.dev.dependencies] black = "^23.7.0" From 050ba6a9b9faabfff320265e153ccf3ca8e51538 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 18:52:42 -0400 Subject: [PATCH 26/74] add sampling_percentage to predict to speed up predictions --- npc_gzip/knn_compressor.py | 63 ++++++++++++++++++++++++------------ tests/test_knn_compressor.py | 11 +++++++ 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/npc_gzip/knn_compressor.py b/npc_gzip/knn_compressor.py index 18cce689..b6462794 100644 --- a/npc_gzip/knn_compressor.py +++ b/npc_gzip/knn_compressor.py @@ -9,7 +9,7 @@ InvalidObjectTypeException, UnsupportedDistanceMetricException, ) - +from tqdm import tqdm class KnnCompressor: """ @@ -94,7 +94,10 @@ def __init__( ).reshape(-1) def _calculate_distance( - self, compressed_x: np.ndarray, compressed_combined: np.ndarray + self, + compressed_x: np.ndarray, + compressed_combined: np.ndarray, + compressed_training: np.ndarray = None ) -> np.ndarray: """ Helper function that converts the string representation @@ -115,12 +118,19 @@ def _calculate_distance( Returns: np.ndarray: Numpy array containing the distance metric. """ - - distance = Distance( - np.resize(self.compressed_training_inputs, compressed_x.shape), - compressed_x, - compressed_combined, - ) + + if compressed_training is None: + distance = Distance( + np.resize(self.compressed_training_inputs, compressed_x.shape), + compressed_x, + compressed_combined, + ) + else: + distance = Distance( + np.resize(compressed_training, compressed_x.shape), + compressed_x, + compressed_combined, + ) if self.distance_metric == "ncd": return distance.ncd @@ -133,7 +143,7 @@ def _calculate_distance( else: return "Invalid Distance Metric" - def _compress_sample(self, x: str) -> tuple: + def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: """ Helper method that compresses `x` against each item in `self.training_inputs` and returns the @@ -156,15 +166,19 @@ def _compress_sample(self, x: str) -> tuple: assert isinstance( x, str ), f"Non-string was passed to self._compress_sample: {x}" + + sample_size: int = int(sampling_percentage * self.training_inputs.shape[0]) + training_inputs = np.random.choice(self.training_inputs, sample_size) + x_compressed = self.compressor.get_compressed_length(x) compressed_x: list = [ - x_compressed for _ in range(self.training_inputs.shape[0]) + x_compressed for _ in range(training_inputs.shape[0]) ] compressed_x: np.ndarray = np.array(compressed_x).reshape(-1) - assert compressed_x.shape == self.training_inputs.shape + assert compressed_x.shape == training_inputs.shape combined: list = [] - for training_sample in self.training_inputs: + for training_sample in training_inputs: train_and_x: str = self.concatenate_with_space(training_sample, x) combined_compressed: int = self.compressor.get_compressed_length( train_and_x @@ -172,7 +186,7 @@ def _compress_sample(self, x: str) -> tuple: combined.append(combined_compressed) combined: np.ndarray = np.array(combined).reshape(-1) - assert self.training_inputs.shape == compressed_x.shape == combined.shape + assert training_inputs.shape == compressed_x.shape == combined.shape return (compressed_x, combined) @@ -194,12 +208,14 @@ def concatenate_with_space(stringa: str, stringb: str) -> str: return stringa + " " + stringb - def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1): + def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1, sampling_percentage: float = 1.0): """ - Given a test sample `x`, this method will - compare `x` against the training data provided - and will return the best matching label if - `training_labels` was passed during instantiation. + Faster version of `predict`. This method + will compare `x` against a sample of the + training data provided and will return the + best matching label if `training_labels` was + passed during instantiation. + If `training_labels` was not passed during instantiation, or if `top_k` is not None, the most similar samples from `training_inputs` will be @@ -214,6 +230,8 @@ def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1): `k` number of samples will be returned. `top_k` must be greater than zero. [default: top_k=1] + sampling_percentage (float): (0.0, 1.0] % of `self.training_inputs` + to sample predictions against. Returns: np.ndarray: The distance-metrics matrix computed on the test @@ -239,16 +257,19 @@ def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1): compressed_samples = [] compressed_combined = [] - for sample in x: - compressed_sample, combined = self._compress_sample(sample) + for sample in tqdm(x, desc='Compressing input...'): + compressed_sample, combined = self._compress_sample(sample, sampling_percentage) compressed_samples.append(compressed_sample) compressed_combined.append(combined) compressed_samples: np.ndarray = np.array(compressed_samples) compressed_combined: np.ndarray = np.array(compressed_combined) + + compressed_training_size: int = int(self.compressed_training_inputs.shape[0] * sampling_percentage) + compressed_training = np.random.choice(self.compressed_training_inputs, compressed_training_size) distances: np.ndarray = self._calculate_distance( - compressed_samples, compressed_combined + compressed_samples, compressed_combined, compressed_training ) # top matching training samples and labels by diff --git a/tests/test_knn_compressor.py b/tests/test_knn_compressor.py index a0060207..f2b1b257 100644 --- a/tests/test_knn_compressor.py +++ b/tests/test_knn_compressor.py @@ -146,3 +146,14 @@ def test_top_k_bigger_than_test_set(self): top_k = test_set_size + 1 with pytest.raises(AssertionError): (distance, labels, similar_samples) = self.model.predict(test_set, top_k) + + def test_sampling_percentage(self): + + test_set_size = random.randint(1, 10) + test_set = [self.sample_input for _ in range(test_set_size)] + top_k = 1 + (full_distance, full_labels, full_similar_samples) = self.model.predict(test_set, top_k, sampling_percentage=1.0) + (distance, labels, similar_samples) = self.model.predict(test_set, top_k, sampling_percentage=0.1) + + assert full_labels.shape == labels.shape + assert full_similar_samples.shape == similar_samples.shape From e9520252ab63b4d34837da31b31b1a4e121a0580 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 19:06:17 -0400 Subject: [PATCH 27/74] add scikit-learn to dev dependencies for examples --- poetry.lock | 104 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 99c9b05d..59622c48 100644 --- a/poetry.lock +++ b/poetry.lock @@ -251,6 +251,17 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "joblib" +version = "1.3.1" +description = "Lightweight pipelining with Python functions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "joblib-1.3.1-py3-none-any.whl", hash = "sha256:89cf0529520e01b3de7ac7b74a8102c90d16d54c64b5dd98cafcd14307fdf915"}, + {file = "joblib-1.3.1.tar.gz", hash = "sha256:1f937906df65329ba98013dc9692fe22a4c5e4a648112de500508b18a21b41e3"}, +] + [[package]] name = "markupsafe" version = "2.1.3" @@ -527,6 +538,86 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "scikit-learn" +version = "1.3.0" +description = "A set of python modules for machine learning and data mining" +optional = false +python-versions = ">=3.8" +files = [ + {file = "scikit-learn-1.3.0.tar.gz", hash = "sha256:8be549886f5eda46436b6e555b0e4873b4f10aa21c07df45c4bc1735afbccd7a"}, + {file = "scikit_learn-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:981287869e576d42c682cf7ca96af0c6ac544ed9316328fd0d9292795c742cf5"}, + {file = "scikit_learn-1.3.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:436aaaae2c916ad16631142488e4c82f4296af2404f480e031d866863425d2a2"}, + {file = "scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7e28d8fa47a0b30ae1bd7a079519dd852764e31708a7804da6cb6f8b36e3630"}, + {file = "scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae80c08834a473d08a204d966982a62e11c976228d306a2648c575e3ead12111"}, + {file = "scikit_learn-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:552fd1b6ee22900cf1780d7386a554bb96949e9a359999177cf30211e6b20df6"}, + {file = "scikit_learn-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:79970a6d759eb00a62266a31e2637d07d2d28446fca8079cf9afa7c07b0427f8"}, + {file = "scikit_learn-1.3.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:850a00b559e636b23901aabbe79b73dc604b4e4248ba9e2d6e72f95063765603"}, + {file = "scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee04835fb016e8062ee9fe9074aef9b82e430504e420bff51e3e5fffe72750ca"}, + {file = "scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d953531f5d9f00c90c34fa3b7d7cfb43ecff4c605dac9e4255a20b114a27369"}, + {file = "scikit_learn-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:151ac2bf65ccf363664a689b8beafc9e6aae36263db114b4ca06fbbbf827444a"}, + {file = "scikit_learn-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a885a9edc9c0a341cab27ec4f8a6c58b35f3d449c9d2503a6fd23e06bbd4f6a"}, + {file = "scikit_learn-1.3.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:9877af9c6d1b15486e18a94101b742e9d0d2f343d35a634e337411ddb57783f3"}, + {file = "scikit_learn-1.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c470f53cea065ff3d588050955c492793bb50c19a92923490d18fcb637f6383a"}, + {file = "scikit_learn-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd6e2d7389542eae01077a1ee0318c4fec20c66c957f45c7aac0c6eb0fe3c612"}, + {file = "scikit_learn-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:3a11936adbc379a6061ea32fa03338d4ca7248d86dd507c81e13af428a5bc1db"}, + {file = "scikit_learn-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:998d38fcec96584deee1e79cd127469b3ad6fefd1ea6c2dfc54e8db367eb396b"}, + {file = "scikit_learn-1.3.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:ded35e810438a527e17623ac6deae3b360134345b7c598175ab7741720d7ffa7"}, + {file = "scikit_learn-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e8102d5036e28d08ab47166b48c8d5e5810704daecf3a476a4282d562be9a28"}, + {file = "scikit_learn-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7617164951c422747e7c32be4afa15d75ad8044f42e7d70d3e2e0429a50e6718"}, + {file = "scikit_learn-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:1d54fb9e6038284548072df22fd34777e434153f7ffac72c8596f2d6987110dd"}, +] + +[package.dependencies] +joblib = ">=1.1.1" +numpy = ">=1.17.3" +scipy = ">=1.5.0" +threadpoolctl = ">=2.0.0" + +[package.extras] +benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-gallery (>=0.10.1)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] +examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] +tests = ["black (>=23.3.0)", "matplotlib (>=3.1.3)", "mypy (>=1.3)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.0.272)", "scikit-image (>=0.16.2)"] + +[[package]] +name = "scipy" +version = "1.9.3" +description = "Fundamental algorithms for scientific computing in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "scipy-1.9.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1884b66a54887e21addf9c16fb588720a8309a57b2e258ae1c7986d4444d3bc0"}, + {file = "scipy-1.9.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:83b89e9586c62e787f5012e8475fbb12185bafb996a03257e9675cd73d3736dd"}, + {file = "scipy-1.9.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a72d885fa44247f92743fc20732ae55564ff2a519e8302fb7e18717c5355a8b"}, + {file = "scipy-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d01e1dd7b15bd2449c8bfc6b7cc67d630700ed655654f0dfcf121600bad205c9"}, + {file = "scipy-1.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:68239b6aa6f9c593da8be1509a05cb7f9efe98b80f43a5861cd24c7557e98523"}, + {file = "scipy-1.9.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b41bc822679ad1c9a5f023bc93f6d0543129ca0f37c1ce294dd9d386f0a21096"}, + {file = "scipy-1.9.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:90453d2b93ea82a9f434e4e1cba043e779ff67b92f7a0e85d05d286a3625df3c"}, + {file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c06e62a390a9167da60bedd4575a14c1f58ca9dfde59830fc42e5197283dab"}, + {file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abaf921531b5aeaafced90157db505e10345e45038c39e5d9b6c7922d68085cb"}, + {file = "scipy-1.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:06d2e1b4c491dc7d8eacea139a1b0b295f74e1a1a0f704c375028f8320d16e31"}, + {file = "scipy-1.9.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a04cd7d0d3eff6ea4719371cbc44df31411862b9646db617c99718ff68d4840"}, + {file = "scipy-1.9.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:545c83ffb518094d8c9d83cce216c0c32f8c04aaf28b92cc8283eda0685162d5"}, + {file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d54222d7a3ba6022fdf5773931b5d7c56efe41ede7f7128c7b1637700409108"}, + {file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cff3a5295234037e39500d35316a4c5794739433528310e117b8a9a0c76d20fc"}, + {file = "scipy-1.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:2318bef588acc7a574f5bfdff9c172d0b1bf2c8143d9582e05f878e580a3781e"}, + {file = "scipy-1.9.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d644a64e174c16cb4b2e41dfea6af722053e83d066da7343f333a54dae9bc31c"}, + {file = "scipy-1.9.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:da8245491d73ed0a994ed9c2e380fd058ce2fa8a18da204681f2fe1f57f98f95"}, + {file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4db5b30849606a95dcf519763dd3ab6fe9bd91df49eba517359e450a7d80ce2e"}, + {file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c68db6b290cbd4049012990d7fe71a2abd9ffbe82c0056ebe0f01df8be5436b0"}, + {file = "scipy-1.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:5b88e6d91ad9d59478fafe92a7c757d00c59e3bdc3331be8ada76a4f8d683f58"}, + {file = "scipy-1.9.3.tar.gz", hash = "sha256:fbc5c05c85c1a02be77b1ff591087c83bc44579c6d2bd9fb798bb64ea5e1a027"}, +] + +[package.dependencies] +numpy = ">=1.18.5,<1.26.0" + +[package.extras] +dev = ["flake8", "mypy", "pycodestyle", "typing_extensions"] +doc = ["matplotlib (>2)", "numpydoc", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-panels (>=0.5.2)", "sphinx-tabs"] +test = ["asv", "gmpy2", "mpmath", "pytest", "pytest-cov", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + [[package]] name = "sympy" version = "1.12" @@ -541,6 +632,17 @@ files = [ [package.dependencies] mpmath = ">=0.19" +[[package]] +name = "threadpoolctl" +version = "3.2.0" +description = "threadpoolctl" +optional = false +python-versions = ">=3.8" +files = [ + {file = "threadpoolctl-3.2.0-py3-none-any.whl", hash = "sha256:2b7818516e423bdaebb97c723f86a7c6b0a83d3f3b0970328d66f4d9104dc032"}, + {file = "threadpoolctl-3.2.0.tar.gz", hash = "sha256:c96a0ba3bdddeaca37dc4cc7344aafad41cdb8c313f74fdfe387a867bba93355"}, +] + [[package]] name = "tomli" version = "2.0.1" @@ -709,4 +811,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "2f9ca2a22605f7e818169ada7afac4da7523745e5725ad93e86263d829f2109e" +content-hash = "beaf1837c50c0400cfcd305eacd132f26513c427374115daf23e4ac0693b41ab" diff --git a/pyproject.toml b/pyproject.toml index e5443782..9405dbae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ black = "^23.7.0" isort = "^5.12.0" torchtext = "^0.15.2" portalocker = "^2.7.0" +scikit-learn = "^1.3.0" [tool.poetry.group.test.dependencies] From ca61d8d7cc0765672b8142dd7483bdf786741139 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 19:06:28 -0400 Subject: [PATCH 28/74] add imdb prediction example --- examples/imdb.py | 111 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 examples/imdb.py diff --git a/examples/imdb.py b/examples/imdb.py new file mode 100644 index 00000000..a2fa52f4 --- /dev/null +++ b/examples/imdb.py @@ -0,0 +1,111 @@ +import numpy as np +from sklearn.metrics import classification_report +from sklearn.utils import shuffle +from torchtext.datasets import IMDB + +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.compressors.gzip_compressor import GZipCompressor +from npc_gzip.knn_compressor import KnnCompressor + + +def get_data(): + """ + Pulls the IMDB sentiment analysis dataset + and returns two tuples the first being the + training data and the second being the test + data. Each tuple contains the text and label + respsectively as numpy arrays. + + """ + + train_iter, test_iter = IMDB(split=("train", "test")) + + train_text = [] + train_labels = [] + for label, text in train_iter: + train_labels.append(label) + train_text.append(text) + + test_text = [] + test_labels = [] + for label, text in test_iter: + test_labels.append(label) + test_text.append(text) + + train_text = np.array(train_text) + train_labels = np.array(train_labels) + + test_text = np.array(test_text) + test_labels = np.array(test_labels) + + train = (train_text, train_labels) + test = (test_text, test_labels) + + return (train, test) + + +def fit_model( + train_text: np.ndarray, train_labels: np.ndarray, distance_metric: str = "ncd" +): + """ + Fits a Knn-GZip compressor on the train + data and returns it. + + Arguments: + train_text (np.ndarray): Training dataset as a numpy array. + train_labels (np.ndarray): Training labels as a numpy array. + + Returns: + KnnCompressor: Trained Knn-Compressor model ready to make predictions. + """ + + compressor: BaseCompressor = GZipCompressor() + model: KnnCompressor = KnnCompressor( + compressor=compressor, + training_inputs=train_text, + training_labels=train_labels, + distance_metric=distance_metric, + ) + + return model + + +if __name__ == "__main__": + print(f"Fetching data...") + ((train_text, train_labels), (test_text, test_labels)) = get_data() + + print(f"Fitting model...") + model = fit_model(train_text, train_labels) + + # Randomly sampling from the test set. + # The IMDb test data comes in with all of the + # `1` labels first, then all of the `2` labels + # last, so we're shuffling so that our model + # has something to predict other than `1`. + + sample_test_text = np.random.choice(test_text, 1000) + sample_test_labels = np.random.choice(test_labels, 1000) + + print(f"Generating predictions...") + top_k = 1 + + # Here we use the `sampling_percentage` to save time + # at the expense of worse predictions. This + # `sampling_percentage` selects a random % of training + # data to compare `sample_test_text` against rather + # than comparing it against the entire training dataset. + (distances, labels, similar_samples) = model.predict( + sample_test_text, top_k, sampling_percentage=0.01 + ) + + print(classification_report(sample_test_labels, labels.reshape(-1))) + + +# precision recall f1-score support + +# 1 0.49 1.00 0.66 489 +# 2 0.00 0.00 0.00 511 + +# accuracy 0.49 1000 +# macro avg 0.24 0.50 0.33 1000 +# weighted avg 0.24 0.49 0.32 1000 \ No newline at end of file From d9eddb91241b69043139d57fc4cc6c8f35e26a38 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 19:06:46 -0400 Subject: [PATCH 29/74] add sampling_percentage tests --- npc_gzip/knn_compressor.py | 54 +++++++++++++++++++++--------------- tests/test_knn_compressor.py | 9 ++++-- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/npc_gzip/knn_compressor.py b/npc_gzip/knn_compressor.py index b6462794..b71aee1a 100644 --- a/npc_gzip/knn_compressor.py +++ b/npc_gzip/knn_compressor.py @@ -1,6 +1,7 @@ from typing import Sequence, Union import numpy as np +from tqdm import tqdm from npc_gzip.compressors.base import BaseCompressor from npc_gzip.distance import Distance @@ -9,7 +10,7 @@ InvalidObjectTypeException, UnsupportedDistanceMetricException, ) -from tqdm import tqdm + class KnnCompressor: """ @@ -94,10 +95,10 @@ def __init__( ).reshape(-1) def _calculate_distance( - self, + self, compressed_x: np.ndarray, compressed_combined: np.ndarray, - compressed_training: np.ndarray = None + compressed_training: np.ndarray = None, ) -> np.ndarray: """ Helper function that converts the string representation @@ -118,7 +119,7 @@ def _calculate_distance( Returns: np.ndarray: Numpy array containing the distance metric. """ - + if compressed_training is None: distance = Distance( np.resize(self.compressed_training_inputs, compressed_x.shape), @@ -166,14 +167,12 @@ def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: assert isinstance( x, str ), f"Non-string was passed to self._compress_sample: {x}" - + sample_size: int = int(sampling_percentage * self.training_inputs.shape[0]) training_inputs = np.random.choice(self.training_inputs, sample_size) - + x_compressed = self.compressor.get_compressed_length(x) - compressed_x: list = [ - x_compressed for _ in range(training_inputs.shape[0]) - ] + compressed_x: list = [x_compressed for _ in range(training_inputs.shape[0])] compressed_x: np.ndarray = np.array(compressed_x).reshape(-1) assert compressed_x.shape == training_inputs.shape @@ -208,14 +207,19 @@ def concatenate_with_space(stringa: str, stringb: str) -> str: return stringa + " " + stringb - def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1, sampling_percentage: float = 1.0): + def predict( + self, + x: Union[np.ndarray, Sequence[str], str], + top_k: int = 1, + sampling_percentage: float = 1.0, + ): """ - Faster version of `predict`. This method - will compare `x` against a sample of the - training data provided and will return the - best matching label if `training_labels` was + Faster version of `predict`. This method + will compare `x` against a sample of the + training data provided and will return the + best matching label if `training_labels` was passed during instantiation. - + If `training_labels` was not passed during instantiation, or if `top_k` is not None, the most similar samples from `training_inputs` will be @@ -231,10 +235,10 @@ def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1, samp `top_k` must be greater than zero. [default: top_k=1] sampling_percentage (float): (0.0, 1.0] % of `self.training_inputs` - to sample predictions against. + to sample predictions against. Returns: - np.ndarray: The distance-metrics matrix computed on the test + np.ndarray: The distance-metrics matrix computed on the test set. np.ndarray: The `top_k` most similar `self.training_labels`. np.ndarray: The `top_k` best matching samples @@ -257,16 +261,22 @@ def predict(self, x: Union[np.ndarray, Sequence[str], str], top_k: int = 1, samp compressed_samples = [] compressed_combined = [] - for sample in tqdm(x, desc='Compressing input...'): - compressed_sample, combined = self._compress_sample(sample, sampling_percentage) + for sample in tqdm(x, desc="Compressing input..."): + compressed_sample, combined = self._compress_sample( + sample, sampling_percentage + ) compressed_samples.append(compressed_sample) compressed_combined.append(combined) compressed_samples: np.ndarray = np.array(compressed_samples) compressed_combined: np.ndarray = np.array(compressed_combined) - - compressed_training_size: int = int(self.compressed_training_inputs.shape[0] * sampling_percentage) - compressed_training = np.random.choice(self.compressed_training_inputs, compressed_training_size) + + compressed_training_size: int = int( + self.compressed_training_inputs.shape[0] * sampling_percentage + ) + compressed_training = np.random.choice( + self.compressed_training_inputs, compressed_training_size + ) distances: np.ndarray = self._calculate_distance( compressed_samples, compressed_combined, compressed_training diff --git a/tests/test_knn_compressor.py b/tests/test_knn_compressor.py index f2b1b257..b719da08 100644 --- a/tests/test_knn_compressor.py +++ b/tests/test_knn_compressor.py @@ -148,12 +148,15 @@ def test_top_k_bigger_than_test_set(self): (distance, labels, similar_samples) = self.model.predict(test_set, top_k) def test_sampling_percentage(self): - test_set_size = random.randint(1, 10) test_set = [self.sample_input for _ in range(test_set_size)] top_k = 1 - (full_distance, full_labels, full_similar_samples) = self.model.predict(test_set, top_k, sampling_percentage=1.0) - (distance, labels, similar_samples) = self.model.predict(test_set, top_k, sampling_percentage=0.1) + (full_distance, full_labels, full_similar_samples) = self.model.predict( + test_set, top_k, sampling_percentage=1.0 + ) + (distance, labels, similar_samples) = self.model.predict( + test_set, top_k, sampling_percentage=0.1 + ) assert full_labels.shape == labels.shape assert full_similar_samples.shape == similar_samples.shape From 41f58f13d0a2aa2d7bb558f799ee11a53e4d6e5d Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 19:12:22 -0400 Subject: [PATCH 30/74] add install and test descriptions --- README.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 30a465ee..b8c572ae 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,29 @@ -### Code for Paper: “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors +# Code for Paper: “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors This paper is accepted to Findings of [ACL2023](https://aclanthology.org/2023.findings-acl.426/). +## Getting Started + +This codebase is available on pypi.org via + + +```bash + +pip install npc-gzip + +``` + +See the [examples](./examples/imdb.py) directory for example usage. + + +## Testing + +This package utilizes `poetry` to maintain it's dependencies and `pytest` to execute tests. To get started running the tests: + +1. `poetry install --with test` +2. `pytest tests/` + + ### Require See `requirements.txt`. From db6ae4ea7632f4594b66149d459651a718829d90 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 19:12:34 -0400 Subject: [PATCH 31/74] move original codebase to a separate directory --- compressors.py => original_codebase/compressors.py | 0 data.py => original_codebase/data.py | 0 experiments.py => original_codebase/experiments.py | 0 main_text.py => original_codebase/main_text.py | 0 requirements.txt => original_codebase/requirements.txt | 0 utils.py => original_codebase/utils.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename compressors.py => original_codebase/compressors.py (100%) rename data.py => original_codebase/data.py (100%) rename experiments.py => original_codebase/experiments.py (100%) rename main_text.py => original_codebase/main_text.py (100%) rename requirements.txt => original_codebase/requirements.txt (100%) rename utils.py => original_codebase/utils.py (100%) diff --git a/compressors.py b/original_codebase/compressors.py similarity index 100% rename from compressors.py rename to original_codebase/compressors.py diff --git a/data.py b/original_codebase/data.py similarity index 100% rename from data.py rename to original_codebase/data.py diff --git a/experiments.py b/original_codebase/experiments.py similarity index 100% rename from experiments.py rename to original_codebase/experiments.py diff --git a/main_text.py b/original_codebase/main_text.py similarity index 100% rename from main_text.py rename to original_codebase/main_text.py diff --git a/requirements.txt b/original_codebase/requirements.txt similarity index 100% rename from requirements.txt rename to original_codebase/requirements.txt diff --git a/utils.py b/original_codebase/utils.py similarity index 100% rename from utils.py rename to original_codebase/utils.py From 42342614039fd52242f7fea4ed13f7b362faeac6 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Fri, 28 Jul 2023 19:39:17 -0400 Subject: [PATCH 32/74] bump version to 0.1.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9405dbae..b1a889ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "npc-gzip" -version = "0.0.1" +version = "0.1.0" description = "Code for Paper: “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors" authors = ["Zach Bloss "] license = "MIT" From 6a05984680fbdbedb16a4fd90cc4e68bc85f3d0e Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 13:05:58 -0400 Subject: [PATCH 33/74] update after feedback --- .github/workflows/publish-package.yml | 2 +- .github/workflows/test-package.yml | 2 +- examples/imdb.py | 6 +++--- npc_gzip/{knn_compressor.py => knn_classifier.py} | 4 ++-- tests/test_knn_compressor.py | 12 ++++++------ 5 files changed, 13 insertions(+), 13 deletions(-) rename npc_gzip/{knn_compressor.py => knn_classifier.py} (99%) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 850c4242..eb43c535 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index 96428e75..bbb469bb 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 diff --git a/examples/imdb.py b/examples/imdb.py index a2fa52f4..6175475e 100644 --- a/examples/imdb.py +++ b/examples/imdb.py @@ -5,7 +5,7 @@ from npc_gzip.compressors.base import BaseCompressor from npc_gzip.compressors.gzip_compressor import GZipCompressor -from npc_gzip.knn_compressor import KnnCompressor +from npc_gzip.knn_classifier import KnnClassifier def get_data(): @@ -56,11 +56,11 @@ def fit_model( train_labels (np.ndarray): Training labels as a numpy array. Returns: - KnnCompressor: Trained Knn-Compressor model ready to make predictions. + KnnClassifier: Trained Knn-Compressor model ready to make predictions. """ compressor: BaseCompressor = GZipCompressor() - model: KnnCompressor = KnnCompressor( + model: KnnClassifier = KnnClassifier( compressor=compressor, training_inputs=train_text, training_labels=train_labels, diff --git a/npc_gzip/knn_compressor.py b/npc_gzip/knn_classifier.py similarity index 99% rename from npc_gzip/knn_compressor.py rename to npc_gzip/knn_classifier.py index b71aee1a..7adbf911 100644 --- a/npc_gzip/knn_compressor.py +++ b/npc_gzip/knn_classifier.py @@ -12,7 +12,7 @@ ) -class KnnCompressor: +class KnnClassifier: """ Given the training input and optional training labels data, this class stores @@ -306,7 +306,7 @@ def predict( training_labels = [random.randint(0, 1) for _ in range(len(training_data))] assert len(training_data) == len(training_labels) - model = KnnCompressor( + model = KnnClassifier( compressor=GZipCompressor(), training_inputs=training_data, training_labels=training_labels, diff --git a/tests/test_knn_compressor.py b/tests/test_knn_compressor.py index b719da08..885f074a 100644 --- a/tests/test_knn_compressor.py +++ b/tests/test_knn_compressor.py @@ -13,11 +13,11 @@ InvalidObjectTypeException, UnsupportedDistanceMetricException, ) -from npc_gzip.knn_compressor import KnnCompressor +from npc_gzip.knn_classifier import KnnClassifier from npc_gzip.utils import generate_dataset -class TestKnnCompressor: +class TestKnnClassifier: gzip_compressor: BaseCompressor = GZipCompressor() bz2_compressor: BaseCompressor = Bz2Compressor() lzma_compressor: BaseCompressor = LzmaCompressor() @@ -26,7 +26,7 @@ class TestKnnCompressor: training_dataset: list = generate_dataset(dataset_size) training_labels: list = [random.randint(0, 10) for _ in range(dataset_size)] - model = KnnCompressor( + model = KnnClassifier( compressor=gzip_compressor, training_inputs=training_dataset, training_labels=training_labels, @@ -48,7 +48,7 @@ def test_init(self): assert self.model.supported_distance_metrics == ["ncd", "clm", "cdm", "mse"] - model = KnnCompressor( + model = KnnClassifier( compressor=self.gzip_compressor, training_inputs=np.array(self.training_dataset), training_labels=np.array(self.training_labels), @@ -65,7 +65,7 @@ def test_init(self): def test_invalid_metric(self): with pytest.raises(UnsupportedDistanceMetricException): - model = KnnCompressor( + model = KnnClassifier( compressor=self.gzip_compressor, training_inputs=np.array(self.training_dataset), training_labels=np.array(self.training_labels), @@ -80,7 +80,7 @@ def test_invalid_data_and_label_size(self): labels = [random.randint(0, 10) for _ in range(training_label_size)] with pytest.raises(InputLabelEqualLengthException): - model = KnnCompressor( + model = KnnClassifier( compressor=self.gzip_compressor, training_inputs=dataset, training_labels=labels, From 1ffa682c89c945cefd6411d99db3ab27594a3de0 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 13:09:59 -0400 Subject: [PATCH 34/74] fix random selection of data --- examples/imdb.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/imdb.py b/examples/imdb.py index 6175475e..24db184b 100644 --- a/examples/imdb.py +++ b/examples/imdb.py @@ -83,8 +83,10 @@ def fit_model( # last, so we're shuffling so that our model # has something to predict other than `1`. - sample_test_text = np.random.choice(test_text, 1000) - sample_test_labels = np.random.choice(test_labels, 1000) + random_indicies = np.random.choice(test_text.shape[0], 1000, replace=False) + + sample_test_text = test_text[random_indicies] + sample_test_labels = test_labels[random_indicies] print(f"Generating predictions...") top_k = 1 From 5afe9cf46a58d77c5f4585d045aa65b3ba854c90 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 13:11:52 -0400 Subject: [PATCH 35/74] update python version dependency --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b1a889ed..3835e82c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ readme = "README.md" packages = [{include = "npc_gzip"}] [tool.poetry.dependencies] -python = "^3.9" +python = "^3.7" numpy = "^1.25.1" tqdm = "^4.65.0" From 6be71e27716b58ea0ecacedcd31c1a94f660b9e1 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 13:18:04 -0400 Subject: [PATCH 36/74] bump back to 3.9 minimum through 3.11 --- .github/workflows/publish-package.yml | 2 +- .github/workflows/test-package.yml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index eb43c535..a71d6efd 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index bbb469bb..96428e75 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 diff --git a/pyproject.toml b/pyproject.toml index 3835e82c..b1a889ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ readme = "README.md" packages = [{include = "npc_gzip"}] [tool.poetry.dependencies] -python = "^3.7" +python = "^3.9" numpy = "^1.25.1" tqdm = "^4.65.0" From 8bd8bc32b69c2fe522bb54fb930739c07473afb7 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 13:22:43 -0400 Subject: [PATCH 37/74] update usage of Union[list-like] with Sequence --- npc_gzip/knn_classifier.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/npc_gzip/knn_classifier.py b/npc_gzip/knn_classifier.py index 7adbf911..19d5d974 100644 --- a/npc_gzip/knn_classifier.py +++ b/npc_gzip/knn_classifier.py @@ -23,8 +23,8 @@ class KnnClassifier: def __init__( self, compressor: BaseCompressor, - training_inputs: Union[np.ndarray, Sequence[str]], - training_labels: Union[np.ndarray, Sequence[Union[str, int]]] = None, + training_inputs: Sequence, + training_labels: Sequence = None, distance_metric: str = "ncd", ): self.compressor = compressor @@ -209,7 +209,7 @@ def concatenate_with_space(stringa: str, stringb: str) -> str: def predict( self, - x: Union[np.ndarray, Sequence[str], str], + x: Sequence, top_k: int = 1, sampling_percentage: float = 1.0, ): @@ -228,7 +228,7 @@ def predict( resulting array is [batch_size, self.training_inputs.shape[0] ] Arguments: - x (Union[np.ndarray, Sequence[str]]): The sample data to compare against + x (Sequence): The sample data to compare against the `training_inputs`. top_k (int): [Optional] If not None, the most similar `k` number of samples will be returned. From 25350aaf53dd20f2778d3dbae45b14408985a347 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 13:32:44 -0400 Subject: [PATCH 38/74] update pipelines --- .github/workflows/publish-package.yml | 7 +++---- .github/workflows/test-package.yml | 11 +++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index a71d6efd..ab4f7cf4 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -5,7 +5,7 @@ on: types: [published] jobs: - build: + publish: runs-on: ubuntu-latest strategy: @@ -14,6 +14,7 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -23,14 +24,12 @@ jobs: run: | curl -sSL https://install.python-poetry.org | python3 - poetry --version - poetry export --with test --without-hashes --format=requirements.txt > requirements.txt - name: Run Tests run: | python -m pip install --upgrade pip - pip install -r requirements.txt poetry install - python -m pytest --doctest-modules --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + poetry run pytest --doctest-modules --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index 96428e75..add2ab9f 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -1,9 +1,9 @@ name: Run npc_gzip tests -on: [ push ] +on: [push, pull_request] jobs: - build: + run-tests: runs-on: ubuntu-latest strategy: @@ -12,6 +12,7 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -21,14 +22,12 @@ jobs: run: | curl -sSL https://install.python-poetry.org | python3 - poetry --version - poetry export --with test --without-hashes --format=requirements.txt > requirements.txt - name: Run Tests run: | python -m pip install --upgrade pip - pip install -r requirements.txt - poetry install - python -m pytest tests/ --doctest-modules --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + poetry install --with test + poetry run pytest --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 From 5429bad11e2c3900b3134e359043d9737a8e04c3 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 13:35:07 -0400 Subject: [PATCH 39/74] update spacing --- .github/workflows/publish-package.yml | 1 + .github/workflows/test-package.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index ab4f7cf4..98d05b6d 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -5,6 +5,7 @@ on: types: [published] jobs: + publish: runs-on: ubuntu-latest diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index add2ab9f..574e6907 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -3,6 +3,7 @@ name: Run npc_gzip tests on: [push, pull_request] jobs: + run-tests: runs-on: ubuntu-latest From d2855d173aea557f4594d8a0da6eec1570a41ae1 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 13:51:21 -0400 Subject: [PATCH 40/74] remove object assignment from tests that don't need it. rename test_knn_compressor --- tests/test_base_compressor.py | 2 ++ tests/test_distance.py | 6 +++--- tests/{test_knn_compressor.py => test_knn_classifier.py} | 8 ++++---- 3 files changed, 9 insertions(+), 7 deletions(-) rename tests/{test_knn_compressor.py => test_knn_classifier.py} (95%) diff --git a/tests/test_base_compressor.py b/tests/test_base_compressor.py index 38670b7f..f8d1a7b5 100644 --- a/tests/test_base_compressor.py +++ b/tests/test_base_compressor.py @@ -27,6 +27,8 @@ def test__open_file(self): with pytest.raises(AssertionError): self.compressor._open_file(invalid_filepath) + + with pytest.raises(AssertionError): self.compressor._open_file(invalid_filepath, as_bytes=True) file_contents = self.compressor._open_file(valid_filepath, as_bytes=False) diff --git a/tests/test_distance.py b/tests/test_distance.py index 53b18c79..bb013d7f 100644 --- a/tests/test_distance.py +++ b/tests/test_distance.py @@ -46,7 +46,7 @@ def test__ncd(self): distance._ncd(a, b, ab) with pytest.raises(ValueError): - out = distance._ncd(self.array_a, self.array_b, self.array_ab) + distance._ncd(self.array_a, self.array_b, self.array_ab) def test__cdm(self): distance = Distance(self.float_a, self.float_b, self.float_ab) @@ -58,7 +58,7 @@ def test__cdm(self): distance._cdm(a, b, ab) with pytest.raises(ValueError): - out = distance._ncd(self.array_a, self.array_b, self.array_ab) + distance._ncd(self.array_a, self.array_b, self.array_ab) def test__clm(self): distance = Distance(self.float_a, self.float_b, self.float_ab) @@ -70,7 +70,7 @@ def test__clm(self): distance._clm(a, b, ab) with pytest.raises(ValueError): - out = distance._ncd(self.array_a, self.array_b, self.array_ab) + distance._ncd(self.array_a, self.array_b, self.array_ab) def test__mse(self): distance = Distance(self.float_a, self.float_b, self.float_ab) diff --git a/tests/test_knn_compressor.py b/tests/test_knn_classifier.py similarity index 95% rename from tests/test_knn_compressor.py rename to tests/test_knn_classifier.py index 885f074a..f8a9e77f 100644 --- a/tests/test_knn_compressor.py +++ b/tests/test_knn_classifier.py @@ -65,7 +65,7 @@ def test_init(self): def test_invalid_metric(self): with pytest.raises(UnsupportedDistanceMetricException): - model = KnnClassifier( + KnnClassifier( compressor=self.gzip_compressor, training_inputs=np.array(self.training_dataset), training_labels=np.array(self.training_labels), @@ -80,7 +80,7 @@ def test_invalid_data_and_label_size(self): labels = [random.randint(0, 10) for _ in range(training_label_size)] with pytest.raises(InputLabelEqualLengthException): - model = KnnClassifier( + KnnClassifier( compressor=self.gzip_compressor, training_inputs=dataset, training_labels=labels, @@ -138,14 +138,14 @@ def test_negative_top_k(self): top_k = -1 with pytest.raises(AssertionError): - (distance, labels, similar_samples) = self.model.predict(test_set, top_k) + self.model.predict(test_set, top_k) def test_top_k_bigger_than_test_set(self): test_set_size = random.randint(1, 10) test_set = [self.sample_input for _ in range(test_set_size)] top_k = test_set_size + 1 with pytest.raises(AssertionError): - (distance, labels, similar_samples) = self.model.predict(test_set, top_k) + self.model.predict(test_set, top_k) def test_sampling_percentage(self): test_set_size = random.randint(1, 10) From 39db00ca5e6e89267793108e3b5cb47cf64ca3ff Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 14:10:16 -0400 Subject: [PATCH 41/74] added type hinting --- examples/imdb.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/imdb.py b/examples/imdb.py index 24db184b..3de9ab98 100644 --- a/examples/imdb.py +++ b/examples/imdb.py @@ -8,7 +8,7 @@ from npc_gzip.knn_classifier import KnnClassifier -def get_data(): +def get_data() -> tuple: """ Pulls the IMDB sentiment analysis dataset and returns two tuples the first being the @@ -46,7 +46,7 @@ def get_data(): def fit_model( train_text: np.ndarray, train_labels: np.ndarray, distance_metric: str = "ncd" -): +) -> KnnClassifier : """ Fits a Knn-GZip compressor on the train data and returns it. @@ -69,8 +69,8 @@ def fit_model( return model +def main() -> None: -if __name__ == "__main__": print(f"Fetching data...") ((train_text, train_labels), (test_text, test_labels)) = get_data() @@ -103,6 +103,12 @@ def fit_model( print(classification_report(sample_test_labels, labels.reshape(-1))) + + +if __name__ == "__main__": + main() + + # precision recall f1-score support # 1 0.49 1.00 0.66 489 From ed7e4ec5824992b0a297c05b51330097109fcfb2 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 14:12:27 -0400 Subject: [PATCH 42/74] add ag_news example --- examples/ag_news.py | 114 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 examples/ag_news.py diff --git a/examples/ag_news.py b/examples/ag_news.py new file mode 100644 index 00000000..ee549c85 --- /dev/null +++ b/examples/ag_news.py @@ -0,0 +1,114 @@ +import numpy as np +from sklearn.metrics import classification_report +from sklearn.utils import shuffle +from torchtext.datasets import AG_NEWS + +from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.compressors.gzip_compressor import GZipCompressor +from npc_gzip.knn_classifier import KnnClassifier + + +def get_data() -> tuple: + """ + Pulls the AG_NEWS dataset + and returns two tuples the first being the + training data and the second being the test + data. Each tuple contains the text and label + respsectively as numpy arrays. + + """ + + train_iter, test_iter = AG_NEWS(split=("train", "test")) + + train_text = [] + train_labels = [] + for label, text in train_iter: + train_labels.append(label) + train_text.append(text) + + test_text = [] + test_labels = [] + for label, text in test_iter: + test_labels.append(label) + test_text.append(text) + + train_text = np.array(train_text) + train_labels = np.array(train_labels) + + test_text = np.array(test_text) + test_labels = np.array(test_labels) + + train = (train_text, train_labels) + test = (test_text, test_labels) + + return (train, test) + + +def fit_model( + train_text: np.ndarray, train_labels: np.ndarray, distance_metric: str = "ncd" +) -> KnnClassifier : + """ + Fits a Knn-GZip compressor on the train + data and returns it. + + Arguments: + train_text (np.ndarray): Training dataset as a numpy array. + train_labels (np.ndarray): Training labels as a numpy array. + + Returns: + KnnClassifier: Trained Knn-Compressor model ready to make predictions. + """ + + compressor: BaseCompressor = GZipCompressor() + model: KnnClassifier = KnnClassifier( + compressor=compressor, + training_inputs=train_text, + training_labels=train_labels, + distance_metric=distance_metric, + ) + + return model + +def main() -> None: + + print(f"Fetching data...") + ((train_text, train_labels), (test_text, test_labels)) = get_data() + + print(f"Fitting model...") + model = fit_model(train_text, train_labels) + random_indicies = np.random.choice(test_text.shape[0], 1000, replace=False) + + sample_test_text = test_text[random_indicies] + sample_test_labels = test_labels[random_indicies] + + print(f"Generating predictions...") + top_k = 1 + + # Here we use the `sampling_percentage` to save time + # at the expense of worse predictions. This + # `sampling_percentage` selects a random % of training + # data to compare `sample_test_text` against rather + # than comparing it against the entire training dataset. + (distances, labels, similar_samples) = model.predict( + sample_test_text, top_k, sampling_percentage=0.01 + ) + + print(classification_report(sample_test_labels, labels.reshape(-1))) + + + + +if __name__ == "__main__": + main() + + +# precision recall f1-score support + +# 1 0.24 0.09 0.13 258 +# 2 0.25 0.44 0.32 231 +# 3 0.26 0.06 0.09 244 +# 4 0.26 0.43 0.32 267 + +# accuracy 0.26 1000 +# macro avg 0.25 0.26 0.22 1000 +# weighted avg 0.25 0.26 0.22 1000 \ No newline at end of file From c139ca373f2705e45ff912b887ed0523f901fc3a Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 16:56:49 -0400 Subject: [PATCH 43/74] only publish the package one time --- .github/workflows/publish-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 98d05b6d..9a2e917b 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9"] steps: - uses: actions/checkout@v3 From c81250376c6c92f220d2d9fc9c6975d4eb1f8568 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 16:57:53 -0400 Subject: [PATCH 44/74] remove unused --doctest-modules --- .github/workflows/publish-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 9a2e917b..4bb499f7 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -30,7 +30,7 @@ jobs: run: | python -m pip install --upgrade pip poetry install - poetry run pytest --doctest-modules --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + poetry run pytest --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 From bf865546793b2d578ba255c3b726ceda7eb43d94 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 16:58:20 -0400 Subject: [PATCH 45/74] isort + black formatting --- examples/ag_news.py | 12 +++++------- examples/imdb.py | 14 ++++++-------- original_codebase/compressors.py | 3 ++- original_codebase/experiments.py | 3 +-- original_codebase/main_text.py | 17 +++++++++++------ 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/examples/ag_news.py b/examples/ag_news.py index ee549c85..1e845c00 100644 --- a/examples/ag_news.py +++ b/examples/ag_news.py @@ -46,7 +46,7 @@ def get_data() -> tuple: def fit_model( train_text: np.ndarray, train_labels: np.ndarray, distance_metric: str = "ncd" -) -> KnnClassifier : +) -> KnnClassifier: """ Fits a Knn-GZip compressor on the train data and returns it. @@ -69,14 +69,14 @@ def fit_model( return model -def main() -> None: +def main() -> None: print(f"Fetching data...") ((train_text, train_labels), (test_text, test_labels)) = get_data() print(f"Fitting model...") model = fit_model(train_text, train_labels) - random_indicies = np.random.choice(test_text.shape[0], 1000, replace=False) + random_indicies = np.random.choice(test_text.shape[0], 1000, replace=False) sample_test_text = test_text[random_indicies] sample_test_labels = test_labels[random_indicies] @@ -85,7 +85,7 @@ def main() -> None: top_k = 1 # Here we use the `sampling_percentage` to save time - # at the expense of worse predictions. This + # at the expense of worse predictions. This # `sampling_percentage` selects a random % of training # data to compare `sample_test_text` against rather # than comparing it against the entire training dataset. @@ -96,8 +96,6 @@ def main() -> None: print(classification_report(sample_test_labels, labels.reshape(-1))) - - if __name__ == "__main__": main() @@ -111,4 +109,4 @@ def main() -> None: # accuracy 0.26 1000 # macro avg 0.25 0.26 0.22 1000 -# weighted avg 0.25 0.26 0.22 1000 \ No newline at end of file +# weighted avg 0.25 0.26 0.22 1000 diff --git a/examples/imdb.py b/examples/imdb.py index 3de9ab98..c3eb7e00 100644 --- a/examples/imdb.py +++ b/examples/imdb.py @@ -46,7 +46,7 @@ def get_data() -> tuple: def fit_model( train_text: np.ndarray, train_labels: np.ndarray, distance_metric: str = "ncd" -) -> KnnClassifier : +) -> KnnClassifier: """ Fits a Knn-GZip compressor on the train data and returns it. @@ -69,8 +69,8 @@ def fit_model( return model -def main() -> None: +def main() -> None: print(f"Fetching data...") ((train_text, train_labels), (test_text, test_labels)) = get_data() @@ -78,12 +78,12 @@ def main() -> None: model = fit_model(train_text, train_labels) # Randomly sampling from the test set. - # The IMDb test data comes in with all of the + # The IMDb test data comes in with all of the # `1` labels first, then all of the `2` labels # last, so we're shuffling so that our model # has something to predict other than `1`. - random_indicies = np.random.choice(test_text.shape[0], 1000, replace=False) + random_indicies = np.random.choice(test_text.shape[0], 1000, replace=False) sample_test_text = test_text[random_indicies] sample_test_labels = test_labels[random_indicies] @@ -92,7 +92,7 @@ def main() -> None: top_k = 1 # Here we use the `sampling_percentage` to save time - # at the expense of worse predictions. This + # at the expense of worse predictions. This # `sampling_percentage` selects a random % of training # data to compare `sample_test_text` against rather # than comparing it against the entire training dataset. @@ -103,8 +103,6 @@ def main() -> None: print(classification_report(sample_test_labels, labels.reshape(-1))) - - if __name__ == "__main__": main() @@ -116,4 +114,4 @@ def main() -> None: # accuracy 0.49 1000 # macro avg 0.24 0.50 0.33 1000 -# weighted avg 0.24 0.49 0.32 1000 \ No newline at end of file +# weighted avg 0.24 0.49 0.32 1000 diff --git a/original_codebase/compressors.py b/original_codebase/compressors.py index ded581dc..bffd1ae8 100644 --- a/original_codebase/compressors.py +++ b/original_codebase/compressors.py @@ -7,7 +7,8 @@ class DefaultCompressor: """For non-neural-based compressor""" - def __init__(self, compressor, typ='text'): + + def __init__(self, compressor, typ="text"): try: self.compressor = import_module(compressor) except ModuleNotFoundError: diff --git a/original_codebase/experiments.py b/original_codebase/experiments.py index 8bb7cbab..2e42f7af 100644 --- a/original_codebase/experiments.py +++ b/original_codebase/experiments.py @@ -13,11 +13,10 @@ import numpy as np import torch +from compressors import DefaultCompressor from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score from tqdm import tqdm -from compressors import DefaultCompressor - class KnnExpText: def __init__( diff --git a/original_codebase/main_text.py b/original_codebase/main_text.py index f3a319f1..5de0a570 100644 --- a/original_codebase/main_text.py +++ b/original_codebase/main_text.py @@ -3,6 +3,9 @@ from functools import partial from typing import Callable +from compressors import * +from data import * +from experiments import * from pathos.multiprocessing import ProcessingPool as Pool from torchtext.datasets import ( AG_NEWS, @@ -13,10 +16,6 @@ YahooAnswers, YelpReviewPolarity, ) - -from compressors import * -from data import * -from experiments import * from utils import * # np.random.seed(6) @@ -90,7 +89,11 @@ def record_distance( def non_neurl_knn_exp_given_dis(dis_matrix, k, test_label, train_label): knn_exp = KnnExpText(None, None, None) _, correct = knn_exp.calc_acc( - k, test_label, train_label=train_label, provided_distance_matrix=dis_matrix, rand=args.random + k, + test_label, + train_label=train_label, + provided_distance_matrix=dis_matrix, + rand=args.random, ) return correct @@ -284,4 +287,6 @@ def non_neurl_knn_exp_given_dis(dis_matrix, k, test_label, train_label): print("Altogether Accuracy is: {}".format(all_correct / total_num)) else: dis_matrix = np.load(args.distance_fn) - non_neurl_knn_exp_given_dis(dis_matrix, args.k, test_labels, train_labels) + non_neurl_knn_exp_given_dis( + dis_matrix, args.k, test_labels, train_labels + ) From 0424c61496195db3a7f3c92db4e7c99e634c14fe Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 16:58:59 -0400 Subject: [PATCH 46/74] tpyos on respectively --- examples/ag_news.py | 2 +- examples/imdb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ag_news.py b/examples/ag_news.py index 1e845c00..a548e450 100644 --- a/examples/ag_news.py +++ b/examples/ag_news.py @@ -14,7 +14,7 @@ def get_data() -> tuple: and returns two tuples the first being the training data and the second being the test data. Each tuple contains the text and label - respsectively as numpy arrays. + respectively as numpy arrays. """ diff --git a/examples/imdb.py b/examples/imdb.py index c3eb7e00..447da7d9 100644 --- a/examples/imdb.py +++ b/examples/imdb.py @@ -14,7 +14,7 @@ def get_data() -> tuple: and returns two tuples the first being the training data and the second being the test data. Each tuple contains the text and label - respsectively as numpy arrays. + respectively as numpy arrays. """ From 27c1692ab51e560b93b5033b2bfd74731419fb46 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 17:02:18 -0400 Subject: [PATCH 47/74] raise exception instead of returning text. --- npc_gzip/knn_classifier.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/npc_gzip/knn_classifier.py b/npc_gzip/knn_classifier.py index 19d5d974..e2164999 100644 --- a/npc_gzip/knn_classifier.py +++ b/npc_gzip/knn_classifier.py @@ -1,4 +1,4 @@ -from typing import Sequence, Union +from typing import Sequence, Union, Optional import numpy as np from tqdm import tqdm @@ -98,7 +98,7 @@ def _calculate_distance( self, compressed_x: np.ndarray, compressed_combined: np.ndarray, - compressed_training: np.ndarray = None, + compressed_training: Optional[np.ndarray] = None, ) -> np.ndarray: """ Helper function that converts the string representation @@ -142,7 +142,11 @@ def _calculate_distance( elif self.distance_metric == "mse": return distance.mse else: - return "Invalid Distance Metric" + raise UnsupportedDistanceMetricException( + self.distance_metric, + supported_distance_metrics=self.supported_distance_metrics, + function_name='_calculate_distance' + ) def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: """ From 8c2e56f55421bf2725ec6eb61da449c75d0837d8 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 17:03:06 -0400 Subject: [PATCH 48/74] add return type hint to knn_classifier.predict --- npc_gzip/knn_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/npc_gzip/knn_classifier.py b/npc_gzip/knn_classifier.py index e2164999..a0d92587 100644 --- a/npc_gzip/knn_classifier.py +++ b/npc_gzip/knn_classifier.py @@ -216,7 +216,7 @@ def predict( x: Sequence, top_k: int = 1, sampling_percentage: float = 1.0, - ): + ) -> tuple: """ Faster version of `predict`. This method will compare `x` against a sample of the From c2ea22f6bd1a153e414a39d426bc292c246149ca Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sat, 29 Jul 2023 17:06:31 -0400 Subject: [PATCH 49/74] rename compressed_x to compressed_input --- npc_gzip/knn_classifier.py | 26 +++++++++++++------------- tests/test_knn_classifier.py | 12 ++++++------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/npc_gzip/knn_classifier.py b/npc_gzip/knn_classifier.py index a0d92587..5ca2f7e3 100644 --- a/npc_gzip/knn_classifier.py +++ b/npc_gzip/knn_classifier.py @@ -96,7 +96,7 @@ def __init__( def _calculate_distance( self, - compressed_x: np.ndarray, + compressed_input: np.ndarray, compressed_combined: np.ndarray, compressed_training: Optional[np.ndarray] = None, ) -> np.ndarray: @@ -105,11 +105,11 @@ def _calculate_distance( of `self.distance_metric` to the actual `npc_gzip.distance.Distance.[distance_metric]`. Then that distance metric is calculated using - `self.compressed_training_inputs`, `compressed_x` and + `self.compressed_training_inputs`, `compressed_input` and `compressed_combined`. Arguments: - compressed_x (np.ndarray): Numpy array representing the + compressed_input (np.ndarray): Numpy array representing the compressed lengths of the input data to be scored. compressed_combined (np.ndarray): Numpy array representing the @@ -122,14 +122,14 @@ def _calculate_distance( if compressed_training is None: distance = Distance( - np.resize(self.compressed_training_inputs, compressed_x.shape), - compressed_x, + np.resize(self.compressed_training_inputs, compressed_input.shape), + compressed_input, compressed_combined, ) else: distance = Distance( - np.resize(compressed_training, compressed_x.shape), - compressed_x, + np.resize(compressed_training, compressed_input.shape), + compressed_input, compressed_combined, ) @@ -175,10 +175,10 @@ def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: sample_size: int = int(sampling_percentage * self.training_inputs.shape[0]) training_inputs = np.random.choice(self.training_inputs, sample_size) - x_compressed = self.compressor.get_compressed_length(x) - compressed_x: list = [x_compressed for _ in range(training_inputs.shape[0])] - compressed_x: np.ndarray = np.array(compressed_x).reshape(-1) - assert compressed_x.shape == training_inputs.shape + compressed_input_length: int = self.compressor.get_compressed_length(x) + compressed_input: list = [compressed_input_length for _ in range(training_inputs.shape[0])] + compressed_input: np.ndarray = np.array(compressed_input).reshape(-1) + assert compressed_input.shape == training_inputs.shape combined: list = [] for training_sample in training_inputs: @@ -189,9 +189,9 @@ def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: combined.append(combined_compressed) combined: np.ndarray = np.array(combined).reshape(-1) - assert training_inputs.shape == compressed_x.shape == combined.shape + assert training_inputs.shape == compressed_input.shape == combined.shape - return (compressed_x, combined) + return (compressed_input, combined) @staticmethod def concatenate_with_space(stringa: str, stringb: str) -> str: diff --git a/tests/test_knn_classifier.py b/tests/test_knn_classifier.py index f8a9e77f..9e027c3f 100644 --- a/tests/test_knn_classifier.py +++ b/tests/test_knn_classifier.py @@ -88,22 +88,22 @@ def test_invalid_data_and_label_size(self): ) def test__compress_sample(self): - compressed_x, compressed_combined = self.model._compress_sample( + compressed_input, compressed_combined = self.model._compress_sample( self.sample_input ) - assert isinstance(compressed_x, np.ndarray) + assert isinstance(compressed_input, np.ndarray) assert isinstance(compressed_combined, np.ndarray) - assert compressed_x.shape == compressed_combined.shape + assert compressed_input.shape == compressed_combined.shape def test__calculate_distance(self): - compressed_x, compressed_combined = self.model._compress_sample( + compressed_input, compressed_combined = self.model._compress_sample( self.sample_input ) - distance = self.model._calculate_distance(compressed_x, compressed_combined) + distance = self.model._calculate_distance(compressed_input, compressed_combined) assert isinstance(distance, np.ndarray) - assert distance.shape == compressed_x.shape == compressed_combined.shape + assert distance.shape == compressed_input.shape == compressed_combined.shape def test_concatenate_with_space(self): a = "hello there" From 7a3e855b80e90c099348bcb006b753550b72328f Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 09:40:42 -0400 Subject: [PATCH 50/74] add replace=False to fix training data from resampling add doctest --- npc_gzip/knn_classifier.py | 55 +++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/npc_gzip/knn_classifier.py b/npc_gzip/knn_classifier.py index 5ca2f7e3..96f7e99d 100644 --- a/npc_gzip/knn_classifier.py +++ b/npc_gzip/knn_classifier.py @@ -1,4 +1,4 @@ -from typing import Sequence, Union, Optional +from typing import Optional, Sequence, Union import numpy as np from tqdm import tqdm @@ -18,6 +18,23 @@ class KnnClassifier: training labels data, this class stores the data in memory + >>> import random + >>> from npc_gzip.compressors.gzip_compressor import GZipCompressor + + >>> training_data = ["hey", "hi", "how are you?", "not too bad"] + + >>> training_labels = [random.randint(0, 1) for _ in range(len(training_data))] + >>> assert len(training_data) == len(training_labels) + + >>> model = KnnClassifier(compressor=GZipCompressor(), training_inputs=training_data, training_labels=training_labels, distance_metric="ncd") + + >>> test = np.array(["hey", "you are a real pain in my ass", "go away please"]) + + >>> top_k = 1 + >>> distances, labels, similar_samples = model.predict(test, top_k=top_k) + >>> assert distances.shape == (test.shape[0], len(training_data)) + >>> assert labels.shape == similar_samples.shape + >>> assert distances.shape[0] == labels.shape[0] == similar_samples.shape[0] """ def __init__( @@ -145,7 +162,7 @@ def _calculate_distance( raise UnsupportedDistanceMetricException( self.distance_metric, supported_distance_metrics=self.supported_distance_metrics, - function_name='_calculate_distance' + function_name="_calculate_distance", ) def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: @@ -173,10 +190,14 @@ def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: ), f"Non-string was passed to self._compress_sample: {x}" sample_size: int = int(sampling_percentage * self.training_inputs.shape[0]) - training_inputs = np.random.choice(self.training_inputs, sample_size) + training_inputs = np.random.choice( + self.training_inputs, sample_size, replace=False + ) compressed_input_length: int = self.compressor.get_compressed_length(x) - compressed_input: list = [compressed_input_length for _ in range(training_inputs.shape[0])] + compressed_input: list = [ + compressed_input_length for _ in range(training_inputs.shape[0]) + ] compressed_input: np.ndarray = np.array(compressed_input).reshape(-1) assert compressed_input.shape == training_inputs.shape @@ -298,29 +319,3 @@ def predict( labels = self.training_labels[minimum_distance_indices] return distances, labels, similar_samples - - -if __name__ == "__main__": - import random - - from npc_gzip.compressors.gzip_compressor import GZipCompressor - - training_data = ["hey", "hi", "how are you?", "not too bad"] - - training_labels = [random.randint(0, 1) for _ in range(len(training_data))] - assert len(training_data) == len(training_labels) - - model = KnnClassifier( - compressor=GZipCompressor(), - training_inputs=training_data, - training_labels=training_labels, - distance_metric="ncd", - ) - - test = np.array(["hey", "you are a real pain in my ass", "go away please"]) - - top_k = 1 - distances, labels, similar_samples = model.predict(test, top_k=top_k) - assert distances.shape == (test.shape[0], len(training_data)) - assert labels.shape == similar_samples.shape - assert distances.shape[0] == labels.shape[0] == similar_samples.shape[0] From 9b9ee3a05296ec600211c5cf24a79f94f1803a39 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 10:38:58 -0400 Subject: [PATCH 51/74] add several -> None type hints to __init__ add module docstrings remove many if __name__ == '__main__' example calls --- npc_gzip/aggregations.py | 62 +---------------------- npc_gzip/compressors/base.py | 21 ++++---- npc_gzip/compressors/bz2_compressor.py | 29 ++++++----- npc_gzip/compressors/gzip_compressor.py | 30 ++++++------ npc_gzip/compressors/lzma_compressor.py | 29 ++++++----- npc_gzip/distance.py | 55 ++++++++++++++++++--- npc_gzip/exceptions.py | 65 ++++++++----------------- npc_gzip/knn_classifier.py | 30 +++++++----- npc_gzip/utils.py | 27 +++++----- poetry.lock | 2 +- pyproject.toml | 25 +++++++--- tests/test_aggregations.py | 14 ++---- tests/test_base_compressor.py | 10 ++-- tests/test_bz2_compressor.py | 6 +-- tests/test_distance.py | 20 ++++---- tests/test_gzip_compressor.py | 6 +-- tests/test_knn_classifier.py | 20 ++++---- tests/test_lzma_compressor.py | 6 +-- tests/test_utils.py | 4 +- 19 files changed, 215 insertions(+), 246 deletions(-) diff --git a/npc_gzip/aggregations.py b/npc_gzip/aggregations.py index 400f3df6..0cbe4afe 100644 --- a/npc_gzip/aggregations.py +++ b/npc_gzip/aggregations.py @@ -48,66 +48,6 @@ def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> aggregated: str = " ".join(out) if by_character: - aggregated: str = "".join(out) + aggregated = "".join(out) return aggregated - - -def average(array_a: np.ndarray, array_b: np.ndarray) -> np.ndarray: - """ - Calculates the average of `array_a` and `array_b`, - rounding down. - - (replaces agg_by_avg) - - Arguments: - array_a (np.ndarray): First series of numbers. - array_b (np.ndarray): Second series of numbers. - - Returns: - np.ndarray: Average of the two series of numbers rounded - towards zero. - """ - - average_array = (a + b) / 2 - average_array = average_array.astype(int) - - return average_array - - -def min_max_aggregation( - array_a: np.ndarray, array_b: np.ndarray, aggregate_by_minimum: bool = False -) -> np.ndarray: - """ - Stacks `array_a` and `array_b` then returns the minimum value if - `aggregate_by_minimum`, else returns the maximum value. - - Arguments: - array_a (np.ndarray): First series of numbers. - array_b (np.ndarray): Second series of numbers. - aggregate_by_minimum (bool): True if you want to take the minimum of the two series. - False if you want to take the maximum instead. - - Returns: - np.ndarray: 1-D Numpy array of the min/max value from `array_a` & `array_b`. - """ - - stacked = np.stack([array_a, array_b], axis=0) - if aggregate_by_minimum: - return np.min(stacked, axis=0).reshape(-1) - return torch.max(stacked, axis=0).reshape(-1) - - -def stack(array_a: np.ndarray, array_b: np.ndarray) -> np.ndarray: - """ - Combines `array_a` and `array_b` via `np.stack`. - - Arguments: - array_a (np.ndarray): First series of numbers. - array_b (np.ndarray): Second series of numbers. - - Returns: - np.ndarray: Stack of the two series. - """ - - return np.stack([array_a, array_b], axis=0) diff --git a/npc_gzip/compressors/base.py b/npc_gzip/compressors/base.py index 095ff6e0..c943ddcd 100644 --- a/npc_gzip/compressors/base.py +++ b/npc_gzip/compressors/base.py @@ -9,9 +9,18 @@ class BaseCompressor: Default compressor class that other compressors inherit from. + >>> import gzip + + >>> compressor = BaseCompressor(compressor=gzip) + + >>> example = "Hello there!" + >>> compressed_length: int = compressor.get_compressed_length(example) + >>> bits_per_character: float = compressor.get_bits_per_character(example) + >>> assert isinstance(compressed_length, int) + >>> assert isinstance(bits_per_character, float) """ - def __init__(self, compressor: ModuleType): + def __init__(self, compressor: ModuleType) -> None: if not isinstance(compressor, (ModuleType, BaseCompressor)): raise InvalidCompressorException( f"Not a function passed: {type(compressor)}" @@ -104,13 +113,3 @@ def get_bits_per_character(self, x: str) -> float: compressed_length_in_bits / number_of_characters ) return compressed_length_per_number_of_characters - - -if __name__ == "__main__": - import gzip - - compressor = BaseCompressor(compressor=gzip) - - example = "Hello there!" - compressed_length: int = compressor.get_compressed_length(example) - bits_per_character: float = compressor.get_bits_per_character(example) diff --git a/npc_gzip/compressors/bz2_compressor.py b/npc_gzip/compressors/bz2_compressor.py index 8a802772..3ec1a994 100644 --- a/npc_gzip/compressors/bz2_compressor.py +++ b/npc_gzip/compressors/bz2_compressor.py @@ -3,25 +3,24 @@ class Bz2Compressor(BaseCompressor): - def __init__(self): + """ + bz2 compressor that inherits from + `npc_gzip.compressors.base.BaseCompressor` + + >>> compressor: BaseCompressor = Bz2Compressor() + >>> example: str = "Hello there!" + >>> compressed_length: int = compressor.get_compressed_length(example) + >>> bits_per_character: float = compressor.get_bits_per_character(example) + >>> assert isinstance(compressed_length, int) + >>> assert isinstance(bits_per_character, float) + """ + + def __init__(self) -> None: super().__init__(self) try: import bz2 except ModuleNotFoundError as e: - import platform - - major, minor, patch = platform.python_version_tuple() - if int(major) >= 3 and int(minor) >= 11: - raise ExceptionGroup([MissingDependencyException("bz2"), e]) - else: - raise MissingDependencyException("bz2") + raise MissingDependencyException("bz2") from e self.compressor = bz2 - - -if __name__ == "__main__": - compressor = Bz2Compressor() - example: str = "Hello there!" - compressed_length: int = compressor.get_compressed_length(example) - bits_per_character: float = compressor.get_bits_per_character(example) diff --git a/npc_gzip/compressors/gzip_compressor.py b/npc_gzip/compressors/gzip_compressor.py index 7b8f946d..0ab009e6 100644 --- a/npc_gzip/compressors/gzip_compressor.py +++ b/npc_gzip/compressors/gzip_compressor.py @@ -1,26 +1,26 @@ from npc_gzip.compressors.base import BaseCompressor +from npc_gzip.exceptions import MissingDependencyException class GZipCompressor(BaseCompressor): - def __init__(self): + """ + gzip compressor that inherits from + `npc_gzip.compressors.base.BaseCompressor` + + >>> compressor: BaseCompressor = GZipCompressor() + >>> example: str = "Hello there!" + >>> compressed_length: int = compressor.get_compressed_length(example) + >>> bits_per_character: float = compressor.get_bits_per_character(example) + >>> assert isinstance(compressed_length, int) + >>> assert isinstance(bits_per_character, float) + """ + + def __init__(self) -> None: super().__init__(self) try: import gzip except ModuleNotFoundError as e: - import platform - - major, minor, patch = platform.python_version_tuple() - if int(major) >= 3 and int(minor) >= 11: - raise ExceptionGroup([MissingDependencyException("gzip"), e]) - else: - raise MissingDependencyException("gzip") + raise MissingDependencyException("gzip") from e self.compressor = gzip - - -if __name__ == "__main__": - compressor = GZipCompressor() - example: str = "Hello there!" - compressed_length: int = compressor.get_compressed_length(example) - bits_per_character: float = compressor.get_bits_per_character(example) diff --git a/npc_gzip/compressors/lzma_compressor.py b/npc_gzip/compressors/lzma_compressor.py index 62883024..ea84bd2e 100644 --- a/npc_gzip/compressors/lzma_compressor.py +++ b/npc_gzip/compressors/lzma_compressor.py @@ -3,25 +3,24 @@ class LzmaCompressor(BaseCompressor): - def __init__(self): + """ + lzma compressor that inherits from + `npc_gzip.compressors.base.BaseCompressor` + + >>> compressor: BaseCompressor = LzmaCompressor() + >>> example: str = "Hello there!" + >>> compressed_length: int = compressor.get_compressed_length(example) + >>> bits_per_character: float = compressor.get_bits_per_character(example) + >>> assert isinstance(compressed_length, int) + >>> assert isinstance(bits_per_character, float) + """ + + def __init__(self) -> None: super().__init__(self) try: import lzma except ModuleNotFoundError as e: - import platform - - major, minor, patch = platform.python_version_tuple() - if int(major) >= 3 and int(minor) >= 11: - raise ExceptionGroup([MissingDependencyException("lzma"), e]) - else: - raise MissingDependencyException("lzma") + raise MissingDependencyException("lzma") from e self.compressor = lzma - - -if __name__ == "__main__": - compressor = LzmaCompressor() - example: str = "Hello there!" - compressed_length: int = compressor.get_compressed_length(example) - bits_per_character: float = compressor.get_bits_per_character(example) diff --git a/npc_gzip/distance.py b/npc_gzip/distance.py index 07efc1f0..e456509e 100644 --- a/npc_gzip/distance.py +++ b/npc_gzip/distance.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Sequence, Union import numpy as np @@ -6,12 +6,53 @@ class Distance: + """ + Used to calculate the distance between compressed + objects. Typical usage is your training data as + `compressed_values_a`, the data you want to predict + on as `compressed_values_b` and the two values + concatenated then compressed as `compressed_values_ab`. + + >>> import random + + >>> a = random.random() + >>> b = random.random() + >>> ab = random.random() + + >>> distance = Distance(a, b, ab) + >>> ncd: float = distance._ncd(a, b, ab) + >>> cdm: float = distance._cdm(a, b, ab) + >>> clm: float = distance._clm(a, b, ab) + >>> mse: float = distance._mse(a, b) + + >>> assert isinstance(ncd, float) + >>> assert isinstance(cdm, float) + >>> assert isinstance(clm, float) + >>> assert isinstance(mse, float) + + >>> a = np.random.rand(3, 10) + >>> b = np.random.rand(3, 10) + >>> ab = np.random.rand(3, 10) + + >>> distance = Distance(a, b, ab) + + >>> ncd: np.ndarray = distance.ncd + >>> cdm: np.ndarray = distance.cdm + >>> clm: np.ndarray = distance.clm + >>> mse: np.ndarray = distance.mse + + >>> assert isinstance(ncd, np.ndarray) + >>> assert isinstance(cdm, np.ndarray) + >>> assert isinstance(clm, np.ndarray) + >>> assert isinstance(mse, np.ndarray) + """ + def __init__( self, - compressed_values_a: Union[list, np.ndarray], - compressed_values_b: Union[list, np.ndarray], - compressed_values_ab: Union[list, np.ndarray], - ): + compressed_values_a: Sequence, + compressed_values_b: Sequence, + compressed_values_ab: Sequence, + ) -> None: if not isinstance(compressed_values_a, np.ndarray): compressed_values_a = np.array(compressed_values_a) @@ -93,8 +134,8 @@ def _clm( def _mse( self, - compressed_value_a: Union[list, np.ndarray], - compressed_value_b: Union[list, np.ndarray], + compressed_value_a: Sequence, + compressed_value_b: Sequence, ) -> np.ndarray: """ Computes the mean squared error between two diff --git a/npc_gzip/exceptions.py b/npc_gzip/exceptions.py index d78697fe..eeda08d8 100644 --- a/npc_gzip/exceptions.py +++ b/npc_gzip/exceptions.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Optional import numpy as np @@ -9,7 +9,7 @@ class InvalidCompressorException(Exception): library that is not supported. """ - def __init__(self, compression_library: str): + def __init__(self, compression_library: str) -> None: self.message = f""" Compression Library ({compression_library}) is not currently supported. @@ -23,7 +23,7 @@ class MissingDependencyException(Exception): found when loading a library. """ - def __init__(self, compression_library: str): + def __init__(self, compression_library: str) -> None: self.message = f""" Compression Library ({compression_library}) is missing an underlying dependency. Try @@ -44,7 +44,9 @@ def __init__(self, compression_library: str): class StringTooShortException(Exception): - def __init__(self, stringa: str, stringb: str, function_name: str = None): + def __init__( + self, stringa: str, stringb: str, function_name: Optional[str] = None + ) -> None: self.message = f""" Unable to aggregate ({stringa}) and ({stringb}). One or both of the two strings are too short to concatenate. @@ -61,9 +63,9 @@ class CompressedValuesEqualZero(Exception): def __init__( self, compressed_value_a: float, - compressed_value_b: float = None, - function_name: str = None, - ): + compressed_value_b: Optional[float] = None, + function_name: Optional[str] = None, + ) -> None: self.message = f""" The combination of compressed values passed equal zero. This will result in a divide by zero error. @@ -82,8 +84,8 @@ def __init__( a: Any, b: Any, c: Any, - function_name: str = None, - ): + function_name: Optional[str] = None, + ) -> None: self.message = f""" The passed values must either all be None or not None. arg1: {type(a)} @@ -103,8 +105,8 @@ def __init__( array_a: np.ndarray, array_b: np.ndarray, array_c: np.ndarray, - function_name: str = None, - ): + function_name: Optional[str] = None, + ) -> None: self.message = f""" The passed values must either all of the same shape. arg1: {array_a.shape} @@ -122,9 +124,9 @@ class UnsupportedDistanceMetricException(Exception): def __init__( self, distance_metric: str, - supported_distance_metrics: list = None, - function_name: str = None, - ): + supported_distance_metrics: Optional[list] = None, + function_name: Optional[str] = None, + ) -> None: self.message = f""" The `distance_metric` ({distance_metric}) provided is not currently supported. Please submit an Issue and/or @@ -147,9 +149,9 @@ class InvalidObjectTypeException(Exception): def __init__( self, passed_type: str, - supported_types: list = None, - function_name: str = None, - ): + supported_types: Optional[list] = None, + function_name: Optional[str] = None, + ) -> None: self.message = f""" The type passed ({passed_type}) provided is not currently supported. @@ -169,8 +171,8 @@ def __init__( self, training_samples: int, label_samples: int, - function_name: str = None, - ): + function_name: Optional[str] = None, + ) -> None: self.message = f""" If training labels are passed, the number of training data samples must equal the @@ -184,28 +186,3 @@ def __init__( if function_name is not None: self.message += f"function_name: {function_name}" super().__init__(self.message) - - -class UnsupportedDistanceMetricException(Exception): - def __init__( - self, - distance_metric: str, - supported_distance_metrics: list = None, - function_name: str = None, - ): - self.message = f""" - The `distance_metric` ({distance_metric}) provided is not - currently supported. Please submit an Issue and/or - Pull Request here to add support: - https://github.com/bazingagin/npc_gzip - - """ - - if supported_distance_metrics is not None: - self.message += ( - f"supported_distance_metrics: {supported_distance_metrics}\n" - ) - - if function_name is not None: - self.message += f"function_name: {function_name}" - super().__init__(self.message) diff --git a/npc_gzip/knn_classifier.py b/npc_gzip/knn_classifier.py index 96f7e99d..cb101003 100644 --- a/npc_gzip/knn_classifier.py +++ b/npc_gzip/knn_classifier.py @@ -43,7 +43,7 @@ def __init__( training_inputs: Sequence, training_labels: Sequence = None, distance_metric: str = "ncd", - ): + ) -> None: self.compressor = compressor if isinstance(training_inputs, list) and isinstance(training_labels, list): @@ -237,7 +237,9 @@ def predict( x: Sequence, top_k: int = 1, sampling_percentage: float = 1.0, - ) -> tuple: + ) -> tuple[ + np.ndarray[float, float], np.ndarray[float, float], np.ndarray[float, float] + ]: """ Faster version of `predict`. This method will compare `x` against a sample of the @@ -275,7 +277,7 @@ def predict( assert top_k > 0, f"top_k ({top_k}) must be greater than zero." - x = x.reshape(-1) + x: np.ndarray = x.reshape(-1) assert ( top_k <= x.shape[0] ), f""" @@ -284,22 +286,24 @@ def predict( """ - compressed_samples = [] - compressed_combined = [] + samples: list = [] + combined: list = [] for sample in tqdm(x, desc="Compressing input..."): - compressed_sample, combined = self._compress_sample( + compressed_sample, combined_length = self._compress_sample( sample, sampling_percentage ) - compressed_samples.append(compressed_sample) - compressed_combined.append(combined) + samples.append(compressed_sample) + combined.append(combined_length) + + compressed_samples: np.ndarray = np.array(samples) + compressed_combined: np.ndarray = np.array(combined) - compressed_samples: np.ndarray = np.array(compressed_samples) - compressed_combined: np.ndarray = np.array(compressed_combined) + assert isinstance(self.compressed_training_inputs, np.ndarray) compressed_training_size: int = int( self.compressed_training_inputs.shape[0] * sampling_percentage ) - compressed_training = np.random.choice( + compressed_training: np.ndarray = np.random.choice( self.compressed_training_inputs, compressed_training_size ) @@ -313,8 +317,8 @@ def predict( # get indicies of minimum top_k distances. minimum_distance_indices = np.argpartition(distances, top_k)[:, :top_k] - similar_samples = self.training_inputs[minimum_distance_indices] - labels = [] + similar_samples: np.ndarray = self.training_inputs[minimum_distance_indices] + labels: np.ndarray = np.array([]) if self.training_labels is not None: labels = self.training_labels[minimum_distance_indices] diff --git a/npc_gzip/utils.py b/npc_gzip/utils.py index 84cf5679..16eff88e 100644 --- a/npc_gzip/utils.py +++ b/npc_gzip/utils.py @@ -3,6 +3,15 @@ from npc_gzip.exceptions import InvalidObjectTypeException +""" +Helper functions used primarily in testing the +rest of the codebase. + +>>> number_of_sentences = random.randint(1, 100) +>>> dataset = generate_dataset(number_of_sentences) +>>> assert len(dataset) == number_of_sentences +""" + def generate_sentence(number_of_words: int = 10) -> str: """ @@ -21,14 +30,14 @@ def generate_sentence(number_of_words: int = 10) -> str: """ if not isinstance(number_of_words, int): - if isinstance(number_of_words, float): + try: number_of_words = int(number_of_words) - else: + except Exception as e: raise InvalidObjectTypeException( type(number_of_words), supported_types=[int, float], function_name="generate_sentence", - ) + ) from e assert number_of_words > 0, f"`number_of_words` must be greater than zero." @@ -62,14 +71,14 @@ def generate_dataset(number_of_sentences: int) -> list: """ if not isinstance(number_of_sentences, int): - if isinstance(number_of_sentences, float): + try: number_of_sentences = int(number_of_sentences) - else: + except Exception as e: raise InvalidObjectTypeException( type(number_of_sentences), supported_types=[int, float], function_name="generate_dataset", - ) + ) from e assert number_of_sentences > 0, f"`number_of_sentences` must be greater than zero." @@ -79,9 +88,3 @@ def generate_dataset(number_of_sentences: int) -> list: dataset.append(generate_sentence(number_of_words)) return dataset - - -if __name__ == "__main__": - number_of_sentences = random.randint(1, 100) - dataset = generate_dataset(number_of_sentences) - assert len(dataset) == number_of_sentences diff --git a/poetry.lock b/poetry.lock index 59622c48..eb97f83f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -811,4 +811,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "beaf1837c50c0400cfcd305eacd132f26513c427374115daf23e4ac0693b41ab" +content-hash = "939536d95d97dcbd4510d11fc228cd450ac11d03d98e29a473734a43ed758964" diff --git a/pyproject.toml b/pyproject.toml index b1a889ed..090ddacd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,9 +2,19 @@ name = "npc-gzip" version = "0.1.0" description = "Code for Paper: “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors" -authors = ["Zach Bloss "] +authors = [ + "Gin Jiang ", + "Zach Bloss ", +] +maintainers = [ + "Zach Bloss ", + "Gin Jiang ", +] license = "MIT" readme = "README.md" +homepage = "https://github.com/bazingagin/npc_gzip" +repository = "https://github.com/bazingagin/npc_gzip" +keywords = ["npc_gzip", "knn-gzip"] packages = [{include = "npc_gzip"}] [tool.poetry.dependencies] @@ -12,17 +22,20 @@ python = "^3.9" numpy = "^1.25.1" tqdm = "^4.65.0" -[tool.poetry.group.dev.dependencies] +[tool.poetry.group.test.dependencies] +pytest = "^7.4.0" + + +[tool.poetry.group.lint.dependencies] black = "^23.7.0" isort = "^5.12.0" + + +[tool.poetry.group.examples.dependencies] torchtext = "^0.15.2" portalocker = "^2.7.0" scikit-learn = "^1.3.0" - -[tool.poetry.group.test.dependencies] -pytest = "^7.4.0" - [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" diff --git a/tests/test_aggregations.py b/tests/test_aggregations.py index ba5edfb3..714268ed 100644 --- a/tests/test_aggregations.py +++ b/tests/test_aggregations.py @@ -1,12 +1,6 @@ import pytest -from npc_gzip.aggregations import ( - aggregate_strings, - average, - concatenate_with_space, - min_max_aggregation, - stack, -) +from npc_gzip.aggregations import aggregate_strings, concatenate_with_space from npc_gzip.exceptions import StringTooShortException @@ -14,17 +8,17 @@ class TestAggregations: stringa: str = "hey there how are you?" stringb: str = "I am just hanging out!" - def test_concatenate_with_space(self): + def test_concatenate_with_space(self) -> None: out = concatenate_with_space(self.stringa, self.stringb) assert len(out) == len(self.stringa) + len(self.stringb) + 1 assert out == f"{self.stringa} {self.stringb}" - def test_aggregate_strings(self): + def test_aggregate_strings(self) -> None: out = aggregate_strings(self.stringa, self.stringb, by_character=False) assert len(out) == len(self.stringa) + len(self.stringb) + 1 - def test_aggregate_strings_by_character(self): + def test_aggregate_strings_by_character(self) -> None: out = aggregate_strings(self.stringa, self.stringb, by_character=True) total_length = len("".join(self.stringa.split())) total_length += len("".join(self.stringb.split())) diff --git a/tests/test_base_compressor.py b/tests/test_base_compressor.py index f8d1a7b5..c078e7d4 100644 --- a/tests/test_base_compressor.py +++ b/tests/test_base_compressor.py @@ -11,7 +11,7 @@ class TestBaseCompressor: compressor = BaseCompressor(compressor=gzip) example_input = "hello there!" - def test_types(self): + def test_types(self) -> None: invalid_compressors = ["test", 0.1, 123, ("hello", "there"), {"hey": "hi"}] for compressor in invalid_compressors: @@ -21,7 +21,7 @@ def test_types(self): valid_compressor = gzip BaseCompressor(valid_compressor) - def test__open_file(self): + def test__open_file(self) -> None: valid_filepath = "tests/test_base_compressor.py" invalid_filepath = "tests/test_base_compressor.py.xyz" @@ -37,7 +37,7 @@ def test__open_file(self): file_contents = self.compressor._open_file(valid_filepath, as_bytes=True) assert isinstance(file_contents, str) - def test__compress(self): + def test__compress(self) -> None: compressed_bytes = self.compressor._compress(self.example_input) assert isinstance(compressed_bytes, bytes) @@ -47,7 +47,7 @@ def test__compress(self): out = self.compressor._compress(input_) assert isinstance(out, bytes) - def test_get_compressed_length(self): + def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 @@ -59,7 +59,7 @@ def test_get_compressed_length(self): assert isinstance(out, int) assert out > 0 - def test_get_bits_per_character(self): + def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) diff --git a/tests/test_bz2_compressor.py b/tests/test_bz2_compressor.py index 30eb41d0..8fcbc9ca 100644 --- a/tests/test_bz2_compressor.py +++ b/tests/test_bz2_compressor.py @@ -13,7 +13,7 @@ class TestBz2Compressor: compressor = Bz2Compressor() example_input = "hello there!" - def test__compress(self): + def test__compress(self) -> None: compressed_bytes = self.compressor._compress(self.example_input) base_compressed_bytes = self.base_compressor._compress(self.example_input) assert compressed_bytes == base_compressed_bytes @@ -24,7 +24,7 @@ def test__compress(self): out = self.base_compressor._compress(input_) assert isinstance(out, bytes) - def test_get_compressed_length(self): + def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 @@ -36,7 +36,7 @@ def test_get_compressed_length(self): assert isinstance(out, int) assert out > 0 - def test_get_bits_per_character(self): + def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) diff --git a/tests/test_distance.py b/tests/test_distance.py index bb013d7f..b9470e0b 100644 --- a/tests/test_distance.py +++ b/tests/test_distance.py @@ -16,7 +16,7 @@ class TestDistance: float_b = random.random() float_ab = random.random() - def test_types(self): + def test_types(self) -> None: distance = Distance(self.array_a, self.array_b, self.array_ab) assert isinstance(distance.compressed_values_a, np.ndarray) @@ -28,7 +28,7 @@ def test_types(self): assert isinstance(distance.compressed_values_b, np.ndarray) assert isinstance(distance.compressed_values_ab, np.ndarray) - def test_invalid_shapes(self): + def test_invalid_shapes(self) -> None: array_a_shape = self.array_a.shape invalid_shape_array = np.random.rand(array_a_shape[0] + 1, array_a_shape[1] + 1) assert invalid_shape_array.shape != self.array_a.shape @@ -36,7 +36,7 @@ def test_invalid_shapes(self): with pytest.raises(InvalidShapeException): Distance(self.array_a, self.array_b, invalid_shape_array) - def test__ncd(self): + def test__ncd(self) -> None: distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance._ncd(self.float_a, self.float_b, self.float_ab) assert isinstance(out, float) @@ -48,7 +48,7 @@ def test__ncd(self): with pytest.raises(ValueError): distance._ncd(self.array_a, self.array_b, self.array_ab) - def test__cdm(self): + def test__cdm(self) -> None: distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance._cdm(self.float_a, self.float_b, self.float_ab) assert isinstance(out, float) @@ -60,7 +60,7 @@ def test__cdm(self): with pytest.raises(ValueError): distance._ncd(self.array_a, self.array_b, self.array_ab) - def test__clm(self): + def test__clm(self) -> None: distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance._clm(self.float_a, self.float_b, self.float_ab) assert isinstance(out, float) @@ -72,7 +72,7 @@ def test__clm(self): with pytest.raises(ValueError): distance._ncd(self.array_a, self.array_b, self.array_ab) - def test__mse(self): + def test__mse(self) -> None: distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance._mse(self.float_a, self.float_b) assert isinstance(out, float) @@ -81,7 +81,7 @@ def test__mse(self): out = distance._mse(a, b) assert isinstance(out, float) - def test_ncd(self): + def test_ncd(self) -> None: distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance.ncd assert isinstance(out, np.ndarray) @@ -90,7 +90,7 @@ def test_ncd(self): out = distance.ncd assert isinstance(out, np.ndarray) - def test_cdm(self): + def test_cdm(self) -> None: distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance.cdm assert isinstance(out, np.ndarray) @@ -99,7 +99,7 @@ def test_cdm(self): out = distance.cdm assert isinstance(out, np.ndarray) - def test_clm(self): + def test_clm(self) -> None: distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance.clm assert isinstance(out, np.ndarray) @@ -108,7 +108,7 @@ def test_clm(self): out = distance.clm assert isinstance(out, np.ndarray) - def test_mse(self): + def test_mse(self) -> None: distance = Distance(self.float_a, self.float_b, self.float_ab) out = distance.mse assert isinstance(out, np.ndarray) diff --git a/tests/test_gzip_compressor.py b/tests/test_gzip_compressor.py index d10f8394..298f1c2e 100644 --- a/tests/test_gzip_compressor.py +++ b/tests/test_gzip_compressor.py @@ -13,7 +13,7 @@ class TestBz2Compressor: compressor = GZipCompressor() example_input = "hello there!" - def test__compress(self): + def test__compress(self) -> None: compressed_bytes = self.compressor._compress(self.example_input) base_compressed_bytes = self.base_compressor._compress(self.example_input) assert compressed_bytes == base_compressed_bytes @@ -24,7 +24,7 @@ def test__compress(self): out = self.base_compressor._compress(input_) assert isinstance(out, bytes) - def test_get_compressed_length(self): + def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 @@ -36,7 +36,7 @@ def test_get_compressed_length(self): assert isinstance(out, int) assert out > 0 - def test_get_bits_per_character(self): + def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) diff --git a/tests/test_knn_classifier.py b/tests/test_knn_classifier.py index 9e027c3f..fb40e15f 100644 --- a/tests/test_knn_classifier.py +++ b/tests/test_knn_classifier.py @@ -35,7 +35,7 @@ class TestKnnClassifier: sample_input: str = "hello there" - def test_init(self): + def test_init(self) -> None: assert self.model.distance_metric == "ncd" assert isinstance(self.model.training_inputs, np.ndarray) assert isinstance(self.model.compressed_training_inputs, np.ndarray) @@ -63,7 +63,7 @@ def test_init(self): == model.compressed_training_inputs.shape[0] ) - def test_invalid_metric(self): + def test_invalid_metric(self) -> None: with pytest.raises(UnsupportedDistanceMetricException): KnnClassifier( compressor=self.gzip_compressor, @@ -72,7 +72,7 @@ def test_invalid_metric(self): distance_metric="hoopla", ) - def test_invalid_data_and_label_size(self): + def test_invalid_data_and_label_size(self) -> None: training_data_size = 10 training_label_size = training_data_size - 1 @@ -87,7 +87,7 @@ def test_invalid_data_and_label_size(self): distance_metric="ncd", ) - def test__compress_sample(self): + def test__compress_sample(self) -> None: compressed_input, compressed_combined = self.model._compress_sample( self.sample_input ) @@ -96,7 +96,7 @@ def test__compress_sample(self): assert isinstance(compressed_combined, np.ndarray) assert compressed_input.shape == compressed_combined.shape - def test__calculate_distance(self): + def test__calculate_distance(self) -> None: compressed_input, compressed_combined = self.model._compress_sample( self.sample_input ) @@ -105,12 +105,12 @@ def test__calculate_distance(self): assert isinstance(distance, np.ndarray) assert distance.shape == compressed_input.shape == compressed_combined.shape - def test_concatenate_with_space(self): + def test_concatenate_with_space(self) -> None: a = "hello there" b = "who goes there?" assert self.model.concatenate_with_space(a, b) == "hello there who goes there?" - def test_predict(self): + def test_predict(self) -> None: top_k = 1 (distance, labels, similar_samples) = self.model.predict( self.sample_input, top_k @@ -132,7 +132,7 @@ def test_predict(self): assert labels.shape == (test_set_size, top_k) assert similar_samples.shape == (test_set_size, top_k) - def test_negative_top_k(self): + def test_negative_top_k(self) -> None: test_set_size = random.randint(1, 50) test_set = [self.sample_input for _ in range(test_set_size)] top_k = -1 @@ -140,14 +140,14 @@ def test_negative_top_k(self): with pytest.raises(AssertionError): self.model.predict(test_set, top_k) - def test_top_k_bigger_than_test_set(self): + def test_top_k_bigger_than_test_set(self) -> None: test_set_size = random.randint(1, 10) test_set = [self.sample_input for _ in range(test_set_size)] top_k = test_set_size + 1 with pytest.raises(AssertionError): self.model.predict(test_set, top_k) - def test_sampling_percentage(self): + def test_sampling_percentage(self) -> None: test_set_size = random.randint(1, 10) test_set = [self.sample_input for _ in range(test_set_size)] top_k = 1 diff --git a/tests/test_lzma_compressor.py b/tests/test_lzma_compressor.py index 3fe1e8ba..11bb3db3 100644 --- a/tests/test_lzma_compressor.py +++ b/tests/test_lzma_compressor.py @@ -13,7 +13,7 @@ class TestBz2Compressor: compressor = LzmaCompressor() example_input = "hello there!" - def test__compress(self): + def test__compress(self) -> None: compressed_bytes = self.compressor._compress(self.example_input) base_compressed_bytes = self.base_compressor._compress(self.example_input) assert compressed_bytes == base_compressed_bytes @@ -24,7 +24,7 @@ def test__compress(self): out = self.base_compressor._compress(input_) assert isinstance(out, bytes) - def test_get_compressed_length(self): + def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 @@ -36,7 +36,7 @@ def test_get_compressed_length(self): assert isinstance(out, int) assert out > 0 - def test_get_bits_per_character(self): + def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 96bef109..c9fa5acc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,7 +7,7 @@ class TestUtils: - def test_generate_sentence(self): + def test_generate_sentence(self) -> None: for _ in range(100): number_of_words = random.randint(1, 100) sentence = generate_sentence(number_of_words) @@ -19,7 +19,7 @@ def test_generate_sentence(self): with pytest.raises(AssertionError): generate_sentence(-1) - def test_generate_dataset(self): + def test_generate_dataset(self) -> None: for _ in range(100): number_of_sentences = random.randint(1, 100) dataset = generate_dataset(number_of_sentences) From 67d76a5d62f02124f152e512f5bcbbcf48d4da11 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 10:41:44 -0400 Subject: [PATCH 52/74] add --doctest-modules back into the cicd pipeline --- .github/workflows/publish-package.yml | 4 ++-- .github/workflows/test-package.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 4bb499f7..c9d3a80e 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -30,8 +30,8 @@ jobs: run: | python -m pip install --upgrade pip poetry install - poetry run pytest --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - + poetry run pytest --doctest-modules --ignore original_codebase/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + - name: Upload pytest test results uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index 574e6907..480a12f4 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -28,7 +28,7 @@ jobs: run: | python -m pip install --upgrade pip poetry install --with test - poetry run pytest --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + poetry run pytest --doctest-modules --ignore original_codebase/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 From e44434644810212e17b58451228c14dfb89d1a0d Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 10:48:05 -0400 Subject: [PATCH 53/74] ignore examples in pytest --- .github/workflows/publish-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index c9d3a80e..c3cce5f8 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -30,7 +30,7 @@ jobs: run: | python -m pip install --upgrade pip poetry install - poetry run pytest --doctest-modules --ignore original_codebase/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 From aadb104f229a436d45131203936f3ebb0ee7dc5e Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 10:48:13 -0400 Subject: [PATCH 54/74] ignore examples/ in pytest --- .github/workflows/test-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index 480a12f4..306038ed 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -28,7 +28,7 @@ jobs: run: | python -m pip install --upgrade pip poetry install --with test - poetry run pytest --doctest-modules --ignore original_codebase/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 From baa72abf1a4fdb3f0baf986c21b7bd36ace70413 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 18:09:09 -0400 Subject: [PATCH 55/74] add sample_data method to handle random sampling logic fixes bug causing drastically lower accuracy than expected --- npc_gzip/knn_classifier.py | 82 +++++++++++++++++++++++++++++------- tests/test_knn_classifier.py | 13 ++++++ 2 files changed, 79 insertions(+), 16 deletions(-) diff --git a/npc_gzip/knn_classifier.py b/npc_gzip/knn_classifier.py index cb101003..a5783923 100644 --- a/npc_gzip/knn_classifier.py +++ b/npc_gzip/knn_classifier.py @@ -165,7 +165,11 @@ def _calculate_distance( function_name="_calculate_distance", ) - def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: + def _compress_sample( + self, + x: str, + training_inputs: Optional[np.ndarray] = None, + ) -> tuple: """ Helper method that compresses `x` against each item in `self.training_inputs` and returns the @@ -176,6 +180,10 @@ def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: Arguments: x (np.ndarray): The sample data to compare against the `training_inputs`. + training_inputs (np.ndarray): [Optional] If provided, this + method will use `training_inputs` + when calculating the distance matrix + rather than `self.training_inputs`. Returns: np.ndarray: Compressed length of `x` as an array of shape @@ -189,10 +197,8 @@ def _compress_sample(self, x: str, sampling_percentage: float = 1.0) -> tuple: x, str ), f"Non-string was passed to self._compress_sample: {x}" - sample_size: int = int(sampling_percentage * self.training_inputs.shape[0]) - training_inputs = np.random.choice( - self.training_inputs, sample_size, replace=False - ) + if training_inputs is None: + training_inputs = self.training_inputs compressed_input_length: int = self.compressor.get_compressed_length(x) compressed_input: list = [ @@ -232,6 +238,47 @@ def concatenate_with_space(stringa: str, stringb: str) -> str: return stringa + " " + stringb + def sample_data( + self, sampling_percentage: float = 1.0 + ) -> tuple[np.ndarray, np.ndarray]: + """ + Given a `sampling_percentage`, this method randomly + samples data from `self.training_inputs` & + `self.training_labels` (if exists) without replacement + and returns two numpy arrays containing the randomly + sampled data. + + Arguments: + sampling_percentage (float): (0, 1.0] % of data to + randomly sample from the + training inputs and labels. + + Returns: + np.ndarray: Randomly sampled training inputs. + np.ndarray: Randomly sampled training labels. + np.ndarray: Indices that the training inputs & + labels were sampled. + """ + + total_inputs: int = self.training_inputs.shape[0] + sample_size: int = int(sampling_percentage * total_inputs) + randomly_sampled_indices: np.ndarray = np.random.choice( + total_inputs, sample_size, replace=False + ) + + randomly_sampled_inputs: np.ndarray = self.training_inputs[ + randomly_sampled_indices + ] + randomly_sampled_labels: np.ndarray = np.array([]) + if self.training_labels is not None: + randomly_sampled_labels = self.training_labels[randomly_sampled_indices] + + return ( + randomly_sampled_inputs, + randomly_sampled_labels, + randomly_sampled_indices, + ) + def predict( self, x: Sequence, @@ -286,11 +333,16 @@ def predict( """ + # sample training inputs and labels + training_inputs, training_labels, randomly_sampled_indices = self.sample_data( + sampling_percentage + ) + samples: list = [] combined: list = [] for sample in tqdm(x, desc="Compressing input..."): compressed_sample, combined_length = self._compress_sample( - sample, sampling_percentage + sample, training_inputs=training_inputs ) samples.append(compressed_sample) combined.append(combined_length) @@ -298,14 +350,14 @@ def predict( compressed_samples: np.ndarray = np.array(samples) compressed_combined: np.ndarray = np.array(combined) + assert isinstance(training_inputs, np.ndarray) + assert isinstance(compressed_samples, np.ndarray) + assert isinstance(compressed_combined, np.ndarray) assert isinstance(self.compressed_training_inputs, np.ndarray) - compressed_training_size: int = int( - self.compressed_training_inputs.shape[0] * sampling_percentage - ) - compressed_training: np.ndarray = np.random.choice( - self.compressed_training_inputs, compressed_training_size - ) + compressed_training: np.ndarray = self.compressed_training_inputs[ + randomly_sampled_indices + ] distances: np.ndarray = self._calculate_distance( compressed_samples, compressed_combined, compressed_training @@ -317,9 +369,7 @@ def predict( # get indicies of minimum top_k distances. minimum_distance_indices = np.argpartition(distances, top_k)[:, :top_k] - similar_samples: np.ndarray = self.training_inputs[minimum_distance_indices] - labels: np.ndarray = np.array([]) - if self.training_labels is not None: - labels = self.training_labels[minimum_distance_indices] + similar_samples: np.ndarray = training_inputs[minimum_distance_indices] + labels: np.ndarray = training_labels[minimum_distance_indices] return distances, labels, similar_samples diff --git a/tests/test_knn_classifier.py b/tests/test_knn_classifier.py index fb40e15f..16672e2d 100644 --- a/tests/test_knn_classifier.py +++ b/tests/test_knn_classifier.py @@ -160,3 +160,16 @@ def test_sampling_percentage(self) -> None: assert full_labels.shape == labels.shape assert full_similar_samples.shape == similar_samples.shape + + def test_sample_data(self): + sampling_percentage: float = 0.8 + randomly_sampled_inputs, randomly_sampled_labels, randomly_sampled_indices = self.model.sample_data( + sampling_percentage + ) + + assert randomly_sampled_inputs.shape == randomly_sampled_labels.shape == randomly_sampled_indices.shape + assert randomly_sampled_inputs.shape[0] == int( + self.model.training_inputs.shape[0] * sampling_percentage + ) + + assert (self.model.training_inputs[randomly_sampled_indices] == randomly_sampled_inputs).all() From b317492ef480dfb41483d8f3d42ff0eeafebf69c Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 18:11:05 -0400 Subject: [PATCH 56/74] add new classification report results --- examples/ag_news.py | 16 ++++++++-------- examples/imdb.py | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/ag_news.py b/examples/ag_news.py index a548e450..7ba69dcc 100644 --- a/examples/ag_news.py +++ b/examples/ag_news.py @@ -102,11 +102,11 @@ def main() -> None: # precision recall f1-score support -# 1 0.24 0.09 0.13 258 -# 2 0.25 0.44 0.32 231 -# 3 0.26 0.06 0.09 244 -# 4 0.26 0.43 0.32 267 - -# accuracy 0.26 1000 -# macro avg 0.25 0.26 0.22 1000 -# weighted avg 0.25 0.26 0.22 1000 +# 1 0.67 0.80 0.73 246 +# 2 0.78 0.74 0.76 246 +# 3 0.64 0.61 0.62 249 +# 4 0.65 0.59 0.62 259 + +# accuracy 0.68 1000 +# macro avg 0.68 0.68 0.68 1000 +# weighted avg 0.68 0.68 0.68 1000 \ No newline at end of file diff --git a/examples/imdb.py b/examples/imdb.py index 447da7d9..cabf464e 100644 --- a/examples/imdb.py +++ b/examples/imdb.py @@ -109,9 +109,9 @@ def main() -> None: # precision recall f1-score support -# 1 0.49 1.00 0.66 489 -# 2 0.00 0.00 0.00 511 +# 1 0.57 0.63 0.60 495 +# 2 0.59 0.53 0.56 505 -# accuracy 0.49 1000 -# macro avg 0.24 0.50 0.33 1000 -# weighted avg 0.24 0.49 0.32 1000 +# accuracy 0.58 1000 +# macro avg 0.58 0.58 0.58 1000 +# weighted avg 0.58 0.58 0.58 1000 \ No newline at end of file From e2b195524ba64e5fc668cb7e030605fcf0cf79d1 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 18:17:10 -0400 Subject: [PATCH 57/74] add correct load_filipino function --- original_codebase/data.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/original_codebase/data.py b/original_codebase/data.py index d1de74c1..2ff3b0a4 100644 --- a/original_codebase/data.py +++ b/original_codebase/data.py @@ -312,33 +312,34 @@ def process(dataset: Iterable) -> list: return train_ds, test_ds -def load_filipino() -> tuple: +def load_filipino(data_directory) -> tuple: """ - deprecated - datasets on huggingface have overlapped train&test + Loads the Dengue Filipino dataset from local directory - Loads the Dengue Filipino dataset + :ref: https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks#datasets + Arguments: + data_directory (str): Directory containing Dengue Filipino dataset Returns: tuple: Tuple of lists containing the training and testing datasets respectively. """ - def process(dataset: Iterable) -> list: - label_dict = OrderedDict() - d = {"absent": 0, "dengue": 1, "health": 2, "mosquito": 3, "sick": 4} - for k, v in d.items(): - label_dict[k] = v - + def process(fn): pairs = [] - for pair in dataset: - text = pair["text"] - for k in label_dict: - if pair[k] == 1: - label = label_dict[k] - pairs.append((label, text)) + with open(fn, "r") as f: + reader = csv.reader(f, delimiter=",", quotechar='"') + for row in reader: + text = row[0] + for i in range(1, 6): + if row[i] == "1": + label = i - 1 + pairs.append((label, text)) + break return pairs - ds = load_dataset("dengue_filipino") - train_ds, test_ds = process(ds["train"]), process(ds["test"]) + train_ds, test_ds = process(os.path.join(data_directory, "train.csv")), process( + os.path.join(data_directory, "test.csv") + ) return train_ds, test_ds From a18b56a28462c58e7d355a4a7f14fde28539ab16 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 18:17:16 -0400 Subject: [PATCH 58/74] black + isort --- examples/ag_news.py | 2 +- examples/imdb.py | 2 +- tests/test_knn_classifier.py | 19 ++++++++++++++----- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/examples/ag_news.py b/examples/ag_news.py index 7ba69dcc..2f60ad99 100644 --- a/examples/ag_news.py +++ b/examples/ag_news.py @@ -109,4 +109,4 @@ def main() -> None: # accuracy 0.68 1000 # macro avg 0.68 0.68 0.68 1000 -# weighted avg 0.68 0.68 0.68 1000 \ No newline at end of file +# weighted avg 0.68 0.68 0.68 1000 diff --git a/examples/imdb.py b/examples/imdb.py index cabf464e..44be9f1b 100644 --- a/examples/imdb.py +++ b/examples/imdb.py @@ -114,4 +114,4 @@ def main() -> None: # accuracy 0.58 1000 # macro avg 0.58 0.58 0.58 1000 -# weighted avg 0.58 0.58 0.58 1000 \ No newline at end of file +# weighted avg 0.58 0.58 0.58 1000 diff --git a/tests/test_knn_classifier.py b/tests/test_knn_classifier.py index 16672e2d..c13a45e3 100644 --- a/tests/test_knn_classifier.py +++ b/tests/test_knn_classifier.py @@ -163,13 +163,22 @@ def test_sampling_percentage(self) -> None: def test_sample_data(self): sampling_percentage: float = 0.8 - randomly_sampled_inputs, randomly_sampled_labels, randomly_sampled_indices = self.model.sample_data( - sampling_percentage - ) + ( + randomly_sampled_inputs, + randomly_sampled_labels, + randomly_sampled_indices, + ) = self.model.sample_data(sampling_percentage) - assert randomly_sampled_inputs.shape == randomly_sampled_labels.shape == randomly_sampled_indices.shape + assert ( + randomly_sampled_inputs.shape + == randomly_sampled_labels.shape + == randomly_sampled_indices.shape + ) assert randomly_sampled_inputs.shape[0] == int( self.model.training_inputs.shape[0] * sampling_percentage ) - assert (self.model.training_inputs[randomly_sampled_indices] == randomly_sampled_inputs).all() + assert ( + self.model.training_inputs[randomly_sampled_indices] + == randomly_sampled_inputs + ).all() From cef8c8dca1ecf2d4bf542ecc6214af4e69929752 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 21:00:51 -0400 Subject: [PATCH 59/74] add hyperlink in README --- README.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b8c572ae..db4977d5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This paper is accepted to Findings of [ACL2023](https://aclanthology.org/2023.fi ## Getting Started -This codebase is available on pypi.org via +This codebase is [available on pypi.org via](https://pypi.org/project/npc-gzip) ```bash @@ -18,10 +18,16 @@ See the [examples](./examples/imdb.py) directory for example usage. ## Testing -This package utilizes `poetry` to maintain it's dependencies and `pytest` to execute tests. To get started running the tests: +This package utilizes `poetry` to maintain its dependencies and `pytest` to execute tests. To get started running the tests: + +```bash + +poetry shell +poetry install +pytest + +``` -1. `poetry install --with test` -2. `pytest tests/` ### Require From 6b440565b67de0ed4799b79cfa9d5854645865b9 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 23:09:22 -0400 Subject: [PATCH 60/74] remove pip upgrade --- .github/workflows/publish-package.yml | 1 - .github/workflows/test-package.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index c3cce5f8..c0133a03 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -28,7 +28,6 @@ jobs: - name: Run Tests run: | - python -m pip install --upgrade pip poetry install poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index 306038ed..ac78834e 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -26,7 +26,6 @@ jobs: - name: Run Tests run: | - python -m pip install --upgrade pip poetry install --with test poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml From b00bae76488d31d7f68b9691cfd01d5d107f2434 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 23:09:40 -0400 Subject: [PATCH 61/74] simplify utils by only expecting ints --- npc_gzip/utils.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/npc_gzip/utils.py b/npc_gzip/utils.py index 16eff88e..68aee0f0 100644 --- a/npc_gzip/utils.py +++ b/npc_gzip/utils.py @@ -29,16 +29,6 @@ def generate_sentence(number_of_words: int = 10) -> str: str: Sentence of random numbers and letters. """ - if not isinstance(number_of_words, int): - try: - number_of_words = int(number_of_words) - except Exception as e: - raise InvalidObjectTypeException( - type(number_of_words), - supported_types=[int, float], - function_name="generate_sentence", - ) from e - assert number_of_words > 0, f"`number_of_words` must be greater than zero." words = [] @@ -70,16 +60,6 @@ def generate_dataset(number_of_sentences: int) -> list: list: List of sentences (str). """ - if not isinstance(number_of_sentences, int): - try: - number_of_sentences = int(number_of_sentences) - except Exception as e: - raise InvalidObjectTypeException( - type(number_of_sentences), - supported_types=[int, float], - function_name="generate_dataset", - ) from e - assert number_of_sentences > 0, f"`number_of_sentences` must be greater than zero." dataset = [] From 3fe355e4a293178a2a565ee5e989706e25bba279 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 23:09:50 -0400 Subject: [PATCH 62/74] add return hint of None --- tests/test_knn_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_knn_classifier.py b/tests/test_knn_classifier.py index c13a45e3..d35a34dc 100644 --- a/tests/test_knn_classifier.py +++ b/tests/test_knn_classifier.py @@ -161,7 +161,7 @@ def test_sampling_percentage(self) -> None: assert full_labels.shape == labels.shape assert full_similar_samples.shape == similar_samples.shape - def test_sample_data(self): + def test_sample_data(self) -> None: sampling_percentage: float = 0.8 ( randomly_sampled_inputs, From 9208fa784bcfc48ed78ed82157655eb10aaced95 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 23:09:56 -0400 Subject: [PATCH 63/74] update tests --- poetry.lock | 18 +++++++++--------- tests/test_utils.py | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/poetry.lock b/poetry.lock index eb97f83f..828be8c7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -414,29 +414,29 @@ files = [ [[package]] name = "pathspec" -version = "0.11.1" +version = "0.11.2" description = "Utility library for gitignore style pattern matching of file paths." optional = false python-versions = ">=3.7" files = [ - {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, - {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, + {file = "pathspec-0.11.2-py3-none-any.whl", hash = "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20"}, + {file = "pathspec-0.11.2.tar.gz", hash = "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"}, ] [[package]] name = "platformdirs" -version = "3.9.1" +version = "3.10.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.9.1-py3-none-any.whl", hash = "sha256:ad8291ae0ae5072f66c16945166cb11c63394c7a3ad1b1bc9828ca3162da8c2f"}, - {file = "platformdirs-3.9.1.tar.gz", hash = "sha256:1b42b450ad933e981d56e59f1b97495428c9bd60698baab9f3eb3d00d5822421"}, + {file = "platformdirs-3.10.0-py3-none-any.whl", hash = "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"}, + {file = "platformdirs-3.10.0.tar.gz", hash = "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d"}, ] [package.extras] -docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] [[package]] name = "pluggy" @@ -811,4 +811,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "939536d95d97dcbd4510d11fc228cd450ac11d03d98e29a473734a43ed758964" +content-hash = "fbaae83db7c0f3f88c84f676cbcca269405053841c048a085843a23f0f9a5f48" diff --git a/tests/test_utils.py b/tests/test_utils.py index c9fa5acc..341762c2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -13,7 +13,7 @@ def test_generate_sentence(self) -> None: sentence = generate_sentence(number_of_words) assert len(sentence.split()) == number_of_words - with pytest.raises(InvalidObjectTypeException): + with pytest.raises(TypeError): generate_sentence("hey there") with pytest.raises(AssertionError): @@ -25,7 +25,7 @@ def test_generate_dataset(self) -> None: dataset = generate_dataset(number_of_sentences) assert len(dataset) == number_of_sentences - with pytest.raises(InvalidObjectTypeException): + with pytest.raises(TypeError): generate_dataset("hey there") with pytest.raises(AssertionError): From 088010203c707a349d970203d7f969b1ba5ecf76 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Sun, 30 Jul 2023 23:11:35 -0400 Subject: [PATCH 64/74] simplify poetry install --- .github/workflows/test-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index ac78834e..c259bcb8 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -26,7 +26,7 @@ jobs: - name: Run Tests run: | - poetry install --with test + poetry install poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results From 241595cec5ea2f897d5fdeef39142996b0726750 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 07:34:37 -0400 Subject: [PATCH 65/74] move doctest to top of file --- README.md | 14 +++++++++----- npc_gzip/distance.py | 4 ---- npc_gzip/utils.py | 9 ++++----- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index db4977d5..d35c1536 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,8 @@ pip install npc-gzip ``` +## Usage + See the [examples](./examples/imdb.py) directory for example usage. @@ -24,13 +26,15 @@ This package utilizes `poetry` to maintain its dependencies and `pytest` to exec poetry shell poetry install -pytest +poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ ``` +------------------------- +### Original Codebase -### Require +#### Require See `requirements.txt`. @@ -41,7 +45,7 @@ conda activate npc pip install -r requirements.txt ``` -### Run +#### Run ``` python main_text.py @@ -64,7 +68,7 @@ By default, this will only use 100 test and training samples per class as a quic ``` -### Calculate Accuracy (Optional) +#### Calculate Accuracy (Optional) If we want to calculate accuracy from recorded distance file , use @@ -73,7 +77,7 @@ python main_text.py --record --score --distance_fn ``` to calculate accuracy. Otherwise, the accuracy will be calculated automatically using the command in the last section. -### Use Custom Dataset +#### Use Custom Dataset You can use your own custom dataset by passing `custom` to `--dataset`; pass the data directory that contains `train.txt` and `test.txt` to `--data_dir`; pass the class number to the `--class_num`. diff --git a/npc_gzip/distance.py b/npc_gzip/distance.py index e456509e..3c4bfca7 100644 --- a/npc_gzip/distance.py +++ b/npc_gzip/distance.py @@ -62,10 +62,6 @@ def __init__( if not isinstance(compressed_values_ab, np.ndarray): compressed_values_ab = np.array(compressed_values_ab) - compressed_values_a = compressed_values_a - compressed_values_b = compressed_values_b - compressed_values_ab = compressed_values_ab - if ( compressed_values_a.shape == compressed_values_b.shape diff --git a/npc_gzip/utils.py b/npc_gzip/utils.py index 68aee0f0..71824ede 100644 --- a/npc_gzip/utils.py +++ b/npc_gzip/utils.py @@ -1,8 +1,3 @@ -import random -import string - -from npc_gzip.exceptions import InvalidObjectTypeException - """ Helper functions used primarily in testing the rest of the codebase. @@ -12,6 +7,10 @@ >>> assert len(dataset) == number_of_sentences """ +import random +import string + +from npc_gzip.exceptions import InvalidObjectTypeException def generate_sentence(number_of_words: int = 10) -> str: """ From 5a649d09ee4b6c52c0903143a8a645fa8ab795f5 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 10:55:27 -0400 Subject: [PATCH 66/74] remove redundant poetry run pytest calls --- .github/workflows/publish-package.yml | 5 +++-- .github/workflows/test-package.yml | 3 ++- README.md | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index c0133a03..c79ba722 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -28,9 +28,10 @@ jobs: - name: Run Tests run: | + poetry shell poetry install - poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - + pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + - name: Upload pytest test results uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index c259bcb8..1a49dea5 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -26,8 +26,9 @@ jobs: - name: Run Tests run: | + poetry shell poetry install - poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 diff --git a/README.md b/README.md index d35c1536..01203361 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ This package utilizes `poetry` to maintain its dependencies and `pytest` to exec poetry shell poetry install -poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ +pytest --doctest-modules --ignore original_codebase/ --ignore examples/ ``` From feae49fff8abf0a7f21b5a2af8dee8f0dc4f63f7 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 10:57:35 -0400 Subject: [PATCH 67/74] undo pipeline change --- .github/workflows/publish-package.yml | 5 ++--- .github/workflows/test-package.yml | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index c79ba722..c0133a03 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -28,10 +28,9 @@ jobs: - name: Run Tests run: | - poetry shell poetry install - pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - + poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + - name: Upload pytest test results uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index 1a49dea5..c259bcb8 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -26,9 +26,8 @@ jobs: - name: Run Tests run: | - poetry shell poetry install - pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 From 994f9b09d04f0b40dc2b7aa4b0859a8353d3fa3d Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 11:21:28 -0400 Subject: [PATCH 68/74] add tool.pytest.ini_options to simplify pytest call --- .github/workflows/publish-package.yml | 2 +- .github/workflows/test-package.yml | 2 +- README.md | 2 +- pyproject.toml | 3 +++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index c0133a03..148ca40d 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -29,7 +29,7 @@ jobs: - name: Run Tests run: | poetry install - poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + poetry run pytest --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index c259bcb8..a369a13c 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -27,7 +27,7 @@ jobs: - name: Run Tests run: | poetry install - poetry run pytest --doctest-modules --ignore original_codebase/ --ignore examples/ --junit-xml=junit/test-results-${{ matrix.python-version }}.xml + poetry run pytest --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results uses: actions/upload-artifact@v3 diff --git a/README.md b/README.md index 01203361..0abea063 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ This package utilizes `poetry` to maintain its dependencies and `pytest` to exec poetry shell poetry install -pytest --doctest-modules --ignore original_codebase/ --ignore examples/ +pytest ``` diff --git a/pyproject.toml b/pyproject.toml index 090ddacd..ac8cbba0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,3 +39,6 @@ scikit-learn = "^1.3.0" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +addopts = "--doctest-modules --ignore original_codebase/ --ignore examples/" From 22f95f3a744d05b18b48fc5573b98726f68bdd6c Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 13:15:05 -0400 Subject: [PATCH 69/74] remove unused if/main --- npc_gzip/distance.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/npc_gzip/distance.py b/npc_gzip/distance.py index 3c4bfca7..f999b1d6 100644 --- a/npc_gzip/distance.py +++ b/npc_gzip/distance.py @@ -221,28 +221,3 @@ def mse(self) -> np.ndarray: out = out.reshape(self.compressed_values_a.shape) return out - - -if __name__ == "__main__": - import random - - a = random.random() - b = random.random() - ab = random.random() - - distance = Distance(a, b, ab) - ncd = distance._ncd(a, b, ab) - cdm = distance._cdm(a, b, ab) - clm = distance._clm(a, b, ab) - mse = distance._mse(a, b) - - a = np.random.rand(3, 10) - b = np.random.rand(3, 10) - ab = np.random.rand(3, 10) - - distance = Distance(a, b, ab) - - ncd = distance.ncd - cdm = distance.cdm - clm = distance.clm - mse = distance.mse From a08a9ddcd8a318a6161871014b3268290b47b4d0 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 14:35:06 -0400 Subject: [PATCH 70/74] update numpy update tests to reflect changes in not handling type conversions --- .github/workflows/publish-package.yml | 2 ++ .github/workflows/test-package.yml | 3 +- npc_gzip/compressors/base.py | 5 +-- poetry.lock | 52 +++++++++++++-------------- tests/test_base_compressor.py | 17 --------- tests/test_bz2_compressor.py | 18 ---------- tests/test_gzip_compressor.py | 17 --------- tests/test_lzma_compressor.py | 17 --------- 8 files changed, 31 insertions(+), 100 deletions(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 148ca40d..38c99e56 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -32,6 +32,8 @@ jobs: poetry run pytest --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - name: Upload pytest test results + if: ${{ !cancelled() }} # Upload results even if tests fail. + uses: actions/upload-artifact@v3 with: name: pytest-results-${{ matrix.python-version }} diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index a369a13c..a00dc356 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -29,7 +29,8 @@ jobs: poetry install poetry run pytest --junit-xml=junit/test-results-${{ matrix.python-version }}.xml - - name: Upload pytest test results + - name: Upload pytest test results + if: ${{ !cancelled() }} # Upload results even if tests fail. uses: actions/upload-artifact@v3 with: name: pytest-results-${{ matrix.python-version }} diff --git a/npc_gzip/compressors/base.py b/npc_gzip/compressors/base.py index c943ddcd..9b52ffe7 100644 --- a/npc_gzip/compressors/base.py +++ b/npc_gzip/compressors/base.py @@ -27,7 +27,7 @@ def __init__(self, compressor: ModuleType) -> None: ) self.compressor = compressor - def _open_file(self, filepath: str, as_bytes: bool = False): + def _open_file(self, filepath: str, as_bytes: bool = False) -> str: """ Helper function that loads and returns the contents of a file at `filepath`. Optional `as_bytes` parameter @@ -50,7 +50,6 @@ def _open_file(self, filepath: str, as_bytes: bool = False): with open(filepath, open_mode) as f: file_contents = f.read() - f.close() if as_bytes: file_contents = file_contents.decode("utf-8") @@ -69,7 +68,6 @@ def _compress(self, x: str) -> bytes: bytes: The compressed bytes representation of `x`. """ - x = str(x) x: bytes = x.encode("utf-8") compressed: bytes = self.compressor.compress(x) return compressed @@ -103,7 +101,6 @@ def get_bits_per_character(self, x: str) -> float: characters in `x`. """ - x = str(x) compressed_value: bytes = self._compress(x) compressed_length: int = len(compressed_value) compressed_length_in_bits: int = compressed_length * 8 diff --git a/poetry.lock b/poetry.lock index 828be8c7..11cfee00 100644 --- a/poetry.lock +++ b/poetry.lock @@ -369,36 +369,36 @@ test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] name = "numpy" -version = "1.25.1" +version = "1.25.2" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" files = [ - {file = "numpy-1.25.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d339465dff3eb33c701430bcb9c325b60354698340229e1dff97745e6b3efa"}, - {file = "numpy-1.25.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d736b75c3f2cb96843a5c7f8d8ccc414768d34b0a75f466c05f3a739b406f10b"}, - {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a90725800caeaa160732d6b31f3f843ebd45d6b5f3eec9e8cc287e30f2805bf"}, - {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6c9261d21e617c6dc5eacba35cb68ec36bb72adcff0dee63f8fbc899362588"}, - {file = "numpy-1.25.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0def91f8af6ec4bb94c370e38c575855bf1d0be8a8fbfba42ef9c073faf2cf19"}, - {file = "numpy-1.25.1-cp310-cp310-win32.whl", hash = "sha256:fd67b306320dcadea700a8f79b9e671e607f8696e98ec255915c0c6d6b818503"}, - {file = "numpy-1.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:c1516db588987450b85595586605742879e50dcce923e8973f79529651545b57"}, - {file = "numpy-1.25.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6b82655dd8efeea69dbf85d00fca40013d7f503212bc5259056244961268b66e"}, - {file = "numpy-1.25.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e8f6049c4878cb16960fbbfb22105e49d13d752d4d8371b55110941fb3b17800"}, - {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41a56b70e8139884eccb2f733c2f7378af06c82304959e174f8e7370af112e09"}, - {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5154b1a25ec796b1aee12ac1b22f414f94752c5f94832f14d8d6c9ac40bcca6"}, - {file = "numpy-1.25.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38eb6548bb91c421261b4805dc44def9ca1a6eef6444ce35ad1669c0f1a3fc5d"}, - {file = "numpy-1.25.1-cp311-cp311-win32.whl", hash = "sha256:791f409064d0a69dd20579345d852c59822c6aa087f23b07b1b4e28ff5880fcb"}, - {file = "numpy-1.25.1-cp311-cp311-win_amd64.whl", hash = "sha256:c40571fe966393b212689aa17e32ed905924120737194b5d5c1b20b9ed0fb171"}, - {file = "numpy-1.25.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3d7abcdd85aea3e6cdddb59af2350c7ab1ed764397f8eec97a038ad244d2d105"}, - {file = "numpy-1.25.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1a180429394f81c7933634ae49b37b472d343cccb5bb0c4a575ac8bbc433722f"}, - {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d412c1697c3853c6fc3cb9751b4915859c7afe6a277c2bf00acf287d56c4e625"}, - {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e1266411120a4f16fad8efa8e0454d21d00b8c7cee5b5ccad7565d95eb42dd"}, - {file = "numpy-1.25.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f76aebc3358ade9eacf9bc2bb8ae589863a4f911611694103af05346637df1b7"}, - {file = "numpy-1.25.1-cp39-cp39-win32.whl", hash = "sha256:247d3ffdd7775bdf191f848be8d49100495114c82c2bd134e8d5d075fb386a1c"}, - {file = "numpy-1.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:1d5d3c68e443c90b38fdf8ef40e60e2538a27548b39b12b73132456847f4b631"}, - {file = "numpy-1.25.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:35a9527c977b924042170a0887de727cd84ff179e478481404c5dc66b4170009"}, - {file = "numpy-1.25.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d3fe3dd0506a28493d82dc3cf254be8cd0d26f4008a417385cbf1ae95b54004"}, - {file = "numpy-1.25.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:012097b5b0d00a11070e8f2e261128c44157a8689f7dedcf35576e525893f4fe"}, - {file = "numpy-1.25.1.tar.gz", hash = "sha256:9a3a9f3a61480cc086117b426a8bd86869c213fc4072e606f01c4e4b66eb92bf"}, + {file = "numpy-1.25.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:db3ccc4e37a6873045580d413fe79b68e47a681af8db2e046f1dacfa11f86eb3"}, + {file = "numpy-1.25.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:90319e4f002795ccfc9050110bbbaa16c944b1c37c0baeea43c5fb881693ae1f"}, + {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfe4a913e29b418d096e696ddd422d8a5d13ffba4ea91f9f60440a3b759b0187"}, + {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f08f2e037bba04e707eebf4bc934f1972a315c883a9e0ebfa8a7756eabf9e357"}, + {file = "numpy-1.25.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bec1e7213c7cb00d67093247f8c4db156fd03075f49876957dca4711306d39c9"}, + {file = "numpy-1.25.2-cp310-cp310-win32.whl", hash = "sha256:7dc869c0c75988e1c693d0e2d5b26034644399dd929bc049db55395b1379e044"}, + {file = "numpy-1.25.2-cp310-cp310-win_amd64.whl", hash = "sha256:834b386f2b8210dca38c71a6e0f4fd6922f7d3fcff935dbe3a570945acb1b545"}, + {file = "numpy-1.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c5462d19336db4560041517dbb7759c21d181a67cb01b36ca109b2ae37d32418"}, + {file = "numpy-1.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5652ea24d33585ea39eb6a6a15dac87a1206a692719ff45d53c5282e66d4a8f"}, + {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d60fbae8e0019865fc4784745814cff1c421df5afee233db6d88ab4f14655a2"}, + {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e7f0f7f6d0eee8364b9a6304c2845b9c491ac706048c7e8cf47b83123b8dbf"}, + {file = "numpy-1.25.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bb33d5a1cf360304754913a350edda36d5b8c5331a8237268c48f91253c3a364"}, + {file = "numpy-1.25.2-cp311-cp311-win32.whl", hash = "sha256:5883c06bb92f2e6c8181df7b39971a5fb436288db58b5a1c3967702d4278691d"}, + {file = "numpy-1.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:5c97325a0ba6f9d041feb9390924614b60b99209a71a69c876f71052521d42a4"}, + {file = "numpy-1.25.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b79e513d7aac42ae918db3ad1341a015488530d0bb2a6abcbdd10a3a829ccfd3"}, + {file = "numpy-1.25.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:eb942bfb6f84df5ce05dbf4b46673ffed0d3da59f13635ea9b926af3deb76926"}, + {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0746410e73384e70d286f93abf2520035250aad8c5714240b0492a7302fdca"}, + {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7806500e4f5bdd04095e849265e55de20d8cc4b661b038957354327f6d9b295"}, + {file = "numpy-1.25.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8b77775f4b7df768967a7c8b3567e309f617dd5e99aeb886fa14dc1a0791141f"}, + {file = "numpy-1.25.2-cp39-cp39-win32.whl", hash = "sha256:2792d23d62ec51e50ce4d4b7d73de8f67a2fd3ea710dcbc8563a51a03fb07b01"}, + {file = "numpy-1.25.2-cp39-cp39-win_amd64.whl", hash = "sha256:76b4115d42a7dfc5d485d358728cdd8719be33cc5ec6ec08632a5d6fca2ed380"}, + {file = "numpy-1.25.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a1329e26f46230bf77b02cc19e900db9b52f398d6722ca853349a782d4cff55"}, + {file = "numpy-1.25.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c3abc71e8b6edba80a01a52e66d83c5d14433cbcd26a40c329ec7ed09f37901"}, + {file = "numpy-1.25.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1b9735c27cea5d995496f46a8b1cd7b408b3f34b6d50459d9ac8fe3a20cc17bf"}, + {file = "numpy-1.25.2.tar.gz", hash = "sha256:fd608e19c8d7c55021dffd43bfe5492fab8cc105cc8986f813f8c3c048b38760"}, ] [[package]] diff --git a/tests/test_base_compressor.py b/tests/test_base_compressor.py index c078e7d4..92e72156 100644 --- a/tests/test_base_compressor.py +++ b/tests/test_base_compressor.py @@ -41,32 +41,15 @@ def test__compress(self) -> None: compressed_bytes = self.compressor._compress(self.example_input) assert isinstance(compressed_bytes, bytes) - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] - - for input_ in example_inputs: - out = self.compressor._compress(input_) - assert isinstance(out, bytes) def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] - - for input_ in example_inputs: - out = self.compressor.get_compressed_length(input_) - assert isinstance(out, int) - assert out > 0 def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) assert isinstance(example_bits_per_character, float) - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}, -1] - - for input_ in example_inputs: - out = self.compressor.get_bits_per_character(input_) - assert isinstance(out, float) - assert out > 0 diff --git a/tests/test_bz2_compressor.py b/tests/test_bz2_compressor.py index 8fcbc9ca..1958654c 100644 --- a/tests/test_bz2_compressor.py +++ b/tests/test_bz2_compressor.py @@ -18,32 +18,14 @@ def test__compress(self) -> None: base_compressed_bytes = self.base_compressor._compress(self.example_input) assert compressed_bytes == base_compressed_bytes - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] - - for input_ in example_inputs: - out = self.base_compressor._compress(input_) - assert isinstance(out, bytes) - def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] - - for input_ in example_inputs: - out = self.compressor.get_compressed_length(input_) - assert isinstance(out, int) - assert out > 0 def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) assert isinstance(example_bits_per_character, float) - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}, -1] - - for input_ in example_inputs: - out = self.compressor.get_bits_per_character(input_) - assert isinstance(out, float) - assert out > 0 diff --git a/tests/test_gzip_compressor.py b/tests/test_gzip_compressor.py index 298f1c2e..16fe12b9 100644 --- a/tests/test_gzip_compressor.py +++ b/tests/test_gzip_compressor.py @@ -18,32 +18,15 @@ def test__compress(self) -> None: base_compressed_bytes = self.base_compressor._compress(self.example_input) assert compressed_bytes == base_compressed_bytes - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] - - for input_ in example_inputs: - out = self.base_compressor._compress(input_) - assert isinstance(out, bytes) def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] - - for input_ in example_inputs: - out = self.compressor.get_compressed_length(input_) - assert isinstance(out, int) - assert out > 0 def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) assert isinstance(example_bits_per_character, float) - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}, -1] - - for input_ in example_inputs: - out = self.compressor.get_bits_per_character(input_) - assert isinstance(out, float) - assert out > 0 diff --git a/tests/test_lzma_compressor.py b/tests/test_lzma_compressor.py index 11bb3db3..fa4584a0 100644 --- a/tests/test_lzma_compressor.py +++ b/tests/test_lzma_compressor.py @@ -18,32 +18,15 @@ def test__compress(self) -> None: base_compressed_bytes = self.base_compressor._compress(self.example_input) assert compressed_bytes == base_compressed_bytes - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] - - for input_ in example_inputs: - out = self.base_compressor._compress(input_) - assert isinstance(out, bytes) def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}] - - for input_ in example_inputs: - out = self.compressor.get_compressed_length(input_) - assert isinstance(out, int) - assert out > 0 def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input ) assert isinstance(example_bits_per_character, float) - example_inputs = [0, 0.1, "hey there", [0], (0, 1), {"test": "yep"}, -1] - - for input_ in example_inputs: - out = self.compressor.get_bits_per_character(input_) - assert isinstance(out, float) - assert out > 0 From a241b364d63e6611e830eb58ad172152f2177042 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 14:37:54 -0400 Subject: [PATCH 71/74] fix spacing in pipeline --- .github/workflows/publish-package.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 38c99e56..537c1ee7 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -33,7 +33,6 @@ jobs: - name: Upload pytest test results if: ${{ !cancelled() }} # Upload results even if tests fail. - uses: actions/upload-artifact@v3 with: name: pytest-results-${{ matrix.python-version }} From 9c8b787f89ea4f7d4d1af6f793ee60481d29b23f Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 16:38:04 -0400 Subject: [PATCH 72/74] update numpy update poetry lock --- poetry.lock | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index 11cfee00..b552ae01 100644 --- a/poetry.lock +++ b/poetry.lock @@ -811,4 +811,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "fbaae83db7c0f3f88c84f676cbcca269405053841c048a085843a23f0f9a5f48" +content-hash = "e2cb78e14ddc73381fb5922e6ff7e09c273b17c320efd8687440b6607103e787" diff --git a/pyproject.toml b/pyproject.toml index ac8cbba0..b52b44a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ packages = [{include = "npc_gzip"}] [tool.poetry.dependencies] python = "^3.9" -numpy = "^1.25.1" +numpy = "^1.25.2" tqdm = "^4.65.0" [tool.poetry.group.test.dependencies] From 2b53ac36f001daeefaeac367dcc423d14c321c32 Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 19:50:33 -0400 Subject: [PATCH 73/74] cleaning up from recent feedback --- npc_gzip/aggregations.py | 7 ------- npc_gzip/exceptions.py | 2 +- npc_gzip/knn_classifier.py | 29 +++++------------------------ npc_gzip/utils.py | 1 + tests/test_base_compressor.py | 2 -- tests/test_bz2_compressor.py | 1 - tests/test_gzip_compressor.py | 2 -- tests/test_knn_classifier.py | 5 ----- tests/test_lzma_compressor.py | 2 -- 9 files changed, 7 insertions(+), 44 deletions(-) diff --git a/npc_gzip/aggregations.py b/npc_gzip/aggregations.py index 0cbe4afe..ab1925f6 100644 --- a/npc_gzip/aggregations.py +++ b/npc_gzip/aggregations.py @@ -14,10 +14,6 @@ def concatenate_with_space(stringa: str, stringb: str) -> str: Returns: str: `{stringa} {stringb}` """ - - stringa = str(stringa) - stringb = str(stringb) - return stringa + " " + stringb @@ -37,9 +33,6 @@ def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> str: combination of stringa and stringb """ - stringa = str(stringa) - stringb = str(stringb) - stringa_list: list = stringa.split() stringb_list: list = stringb.split() diff --git a/npc_gzip/exceptions.py b/npc_gzip/exceptions.py index eeda08d8..a59324ee 100644 --- a/npc_gzip/exceptions.py +++ b/npc_gzip/exceptions.py @@ -145,7 +145,7 @@ def __init__( super().__init__(self.message) -class InvalidObjectTypeException(Exception): +class InvalidObjectTypeException(TypeError): def __init__( self, passed_type: str, diff --git a/npc_gzip/knn_classifier.py b/npc_gzip/knn_classifier.py index a5783923..efbb0791 100644 --- a/npc_gzip/knn_classifier.py +++ b/npc_gzip/knn_classifier.py @@ -3,6 +3,7 @@ import numpy as np from tqdm import tqdm +from npc_gzip.aggregations import concatenate_with_space from npc_gzip.compressors.base import BaseCompressor from npc_gzip.distance import Distance from npc_gzip.exceptions import ( @@ -41,7 +42,7 @@ def __init__( self, compressor: BaseCompressor, training_inputs: Sequence, - training_labels: Sequence = None, + training_labels: Optional[Sequence] = None, distance_metric: str = "ncd", ) -> None: self.compressor = compressor @@ -209,7 +210,7 @@ def _compress_sample( combined: list = [] for training_sample in training_inputs: - train_and_x: str = self.concatenate_with_space(training_sample, x) + train_and_x: str = concatenate_with_space(training_sample, x) combined_compressed: int = self.compressor.get_compressed_length( train_and_x ) @@ -220,27 +221,9 @@ def _compress_sample( return (compressed_input, combined) - @staticmethod - def concatenate_with_space(stringa: str, stringb: str) -> str: - """ - Combines `stringa` and `stringb` with a space. - - Arguments: - stringa (str): First item. - stringb (str): Second item. - - Returns: - str: `{stringa} {stringb}` - """ - - stringa = str(stringa) - stringb = str(stringb) - - return stringa + " " + stringb - def sample_data( self, sampling_percentage: float = 1.0 - ) -> tuple[np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Given a `sampling_percentage`, this method randomly samples data from `self.training_inputs` & @@ -284,9 +267,7 @@ def predict( x: Sequence, top_k: int = 1, sampling_percentage: float = 1.0, - ) -> tuple[ - np.ndarray[float, float], np.ndarray[float, float], np.ndarray[float, float] - ]: + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Faster version of `predict`. This method will compare `x` against a sample of the diff --git a/npc_gzip/utils.py b/npc_gzip/utils.py index 71824ede..f30c5b9e 100644 --- a/npc_gzip/utils.py +++ b/npc_gzip/utils.py @@ -12,6 +12,7 @@ from npc_gzip.exceptions import InvalidObjectTypeException + def generate_sentence(number_of_words: int = 10) -> str: """ Generates a sentence of random diff --git a/tests/test_base_compressor.py b/tests/test_base_compressor.py index 92e72156..c99a374d 100644 --- a/tests/test_base_compressor.py +++ b/tests/test_base_compressor.py @@ -41,13 +41,11 @@ def test__compress(self) -> None: compressed_bytes = self.compressor._compress(self.example_input) assert isinstance(compressed_bytes, bytes) - def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 - def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input diff --git a/tests/test_bz2_compressor.py b/tests/test_bz2_compressor.py index 1958654c..ea1ca097 100644 --- a/tests/test_bz2_compressor.py +++ b/tests/test_bz2_compressor.py @@ -23,7 +23,6 @@ def test_get_compressed_length(self) -> None: assert isinstance(example_input_length, int) assert example_input_length > 0 - def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input diff --git a/tests/test_gzip_compressor.py b/tests/test_gzip_compressor.py index 16fe12b9..d7ca247d 100644 --- a/tests/test_gzip_compressor.py +++ b/tests/test_gzip_compressor.py @@ -18,13 +18,11 @@ def test__compress(self) -> None: base_compressed_bytes = self.base_compressor._compress(self.example_input) assert compressed_bytes == base_compressed_bytes - def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 - def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input diff --git a/tests/test_knn_classifier.py b/tests/test_knn_classifier.py index d35a34dc..55f40cd7 100644 --- a/tests/test_knn_classifier.py +++ b/tests/test_knn_classifier.py @@ -105,11 +105,6 @@ def test__calculate_distance(self) -> None: assert isinstance(distance, np.ndarray) assert distance.shape == compressed_input.shape == compressed_combined.shape - def test_concatenate_with_space(self) -> None: - a = "hello there" - b = "who goes there?" - assert self.model.concatenate_with_space(a, b) == "hello there who goes there?" - def test_predict(self) -> None: top_k = 1 (distance, labels, similar_samples) = self.model.predict( diff --git a/tests/test_lzma_compressor.py b/tests/test_lzma_compressor.py index fa4584a0..56c40664 100644 --- a/tests/test_lzma_compressor.py +++ b/tests/test_lzma_compressor.py @@ -18,13 +18,11 @@ def test__compress(self) -> None: base_compressed_bytes = self.base_compressor._compress(self.example_input) assert compressed_bytes == base_compressed_bytes - def test_get_compressed_length(self) -> None: example_input_length = self.compressor.get_compressed_length(self.example_input) assert isinstance(example_input_length, int) assert example_input_length > 0 - def test_get_bits_per_character(self) -> None: example_bits_per_character = self.compressor.get_bits_per_character( self.example_input From 8293be979387927a9a6f90e938a17b8c0c0ed8bc Mon Sep 17 00:00:00 2001 From: Zach Bloss Date: Mon, 31 Jul 2023 23:00:44 -0400 Subject: [PATCH 74/74] simplifying the aggregation line --- npc_gzip/aggregations.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/npc_gzip/aggregations.py b/npc_gzip/aggregations.py index ab1925f6..adedfdf5 100644 --- a/npc_gzip/aggregations.py +++ b/npc_gzip/aggregations.py @@ -39,8 +39,5 @@ def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> zipped_lists: list = list(zip(stringa_list, stringb_list)) out: list = list(itertools.chain(*zipped_lists)) - aggregated: str = " ".join(out) - if by_character: - aggregated = "".join(out) - + aggregated: str = ("" if by_character else " ").join(out) return aggregated