Commit 95358e0: updated code as per suggestion

christinestraub committed Jan 7, 2025
1 parent 8d57220 commit 95358e0
Showing 3 changed files with 42 additions and 35 deletions.
22 changes: 11 additions & 11 deletions Dockerfile
@@ -1,5 +1,8 @@
 FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base
 
+ARG PYTHON=python3.11
+ARG PIP=pip3.11
+
 USER root
 
 WORKDIR /app
@@ -9,26 +12,23 @@ COPY unstructured unstructured
 COPY test_unstructured test_unstructured
 COPY example-docs example-docs
 
-# Copy the downloaded NLTK data folder to your local environment.s
-COPY ./nltk_data /home/notebook-user/nltk_data
-
 RUN chown -R notebook-user:notebook-user /app && \
     apk add font-ubuntu git && \
     fc-cache -fv && \
-    [ -e /usr/bin/python3 ] || ln -s /usr/bin/python3.11 /usr/bin/python3
+    [ -e /usr/bin/python3 ] || ln -s /usr/bin/$PYTHON /usr/bin/python3
 
 USER notebook-user
 
-RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';'
-
-# Command to check if NLTK data has been copied correctly
-RUN python3.11 -c "import nltk; print(nltk.data.find('tokenizers/punkt_tab'))"
+ENV NLTK_DATA=/home/notebook-user/nltk_data
 
-RUN python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
-    python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
+# Install Python dependencies and download required NLTK packages
+RUN find requirements/ -type f -name "*.txt" -exec $PIP install --no-cache-dir --user -r '{}' ';' && \
+    mkdir -p ${NLTK_DATA} && \
+    $PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \
+    $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
+    $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
 
 ENV PATH="${PATH}:/home/notebook-user/.local/bin"
 ENV TESSDATA_PREFIX=/usr/local/share/tessdata
-ENV NLTK_DATA=/home/notebook-user/nltk_data
 
 CMD ["/bin/bash"]
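
For context on the change above: the final RUN step now downloads the NLTK packages (punkt_tab, averaged_perceptron_tagger_eng) into ${NLTK_DATA} at build time, replacing the old approach of COPYing a pre-downloaded nltk_data folder and spot-checking it with nltk.data.find. Below is a minimal smoke test one might run inside the built container to confirm the assets still resolve; the paths mirror the ENV values above, and this script is a sketch, not part of the commit:

import os

import nltk

# NLTK_DATA is set in the Dockerfile; the fallback mirrors its value for local runs.
nltk_data_dir = os.getenv("NLTK_DATA", "/home/notebook-user/nltk_data")

# nltk.data.find raises LookupError when a resource is missing, so a clean
# pass here means the build-time download landed where the runtime will look.
for resource in ("tokenizers/punkt_tab", "taggers/averaged_perceptron_tagger_eng"):
    print(nltk.data.find(resource, paths=[nltk_data_dir]))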
4 changes: 2 additions & 2 deletions test_unstructured/nlp/test_tokenize.py
@@ -6,8 +6,8 @@
 
 
 def test_nltk_assets_validation():
-    with patch("unstructured.nlp.tokenize.validate_nltk_assets") as mock_validate:
-        tokenize.validate_nltk_assets()
+    with patch("unstructured.nlp.tokenize._ensure_nltk_packages_available") as mock_validate:
+        tokenize._ensure_nltk_packages_available()
     mock_validate.assert_called_once()


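Note that the renamed helper is wrapped in lru_cache(), so a test exercising the real check (rather than patching it, as the updated test above does) would need to reset the memoized result first. A hedged sketch under that assumption; this test is not part of the commit:

from unstructured.nlp import tokenize


def test_ensure_nltk_packages_available_is_memoized():
    # Clear the lru_cache so the first call below actually runs the check.
    tokenize._ensure_nltk_packages_available.cache_clear()
    # Raises RuntimeError if punkt_tab or the tagger is missing from the environment.
    tokenize._ensure_nltk_packages_available()
    # The second call is served from the cache without re-checking the filesystem.
    tokenize._ensure_nltk_packages_available()
    assert tokenize._ensure_nltk_packages_available.cache_info().hits == 1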
51 changes: 29 additions & 22 deletions unstructured/nlp/tokenize.py
@@ -11,9 +11,6 @@
 
 CACHE_MAX_SIZE: Final[int] = 128
 
-NLTK_DATA_PATH = os.getenv("NLTK_DATA", "/home/notebook-user/nltk_data")
-nltk.data.path.append(NLTK_DATA_PATH)
-
 
 def download_nltk_packages():
     nltk.download("averaged_perceptron_tagger_eng", quiet=True)
@@ -22,49 +19,59 @@ def download_nltk_packages():
 
 def check_for_nltk_package(package_name: str, package_category: str) -> bool:
     """Checks to see if the specified NLTK package exists on the file system."""
+    paths: list[str] = []
+    for path in nltk.data.path:
+        if not path.endswith("nltk_data"):
+            path = os.path.join(path, "nltk_data")
+        paths.append(path)
+
     try:
-        nltk.find(f"{package_category}/{package_name}")
+        nltk.find(f"{package_category}/{package_name}", paths=paths)
         return True
     except (LookupError, OSError):
         return False
 
 
-# Ensure NLTK data exists in the specified path (pre-baked in Docker)
-def validate_nltk_assets():
-    """Validate that required NLTK packages are preloaded in the image."""
-    required_assets = [
-        ("punkt_tab", "tokenizers"),
-        ("averaged_perceptron_tagger_eng", "taggers"),
-    ]
-    for package_name, category in required_assets:
-        if not check_for_nltk_package(package_name, category):
-            raise RuntimeError(
-                f"Required NLTK package '{package_name}' is missing. "
-                f"Ensure it is baked into the Docker image at '{NLTK_DATA_PATH}'."
-            )
 
+# We cache this because we do not want to attempt
+# checking the packages multiple times
+@lru_cache()
+def _ensure_nltk_packages_available():
+    """Ensure required NLTK packages are available, raise an error if not."""
+    tagger_available = check_for_nltk_package(
+        package_category="taggers",
+        package_name="averaged_perceptron_tagger_eng",
+    )
+    tokenizer_available = check_for_nltk_package(
+        package_category="tokenizers",
+        package_name="punkt_tab",
+    )
 
-# Validate NLTK assets at import time
-validate_nltk_assets()
+    if not tagger_available or not tokenizer_available:
+        raise RuntimeError(
+            "Required NLTK packages are not available. "
+            "Ensure the assets are pre-baked into the image."
+        )
 
 
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
+    _ensure_nltk_packages_available()
     return _sent_tokenize(text)
 
 
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def word_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK word tokenizer with LRU caching enabled."""
+    _ensure_nltk_packages_available()
     return _word_tokenize(text)
 
 
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def pos_tag(text: str) -> List[Tuple[str, str]]:
     """A wrapper around the NLTK POS tagger with LRU caching enabled."""
-    # NOTE: Splitting into sentences before tokenizing helps with situations
-    # like "ITEM 1A. PROPERTIES" where tokens can be misinterpreted.
+    _ensure_nltk_packages_available()
+    # Splitting into sentences before tokenizing.
    sentences = _sent_tokenize(text)
     parts_of_speech: list[tuple[str, str]] = []
     for sentence in sentences:
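Taken together, the tokenize.py changes swap import-time validation (the deleted module-level validate_nltk_assets() call) for a lazy, memoized guard that each public wrapper invokes on first use, so merely importing the module no longer requires the NLTK data to be present. A self-contained sketch of the pattern, using generic, hypothetical names rather than the module's actual code:

from functools import lru_cache


def _assets_present() -> bool:
    # Stand-in for check_for_nltk_package(); a real version would probe the filesystem.
    return True


@lru_cache()
def _ensure_assets_available() -> None:
    """Run the availability check at most once per process."""
    if not _assets_present():
        raise RuntimeError("Required assets are missing.")


@lru_cache(maxsize=128)
def tokenize_words(text: str) -> list[str]:
    # After the first call, lru_cache replays the memoized result immediately,
    # so the guard adds no per-call filesystem cost.
    _ensure_assets_available()
    return text.split()


print(tokenize_words("hello world"))  # ['hello', 'world']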
