Skip to content

Commit

Permalink
reset example and set upper bound for unstructured
Browse files Browse the repository at this point in the history
  • Loading branch information
mattseddon committed Oct 18, 2024
1 parent 6a79d57 commit fe93b13
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 9 deletions.
12 changes: 5 additions & 7 deletions examples/llm_and_nlp/unstructured-embeddings-gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
group_broken_paragraphs,
replace_unicode_quotes,
)
-from unstructured.partition.pdf import partition_pdf
-from unstructured_ingest.embed.huggingface import (
+from unstructured.embed.huggingface import (
HuggingFaceEmbeddingConfig,
HuggingFaceEmbeddingEncoder,
)
+from unstructured.partition.pdf import partition_pdf

from datachain import C, DataChain, DataModel, File

Expand All @@ -43,7 +43,6 @@ def process_pdf(file: File) -> Iterator[Chunk]:
chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")

# Clean the chunks and add new columns
-chunks_cleaned = []
for chunk in chunks:
chunk.apply(
lambda text: clean(
Expand All @@ -52,17 +51,16 @@ def process_pdf(file: File) -> Iterator[Chunk]:
)
chunk.apply(replace_unicode_quotes)
chunk.apply(group_broken_paragraphs)
-chunks_cleaned.append({"text": chunk.text})

# create embeddings
-chunks_embedded = embedding_encoder.embed_documents(chunks_cleaned)
+chunks_embedded = embedding_encoder.embed_documents(chunks)

# Add new rows to DataChain
for chunk in chunks_embedded:
yield Chunk(
key=file.path,
-text=chunk.get("text"),
-embeddings=chunk.get("embeddings"),
+text=chunk.text,
+embeddings=chunk.embeddings,
)


Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,7 @@ examples = [
"numpy>=1,<2",
"defusedxml",
"accelerate",
-"unstructured-ingest[embed-huggingface]",
-"unstructured[pdf]",
+"unstructured[pdf,embed-huggingface]<0.16.0",
"pdfplumber==0.11.4",
"huggingface_hub[hf_transfer]",
"onnx==1.16.1"
Expand Down

0 comments on commit fe93b13

Please sign in to comment.