Skip to content

Commit

Permalink
patch unstructured embeddings gen example
Browse files Browse the repository at this point in the history
  • Loading branch information
mattseddon committed Oct 18, 2024
1 parent f6445e2 commit ee6ac41
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 6 deletions.
12 changes: 7 additions & 5 deletions examples/llm_and_nlp/unstructured-embeddings-gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
group_broken_paragraphs,
replace_unicode_quotes,
)
-from unstructured.embed.huggingface import (
+from unstructured.partition.pdf import partition_pdf
+from unstructured_ingest.embed.huggingface import (
HuggingFaceEmbeddingConfig,
HuggingFaceEmbeddingEncoder,
)
-from unstructured.partition.pdf import partition_pdf

from datachain import C, DataChain, DataModel, File

Expand All @@ -43,6 +43,7 @@ def process_pdf(file: File) -> Iterator[Chunk]:
chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")

# Clean the chunks and add new columns
+chunks_cleaned = []
for chunk in chunks:
chunk.apply(
lambda text: clean(
Expand All @@ -51,16 +52,17 @@ def process_pdf(file: File) -> Iterator[Chunk]:
)
chunk.apply(replace_unicode_quotes)
chunk.apply(group_broken_paragraphs)
+chunks_cleaned.append({"text": chunk.text})

# create embeddings
-chunks_embedded = embedding_encoder.embed_documents(chunks)
+chunks_embedded = embedding_encoder.embed_documents(chunks_cleaned)

# Add new rows to DataChain
for chunk in chunks_embedded:
yield Chunk(
key=file.path,
-text=chunk.text,
-embeddings=chunk.embeddings,
+text=chunk.get("text"),
+embeddings=chunk.get("embeddings"),
)


Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ examples = [
"numpy>=1,<2",
"defusedxml",
"accelerate",
-"unstructured[pdf, embed-huggingface]",
+"unstructured_ingest",
+"unstructured[pdf]",
"pdfplumber==0.11.4",
"huggingface_hub[hf_transfer]",
"onnx==1.16.1"
Expand Down

0 comments on commit ee6ac41

Please sign in to comment.