diff --git a/examples/files/CV_Candidate.pdf b/examples/files/CV_Candidate.pdf
new file mode 100644
index 0000000..2578c85
Binary files /dev/null and b/examples/files/CV_Candidate.pdf differ
diff --git a/examples/files/Job_Offer.pdf b/examples/files/Job_Offer.pdf
new file mode 100644
index 0000000..b316cc2
Binary files /dev/null and b/examples/files/Job_Offer.pdf differ
diff --git a/examples/resume_processor.py b/examples/resume_processor.py
new file mode 100644
index 0000000..3625203
--- /dev/null
+++ b/examples/resume_processor.py
@@ -0,0 +1,157 @@
+import json
+import os
+from typing import List, Optional
+
+from dotenv import load_dotenv
+from pydantic import Field
+import yaml
+
+from extract_thinker import Extractor, Contract, DocumentLoaderPyPdf
+from litellm import Router
+
+from extract_thinker.llm import LLM
+
+
+def json_to_yaml(json_dict):
+    # Check if json_dict is a dictionary
+    if not isinstance(json_dict, dict):
+        raise ValueError("json_dict must be a dictionary")
+
+    # Convert the Python dictionary to YAML
+    yaml_str = yaml.dump(json_dict)
+
+    return yaml_str
+
+
+class RoleContract(Contract):
+    company_name: str = Field(description="Company name")
+    years_of_experience: int = Field(description="Years of experience required. If not mentioned, calculate it from the start and end dates")
+    is_remote: bool = Field(description="Is the role remote?")
+    country: str = Field(description="Country of the role")
+    city: Optional[str] = Field(description="City of the role")
+    list_of_skills: List[str] = Field(description="""
+        List of skill strings, e.g. ["5 years experience", "3 years in React", "Typescript"].
+        Phrase each skill as a yes/no item so the LLM can later map it to a list of true/false values.
+        """)
+
+
+class ResumeContract(Contract):
+    name: str = Field(description="First and last name")
+    age: Optional[str] = Field(description="Age, given as a date of birth in DD/MM/YYYY format. Empty if not available")
+    email: str = Field(description="Email address")
+    phone: Optional[str] = Field(description="Phone number")
+    address: Optional[str] = Field(description="Address")
+    city: Optional[str] = Field(description="City")
+    total_experience: int = Field(description="Total experience in years")
+    can_go_to_office: Optional[bool] = Field(description="Can go to the office. False if the candidate's city/location is not provided; true if it is the same city as the role")
+    list_of_skills: List[bool] = Field(description="""Takes the job's list of skills and returns a list of true/false values indicating whether the candidate has each skill, e.g.
+        ['Python', 'JavaScript', 'React', 'Node.js'] -> [True, True, False, True]""")
+
+
+class Person(Contract):
+    name: str = Field(description="First and last name")
+    list_of_skills: List[str]
+
+
+load_dotenv()
+cwd = os.getcwd()
+
+
+def config_router():
+    rpm = 5000  # Rate limit in requests per minute
+
+    model_list = [
+        {
+            "model_name": "Meta-Llama-3-8B-Instruct",
+            "litellm_params": {
+                "model": "deepinfra/meta-llama/Meta-Llama-3-8B-Instruct",
+                "api_key": os.getenv("DEEPINFRA_API_KEY"),
+                "rpm": rpm,
+            },
+        },
+        {
+            "model_name": "Mistral-7B-Instruct-v0.2",
+            "litellm_params": {
+                "model": "deepinfra/mistralai/Mistral-7B-Instruct-v0.2",
+                "api_key": os.getenv("DEEPINFRA_API_KEY"),
+                "rpm": rpm,
+            },
+        },
+        {
+            "model_name": "groq-llama3-8b-8192",
+            "litellm_params": {
+                "model": "groq/llama3-8b-8192",
+                "api_key": os.getenv("GROQ_API_KEY"),
+                "rpm": rpm,
+            },
+        },
+    ]
+
+    # Adding fallback models
+    fallback_models = [
+        {
+            "model_name": "claude-3-haiku-20240307",
+            "litellm_params": {
+                "model": "claude-3-haiku-20240307",
+                "api_key": os.getenv("CLAUDE_API_KEY"),
+            },
+        },
+        {
+            "model_name": "azure-deployment",
+            "litellm_params": {
+                "model": "azure/",
+                "api_base": os.getenv("AZURE_API_BASE"),
+                "api_key": os.getenv("AZURE_API_KEY"),
+                "rpm": 1440,
+            },
+        },
+    ]
+
+    # Combine the lists
+    model_list.extend(fallback_models)
+
+    # Define the router configuration (fallbacks reference model_name entries)
+    router = Router(
+        model_list=model_list,
+        default_fallbacks=["claude-3-haiku-20240307", "azure-deployment"],
+        context_window_fallbacks=[
+            {"Meta-Llama-3-8B-Instruct": ["claude-3-haiku-20240307"]},
+            {"groq-llama3-8b-8192": ["claude-3-haiku-20240307"]},
+            {"Mistral-7B-Instruct-v0.2": ["claude-3-haiku-20240307"]},
+        ],
+        set_verbose=True
+    )
+
+    return router
+
+
+job_role_path = os.path.join(cwd, "examples", "files", "Job_Offer.pdf")
+
+extractor_job_role = Extractor()
+
+extractor_job_role.load_document_loader(
+    DocumentLoaderPyPdf()
+)
+
+extractor_job_role.load_llm("gpt-4o")
+role_result = extractor_job_role.extract(job_role_path, RoleContract)
+
+print(role_result.json())
+
+extractor_candidate = Extractor()
+extractor_candidate.load_document_loader(
+    DocumentLoaderPyPdf()
+)
+
+llm = LLM("groq/llama3-8b-8192")  # default model
+# llm.load_router(config_router())  # load the router
+
+extractor_candidate.load_llm(llm)
+
+resume_content_path = os.path.join(cwd, "examples", "files", "CV_Candidate.pdf")
+
+job_role_content = ("This is the job content to be mapped:\n"
+                    + json_to_yaml(json.loads(role_result.json())))
+
+result = extractor_candidate.extract(resume_content_path,
+                                     ResumeContract,
+                                     content=job_role_content)
+
+print(result.json())
diff --git a/extract_thinker/__init__.py b/extract_thinker/__init__.py
index 59db2e8..c14ad4c 100644
--- a/extract_thinker/__init__.py
+++ b/extract_thinker/__init__.py
@@ -3,6 +3,7 @@
 from .document_loader.cached_document_loader import CachedDocumentLoader
 from .document_loader.document_loader_tesseract import DocumentLoaderTesseract
 from .document_loader.document_loader_spreadsheet import DocumentLoaderSpreadSheet
+from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
 from .document_loader.document_loader_text import DocumentLoaderText
 from .models import classification, classification_response
 from .process import Process
@@ -17,6 +18,7 @@
     'DocumentLoader',
     'CachedDocumentLoader',
     'DocumentLoaderTesseract',
+    'DocumentLoaderPyPdf',
     'DocumentLoaderText',
     'classification',
     'classification_response',
diff --git a/extract_thinker/document_loader/document_loader_llm_image.py b/extract_thinker/document_loader/document_loader_llm_image.py
new file mode 100644
index 0000000..ed75c30
--- /dev/null
+++ b/extract_thinker/document_loader/document_loader_llm_image.py
@@ -0,0 +1,54 @@
+from abc import ABC
+from io import BytesIO
+from PIL import Image
+from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
+from extract_thinker.utils import extract_json
+
+
+class DocumentLoaderLLMImage(CachedDocumentLoader, ABC):
+    def __init__(self, content=None, cache_ttl=300, llm=None):
+        super().__init__(content, cache_ttl)
+        self.llm = llm
+
+    def extract_image_content(self, image_stream: BytesIO) -> str:
+        """
+        Extracts text or data from an image using an LLM.
+        The actual implementation uses an LLM to process the image content.
+        """
+        # Load the image from the stream
+        image = Image.open(image_stream)
+
+        # Encode the image to base64
+        base64_image = self.encode_image(image)
+
+        # Use the LLM to extract the content from the image
+        resp = self.llm.completion(
+            model="claude-3-sonnet-20240229",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a world-class image data extractor. You receive an image and extract useful information from it. "
+                               "You output a JSON with the extracted information.",
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "data:image/jpeg;base64," + base64_image
+                            },
+                        },
+                        {"type": "text", "text": "###JSON Output\n"},
+                    ],
+                },
+            ],
+        )
+
+        # Extract the JSON text from the response
+        json_text = resp.choices[0].message.content
+
+        # Extract the JSON object from the text
+        json_text = extract_json(json_text)
+
+        # Return the extracted content
+        return json_text
diff --git a/extract_thinker/document_loader/document_loader_pypdf.py b/extract_thinker/document_loader/document_loader_pypdf.py
new file mode 100644
index 0000000..d51255f
--- /dev/null
+++ b/extract_thinker/document_loader/document_loader_pypdf.py
@@ -0,0 +1,41 @@
+import io
+from typing import Any, Dict, List, Union
+from PyPDF2 import PdfReader
+from extract_thinker.document_loader.document_loader_llm_image import DocumentLoaderLLMImage
+
+
+class DocumentLoaderPyPdf(DocumentLoaderLLMImage):
+    def __init__(self, content: Any = None, cache_ttl: int = 300):
+        super().__init__(content, cache_ttl)
+
+    def load_content_from_file(self, file_path: str) -> Union[str, Dict[str, Any]]:
+        reader = PdfReader(file_path)
+        return self.extract_data_from_pdf(reader)
+
+    def load_content_from_stream(self, stream: io.BytesIO) -> Union[str, Dict[str, Any]]:
+        reader = PdfReader(stream)
+        return self.extract_data_from_pdf(reader)
+
+    def load_content_from_file_list(self, file_paths: List[str]) -> List[Any]:
+        return [self.load_content_from_file(file_path) for file_path in file_paths]
+
+    def load_content_from_stream_list(self, streams: List[io.BytesIO]) -> List[Any]:
+        return [self.load_content_from_stream(stream) for stream in streams]
+
+    def extract_data_from_pdf(self, reader: PdfReader) -> Union[str, Dict[str, Any]]:
+        document_data = {
+            "text": []
+        }
+
+        for page in reader.pages:
+            # Extract text and split it into lines
+            page_text = page.extract_text()
+            document_data["text"].extend(page_text.split('\n'))
+
+            # TODO: image extraction is skipped for now.
+            # for img_index, image in enumerate(page.images):
+            #     image_data = self.extract_image_content(io.BytesIO(image["data"]))
+            #     if image_data:
+            #         document_data["images"].append(image_data)
+
+        return document_data
diff --git a/extract_thinker/extractor.py b/extract_thinker/extractor.py
index 07ffeae..81dac12 100644
--- a/extract_thinker/extractor.py
+++ b/extract_thinker/extractor.py
@@ -13,7 +13,8 @@
 from extract_thinker.document_loader.loader_interceptor import LoaderInterceptor
 from extract_thinker.document_loader.llm_interceptor import LlmInterceptor
 
-from extract_thinker.utils import get_file_extension
+from extract_thinker.utils import get_file_extension, encode_image
+import yaml
 
 SUPPORTED_IMAGE_FORMATS = ["jpeg", "png", "bmp", "tiff"]
@@ -30,6 +31,7 @@ def __init__(
         self.document_loaders_by_file_type: Dict[str, DocumentLoader] = {}
         self.loader_interceptors: List[LoaderInterceptor] = []
         self.llm_interceptors: List[LlmInterceptor] = []
+        self.extra_content: Optional[str] = None
 
     def add_interceptor(
         self, interceptor: Union[LoaderInterceptor, LlmInterceptor]
@@ -55,10 +57,17 @@ def get_document_loader_for_file(self, file: str) -> DocumentLoader:
     def load_document_loader(self, document_loader: DocumentLoader) -> None:
         self.document_loader = document_loader
 
-    def load_llm(self, model: str) -> None:
-        self.llm = LLM(model)
+    def load_llm(self, model: Optional[Union[str, LLM]] = None) -> None:
+        if isinstance(model, LLM):
+            self.llm = model
+        elif model is not None:
+            self.llm = LLM(model)
+        else:
+            raise ValueError("Either a model string or an LLM object must be provided.")
+
+    def extract(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False, content: Optional[str] = None) -> Any:
+        self.extra_content = content
 
-    def extract(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> str:
         if not issubclass(response_model, BaseModel):
             raise ValueError("response_model must be a subclass of Pydantic's BaseModel.")
@@ -71,7 +80,7 @@ def extract(self, source: Union[str, IO, list], response_model: type[BaseModel],
         else:
             raise ValueError("Source must be a file path, a stream, or a list of dictionaries")
 
-    async def extract_async(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> str:
+    async def extract_async(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> Any:
         return await asyncio.to_thread(self.extract, source, response_model, vision)
 
     def extract_from_list(self, data: List[Dict[Any, Any]], response_model: type[BaseModel], vision: bool) -> str:
@@ -162,9 +171,13 @@ def classify(self, input: Union[str, IO], classifications: List[Classification])
     async def classify_async(self, input: Union[str, IO], classifications: List[Classification]):
         return await asyncio.to_thread(self.classify, input, classifications)
 
-    def _extract(
-        self, content, file_or_stream, response_model, vision=False, is_stream=False
-    ):
+    def _extract(self,
+                 content,
+                 file_or_stream,
+                 response_model,
+                 vision=False,
+                 is_stream=False
+                 ):
         # call all the llm interceptors before calling the llm
         for interceptor in self.llm_interceptors:
             interceptor.intercept(self.llm)
@@ -177,8 +190,18 @@ def _extract(
             },
         ]
 
+        if self.extra_content is not None:
+            if isinstance(self.extra_content, dict):
+                self.extra_content = yaml.dump(self.extra_content)
+            messages.append({"role": "user", "content": "##Extra Content\n\n" + self.extra_content})
+
+        if content is not None:
+            if isinstance(content, dict):
+                content = yaml.dump(content)
+            messages.append({"role": "user", "content": "##Content\n\n" + content})
+
         if vision:
-            base64_encoded_image = self._encode_image_to_base64(
+            base64_encoded_image = encode_image(
                 file_or_stream, is_stream
             )
@@ -196,8 +219,6 @@ def _extract(
                     ],
                 }
             ]
-        else:
-            messages.append({"role": "user", "content": "##Content\n\n" + content})
 
         response = self.llm.request(messages, response_model)
         return response
diff --git a/extract_thinker/llm.py b/extract_thinker/llm.py
index f6470df..0278696 100644
--- a/extract_thinker/llm.py
+++ b/extract_thinker/llm.py
@@ -1,21 +1,37 @@
+from typing import List, Dict, Any
 import instructor
 import litellm
 from extract_thinker.utils import num_tokens_from_string
+from litellm import Router
 
 
 class LLM:
-    def __init__(self, model):
-        self.client = instructor.from_litellm(litellm.completion)
+    def __init__(self, model: str):
+        self.client = instructor.from_litellm(litellm.completion, mode=instructor.Mode.MD_JSON)
         self.model = model
+        self.router = None
 
-    def request(self, messages, response_model):
+    def load_router(self, router: Router) -> None:
+        self.router = router
+
+    def request(self, messages: List[Dict[str, str]], response_model: Any) -> Any:
         contents = map(lambda message: message['content'], messages)
         all_contents = ' '.join(contents)
-        return self.client.chat.completions.create(
-            model=self.model,
-            max_tokens=num_tokens_from_string(all_contents),
-            messages=messages,
-            response_model=response_model,
-        )
+        max_tokens = num_tokens_from_string(all_contents)
+
+        if self.router:
+            response = self.router.completion(
+                model=self.model,
+                max_tokens=max_tokens,
+                messages=messages,
+                response_model=response_model,
+            )
+        else:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                max_tokens=max_tokens,
+                messages=messages,
+                response_model=response_model
+            )
+
+        return response
diff --git a/poetry.lock b/poetry.lock
index 1aeaf63..9a7cc0b 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1469,6 +1469,27 @@ files = [
 plugins = ["importlib-metadata"]
 windows-terminal = ["colorama (>=0.4.6)"]
 
+[[package]]
+name = "pypdf2"
+version = "3.0.1"
+description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440"},
+    {file = "pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928"},
+]
+
+[package.dependencies]
+typing_extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
+
+[package.extras]
+crypto = ["PyCryptodome"]
+dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"]
+docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"]
+full = ["Pillow", "PyCryptodome"]
+image = ["Pillow"]
+
 [[package]]
 name = "pypdfium2"
 version = "4.29.0"
@@ -2176,4 +2197,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "60b4b4fa08db9ddfeae90cc5b04b979871883db42c2d09b274923934f8a3eb9a"
+content-hash = "047a0bc6e650003696373a331d62a070eb264f898d3613222f61649ea9853a12"
diff --git a/pyproject.toml b/pyproject.toml
index 6a1bc83..3ac71e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "extract_thinker"
-version = "0.0.2"
+version = "0.0.3"
 description = "Library to extract data from files and documents agnostically using LLMs"
 authors = ["Júlio Almeida "]
 readme = "README.md"
@@ -20,6 +20,7 @@ cachetools = "^5.3.3"
 pyyaml = "^6.0.1"
 tiktoken = "^0.6.0"
 openpyxl = "^3.1.2"
+pypdf2 = "^3.0.1"
 
 [tool.poetry.dev-dependencies]
 flake8 = "^3.9.2"
diff --git a/tests/classify.py b/tests/classify.py
index 53c8fe9..a5ae394 100644
--- a/tests/classify.py
+++ b/tests/classify.py
@@ -23,7 +23,7 @@ def test_classify_feature():
     extractor = Extractor()
     extractor.load_document_loader(DocumentLoaderTesseract(tesseract_path))
-    extractor.load_llm("claude-3-haiku-20240307")
+    extractor.load_llm("gpt-3.5-turbo")
 
     # Act
     result = extractor.classify_from_path(test_file_path, Classifications)
@@ -65,4 +65,4 @@ def test_classify():
     # Assert
     assert result is not None
     assert isinstance(result, ClassificationResponse)
-    assert result.name == "Invoice"
\ No newline at end of file
+    assert result.name == "Invoice"
diff --git a/tests/document_loader_pypdf.py b/tests/document_loader_pypdf.py
new file mode 100644
index 0000000..39562c5
--- /dev/null
+++ b/tests/document_loader_pypdf.py
@@ -0,0 +1,23 @@
+import os
+from dotenv import load_dotenv
+from extract_thinker.document_loader.document_loader_pypdf import DocumentLoaderPyPdf
+
+cwd = os.getcwd()
+load_dotenv()
+
+# Arrange
+loader = DocumentLoaderPyPdf()
+test_file_path = os.path.join(cwd, "files", "CV_Candidate.pdf")
+
+
+def test_load_content_from_file():
+    # Act
+    content = loader.load_content_from_file(test_file_path)
+
+    # Join the extracted lines into a single string
+    content_text = " ".join(content["text"])
+
+    # Assert
+    assert content is not None
+    assert "University of New York" in content_text
+    assert "XYZ Innovations" in content_text
diff --git a/tests/files/CV_Candidate.pdf b/tests/files/CV_Candidate.pdf
new file mode 100644
index 0000000..2578c85
Binary files /dev/null and b/tests/files/CV_Candidate.pdf differ
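
---

For orientation (not part of the patch), below is a minimal sketch of how the new pieces introduced by this diff compose: the PyPDF loader, passing an LLM object to load_llm, routing with fallbacks via LLM.load_router, and the new content argument to extract. The contract fields, the PDF path, and the exact model choices are illustrative assumptions, not values taken from this diff.

import os

from dotenv import load_dotenv
from litellm import Router
from pydantic import Field

from extract_thinker import Contract, DocumentLoaderPyPdf, Extractor
from extract_thinker.llm import LLM

load_dotenv()


class InvoiceContract(Contract):
    # Hypothetical contract, defined only for this sketch.
    invoice_number: str = Field(description="Invoice number")
    total_amount: float = Field(description="Total amount due")


# A router with one primary model and one fallback, mirroring config_router() above.
# Fallback names must match model_name entries in model_list.
router = Router(
    model_list=[
        {
            "model_name": "groq/llama3-8b-8192",
            "litellm_params": {
                "model": "groq/llama3-8b-8192",
                "api_key": os.getenv("GROQ_API_KEY"),
            },
        },
        {
            "model_name": "claude-3-haiku-20240307",
            "litellm_params": {
                "model": "claude-3-haiku-20240307",
                "api_key": os.getenv("CLAUDE_API_KEY"),
            },
        },
    ],
    default_fallbacks=["claude-3-haiku-20240307"],
)

llm = LLM("groq/llama3-8b-8192")
llm.load_router(router)  # requests now go through the router instead of the instructor client

extractor = Extractor()
extractor.load_document_loader(DocumentLoaderPyPdf())
extractor.load_llm(llm)  # load_llm now accepts an LLM object as well as a model string

# The `content` argument is forwarded to the model as an "##Extra Content" user message.
result = extractor.extract(
    "path/to/invoice.pdf",  # placeholder path
    InvoiceContract,
    content="Amounts are in EUR; extract them as plain numbers.",
)
print(result.json())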