diff --git a/backend/Dockerfile b/backend/Dockerfile index 5d20cc09bb79..b44c972a88fb 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -49,8 +49,6 @@ COPY worker/pyproject.toml worker/README.md ./worker/ COPY worker/quivr_worker/__init__.py ./worker/quivr_worker/__init__.py COPY worker/diff-assistant/pyproject.toml worker/diff-assistant/README.md ./worker/diff-assistant/ COPY worker/diff-assistant/quivr_diff_assistant/__init__.py ./worker/diff-assistant/quivr_diff_assistant/__init__.py -COPY core/MegaParse/pyproject.toml core/MegaParse/README.md ./core/MegaParse/ -COPY core/MegaParse/megaparse/__init__.py ./core/MegaParse/megaparse/__init__.py RUN PYTHONDONTWRITEBYTECODE=1 pip install --no-cache-dir -r requirements.lock diff --git a/backend/Dockerfile.dev b/backend/Dockerfile.dev index 9fb458b16546..01c4a51c6bb4 100644 --- a/backend/Dockerfile.dev +++ b/backend/Dockerfile.dev @@ -36,8 +36,6 @@ COPY worker/pyproject.toml worker/README.md ./worker/ COPY worker/quivr_worker/__init__.py ./worker/quivr_worker/__init__.py COPY worker/diff-assistant/pyproject.toml worker/diff-assistant/README.md ./worker/diff-assistant/ COPY worker/diff-assistant/quivr_diff_assistant/__init__.py ./worker/diff-assistant/quivr_diff_assistant/__init__.py -COPY core/MegaParse/pyproject.toml core/MegaParse/README.md ./core/MegaParse/ -COPY core/MegaParse/megaparse/__init__.py ./core/MegaParse/megaparse/__init__.py RUN PYTHONDONTWRITEBYTECODE=1 pip install --no-cache-dir -r requirements.lock diff --git a/backend/core/MegaParse/.env.example b/backend/core/MegaParse/.env.example deleted file mode 100644 index b4776ec5bcc9..000000000000 --- a/backend/core/MegaParse/.env.example +++ /dev/null @@ -1 +0,0 @@ -OPENAI_API_KEY=CHANGE_ME \ No newline at end of file diff --git a/backend/core/MegaParse/.gitattributes b/backend/core/MegaParse/.gitattributes deleted file mode 100644 index 9030923a7819..000000000000 --- a/backend/core/MegaParse/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.ipynb linguist-vendored \ No newline at end of file diff --git a/backend/core/MegaParse/.github/workflows/release-please.yml b/backend/core/MegaParse/.github/workflows/release-please.yml deleted file mode 100644 index 01ac897e2aef..000000000000 --- a/backend/core/MegaParse/.github/workflows/release-please.yml +++ /dev/null @@ -1,50 +0,0 @@ -on: - push: - branches: - - main - -permissions: - contents: write - pull-requests: write - -name: release-please - -jobs: - release-please: - runs-on: ubuntu-latest - outputs: - release_created: ${{ steps.release.outputs.release_created }} - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 # Fetch all history for tags and releases - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Run release-please - id: release - uses: google-github-actions/release-please-action@v4 - with: - token: ${{ secrets.RELEASE_PLEASE_TOKEN }} - - - deploy: - if: needs.release-please.outputs.release_created == 'true' - needs: release-please - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Install Rye - uses: eifinger/setup-rye@v2 - with: - enable-cache: true - - name: Rye Sync - run: rye sync --no-lock - - name: Rye Build - run: rye build - - name: Rye Publish - run: rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes diff --git a/backend/core/MegaParse/.gitignore b/backend/core/MegaParse/.gitignore deleted file mode 100644 index ae93dd793415..000000000000 --- a/backend/core/MegaParse/.gitignore +++ /dev/null @@ -1,18 +0,0 @@ -CHANGE*.md -/output -/input -.env -__pycache__/ -dist/** -megaparse.egg-info/ -*.pyc -build/* -ENV -venv -*/evaluations/* -*/cdp/* -*.pkl - -!megaparse/tests/output_tests/MegaFake_report.md -*.DS_Store -.tool-versions diff --git a/backend/core/MegaParse/.pre-commit-config.yaml b/backend/core/MegaParse/.pre-commit-config.yaml deleted file mode 100644 index afbea82654b0..000000000000 --- a/backend/core/MegaParse/.pre-commit-config.yaml +++ /dev/null @@ -1,41 +0,0 @@ -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 - hooks: - - id: check-added-large-files - args: ["--maxkb=5000"] - - id: check-toml - - id: check-yaml - - id: end-of-file-fixer - - id: trailing-whitespace - - id: check-merge-conflict - - id: detect-private-key - - id: check-case-conflict - - repo: https://github.com/pre-commit/pre-commit - rev: v3.6.2 - hooks: - - id: validate_manifest - - repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version. - rev: v0.5.1 - hooks: - # Run the linter. - - id: ruff - args: [--fix] - additional_dependencies: [] - # Run the formatter. - - id: ruff-format - additional_dependencies: [] - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.1 - hooks: - - id: mypy - name: mypy - additional_dependencies: ["types-aiofiles"] - - repo: https://github.com/python-poetry/poetry - rev: "1.8.0" - hooks: - - id: poetry-check - args: ["-C", "./backend/core"] - - id: poetry-lock - args: ["-C", "./backend/core"] diff --git a/backend/core/MegaParse/.release-please-manifest.json b/backend/core/MegaParse/.release-please-manifest.json deleted file mode 100644 index a065a580af25..000000000000 --- a/backend/core/MegaParse/.release-please-manifest.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - ".": "0.0.31" -} diff --git a/backend/core/MegaParse/Dockerfile b/backend/core/MegaParse/Dockerfile deleted file mode 100644 index 77a5c0668ebd..000000000000 --- a/backend/core/MegaParse/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -# Using a slim version for a smaller base image -FROM python:3.11.6-slim-bullseye - -# Install GEOS library, Rust, and other dependencies, then clean up -RUN apt-get clean && apt-get update && apt-get install -y \ - poppler-utils \ - tesseract-ocr - -WORKDIR /code - -# Upgrade pip and install dependencies -RUN pip install megaparse - -# You can run the application with the following command: -# docker run -it megaparse_image python your_script.py - diff --git a/backend/core/MegaParse/LICENSE b/backend/core/MegaParse/LICENSE deleted file mode 100644 index 261eeb9e9f8b..000000000000 --- a/backend/core/MegaParse/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/backend/core/MegaParse/Makefile b/backend/core/MegaParse/Makefile deleted file mode 100644 index e0987605bebb..000000000000 --- a/backend/core/MegaParse/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# Makefile - -# Image name -IMAGE_NAME = megaparse_image - -# Dockerfile location -DOCKERFILE = Dockerfile - -# Build Docker image -build: - docker build -t $(IMAGE_NAME) -f $(DOCKERFILE) . - -.PHONY: build \ No newline at end of file diff --git a/backend/core/MegaParse/README.md b/backend/core/MegaParse/README.md deleted file mode 100644 index 420f9a56bd63..000000000000 --- a/backend/core/MegaParse/README.md +++ /dev/null @@ -1,93 +0,0 @@ -# MegaParse - Your Mega Parser for every type of documents - -
- Quivr-logo -
- -MegaParse is a powerful and versatile parser that can handle various types of documents with ease. Whether you're dealing with text, PDFs, Powerpoint presentations, Word documents MegaParse has got you covered. Focus on having no information loss during parsing. - -## Key Features 🎯 - -- **Versatile Parser**: MegaParse is a powerful and versatile parser that can handle various types of documents with ease. -- **No Information Loss**: Focus on having no information loss during parsing. -- **Fast and Efficient**: Designed with speed and efficiency at its core. -- **Wide File Compatibility**: Supports Text, PDF, Powerpoint presentations, Excel, CSV, Word documents. -- **Open Source**: Freedom is beautiful, and so is MegaParse. Open source and free to use. - -## Support - -- Files: ✅ PDF ✅ Powerpoint ✅ Word -- Content: ✅ Tables ✅ TOC ✅ Headers ✅ Footers ✅ Images - -### Example - -https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a7509bc8d4f3 - -## Installation - -```bash -pip install megaparse -``` - -## Usage - -1. Add your OpenAI API key to the .env file - -2. Install poppler on your computer (images and PDFs) - -3. Install tesseract on your computer (images and PDFs) - -```python -from megaparse import MegaParse - -megaparse = MegaParse(file_path="./test.pdf") -document = megaparse.load() -print(document.page_content) -megaparse.save_md(document.page_content, "./test.md") -``` - -### (Optional) Use LlamaParse for Improved Results - -1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key. - -2. Call Megaparse with the `llama_parse_api_key` parameter - -```python -from megaparse import MegaParse - -megaparse = MegaParse(file_path="./test.pdf", llama_parse_api_key="llx-your_api_key") -document = megaparse.load() -print(document.page_content) -``` - -## BenchMark - - - -| Parser | Diff | -| ---------------------------------------- | ---- | -| LMM megaparse | 36 | -| Megaparse with LLamaParse and GPTCleaner | 74 | -| Megaparse with LLamaParse | 97 | -| Unstructured Augmented Parse | 99 | -| LLama Parse | 102 | -| **Megaparse** | 105 | - - - -_Lower is better_ - -## Next Steps - -- [ ] Improve Table Parsing -- [ ] Improve Image Parsing and description -- [ ] Add TOC for Docx -- [ ] Add Hyperlinks for Docx -- [ ] Order Headers for Docx to Markdown -- [X] Add Rye package manager - - - -## Star History - -[![Star History Chart](https://api.star-history.com/svg?repos=QuivrHQ/MegaParse&type=Date)](https://star-history.com/#QuivrHQ/MegaParse&Date) diff --git a/backend/core/MegaParse/images/tables.png b/backend/core/MegaParse/images/tables.png deleted file mode 100644 index d4537b2b1e64..000000000000 Binary files a/backend/core/MegaParse/images/tables.png and /dev/null differ diff --git a/backend/core/MegaParse/logo.png b/backend/core/MegaParse/logo.png deleted file mode 100644 index 55d67a36140e..000000000000 Binary files a/backend/core/MegaParse/logo.png and /dev/null differ diff --git a/backend/core/MegaParse/notebooks/docx2md.ipynb b/backend/core/MegaParse/notebooks/docx2md.ipynb deleted file mode 100644 index cd6010f4ba80..000000000000 --- a/backend/core/MegaParse/notebooks/docx2md.ipynb +++ /dev/null @@ -1,57 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# -*- coding: utf-8 -*-\n", - "from pathlib import Path\n", - "from src.Converter import DOCXConverter" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "converter = DOCXConverter()\n", - "md_content = converter.convert('./input/CDP_QUAL_CHART_01_CHARTE PRODUITS_2023.12.13.docx')\n", - "converter.save_md(md_content, Path('./output/CDP_QUAL_CHART_01_CHARTE PRODUITS_2023.12.13.md'))" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "metadata": {}, - "outputs": [], - "source": [ - "# import mammoth to compare results\n", - "# md = mammoth.convert_to_markdown('./input/CDP_QUAL_CHART_01_CHARTE PRODUITS_2023.12.13.docx')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "QuivrParse-DS8JDGq8", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/backend/core/MegaParse/notebooks/evaluate.ipynb b/backend/core/MegaParse/notebooks/evaluate.ipynb deleted file mode 100644 index 537360f3f1e1..000000000000 --- a/backend/core/MegaParse/notebooks/evaluate.ipynb +++ /dev/null @@ -1,551 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Mega Parse" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Started parsing the file under job_id e5e0367d-2f83-4e4d-84e5-4d5df7119516\n", - "Started parsing the file under job_id 0b5d66aa-bbab-454b-b256-82495d20f91f\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n", - "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - } - ], - "source": [ - "from pathlib import Path\n", - "import sys\n", - "sys.path.append('..')\n", - "from megaparse.Converter import MegaParse\n", - "import os \n", - "\n", - "api_key: str | None = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", - "\n", - "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\", llama_parse_api_key=api_key)\n", - "md_content = converter.convert()\n", - "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md\"))\n", - "\n", - "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\", llama_parse_api_key=api_key)\n", - "md_content = converter.convert(gpt4o_cleaner = True)\n", - "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md\"))\n", - "\n", - "\n", - "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\")\n", - "md_content = converter.convert()\n", - "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md\"))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### LLama Parse" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Started parsing the file under job_id f78ee794-ffde-4e0a-938d-987f1b22cfcb\n" - ] - } - ], - "source": [ - "from typing import List\n", - "from llama_index.core.schema import Document\n", - "import nest_asyncio\n", - "\n", - "nest_asyncio.apply()\n", - "#GET LLAMA_CLOUD_API_KEY\n", - "import os\n", - "from llama_parse import LlamaParse\n", - "from llama_parse.utils import ResultType, Language\n", - "\n", - "api_key: str | None = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", - "\n", - "parsing_instructions = \"Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables.\"\n", - "\n", - "parser = LlamaParse(\n", - " api_key=str(api_key), \n", - " result_type=ResultType.MD,\n", - " gpt4o_mode=True,\n", - " verbose=True,\n", - " language=Language.FRENCH,\n", - " parsing_instruction=parsing_instructions, # Optionally you can define a parsing instruction\n", - ")\n", - "# sync\n", - "documents: List[Document] = parser.load_data(\"../megaparse/tests/input_tests/MegaFake_report.pdf\")\n", - "\n", - "with open(\"../megaparse/tests/output_tests/MegaFake_report_llama.md\", \"w\") as f:\n", - " f.write(documents[0].get_content())\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Unstructured" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import UnstructuredPDFLoader\n", - "loader = UnstructuredPDFLoader(\"../megaparse/tests/input_tests/MegaFake_report.pdf\", strategy=\"hi_res\", infer_table_structure=True,\n", - ")\n", - "data = loader.load()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"../megaparse/tests/output_tests/MegaFake_report_unstructured.md\", \"w\") as f:\n", - " f.write(data[0].page_content)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Evaluation with Diff Lib" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "import difflib\n", - "def read_file(file_path):\n", - " with open(file_path, 'r', encoding='utf-8') as file:\n", - " return file.readlines()\n", - "\n", - "def compare_files(source_path, target_path, with_formatting=False):\n", - " source_lines = read_file(source_path)\n", - " target_lines = read_file(target_path)\n", - " if not with_formatting:\n", - " source_lines = [line.replace(\"*\",\"\") for line in source_lines]\n", - " target_lines = [line.replace(\"*\",\"\") for line in target_lines]\n", - "\n", - " diff = difflib.unified_diff(\n", - " source_lines,\n", - " target_lines,\n", - " fromfile='target.md',\n", - " tofile='generated.md',\n", - " lineterm=''\n", - " )\n", - "\n", - " modifications = 0\n", - " for line in diff:\n", - " #print(line)\n", - " if line.startswith('+') and not line.startswith('+++'):\n", - " modifications += 1\n", - " elif line.startswith('-') and not line.startswith('---'):\n", - " modifications += 1\n", - "\n", - " return modifications\n", - " \n", - "diff_megaparse_unstructured = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", - "diff_megaparse_llama_gptcleaner = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", - "diff_megaparse_llama = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", - "diff_llamaparse = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", - "diff_unstructured = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", - "diff_megaparse_llm = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llm_megaparse.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", - "diff_megaparse_unstructured_augmented = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_augmented.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "diff_results = {\n", - " \"**Megaparse**\": diff_megaparse_unstructured,\n", - " \"Megaparse with LLamaParse\": diff_megaparse_llama,\n", - " \"Megaparse with LLamaParse and GPTCleaner\": diff_megaparse_llama_gptcleaner,\n", - " \"LMM megaparse\": diff_megaparse_llm,\n", - " \"LLama Parse\": diff_llamaparse,\n", - " \"Unstructured Augmented Parse\": diff_megaparse_unstructured_augmented,\n", - "}\n", - "\n", - "# Sort the results\n", - "sorted_diff_results = sorted(diff_results.items(), key=lambda x: x[1])\n", - "\n", - "# Generate a table with the results\n", - "benchmark_results = \"| Parser | Diff |\\n|---|---|\\n\"\n", - "for parser, diff in sorted_diff_results:\n", - " benchmark_results += f\"| {parser} | {diff} |\\n\"\n", - "\n", - "# Update README.md file\n", - "with open(\"../README.md\", \"r\") as readme_file:\n", - " readme_content = readme_file.read()\n", - "\n", - "start_marker = \"\"\n", - "end_marker = \"\"\n", - "start_index = readme_content.find(start_marker) + len(start_marker)\n", - "end_index = readme_content.find(end_marker)\n", - "\n", - "updated_readme_content = readme_content[:start_index] + \"\\n\" + benchmark_results + readme_content[end_index:]\n", - "\n", - "with open(\"../README.md\", \"w\") as readme_file:\n", - " readme_file.write(updated_readme_content)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--- target.md\n", - "+++ generated.md\n", - "@@ -1,18 +1,19 @@\n", - "-| My Mega fake | report | #1756394 31/05/2024 |\n", - "\n", - "-|--------------|--------|---------------------|\n", - "\n", - "-| | | |\n", - "\n", - "+| My Mega fake report | #1756394 | 31/05/2024 |\n", - "\n", - "+|---------------------|----------|------------|\n", - "\n", - " \n", - "\n", - " # Why Mega Parse might be the best ?\n", - "\n", - " \n", - "\n", - "-# Introduction\n", - "\n", - "+## Introduction\n", - "\n", - " \n", - "\n", - " Mega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing.\n", - "\n", - " \n", - "\n", - "-# Features of Mega Parse\n", - "\n", - "+## Features of Mega Parse\n", - "\n", - " \n", - "\n", - " Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises.\n", - "\n", - " \n", - "\n", - " Multiple Format Support: Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered.\n", - "\n", - "+\n", - "\n", - "+High-Speed Processing: One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion.\n", - "\n", - " \n", - "\n", - " Markdown Output: Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount.\n", - "\n", - " \n", - "\n", - "@@ -24,7 +25,7 @@\n", - " \n", - "\n", - " Error Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency.\n", - "\n", - " \n", - "\n", - "-# Benefits of Mega Parse\n", - "\n", - "+## Benefits of Mega Parse\n", - "\n", - " \n", - "\n", - " The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents.\n", - "\n", - " \n", - "\n", - "@@ -32,9 +33,7 @@\n", - " \n", - "\n", - " Versatility: Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task.\n", - "\n", - " \n", - "\n", - "-Enhanced Knowledge Management: Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but\n", - "\n", - "-\n", - "\n", - "-also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.\n", - "\n", - "+Enhanced Knowledge Management: Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.\n", - "\n", - " \n", - "\n", - " Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks.\n", - "\n", - " \n", - "\n", - "@@ -42,57 +41,45 @@\n", - " \n", - "\n", - " Scalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management.\n", - "\n", - " \n", - "\n", - "-# Comparative Performance\n", - "\n", - "+## Comparative Performance\n", - "\n", - " \n", - "\n", - " The following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. This comparison highlights the strengths of Mega Parse in various key areas.\n", - "\n", - " \n", - "\n", - "-| Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D |\n", - "\n", - "-|-------------------------------|----------------------|------------|------------|------------|-------------------|\n", - "\n", - "-| Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX |\n", - "\n", - "-| Conversion Speed (pages/min) | 120 | 90 | 100 | 85 | 95 |\n", - "\n", - "-\n", - "\n", - "-| Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D | Plain Text |\n", - "\n", - "-|--------------------------------------|------------|----------|----------|----------|------------|-------------|\n", - "\n", - "-| Accuracy Rate (%) | 98 | 95 | 93 | 90 | 92 | 90 |\n", - "\n", - "-| Output Format | Markdown | HTML | Markdown | HTML | Plain Text | Plain Text |\n", - "\n", - "-| Error Rate (%) | 1 | 3 | 4 | 5 | 3 | 5 |\n", - "\n", - "-| Ease of Use | High | Medium | High | Medium | Medium | Medium |\n", - "\n", - "-| Integration Capability | Excellent | Good | Good | Fair | Good | Good |\n", - "\n", - "-| Batch Processing | Yes | No | Yes | No | Yes | No |\n", - "\n", - "-| Custom Parsing Rules | Yes | Limited | Yes | No | Yes | No |\n", - "\n", - "-| Multilingual Support | Yes | Yes | Yes | Yes | Yes | Yes |\n", - "\n", - "-| OCR (Optical Character Recognition) | Yes | Yes | Yes | Yes | Yes | No |\n", - "\n", - "-| Price (per user/month) | $30 | $25 | $20 | $15 | $18 | $15 |\n", - "\n", - "-| Customer Support Rating (out of 5) | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 | 3.9 |\n", - "\n", - "-| Free Trial Available | Yes | Yes | No | Yes | No | Yes |\n", - "\n", - "-| Cloud Integration | Yes | No | Yes | No | No | Yes |\n", - "\n", - "-| Security Features | Advanced | Basic | Advanced | Basic | Intermediate| Basic |\n", - "\n", - "-\n", - "\n", - "-\n", - "\n", - "-| Feature | Tool 1 | Tool 2 | Tool 3 | Tool 4 | Tool 5 |\n", - "\n", - "-|--------------------------------|---------------------|------------------|----------------|---------------|------------------|\n", - "\n", - "-| User Community Size | Large | Medium | Medium | Small | Medium |\n", - "\n", - "-| Monthly Updates | Yes | Yes | No | No | No |\n", - "\n", - "-| Mobile App Availability | Yes | No | Yes | No | No |\n", - "\n", - "-| Platform Compatibility | Windows, Mac, Linux | Windows, Linux | Windows | Mac, Linux | Windows, Linux |\n", - "\n", - "-| Data Privacy Compliance | High | Medium | High | Low | Medium |\n", - "\n", - "-| AI-Driven Enhancements | Yes | No | Yes | No | Yes |\n", - "\n", - "-| File Size Limit (per document) | 1GB | 500MB | 750MB | 200MB | 500MB |\n", - "\n", - "-| User Training Resources | Extensive | Moderate | Extensive | Limited | Moderate |\n", - "\n", - "-| API Access | Yes | No | Yes | No | Yes |\n", - "\n", - "-| Customizable Output Templates | Yes | Limited | Yes | No | Limited |\n", - "\n", - "-| Collaboration Features | Yes | No | Yes | No | Limited |\n", - "\n", - "-| Document Version Control | Yes | No | Yes | No | Yes |\n", - "\n", - "-| Import/Export Options | Extensive | Moderate | Extensive | Limited | Moderate |\n", - "\n", - "-\n", - "\n", - "-\n", - "\n", - "-| Feedback Mechanism | Yes | No | Yes | No | Yes |\n", - "\n", - "-|--------------------|-----|----|-----|----|-----|\n", - "\n", - "-\n", - "\n", - "+| Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D |\n", - "\n", - "+|---------------------|-------------|----------------|--------------|--------------|----------------|\n", - "\n", - "+| Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX|\n", - "\n", - "+| Conversion Speed (pages/min) | 120 | 90 | 100 | 85 | 95 |\n", - "\n", - "+| Accuracy Rate (%) | 98 | 95 | 93 | 90 | 92 |\n", - "\n", - "+| Output Format | Markdown | HTML | Markdown | Plain Text | HTML |\n", - "\n", - "+| Error Rate (%) | 1 | 3 | 4 | 5 | 3 |\n", - "\n", - "+| Ease of Use | High | Medium | High | Medium | Medium |\n", - "\n", - "+| Integration Capability| Excellent| Good | Good | Fair | Good |\n", - "\n", - "+| Batch Processing | Yes | No | Yes | No | Yes |\n", - "\n", - "+| Custom Parsing Rules | Yes | Limited | Yes | No | Limited |\n", - "\n", - "+| Multilingual Support | Yes | Yes | No | Yes | Yes |\n", - "\n", - "+| OCR (Optical Character Recognition) | Yes | No | Yes | No | Yes |\n", - "\n", - "+| Price (per user/month)| $30 | $25 | $20 | $15 | $18 |\n", - "\n", - "+| Customer Support Rating (out of 5) | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 |\n", - "\n", - "+| Free Trial Available | Yes | Yes | No | Yes | No |\n", - "\n", - "+| Cloud Integration | Yes | No | Yes | Yes | No |\n", - "\n", - "+| Security Features | Advanced | Basic | Advanced | Basic | Intermediate |\n", - "\n", - "+| User Community Size | Large | Medium | Medium | Small | Medium |\n", - "\n", - "+| Monthly Updates | Yes | Yes | No | Yes | No |\n", - "\n", - "+| Mobile App Availability| Yes | No | Yes | No | Yes |\n", - "\n", - "+| Platform Compatibility| Windows, Mac, Linux | Windows, Mac | Windows | Mac, Linux | Windows, Linux |\n", - "\n", - "+| Data Privacy Compliance| High | Medium | High | Low | Medium |\n", - "\n", - "+| AI-Driven Enhancements| Yes | No | Yes | No | Yes |\n", - "\n", - "+| File Size Limit (per document) | 1GB | 500MB | 750MB | 200MB | 500MB |\n", - "\n", - "+| User Training Resources| Extensive | Moderate | Extensive | Limited | Moderate |\n", - "\n", - "+| API Access | Yes | No | Yes | No | Yes |\n", - "\n", - "+| Customizable Output Templates | Yes | Limited | Yes | No | Yes |\n", - "\n", - "+| Collaboration Features| Yes | No | Yes | No | Limited |\n", - "\n", - "+| Document Version Control| Yes | No | Yes | No | Yes |\n", - "\n", - "+| Import/Export Options | Extensive | Moderate | Extensive | Limited | Moderate |\n", - "\n", - "+| Feedback Mechanism | Yes | No | Yes | No | Yes |\n", - "\n", - " \n", - "\n", - " Note: All data presented in this table is fictional and for illustrative purposes only.\n", - "\n", - " \n", - "\n", - "-# Conclusion\n", - "\n", - "+## Conclusion\n", - "\n", - " \n", - "\n", - "-Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence.\n", - "\n", - "-\n", - "\n", - "+Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence.\n" - ] - } - ], - "source": [ - "source_lines = read_file(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_augmented.md\")\n", - "target_lines = read_file(\"../megaparse/tests/output_tests/MegaFake_report.md\")\n", - "\n", - "source_lines = [line.replace(\"*\",\"\") for line in source_lines]\n", - "target_lines = [line.replace(\"*\",\"\") for line in target_lines]\n", - "\n", - "diff = difflib.unified_diff(\n", - "source_lines,\n", - "target_lines,\n", - "fromfile='target.md',\n", - "tofile='generated.md',\n", - "lineterm=''\n", - ")\n", - "modifications = 0\n", - "for line in diff:\n", - " print(line)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "QuivrParse-DS8JDGq8", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/backend/core/MegaParse/notebooks/pdf2md_llamaParse.ipynb b/backend/core/MegaParse/notebooks/pdf2md_llamaParse.ipynb deleted file mode 100644 index e51378c8d22d..000000000000 --- a/backend/core/MegaParse/notebooks/pdf2md_llamaParse.ipynb +++ /dev/null @@ -1,148 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "llx-2n4Awnlb1jwmF0Nn5iHtXNIntWYJFKIOP2rUJpJYjfi4ZECV\n", - "Started parsing the file under job_id 4fd224a0-f850-4ffb-8f4f-46831510ec1a\n", - "[Document(id_='86203ce1-cb60-4435-a909-6f8999d347ed', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023\\n\\n| Date | Mises à jour |\\n|------------|------------------------------------------------------------------------------|\\n| 19/12/2014 | Création |\\n| 12/12/2019 | Insertion des additifs interdits et à éviter |\\n| 13/05/2022 | Revue des exigences recettes et annexes |\\n| 30/03/2023 | Revue des annexes I et II. Fréquence de mise à jour CDC |\\n| 13/12/2023 | Ajout d'une exigence de certification sur le cacao |\\n| | Revue des exigences de certification de l’huile de palme |\\n\\n## Table des matières\\n\\n1. [Exigence recette](#exigence-recette) .................................................. 2 \\n2. [Produits soumis à certification ou allégations](#produits-soumis-à-certification-ou-allégations) ........ 3 \\n 2.1. [Produits « sans gluten »](#produits-sans-gluten) ................................................. 3 \\n 2.2. [Produits issus de l’agriculture biologique](#produits-issus-de-lagriculture-biologique) ............ 3 \\n3. [Exigences générales relatives au fournisseur](#exigences-générales-relatives-au-fournisseur) .......... 4 \\n4. [Exigences relatives aux sites de production](#exigences-relatives-aux-sites-de-production) ............ 4 \\n5. [Traçabilité](#traçabilité) ............................................................ 4 \\n6. [Suivi analytique](#suivi-analytique) .................................................. 5 \\n 6.1. [Suivi microbiologique](#suivi-microbiologique) .................................................. 5 \\n 6.2. [Suivi nutritionnel](#suivi-nutritionnel) ....................................................... 5 \\n 6.3. [Suivi organoleptique](#suivi-organoleptique) ................................................... 5 \\n7. [Non conformités](#non-conformités) .................................................... 5 \\n8. [Gestion de crise Coup de Pates](#gestion-de-crise-coup-de-pates) ........................ 6 \\n\\n**ANNEXE I**: Additifs rouges : additifs pour lesquels les rapports scientifiques rapportent une potentielle cancérogénicité ou une implication dans les pathologies lourdes ........ 7 \\n**ANNEXE II**: Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires ........ 10 \\n**ANNEXE III**: Additifs verts : additifs identifiés à ce jour comme non dangereux pour la santé ........ 11 \\n**ANNEXE IV**: Ingrédients controversés ........ 12 \\n\\n## Liste des abréviations\\n\\n- **AFDIAG** : Association Française Des Intolérants Au Gluten\\n- **AOECS** : Association of European Coeliac Societies\\n- **COFRAC** : Comité français d'accréditation\\n- **DGHM** : Deutschen Gesellschaft für Hygiene und Mikrobiologie\\n- **FCD** : Fédération du Commerce et de la Distribution\\n- **GFSI** : Global Food Safety Initiative\\n- **ILAC** : International Laboratory Accreditation Cooperation\\n- **NPD** : New Product Development\\n\\n---\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023\\n\\n## 1. Exigence recette\\n\\nPour le développement de nos produits, nous souhaitons favoriser une offre saine avec des recettes simples (sans colorant, sans arôme, sans conservateur), avec des ingrédients de qualité, en favorisant des produits locaux et labellisés.\\n\\nLe fournisseur s’engage à respecter la réglementation européenne et nationale ainsi que les codes d’usages professionnels applicables aux produits surgelés vendus à Coup de Pates.\\n\\nPour les produits commercialisés sous une marque appartenant à Coup de Pates, le fournisseur s’engage également à respecter les exigences spécifiques de cette même marque.\\n\\nDans ce cas, nos exigences recettes sont spécifiques à trois niveaux gammes : Entrée de gamme, Cœur de gamme, Haut de gamme.\\n\\nPour les produits développés en réponse à des demandes spécifiques de nos clients, il vous sera également demandé de prendre leurs exigences en considération.\\n\\n| Caractéristiques | Entrée de gamme | Cœur de gamme | Haut de Gamme |\\n|------------------|-----------------|---------------|---------------|\\n| Ingrédients soumis à déclaration OGM | INTERDIT | INTERDIT | INTERDIT |\\n| Traitement par ionisation | INTERDIT | INTERDIT | INTERDIT |\\n| Colorants azoïques (E102, E104, E110, E122, E124, E129) | INTERDIT | INTERDIT | INTERDIT |\\n| Nanoparticules (E170, E171, E172, E174, E152, E341, E551 et E552) | INTERDIT | INTERDIT | INTERDIT |\\n| Glutamates et exhausteurs de goût | INTERDIT | INTERDIT | INTERDIT |\\n| Œufs de poules élevées en cage | INTERDIT | INTERDIT | INTERDIT |\\n| Matières grasses partiellement hydrogénées | INTERDIT | INTERDIT | INTERDIT |\\n| Acides gras trans non naturellement présents | INTERDIT | INTERDIT | INTERDIT |\\n| Édulcorants de synthèse | INTERDIT | INTERDIT | INTERDIT |\\n| Viande Séparée Mécaniquement - VSM | INTERDIT | INTERDIT | INTERDIT |\\n| Cacao non certifié durable | * INTERDIT pour tous les NPD et plan action pour remplacer le cacao non certifié dans l’existant. | * INTERDIT pour tous les NPD et plan action pour remplacer le cacao non certifié dans l’existant. | * INTERDIT pour tous les NPD et plan action pour remplacer le cacao non certifié dans l’existant. |\\n| Gélatine porcine | INTERDIT | INTERDIT | INTERDIT |\\n| Gélatine animale – (autre que porcine) | À ÉVITER | INTERDIT (tolérance dans les pâtisseries) | INTERDIT (tolérance dans les pâtisseries) |\\n| Huile de palme + palmiste non RSPO | * INTERDIT pour tous les NPD et plan action pour retirer dans l’existant - (tolérée dans supports d’additifs) - En aucun cas, l’huile de palme non RSPO ne pourra être substituée par de l’huile de coprah ou coco. | * INTERDIT pour tous les NPD et plan action pour retirer dans l’existant - (tolérée dans supports d’additifs) - En aucun cas, l’huile de palme non RSPO ne pourra être substituée par de l’huile de coprah ou coco. | * INTERDIT pour tous les NPD et plan action pour retirer dans l’existant - (tolérée dans supports d’additifs) - En aucun cas, l’huile de palme non RSPO ne pourra être substituée par de l’huile de coprah ou coco. |\\n| Huile de palme + palmiste RSPO (certification « Segregated » demandée, à minima « Mass Balance » soumis à dérogation) | À ÉVITER | À ÉVITER | INTERDIT |\\n\\nPage 2 sur 15\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| | A ÉVITER | INTERDIT (tolérance dans les pâtisseries *sauf arômes de fumée et vanilline) | INTERDIT (sauf arômes de fumée) |\\n|--------------------------|----------|-----------------------------------------------------------------------------|---------------------------------|\\n| Arômes artificiels | A ÉVITER | INTERDIT (tolérance dans les pâtisseries *sauf arômes de fumée et vanilline) | INTERDIT (sauf arômes de fumée) |\\n| Colorants artificiels | A ÉVITER | Interdit dans les produits salés | INTERDIT |\\n| Ingrédients controversés (cf. Annexe IV) | A ÉVITER | A ÉVITER | INTERDIT |\\n| Additifs rouges (cf. Annexe I) | A ÉVITER | INTERDIT (hors nitrites et polyphosphates) | INTERDIT (hors nitrites dans les produits de salaison) |\\n| Additifs Oranges (cf. Annexe II) | A ÉVITER | A ÉVITER | INTERDIT |\\n| Nitrites (E250 à E252) | A ÉVITER | A ÉVITER | INTERDIT (Hors produits de salaison) |\\n| Polyphosphates (E450 à 452 - E339 à 341) | A ÉVITER | A ÉVITER | INTERDIT |\\n| Viande et volaille origine hors UE | A ÉVITER | A ÉVITER | INTERDIT |\\n\\nL’ensemble de ces critères est applicable à tous les produits vendus par Coup de Pates. Des dérogations peuvent être accordées au cas par cas, sur justificatifs fournis par le fournisseur et après validation par la direction qualité Coup de Pates.\\n\\n## 2. Produits soumis à certification ou allégations\\n\\nLe fournisseur se doit de communiquer tout document permettant de valider la certification ou allégation associée à un produit.\\n\\nEn vue de vérifier la véracité des critères déclarés, le fournisseur s’engage à transmettre sur demande expresse de Coup de Pates, tout document permettant de justifier la certification et/ou de l’allégation associée(s) au(x) produit(s).\\n\\n### 2.1. Produits « sans gluten »\\n\\nLe fournisseur doit confirmer annuellement à Coup de Pates que l’allégation « sans gluten » de son (ses) produit(s) est applicable, conformément au règlement européen n°828/2014. Pour cela, un bulletin d’analyse de quantification du taux de gluten dans le produit fini doit être communiqué au service qualité.\\n\\nSi le fournisseur possède un contrat de licence auprès d’une association de personnes cœliaques (AFDIAG, AOECS…), il en transmettra le numéro de licence à Coup de Pates et les rapports et/ou certificats d’audits selon le référentiel d’audit de l’AOECS.\\n\\n### 2.2. Produits issus de l’agriculture biologique\\n\\nLe fournisseur s’engage à transmettre sur demande expresse de Coup de Pates, les analyses pesticides sur produits finis pour répondre aux exigences de la réglementation européenne.\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\nCodification : CDP_QUA_CHART_01\\n\\nVersion : 5\\n\\nDate d’application : 13/12/2023\\n\\n(Règlement (CE) N°834/2007 relatif à la production biologique et à l’étiquetage des produits biologiques).\\n\\nEn cas de déclassement produit, de non-renouvellement ou de perte de la certification « produit issu de l’agriculture biologique », le fournisseur doit immédiatement en informer le service qualité Coup de Pates afin d’organiser le blocage et le retrait de ces produits.\\n\\n## 3. Exigences générales relatives au fournisseur\\n\\nLe fournisseur se doit de disposer de moyens de contrôle et d'enregistrement permettant le respect de la chaîne du froid dans son stockage et son transport de denrées congelées/surgelées.\\n\\nTout envoi d’échantillon devra être accompagné de la « Fiche d’évolution produit » ou d’une fiche technique fournisseur, reprenant à minima les données techniques demandées dans le document précédent (composition, dimensions, DDM ...). Toute autre information jugée nécessaire par le service qualité Coup de Pates devra être communiquée sur demande. Le cahier des charges Coup de Pates devra être rempli dès que le référencement du produit aura été confirmé.\\n\\nIl revient au fournisseur d’appliquer la plus grande diligence dans le transfert exhaustif de ces données. Le dossier établi à l’issue du processus de référencement sera validé à la fois par le fournisseur et un représentant du service qualité Coup de Pates. Toute modification du dossier technique devra être validée en amont par le service qualité Coup de Pates. Si cela est jugé nécessaire, des échantillons (produit actuel / produit modifié) devront être envoyés au service qualité Coup de Pates. Le cahier des charges devra être revu dans son intégralité tous les 5 ans. Même s’il n’y a pas de modification, le cahier des charges sera de nouveau signé avec la nouvelle date.\\n\\n## 4. Exigences relatives aux sites de production\\n\\nLe fournisseur se doit de communiquer les certificats relatifs à son activité, en cours de validité, par exemple : IFS, BRC, FSSC 22000. Le service qualité Coup de Pates devra être informé de tout renouvellement ou perte de certification.\\n\\nLe fournisseur se doit de communiquer, sur demande de Coup de Pates, l’ensemble des documents permettant de justifier sa maîtrise des risques liés à son activité (étude HACCP par exemple).\\n\\nLa mise en place des mesures contre les actes malveillants en matière de protection de la chaine alimentaire/des produits sont de la responsabilité du fournisseur.\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification :** \\nCDP_QUA_CHART_01\\n\\n**Version :** 5\\n\\n**Date d’application :** 13/12/2023\\n\\nLe fournisseur doit posséder un **numéro d’enregistrement** auprès des services officiels, permettant l’export de ses produits par Coup de Pates.\\n\\n## 5. Traçabilité\\n\\nLe fournisseur se doit d’assurer la traçabilité de ses produits, de la réception des matières premières jusqu’à l’expédition des produits finis. La traçabilité d’une référence doit pouvoir être assurée via la date de durabilité minimale (au format jour/mois/année).\\n\\nSur demande de Coup de Pates, le fournisseur s’engage à transmettre les fiches ingrédients, certificats et éléments de traçabilité liés à la nature des matières premières, des emballages et du produit fini ainsi que les bilans de matière dans les délais stipulés.\\n\\n## 6. Suivi analytique\\n\\n### 6.1. Suivi microbiologique\\n\\nLes analyses microbiologiques réalisées sur les produits finis doivent être en adéquation avec la réglementation européenne n°2073/2005 et les recommandations de la FCD en France, du DGHM en Allemagne et en Suisse, ou équivalent local au sein de l’Europe.\\n\\nUne analyse microbiologique devra être réalisée lors de chaque première fabrication. Cette analyse devra être réalisée par un laboratoire accrédité COFRAC ou équivalent du COFRAC reconnu par l’ILAC dans les pays concernés ou certifié ISO 17025. Les résultats doivent être transmis au service qualité Coup de Pates.\\n\\nL’ensemble des produits Coup de Pates doivent être inclus dans le plan de contrôle microbiologique du fournisseur, selon les critères FCD. Sur demande de Coup de Pates, un nouveau bulletin d’analyse devra être communiqué.\\n\\n### 6.2. Suivi nutritionnel\\n\\nLe fournisseur doit communiquer à Coup de Pates une analyse nutritionnelle réalisée par un laboratoire accrédité COFRAC ou équivalent du COFRAC reconnu par l’ILAC dans les pays concernés. Cette analyse doit être réalisée pour chaque nouveau produit référencé, afin de répondre aux exigences d’étiquetage européennes (avec quantification des acides gras trans et des fibres), et à chaque modification de matières premières et/ou de recette. La communication d’analyses nutritionnelles calculées à l’aide d’un logiciel consolidé est également acceptée.\\n\\nSur demande de Coup de Pates, un nouveau bulletin d’analyse devra être communiqué.\\n\\n### 6.3. Suivi organoleptique\\n\\nL’ensemble des produits Coup de Pates doivent être inclus dans le plan de contrôle organoleptique du fournisseur. Sur demande de Coup de Pates, les résultats de ces analyses devront être communiqués.\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| Codification : | CDP_QUA_CHART_01 |\\n|----------------|------------------|\\n| Version : | 5 |\\n| Date d’application : | 13/12/2023 |\\n\\n## 7. Non conformités\\n\\nEn cas de non-conformité produit, sanitaire ou réglementaire, le fournisseur s’engage à alerter immédiatement Coup de Pates et à communiquer les éléments de traçabilité nécessaires.\\n\\nEn cas de non-conformité détectée par le service qualité Coup de Pates ou un de ses clients, une notification est envoyée au fournisseur. Celui-ci s’engage à communiquer son analyse et son plan d’action dans les délais demandés.\\n\\n## 8. Gestion de crise Coup de Pates\\n\\nEn cas de crise, le fournisseur s’engage à suivre la procédure de gestion de crise/alerte qui lui a été communiquée par Coup de Pates. Un contact spécifique avec numéro d’astreinte doit être communiqué.\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023 \\n\\n## ANNEXE 1 : Additifs rouges : additifs pour lesquels les rapports scientifiques rapportent une potentielle cancérogénicité ou une implication dans les pathologies lourdes\\n\\n| Additif | Code | Additif | Code |\\n|---------|------|---------|------|\\n| Tartrazine | E 102 | Acide propionique | E 280 |\\n| Jaune de quinoléine | E 104 | Propionate de sodium | E 281 |\\n| Sunset Yellow FCF/Jaune orange S | E 110 | Propionate de potassium | E 283 |\\n| Azorubine, carmoisine | E 122 | Acide borique | E 284 |\\n| Amarante | E 123 | Tétraborate de sodium (borax) | E 285 |\\n| Ponceau 4R, rouge cochenille A | E 124 | Acide fumarique | E 297 |\\n| Erythrosine | E 127 | Gamma-tocophérol | E 308 |\\n| Rouge allura AC | E 129 | Delta-tocophérol | E 309 |\\n| Indigotine, carmin d’indigo | E 132 | Gallate de propyle | E 310 |\\n| Bleu brillant FCF | E 133 | Acide érythorbique | E 315 |\\n| Vert S | E 142 | Butylhydro-quinone tertiaire (BHQT) | E 319 |\\n| Caramel ammoniacal | E 150c | Butylhydroxy-anisol (BHA) | E 320 |\\n| Caramel au sulfite d’ammonium | E 150d | Butylhydroxy-toluène (BHT) | E 321 |\\n| Noir brillant PN | E 151 | Tartrates de sodium | E 335 |\\n| Brun HT | E 155 | Tartrate double de sodium et de potassium | E 337 |\\n| Carbonate de calcium | E 170 | Acide phosphorique | E 338 |\\n| Dioxyde de titane | E 171 | Phosphates de sodium | E 339 |\\n| Oxyde et hydroxyde de fer | E 172 | Phosphates de potassium | E 340 |\\n| Aluminium | E 173 | Phosphates de calcium | E 341 |\\n| Argent | E 174 | Phosphates de magnésium | E 343 |\\n| Lithol-rubine BK | E 180 | Malates de sodium | E 350 |\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| **Benozoate de potassium** | **E 212** | **Malates de calcium** | **E 352** |\\n|----------------------------|-----------|------------------------|-----------|\\n| Benzoate de calcium | E 213 | Acide adipique | E 355 |\\n| p- hydroxybenzoate d’éthyle| E 214 | Adipate de sodium | E 356 |\\n| Dérivé sodique de l’ester éthylique de l’acide p-hydroxybenzoïque | E 215 | Adipate de potassium | E 357 |\\n| p-hydroxybenzoate de méthyle | E 218 | Acide succinique | E 363 |\\n| Dérivé sodique de l’ester méthylique de l’acide p-hydroxybenzoïque | E 219 | Citrate de triammonium | E 380 |\\n| Nisine | E 234 | Alginate de potassium | E 402 |\\n| Hexaméthylènetétramine | E 239 | Alginate d’ammonium | E 403 |\\n| Dicarbonate de diméthyle | E 242 | Mannitol | E 421 |\\n| Éthyl Lauroyl Arginate | E 243 | Gomme arabique modifiée à l’acide octénylsuccinique (OSA) | E 423 |\\n| Nitrite de potassium | E 249 | Konjac | E 425 |\\n| Nitrite de sodium | E 250 | Hémicellulose de soja | E 426 |\\n| Nitrate de sodium | E 251 | Stéarate de polyoxyéthylène (40) | E 431 |\\n| Nitrate de potassium | E 252 | Mono laurate de polyoxyéthylène de sorbitane (polysorbate 20) | E 432 |\\n| Monooléate de polyoxyéthylène de sorbitane (polysorbate 80) | E 433 | Dioxyde de silicium | E 551 |\\n| Monopalmitate de polyoxyéthylène de sorbitane (polysorbate 40) | E 434 | Silicate de calcium | E 552 |\\n| Monostéarate de polyoxyéthylène de sorbitane (polysorbate 60) | E 435 | Silicate de magnésium | E 553a |\\n| Tristéarate de polyoxyéthylène de sorbitane (polysorbate 65) | E 436 | Talc | E 553b |\\n| Phosphatides d’ammonium | E 442 | Silicate alumino-sodique | E 554 |\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d’application**: 13/12/2023\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| **Acétate isobutyrate de saccharose** | **E 444** |\\n|---------------------------------------|-----------|\\n| Esters glycériques de résine de bois | E 445 |\\n| Diphosphates | E 450 |\\n| Triphosphates | E 451 |\\n| Polyphosphates | E 452 |\\n| Polyaspartate de potassium | E 456 |\\n| Bêta-cyclodextrine | E 459 |\\n| Éthylcellulose | E 462 |\\n| Hydroxypropylcellulose faiblement substituée (L-HPC) | E 463a |\\n| Méthyléthylcellulose | E 465 |\\n| Carboxyméthylcellulose de sodium réticulée, gomme de cellulose réticulée | E 468 |\\n| Carboxyméthylcellulose hydrolysée de manière enzymatique, gomme de cellulose hydrolysée de manière enzymatique | E 469 |\\n| Sucroglycérides | E 474 |\\n| Huile de soja oxydée par chauffage ayant réagi avec des mono- et diglycérides d’acides gras | E 479b |\\n| Monostéarate de sorbitane | E 491 |\\n| Tristéarate de sorbitane | E 492 |\\n| Monolaurate de sorbitane | E 493 |\\n| Monooléate de sorbitane | E 494 |\\n| Monopalmitate de sorbitane | E 495 |\\n| Chlorure d’étain | E 512 |\\n| Silicate alumino-potassique | E 555 |\\n| 4-Hexylrésorcinol | E 586 |\\n| Acide glutamique | E 620 |\\n| Glutamate monosodique | E 621 |\\n| Glutamate monopotassique | E 622 |\\n| Diglutamate de calcium | E 623 |\\n| Glutamate d’ammonium | E 624 |\\n| Diglutamate de magnésium | E 625 |\\n| Acide guanylique | E 626 |\\n| Guanylate disodique | E 627 |\\n| Guanylate dipotassique | E 628 |\\n| Guanylate de calcium | E 629 |\\n| Acide inosinique | E 630 |\\n| Inosinate disodique | E 631 |\\n| Inosinate dipotassique | E 632 |\\n| Inosinate de calcium | E 633 |\\n| 5'-ribonucléotide calcique | E 634 |\\n| 5'-ribonucléotide disodique | E 635 |\\n| Glycine et son sel de sodium | E 640 |\\n| Acétate de zinc | E 650 |\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d’application**: 13/12/2023\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| Sulfate d’aluminium | E 520 | Cire microcristalline | E 905 |\\n|------------------------------------------|--------|-----------------------------------------------------|--------|\\n| Sulfate d’aluminium sodique | E 521 | Poly-1-décène hydrogéné | E 907 |\\n| Sulfate d’aluminium potassique | E 522 | Cire de polyéthylène oxydée | E 914 |\\n| Sulfate d’aluminium ammonique | E 523 | Butane | E 943a |\\n| Hydroxyde d’ammonium | E 527 | Isobutane | E 943b |\\n| Oxyde de calcium | E 529 | Propane | E 944 |\\n| Oxyde de magnésium | E 530 | Acésulfame-K | E 950 |\\n| Ferrocyanure de calcium | E 538 | Aspartame | E 951 |\\n| Phosphate d’aluminium sodique acide | E 541 | Cyclamates | E 952 |\\n| Isomalt | E 953 | Polyvinylpolypyrrolidone | E 1202 |\\n| Saccharines | E 954 | Alcool polyvinylique (APV) | E 1203 |\\n| Sucralose | E 955 | Copolymère méthacrylate basique | E 1205 |\\n| Thaumatine | E 957 | Copolymère de méthacrylate neutre | E 1206 |\\n| Néotame | E 961 | Copolymère de méthacrylate anionique | E 1207 |\\n| Sel d’aspartame-acésulfame | E 962 | Copolymère d’acétate de vinyle et de polyvinylpyrrolidone | E 1208 |\\n| Sirop de polyglycitol | E 964 | Copolymère greffé d’alcool polyvinylique et de polyéthylèneglycol | E 1209 |\\n| Maltitols | E 965 | Octényl succinate d’amidon d’aluminium | E 1452 |\\n| Xylitol | E 967 | Diacétate de glycéryle (diacéitine) | E 1517 |\\n| Érythritol | E 968 | Alcool benzylique | E 1519 |\\n| Polyvinylpyrrolidone | E 1201 | Polyéthylène glycol | E 1521 |\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023 \\n\\n## ANNEXE II : Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires\\n\\n| Additif | Code | Additif | Code |\\n|---------|------|---------|------|\\n| Acide carminique, carmins | E 120 | Esters lactiques des mono- et diglycérides d’acides gras | E 472b |\\n| Bleu patenté V | E 131 | Esters citriques des mono- et diglycérides d’acides gras | E 472c |\\n| Caramel de sulfite caustique | E 150b | Esters tartriques des mono- et diglycérides d’acides gras | E 472d |\\n| Or | E 175 | Esters monoacétyltartriques et diacétyltartriques des mono- et diglycérides d’acides gras | E 472e |\\n| Acide benzoïque | E 210 | Esters mixtes acétiques et tartriques des mono- et diglycérides d’acides gras | E 472f |\\n| Benzoate de sodium | E 211 | Sucroesters d’acides gras | E 473 |\\n| Anhydride sulfureux | E 220 | Esters polyglycériques d’acides gras | E 475 |\\n| Sulfite de sodium | E 221 | Esters de propane-1,2-diol d’acides gras | E 477 |\\n| Sulfite acide de sodium | E 222 | Stéaroyl-2-lactylate de sodium | E 481 |\\n| Disulfite de sodium | E 223 | Stéaroyl-2-lactylate de calcium | E 482 |\\n| Disulfite de potassium | E 224 | Tartrate de stéaryle | E 483 |\\n| Sulfite de calcium | E 226 | Diméthylpolysiloxane | E 900 |\\n| Sulfite acide de calcium | E 227 | Advantame | E 969 |\\n| Sulfite acide de potassium | E 228 | Extraits de quillaia | E 999 |\\n| Natamycine | E 235 | Lysozyme | E 1105 |\\n| Éthylène-diamine-tétra-acétate de calcium disodium (calcium disodium EDTA) | E 385 | Amidon oxydé | E 1404 |\\n| Alginate de propane-1,2-diol | E 405 | Phosphate de monoamidon | E 1410 |\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| Algues Euchema transformées | E 407a | Phosphate de diamidon | E 1412 |\\n|-----------------------------|--------|-----------------------|--------|\\n| Carraghénanes | E 407 | Phosphate de diamidon phosphaté | E 1413 |\\n| Cellulose | E 460 | Phosphate de diamidon acétylé | E 1414 |\\n| Hydroxypropylcellulose | E 463 | Amidon acétylé | E 1420 |\\n| Hydroxypropylméthylcellulose| E 464 | Adipate de diamidon acétylé | E 1422 |\\n| Carboxyméthyl-cellulose sodique, gomme cellulosique | E 466 | Amidon hydroxypropylé | E 1440 |\\n| Sels de sodium, de potassium, calcium d’acides gras, magnésium d’acides gras | E 470 | Phosphate de diamidon hydroxypropylé | E 1442 |\\n| Mono- et diglycérides d’acides gras | E 471 | Octényle succinate d’amidon sodique | E 1450 |\\n| Esters acétiques des mono- et diglycérides d’acides gras | E 472a | Amidon oxydé acétylé | E 1451 |\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023 \\n\\n## ANNEXE III : Additifs verts : additifs identifiés à ce jour comme non dangereux pour la santé.\\n\\n| Additif | Code | Additif | Code |\\n|----------------------------------------------|-------|----------------------------------------------|-------|\\n| Curcumine | E 100 | Tartrates de potassium | E 336 |\\n| Riboflavines | E 101 | Malate de potassium | E 351 |\\n| Chlorophylles et chlorophyllines | E 140 | Acide métatartarique | E 353 |\\n| Complexes cuivre-chlorophylles et cuivre-chlorophyllines | E 141 | Tartrate de calcium | E 354 |\\n| Caramel ordinaire | E 150a| Extraits de romarin | E 392 |\\n| Charbon végétal médicinal | E 153 | Acide alginique | E 400 |\\n| Caroténoïdes | E 160a| Alginate de sodium | E 401 |\\n| Bixine de rocou / Norbixine de rocou | E 160b| Alginate de calcium | E 404 |\\n| Extrait de paprika, capsanthine, capsorubine| E 160c| Agar-agar | E 406 |\\n| Lycopène | E 160d| Farine de graines de caroube | E 410 |\\n| β- apocaroténal-8' (C 30) | E 160e| Gomme guar | E 412 |\\n| Lutéine | E 161b| Gomme adragante | E 413 |\\n| Rouge de betterave, bétanine | E 162 | Gomme arabique ou gomme d'acacia | E 414 |\\n| Anthocyanes | E 163 | Gomme xanthane | E 415 |\\n| Acide sorbique | E 200 | Gomme Karaya | E 416 |\\n| Sorbate de potassium | E 202 | Gomme Tara | E 417 |\\n| Acide acétique | E 260 | Gomme Gellane | E 418 |\\n| Acétates de potassium | E 261 | Sorbitols | E 420 |\\n| Acétates de sodium | E 262 | Glycérol | E 422 |\\n| Acétate de calcium | E 263 | Gomme cassia | E 427 |\\n| Acide lactique | E 270 | Pectines | E 440 |\\n| Propionate de calcium | E 282 | Méthylcellulose | E 461 |\\n| Dioxyde de carbone | E 290 | Sels de sodium, de potassium et de calcium d'acides gras | E 470a |\\n| Acide malique | E 296 | Sels de magnésium d'acides gras | E 470b |\\n| Acide ascorbique | E 300 | Polyglycérols de polyglycérol | E 476 |\\n| Ascorbate de sodium | E 301 | Phytostérols riches en stigmasterol | E 499 |\\n| Ascorbate de calcium | E 302 | Carbonates de sodium | E 500 |\\n| Esters d'acides gras de l'acide ascorbique | E 304 | Carbonates de potassium | E 501 |\\n| Extrait riche en tocophérols | E 306 | Carbonates d'ammonium | E 503 |\\n| Alpha-tocophérol | E 307 | Carbonates de magnésium | E 504 |\\n| Érythorbate de sodium | E 316 | Acide chlorhydrique | E 507 |\\n| Lécithines | E 322 | Chlorure de potassium | E 508 |\\n| Lactate de sodium | E 325 | Chlorure de calcium | E 509 |\\n| Lactate de potassium | E 326 | Chlorure de magnésium | E 511 |\\n| Lactate de calcium | E 327 | Acide sulfurique | E 513 |\\n| Acide citrique | E 330 | Sulfates de sodium | E 514 |\\n| Citrates de sodium | E 331 | Sulfates de potassium | E 515 |\\n| Citrates de potassium | E 332 | Sulfate de calcium | E 516 |\\n| Citrates de calcium | E 333 | Sulfate d'ammonium | E 517 |\\n| Acide tartrique [L (+)] | E 334 | Hydroxyde de sodium | E 524 |\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| Hydroxyde de potassium | E 525 | Shellac | E 904 |\\n|------------------------------|--------|----------------------------------|--------|\\n| Hydroxyde de calcium | E 526 | L-cystéine | E 920 |\\n| Hydroxyde de magnésium | E 528 | Carbamide | E 927b |\\n| Tartrate de fer | E 534 | Argon | E 938 |\\n| Ferrocyanure de sodium | E 535 | Hélium | E 939 |\\n| Ferrocyanure de potassium | E 536 | Azote | E 941 |\\n| Acides gras | E 570 | Protoxyde d’azote | E 942 |\\n| Acide gluconique | E 574 | Oxygène | E 948 |\\n| Glucono-delta-lactone | E 575 | Hydrogène | E 949 |\\n| Gluconate de sodium | E 576 | Néo-hespéridine DC | E 959 |\\n| Gluconate de potassium | E 577 | Glycosides de stéviol | E 960 |\\n| Gluconate de calcium | E 578 | Lactitol | E 966 |\\n| Gluconate ferreux | E 579 | Invertase | E 1103 |\\n| Lactate ferreux | E 585 | Polydextrose | E 1200 |\\n| L-leucine | E 641 | Pullulan | E 1204 |\\n| Cire d’abeille blanche et jaune | E 901 | Citrate de triéthyle | E 1505 |\\n| Cire de candelilla | E 902 | Triacétate de glycéryle (triacétine) | E 1518 |\\n| Cire de carnauba | E 903 | Propanediol-1,2 (propylène glycol) | E 1520 |\\n\\n## ANNEXE IV : Ingrédients controversés : ingrédients faisant l’objet de rapports scientifiques controversés et/ou perçus négativement par le consommateur.\\n\\n| Ingrédient | Motif |\\n|---------------------------|-----------------------------------------------------------------------|\\n| Sirop de glucose-fructose | Niveau de transformation élevé + manque de transparence sur le niveau de sucre présent dans le produit |\\n| Maltodextrine | Ingrédient sans intérêt nutritionnel et organoleptique |\\n| Huile de coco/coprah | Contient 80% d’acides gras saturés dont l’excès augmente le risque de maladies cardiovasculaires |\\n| Sirop de maïs | Niveau de transformation élevé + manque de transparence sur le niveau de sucre présent dans le produit |\\n\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]\n" - ] - } - ], - "source": [ - "## Read PDF files\n", - "from typing import List\n", - "from llama_index.core.schema import Document\n", - "import nest_asyncio\n", - "\n", - "nest_asyncio.apply()\n", - "#GET LLAMA_CLOUD_API_KEY\n", - "import os\n", - "from llama_parse import LlamaParse\n", - "from llama_parse.utils import ResultType, Language\n", - "\n", - "api_key: str | None = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", - "print(api_key)\n", - "\n", - "parsing_instructions = \"Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables.\"\n", - "\n", - "parser = LlamaParse(\n", - " api_key=str(api_key), \n", - " result_type=ResultType.MD,\n", - " gpt4o_mode=True,\n", - " verbose=True,\n", - " language=Language.FRENCH,\n", - " parsing_instruction=parsing_instructions, # Optionally you can define a parsing instruction\n", - ")\n", - "# sync\n", - "documents: List[Document] = parser.load_data(\"../input/CDP_CHARTE_PRODUITS.pdf\")\n", - "print(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# with open(\"../output/CDP_CHARTE_PRODUITS__llamaParse.md\", \"w\") as f:\n", - "# f.write(documents[0].get_content())" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "106\n", - "Found 76 unique paragraphs on 106 paragraphs.\n", - "Found 30 duplicate paragraphs.\n" - ] - } - ], - "source": [ - "import sys\n", - "sys.path.append('..')\n", - "from src.markdown_processor import MarkdownProcessor\n", - "\n", - "md_result: str = documents[0].get_content()\n", - "\n", - "output_path = \"../output/CDP_CHARTE_PRODUITS__llamaParse_cleaned.md\"\n", - "processor = MarkdownProcessor(md_result, strict=True, remove_pagination=True)\n", - "md_cleaned = processor.process()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "processor.save_cleaned_result(md_cleaned, output_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Started parsing the file under job_id b82bb4ce-18ac-4c84-a2c8-f48ab418aae0\n", - "106\n", - "Found 76 unique paragraphs on 106 paragraphs.\n", - "Found 30 duplicate paragraphs.\n" - ] - } - ], - "source": [ - "from pathlib import Path\n", - "import sys\n", - "sys.path.append('..')\n", - "from src.markdown_processor import MarkdownProcessor\n", - "from src.converter import PDFConverter\n", - "import os \n", - "\n", - "api_key: str | None = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", - "\n", - "converter = PDFConverter(api_key=str(api_key))\n", - "md_content = converter.convert(\"../input/CDP_CHARTE_PRODUITS.pdf\")\n", - "converter.save_md(md_content, Path(\"../output/CDP_CHARTE_PRODUITS.md\"))\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "QuivrParse-DS8JDGq8", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/backend/core/MegaParse/notebooks/pptx2md.ipynb b/backend/core/MegaParse/notebooks/pptx2md.ipynb deleted file mode 100644 index 1d4102b5d254..000000000000 --- a/backend/core/MegaParse/notebooks/pptx2md.ipynb +++ /dev/null @@ -1,46 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from src.Converter import PPTXConverter\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "converter = PPTXConverter()\n", - "md = converter.convert(\"./input/Quivr_Monotype_Proposal.pptx\")\n", - "converter.save_md(md, Path(\"./output/Quivr_Monotype_Proposal.md\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "QuivrParse-DS8JDGq8", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/backend/core/MegaParse/notebooks/test.ipynb b/backend/core/MegaParse/notebooks/test.ipynb deleted file mode 100644 index bfa2a7738e35..000000000000 --- a/backend/core/MegaParse/notebooks/test.ipynb +++ /dev/null @@ -1,159 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "from megaparse.Converter import MegaParse\n", - "from IPython.display import display_markdown\n", - "import pdfminer\n", - "from pdfminer.image import ImageWriter\n", - "from pdfminer.high_level import extract_pages\n", - "\n", - "import fitz\n", - "import io\n", - "from PIL import Image" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "file_path = \"megaparse/tests/input_tests/MegaFake_report.pdf\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "megaparse = MegaParse(file_path=file_path)\n", - "content = megaparse.convert()\n", - "megaparse.save_md(md_content=content, file_path=\"./content.md\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "display_markdown(content, raw=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# def extract_images_from_pdf(pdf_file_path, output_dir):\n", - "# iw = ImageWriter(output_dir)\n", - "# image_count = 0\n", - "\n", - "# for page_num, page_layout in enumerate(extract_pages(pdf_file_path)):\n", - "# for image in get_images_from_page(page_layout):\n", - "# image_name = f\"image_{image_count}_page_{page_num}.png\"\n", - "# iw.export_image(image)\n", - "# image_count += 1\n", - "\n", - "\n", - "# def get_images_from_page(page_layout):\n", - "# if isinstance(page_layout, pdfminer.layout.LTImage):\n", - "# return [page_layout]\n", - "# if isinstance(page_layout, pdfminer.layout.LTContainer):\n", - "# img_list = []\n", - "# for child in page_layout:\n", - "# img_list += get_images_from_page(child)\n", - "# return img_list\n", - "# else:\n", - "# return []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "def extract_images_from_pdf(pdf_file_path: str, output_dir: str):\n", - " pdf_file = fitz.open(pdf_file_path)\n", - " for page_number in range(1, len(pdf_file)):\n", - " page = pdf_file[page_number]\n", - " for image_index, img in enumerate(page.get_images(), start=1):\n", - " xref = img[0]\n", - " base_image = pdf_file.extract_image(xref)\n", - " image_bytes = base_image[\"image\"]\n", - " image_ext = base_image[\"ext\"]\n", - " pil_image = Image.open(io.BytesIO(image_bytes))\n", - " image_path = (\n", - " f\"{output_dir}image_{image_index}_page_{page_number}.{image_ext}\"\n", - " )\n", - " pil_image.save(image_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\n", - "1\n", - "1\n", - "1\n", - "1\n" - ] - } - ], - "source": [ - "extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ENV", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/backend/core/MegaParse/notebooks/unstructured.ipynb b/backend/core/MegaParse/notebooks/unstructured.ipynb deleted file mode 100644 index 6174d3341931..000000000000 --- a/backend/core/MegaParse/notebooks/unstructured.ipynb +++ /dev/null @@ -1,71 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Mega Parse" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n", - "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - } - ], - "source": [ - "from pathlib import Path\n", - "import sys\n", - "sys.path.append('..')\n", - "from megaparse.unstructured import UnstructuredParser\n", - "import os \n", - "\n", - "unstructured = UnstructuredParser()\n", - "file_partitioned = unstructured.partition_pdf_file('../megaparse/tests/input_tests/MegaFake_report.pdf')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "unstructured2 = UnstructuredParser()\n", - "\n", - "\n", - "elements_dict = [el.to_dict() for el in file_partitioned]\n", - "markdown_content = unstructured2.convert_to_markdown(elements_dict)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "QuivrParse-DS8JDGq8", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/backend/core/MegaParse/pyproject.toml b/backend/core/MegaParse/pyproject.toml deleted file mode 100644 index 20bdf1bb14de..000000000000 --- a/backend/core/MegaParse/pyproject.toml +++ /dev/null @@ -1,53 +0,0 @@ -[project] -name = "megaparse" -version = "0.0.31" -description = "Parse complex files (PDF,Docx,PPTX) for LLM consumption" -authors = [ - { name = "Stan Girard", email = "stan@quivr.app" }, - { name = "Chloé Daems", email = "chloe@quivr.app" } -] -readme = "README.md" -dependencies = [ - "python-docx>=1.1.0", - "mammoth>=1.8.0", - "python-pptx>=1.0.2", - "llama-parse>=0.4.0", - "pdf2docx>=0.5.0", - "unstructured[pdf]>=0.15.0", - "langchain>=0.2.0", - "langchain-community>=0.2.0", - "langchain-openai>=0.1.0", - "langchain-core>=0.2.0", - "python-dotenv>=1.0.0", - "pycryptodome>=3.20.0", - "llama-index>=0.10.0", - "pdfplumber>=0.11.0", -] -python = "^3.11" - - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.rye] -managed = true -universal = true -dev-dependencies = [ - "mypy>=1.11.1", - "pre-commit>=3.8.0", - "ipykernel>=6.29.5", - "ruff>=0.6.0", - "flake8>=7.1.1", - "flake8-black>=0.3.6", - "pytest-asyncio>=0.23.8", - "pytest>=8.3.2", - "pytest-xdist>=3.6.1", - "pytest-cov>=5.0.0", -] - -[tool.hatch.metadata] -allow-direct-references = true - -[tool.hatch.build.targets.wheel] -packages = ["megaparse"] \ No newline at end of file diff --git a/backend/core/MegaParse/release-please-config.json b/backend/core/MegaParse/release-please-config.json deleted file mode 100644 index f954720b905a..000000000000 --- a/backend/core/MegaParse/release-please-config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "packages": { - ".": { - "release-type": "python", - "package-name": "megaparse", - "bump-patch-for-minor-pre-major": true, - "changelog-notes-type": "github", - "include-v-in-tag": true - } - } -} diff --git a/backend/core/MegaParse/requirements-dev.lock b/backend/core/MegaParse/requirements-dev.lock deleted file mode 100644 index 02ee108e3c32..000000000000 --- a/backend/core/MegaParse/requirements-dev.lock +++ /dev/null @@ -1,710 +0,0 @@ -# generated by rye -# use `rye lock` or `rye sync` to update this lockfile -# -# last locked with the following flags: -# pre: false -# features: [] -# all-features: true -# with-sources: false -# generate-hashes: false -# universal: true - --e file:. -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via langchain - # via langchain-community - # via llama-index-core - # via llama-index-legacy -aiosignal==1.3.1 - # via aiohttp -annotated-types==0.7.0 - # via pydantic -antlr4-python3-runtime==4.9.3 - # via omegaconf -anyio==4.4.0 - # via httpx - # via openai -appnope==0.1.4 ; platform_system == 'Darwin' - # via ipykernel -asttokens==2.4.1 - # via stack-data -attrs==24.2.0 - # via aiohttp -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via llama-index-readers-file - # via unstructured -black==24.8.0 - # via flake8-black -cachetools==5.5.0 - # via google-auth -certifi==2024.7.4 - # via httpcore - # via httpx - # via requests - # via unstructured-client -cffi==1.17.0 ; implementation_name == 'pypy' or platform_python_implementation != 'PyPy' - # via cryptography - # via pyzmq -cfgv==3.4.0 - # via pre-commit -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via pdfminer-six - # via requests - # via unstructured-client -click==8.1.7 - # via black - # via nltk -cobble==0.1.4 - # via mammoth -colorama==0.4.6 ; platform_system == 'Windows' or sys_platform == 'win32' - # via click - # via ipython - # via pytest - # via tqdm -coloredlogs==15.0.1 - # via onnxruntime -comm==0.2.2 - # via ipykernel -contourpy==1.2.1 - # via matplotlib -coverage==7.6.1 - # via pytest-cov -cryptography==43.0.0 - # via pdfminer-six -cycler==0.12.1 - # via matplotlib -dataclasses-json==0.6.7 - # via langchain-community - # via llama-index-core - # via llama-index-legacy - # via unstructured - # via unstructured-client -debugpy==1.8.5 - # via ipykernel -decorator==5.1.1 - # via ipython -deepdiff==7.0.1 - # via unstructured-client -deprecated==1.2.14 - # via llama-index-core - # via llama-index-legacy - # via pikepdf -dirtyjson==1.0.8 - # via llama-index-core - # via llama-index-legacy -distlib==0.3.8 - # via virtualenv -distro==1.9.0 - # via openai -effdet==0.4.1 - # via unstructured -emoji==2.12.1 - # via unstructured -execnet==2.1.1 - # via pytest-xdist -executing==2.0.1 - # via stack-data -filelock==3.15.4 - # via huggingface-hub - # via torch - # via transformers - # via triton - # via virtualenv -filetype==1.2.0 - # via unstructured -fire==0.6.0 - # via pdf2docx -flake8==7.1.1 - # via flake8-black -flake8-black==0.3.6 -flatbuffers==24.3.25 - # via onnxruntime -fonttools==4.53.1 - # via matplotlib - # via pdf2docx -frozenlist==1.4.1 - # via aiohttp - # via aiosignal -fsspec==2024.6.1 - # via huggingface-hub - # via llama-index-core - # via llama-index-legacy - # via torch -google-api-core==2.19.1 - # via google-cloud-vision -google-auth==2.34.0 - # via google-api-core - # via google-cloud-vision -google-cloud-vision==3.7.4 - # via unstructured -googleapis-common-protos==1.63.2 - # via google-api-core - # via grpcio-status -greenlet==3.0.3 - # via sqlalchemy -grpcio==1.65.5 - # via google-api-core - # via grpcio-status -grpcio-status==1.65.5 - # via google-api-core -h11==0.14.0 - # via httpcore -httpcore==1.0.5 - # via httpx -httpx==0.27.0 - # via llama-cloud - # via llama-index-core - # via llama-index-legacy - # via openai - # via unstructured-client -huggingface-hub==0.24.6 - # via timm - # via tokenizers - # via transformers - # via unstructured-inference -humanfriendly==10.0 - # via coloredlogs -identify==2.6.0 - # via pre-commit -idna==3.7 - # via anyio - # via httpx - # via requests - # via unstructured-client - # via yarl -iniconfig==2.0.0 - # via pytest -iopath==0.1.10 - # via layoutparser -ipykernel==6.29.5 -ipython==8.26.0 - # via ipykernel -jedi==0.19.1 - # via ipython -jinja2==3.1.4 - # via torch -jiter==0.5.0 - # via openai -joblib==1.4.2 - # via nltk -jsonpatch==1.33 - # via langchain-core -jsonpath-python==1.0.6 - # via unstructured-client -jsonpointer==3.0.0 - # via jsonpatch -jupyter-client==8.6.2 - # via ipykernel -jupyter-core==5.7.2 - # via ipykernel - # via jupyter-client -kiwisolver==1.4.5 - # via matplotlib -langchain==0.2.14 - # via langchain-community - # via megaparse -langchain-community==0.2.12 - # via megaparse -langchain-core==0.2.33 - # via langchain - # via langchain-community - # via langchain-openai - # via langchain-text-splitters - # via megaparse -langchain-openai==0.1.22 - # via megaparse -langchain-text-splitters==0.2.2 - # via langchain -langdetect==1.0.9 - # via unstructured -langsmith==0.1.99 - # via langchain - # via langchain-community - # via langchain-core -layoutparser==0.3.4 - # via unstructured-inference -llama-cloud==0.0.13 - # via llama-index-indices-managed-llama-cloud -llama-index==0.10.67.post1 - # via megaparse -llama-index-agent-openai==0.2.9 - # via llama-index - # via llama-index-program-openai -llama-index-cli==0.1.13 - # via llama-index -llama-index-core==0.10.67 - # via llama-index - # via llama-index-agent-openai - # via llama-index-cli - # via llama-index-embeddings-openai - # via llama-index-indices-managed-llama-cloud - # via llama-index-llms-openai - # via llama-index-multi-modal-llms-openai - # via llama-index-program-openai - # via llama-index-question-gen-openai - # via llama-index-readers-file - # via llama-index-readers-llama-parse - # via llama-parse -llama-index-embeddings-openai==0.1.11 - # via llama-index - # via llama-index-cli -llama-index-indices-managed-llama-cloud==0.2.7 - # via llama-index -llama-index-legacy==0.9.48.post3 - # via llama-index -llama-index-llms-openai==0.1.29 - # via llama-index - # via llama-index-agent-openai - # via llama-index-cli - # via llama-index-multi-modal-llms-openai - # via llama-index-program-openai - # via llama-index-question-gen-openai -llama-index-multi-modal-llms-openai==0.1.9 - # via llama-index -llama-index-program-openai==0.1.7 - # via llama-index - # via llama-index-question-gen-openai -llama-index-question-gen-openai==0.1.3 - # via llama-index -llama-index-readers-file==0.1.33 - # via llama-index -llama-index-readers-llama-parse==0.1.6 - # via llama-index -llama-parse==0.4.9 - # via llama-index-readers-llama-parse - # via megaparse -lxml==5.3.0 - # via pikepdf - # via python-docx - # via python-pptx - # via unstructured -mammoth==1.8.0 - # via megaparse -markupsafe==2.1.5 - # via jinja2 -marshmallow==3.21.3 - # via dataclasses-json - # via unstructured-client -matplotlib==3.9.2 - # via pycocotools - # via unstructured-inference -matplotlib-inline==0.1.7 - # via ipykernel - # via ipython -mccabe==0.7.0 - # via flake8 -mpmath==1.3.0 - # via sympy -multidict==6.0.5 - # via aiohttp - # via yarl -mypy==1.11.1 -mypy-extensions==1.0.0 - # via black - # via mypy - # via typing-inspect - # via unstructured-client -nest-asyncio==1.6.0 - # via ipykernel - # via llama-index-core - # via llama-index-legacy - # via unstructured-client -networkx==3.3 - # via llama-index-core - # via llama-index-legacy - # via torch -nltk==3.9.1 - # via llama-index-core - # via llama-index-legacy - # via unstructured -nodeenv==1.9.1 - # via pre-commit -numpy==1.26.4 - # via contourpy - # via langchain - # via langchain-community - # via layoutparser - # via llama-index-core - # via llama-index-legacy - # via matplotlib - # via onnx - # via onnxruntime - # via opencv-python - # via opencv-python-headless - # via pandas - # via pdf2docx - # via pycocotools - # via scipy - # via torchvision - # via transformers - # via unstructured -nvidia-cublas-cu12==12.1.3.1 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via nvidia-cudnn-cu12 - # via nvidia-cusolver-cu12 - # via torch -nvidia-cuda-cupti-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cuda-runtime-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cudnn-cu12==9.1.0.70 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cufft-cu12==11.0.2.54 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-curand-cu12==10.3.2.106 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cusolver-cu12==11.4.5.107 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cusparse-cu12==12.1.0.106 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via nvidia-cusolver-cu12 - # via torch -nvidia-nccl-cu12==2.20.5 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-nvjitlink-cu12==12.6.20 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via nvidia-cusolver-cu12 - # via nvidia-cusparse-cu12 -nvidia-nvtx-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -omegaconf==2.3.0 - # via effdet -onnx==1.16.2 - # via unstructured - # via unstructured-inference -onnxruntime==1.19.0 - # via unstructured-inference -openai==1.41.1 - # via langchain-openai - # via llama-index-agent-openai - # via llama-index-core - # via llama-index-legacy - # via llama-index-llms-openai -opencv-python==4.10.0.84 - # via layoutparser - # via unstructured-inference -opencv-python-headless==4.10.0.84 - # via pdf2docx -ordered-set==4.1.0 - # via deepdiff -orjson==3.10.7 - # via langsmith -packaging==24.1 - # via black - # via huggingface-hub - # via ipykernel - # via langchain-core - # via marshmallow - # via matplotlib - # via onnxruntime - # via pikepdf - # via pytest - # via transformers - # via unstructured-client - # via unstructured-pytesseract -pandas==2.2.2 - # via layoutparser - # via llama-index-core - # via llama-index-legacy -parso==0.8.4 - # via jedi -pathspec==0.12.1 - # via black -pdf2docx==0.5.8 - # via megaparse -pdf2image==1.17.0 - # via layoutparser - # via unstructured -pdfminer-six==20231228 - # via pdfplumber - # via unstructured -pdfplumber==0.11.4 - # via layoutparser - # via megaparse -pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' - # via ipython -pikepdf==9.1.1 - # via unstructured -pillow==10.4.0 - # via layoutparser - # via llama-index-core - # via matplotlib - # via pdf2image - # via pdfplumber - # via pikepdf - # via pillow-heif - # via python-pptx - # via torchvision - # via unstructured-pytesseract -pillow-heif==0.18.0 - # via unstructured -platformdirs==4.2.2 - # via black - # via jupyter-core - # via virtualenv -pluggy==1.5.0 - # via pytest -portalocker==2.10.1 - # via iopath -pre-commit==3.8.0 -prompt-toolkit==3.0.47 - # via ipython -proto-plus==1.24.0 - # via google-api-core - # via google-cloud-vision -protobuf==5.27.3 - # via google-api-core - # via google-cloud-vision - # via googleapis-common-protos - # via grpcio-status - # via onnx - # via onnxruntime - # via proto-plus -psutil==6.0.0 - # via ipykernel - # via unstructured -ptyprocess==0.7.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' - # via pexpect -pure-eval==0.2.3 - # via stack-data -pyasn1==0.6.0 - # via pyasn1-modules - # via rsa -pyasn1-modules==0.4.0 - # via google-auth -pycocotools==2.0.8 - # via effdet -pycodestyle==2.12.1 - # via flake8 -pycparser==2.22 ; implementation_name == 'pypy' or platform_python_implementation != 'PyPy' - # via cffi -pycryptodome==3.20.0 - # via megaparse -pydantic==2.8.2 - # via langchain - # via langchain-core - # via langsmith - # via llama-cloud - # via openai -pydantic-core==2.20.1 - # via pydantic -pyflakes==3.2.0 - # via flake8 -pygments==2.18.0 - # via ipython -pymupdf==1.24.9 - # via pdf2docx -pymupdfb==1.24.9 - # via pymupdf -pyparsing==3.1.2 - # via matplotlib -pypdf==4.3.1 - # via llama-index-readers-file - # via unstructured - # via unstructured-client -pypdfium2==4.30.0 - # via pdfplumber -pyreadline3==3.4.1 ; sys_platform == 'win32' - # via humanfriendly -pytest==8.3.2 - # via pytest-asyncio - # via pytest-cov - # via pytest-xdist -pytest-asyncio==0.23.8 -pytest-cov==5.0.0 -pytest-xdist==3.6.1 -python-dateutil==2.9.0.post0 - # via jupyter-client - # via matplotlib - # via pandas - # via unstructured-client -python-docx==1.1.2 - # via megaparse - # via pdf2docx -python-dotenv==1.0.1 - # via megaparse -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-multipart==0.0.9 - # via unstructured-inference -python-pptx==1.0.2 - # via megaparse -pytz==2024.1 - # via pandas -pywin32==306 ; platform_system == 'Windows' or (platform_python_implementation != 'PyPy' and sys_platform == 'win32') - # via jupyter-core - # via portalocker -pyyaml==6.0.2 - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-core - # via layoutparser - # via llama-index-core - # via omegaconf - # via pre-commit - # via timm - # via transformers -pyzmq==26.1.1 - # via ipykernel - # via jupyter-client -rapidfuzz==3.9.6 - # via unstructured - # via unstructured-inference -regex==2024.7.24 - # via nltk - # via tiktoken - # via transformers -requests==2.32.3 - # via google-api-core - # via huggingface-hub - # via langchain - # via langchain-community - # via langsmith - # via llama-index-core - # via llama-index-legacy - # via requests-toolbelt - # via tiktoken - # via transformers - # via unstructured - # via unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -rsa==4.9 - # via google-auth -ruff==0.6.1 -safetensors==0.4.4 - # via timm - # via transformers -scipy==1.14.0 - # via layoutparser -setuptools==73.0.0 - # via torch -six==1.16.0 - # via asttokens - # via fire - # via langdetect - # via python-dateutil - # via unstructured-client -sniffio==1.3.1 - # via anyio - # via httpx - # via openai -soupsieve==2.6 - # via beautifulsoup4 -sqlalchemy==2.0.32 - # via langchain - # via langchain-community - # via llama-index-core - # via llama-index-legacy -stack-data==0.6.3 - # via ipython -striprtf==0.0.26 - # via llama-index-readers-file -sympy==1.13.2 - # via onnxruntime - # via torch -tabulate==0.9.0 - # via unstructured -tenacity==8.5.0 - # via langchain - # via langchain-community - # via langchain-core - # via llama-index-core - # via llama-index-legacy -termcolor==2.4.0 - # via fire -tiktoken==0.7.0 - # via langchain-openai - # via llama-index-core - # via llama-index-legacy -timm==1.0.8 - # via effdet - # via unstructured-inference -tokenizers==0.19.1 - # via transformers -torch==2.4.0 - # via effdet - # via timm - # via torchvision - # via unstructured-inference -torchvision==0.19.0 - # via effdet - # via timm -tornado==6.4.1 - # via ipykernel - # via jupyter-client -tqdm==4.66.5 - # via huggingface-hub - # via iopath - # via llama-index-core - # via nltk - # via openai - # via transformers - # via unstructured -traitlets==5.14.3 - # via comm - # via ipykernel - # via ipython - # via jupyter-client - # via jupyter-core - # via matplotlib-inline -transformers==4.44.0 - # via unstructured-inference -triton==3.0.0 ; python_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -typing-extensions==4.12.2 - # via emoji - # via huggingface-hub - # via iopath - # via langchain-core - # via llama-index-core - # via llama-index-legacy - # via mypy - # via openai - # via pydantic - # via pydantic-core - # via python-docx - # via python-pptx - # via sqlalchemy - # via torch - # via typing-inspect - # via unstructured - # via unstructured-client -typing-inspect==0.9.0 - # via dataclasses-json - # via llama-index-core - # via llama-index-legacy - # via unstructured-client -tzdata==2024.1 - # via pandas -unstructured==0.15.5 - # via megaparse -unstructured-client==0.25.5 - # via unstructured -unstructured-inference==0.7.36 - # via unstructured -unstructured-pytesseract==0.3.13 - # via unstructured -urllib3==2.2.2 - # via requests - # via unstructured-client -virtualenv==20.26.3 - # via pre-commit -wcwidth==0.2.13 - # via prompt-toolkit -wrapt==1.16.0 - # via deprecated - # via llama-index-core - # via unstructured -xlsxwriter==3.2.0 - # via python-pptx -yarl==1.9.4 - # via aiohttp diff --git a/backend/core/MegaParse/requirements.lock b/backend/core/MegaParse/requirements.lock deleted file mode 100644 index 9c58f6746245..000000000000 --- a/backend/core/MegaParse/requirements.lock +++ /dev/null @@ -1,594 +0,0 @@ -# generated by rye -# use `rye lock` or `rye sync` to update this lockfile -# -# last locked with the following flags: -# pre: false -# features: [] -# all-features: true -# with-sources: false -# generate-hashes: false -# universal: true - --e file:. -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via langchain - # via langchain-community - # via llama-index-core - # via llama-index-legacy -aiosignal==1.3.1 - # via aiohttp -annotated-types==0.7.0 - # via pydantic -antlr4-python3-runtime==4.9.3 - # via omegaconf -anyio==4.4.0 - # via httpx - # via openai -attrs==24.2.0 - # via aiohttp -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via llama-index-readers-file - # via unstructured -cachetools==5.5.0 - # via google-auth -certifi==2024.7.4 - # via httpcore - # via httpx - # via requests - # via unstructured-client -cffi==1.17.0 ; platform_python_implementation != 'PyPy' - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via pdfminer-six - # via requests - # via unstructured-client -click==8.1.7 - # via nltk -cobble==0.1.4 - # via mammoth -colorama==0.4.6 ; platform_system == 'Windows' - # via click - # via tqdm -coloredlogs==15.0.1 - # via onnxruntime -contourpy==1.2.1 - # via matplotlib -cryptography==43.0.0 - # via pdfminer-six -cycler==0.12.1 - # via matplotlib -dataclasses-json==0.6.7 - # via langchain-community - # via llama-index-core - # via llama-index-legacy - # via unstructured - # via unstructured-client -deepdiff==7.0.1 - # via unstructured-client -deprecated==1.2.14 - # via llama-index-core - # via llama-index-legacy - # via pikepdf -dirtyjson==1.0.8 - # via llama-index-core - # via llama-index-legacy -distro==1.9.0 - # via openai -effdet==0.4.1 - # via unstructured -emoji==2.12.1 - # via unstructured -filelock==3.15.4 - # via huggingface-hub - # via torch - # via transformers - # via triton -filetype==1.2.0 - # via unstructured -fire==0.6.0 - # via pdf2docx -flatbuffers==24.3.25 - # via onnxruntime -fonttools==4.53.1 - # via matplotlib - # via pdf2docx -frozenlist==1.4.1 - # via aiohttp - # via aiosignal -fsspec==2024.6.1 - # via huggingface-hub - # via llama-index-core - # via llama-index-legacy - # via torch -google-api-core==2.19.1 - # via google-cloud-vision -google-auth==2.34.0 - # via google-api-core - # via google-cloud-vision -google-cloud-vision==3.7.4 - # via unstructured -googleapis-common-protos==1.63.2 - # via google-api-core - # via grpcio-status -greenlet==3.0.3 - # via sqlalchemy -grpcio==1.65.5 - # via google-api-core - # via grpcio-status -grpcio-status==1.65.5 - # via google-api-core -h11==0.14.0 - # via httpcore -httpcore==1.0.5 - # via httpx -httpx==0.27.0 - # via llama-cloud - # via llama-index-core - # via llama-index-legacy - # via openai - # via unstructured-client -huggingface-hub==0.24.6 - # via timm - # via tokenizers - # via transformers - # via unstructured-inference -humanfriendly==10.0 - # via coloredlogs -idna==3.7 - # via anyio - # via httpx - # via requests - # via unstructured-client - # via yarl -iopath==0.1.10 - # via layoutparser -jinja2==3.1.4 - # via torch -jiter==0.5.0 - # via openai -joblib==1.4.2 - # via nltk -jsonpatch==1.33 - # via langchain-core -jsonpath-python==1.0.6 - # via unstructured-client -jsonpointer==3.0.0 - # via jsonpatch -kiwisolver==1.4.5 - # via matplotlib -langchain==0.2.14 - # via langchain-community - # via megaparse -langchain-community==0.2.12 - # via megaparse -langchain-core==0.2.33 - # via langchain - # via langchain-community - # via langchain-openai - # via langchain-text-splitters - # via megaparse -langchain-openai==0.1.22 - # via megaparse -langchain-text-splitters==0.2.2 - # via langchain -langdetect==1.0.9 - # via unstructured -langsmith==0.1.99 - # via langchain - # via langchain-community - # via langchain-core -layoutparser==0.3.4 - # via unstructured-inference -llama-cloud==0.0.13 - # via llama-index-indices-managed-llama-cloud -llama-index==0.10.67.post1 - # via megaparse -llama-index-agent-openai==0.2.9 - # via llama-index - # via llama-index-program-openai -llama-index-cli==0.1.13 - # via llama-index -llama-index-core==0.10.67 - # via llama-index - # via llama-index-agent-openai - # via llama-index-cli - # via llama-index-embeddings-openai - # via llama-index-indices-managed-llama-cloud - # via llama-index-llms-openai - # via llama-index-multi-modal-llms-openai - # via llama-index-program-openai - # via llama-index-question-gen-openai - # via llama-index-readers-file - # via llama-index-readers-llama-parse - # via llama-parse -llama-index-embeddings-openai==0.1.11 - # via llama-index - # via llama-index-cli -llama-index-indices-managed-llama-cloud==0.2.7 - # via llama-index -llama-index-legacy==0.9.48.post3 - # via llama-index -llama-index-llms-openai==0.1.29 - # via llama-index - # via llama-index-agent-openai - # via llama-index-cli - # via llama-index-multi-modal-llms-openai - # via llama-index-program-openai - # via llama-index-question-gen-openai -llama-index-multi-modal-llms-openai==0.1.9 - # via llama-index -llama-index-program-openai==0.1.7 - # via llama-index - # via llama-index-question-gen-openai -llama-index-question-gen-openai==0.1.3 - # via llama-index -llama-index-readers-file==0.1.33 - # via llama-index -llama-index-readers-llama-parse==0.1.6 - # via llama-index -llama-parse==0.4.9 - # via llama-index-readers-llama-parse - # via megaparse -lxml==5.3.0 - # via pikepdf - # via python-docx - # via python-pptx - # via unstructured -mammoth==1.8.0 - # via megaparse -markupsafe==2.1.5 - # via jinja2 -marshmallow==3.21.3 - # via dataclasses-json - # via unstructured-client -matplotlib==3.9.2 - # via pycocotools - # via unstructured-inference -mpmath==1.3.0 - # via sympy -multidict==6.0.5 - # via aiohttp - # via yarl -mypy-extensions==1.0.0 - # via typing-inspect - # via unstructured-client -nest-asyncio==1.6.0 - # via llama-index-core - # via llama-index-legacy - # via unstructured-client -networkx==3.3 - # via llama-index-core - # via llama-index-legacy - # via torch -nltk==3.9.1 - # via llama-index-core - # via llama-index-legacy - # via unstructured -numpy==1.26.4 - # via contourpy - # via langchain - # via langchain-community - # via layoutparser - # via llama-index-core - # via llama-index-legacy - # via matplotlib - # via onnx - # via onnxruntime - # via opencv-python - # via opencv-python-headless - # via pandas - # via pdf2docx - # via pycocotools - # via scipy - # via torchvision - # via transformers - # via unstructured -nvidia-cublas-cu12==12.1.3.1 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via nvidia-cudnn-cu12 - # via nvidia-cusolver-cu12 - # via torch -nvidia-cuda-cupti-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cuda-runtime-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cudnn-cu12==9.1.0.70 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cufft-cu12==11.0.2.54 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-curand-cu12==10.3.2.106 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cusolver-cu12==11.4.5.107 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-cusparse-cu12==12.1.0.106 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via nvidia-cusolver-cu12 - # via torch -nvidia-nccl-cu12==2.20.5 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -nvidia-nvjitlink-cu12==12.6.20 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via nvidia-cusolver-cu12 - # via nvidia-cusparse-cu12 -nvidia-nvtx-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -omegaconf==2.3.0 - # via effdet -onnx==1.16.2 - # via unstructured - # via unstructured-inference -onnxruntime==1.19.0 - # via unstructured-inference -openai==1.41.1 - # via langchain-openai - # via llama-index-agent-openai - # via llama-index-core - # via llama-index-legacy - # via llama-index-llms-openai -opencv-python==4.10.0.84 - # via layoutparser - # via unstructured-inference -opencv-python-headless==4.10.0.84 - # via pdf2docx -ordered-set==4.1.0 - # via deepdiff -orjson==3.10.7 - # via langsmith -packaging==24.1 - # via huggingface-hub - # via langchain-core - # via marshmallow - # via matplotlib - # via onnxruntime - # via pikepdf - # via transformers - # via unstructured-client - # via unstructured-pytesseract -pandas==2.2.2 - # via layoutparser - # via llama-index-core - # via llama-index-legacy -pdf2docx==0.5.8 - # via megaparse -pdf2image==1.17.0 - # via layoutparser - # via unstructured -pdfminer-six==20231228 - # via pdfplumber - # via unstructured -pdfplumber==0.11.4 - # via layoutparser - # via megaparse -pikepdf==9.1.1 - # via unstructured -pillow==10.4.0 - # via layoutparser - # via llama-index-core - # via matplotlib - # via pdf2image - # via pdfplumber - # via pikepdf - # via pillow-heif - # via python-pptx - # via torchvision - # via unstructured-pytesseract -pillow-heif==0.18.0 - # via unstructured -portalocker==2.10.1 - # via iopath -proto-plus==1.24.0 - # via google-api-core - # via google-cloud-vision -protobuf==5.27.3 - # via google-api-core - # via google-cloud-vision - # via googleapis-common-protos - # via grpcio-status - # via onnx - # via onnxruntime - # via proto-plus -psutil==6.0.0 - # via unstructured -pyasn1==0.6.0 - # via pyasn1-modules - # via rsa -pyasn1-modules==0.4.0 - # via google-auth -pycocotools==2.0.8 - # via effdet -pycparser==2.22 ; platform_python_implementation != 'PyPy' - # via cffi -pycryptodome==3.20.0 - # via megaparse -pydantic==2.8.2 - # via langchain - # via langchain-core - # via langsmith - # via llama-cloud - # via openai -pydantic-core==2.20.1 - # via pydantic -pymupdf==1.24.9 - # via pdf2docx -pymupdfb==1.24.9 - # via pymupdf -pyparsing==3.1.2 - # via matplotlib -pypdf==4.3.1 - # via llama-index-readers-file - # via unstructured - # via unstructured-client -pypdfium2==4.30.0 - # via pdfplumber -pyreadline3==3.4.1 ; sys_platform == 'win32' - # via humanfriendly -python-dateutil==2.9.0.post0 - # via matplotlib - # via pandas - # via unstructured-client -python-docx==1.1.2 - # via megaparse - # via pdf2docx -python-dotenv==1.0.1 - # via megaparse -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-multipart==0.0.9 - # via unstructured-inference -python-pptx==1.0.2 - # via megaparse -pytz==2024.1 - # via pandas -pywin32==306 ; platform_system == 'Windows' - # via portalocker -pyyaml==6.0.2 - # via huggingface-hub - # via langchain - # via langchain-community - # via langchain-core - # via layoutparser - # via llama-index-core - # via omegaconf - # via timm - # via transformers -rapidfuzz==3.9.6 - # via unstructured - # via unstructured-inference -regex==2024.7.24 - # via nltk - # via tiktoken - # via transformers -requests==2.32.3 - # via google-api-core - # via huggingface-hub - # via langchain - # via langchain-community - # via langsmith - # via llama-index-core - # via llama-index-legacy - # via requests-toolbelt - # via tiktoken - # via transformers - # via unstructured - # via unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -rsa==4.9 - # via google-auth -safetensors==0.4.4 - # via timm - # via transformers -scipy==1.14.0 - # via layoutparser -setuptools==73.0.0 - # via torch -six==1.16.0 - # via fire - # via langdetect - # via python-dateutil - # via unstructured-client -sniffio==1.3.1 - # via anyio - # via httpx - # via openai -soupsieve==2.6 - # via beautifulsoup4 -sqlalchemy==2.0.32 - # via langchain - # via langchain-community - # via llama-index-core - # via llama-index-legacy -striprtf==0.0.26 - # via llama-index-readers-file -sympy==1.13.2 - # via onnxruntime - # via torch -tabulate==0.9.0 - # via unstructured -tenacity==8.5.0 - # via langchain - # via langchain-community - # via langchain-core - # via llama-index-core - # via llama-index-legacy -termcolor==2.4.0 - # via fire -tiktoken==0.7.0 - # via langchain-openai - # via llama-index-core - # via llama-index-legacy -timm==1.0.8 - # via effdet - # via unstructured-inference -tokenizers==0.19.1 - # via transformers -torch==2.4.0 - # via effdet - # via timm - # via torchvision - # via unstructured-inference -torchvision==0.19.0 - # via effdet - # via timm -tqdm==4.66.5 - # via huggingface-hub - # via iopath - # via llama-index-core - # via nltk - # via openai - # via transformers - # via unstructured -transformers==4.44.0 - # via unstructured-inference -triton==3.0.0 ; python_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux' - # via torch -typing-extensions==4.12.2 - # via emoji - # via huggingface-hub - # via iopath - # via langchain-core - # via llama-index-core - # via llama-index-legacy - # via openai - # via pydantic - # via pydantic-core - # via python-docx - # via python-pptx - # via sqlalchemy - # via torch - # via typing-inspect - # via unstructured - # via unstructured-client -typing-inspect==0.9.0 - # via dataclasses-json - # via llama-index-core - # via llama-index-legacy - # via unstructured-client -tzdata==2024.1 - # via pandas -unstructured==0.15.5 - # via megaparse -unstructured-client==0.25.5 - # via unstructured -unstructured-inference==0.7.36 - # via unstructured -unstructured-pytesseract==0.3.13 - # via unstructured -urllib3==2.2.2 - # via requests - # via unstructured-client -wrapt==1.16.0 - # via deprecated - # via llama-index-core - # via unstructured -xlsxwriter==3.2.0 - # via python-pptx -yarl==1.9.4 - # via aiohttp diff --git a/backend/core/MegaParse/tests/__init__.py b/backend/core/MegaParse/tests/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/backend/core/MegaParse/tests/data/dummy.pdf b/backend/core/MegaParse/tests/data/dummy.pdf deleted file mode 100644 index 8da27b526712..000000000000 Binary files a/backend/core/MegaParse/tests/data/dummy.pdf and /dev/null differ diff --git a/backend/core/MegaParse/tests/data/input_tests/MegaFake_report.pdf b/backend/core/MegaParse/tests/data/input_tests/MegaFake_report.pdf deleted file mode 100644 index 602ae67ecd19..000000000000 Binary files a/backend/core/MegaParse/tests/data/input_tests/MegaFake_report.pdf and /dev/null differ diff --git a/backend/core/MegaParse/tests/data/input_tests/sample.docx b/backend/core/MegaParse/tests/data/input_tests/sample.docx deleted file mode 100644 index 330bd5000310..000000000000 Binary files a/backend/core/MegaParse/tests/data/input_tests/sample.docx and /dev/null differ diff --git a/backend/core/MegaParse/tests/data/input_tests/sample.pptx b/backend/core/MegaParse/tests/data/input_tests/sample.pptx deleted file mode 100644 index ea727948c20f..000000000000 Binary files a/backend/core/MegaParse/tests/data/input_tests/sample.pptx and /dev/null differ diff --git a/backend/core/MegaParse/tests/data/input_tests/sample_pdf.pdf b/backend/core/MegaParse/tests/data/input_tests/sample_pdf.pdf deleted file mode 100644 index 5dc1f2e3102c..000000000000 Binary files a/backend/core/MegaParse/tests/data/input_tests/sample_pdf.pdf and /dev/null differ diff --git a/backend/core/MegaParse/tests/data/input_tests/sample_table.pdf b/backend/core/MegaParse/tests/data/input_tests/sample_table.pdf deleted file mode 100644 index b8e1353851a0..000000000000 Binary files a/backend/core/MegaParse/tests/data/input_tests/sample_table.pdf and /dev/null differ diff --git a/backend/core/MegaParse/tests/test_import.py b/backend/core/MegaParse/tests/test_import.py deleted file mode 100644 index 840d7baf41e2..000000000000 --- a/backend/core/MegaParse/tests/test_import.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest -from megaparse.Converter import MegaParse - - -@pytest.mark.skip("slow test") -def test_load(): - megaparse = MegaParse(file_path="./tests/data/dummy.pdf") - element = megaparse.load() - assert element.page_content.strip("\n") == "# Dummy PDF download" diff --git a/backend/core/pyproject.toml b/backend/core/pyproject.toml index c9916c0e91cb..75aebbf3be79 100644 --- a/backend/core/pyproject.toml +++ b/backend/core/pyproject.toml @@ -30,6 +30,23 @@ all = [ "megaparse" ] +megaparse = [ + "python-docx>=1.1.0", + "mammoth>=1.8.0", + "python-pptx>=1.0.2", + "llama-parse>=0.4.0", + "pdf2docx>=0.5.0", + "unstructured[pdf]>=0.15.0", + "langchain>=0.2.0", + "langchain-community>=0.2.0", + "langchain-openai>=0.1.0", + "langchain-core>=0.2.0", + "python-dotenv>=1.0.0", + "pycryptodome>=3.20.0", + "llama-index>=0.10.0", + "pdfplumber>=0.11.0", +] + [build-system] requires = ["hatchling"] build-backend = "hatchling.build" diff --git a/backend/core/quivr_core/config.py b/backend/core/quivr_core/config.py index 25181a40a029..12f7a3516574 100644 --- a/backend/core/quivr_core/config.py +++ b/backend/core/quivr_core/config.py @@ -3,7 +3,7 @@ from typing import Dict, List, Optional from uuid import UUID -from megaparse.config import MegaparseConfig +from quivr_core.processor.megaparse.config import MegaparseConfig from sqlmodel import SQLModel from quivr_core.base_config import QuivrBaseConfig diff --git a/backend/core/quivr_core/processor/implementations/megaparse_processor.py b/backend/core/quivr_core/processor/implementations/megaparse_processor.py index 6b5fcc182e07..0411538f6fc4 100644 --- a/backend/core/quivr_core/processor/implementations/megaparse_processor.py +++ b/backend/core/quivr_core/processor/implementations/megaparse_processor.py @@ -3,8 +3,8 @@ import tiktoken from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter -from megaparse import MegaParse -from megaparse.config import MegaparseConfig +from quivr_core.processor.megaparse import MegaParse +from quivr_core.processor.megaparse.config import MegaparseConfig from quivr_core.files.file import QuivrFile from quivr_core.processor.processor_base import ProcessorBase diff --git a/backend/core/MegaParse/megaparse/Converter.py b/backend/core/quivr_core/processor/megaparse/Converter.py similarity index 97% rename from backend/core/MegaParse/megaparse/Converter.py rename to backend/core/quivr_core/processor/megaparse/Converter.py index 7a6b112a1af9..c1562923d96b 100644 --- a/backend/core/MegaParse/megaparse/Converter.py +++ b/backend/core/quivr_core/processor/megaparse/Converter.py @@ -20,10 +20,10 @@ from pptx import Presentation from pptx.enum.shapes import MSO_SHAPE_TYPE -from megaparse.config import MegaparseConfig, PdfParser -from megaparse.markdown_processor import MarkdownProcessor -from megaparse.multimodal_convertor.megaparse_vision import MegaParseVision -from megaparse.unstructured_convertor import ModelEnum, UnstructuredParser +from quivr_core.processor.megaparse.config import MegaparseConfig, PdfParser +from quivr_core.processor.megaparse.markdown_processor import MarkdownProcessor +from quivr_core.processor.megaparse.multimodal_convertor.megaparse_vision import MegaParseVision +from quivr_core.processor.megaparse.unstructured_convertor import ModelEnum, UnstructuredParser logger = logging.getLogger("megaparse") diff --git a/backend/core/MegaParse/megaparse/__init__.py b/backend/core/quivr_core/processor/megaparse/__init__.py similarity index 100% rename from backend/core/MegaParse/megaparse/__init__.py rename to backend/core/quivr_core/processor/megaparse/__init__.py diff --git a/backend/core/MegaParse/megaparse/config.py b/backend/core/quivr_core/processor/megaparse/config.py similarity index 100% rename from backend/core/MegaParse/megaparse/config.py rename to backend/core/quivr_core/processor/megaparse/config.py diff --git a/backend/core/MegaParse/megaparse/markdown_processor.py b/backend/core/quivr_core/processor/megaparse/markdown_processor.py similarity index 100% rename from backend/core/MegaParse/megaparse/markdown_processor.py rename to backend/core/quivr_core/processor/megaparse/markdown_processor.py diff --git a/backend/core/MegaParse/megaparse/multimodal_convertor/__init__.py b/backend/core/quivr_core/processor/megaparse/multimodal_convertor/__init__.py similarity index 100% rename from backend/core/MegaParse/megaparse/multimodal_convertor/__init__.py rename to backend/core/quivr_core/processor/megaparse/multimodal_convertor/__init__.py diff --git a/backend/core/MegaParse/megaparse/multimodal_convertor/megaparse_vision.py b/backend/core/quivr_core/processor/megaparse/multimodal_convertor/megaparse_vision.py similarity index 100% rename from backend/core/MegaParse/megaparse/multimodal_convertor/megaparse_vision.py rename to backend/core/quivr_core/processor/megaparse/multimodal_convertor/megaparse_vision.py diff --git a/backend/core/MegaParse/megaparse/unstructured_convertor.py b/backend/core/quivr_core/processor/megaparse/unstructured_convertor.py similarity index 100% rename from backend/core/MegaParse/megaparse/unstructured_convertor.py rename to backend/core/quivr_core/processor/megaparse/unstructured_convertor.py diff --git a/backend/core/MegaParse/megaparse/utils.py b/backend/core/quivr_core/processor/megaparse/utils.py similarity index 100% rename from backend/core/MegaParse/megaparse/utils.py rename to backend/core/quivr_core/processor/megaparse/utils.py diff --git a/backend/requirements-dev.lock b/backend/requirements-dev.lock index 498ff5d20dc1..2ae36b678b53 100644 --- a/backend/requirements-dev.lock +++ b/backend/requirements-dev.lock @@ -18,9 +18,6 @@ -e file:core # via quivr-api # via quivr-worker --e file:core/MegaParse - # via quivr-core - # via quivr-diff-assistant -e file:worker -e file:worker/diff-assistant # via quivr-worker @@ -447,6 +444,7 @@ langchain-experimental==0.0.64 langchain-openai==0.1.25 # via megaparse # via quivr-api + # via quivr-core # via quivr-diff-assistant langchain-text-splitters==0.2.2 # via langchain @@ -473,6 +471,7 @@ llama-cloud==0.0.13 # via llama-index-indices-managed-llama-cloud llama-index==0.11.12 # via megaparse + # via quivr-core # via quivr-diff-assistant llama-index-agent-openai==0.3.4 # via llama-index @@ -523,6 +522,7 @@ llama-parse==0.5.6 # via llama-index-readers-llama-parse # via megaparse # via quivr-api + # via quivr-core llvmlite==0.43.0 # via numba lxml==5.3.0 @@ -532,6 +532,7 @@ lxml==5.3.0 # via unstructured mammoth==1.8.0 # via megaparse + # via quivr-core markdown==3.7 # via mkdocs # via mkdocs-autorefs @@ -571,6 +572,9 @@ mdit-py-plugins==0.4.1 # via jupytext mdurl==0.1.2 # via markdown-it-py +megaparse==0.0.31 + # via quivr-core + # via quivr-diff-assistant mergedeep==1.3.4 # via mkdocs # via mkdocs-get-deps @@ -778,6 +782,7 @@ pathspec==0.12.1 # via mkdocs pdf2docx==0.5.8 # via megaparse + # via quivr-core pdf2image==1.17.0 # via layoutparser # via unstructured @@ -787,6 +792,7 @@ pdfminer-six==20231228 pdfplumber==0.11.4 # via layoutparser # via megaparse + # via quivr-core pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' # via ipython pgvector==0.3.2 @@ -874,6 +880,7 @@ pycparser==2.22 ; platform_python_implementation != 'PyPy' or implementation_nam # via cffi pycryptodome==3.20.0 # via megaparse + # via quivr-core pydantic==2.8.2 # via anthropic # via chainlit @@ -966,6 +973,7 @@ python-doctr==0.9.0 python-docx==1.1.2 # via megaparse # via pdf2docx + # via quivr-core # via unstructured python-dotenv==1.0.1 # via chainlit @@ -974,6 +982,7 @@ python-dotenv==1.0.1 # via pydantic-settings # via pytest-dotenv # via quivr-api + # via quivr-core # via quivr-diff-assistant # via quivr-worker python-engineio==4.9.1 @@ -995,6 +1004,7 @@ python-oxmsg==0.0.1 # via unstructured python-pptx==1.0.2 # via megaparse + # via quivr-core # via unstructured python-socketio==5.11.3 # via chainlit diff --git a/backend/requirements.lock b/backend/requirements.lock index 7bb40f61eaef..9ce248580205 100644 --- a/backend/requirements.lock +++ b/backend/requirements.lock @@ -18,9 +18,6 @@ -e file:core # via quivr-api # via quivr-worker --e file:core/MegaParse - # via quivr-core - # via quivr-diff-assistant -e file:worker -e file:worker/diff-assistant # via quivr-worker @@ -398,6 +395,7 @@ langchain-experimental==0.0.64 langchain-openai==0.1.25 # via megaparse # via quivr-api + # via quivr-core # via quivr-diff-assistant langchain-text-splitters==0.2.2 # via langchain @@ -420,6 +418,7 @@ llama-cloud==0.0.13 # via llama-index-indices-managed-llama-cloud llama-index==0.11.12 # via megaparse + # via quivr-core # via quivr-diff-assistant llama-index-agent-openai==0.3.4 # via llama-index @@ -470,6 +469,7 @@ llama-parse==0.5.6 # via llama-index-readers-llama-parse # via megaparse # via quivr-api + # via quivr-core llvmlite==0.43.0 # via numba lxml==5.3.0 @@ -479,6 +479,7 @@ lxml==5.3.0 # via unstructured mammoth==1.8.0 # via megaparse + # via quivr-core markdown==3.7 # via mkdocs # via mkdocs-autorefs @@ -516,6 +517,9 @@ mdit-py-plugins==0.4.1 # via jupytext mdurl==0.1.2 # via markdown-it-py +megaparse==0.0.31 + # via quivr-core + # via quivr-diff-assistant mergedeep==1.3.4 # via mkdocs # via mkdocs-get-deps @@ -681,6 +685,7 @@ pathspec==0.12.1 # via mkdocs pdf2docx==0.5.8 # via megaparse + # via quivr-core pdf2image==1.17.0 # via layoutparser # via unstructured @@ -690,6 +695,7 @@ pdfminer-six==20231228 pdfplumber==0.11.4 # via layoutparser # via megaparse + # via quivr-core pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' # via ipython pgvector==0.3.2 @@ -765,6 +771,7 @@ pycparser==2.22 ; platform_python_implementation != 'PyPy' or implementation_nam # via cffi pycryptodome==3.20.0 # via megaparse + # via quivr-core pydantic==2.8.2 # via anthropic # via cohere @@ -837,12 +844,14 @@ python-doctr==0.9.0 python-docx==1.1.2 # via megaparse # via pdf2docx + # via quivr-core # via unstructured python-dotenv==1.0.1 # via litellm # via megaparse # via pydantic-settings # via quivr-api + # via quivr-core # via quivr-diff-assistant # via quivr-worker python-iso639==2024.4.27 @@ -861,6 +870,7 @@ python-oxmsg==0.0.1 # via unstructured python-pptx==1.0.2 # via megaparse + # via quivr-core # via unstructured pytz==2024.1 # via flower diff --git a/backend/worker/pyproject.toml b/backend/worker/pyproject.toml index f4893ed8bee9..92eec4c3c3a0 100644 --- a/backend/worker/pyproject.toml +++ b/backend/worker/pyproject.toml @@ -6,7 +6,7 @@ authors = [ { name = "Stan Girard", email = "stan@quivr.app" } ] dependencies = [ - "quivr-core[all]", + "quivr-core[all,megaparse]", "quivr-api", "quivr-diff-assistant", "celery[redis]>=5.0.0",