Skip to content

Commit

Permalink
Allow more for convert to text
Browse files Browse the repository at this point in the history
  • Loading branch information
pseudotensor committed Oct 30, 2024
1 parent e77f54a commit e1caa2e
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 22 deletions.
46 changes: 25 additions & 21 deletions openai_server/agent_tools/convert_document_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ def process_files(files, urls):
from openai_server.agent_tools.common.utils import download_simple

for filename in files + urls:
enable_transcriptions = False
enable_llava = False
if filename.lower().endswith('.pdf'):
if filename in urls:
newfile = download_simple(filename)
Expand All @@ -102,10 +104,12 @@ def process_files(files, urls):
use_pymupdf = 'on'
use_pypdf = 'off'
else:
# pymupdf faster for many pages
enable_pdf_doctr = 'off'
# non-pdf, allow docTR in case, e.g. video
enable_pdf_doctr = 'on'
use_pymupdf = 'on'
use_pypdf = 'off'
enable_transcriptions = True
enable_llava = True

if filename.lower().endswith('.xls') or filename.lower().endswith('.xlsx'):
if filename in urls:
Expand All @@ -123,10 +127,10 @@ def process_files(files, urls):
enable_pdf_ocr='off',
enable_pdf_doctr=enable_pdf_doctr,
try_pdf_as_html='off',
enable_captions=False,
enable_llava=False,
enable_captions=False, # no need if llava used
enable_llava=enable_llava,
chunk=False,
enable_transcriptions=False,
enable_transcriptions=enable_transcriptions,
)
pages1 = [x.page_content for x in sources1]
all_content1 = "\n\n".join(pages1)
Expand All @@ -139,19 +143,19 @@ def process_files(files, urls):
use_pymupdf = 'on'
use_pypdf = 'off'
sources2, known_type = get_data_h2ogpt(filename,
is_url=filename in urls,
verbose=False,
use_pymupdf=use_pymupdf,
use_pypdf=use_pypdf,
use_unstructured_pdf='off',
enable_pdf_ocr='off',
enable_pdf_doctr=enable_pdf_doctr,
try_pdf_as_html='off',
enable_captions=False,
enable_llava=False,
chunk=False,
enable_transcriptions=False,
)
is_url=filename in urls,
verbose=False,
use_pymupdf=use_pymupdf,
use_pypdf=use_pypdf,
use_unstructured_pdf='off',
enable_pdf_ocr='off',
enable_pdf_doctr=enable_pdf_doctr,
try_pdf_as_html='off',
enable_captions=False,
enable_llava=False,
chunk=False,
enable_transcriptions=False,
)

pages2 = [x.page_content for x in sources1]
all_content2 = "\n\n".join(pages2)
Expand Down Expand Up @@ -203,7 +207,8 @@ def main():
f.write(output_text)

print(f"{files + urls} have been converted to text and written to {args.output}")
print("The output may be complex for input of PDFs or URLs etc., so do not assume the structure of the output file and instead check it directly.")
print(
"The output may be complex for input of PDFs or URLs etc., so do not assume the structure of the output file and instead check it directly.")
print("Probably verify any use of convert_document_to_text.py with ask_question_about_documents.py")

max_tokens = 1024
Expand All @@ -222,12 +227,11 @@ def main():
if __name__ == "__main__":
main()


"""
Examples:
wget https://aiindex.stanford.edu/wp-content/uploads/2024/04/HAI_2024_AI-Index-Report.pdf
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --urls http://www.cnn.com
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --files HAI_2024_AI-Index-Report.pdf
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --urls https://aiindex.stanford.edu/wp-content/uploads/2024/04/HAI_2024_AI-Index-Report.pdf
"""
"""
2 changes: 1 addition & 1 deletion src/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "dce9960977e52cc03ae07115e858bdbe308773ed"
__version__ = "e77f54aa6d4f2b1b31a4f1b2cc27b9b0c0033ad6"

0 comments on commit e1caa2e

Please sign in to comment.