From 986a7600eea637ceb3be51564a1a9d4fbeff96ee Mon Sep 17 00:00:00 2001
From: Emmett McFaralne <staminacode@gmail.com>
Date: Sat, 23 Mar 2024 21:24:15 -0400
Subject: [PATCH] Add Playwright npm manifest for CI; drop extract_docx debug prints

---
 .github/workflows/python-ci.yml |  2 +-
 .gitignore                      |  3 +-
 extract.py                      |  8 +----
 package-lock.json               | 57 +++++++++++++++++++++++++++++++++
 package.json                    | 18 +++++++++++
 5 files changed, 79 insertions(+), 9 deletions(-)
 create mode 100644 package-lock.json
 create mode 100644 package.json

diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
index 49c66b5..24b8a74 100644
--- a/.github/workflows/python-ci.yml
+++ b/.github/workflows/python-ci.yml
@@ -20,7 +20,7 @@ jobs:
       with:
         node-version: 18
     - name: Install dependencies
-      run: npm ci
+      run: npm install
     - name: Install Playwright Browsers
       run: npx playwright install --with-deps
     - name: Set up Python 3.10
diff --git a/.gitignore b/.gitignore
index f8bbf80..29ea292 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 __pycache__/
 outputs/
-logs/
\ No newline at end of file
+logs/
+node_modules/
\ No newline at end of file
diff --git a/extract.py b/extract.py
index 2754e1c..59efc12 100644
--- a/extract.py
+++ b/extract.py
@@ -287,31 +287,25 @@ def extract_docx(source_name: str) -> List[Chunk]:
     # make new temp image directory
     chunks = []
     temp_image_dir = tempfile.mkdtemp()
-    print('processing')
     text = docx2txt.process(source_name, temp_image_dir)
     chunks.append(Chunk(path=source_name, text=text, image=None, source_type=SourceTypes.DOCX))
     for image_name in os.listdir(temp_image_dir):
-        print(image_name)
         image_path = os.path.join(temp_image_dir, image_name)
-        print('attempgin to open')
         image = Image.open(image_path)
         image.load() # needed to close the file
-        print("appending")
         chunks.append(Chunk(path=source_name, text=None, image=image, source_type=SourceTypes.DOCX))
     # if temp dir exists, remove images and it
-    print('attempting delete')
     if os.path.exists(temp_image_dir):
         for image_name in os.listdir(temp_image_dir):
             image_path = os.path.join(temp_image_dir, image_name)
             os.remove(image_path)
         os.rmdir(temp_image_dir)
-    print('done')
     return chunks
 
 def extract_pptx(source_name: str) -> List[Chunk]:
     prs = Presentation(source_name)
     chunks = []
-    # parse slides, shapes, and images
+    # parse shapes inside slides
     for slide in prs.slides:
         slide_text = ""
         slide_images = []
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 0000000..6d359b8
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,57 @@
+{
+  "name": "thepipe",
+  "version": "1.0.0",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "thepipe",
+      "version": "1.0.0",
+      "license": "ISC",
+      "dependencies": {
+        "playwright": "^1.42.1"
+      }
+    },
+    "node_modules/fsevents": {
+      "version": "2.3.2",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
+      "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+      "hasInstallScript": true,
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+      }
+    },
+    "node_modules/playwright": {
+      "version": "1.42.1",
+      "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.42.1.tgz",
+      "integrity": "sha512-PgwB03s2DZBcNRoW+1w9E+VkLBxweib6KTXM0M3tkiT4jVxKSi6PmVJ591J+0u10LUrgxB7dLRbiJqO5s2QPMg==",
+      "dependencies": {
+        "playwright-core": "1.42.1"
+      },
+      "bin": {
+        "playwright": "cli.js"
+      },
+      "engines": {
+        "node": ">=16"
+      },
+      "optionalDependencies": {
+        "fsevents": "2.3.2"
+      }
+    },
+    "node_modules/playwright-core": {
+      "version": "1.42.1",
+      "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.42.1.tgz",
+      "integrity": "sha512-mxz6zclokgrke9p1vtdy/COWBH+eOZgYUVVU34C73M+4j4HLlQJHtfcqiqqxpP0o8HhMkflvfbquLX5dg6wlfA==",
+      "bin": {
+        "playwright-core": "cli.js"
+      },
+      "engines": {
+        "node": ">=16"
+      }
+    }
+  }
+}
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..3484102
--- /dev/null
+++ b/package.json
@@ -0,0 +1,18 @@
+{
+  "name": "thepipe",
+  "version": "1.0.0",
+  "description": "The Pipe is a simple tool to automate information extraction for multimodal LLMs.",
+  "main": "index.js",
+  "directories": {
+    "test": "tests"
+  },
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "playwright": "^1.42.1"
+  }
+}