From 986a7600eea637ceb3be51564a1a9d4fbeff96ee Mon Sep 17 00:00:00 2001 From: Emmett McFaralne Date: Sat, 23 Mar 2024 21:24:15 -0400 Subject: [PATCH] playwright ci changes --- .github/workflows/python-ci.yml | 2 +- .gitignore | 3 +- extract.py | 8 +---- package-lock.json | 57 +++++++++++++++++++++++++++++++++ package.json | 18 +++++++++++ 5 files changed, 79 insertions(+), 9 deletions(-) create mode 100644 package-lock.json create mode 100644 package.json diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 49c66b5..24b8a74 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -20,7 +20,7 @@ jobs: with: node-version: 18 - name: Install dependencies - run: npm ci + run: npm install - name: Install Playwright Browsers run: npx playwright install --with-deps - name: Set up Python 3.10 diff --git a/.gitignore b/.gitignore index f8bbf80..29ea292 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__/ outputs/ -logs/ \ No newline at end of file +logs/ +node_modules/ \ No newline at end of file diff --git a/extract.py b/extract.py index 2754e1c..59efc12 100644 --- a/extract.py +++ b/extract.py @@ -287,31 +287,25 @@ def extract_docx(source_name: str) -> List[Chunk]: # make new temp image directory chunks = [] temp_image_dir = tempfile.mkdtemp() - print('processing') text = docx2txt.process(source_name, temp_image_dir) chunks.append(Chunk(path=source_name, text=text, image=None, source_type=SourceTypes.DOCX)) for image_name in os.listdir(temp_image_dir): - print(image_name) image_path = os.path.join(temp_image_dir, image_name) - print('attempgin to open') image = Image.open(image_path) image.load() # needed to close the file - print("appending") chunks.append(Chunk(path=source_name, text=None, image=image, source_type=SourceTypes.DOCX)) # if temp dir exists, remove images and it - print('attempting delete') if os.path.exists(temp_image_dir): for image_name in os.listdir(temp_image_dir): image_path = os.path.join(temp_image_dir, image_name) os.remove(image_path) os.rmdir(temp_image_dir) - print('done') return chunks def extract_pptx(source_name: str) -> List[Chunk]: prs = Presentation(source_name) chunks = [] - # parse slides, shapes, and images + # parse shapes inside slides for slide in prs.slides: slide_text = "" slide_images = [] diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..6d359b8 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,57 @@ +{ + "name": "thepipe", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "thepipe", + "version": "1.0.0", + "license": "ISC", + "dependencies": { + "playwright": "^1.42.1" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/playwright": { + "version": "1.42.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.42.1.tgz", + "integrity": "sha512-PgwB03s2DZBcNRoW+1w9E+VkLBxweib6KTXM0M3tkiT4jVxKSi6PmVJ591J+0u10LUrgxB7dLRbiJqO5s2QPMg==", + "dependencies": { + "playwright-core": "1.42.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=16" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.42.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.42.1.tgz", + "integrity": "sha512-mxz6zclokgrke9p1vtdy/COWBH+eOZgYUVVU34C73M+4j4HLlQJHtfcqiqqxpP0o8HhMkflvfbquLX5dg6wlfA==", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=16" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..3484102 --- /dev/null +++ b/package.json @@ -0,0 +1,18 @@ +{ + "name": "thepipe", + "version": "1.0.0", + "description": "The Pipe is a simple tool to automate information extraction for multimodal LLMs.", + "main": "index.js", + "directories": { + "test": "tests" + }, + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [], + "author": "", + "license": "ISC", + "dependencies": { + "playwright": "^1.42.1" + } +}