From 8172ef12e4439f52d1338f243f3dcad3decd86b9 Mon Sep 17 00:00:00 2001 From: Emmett McFarlane Date: Thu, 2 Jan 2025 11:35:13 -0500 Subject: [PATCH] fix scraper error response parsing --- setup.py | 49 ++++++++++++++++++++++++++-------------------- thepipe/scraper.py | 24 +++++++++++++++++------ 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index 851521f..7e30f9e 100644 --- a/setup.py +++ b/setup.py @@ -1,38 +1,45 @@ from setuptools import setup, find_packages + def read_requirements(file): - with open(file, encoding='utf-8') as f: - return [line.strip() for line in f if line.strip() and not line.startswith('#') and not line.startswith('git+')] + with open(file, encoding="utf-8") as f: + return [ + line.strip() + for line in f + if line.strip() and not line.startswith("#") and not line.startswith("git+") + ] + def read_git_requirements(file): - with open(file, encoding='utf-8') as f: - return [line.strip() for line in f if line.strip().startswith('git+')] + with open(file, encoding="utf-8") as f: + return [line.strip() for line in f if line.strip().startswith("git+")] + setup( - name='thepipe_api', - version='1.3.9', - author='Emmett McFarlane', - author_email='emmett@thepi.pe', - description='AI-native extractor, powered by multimodal LLMs.', - long_description=open('README.md', encoding='utf-8').read(), - long_description_content_type='text/markdown', - url='https://github.com/emcf/thepipe', + name="thepipe_api", + version="1.4.0", + author="Emmett McFarlane", + author_email="emmett@thepi.pe", + description="Document extraction, powered by multimodal LLMs.", + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type="text/markdown", + url="https://github.com/emcf/thepipe", packages=find_packages(), classifiers=[ - 'Programming Language :: Python :: 3', - 'License :: OSI Approved :: MIT License', - 'Operating System :: OS Independent', + "Programming Language :: Python :: 3", + "License :: 
OSI Approved :: MIT License", + "Operating System :: OS Independent", ], - python_requires='>=3.9', - install_requires=read_requirements('requirements.txt'), + python_requires=">=3.9", + install_requires=read_requirements("requirements.txt"), include_package_data=True, entry_points={ - 'console_scripts': [ - 'thepipe=thepipe.__init__:main', + "console_scripts": [ + "thepipe=thepipe.__init__:main", ], }, extras_require={ - 'local': read_requirements('local.txt'), + "local": read_requirements("local.txt"), }, - dependency_links=read_git_requirements('local.txt') + dependency_links=read_git_requirements("local.txt"), ) diff --git a/thepipe/scraper.py b/thepipe/scraper.py index a8797db..23ce2b0 100644 --- a/thepipe/scraper.py +++ b/thepipe/scraper.py @@ -114,10 +114,16 @@ def scrape_file( ), }, ) - if "error" in response.content.decode("utf-8"): - error_message = json.loads(response.content.decode("utf-8"))["error"] - raise ValueError(f"Error scraping {filepath}: {error_message}") response.raise_for_status() + for line in response.iter_lines(decode_unicode=True): + # each line is its own JSON object + if not line.strip(): + continue # skip blank lines + data = json.loads(line) + # If the server sent an error for this chunk, handle it + if "error" in data: + raise ValueError(f"Error scraping: {data['error']}") + chunks = [] for line in response.iter_lines(): if line: @@ -729,10 +735,16 @@ def scrape_url( } data["urls"] = url response = requests.post(endpoint, headers=headers, data=data, stream=True) - if "error" in response.content.decode("utf-8"): - error_message = json.loads(response.content.decode("utf-8"))["error"] - raise ValueError(f"Error scraping {url}: {error_message}") response.raise_for_status() + for line in response.iter_lines(decode_unicode=True): + # each line is its own JSON object + if not line.strip(): + continue # skip blank lines + data = json.loads(line) + # If the server sent an error for this chunk, handle it + if "error" in data: + raise 
ValueError(f"Error scraping: {data['error']}") + results = [] for line in response.iter_lines(): if line: