Skip to content

Use requests instead of urllib, PEP8 and Python3 compatibility #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
*~
*.pyc
*.swp

/build
/dist
/*.egg-info/
81 changes: 0 additions & 81 deletions refine.py

This file was deleted.

106 changes: 106 additions & 0 deletions refine/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env python
#
# Authors:
# David Huynh (@dfhuynh)
# Pablo Castellano (@PabloCastellano)

import argparse
import os.path
import time
import requests


class Refine:
    """Entry point for talking to a Refine server over HTTP.

    Holds the base URL of the server and creates projects on it.
    """

    def __init__(self, server='http://127.0.0.1:3333'):
        """Store the server base URL without any trailing slash.

        :param server: base URL of the Refine server.
        """
        # Command paths are appended with a leading '/', so the base URL must
        # not end with one.  (Indexing the string with ``[0, -1]`` is a tuple
        # index and raises TypeError, hence the explicit strip.)
        self.server = server.rstrip('/')

    def new_project(self, file_path, options=None):
        """Create a project by uploading the file at *file_path*.

        :param file_path: path of the data file to upload.
        :param options: optional dict; only the ``'project_name'`` key is
            read, and it defaults to the file's base name.
        :returns: a :class:`RefineProject`, or ``None`` when the URL of the
            final request does not contain ``?project=``.
        """
        file_name = os.path.split(file_path)[-1]
        project_name = (options or {}).get('project_name', file_name)

        url = self.server + '/command/core/create-project-from-upload'
        # Keep the file open only for the duration of the upload.
        with open(file_path, 'rb') as upload:
            files = {'file': (file_name, upload, 'application/vnd.ms-excel', {'Expires': '0'})}
            r = requests.post(url, files=files)

        # The project id is taken from the query string of the final request
        # (presumably the server redirects to the new project's page).
        if '?project=' in r.request.path_url:
            _id = r.request.path_url.split('?project=')[1]
            return RefineProject(self.server, _id, project_name)

        # TODO: better error reporting
        return None


class RefineProject:
    """Handle on one project living on a Refine server.

    Every method issues HTTP requests against ``server`` using the
    project ``id``.
    """

    def __init__(self, server, id, project_name):
        """Remember where the project lives and how it is named.

        :param server: base URL of the server (no trailing slash).
        :param id: project identifier, as a string (it is concatenated
            into request URLs).
        :param project_name: name used to build export URLs.
        """
        self.server = server
        self.id = id
        self.project_name = project_name

    def wait_until_idle(self, polling_delay=0.5):
        """Block until the server reports no running processes.

        :param polling_delay: seconds to sleep between two polls.
        """
        url = self.server + '/command/core/get-processes?project=' + self.id
        while True:
            processes = requests.get(url).json().get('processes')
            if not processes:
                return
            time.sleep(polling_delay)

    def apply_operations(self, file_path, wait=True):
        """Apply the operations stored in the file at *file_path*.

        :param file_path: path of a file whose content is sent verbatim as
            the ``operations`` form field.
        :param wait: when the server answers ``'pending'``, poll until the
            project is idle before returning.
        :returns: ``'ok'``, or ``'pending'`` when *wait* is false and the
            operations are still running.
        :raises Exception: with the server's message when it answers
            ``'error'``.
        """
        # Read inside a context manager so the descriptor is always closed.
        with open(file_path) as fd:
            operations_json = fd.read()

        data = {
            'operations': operations_json
        }
        r = requests.post(self.server + '/command/core/apply-operations?project=' + self.id, data)
        response_json = r.json()
        code = response_json['code']
        if code == 'error':
            raise Exception(response_json['message'])
        if code == 'pending' and wait:
            self.wait_until_idle()
            return 'ok'

        return code  # can be 'ok' or 'pending'

    def export_rows(self, format='tsv', printColumnHeader=True):
        """Export the project's rows and return them as text.

        :param format: export format, also used as the URL extension.
        :param printColumnHeader: forwarded as the ``printColumnHeader``
            form field.
        :returns: the response body decoded as UTF-8.
        """
        data = {
            'engine': '{"facets":[],"mode":"row-based"}',
            'project': self.id,
            'format': format,
            'printColumnHeader': printColumnHeader
        }
        r = requests.post(self.server + '/command/core/export-rows/' + self.project_name + '.' + format, data)
        return r.content.decode("utf-8")

    def export_project(self, format='openrefine.tar.gz'):
        """Export the whole project and return the raw response body.

        :param format: export format, also used as the URL extension.
        :returns: the response body as bytes.
        """
        data = {
            'project': self.id,
            'format': format
        }
        # Uses requests like every other method; urllib2 is not imported in
        # this module (and does not exist on Python 3).
        r = requests.post(self.server + '/command/core/export-project/' + self.project_name + '.' + format, data)
        return r.content

    def delete_project(self):
        """Delete the project on the server.

        :returns: ``True`` when the server answers with code ``'ok'``.
        """
        data = {
            'project': self.id
        }
        r = requests.post(self.server + '/command/core/delete-project', data)
        response_json = r.json()
        return response_json.get('code', '') == 'ok'


def main():
    """Command-line entry point.

    Uploads the input file as a new project on the default server, applies
    the operations file to it, prints the exported rows and finally deletes
    the project.

    :raises SystemExit: when the project could not be created.
    """
    parser = argparse.ArgumentParser(description='Apply operations to a CSV file by using the OpenRefine API')
    parser.add_argument("input", help="Input CSV")
    # The operations file is sent as the JSON 'operations' field, not a CSV.
    parser.add_argument("operations", help="Operations JSON")
    args = parser.parse_args()

    r = Refine()
    p = r.new_project(args.input)
    if p is None:
        # new_project() returns None on failure; stop with a readable
        # message instead of an AttributeError on the next call.
        raise SystemExit('Could not create a project from ' + args.input)

    try:
        p.apply_operations(args.operations)
        print(p.export_rows())
    finally:
        # Do not leave the temporary project behind if a step above fails.
        p.delete_project()


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
requests==2.18.1
47 changes: 47 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from setuptools import setup

# Long-form package description; starts and ends with a newline.
long_description = (
    "\n"
    "This allows you to script Refine by creating projects from data files, "
    "applying extracted JSON operation histories against the data and then "
    "exporting the transformed data back out of Refine."
    "\n"
)

def get_install_requires():
    """
    Parse requirements.txt, ignore links, exclude comments.

    The file is opened relative to the current working directory.  Returns
    the kept lines, in file order, with trailing whitespace removed.
    """
    # Comments and link-style requirements (http..., git...) are skipped.
    skipped_prefixes = ('#', 'http', 'git')
    requirements = []
    # The context manager closes the file even if reading fails.
    with open('requirements.txt') as req_file:
        for line in req_file:
            line = line.rstrip()
            # skip to next iteration if comment, link, include or empty line
            if line == '' or line == '-r base.txt' or line.startswith(skipped_prefixes):
                continue
            # add line to requirements
            requirements.append(line)
    return requirements

# Register the package with setuptools.
setup(
    # what gets installed
    name='refine',
    version='0.1',
    packages=['refine'],
    install_requires=get_install_requires(),
    entry_points={
        'console_scripts': ['refine-cli = refine:main'],
    },
    # metadata for upload to PyPI
    description="Python client library for Google Refine",
    long_description=long_description,
    author="David Huynh",
    author_email="",
    url='https://github.com/PabloCastellano/refine-python',
    license='MIT',
    keywords=['OpenRefine', 'CSV', 'data'],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Text Processing',
    ],
)
9 changes: 0 additions & 9 deletions test.py

This file was deleted.

2 changes: 1 addition & 1 deletion dates.txt → tests/dates.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Date
7 December 2001
July 1 2002
10/20/10
10/20/10
2 changes: 1 addition & 1 deletion operations.json → tests/operations.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@
"repeat": false,
"repeatCount": 10
}
]
]
8 changes: 8 additions & 0 deletions tests/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env python
import os.path

import refine

# Locate the fixtures next to this script so it works from any working
# directory, not only when started from inside tests/.
HERE = os.path.dirname(os.path.abspath(__file__))

r = refine.Refine()
p = r.new_project(os.path.join(HERE, "dates.csv"))
try:
    p.apply_operations(os.path.join(HERE, "operations.json"))
    print(p.export_rows())
finally:
    # Always remove the project created on the server for this run.
    p.delete_project()