diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b76408d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*~
+*.pyc
+*.swp
+
+/build
+/dist
+/*.egg-info/
diff --git a/refine.py b/refine.py
deleted file mode 100644
index b3f93de..0000000
--- a/refine.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# originally written by David Huynh (@dfhuynh)
-#
-# requires installation of urllib2_file from https://github.com/seisen/urllib2_file/#readme
-
-import urllib2_file
-import urllib2, urlparse, os.path, time, json
-
-class Refine:
-  def __init__(self, server='http://127.0.0.1:3333'):
-    self.server = server[0,-1] if server.endswith('/') else server
-
-  def new_project(self, file_path, options=None):
-    file_name = os.path.split(file_path)[-1]
-    project_name = options['project_name'] if options != None and 'project_name' in options else file_name
-    data = {
-      'project-file' : {
-        'fd' : open(file_path),
-        'filename' : file_name
-      },
-      'project-name' : project_name
-    }
-
-    response = urllib2.urlopen(self.server + '/command/core/create-project-from-upload', data)
-    response.read()
-    url_params = urlparse.parse_qs(urlparse.urlparse(response.geturl()).query)
-    if 'project' in url_params:
-      id = url_params['project'][0]
-      return RefineProject(self.server, id, project_name)
-
-    # TODO: better error reporting
-    return None
-
-class RefineProject:
-  def __init__(self, server, id, project_name):
-    self.server = server
-    self.id = id
-    self.project_name = project_name
-
-  def wait_until_idle(self, polling_delay=0.5):
-    while True:
-      response = urllib2.urlopen(self.server + '/command/core/get-processes?project=' + self.id)
-      response_json = json.loads(response.read())
-      if 'processes' in response_json and len(response_json['processes']) > 0:
-        time.sleep(polling_delay)
-      else:
-        return
-
-  def apply_operations(self, file_path, wait=True):
-    fd = open(file_path)
-    operations_json = fd.read()
-
-    data = {
-      'operations' : operations_json
-    }
-    response = urllib2.urlopen(self.server + '/command/core/apply-operations?project=' + self.id, data)
-    response_json = json.loads(response.read())
-    if response_json['code'] == 'error':
-      raise Exception(response_json['message'])
-    elif response_json['code'] == 'pending':
-      if wait:
-        self.wait_until_idle()
-        return 'ok'
-
-    return response_json['code'] # can be 'ok' or 'pending'
-
-  def export_rows(self, format='tsv'):
-    data = {
-      'engine' : '{"facets":[],"mode":"row-based"}',
-      'project' : self.id,
-      'format' : format
-    }
-    response = urllib2.urlopen(self.server + '/command/core/export-rows/' + self.project_name + '.' + format, data)
-    return response.read()
-
-  def delete_project(self):
-    data = {
-      'project' : self.id
-    }
-    response = urllib2.urlopen(self.server + '/command/core/delete-project', data)
-    response_json = json.loads(response.read())
-    return 'code' in response_json and response_json['code'] == 'ok'
\ No newline at end of file
diff --git a/refine/__init__.py b/refine/__init__.py
new file mode 100755
index 0000000..bfbca72
--- /dev/null
+++ b/refine/__init__.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+#
+# Authors:
+# David Huynh (@dfhuynh)
+# Pablo Castellano (@PabloCastellano)
+"""Small client for scripting a Refine server over its HTTP command API."""
+
+import argparse
+import os.path
+import time
+import requests
+
+
+class Refine:
+    """Entry point for talking to a Refine server."""
+
+    def __init__(self, server='http://127.0.0.1:3333'):
+        # Drop a single trailing slash so command paths can be appended as-is.
+        self.server = server[:-1] if server.endswith('/') else server
+
+    def new_project(self, file_path, options=None):
+        """Upload file_path as a new project.
+
+        options may carry a 'project_name'; the file name is used otherwise.
+        Returns a RefineProject, or None when the request did not end on a
+        URL containing '?project='.
+        """
+        file_name = os.path.split(file_path)[-1]
+        project_name = (options or {}).get('project_name', file_name)
+
+        # Keep the upload inside the with block so the file is always closed.
+        with open(file_path, 'rb') as fd:
+            files = {'file': (file_name, fd, 'application/vnd.ms-excel', {'Expires': '0'})}
+            r = requests.post(self.server + '/command/core/create-project-from-upload', files=files)
+
+        if '?project=' in r.request.path_url:
+            _id = r.request.path_url.split('?project=')[1]
+            return RefineProject(self.server, _id, project_name)
+
+        # TODO: better error reporting
+        return None
+
+
+class RefineProject:
+    """Handle on one project living on a Refine server."""
+
+    def __init__(self, server, id, project_name):
+        self.server = server
+        self.id = id
+        self.project_name = project_name
+
+    def wait_until_idle(self, polling_delay=0.5):
+        """Poll every polling_delay seconds until no processes are reported."""
+        while True:
+            r = requests.get(self.server + '/command/core/get-processes?project=' + self.id)
+            response_json = r.json()
+            if 'processes' in response_json and len(response_json['processes']) > 0:
+                time.sleep(polling_delay)
+            else:
+                return
+
+    def apply_operations(self, file_path, wait=True):
+        """Apply the operations read from file_path to the project.
+
+        Returns 'ok' or 'pending'; raises Exception with the server message
+        when the server answers with code 'error'.
+        """
+        with open(file_path) as fd:
+            operations_json = fd.read()
+
+        data = {
+            'operations': operations_json
+        }
+        r = requests.post(self.server + '/command/core/apply-operations?project=' + self.id, data)
+        response_json = r.json()
+        if response_json['code'] == 'error':
+            raise Exception(response_json['message'])
+        elif response_json['code'] == 'pending':
+            if wait:
+                self.wait_until_idle()
+                return 'ok'
+
+        return response_json['code'] # can be 'ok' or 'pending'
+
+    def export_rows(self, format='tsv', printColumnHeader=True):
+        """Return the exported rows in the given format, decoded as UTF-8."""
+        data = {
+            'engine': '{"facets":[],"mode":"row-based"}',
+            'project': self.id,
+            'format' : format,
+            'printColumnHeader': printColumnHeader
+        }
+        r = requests.post(self.server + '/command/core/export-rows/' + self.project_name + '.' + format, data)
+        return r.content.decode("utf-8")
+
+    def export_project(self, format='openrefine.tar.gz'):
+        """Return the exported project archive as raw bytes."""
+        data = {
+            'project' : self.id,
+            'format' : format
+        }
+        r = requests.post(self.server + '/command/core/export-project/' + self.project_name + '.' + format, data)
+        return r.content
+
+    def delete_project(self):
+        """Delete the project; return True when the server answers code 'ok'."""
+        data = {
+            'project': self.id
+        }
+        r = requests.post(self.server + '/command/core/delete-project', data)
+        response_json = r.json()
+        return response_json.get('code', '') == 'ok'
+
+
+def main():
+    """Create a project from a file, apply operations, print the rows, delete it."""
+    parser = argparse.ArgumentParser(description='Apply operations to a CSV file by using the OpenRefine API')
+    parser.add_argument("input", help="Input CSV")
+    parser.add_argument("operations", help="Operations JSON")
+    args = parser.parse_args()
+
+    r = Refine()
+    p = r.new_project(args.input)
+    if p is None:
+        # new_project returns None when no project id came back from the server.
+        parser.error('could not create a project from ' + args.input)
+    p.apply_operations(args.operations)
+    print(p.export_rows())
+    p.delete_project()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f022291
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+requests==2.18.1
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..8a84ebb
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from setuptools import setup
+
+long_description = """
+This allows you to script Refine by creating projects from data files, applying extracted JSON operation histories against the data and then exporting the transformed data back out of Refine.
+"""
+
+def get_install_requires():
+    """
+    parse requirements.txt, ignore links, exclude comments
+    """
+    requirements = []
+    with open('requirements.txt') as req_file:
+        for line in req_file:
+            line = line.rstrip()
+            # skip to next iteration if comment, empty line, link or base include
+            if not line or line.startswith(('#', 'http', 'git')) or line == '-r base.txt':
+                continue
+            # add line to requirements
+            requirements.append(line)
+    return requirements
+
+setup(
+    name='refine',
+    version='0.1',
+    packages=['refine'],
+    entry_points={
+        'console_scripts': ['refine-cli = refine:main']},
+    install_requires=get_install_requires(),
+    # metadata for upload to PyPI
+    author="David Huynh",
+    author_email="",
+    description=("Python client library for Google Refine"),
+    license='MIT',
+    keywords=['OpenRefine', 'CSV', 'data'],
+    url='https://github.com/PabloCastellano/refine-python',
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Environment :: Console',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Text Processing'
+    ],
+    long_description=long_description
+)
diff --git a/test.py b/test.py
deleted file mode 100644
index 8223b4b..0000000
--- a/test.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import sys
-sys.path.append("refine.py")
-import refine
-
-r = refine.Refine()
-p = r.new_project("dates.txt")
-p.apply_operations("operations.json")
-print p.export_rows()
-p.delete_project()
\ No newline at end of file
diff --git a/dates.txt b/tests/dates.csv
similarity index 78%
rename from dates.txt
rename to tests/dates.csv
index e6d9e07..0a4c1fe 100644
--- a/dates.txt
+++ b/tests/dates.csv
@@ -1,4 +1,4 @@
 Date
 7 December 2001
 July 1 2002
-10/20/10
\ No newline at end of file
+10/20/10
diff --git a/operations.json b/tests/operations.json
similarity index 99%
rename from operations.json
rename to tests/operations.json
index bfeda57..1028a40 100644
--- a/operations.json
+++ b/tests/operations.json
@@ -25,4 +25,4 @@
     "repeat": false,
     "repeatCount": 10
   }
-]
\ No newline at end of file
+]
diff --git a/tests/test.py b/tests/test.py
new file mode 100755
index 0000000..1abddd2
--- /dev/null
+++ b/tests/test.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python
+import refine
+
+r = refine.Refine()
+p = r.new_project("dates.csv")
+p.apply_operations("operations.json")
+print(p.export_rows())
+p.delete_project()