Skip to content
This repository has been archived by the owner on Dec 8, 2024. It is now read-only.

port to python 3.7 and use requests #8

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions google/refine/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#! /usr/bin/env python
#!/usr/bin/env python3
"""
Script to provide a command line interface to a Refine server.
"""
@@ -210,14 +210,14 @@ def main():

# get project_id
if args and not str.isdigit(args[0]):
projects = refine.Refine(refine.RefineServer()).list_projects().items()
projects = list(refine.Refine(refine.RefineServer()).list_projects().items())
idlist = []
for project_id, project_info in projects:
if args[0].decode('UTF-8') == project_info['name']:
if args[0] == project_info['name']:
idlist.append(str(project_id))
if len(idlist) > 1:
print('Error: Found %s projects with name %s.\n'
'Please specify project by id.' % (len(idlist), args[0]))
print(('Error: Found %s projects with name %s.\n'
'Please specify project by id.' % (len(idlist), args[0])))
for i in idlist:
print('')
cli.info(i)
@@ -226,8 +226,8 @@ def main():
try:
project_id = idlist[0]
except IndexError:
print('Error: No project found with name %s.\n'
'Try command --list' % args[0])
print(('Error: No project found with name %s.\n'
'Try command --list' % args[0]))
return
elif args:
project_id = args[0]
@@ -240,11 +240,11 @@ def main():
elif options.create:
group5_dict = {group5_arg.dest: getattr(options, group5_arg.dest)
for group5_arg in group5.option_list}
kwargs = {k: v for k, v in group5_dict.items()
kwargs = {k: v for k, v in list(group5_dict.items())
if v is not None and v not in ['true', 'false']}
kwargs.update({k: True for k, v in group5_dict.items()
kwargs.update({k: True for k, v in list(group5_dict.items())
if v == 'true'})
kwargs.update({k: False for k, v in group5_dict.items()
kwargs.update({k: False for k, v in list(group5_dict.items())
if v == 'false'})
if options.file_format:
kwargs.update({'project_format': options.file_format})
@@ -259,11 +259,11 @@ def main():
elif args and options.template:
group6_dict = {group6_arg.dest: getattr(options, group6_arg.dest)
for group6_arg in group6.option_list}
kwargs = {k: v for k, v in group6_dict.items()
kwargs = {k: v for k, v in list(group6_dict.items())
if v is not None and v not in ['true', 'false']}
kwargs.update({k: True for k, v in group6_dict.items()
kwargs.update({k: True for k, v in list(group6_dict.items())
if v == 'true'})
kwargs.update({k: False for k, v in group6_dict.items()
kwargs.update({k: False for k, v in list(group6_dict.items())
if v == 'false'})
cli.templating(project_id, options.template,
output_file=options.output, **kwargs)
77 changes: 39 additions & 38 deletions google/refine/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#! /usr/bin/env python
#!/usr/bin/env python3
"""
Functions used by the command line interface (CLI)
"""
@@ -24,7 +24,8 @@
import ssl
import sys
import time
import urllib
import requests
import urllib.request, urllib.parse, urllib.error
from xml.etree import ElementTree

from google.refine import refine
@@ -38,8 +39,8 @@ def apply(project_id, history_file):
raise Exception('Failed to apply %s to %s: %s' %
(history_file, project_id, response))
else:
print('File %s has been successfully applied to project %s' %
(history_file, project_id))
print(('File %s has been successfully applied to project %s' %
(history_file, project_id)))


def create(project_file,
@@ -113,7 +114,7 @@ def create(project_file,
sheets = [0]
# TODO: new format for sheets option introduced in OpenRefine 2.8
# execute
kwargs = {k: v for k, v in vars().items() if v is not None}
kwargs = {k: v for k, v in list(vars().items()) if v is not None}
project = refine.Refine(refine.RefineServer()).new_project(
guess_cell_value_types=guessCellValueTypes,
ignore_lines=ignoreLines,
@@ -127,8 +128,8 @@ def create(project_file,
**kwargs)
rows = project.do_json('get-rows')['total']
if rows > 0:
print('{0}: {1}'.format('id', project.project_id))
print('{0}: {1}'.format('rows', rows))
print(('{0}: {1}'.format('id', project.project_id)))
print(('{0}: {1}'.format('rows', rows)))
return project
else:
raise Exception(
@@ -144,22 +145,23 @@ def delete(project_id):
raise Exception('Failed to delete %s: %s' %
(project_id, response))
else:
print('Project %s has been successfully deleted' % project_id)
print(('Project %s has been successfully deleted' % project_id))


def download(url, output_file=None):
    """Download ``url`` into a local file.

    Args:
        url: Address of the resource to fetch.
        output_file: Destination path. When empty or ``None`` the name is
            taken from ``os.path.basename(url)``.

    Returns:
        None. If the destination already exists an error is printed and
        nothing is fetched or overwritten.

    Raises:
        requests.exceptions.HTTPError: if the server answers with an error
            status; no output file is created in that case.
    """
    if not output_file:
        output_file = os.path.basename(url)
    if os.path.exists(output_file):
        print('Error: File %s already exists.\n'
              'Delete existing file or try command --output '
              'to specify a different filename.' % output_file)
        return
    # The unverified SSL context that used to be built here was never handed
    # to the request, so it has been dropped instead of kept as dead code.
    response = requests.get(url, stream=True)
    try:
        # Fail before touching the disk so an HTTP error page is never
        # saved under the requested file name.
        response.raise_for_status()
        with open(output_file, 'wb') as fo:
            # Write in chunks to avoid holding the whole body in memory.
            for chunk in response.iter_content(chunk_size=65536):
                fo.write(chunk)
    finally:
        response.close()
    print('Download to file %s complete' % output_file)


def export(project_id, encoding=None, output_file=None, export_format=None):
@@ -171,7 +173,7 @@ def export(project_id, encoding=None, output_file=None, export_format=None):
if export_format in ['csv', 'tsv', 'txt']:
encoding = 'UTF-8'
sys.stdout.write(project.export(
export_format=export_format, encoding=encoding).read())
export_format=export_format, encoding=encoding).text)
else:
ext = os.path.splitext(output_file)[1][1:]
if ext:
@@ -180,42 +182,41 @@ def export(project_id, encoding=None, output_file=None, export_format=None):
encoding = 'UTF-8'
with open(output_file, 'wb') as f:
f.write(project.export(
export_format=export_format, encoding=encoding).read())
print('Export to file %s complete' % output_file)

export_format=export_format, encoding=encoding).content)
print(('Export to file %s complete' % output_file))

def info(project_id):
    """Print the metadata and column names of one project.

    Looks ``project_id`` up in the server's project list. For a known id it
    prints the id, the project URL, every truthy metadata entry and the
    numbered column names; otherwise it prints an error message.

    Args:
        project_id: Id of the project, as a string (it is concatenated into
            the project URL).
    """
    projects = refine.Refine(refine.RefineServer()).list_projects()
    # Membership is tested on the mapping itself; no key list is copied.
    if project_id not in projects:
        print('Error: No project found with id %s.\n'
              'Check existing projects with command --list' % (project_id,))
        return

    row = '{0:>20}: {1}'
    project_url = ('http://' + refine.REFINE_HOST + ':' +
                   refine.REFINE_PORT +
                   '/project?project=' + project_id)
    print(row.format('id', project_id))
    print(row.format('url', project_url))
    for key, value in projects[project_id].items():
        if value:  # skip empty metadata entries
            print(row.format(key, value))

    project_model = refine.RefineProject(project_id).get_models()
    columns = [c['name'] for c in project_model['columnModel']['columns']]
    for number, column in enumerate(columns, start=1):
        print(row.format('column ' + str(number).zfill(3), column))


def ls():
    """Query the server and list projects, most recently modified first.

    Prints one ``id: name`` line per project, or an error message when the
    server reports no projects.
    """
    # Local import: only this function needs it.
    import calendar

    def date_to_epoch(json_dt):
        """Convert a JSON date time into seconds-since-epoch."""
        # The trailing 'Z' marks UTC, so use timegm; time.mktime would
        # interpret the parsed struct as local time.
        return calendar.timegm(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))

    projects = sorted(
        refine.Refine(refine.RefineServer()).list_projects().items(),
        key=lambda item: date_to_epoch(item[1]['modified']),
        reverse=True)
    if not projects:
        print('Error: No projects found')
        return
    for project_id, project_info in projects:
        print('{0:>14}: {1}'.format(project_id, project_info['name']))

@@ -267,11 +268,11 @@ def templating(project_id,
# normal output
if not output_file:
sys.stdout.write(project.export_templating(
**templateconfig).read())
**templateconfig).text)
else:
with open(output_file, 'wb') as f:
f.write(project.export_templating(**templateconfig).read())
print('Export to file %s complete' % output_file)
f.write(project.export_templating(**templateconfig).content)
print(('Export to file %s complete' % output_file))
else:
# splitToFiles functionality
prefix = templateconfig['prefix']
@@ -294,7 +295,7 @@ def templating(project_id,
'rowSeparator': '\n',
'encoding': encoding}
ids = [line.rstrip('\n') for line in project.export_templating(
**ids_templateconfig) if line.rstrip('\n')]
**ids_templateconfig).text if line.rstrip('\n')]
# generate common config
if mode == 'record-based':
# record-based: split-character into template
@@ -316,19 +317,19 @@ def templating(project_id,
'rowSeparator': ''})
# execute
records = project.export_templating(
**templateconfig).read().split(split)
**templateconfig).text.split(split)
del records[0] # skip first blank entry
if suffixById:
for index, record in enumerate(records):
output_file = base + '_' + ids[index] + '.' + ext
with open(output_file, 'wb') as f:
with open(output_file, 'w') as f:
f.writelines([prefix, record, suffix])
print('Export to files complete. Last file: %s' % output_file)
print(('Export to files complete. Last file: %s' % output_file))
else:
zeros = len(str(len(records)))
for index, record in enumerate(records):
output_file = base + '_' + \
str(index + 1).zfill(zeros) + '.' + ext
with open(output_file, 'wb') as f:
with open(output_file, 'w') as f:
f.writelines([prefix, record, suffix])
print('Export to files complete. Last file: %s' % output_file)
print(('Export to files complete. Last file: %s' % output_file))
17 changes: 10 additions & 7 deletions google/refine/facet.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""
OpenRefine Facets, Engine, and Facet Responses.
"""
@@ -40,11 +40,11 @@ def __init__(self, column, facet_type, **options):
self.type = facet_type
self.name = column
self.column_name = column
for k, v in options.items():
for k, v in list(options.items()):
setattr(self, k, v)

def as_dict(self):
    """Return the instance attributes as a dict, dropping ``None`` values.

    Every attribute name is passed through ``to_camel`` to form the key.
    """
    return {to_camel(k): v for k, v in self.__dict__.items()
            if v is not None}


@@ -159,8 +159,8 @@ class FacetResponse(object):
"""Class for unpacking an individual facet response."""
def __init__(self, facet):
self.name = None
for k, v in facet.items():
if isinstance(k, bool) or isinstance(k, basestring):
for k, v in list(facet.items()):
if isinstance(k, bool) or isinstance(k, str):
setattr(self, from_camel(k), v)
self.choices = {}

@@ -208,7 +208,10 @@ def __getitem__(self, index):
return self.facets[index]

self.facets = FacetResponseContainer(facets['facets'])
self.mode = facets['mode']
if facets.get('mode'):
self.mode = facets['mode']
else:
self.mode = facets['engine-mode']


class Engine(object):
@@ -268,7 +271,7 @@ def __init__(self, criteria=None):
criteria = [criteria]
for criterion in criteria:
# A string criterion defaults to a string sort on that column
if isinstance(criterion, basestring):
if isinstance(criterion, str):
criterion = {
'column': criterion,
'valueType': 'string',
2 changes: 1 addition & 1 deletion google/refine/history.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""
OpenRefine history: parsing responses.
"""
78 changes: 46 additions & 32 deletions google/refine/refine.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""
Client library to communicate with a Refine server.
"""
@@ -23,12 +23,13 @@
import gzip
import os
import re
import StringIO
try:
import io
except:
from io import StringIO, BytesIO
import time
import urllib
import urllib2_file
import urllib2
import urlparse
import requests
import urllib.request, urllib.parse, urllib.error

from google.refine import facet
from google.refine import history
@@ -54,7 +55,7 @@ def __init__(self, server=None):
self.server = server[:-1] if server.endswith('/') else server
self.__version = None # see version @property below

def urlopen(self, command, data=None, params=None, project_id=None):
def urlopen(self, command, data=None, params=None, project_id=None, files=None):
"""Open a Refine URL with optional query params and POST data.
data: POST data dict
@@ -74,32 +75,44 @@ def urlopen(self, command, data=None, params=None, project_id=None):
else:
params['project'] = project_id
if params:
url += '?' + urllib.urlencode(params)
req = urllib2.Request(url)
if data:
req.add_data(data) # data = urllib.urlencode(data)
#req.add_header('Accept-Encoding', 'gzip')
url += '?' + urllib.parse.urlencode(params)
req = urllib.request.Request(url)

try:
response = urllib2.urlopen(req)
except urllib2.HTTPError as e:
raise Exception('HTTP %d "%s" for %s\n\t%s' % (e.code, e.msg, e.geturl(), data))
except urllib2.URLError as e:
raise urllib2.URLError(
if not data:
response = requests.get(url)
else:
response = requests.post(url, data=data, files=files)
response.raise_for_status()
except requests.exceptions.HTTPError as e:
raise Exception('HTTP Error: %s' % (e))
except requests.exceptions.URLRequired as e:
raise requests.exceptions.URLRequired(
'%s for %s. No Refine server reachable/running; ENV set?' %
(e.reason, self.server))
if response.info().get('Content-Encoding', None) == 'gzip':
(e, self.server))

if response.encoding == 'gzip':
# Need a seekable filestream for gzip
gzip_fp = gzip.GzipFile(fileobj=StringIO.StringIO(response.read()))
gzip_fp = gzip.GzipFile(fileobj=io.StringIO(response.read()))
# XXX Monkey patch response's filehandle. Better way?
urllib.addbase.__init__(response, gzip_fp)
return response


def urlopen_json(self, *args, **kwargs):
    """Open a Refine URL, optionally POST data, and return parsed JSON.

    All arguments are forwarded unchanged to ``self.urlopen``; the returned
    response's ``json()`` result is the return value.

    Raises:
        Exception: if the payload is a JSON object whose ``code`` is neither
            ``'ok'`` nor ``'pending'``. The text is
            ``'server <code>:\\n<hint>'``, where the hint is the server's
            ``message`` and ``stack`` joined together, or the whole payload
            when both are missing or empty.
    """
    response = self.urlopen(*args, **kwargs).json()
    # Only JSON objects can carry a status code; lists and scalars pass
    # straight through.
    if (isinstance(response, dict) and 'code' in response
            and response['code'] not in ('ok', 'pending')):
        # str() keeps a non-string message, stack or code from raising
        # TypeError while the error text is being built.
        error_hint = ''.join(str(response[key])
                             for key in ('message', 'stack')
                             if response.get(key))
        if not error_hint:
            error_hint = str(response)
        raise Exception('server ' + str(response['code']) + ':\n' +
                        error_hint)
    return response

@@ -256,24 +269,25 @@ def s(opt):
'include-file-sources': s(include_file_sources),
}

files = None
if project_url is not None:
options['url'] = project_url
elif project_file is not None:
options['project-file'] = {
'fd': open(project_file),
'filename': project_file,
}
files = {'project-file': open(project_file, 'r')}

if project_name is None:
# make a name for itself by stripping extension and directories
project_name = (project_file or 'New project').rsplit('.', 1)[0]
project_name = os.path.basename(project_name)
options['project-name'] = project_name
response = self.server.urlopen(
'create-project-from-upload', options, params
'create-project-from-upload', options, params, files=files
)
if project_file:
files['project-file'].close()
# expecting a redirect to the new project containing the id in the url
url_params = urlparse.parse_qs(
urlparse.urlparse(response.geturl()).query)
url_params = urllib.parse.parse_qs(
urllib.parse.urlparse(response.url).query)
if 'project' in url_params:
project_id = url_params['project'][0]
return RefineProject(self.server, project_id)
@@ -430,7 +444,7 @@ def apply_operations(self, file_path, wait=True):
def export(self, encoding=None, export_format='tsv'):
"""Return a fileobject of a project's data."""
url = ('export-rows/' +
urllib.quote(self.project_name().encode('utf8')) +
urllib.parse.quote(self.project_name().encode('utf8')) +
'.' + export_format)
data = {'format': export_format}
if encoding:
@@ -441,7 +455,7 @@ def export_templating(self, encoding=None, engine='', prefix='',
template='', rowSeparator='\n', suffix=''):
"""Return a fileobject of a project's data in templating mode."""
url = ('export-rows/' +
urllib.quote(self.project_name().encode('utf8')) +
urllib.parse.quote(self.project_name().encode('utf8')) +
'.' + 'txt')
data = {'format': 'template',
'template': template,
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
urllib2_file>=0.2.1
requests
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -35,8 +35,8 @@ def read(filename):
author_email='felix.lohmeier@opencultureconsulting.com',
url='https://github.com/opencultureconsulting/openrefine-client',
packages=find_packages(exclude=['tests']),
install_requires=['urllib2_file'],
python_requires='>=2.7, !=3.*',
install_requires=['requests'],
python_requires='>=3.0,<4',
entry_points={
'console_scripts': [ 'openrefine-client = google.refine.__main__:main' ]
},
22 changes: 11 additions & 11 deletions tests/test_refine.py
Original file line number Diff line number Diff line change
@@ -13,7 +13,9 @@
import unittest

from google.refine import refine
from tests import refinetest
import refinetest

from io import StringIO


class RefineServerTest(refinetest.RefineTestCase):
@@ -37,7 +39,7 @@ def test_get_version(self):
self.assertTrue(item in version_info)

def test_version(self):
    """The server must report one of the supported versions."""
    # ('3.2') without a trailing comma is just the string '3.2', which
    # turned ``in`` into a substring test that also accepted '3', '.2' or
    # ''. A real one-element tuple makes this an exact match.
    self.assertIn(self.server.version, ('3.2',))


class RefineTest(refinetest.RefineTestCase):
@@ -59,21 +61,19 @@ def test_delete_project(self):
self.assertTrue(self.project.delete())

def test_open_export(self):
    """Export the project and check the header and every data line."""
    response = refine.RefineProject(self.project.project_url()).export()
    lines = response.text.splitlines()
    # The first line is the header and must name the email column.
    self.assertIn('email', lines[0])
    # Every remaining line must contain an 'M' or an 'F'.
    for line in lines[1:]:
        self.assertTrue('M' in line or 'F' in line)

def test_open_export_csv(self):
    """Parse the export as tab-separated values and check its columns."""
    response = refine.RefineProject(self.project.project_url()).export()
    csv_fp = csv.reader(StringIO(response.text), dialect='excel-tab')
    # Use the next() builtin rather than calling __next__ directly.
    header = next(csv_fp)
    self.assertEqual(header[0], 'email')
    for row in csv_fp:
        self.assertIn(row[3], ('F', 'M'))


if __name__ == '__main__':
2 changes: 1 addition & 1 deletion tests/test_refine_small.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""
test_refine_small.py
"""
18 changes: 9 additions & 9 deletions tests/test_tutorial.py
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@
import unittest

from google.refine import facet
from tests import refinetest
import refinetest


class TutorialTestFacets(refinetest.RefineTestCase):
@@ -138,7 +138,7 @@ def test_editing(self):
# {2}
self.project.text_transform(column='Zip Code 2',
expression='value.toString()[0, 5]')
self.assertInResponse('transform on 6067 cells in column Zip Code 2')
self.assertInResponse('transform on 6958 cells in column Zip Code 2')
# {3} - XXX history
# {4}
office_title_facet = facet.TextFacet('Office Title')
@@ -156,14 +156,14 @@ def test_editing(self):
self.assertEqual(len(response.facets[office_title_facet].choices), 66)
# {6}
response = self.project.compute_clusters('Office Title')
self.assertTrue(not response)
self.assertTrue(response)
# {7}
clusters = self.project.compute_clusters('Office Title', 'knn')
self.assertEqual(len(clusters), 7)
first_cluster = clusters[0]
self.assertEqual(len(first_cluster), 2)
self.assertEqual(first_cluster[0]['value'], 'RSCC Member')
self.assertEqual(first_cluster[0]['count'], 233)
self.assertEqual(first_cluster[0]['value'], 'DPEC Member at Large')
self.assertEqual(first_cluster[0]['count'], 6)
# Not strictly necessary to repeat 'Council Member' but a test
# of mass_edit, and it's also what the front end sends.
self.project.mass_edit('Office Title', [{
@@ -194,9 +194,9 @@ def test_editing(self):
# {5}, {6}, {7}
response = self.project.compute_facets(facet.StarredFacet(True))
self.assertEqual(len(response.facets[0].choices), 2) # true & false
self.assertEqual(response.facets[0].choices[True].count, 3)
self.assertEqual(response.facets[0].choices[True].count, 2)
self.project.remove_rows()
self.assertInResponse('3 rows')
self.assertInResponse('2 rows')


class TutorialTestDuplicateDetection(refinetest.RefineTestCase):
@@ -214,7 +214,7 @@ def test_duplicate_detection(self):
self.assertInResponse('Reorder rows')
response = self.project.get_rows()
indexes = [row.index for row in response.rows]
self.assertEqual(indexes, range(10))
self.assertEqual(indexes, list(range(10)))
# {10}
self.project.add_column(
'email', 'count', 'facetCount(value, "value", "email")')
@@ -393,7 +393,7 @@ def test_transpose_variable_number_of_rows_into_columns(self):
'Column', 'row.record.cells["Column"].value[1, -1].join("|")')
self.assertInResponse('18 cells')
# {26}
self.project.engine.mode = 'row-based'
self.project.engine.mode = 'fd'
# {27}
blank_facet = facet.BlankFacet('First Line', selection=True)
self.project.remove_rows(blank_facet)