Skip to content

Commit

Permalink
Merge pull request #338 from Crunch-io/case-288/replace-from-csv
Browse files Browse the repository at this point in the history
adding replace_from_csv method in BaseDataset
  • Loading branch information
xbito authored Mar 24, 2019
2 parents 249351a + fafaad7 commit f4830cb
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 64 deletions.
71 changes: 45 additions & 26 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,38 +1,57 @@
dist: trusty
dist: xenial
sudo: false
language: python

python:
- 2.7
- 3.4
- 3.5
- &latest_py3 3.6

jobs:
fast_finish: true
include:
- stage: deploy
if: tag IS present
python: *latest_py3
install: skip
script: skip
deploy:
provider: pypi
on:
tags: true
all_branches: true
user: jaraco
password:
secure: f+AUtW6EUqe9DmbjpbJO0q/wesmpyS3zaeKAxENu2klRsV1xpKj55HBj/RP8EIHDrJHi2+GkF0Ml5BRXV8rtAzLSIP5OHWNlBWUPzJzZ+iaTHQAKXWG2po1b3SR+4t/of89tcmr7QybZmibApzT3Vs1Uq66IZ2mRdYN7w9U1TOpgFU1ZGxWgMXQEg9B8/IPlkWFP4pSSOkd+ASNlr9I3jWSpCsBYLSLKs9exUez8UScxmFn0i9Rbql0y/yiT/yagBDgU+T7f5H6KEQr30V0taPpiE+NFpV0iHtR4NnUfYHqtK2nvPmDWQqSGy80U/gwSsXR44k4trSQQBdrp1laO5/XHr5QHrcejrHIu6PYxyAXzMwX611Cll6Dsi+gakPZZCRWgek5Hp96RIBSBuwnjCypsr33T8GRtRrpfRvNLA8ouKhcl5CoWoPocRwyWxPXLOpG3uNbEQNApdbCKBkCHpq+pT4Xa8l7yWFS8nGC5CaGuX9NCkA5njs4RlwpKZpxi0vvr6Iwygq8hYGnKBLg7zCvqJKq8AL1iGKkwxlx4TBIlNcKuZZELKvLRYteg8JKwI+otghgHox+V51jB15j3yBrBqm1rSWT+dk4z03shFYAInBZjD7Lt/CoRd9it8ttgxbX+dVzsy5l4qWY7f5F80UftyClmIcVhbq27FudoKsk=
distributions: dists
skip_cleanup: true
skip_upload_docs: true
- stage: test
python: 2.7
env: TOXENV=py27
- stage: test-pandas
python: 2.7
env: TOXENV=py27-pandas
- stage: test
python: 3.4
env: TOXENV=py34
- stage: test
python: 3.5
env: TOXENV=py35
- stage: test-pandas
python: 3.5
env: TOXENV=py35-pandas
- stage: test
python: 3.6
env: TOXENV=py36
- stage: test-pandas
python: 3.6
env: TOXENV=py36-pandas
- stage: test
python: 3.7
env: TOXENV=py37
- stage: test-pandas
python: 3.7
env: TOXENV=py37-pandas
- stage: deploy
if: tag IS present
python: *latest_py3
install: skip
script: skip
deploy:
provider: pypi
on:
tags: true
all_branches: true
user: jaraco
password:
secure: f+AUtW6EUqe9DmbjpbJO0q/wesmpyS3zaeKAxENu2klRsV1xpKj55HBj/RP8EIHDrJHi2+GkF0Ml5BRXV8rtAzLSIP5OHWNlBWUPzJzZ+iaTHQAKXWG2po1b3SR+4t/of89tcmr7QybZmibApzT3Vs1Uq66IZ2mRdYN7w9U1TOpgFU1ZGxWgMXQEg9B8/IPlkWFP4pSSOkd+ASNlr9I3jWSpCsBYLSLKs9exUez8UScxmFn0i9Rbql0y/yiT/yagBDgU+T7f5H6KEQr30V0taPpiE+NFpV0iHtR4NnUfYHqtK2nvPmDWQqSGy80U/gwSsXR44k4trSQQBdrp1laO5/XHr5QHrcejrHIu6PYxyAXzMwX611Cll6Dsi+gakPZZCRWgek5Hp96RIBSBuwnjCypsr33T8GRtRrpfRvNLA8ouKhcl5CoWoPocRwyWxPXLOpG3uNbEQNApdbCKBkCHpq+pT4Xa8l7yWFS8nGC5CaGuX9NCkA5njs4RlwpKZpxi0vvr6Iwygq8hYGnKBLg7zCvqJKq8AL1iGKkwxlx4TBIlNcKuZZELKvLRYteg8JKwI+otghgHox+V51jB15j3yBrBqm1rSWT+dk4z03shFYAInBZjD7Lt/CoRd9it8ttgxbX+dVzsy5l4qWY7f5F80UftyClmIcVhbq27FudoKsk=
distributions: dists
skip_cleanup: true
skip_upload_docs: true

cache: pip

install:
- pip install tox tox-venv
- pip install tox

script:
- tox
- if [[ "$TRAVIS_PYTHON_VERSION" != "3.4" ]]; then tox -e pandas; fi
2 changes: 1 addition & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[pytest]
norecursedirs=dist build .tox .eggs
norecursedirs=dist build .tox .eggs examples
addopts=--doctest-modules --cov=scrunch --cov-config=.coveragerc -p no:sugar
doctest_optionflags=ALLOW_UNICODE ELLIPSIS
49 changes: 47 additions & 2 deletions scrunch/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

import pycrunch
from pycrunch.exporting import export_dataset
from pycrunch.shoji import Entity
from pycrunch.shoji import Entity, TaskProgressTimeoutError
from scrunch.session import connect
from scrunch.categories import CategoryList
from scrunch.exceptions import (AuthenticationError, InvalidParamError,
Expand Down Expand Up @@ -931,7 +931,7 @@ class BaseDataset(ReadOnly, DatasetVariablesMixin):
"""

_MUTABLE_ATTRIBUTES = {'name', 'notes', 'description', 'is_published',
'archived', 'end_date', 'start_date'}
'archived', 'end_date', 'start_date', 'streaming'}
_IMMUTABLE_ATTRIBUTES = {'id', 'creation_time', 'modification_time',
'size'}
_ENTITY_ATTRIBUTES = _MUTABLE_ATTRIBUTES | _IMMUTABLE_ATTRIBUTES
Expand Down Expand Up @@ -992,6 +992,11 @@ def make_mutable(self):
from scrunch.mutable_dataset import MutableDataset
return MutableDataset(self.resource)

def make_streaming(self):
    """
    Switch this dataset to streaming mode and return it wrapped in a
    StreamingDataset.

    The dataset's ``streaming`` attribute is set to 'streaming' through
    ``edit()``; the returned StreamingDataset is built around the same
    underlying resource as this dataset.
    """
    # NOTE(review): imported at call time rather than at module level,
    # presumably to avoid a circular import -- confirm.
    from scrunch.streaming_dataset import StreamingDataset

    self.edit(streaming='streaming')
    streaming_ds = StreamingDataset(self.resource)
    return streaming_ds

@property
def project(self):
return Project(self.resource.project)
Expand Down Expand Up @@ -2378,6 +2383,46 @@ def replace_values(self, variables, filter=None, literal_subvar=False):
return
return resp

def replace_from_csv(self, filename, chunksize=1000):
    """
    Replace values in the Dataset with the rows of a csv file.

    The csv file must have a header row and be in the format:
        id, var1_alias, var2_alias
        1,  14,         15
    where the first column is the Dataset PK. The values of the matching
    id are replaced, for the given variables, using the /stream
    endpoint, which receives each chunk of rows as records:
        [{id: 1, var1_alias: 14, var2_alias: 15}, ...]

    If the Dataset is not in streaming mode it is switched to it for the
    duration of the call, and its previous streaming state is restored
    afterwards, even when reading or streaming the data fails.

    :param filename: csv file to read; handed to ``pandas.read_csv``
    :param chunksize: number of csv rows read, streamed and pushed
        per iteration
    """
    streaming_state = self.resource.body.get('streaming', 'no')
    must_restore = streaming_state != 'streaming'
    ds = self.make_streaming() if must_restore else self
    try:
        importer = pycrunch.importing.Importer()
        df_chunks = pd.read_csv(
            filename,
            header=0,
            chunksize=chunksize
        )
        for chunk in df_chunks:
            # Round-tripping through JSON is a trick to get rid of
            # np.int64, which is not json serializable
            stream = json.loads(chunk.to_json(orient='records'))
            # trap the timeout and allow it to finish
            try:
                importer.stream_rows(self.resource, stream)
                # We force the row push to instantly see any errors in
                # the data and to allow changing the streaming status
                # back to its previous state
                ds.push_rows(chunksize)
            except TaskProgressTimeoutError as exc:
                exc.entity.wait_progress(exc.response)
    finally:
        # Restore the original state on every exit path; otherwise a
        # failure midway would leave the Dataset stuck in streaming mode.
        if must_restore:
            ds.edit(streaming=streaming_state)

def merge(self, fork_id=None, autorollback=True):
"""
:param fork_id: str or int
Expand Down
11 changes: 8 additions & 3 deletions scrunch/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
'valid': 'is_valid',
'missing': 'is_missing',
'bin': 'bin',
'selected': 'selected',
'not_selected': 'not_selected',
}

Expand All @@ -68,6 +69,7 @@
'all': 'all',
'duplicates': 'duplicates',
'bin': 'bin',
'selected': 'selected',
'not_selected': 'not_selected',
}

Expand Down Expand Up @@ -740,9 +742,12 @@ def _transform(f, args, nest=False):
op = ' %s ' % f
result = op.join(str(x) for x in args)
elif f in methods:
result = '%s.%s(%s)' % (
args[0], methods[f], ', '.join(str(x) for x in args[1:])
)
if f in ['selected', 'not_selected']:
result = '%s(%s)%s' % (methods[f], args[0], ', '.join(str(x) for x in args[1:]))
else:
result = '%s.%s(%s)' % (
args[0], methods[f], ', '.join(str(x) for x in args[1:])
)
elif f in functions:
result = '%s(%s)' % (functions[f], args[0])
else:
Expand Down
19 changes: 12 additions & 7 deletions scrunch/tests/test_categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,32 +90,37 @@ def test_Category_attribute_writes(self):
)
variable = Variable(resource, MagicMock())

error_msg = 'use the edit() method for mutating attributes'
error_msg = "use the edit() method for mutating attributes"

with pytest.raises(AttributeError, message=error_msg):
with pytest.raises(AttributeError) as excinfo:
variable.categories[1].id = 42
# nothing has changed
assert variable.categories[1].id == 1
assert str(excinfo.value) == "Can't edit attibute 'id'"

with pytest.raises(AttributeError, message=error_msg):
with pytest.raises(AttributeError) as excinfo:
variable.categories[1].name = 'forbidden'
# nothing has changed
assert variable.categories[1].name == 'Female'
assert str(excinfo.value) == error_msg

with pytest.raises(AttributeError, message=error_msg):
with pytest.raises(AttributeError) as excinfo:
variable.categories[1].numeric_value = 42
# nothing has changed
assert variable.categories[1].numeric_value is None
assert str(excinfo.value) == error_msg

with pytest.raises(AttributeError, message=error_msg):
with pytest.raises(AttributeError) as excinfo:
variable.categories[1].missing = True
# nothing has changed
assert variable.categories[1].missing is False
assert str(excinfo.value) == error_msg

with pytest.raises(AttributeError, message=error_msg):
with pytest.raises(AttributeError) as excinfo:
variable.categories[1].selected = True
# nothing has changed, default is False
assert variable.categories[1].selected is False
assert str(excinfo.value) == error_msg

def test_edit_derived(self):
resource = EditableMock()
Expand All @@ -127,7 +132,7 @@ def test_edit_derived(self):
variable = Variable(resource, MagicMock())

error_msg = "Cannot edit categories on derived variables. Re-derive with the appropriate expression"
with pytest.raises(TypeError, message=error_msg):
with pytest.raises(TypeError, match=error_msg):
variable.categories[1].edit(name='Mujer')

# Try again with an empty derivation
Expand Down
Loading

0 comments on commit f4830cb

Please sign in to comment.