Skip to content

Commit f4830cb

Browse files
authored
Merge pull request #338 from Crunch-io/case-288/replace-from-csv
adding replace_from_csv method in BaseDataset
2 parents 249351a + fafaad7 commit f4830cb

9 files changed

+183
-64
lines changed

.travis.yml

+45-26
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,57 @@
1-
dist: trusty
1+
dist: xenial
22
sudo: false
33
language: python
44

5-
python:
6-
- 2.7
7-
- 3.4
8-
- 3.5
9-
- &latest_py3 3.6
10-
115
jobs:
12-
fast_finish: true
136
include:
14-
- stage: deploy
15-
if: tag IS present
16-
python: *latest_py3
17-
install: skip
18-
script: skip
19-
deploy:
20-
provider: pypi
21-
on:
22-
tags: true
23-
all_branches: true
24-
user: jaraco
25-
password:
26-
secure: f+AUtW6EUqe9DmbjpbJO0q/wesmpyS3zaeKAxENu2klRsV1xpKj55HBj/RP8EIHDrJHi2+GkF0Ml5BRXV8rtAzLSIP5OHWNlBWUPzJzZ+iaTHQAKXWG2po1b3SR+4t/of89tcmr7QybZmibApzT3Vs1Uq66IZ2mRdYN7w9U1TOpgFU1ZGxWgMXQEg9B8/IPlkWFP4pSSOkd+ASNlr9I3jWSpCsBYLSLKs9exUez8UScxmFn0i9Rbql0y/yiT/yagBDgU+T7f5H6KEQr30V0taPpiE+NFpV0iHtR4NnUfYHqtK2nvPmDWQqSGy80U/gwSsXR44k4trSQQBdrp1laO5/XHr5QHrcejrHIu6PYxyAXzMwX611Cll6Dsi+gakPZZCRWgek5Hp96RIBSBuwnjCypsr33T8GRtRrpfRvNLA8ouKhcl5CoWoPocRwyWxPXLOpG3uNbEQNApdbCKBkCHpq+pT4Xa8l7yWFS8nGC5CaGuX9NCkA5njs4RlwpKZpxi0vvr6Iwygq8hYGnKBLg7zCvqJKq8AL1iGKkwxlx4TBIlNcKuZZELKvLRYteg8JKwI+otghgHox+V51jB15j3yBrBqm1rSWT+dk4z03shFYAInBZjD7Lt/CoRd9it8ttgxbX+dVzsy5l4qWY7f5F80UftyClmIcVhbq27FudoKsk=
27-
distributions: dists
28-
skip_cleanup: true
29-
skip_upload_docs: true
7+
- stage: test
8+
python: 2.7
9+
env: TOXENV=py27
10+
- stage: test-pandas
11+
python: 2.7
12+
env: TOXENV=py27-pandas
13+
- stage: test
14+
python: 3.4
15+
env: TOXENV=py34
16+
- stage: test
17+
python: 3.5
18+
env: TOXENV=py35
19+
- stage: test-pandas
20+
python: 3.5
21+
env: TOXENV=py35-pandas
22+
- stage: test
23+
python: 3.6
24+
env: TOXENV=py36
25+
- stage: test-pandas
26+
python: 3.6
27+
env: TOXENV=py36-pandas
28+
- stage: test
29+
python: 3.7
30+
env: TOXENV=py37
31+
- stage: test-pandas
32+
python: 3.7
33+
env: TOXENV=py37-pandas
34+
- stage: deploy
35+
if: tag IS present
36+
python: *latest_py3
37+
install: skip
38+
script: skip
39+
deploy:
40+
provider: pypi
41+
on:
42+
tags: true
43+
all_branches: true
44+
user: jaraco
45+
password:
46+
secure: f+AUtW6EUqe9DmbjpbJO0q/wesmpyS3zaeKAxENu2klRsV1xpKj55HBj/RP8EIHDrJHi2+GkF0Ml5BRXV8rtAzLSIP5OHWNlBWUPzJzZ+iaTHQAKXWG2po1b3SR+4t/of89tcmr7QybZmibApzT3Vs1Uq66IZ2mRdYN7w9U1TOpgFU1ZGxWgMXQEg9B8/IPlkWFP4pSSOkd+ASNlr9I3jWSpCsBYLSLKs9exUez8UScxmFn0i9Rbql0y/yiT/yagBDgU+T7f5H6KEQr30V0taPpiE+NFpV0iHtR4NnUfYHqtK2nvPmDWQqSGy80U/gwSsXR44k4trSQQBdrp1laO5/XHr5QHrcejrHIu6PYxyAXzMwX611Cll6Dsi+gakPZZCRWgek5Hp96RIBSBuwnjCypsr33T8GRtRrpfRvNLA8ouKhcl5CoWoPocRwyWxPXLOpG3uNbEQNApdbCKBkCHpq+pT4Xa8l7yWFS8nGC5CaGuX9NCkA5njs4RlwpKZpxi0vvr6Iwygq8hYGnKBLg7zCvqJKq8AL1iGKkwxlx4TBIlNcKuZZELKvLRYteg8JKwI+otghgHox+V51jB15j3yBrBqm1rSWT+dk4z03shFYAInBZjD7Lt/CoRd9it8ttgxbX+dVzsy5l4qWY7f5F80UftyClmIcVhbq27FudoKsk=
47+
distributions: dists
48+
skip_cleanup: true
49+
skip_upload_docs: true
3050

3151
cache: pip
3252

3353
install:
34-
- pip install tox tox-venv
54+
- pip install tox
3555

3656
script:
3757
- tox
38-
- if [[ "$TRAVIS_PYTHON_VERSION" != "3.4" ]]; then tox -e pandas; fi

pytest.ini

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[pytest]
2-
norecursedirs=dist build .tox .eggs
2+
norecursedirs=dist build .tox .eggs examples
33
addopts=--doctest-modules --cov=scrunch --cov-config=.coveragerc -p no:sugar
44
doctest_optionflags=ALLOW_UNICODE ELLIPSIS

scrunch/datasets.py

+47-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
import pycrunch
2222
from pycrunch.exporting import export_dataset
23-
from pycrunch.shoji import Entity
23+
from pycrunch.shoji import Entity, TaskProgressTimeoutError
2424
from scrunch.session import connect
2525
from scrunch.categories import CategoryList
2626
from scrunch.exceptions import (AuthenticationError, InvalidParamError,
@@ -931,7 +931,7 @@ class BaseDataset(ReadOnly, DatasetVariablesMixin):
931931
"""
932932

933933
_MUTABLE_ATTRIBUTES = {'name', 'notes', 'description', 'is_published',
934-
'archived', 'end_date', 'start_date'}
934+
'archived', 'end_date', 'start_date', 'streaming'}
935935
_IMMUTABLE_ATTRIBUTES = {'id', 'creation_time', 'modification_time',
936936
'size'}
937937
_ENTITY_ATTRIBUTES = _MUTABLE_ATTRIBUTES | _IMMUTABLE_ATTRIBUTES
@@ -992,6 +992,11 @@ def make_mutable(self):
992992
from scrunch.mutable_dataset import MutableDataset
993993
return MutableDataset(self.resource)
994994

995+
def make_streaming(self):
996+
from scrunch.streaming_dataset import StreamingDataset
997+
self.edit(streaming='streaming')
998+
return StreamingDataset(self.resource)
999+
9951000
@property
9961001
def project(self):
9971002
return Project(self.resource.project)
@@ -2378,6 +2383,46 @@ def replace_values(self, variables, filter=None, literal_subvar=False):
23782383
return
23792384
return resp
23802385

2386+
def replace_from_csv(self, filename, chunksize=1000):
2387+
"""
2388+
Given a csv file in the format:
2389+
id, var1_alias, var2_alias
2390+
1, 14, 15
2391+
2392+
where the first column is the Dataset PK
2393+
2394+
Replace the values of the matching id, for the given variables
2395+
in the Dataset using the /stream endpoint:
2396+
2397+
[{id: 1, var1_alias: 14, var2_alias: 15}, ...]
2398+
"""
2399+
streaming_state = self.resource.body.get('streaming', 'no')
2400+
ds = self
2401+
if streaming_state != 'streaming':
2402+
ds = self.make_streaming()
2403+
importer = pycrunch.importing.Importer()
2404+
df_chunks = pd.read_csv(
2405+
filename,
2406+
header=0,
2407+
chunksize=chunksize
2408+
)
2409+
for chunk in df_chunks:
2410+
# This is a trick to get rid of np.int64, which is not
2411+
# json serializable
2412+
stream = chunk.to_json(orient='records')
2413+
stream = json.loads(stream)
2414+
# trap the timeout and allow it to finish
2415+
try:
2416+
importer.stream_rows(self.resource, stream)
2417+
# We force the row push to instantly see any errors in the data
2418+
# and to allow changing the streaming status back to its previous
2419+
# state
2420+
ds.push_rows(chunksize)
2421+
except TaskProgressTimeoutError as exc:
2422+
exc.entity.wait_progress(exc.response)
2423+
if streaming_state != 'streaming':
2424+
ds.edit(streaming=streaming_state)
2425+
23812426
def merge(self, fork_id=None, autorollback=True):
23822427
"""
23832428
:param fork_id: str or int

scrunch/expressions.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
'valid': 'is_valid',
6161
'missing': 'is_missing',
6262
'bin': 'bin',
63+
'selected': 'selected',
6364
'not_selected': 'not_selected',
6465
}
6566

@@ -68,6 +69,7 @@
6869
'all': 'all',
6970
'duplicates': 'duplicates',
7071
'bin': 'bin',
72+
'selected': 'selected',
7173
'not_selected': 'not_selected',
7274
}
7375

@@ -740,9 +742,12 @@ def _transform(f, args, nest=False):
740742
op = ' %s ' % f
741743
result = op.join(str(x) for x in args)
742744
elif f in methods:
743-
result = '%s.%s(%s)' % (
744-
args[0], methods[f], ', '.join(str(x) for x in args[1:])
745-
)
745+
if f in ['selected', 'not_selected']:
746+
result = '%s(%s)%s' % (methods[f], args[0], ', '.join(str(x) for x in args[1:]))
747+
else:
748+
result = '%s.%s(%s)' % (
749+
args[0], methods[f], ', '.join(str(x) for x in args[1:])
750+
)
746751
elif f in functions:
747752
result = '%s(%s)' % (functions[f], args[0])
748753
else:

scrunch/tests/test_categories.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -90,32 +90,37 @@ def test_Category_attribute_writes(self):
9090
)
9191
variable = Variable(resource, MagicMock())
9292

93-
error_msg = 'use the edit() method for mutating attributes'
93+
error_msg = "use the edit() method for mutating attributes"
9494

95-
with pytest.raises(AttributeError, message=error_msg):
95+
with pytest.raises(AttributeError) as excinfo:
9696
variable.categories[1].id = 42
9797
# nothing has changed
9898
assert variable.categories[1].id == 1
99+
assert str(excinfo.value) == "Can't edit attibute 'id'"
99100

100-
with pytest.raises(AttributeError, message=error_msg):
101+
with pytest.raises(AttributeError) as excinfo:
101102
variable.categories[1].name = 'forbidden'
102103
# nothing has changed
103104
assert variable.categories[1].name == 'Female'
105+
assert str(excinfo.value) == error_msg
104106

105-
with pytest.raises(AttributeError, message=error_msg):
107+
with pytest.raises(AttributeError) as excinfo:
106108
variable.categories[1].numeric_value = 42
107109
# nothing has changed
108110
assert variable.categories[1].numeric_value is None
111+
assert str(excinfo.value) == error_msg
109112

110-
with pytest.raises(AttributeError, message=error_msg):
113+
with pytest.raises(AttributeError) as excinfo:
111114
variable.categories[1].missing = True
112115
# nothing has changed
113116
assert variable.categories[1].missing is False
117+
assert str(excinfo.value) == error_msg
114118

115-
with pytest.raises(AttributeError, message=error_msg):
119+
with pytest.raises(AttributeError) as excinfo:
116120
variable.categories[1].selected = True
117121
# nothing has changed, default is False
118122
assert variable.categories[1].selected is False
123+
assert str(excinfo.value) == error_msg
119124

120125
def test_edit_derived(self):
121126
resource = EditableMock()
@@ -127,7 +132,7 @@ def test_edit_derived(self):
127132
variable = Variable(resource, MagicMock())
128133

129134
error_msg = "Cannot edit categories on derived variables. Re-derive with the appropriate expression"
130-
with pytest.raises(TypeError, message=error_msg):
135+
with pytest.raises(TypeError, match=error_msg):
131136
variable.categories[1].edit(name='Mujer')
132137

133138
# Try again with an empty derivation

0 commit comments

Comments
 (0)