Skip to content

Commit 4abcbde

Browse files
authored
More Manifest Info (#20)
1 parent d8a7511 commit 4abcbde

21 files changed

+541
-98
lines changed

.github/workflows/tests.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,5 @@ jobs:
3636
3737
- name: Run Tests
3838
run: pytest
39+
env:
40+
CONSTRUE_TEST_SAMPLE_LOADERS: 1

construe/cloud/manifest.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os
66
import json
77
import glob
8+
import zipfile
89

910
from urllib.parse import urljoin
1011

@@ -24,28 +25,52 @@ def load_manifest(path):
2425
return json.load(f)
2526

2627

27-
def generate_manifest(fixtures, out, upload_type):
28+
def generate_manifest(fixtures, out, upload_type, extra=None):
2829
manifest = {}
2930
version = get_version(short=True)
3031

31-
for path in glob.glob(os.path.join(fixtures, "*.zip")):
32+
# Sort the list of paths by name
33+
paths = list(glob.glob(os.path.join(fixtures, "*.zip")))
34+
paths.sort()
35+
36+
for path in paths:
3237
fname = os.path.basename(path)
3338
name, _ = os.path.splitext(fname)
3439

3540
manifest[name] = {
3641
"url": make_fixture_url(fname, upload_type=upload_type, version=version),
3742
"signature": sha256sum(path),
43+
"size": {
44+
"compressed": os.path.getsize(path),
45+
"decompressed": get_uncompressed_size(path),
46+
},
3847
}
3948

49+
if extra is not None:
50+
if callable(extra):
51+
manifest[name].update(extra(path=path, name=name, **manifest[name]))
52+
else:
53+
manifest[name].update(extra)
54+
4055
with open(out, "w") as o:
4156
json.dump(manifest, o, indent=2)
4257

4358

4459
def make_fixture_url(fname, upload_type, version=None):
60+
# Bucket must be joined here and not make_fixture_path to support uploading
4561
path = make_fixture_path(fname, upload_type, version)
62+
path = os.path.join(BUCKET, path)
4663
return urljoin(BASE_URL, path)
4764

4865

4966
def make_fixture_path(fname, upload_type, version=None):
5067
version = version or get_version(short=True)
5168
return os.path.join(f"v{version}", upload_type, fname)
69+
70+
71+
def get_uncompressed_size(path: str) -> int:
72+
bytes = 0
73+
with zipfile.ZipFile(path, 'r') as zf:
74+
for info in zf.infolist():
75+
bytes += info.file_size
76+
return bytes

construe/datasets/__init__.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,10 @@
33
"""
44

55
from .loaders import * # noqa
6-
from .download import download_data
6+
from .download import download_data, load_manifest
77
from .path import get_data_home, cleanup_dataset
8+
9+
try:
10+
DATASETS = load_manifest()
11+
except Exception:
12+
DATASETS = None

construe/datasets/download.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,15 @@ def _download_dataset(
4242
raise DatasetsError(f"no dataset named {name} exists")
4343

4444
info = datasets[name]
45-
info.update({
45+
kwargs = {
4646
"data_home": data_home,
4747
"replace": replace,
4848
"extract": extract,
4949
"progress": progress,
50-
})
51-
download_data(**info)
50+
"url": info["url"],
51+
"signature": info["signature"],
52+
}
53+
download_data(**kwargs)
5254

5355

5456
download_dialects = partial(_download_dataset, DIALECTS)

construe/datasets/loaders.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,11 @@ def _load_prepare(name, sample=True, data_home=None):
4545
info = _info(name)
4646
if not dataset_archive(name, info["signature"], data_home=data_home):
4747
# If the dataset does not exist, download and extract it
48-
info.update({"data_home": data_home, "replace": True, "extract": True})
49-
download_data(**info)
48+
kwargs = {
49+
"data_home": data_home, "replace": True, "extract": True,
50+
"url": info["url"], "signature": info["signature"],
51+
}
52+
download_data(**kwargs)
5053

5154
return find_dataset_path(name, data_home=data_home, fname=None, ext=None)
5255

@@ -56,8 +59,8 @@ def _load_file_dataset(name, sample=True, data_home=None, no_dirs=True, pattern=
5659
data_path = _load_prepare(name, sample=sample, data_home=data_home)
5760

5861
# Glob pattern for discovering files in the dataset
59-
if pattern is not None:
60-
pattern = os.path.join(data_path, name, "**", "*")
62+
if pattern is None:
63+
pattern = os.path.join(data_path, "**", "*")
6164
else:
6265
pattern = os.path.join(data_path, pattern)
6366

@@ -73,7 +76,7 @@ def _load_jsonl_dataset(name, sample=True, data_home=None):
7376
for path in glob.glob(os.path.join(data_path, "*.jsonl")):
7477
with open(path, "r") as f:
7578
for line in f:
76-
yield json.load(f)
79+
yield json.loads(line.strip())
7780

7881

7982
def _cleanup_dataset(name, sample=True, data_home=None):
@@ -100,7 +103,7 @@ def _cleanup_dataset(name, sample=True, data_home=None):
100103
load_aegis = partial(_load_jsonl_dataset, AEGIS)
101104
cleanup_aegis = partial(_cleanup_dataset, AEGIS)
102105

103-
load_nsfw = partial(_load_file_dataset, NSFW)
106+
load_nsfw = partial(_load_file_dataset, NSFW, pattern="nsfw/**/*.jpg")
104107
cleanup_nsfw = partial(_cleanup_dataset, NSFW)
105108

106109

construe/datasets/manifest.json

+153-35
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,176 @@
11
{
2-
"dialects": {
3-
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/dialects.zip",
4-
"signature": "0e6767047e05f618560d097dfa0587530636c52fc19507c087bdff556b389489"
2+
"aegis-sample": {
3+
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis-sample.zip",
4+
"signature": "a2b3ae9c5a19833cc594fc4c14a6bfce35ab9c6086f0c2836d2719ab788119bd",
5+
"size": {
6+
"compressed": 916334,
7+
"decompressed": 2878359
8+
},
9+
"instances": 3030
510
},
6-
"lowlight": {
7-
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/lowlight.zip",
8-
"signature": "ddc36eb7f0443efa5e71939e503d0834fd48451281d9658d5cb7ead30143b98f"
11+
"aegis": {
12+
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis.zip",
13+
"signature": "c846f20d893461525839cd2f61f85faf0dcbff03e1998fd8f747506ff65bec69",
14+
"size": {
15+
"compressed": 3619910,
16+
"decompressed": 11362916
17+
},
18+
"instances": 11997
919
},
1020
"dialects-sample": {
1121
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/dialects-sample.zip",
12-
"signature": "9e9509f4d82468c896bede36b16c6de218a1dce28a56ae49d1fb75933bf770c5"
13-
},
14-
"reddit": {
15-
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit.zip",
16-
"signature": "d97419403f0d940970b2542d5b188570dacedae3c2a68ada3520cfa95c52f75c"
22+
"signature": "9e9509f4d82468c896bede36b16c6de218a1dce28a56ae49d1fb75933bf770c5",
23+
"size": {
24+
"compressed": 243136640,
25+
"decompressed": 356704802
26+
},
27+
"instances": 1785,
28+
"classes": {
29+
"northern_male": 203,
30+
"southern_female": 417,
31+
"northern_female": 65,
32+
"irish_male": 48,
33+
"scottish_male": 172,
34+
"welsh_female": 120,
35+
"southern_male": 436,
36+
"midlands_female": 24,
37+
"midlands_male": 51,
38+
"welsh_male": 157,
39+
"scottish_female": 92
40+
}
1741
},
18-
"movies-sample": {
19-
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/movies-sample.zip",
20-
"signature": "2d3d9294ad875e7489db94fc2ab02c1ad6dfdc15a2bf1a5037be36a6defc8168"
42+
"dialects": {
43+
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/dialects.zip",
44+
"signature": "0e6767047e05f618560d097dfa0587530636c52fc19507c087bdff556b389489",
45+
"size": {
46+
"compressed": 2466918919,
47+
"decompressed": 3605272328
48+
},
49+
"instances": 17877,
50+
"classes": {
51+
"northern_male": 2097,
52+
"southern_female": 4161,
53+
"northern_female": 750,
54+
"irish_male": 450,
55+
"scottish_male": 1649,
56+
"welsh_female": 1199,
57+
"southern_male": 4331,
58+
"midlands_female": 246,
59+
"midlands_male": 450,
60+
"welsh_male": 1650,
61+
"scottish_female": 894
62+
}
2163
},
2264
"essays-sample": {
2365
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/essays-sample.zip",
24-
"signature": "a77fc1c2c2718d79132598e6c873fd5b08c40c2e4049d995317747fb76b96631"
25-
},
26-
"aegis-sample": {
27-
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis-sample.zip",
28-
"signature": "a2b3ae9c5a19833cc594fc4c14a6bfce35ab9c6086f0c2836d2719ab788119bd"
29-
},
30-
"aegis": {
31-
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis.zip",
32-
"signature": "c846f20d893461525839cd2f61f85faf0dcbff03e1998fd8f747506ff65bec69"
33-
},
34-
"nsfw-sample": {
35-
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/nsfw-sample.zip",
36-
"signature": "d5044f30769d3a6e9ba639312120dc955bdfcf4d8aa8a6f3ee493334644b9fcd"
66+
"signature": "a77fc1c2c2718d79132598e6c873fd5b08c40c2e4049d995317747fb76b96631",
67+
"size": {
68+
"compressed": 1796330,
69+
"decompressed": 8785856
70+
},
71+
"instances": 512
3772
},
3873
"essays": {
3974
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/essays.zip",
40-
"signature": "3a7b260dd5baec9134c7398ac7b9b297d7b1a387bce1a9f99cd8d3e0a7ceb9cc"
41-
},
42-
"reddit-sample": {
43-
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit-sample.zip",
44-
"signature": "24088c648b8c3497d0b682102c3fa965d46ca22abe8f94695287e09bf82db991"
75+
"signature": "3a7b260dd5baec9134c7398ac7b9b297d7b1a387bce1a9f99cd8d3e0a7ceb9cc",
76+
"size": {
77+
"compressed": 7116584,
78+
"decompressed": 35516576
79+
},
80+
"instances": 2078
4581
},
4682
"lowlight-sample": {
4783
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/lowlight-sample.zip",
48-
"signature": "f34bafa588441b8e240b0932e9ac446d9f805bdfdb22640c036c441258220eaf"
84+
"signature": "f34bafa588441b8e240b0932e9ac446d9f805bdfdb22640c036c441258220eaf",
85+
"size": {
86+
"compressed": 166217847,
87+
"decompressed": 166608858
88+
},
89+
"instances": 475,
90+
"classes": {
91+
"high": 242,
92+
"low": 233
93+
}
94+
},
95+
"lowlight": {
96+
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/lowlight.zip",
97+
"signature": "ddc36eb7f0443efa5e71939e503d0834fd48451281d9658d5cb7ead30143b98f",
98+
"size": {
99+
"compressed": 347470078,
100+
"decompressed": 348256471
101+
},
102+
"instances": 1000,
103+
"classes": {
104+
"high": 500,
105+
"low": 500
106+
}
107+
},
108+
"movies-sample": {
109+
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/movies-sample.zip",
110+
"signature": "2d3d9294ad875e7489db94fc2ab02c1ad6dfdc15a2bf1a5037be36a6defc8168",
111+
"size": {
112+
"compressed": 381174092,
113+
"decompressed": 387776108
114+
},
115+
"instances": 5465,
116+
"classes": {
117+
"movies": 5465
118+
}
49119
},
50120
"movies": {
51121
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/movies.zip",
52-
"signature": "618f7aa8aa103192ee8b76fc701ff182b2a41e5e78675a4d6af707e490d36f45"
122+
"signature": "618f7aa8aa103192ee8b76fc701ff182b2a41e5e78675a4d6af707e490d36f45",
123+
"size": {
124+
"compressed": 7351355869,
125+
"decompressed": 7479027563
126+
},
127+
"instances": 106844,
128+
"classes": {
129+
"movies": 106844
130+
}
131+
},
132+
"nsfw-sample": {
133+
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/nsfw-sample.zip",
134+
"signature": "d5044f30769d3a6e9ba639312120dc955bdfcf4d8aa8a6f3ee493334644b9fcd",
135+
"size": {
136+
"compressed": 6429140,
137+
"decompressed": 6535438
138+
},
139+
"instances": 53,
140+
"classes": {
141+
"safe": 28,
142+
"nsfw": 25
143+
}
53144
},
54145
"nsfw": {
55146
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/nsfw.zip",
56-
"signature": "7ac498e8f17428c51a5c8c366aaf10b47663a9eb8a560fd8abe01366eaf60139"
147+
"signature": "7ac498e8f17428c51a5c8c366aaf10b47663a9eb8a560fd8abe01366eaf60139",
148+
"size": {
149+
"compressed": 27937058,
150+
"decompressed": 28266876
151+
},
152+
"instances": 215,
153+
"classes": {
154+
"safe": 108,
155+
"nsfw": 107
156+
}
157+
},
158+
"reddit-sample": {
159+
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit-sample.zip",
160+
"signature": "24088c648b8c3497d0b682102c3fa965d46ca22abe8f94695287e09bf82db991",
161+
"size": {
162+
"compressed": 63979,
163+
"decompressed": 278734
164+
},
165+
"instances": 957
166+
},
167+
"reddit": {
168+
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit.zip",
169+
"signature": "d97419403f0d940970b2542d5b188570dacedae3c2a68ada3520cfa95c52f75c",
170+
"size": {
171+
"compressed": 244363,
172+
"decompressed": 1117785
173+
},
174+
"instances": 3844
57175
}
58176
}

0 commit comments

Comments (0)