Skip to content

Commit 9a279d4

Browse files
committed
roll-up properties
1 parent 0533a73 commit 9a279d4

2 files changed

Lines changed: 114 additions & 63 deletions

File tree

fairscape_models/conversion/mapping/AIReady.py

Lines changed: 100 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def score_rocrate(crate_data: Union[Dict[str, Any], ROCrateV1_2]) -> AIReadyScor
6060
_score_pre_model(score.pre_model_explainability, root_data, metadata_graph)
6161
_score_ethics(score.ethics, root_data)
6262
_score_sustainability(score.sustainability, root_data)
63-
_score_computability(score.computability, metadata_graph)
63+
_score_computability(score.computability, root_data, metadata_graph)
6464

6565
return score
6666

@@ -126,22 +126,34 @@ def _score_provenance(provenance: ProvenanceScore, root_data: Dict[str, Any], me
126126
has_content=True,
127127
details=", ".join(actors)
128128
)
129-
130-
datasets_count = 0
131-
transformations_count = 0
132-
software_count = 0
133-
134-
for entity in metadata_graph:
135-
entity_type = _get_type(entity)
136129

137-
if "Dataset" in entity_type:
138-
datasets_count += 1
139-
140-
if "Computation" in entity_type or "Experiment" in entity_type:
141-
transformations_count += 1
142-
143-
if "Software" in entity_type:
144-
software_count += 1
130+
# Check for aggregated metrics first (from release-level RO-Crate)
131+
dataset_count = root_data.get("evi:datasetCount")
132+
computation_count = root_data.get("evi:computationCount")
133+
software_count = root_data.get("evi:softwareCount")
134+
135+
if dataset_count is not None:
136+
# Use pre-aggregated values from release
137+
datasets_count = dataset_count
138+
transformations_count = computation_count
139+
software_count = software_count
140+
else:
141+
# Fall back to counting in metadata_graph (for backwards compatibility)
142+
datasets_count = 0
143+
transformations_count = 0
144+
software_count = 0
145+
146+
for entity in metadata_graph:
147+
entity_type = _get_type(entity)
148+
149+
if "Dataset" in entity_type:
150+
datasets_count += 1
151+
152+
if "Computation" in entity_type or "Experiment" in entity_type:
153+
transformations_count += 1
154+
155+
if "Software" in entity_type:
156+
software_count += 1
145157

146158
if datasets_count > 0:
147159
provenance.transparent = SubCriterionScore(
@@ -169,29 +181,39 @@ def _score_characterization(characterization: CharacterizationScore, root_data:
169181
has_content=True,
170182
details=str(bias)[:200] + ("..." if len(str(bias)) > 200 else "")
171183
)
172-
173-
total_size = 0
174-
stats_count = 0
175-
176-
for entity in metadata_graph:
177-
entity_type = _get_type(entity)
178184

179-
if "Dataset" in entity_type or "ROCrate" in entity_type:
180-
size = entity.get("contentSize", "")
181-
if size:
182-
try:
183-
if isinstance(size, str):
184-
if "TB" in size:
185-
total_size += float(size.replace("TB", "").strip()) * 1e12
186-
elif "GB" in size:
187-
total_size += float(size.replace("GB", "").strip()) * 1e9
188-
elif "MB" in size:
189-
total_size += float(size.replace("MB", "").strip()) * 1e6
190-
except:
191-
pass
192-
193-
if entity.get("hasSummaryStatistics"):
194-
stats_count += 1
185+
# Check for aggregated metrics first
186+
total_size_bytes = root_data.get("evi:totalContentSizeBytes")
187+
stats_count_agg = root_data.get("evi:entitiesWithSummaryStats")
188+
189+
if total_size_bytes is not None:
190+
# Use pre-aggregated statistics
191+
total_size = total_size_bytes
192+
stats_count = stats_count_agg
193+
else:
194+
# Fall back to iterating metadata_graph
195+
total_size = 0
196+
stats_count = 0
197+
198+
for entity in metadata_graph:
199+
entity_type = _get_type(entity)
200+
201+
if "Dataset" in entity_type or "ROCrate" in entity_type:
202+
size = entity.get("contentSize", "")
203+
if size:
204+
try:
205+
if isinstance(size, str):
206+
if "TB" in size:
207+
total_size += float(size.replace("TB", "").strip()) * 1e12
208+
elif "GB" in size:
209+
total_size += float(size.replace("GB", "").strip()) * 1e9
210+
elif "MB" in size:
211+
total_size += float(size.replace("MB", "").strip()) * 1e6
212+
except:
213+
pass
214+
215+
if entity.get("hasSummaryStatistics"):
216+
stats_count += 1
195217

196218
details = []
197219
if total_size > 0:
@@ -227,17 +249,27 @@ def _score_pre_model(pre_model: PreModelExplainabilityScore, root_data: Dict[str
227249
has_content=True,
228250
details=", ".join(details)
229251
)
230-
231-
total = 0
232-
with_checksum = 0
233-
234-
for entity in metadata_graph:
235-
entity_type = _get_type(entity)
236-
237-
if "Dataset" in entity_type or "Software" in entity_type or "ROCrate" in entity_type:
238-
total += 1
239-
if entity.get("md5") or entity.get("MD5"):
240-
with_checksum += 1
252+
253+
# Check for aggregated metrics first
254+
total_entities = root_data.get("evi:totalEntities")
255+
entities_with_checksums = root_data.get("evi:entitiesWithChecksums")
256+
257+
if total_entities is not None:
258+
# Use pre-aggregated checksum data
259+
total = total_entities
260+
with_checksum = entities_with_checksums
261+
else:
262+
# Fall back to counting in metadata_graph
263+
total = 0
264+
with_checksum = 0
265+
266+
for entity in metadata_graph:
267+
entity_type = _get_type(entity)
268+
269+
if "Dataset" in entity_type or "Software" in entity_type or "ROCrate" in entity_type:
270+
total += 1
271+
if entity.get("md5") or entity.get("MD5"):
272+
with_checksum += 1
241273

242274
if total > 0 and with_checksum > 0:
243275
percentage = (with_checksum / total) * 100
@@ -350,17 +382,25 @@ def _score_sustainability(sustainability: SustainabilityScore, root_data: Dict[s
350382
)
351383
break
352384

353-
def _score_computability(computability: ComputabilityScore, metadata_graph: List[Dict]):
385+
def _score_computability(computability: ComputabilityScore, root_data: Dict[str, Any], metadata_graph: List[Dict]):
354386
"""Score Computability criteria."""
355-
formats = set()
356-
357-
for entity in metadata_graph:
358-
entity_type = _get_type(entity)
359-
360-
if "Dataset" in entity_type or "Software" in entity_type:
361-
fmt = _get_format(entity)
362-
if fmt:
363-
formats.add(str(fmt))
387+
# Check for aggregated metrics first
388+
formats_agg = root_data.get("evi:formats")
389+
390+
if formats_agg is not None:
391+
# Use pre-aggregated formats
392+
formats = set(formats_agg)
393+
else:
394+
# Fall back to collecting from metadata_graph
395+
formats = set()
396+
397+
for entity in metadata_graph:
398+
entity_type = _get_type(entity)
399+
400+
if "Dataset" in entity_type or "Software" in entity_type:
401+
fmt = _get_format(entity)
402+
if fmt:
403+
formats.add(str(fmt))
364404

365405
if formats:
366406
fmt_list = sorted(list(formats))[:5]
@@ -415,7 +455,7 @@ def _build_ai_ready_score(value: Any, *, converter_instance) -> AIReadyScore:
415455
_score_pre_model(score.pre_model_explainability, root_data, metadata_graph)
416456
_score_ethics(score.ethics, root_data)
417457
_score_sustainability(score.sustainability, root_data)
418-
_score_computability(score.computability, metadata_graph)
458+
_score_computability(score.computability, root_data, metadata_graph)
419459

420460
return score
421461

fairscape_models/rocrate.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,20 @@ class ROCrateMetadataElem(BaseModel):
114114
rai_annotator_demographics: Optional[List[str]] = Field(alias="rai:annotatorDemographics", default=None)
115115
rai_machine_annotation_tools: Optional[List[str]] = Field(alias="rai:machineAnnotationTools", default=None)
116116

117-
118-
119-
117+
# Aggregated metrics for AI-Ready scoring (roll-up properties from sub-crates)
118+
evi_dataset_count: Optional[int] = Field(alias="evi:datasetCount", default=None)
119+
evi_computation_count: Optional[int] = Field(alias="evi:computationCount", default=None)
120+
evi_software_count: Optional[int] = Field(alias="evi:softwareCount", default=None)
121+
evi_schema_count: Optional[int] = Field(alias="evi:schemaCount", default=None)
122+
evi_total_content_size_bytes: Optional[int] = Field(alias="evi:totalContentSizeBytes", default=None)
123+
evi_entities_with_summary_stats: Optional[int] = Field(alias="evi:entitiesWithSummaryStats", default=None)
124+
evi_entities_with_checksums: Optional[int] = Field(alias="evi:entitiesWithChecksums", default=None)
125+
evi_total_entities: Optional[int] = Field(alias="evi:totalEntities", default=None)
126+
evi_formats: Optional[List[str]] = Field(alias="evi:formats", default=None)
127+
128+
129+
130+
120131
class ROCrateDistribution(BaseModel):
121132
extractedROCrateBucket: Optional[str] = Field(default=None)
122133
archivedROCrateBucket: Optional[str] = Field(default=None)

0 commit comments

Comments (0)