Skip to content

Commit

Permalink
Enhance dataset cards
Browse files Browse the repository at this point in the history
aliberts committed Nov 20, 2024
1 parent f43e5d0 commit c6ad495
Showing 6 changed files with 265 additions and 110 deletions.
27 changes: 27 additions & 0 deletions lerobot/common/datasets/card_template.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
{{ card_data }}
---

This dataset was created using [LeRobot](https://github.com/huggingface/lerobot).

## Dataset Description

{{ dataset_description | default("", true) }}

- **Homepage:** {{ url | default("[More Information Needed]", true)}}
- **Paper [optional]:** {{ paper | default("[More Information Needed]", true)}}
- **License:** {{ license | default("[More Information Needed]", true)}}

## Dataset Structure

{{ dataset_structure | default("[More Information Needed]", true)}}

## Citation [optional]

**BibTeX:**

```bibtex
{{ citation_bibtex | default("[More Information Needed]", true)}}
```
6 changes: 4 additions & 2 deletions lerobot/common/datasets/lerobot_dataset.py
Original file line number Diff line number Diff line change
@@ -467,10 +467,10 @@ def __init__(
def push_to_hub(
self,
tags: list | None = None,
text: str | None = None,
license: str | None = "apache-2.0",
push_videos: bool = True,
private: bool = False,
**card_kwargs,
) -> None:
if not self.consolidated:
raise RuntimeError(
@@ -495,7 +495,9 @@ def push_to_hub(
repo_type="dataset",
ignore_patterns=ignore_patterns,
)
card = create_lerobot_dataset_card(tags=tags, text=text, info=self.meta.info, license=license)
card = create_lerobot_dataset_card(
tags=tags, dataset_info=self.meta.info, license=license, **card_kwargs
)
card.push_to_hub(repo_id=self.repo_id, repo_type="dataset")
create_branch(repo_id=self.repo_id, branch=CODEBASE_VERSION, repo_type="dataset")

64 changes: 29 additions & 35 deletions lerobot/common/datasets/utils.py
Original file line number Diff line number Diff line change
@@ -27,7 +27,7 @@
import pyarrow.compute as pc
import torch
from datasets.table import embed_table_storage
from huggingface_hub import DatasetCard, HfApi
from huggingface_hub import DatasetCard, DatasetCardData, HfApi
from PIL import Image as PILImage
from torchvision import transforms

@@ -50,6 +50,8 @@
---
This dataset was created using [LeRobot](https://github.com/huggingface/lerobot).
## {}
"""

DEFAULT_FEATURES = {
@@ -468,41 +470,33 @@ def create_branch(repo_id, *, branch: str, repo_type: str | None = None) -> None

def create_lerobot_dataset_card(
    tags: list | None = None,
    dataset_info: dict | None = None,
    **kwargs,
) -> DatasetCard:
    """Build the Hugging Face dataset card for a LeRobot dataset.

    Keyword arguments will be used to replace values in
    ./lerobot/common/datasets/card_template.md.
    Note: If specified, license must be one of https://huggingface.co/docs/hub/repositories-licenses.

    Args:
        tags: Extra tags appended after the always-present "LeRobot" tag.
        dataset_info: If truthy, dumped as indented JSON and passed to the
            template as `dataset_structure` (replacing any `dataset_structure`
            supplied through `kwargs`).
        **kwargs: Values for the template placeholders (`dataset_description`,
            `url`, `paper`, `license`, `citation_bibtex`, ...). `license` is
            also written to the card metadata.

    Returns:
        The `DatasetCard` rendered from the template.
    """
    # Function-scope import keeps this block self-contained.
    from pathlib import Path

    card_tags = ["LeRobot"]
    if tags:
        card_tags += tags

    if dataset_info:
        dataset_structure = "[meta/info.json](meta/info.json):\n"
        dataset_structure += f"```json\n{json.dumps(dataset_info, indent=4)}\n```\n"
        # Copy instead of mutating so the caller's kwargs mapping is untouched.
        kwargs = {**kwargs, "dataset_structure": dataset_structure}

    card_data = DatasetCardData(
        license=kwargs.get("license"),
        tags=card_tags,
        task_categories=["robotics"],
        configs=[
            {
                "config_name": "default",
                "data_files": "data/*/*.parquet",
            }
        ],
    )

    # The template lives next to this module; resolving it from __file__ makes
    # the lookup independent of the current working directory. Fall back to the
    # historical repo-root-relative path if the adjacent file is not present.
    template_path = Path(__file__).resolve().parent / "card_template.md"
    if not template_path.is_file():
        template_path = Path("./lerobot/common/datasets/card_template.md")

    return DatasetCard.from_template(
        card_data=card_data,
        template_path=str(template_path),
        **kwargs,
    )
Loading

0 comments on commit c6ad495

Please sign in to comment.