Skip to content

Commit

Permalink
Merge pull request #18 from deepghs/dev/sankaku
Browse files Browse the repository at this point in the history
dev(narugo): add sankaku data pool
  • Loading branch information
narugo1992 authored Dec 19, 2024
2 parents c3fbcc5 + 323131b commit 85b7786
Show file tree
Hide file tree
Showing 18 changed files with 201 additions and 4 deletions.
1 change: 1 addition & 0 deletions cheesechaser/datapool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .realbooru import RealbooruDataPool
from .rule34 import Rule34DataPool, Rule34WebpDataPool
from .safebooru import SafebooruDataPool, SafebooruWebpDataPool
from .sankaku import SankakuDataPool, SankakuWebpDataPool
from .table import TableBasedHfDataPool, SimpleTableHfDataPool
from .threedbooru import ThreedbooruDataPool
from .yande import YandeDataPool, YandeWebpDataPool
Expand Down
1 change: 0 additions & 1 deletion cheesechaser/datapool/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ def batch_download_to_directory(self, resource_ids, dst_dir: str, max_workers: i
downloads for improved performance.
:param resource_ids: List of resource IDs or tuples of (resource_id, resource_info) to download.
:type resource_ids: Iterable[Union[str, int, Tuple[str, Any]]]
:param dst_dir: Destination directory for downloaded files.
:type dst_dir: str
:param max_workers: Maximum number of worker threads for parallel downloads. Defaults to 12.
Expand Down
5 changes: 4 additions & 1 deletion cheesechaser/datapool/gelbooru.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ class GelbooruDataPool(IncrementIDDataPool):
the images are stored in a directory structure with 4 levels of subdirectories.
"""

def __init__(self, revision: str = 'main'):
def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
"""
Initialize the GelbooruDataPool.
:param revision: The revision of the dataset to use, defaults to 'main'.
:type revision: str
:param hf_token: Hugging Face authentication token, defaults to None.
:type hf_token: Optional[str]
"""
IncrementIDDataPool.__init__(
self,
Expand All @@ -54,6 +56,7 @@ def __init__(self, revision: str = 'main'):
idx_repo_id=_GELBOORU_REPO,
idx_revision=revision,
base_level=[3, 4],
hf_token=hf_token,
)


Expand Down
5 changes: 4 additions & 1 deletion cheesechaser/datapool/rule34.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ class Rule34DataPool(IncrementIDDataPool):
the images are stored in a directory structure with 4 levels of subdirectories.
"""

def __init__(self, revision: str = 'main'):
def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
"""
Initialize the Rule34DataPool.
:param revision: The revision of the dataset to use, defaults to 'main'.
:type revision: str
:param hf_token: Hugging Face authentication token, defaults to None.
:type hf_token: Optional[str]
"""
IncrementIDDataPool.__init__(
self,
Expand All @@ -54,6 +56,7 @@ def __init__(self, revision: str = 'main'):
idx_repo_id=_RULE34_REPO,
idx_revision=revision,
base_level=[3, 4],
hf_token=hf_token,
)


Expand Down
5 changes: 4 additions & 1 deletion cheesechaser/datapool/safebooru.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ class SafebooruDataPool(IncrementIDDataPool):
the images are stored in a directory structure with 4 levels of subdirectories.
"""

def __init__(self, revision: str = 'main'):
def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
"""
Initialize the SafebooruDataPool.
:param revision: The revision of the dataset to use, defaults to 'main'.
:type revision: str
:param hf_token: Hugging Face authentication token, defaults to None.
:type hf_token: Optional[str]
"""
IncrementIDDataPool.__init__(
self,
Expand All @@ -54,6 +56,7 @@ def __init__(self, revision: str = 'main'):
idx_repo_id=_GELBOORU_REPO,
idx_revision=revision,
base_level=[3, 4],
hf_token=hf_token,
)


Expand Down
135 changes: 135 additions & 0 deletions cheesechaser/datapool/sankaku.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""
This module provides data pool classes for accessing Sankaku image data.
It contains two classes:
1. SankakuDataPool: For accessing the full Sankaku dataset.
2. SankakuWebpDataPool: For accessing the WebP-formatted Sankaku dataset with 4M pixel images.
Both classes inherit from IncrementIDDataPool and provide easy access to the respective datasets
stored in Hugging Face repositories. These classes simplify the process of retrieving and working
with Sankaku image data, allowing users to easily integrate this data into their projects or
research.
.. note::
The datasets `deepghs/sankaku_full <https://huggingface.co/datasets/deepghs/sankaku_full>`_ and
`deepghs/sankaku-webp-4Mpixel <https://huggingface.co/datasets/deepghs/sankaku-webp-4Mpixel>`_
are gated; you have to obtain access to them before using this module.
"""

import os.path
from collections import defaultdict
from threading import Lock
from typing import Dict, List, Iterable, Optional

from hfutils.operate import get_hf_fs
from hfutils.utils import parse_hf_fs_path, hf_fs_path
from natsort import natsorted

from .base import IncrementIDDataPool

_SANKAKU_REPO = 'deepghs/sankaku_full'


class SankakuDataPool(IncrementIDDataPool):
    """
    Data pool for the full Sankaku dataset.

    Backed by the 'deepghs/sankaku_full' Hugging Face repository; retrieves
    image data by numeric image ID, resolving candidate tar archives from a
    lazily-built listing of the repository's ``images/**/*.tar`` files.

    :param revision: The revision of the dataset to use, defaults to 'main'.
    :type revision: str
    :param hf_token: Hugging Face authentication token, defaults to None.
    :type hf_token: Optional[str]

    .. note::
        Files are organized with base levels ``[3, 4]``, i.e. images are
        stored under directory structures with 3 and 4 levels of
        subdirectories.
    """

    def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
        """
        Initialize the SankakuDataPool.

        :param revision: The revision of the dataset to use, defaults to 'main'.
        :type revision: str
        :param hf_token: Hugging Face authentication token, defaults to None.
        :type hf_token: Optional[str]
        """
        IncrementIDDataPool.__init__(
            self,
            data_repo_id=_SANKAKU_REPO,
            data_revision=revision,
            idx_repo_id=_SANKAKU_REPO,
            idx_revision=revision,
            base_level=[3, 4],
            hf_token=hf_token,
        )
        # Lazily-populated mapping: tar basename -> list of repo-relative paths.
        self._tar_files: Optional[Dict[str, List[str]]] = None
        # Guards the one-time population of _tar_files across threads.
        self._lock = Lock()

    def _get_tar_files(self) -> Dict[str, List[str]]:
        """
        Return the archive index, listing the repository on first use.

        The listing is performed once under the lock and cached for all
        subsequent calls.
        """
        with self._lock:
            if self._tar_files is None:
                hf_fs = get_hf_fs(hf_token=self._hf_token)
                all_paths = hf_fs.glob(hf_fs_path(
                    repo_id=self.data_repo_id,
                    repo_type='dataset',
                    filename='images/**/*.tar',
                    revision=self.data_revision,
                ))
                ordered = natsorted(parse_hf_fs_path(path).filename for path in all_paths)

                mapping = defaultdict(list)
                for tar_file in ordered:
                    mapping[os.path.basename(tar_file)].append(tar_file)
                self._tar_files = mapping

        return self._tar_files

    def _request_possible_archives(self, resource_id) -> Iterable[str]:
        """
        Return the tar archives that may contain *resource_id*.

        Archives are bucketed by the last three digits of the id, with a
        literal ``'0'`` prefix in the archive file name.
        """
        bucket = resource_id % 1000
        return self._get_tar_files()[f'0{bucket:03d}.tar']


_SANKAKU_WEBP_REPO = 'deepghs/sankaku-webp-4Mpixel'


class SankakuWebpDataPool(IncrementIDDataPool):
    """
    Data pool for the WebP-formatted Sankaku dataset (4M-pixel images).

    Backed by the 'deepghs/sankaku-webp-4Mpixel' Hugging Face repository;
    retrieves WebP-formatted image data by numeric image ID.

    :param revision: The revision of the dataset to use, defaults to 'main'.
    :type revision: str
    :param hf_token: Hugging Face authentication token, defaults to None.
    :type hf_token: Optional[str]

    .. note::
        Files are organized with a base level of 3, i.e. images are stored
        under a directory structure with 3 levels of subdirectories.
        Authentication may be required to access this dataset.
    """

    def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
        """
        Initialize the SankakuWebpDataPool.

        :param revision: The revision of the dataset to use, defaults to 'main'.
        :type revision: str
        :param hf_token: Hugging Face authentication token, defaults to None.
        :type hf_token: Optional[str]
        """
        super().__init__(
            data_repo_id=_SANKAKU_WEBP_REPO,
            data_revision=revision,
            idx_repo_id=_SANKAKU_WEBP_REPO,
            idx_revision=revision,
            base_level=3,
            hf_token=hf_token,
        )
1 change: 1 addition & 0 deletions docs/source/api_doc/datapool/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ cheesechaser.datapool
realbooru
rule34
safebooru
sankaku
table
threedbooru
yande
Expand Down
24 changes: 24 additions & 0 deletions docs/source/api_doc/datapool/sankaku.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
cheesechaser.datapool.sankaku
========================================================

.. currentmodule:: cheesechaser.datapool.sankaku

.. automodule:: cheesechaser.datapool.sankaku


SankakuDataPool
-----------------------------------------------------

.. autoclass:: SankakuDataPool
:members: __doc__,__init__,__module__



SankakuWebpDataPool
-----------------------------------------------------

.. autoclass:: SankakuWebpDataPool
:members: __doc__,__init__,__module__



28 changes: 28 additions & 0 deletions test/datapool/test_sankaku.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pytest
from hbutils.testing import isolated_directory

from cheesechaser.datapool import SankakuWebpDataPool, SankakuDataPool
from ..testings import get_testfile, dir_compare


@pytest.mark.unittest
class TestDatapoolSankaku:
    """Integration tests downloading a small sample from the Sankaku pools."""

    def test_sankaku_origin(self):
        # Download original-format images into an isolated working directory,
        # then compare the tree against the recorded fixture.
        with isolated_directory():
            data_pool = SankakuDataPool()
            data_pool.batch_download_to_directory(
                resource_ids=[4000000, 4000001, 4000002, 36863304, 36863146],
                dst_dir='.',
            )

            dir_compare('.', get_testfile('sankaku_5'))

    def test_sankaku_webp(self):
        # Same sample ids as above, fetched from the WebP-formatted pool.
        with isolated_directory():
            data_pool = SankakuWebpDataPool()
            data_pool.batch_download_to_directory(
                resource_ids=[4000000, 4000001, 4000002, 36863304, 36863146],
                dst_dir='.',
            )

            dir_compare('.', get_testfile('sankaku_webp_5'))
Binary file added test/testfile/sankaku_5/36863146.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/testfile/sankaku_5/36863304.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/testfile/sankaku_5/4000000.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/testfile/sankaku_5/4000001.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/testfile/sankaku_5/4000002.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test/testfile/sankaku_webp_5/36863146.webp
Binary file not shown.
Binary file added test/testfile/sankaku_webp_5/4000000.webp
Binary file not shown.
Binary file added test/testfile/sankaku_webp_5/4000001.webp
Binary file not shown.
Binary file added test/testfile/sankaku_webp_5/4000002.webp
Binary file not shown.

0 comments on commit 85b7786

Please sign in to comment.