diff --git a/docs/sed/dataset.rst b/docs/sed/dataset.rst index e282423e..183b1a4f 100644 --- a/docs/sed/dataset.rst +++ b/docs/sed/dataset.rst @@ -1,326 +1,3 @@ -Dataset -=================================================== - -SED comes with the ability to download and extract any URL based -datasets. By default, user can the “WSe2”, “TaS2” and “Gd_W110” datasets -but easy to extend this list. - -Getting datasets ------------------------- - -.. code:: python - - import os - from sed.dataset import dataset - -get() -^^^^^ - -The “get” just needs the data name, but another root_dir can be provided. - -Try to interrupt the download process and restart to see that it continues the download from where it stopped - -.. code:: python - - dataset.get("WSe2", remove_zip = False) - -.. parsed-literal:: - - Using default data path for "WSe2": "/datasets/WSe2" - - 3%|▎ | 152M/5.73G [00:02<01:24, 71.3MB/s] - - Using default data path for "WSe2": "/datasets/WSe2" - - 100%|██████████| 5.73G/5.73G [01:09<00:00, 54.3MB/s] - - Download complete. - -Not providing “remove_zip” at all will by default delete the zip file after extraction - -.. code:: python - - dataset.get("WSe2") - -Setting the “use_existing” keyword to False allows to download the data in another location. Default is to use existing data - -.. code:: python - - dataset.get("WSe2", root_dir = "new_datasets", use_existing=False) - -.. parsed-literal:: - - Using specified data path for "WSe2": "/new_datasets/datasets/WSe2" - Created new directory at /new_datasets/datasets/WSe2 - - - 3%|▎ | 152M/5.73G [00:02<01:24, 71.3MB/s] - - -Interrupting extraction has similar behavior to download and just continues from where it stopped. - -Or if user deletes the extracted documents, it re-extracts from zip file - -.. code:: python - - dataset.get("WSe2", remove_zip = False) - - ## Try to remove some files and rerun this command. - -.. parsed-literal:: - - Using default data path for "WSe2": "/datasets/WSe2" - WSe2 data is already fully downloaded. - - - 5.73GB [00:00, 12.6MB/s] - - Download complete. - Extracting WSe2 data... - - - - 100%|██████████| 113/113 [02:41<00:00, 1.43s/file] - - WSe2 data extracted successfully. - -remove() -^^^^^^^^ - -“remove” allows removal of some or all instances of existing data - -This would remove only one of the two existing paths - -.. code:: python - - dataset.remove("WSe2", instance = dataset.existing_data_paths[0]) - -.. parsed-literal:: - - Removed /datasets/WSe2 - -This removes all instances, if any present - -.. code:: python - - dataset.remove("WSe2") - -.. parsed-literal:: - - WSe2 data is not present. - -Attributes useful for user -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -All available datasets after looking at module, user and folder levels - -.. code:: python - - dataset.available - -.. parsed-literal:: - - ['WSe2', 'TaS2', 'Gd_W110'] - -The dir and subdirs where data is located - -.. code:: python - - dataset.dir - -.. parsed-literal:: - - '/datasets/WSe2' - -.. code:: python - - dataset.subdirs - -.. parsed-literal:: - - ['/datasets/WSe2/Scan049_1', - '/datasets/WSe2/energycal_2019_01_08'] - -Existing locations where data is present - -.. code:: python - - dataset.existing_data_paths - -.. parsed-literal:: - - ['/new_dataset/datasets/WSe2', - '/datasets/WSe2'] - -Example of adding custom datasets ---------------------------------- - -DatasetsManager -^^^^^^^^^^^^^^^ - -Allows to add or remove datasets in json file at any level (module, user, folder). - -Looks at all levels to give the available datasets - -.. 
code:: python - - import os - from sed.dataset import DatasetsManager - -We add a new dataset to both folder and user levels - -This dataset also has “rearrange_files” set to True, which takes all files in subfolders and puts them in the main dataset specific directory - -.. code:: python - - example_dset_name = "Example" - example_dset_info = {} - - example_dset_info["url"] = "https://example-dataset.com/download" # not a real path - example_dset_info["subdirs"] = ["Example_subdir"] - example_dset_info["rearrange_files"] = True - - DatasetsManager.add(data_name=example_dset_name, info=example_dset_info, levels=["folder", "user"]) - -.. parsed-literal:: - - Added Example dataset to folder datasets.json - Added Example dataset to user datasets.json - -datasets.json should be available in execution folder after this - -.. code:: python - - assert os.path.exists("./datasets.json") - dataset.available - -.. parsed-literal:: - - ['Example', 'WSe2', 'TaS2', 'Gd_W110'] - -This will remove the Example dataset from the user json file - -.. code:: python - - DatasetsManager.remove(data_name=example_dset_name, levels=["user"]) - -.. parsed-literal:: - - Removed Example dataset from user datasets.json - -Adding dataset that already exists will give an error. Likewise, removing one that doesn’t exist - -.. code:: python - - # This should give an error - DatasetsManager.add(data_name=example_dset_name, info=example_dset_info, levels=["folder"]) - -.. parsed-literal:: - - ValueError: Dataset Example already exists in folder datasets.json. - - -Now that dataset.json with Example exists in current dir, lets try to fetch it - -.. code:: python - - dataset.get("Example") - -.. parsed-literal:: - - Using default data path for "Example": "/datasets/Example" - Created new directory at /datasets/Example - Download complete. - Extracting Example data... - - - 100%|██████████| 4/4 [00:00<00:00, 28.10file/s] - - Example data extracted successfully. - Removed Example.zip file. - Rearranging files in Example_subdir. - - - - 100%|██████████| 3/3 [00:00<00:00, 696.11file/s] - - File movement complete. - Rearranging complete. - -.. code:: python - - print(dataset.dir) - print(dataset.subdirs) - -.. parsed-literal:: - - /datasets/Example - [] - -lets download to another location - -.. code:: python - - dataset.get("Example", root_dir = "new_datasets", use_existing = False) - -.. parsed-literal.. parsed-literal:: - - Using specified data path for "Example": "/new_datasets/datasets/Example" - Created new directory at /new_datasets/datasets/Example - Download complete. - Extracting Example data... - - - 100%|██████████| 4/4 [00:00<00:00, 28.28file/s] - - Example data extracted successfully. - Removed Example.zip file. - Rearranging files in Example_subdir. - - - - 100%|██████████| 3/3 [00:00<00:00, 546.16file/s] - - File movement complete. - Rearranging complete. - -we can remove one instance - -.. code:: python - - print(dataset.existing_data_paths) - path_to_remove = dataset.existing_data_paths[0] - -.. parsed-literal:: - - ['/new_datasets/datasets/Example', '/datasets/Example'] - -.. code:: python - - dataset.remove(data_name="Example", instance=path_to_remove) - -.. parsed-literal:: - - Removed /new_datasets/datasets/Example - -.. code:: python - - assert not os.path.exists(path_to_remove) - -.. code:: python - - print(dataset.existing_data_paths) - -.. parsed-literal:: - - ['/datasets/Example'] - -Default datasets.json ---------------------- - -.. 
literalinclude:: ../../src/sed/config/datasets.json
-   :language: json
-
 API
 ------------------------
 .. automodule:: sed.dataset.dataset
diff --git a/docs/user_guide/dataset.md b/docs/user_guide/dataset.md
new file mode 100644
index 00000000..b61d6d3b
--- /dev/null
+++ b/docs/user_guide/dataset.md
@@ -0,0 +1,279 @@
+# Dataset
+
+## Overview
+SED comes with the ability to download and extract any URL-based dataset. The following datasets are available by default:
+- `WSe2`
+- `TaS2`
+- `Gd_W110`
+- `W110`
+
+It is easy to extend this list using a JSON file.
+
+---
+
+## Getting Datasets
+
+### Importing Required Modules
+```python
+import os
+from sed.dataset import dataset
+```
+
+### `get()` Method
+The `get` method requires only the dataset name, but an alternative `root_dir` can be provided.
+
+Try interrupting the download process and restarting it to see that it resumes from where it stopped.
+
+```python
+dataset.get("WSe2", remove_zip=False)
+```
+
+Example Output:
+```
+Using default data path for "WSe2": "/datasets/WSe2"
+
+3%|▎         | 152M/5.73G [00:02<01:24, 71.3MB/s]
+
+Using default data path for "WSe2": "/datasets/WSe2"
+
+100%|██████████| 5.73G/5.73G [01:09<00:00, 54.3MB/s]
+
+Download complete.
+```
+
+If `remove_zip` is not provided, the zip file is deleted after extraction by default:
+```python
+dataset.get("WSe2")
+```
+
+Setting `use_existing=False` allows downloading the data to a new location instead of using existing data.
+```python
+dataset.get("WSe2", root_dir="new_datasets", use_existing=False)
+```
+
+Example Output:
+```
+Using specified data path for "WSe2": "/new_datasets/datasets/WSe2"
+Created new directory at /new_datasets/datasets/WSe2
+```
+
+Interrupting extraction behaves similarly, resuming from where it stopped.
+
+If the extracted files are deleted, rerunning the command below re-extracts them from the zip file:
+```python
+dataset.get("WSe2", remove_zip=False)
+```
+
+Example Output:
+```
+Using default data path for "WSe2": "/datasets/WSe2"
+WSe2 data is already fully downloaded.
+
+5.73GB [00:00, 12.6MB/s]
+
+Download complete.
+Extracting WSe2 data...
+
+100%|██████████| 113/113 [02:41<00:00, 1.43s/file]
+
+WSe2 data extracted successfully.
+```
+Since `remove_zip=False` is passed, the zip file is kept after extraction.
+
+---
+
+## `remove()` Method
+
+The `remove` method allows removing some or all instances of existing data.
+
+Remove only one instance:
+```python
+dataset.remove("WSe2", instance=dataset.existing_data_paths[0])
+```
+
+Example Output:
+```
+Removed /datasets/WSe2
+```
+
+Remove all instances, if any are present:
+```python
+dataset.remove("WSe2")
+```
+
+Example Output:
+```
+WSe2 data is not present.
+```
+
+---
+
+## Useful Attributes
+
+### Available Datasets
+```python
+dataset.available
+```
+
+Example Output:
+```
+['WSe2', 'TaS2', 'Gd_W110', 'W110']
+```
+
+### Data Directory
+```python
+dataset.dir
+```
+
+Example Output:
+```
+'/datasets/WSe2'
+```
+
+### Subdirectories
+```python
+dataset.subdirs
+```
+
+Example Output:
+```
+['/datasets/WSe2/Scan049_1',
+ '/datasets/WSe2/energycal_2019_01_08']
+```
+
+### Existing Data Paths
+```python
+dataset.existing_data_paths
+```
+
+Example Output:
+```
+['/new_dataset/datasets/WSe2',
+ '/datasets/WSe2']
+```
+
+---
+
+## Example: Adding Custom Datasets
+
+### `DatasetsManager`
+Allows adding or removing datasets in a JSON file at different levels (module, user, folder). It also checks all levels to list the available datasets.
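+
+For reference, each entry in `datasets.json` is keyed by the dataset name and holds at least a download `url`, plus optional keys such as `subdirs` and `rearrange_files` (the same keys used in the example that follows). As a minimal sketch, reusing the illustrative (not real) URL from this guide, such an entry might look like:
+
+```json
+{
+  "Example": {
+    "url": "https://example-dataset.com/download",
+    "subdirs": ["Example_subdir"],
+    "rearrange_files": true
+  }
+}
+```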
+ +```python +import os +from sed.dataset import DatasetsManager +``` + +#### Adding a New Dataset +This example adds a dataset to both the folder and user levels. Setting `rearrange_files=True` moves all files from subfolders into the main dataset directory. + +```python +example_dset_name = "Example" +example_dset_info = { + "url": "https://example-dataset.com/download", # Not a real path + "subdirs": ["Example_subdir"], + "rearrange_files": True +} + +DatasetsManager.add(data_name=example_dset_name, info=example_dset_info, levels=["folder", "user"]) +``` + +Example Output: +``` +Added Example dataset to folder datasets.json +Added Example dataset to user datasets.json +``` + +Verify that `datasets.json` is created: +```python +assert os.path.exists("./datasets.json") +dataset.available +``` + +Example Output: +``` +['Example', 'WSe2', 'TaS2', 'Gd_W110'] +``` + +#### Removing a Dataset +Remove the Example dataset from the user JSON file: +```python +DatasetsManager.remove(data_name=example_dset_name, levels=["user"]) +``` + +Example Output: +``` +Removed Example dataset from user datasets.json +``` + +Adding an already existing dataset will result in an error: +```python +DatasetsManager.add(data_name=example_dset_name, info=example_dset_info, levels=["folder"]) +``` + +Example Output: +``` +ValueError: Dataset Example already exists in folder datasets.json. +``` + +#### Downloading the Example Dataset +```python +dataset.get("Example") +``` + +Example Output: +``` +Using default data path for "Example": "/datasets/Example" +Created new directory at /datasets/Example +Download complete. +Extracting Example data... + +100%|██████████| 4/4 [00:00<00:00, 28.10file/s] + +Example data extracted successfully. +``` + +#### Download to Another Location +```python +dataset.get("Example", root_dir="new_datasets", use_existing=False) +``` + +Example Output: +``` +Using specified data path for "Example": "/new_datasets/datasets/Example" +Created new directory at /new_datasets/datasets/Example +``` + +#### Removing an Instance +```python +print(dataset.existing_data_paths) +path_to_remove = dataset.existing_data_paths[0] +dataset.remove(data_name="Example", instance=path_to_remove) +``` + +Example Output: +``` +Removed /new_datasets/datasets/Example +``` + +Verify that the path was removed: +```python +assert not os.path.exists(path_to_remove) +``` + +```python +print(dataset.existing_data_paths) +``` + +Example Output: +``` +['/datasets/Example'] +``` + +--- + +## Default datasets.json + +```{literalinclude} ../../src/sed/config/datasets.json +:language: json +``` diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md index 40059b94..3355e56b 100644 --- a/docs/user_guide/index.md +++ b/docs/user_guide/index.md @@ -19,6 +19,7 @@ installation ../tutorial/2_conversion_pipeline_for_example_time-resolved_ARPES_data ../tutorial/3_metadata_collection_and_export_to_NeXus config +dataset ``` ## Advanced Topics diff --git a/tests/test_dataset.py b/tests/test_dataset.py index e97a2764..d4f1152d 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,4 +1,14 @@ -"""This code performs several tests for the dataset module. +"""This code performs several tests for the dataset module. 
+
+The tests cover the following functionalities:
+- Checking available datasets
+- Checking dataset availability
+- Setting root directory for datasets
+- Getting file list from dataset directory
+- Downloading dataset
+- Extracting dataset
+- Rearranging dataset
+- Adding and removing datasets using DatasetsManager
 """
 from __future__ import annotations
@@ -20,6 +30,7 @@

 @pytest.fixture
 def zip_buffer():
+    """Fixture to create an in-memory zip file buffer with test files."""
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED) as zip_file:
         zip_file.writestr("test_file.txt", "This is a test file inside the zip.")
@@ -29,19 +40,25 @@ def zip_buffer():


 @pytest.fixture
-def zip_file(fs, zip_buffer):
-    fs.create_dir("test/datasets/Test")
-    with open("test/datasets/Test/Test.zip", "wb") as f:
+def zip_filepath(fs, zip_buffer, tmp_path):
+    """Fixture to create a temporary directory and write the zip buffer to a file."""
+    test_dir = tmp_path / "datasets" / "Test"
+    fs.create_dir(test_dir)
+    with open(test_dir / "Test.zip", "wb") as f:
         f.write(zip_buffer.getvalue())
+    return test_dir


 def test_available_datasets():
+    """Checks the available datasets by comparing with the loaded datasets dictionary."""
     all_dsets = dm.load_datasets_dict()
     del all_dsets["Test"]
     assert ds.available == list(all_dsets.keys())


 def test_check_dataset_availability():
+    """Checks that all available datasets are loaded and tests that an error is raised for
+    unknown datasets."""
     datasets = dm.load_datasets_dict()
     # return dataset information if available
     for data_name in datasets.keys():
@@ -54,36 +71,39 @@
         ds._check_dataset_availability()


-def test_set_root_dir():
+def test_set_root_dir(tmp_path):
+    """Checks all cases of setting the root datasets directory."""
     # test with existing data path
     ds.data_name = "Test"
-    ds._state["data_path"] = ["test/data"]
-    ds._set_data_dir(root_dir="test/data", use_existing=True)
-    assert os.path.abspath("test/data/") == ds._dir
+    ds._state["data_path"] = [str(tmp_path / "test" / "data")]
+    ds._set_data_dir(root_dir=str(tmp_path / "test" / "data"), use_existing=True)
+    assert ds._dir == str((tmp_path / "test" / "data").resolve())

     # test without existing data path
     ds._state["data_path"] = []
-    ds._set_data_dir(root_dir="test/data", use_existing=True)
-    assert os.path.abspath("test/data/datasets/Test") == ds._dir
+    ds._set_data_dir(root_dir=str(tmp_path / "test" / "data"), use_existing=True)
+    assert ds._dir == str((tmp_path / "test" / "data" / "datasets" / "Test").resolve())

     # test without data path and existing data path
     ds._set_data_dir(root_dir=None, use_existing=True)
-    assert os.path.abspath("./datasets/Test") == ds._dir
+    assert f"{os.getcwd()}/datasets/Test" == ds._dir

     # test with provided data path different from existing data path
-    ds._state["data_path"] = ["test/data1"]
-    ds._set_data_dir(root_dir="test/data", use_existing=True)
-    assert os.path.abspath("test/data1/") == ds._dir
-    ds._set_data_dir(root_dir="test/data", use_existing=False)
-    assert os.path.abspath("test/data/datasets/Test") == ds._dir
-
-
-def test_get_file_list(fs):
-    fs.create_file("test/data/file.txt")
-    fs.create_file("test/data/subdir/file.txt")
-    fs.create_file("test/data/subdir/file.zip")
-    fs.create_file("test/data/file.zip")
-    ds._dir = "test/data"
+    ds._state["data_path"] = [str(tmp_path / "test" / "data1")]
+    ds._set_data_dir(root_dir=str(tmp_path / "test" / "data"), use_existing=True)
+    assert ds._dir == str((tmp_path / "test" / 
"data1").resolve()) + ds._set_data_dir(root_dir=str(tmp_path / "test" / "data"), use_existing=False) + assert ds._dir == str((tmp_path / "test" / "data" / "datasets" / "Test").resolve()) + + +def test_get_file_list(fs, tmp_path): + """Test to get the list of files in the dataset directory, including and excluding zip files.""" + test_dir = tmp_path / "test" / "data" + fs.create_file(test_dir / "file.txt") + fs.create_file(test_dir / "subdir" / "file.txt") + fs.create_file(test_dir / "subdir" / "file.zip") + fs.create_file(test_dir / "file.zip") + ds._dir = str(test_dir) assert ["file.txt", "subdir/file.txt"] == ds._get_file_list() assert ["file.txt", "file.zip", "subdir/file.txt", "subdir/file.zip"] == ds._get_file_list( @@ -91,44 +111,47 @@ def test_get_file_list(fs): ) -def test_download_data(fs, requests_mock, zip_buffer): - fs.create_dir("test") +def test_download_data(fs, requests_mock, zip_buffer, tmp_path): + """Test to download a dataset from a URL and verify the downloaded zip file.""" + test_dir = tmp_path / "test" + fs.create_dir(test_dir) data_url = "http://test.com/files/file.zip" requests_mock.get(data_url, content=zip_buffer.getvalue()) ds._data_name = "Test" ds._state = {"data_path": []} - ds._set_data_dir(root_dir="test", use_existing=True) + ds._set_data_dir(root_dir=str(test_dir), use_existing=True) ds._download_data(data_url) - assert os.path.exists("test/datasets/Test/Test.zip") + assert os.path.exists(test_dir / "datasets" / "Test" / "Test.zip") - # assert not ds._download_data("data", "test/data/", data_url) # already exists - -def test_extract_data(zip_file): # noqa: ARG001 +def test_extract_data(zip_filepath): + """Test to extract files from the dataset zip file and verify the extracted files.""" ds._data_name = "Test" - ds._dir = "test/datasets/Test/" + ds._dir = str(zip_filepath) ds._extract_data() - assert os.path.exists("test/datasets/Test/test_file.txt") - assert os.path.exists("test/datasets/Test/subdir/test_subdir.txt") + assert os.path.exists(zip_filepath / "test_file.txt") + assert os.path.exists(zip_filepath / "subdir" / "test_subdir.txt") -def test_rearrange_data(zip_file): # noqa: ARG001 +def test_rearrange_data(zip_filepath): + """Test to rearrange files in the dataset directory and verify the rearranged files.""" ds._data_name = "Test" - ds._dir = "test/datasets/Test/" + ds._dir = str(zip_filepath) ds._subdirs = ["subdir"] ds._extract_data() ds._rearrange_data() - assert os.path.exists("test/datasets/Test/test_file.txt") - assert os.path.exists("test/datasets/Test/test_subdir.txt") - assert not os.path.exists("test/datasets/Test/subdir") + assert os.path.exists(zip_filepath / "test_file.txt") + assert os.path.exists(zip_filepath / "test_subdir.txt") + assert not os.path.exists(zip_filepath / "subdir") with pytest.raises(FileNotFoundError): ds._subdirs = ["non_existing_subdir"] ds._rearrange_data() -def test_get_remove_dataset(requests_mock, zip_buffer): - json_path_user = USER_CONFIG_PATH.joinpath("datasets.json") +def test_get_remove_dataset(requests_mock, zip_buffer, tmp_path): + """Test to get a dataset, verify its directory and files, and then remove the dataset.""" + json_path_user = tmp_path / USER_CONFIG_PATH / "datasets.json" data_name = "Test" _ = dm.load_datasets_dict() # to ensure datasets.json is in user dir @@ -137,11 +160,11 @@ def test_get_remove_dataset(requests_mock, zip_buffer): data_url = "http://test.com/files/file.zip" requests_mock.get(data_url, content=zip_buffer.getvalue()) - ds.get(data_name) - assert ds.dir == 
os.path.abspath(os.path.join("./datasets", data_name))
+    ds.get(data_name, root_dir=str(tmp_path))
+    assert ds.dir == str((tmp_path / "datasets" / data_name).resolve())

     # check if subdir is removed after rearranging
-    assert not os.path.exists("./datasets/Test/subdir")
+    assert not os.path.exists(tmp_path / "datasets" / "Test" / "subdir")

     # check datasets file to now have data_path listed
     datasets_json = json.load(open(json_path_user))
@@ -149,14 +172,16 @@
     assert datasets_json[data_name]["files"]

     ds.remove(data_name)
-    assert not os.path.exists(os.path.join("./datasets", data_name))
+    assert not os.path.exists(tmp_path / "datasets" / data_name)

-    ds.get(data_name)
-    ds.get(data_name)
+    ds.get(data_name, root_dir=str(tmp_path))
+    ds.get(data_name, root_dir=str(tmp_path))
     ds.remove(data_name, ds.existing_data_paths[0])


-def test_datasets_manager():
+def test_datasets_manager(tmp_path):  # noqa: ARG001
+    """Tests adding a dataset using DatasetsManager, verifying its addition,
+    removing it, and checking that an error is raised."""
     dm.add(
         "Test_DM",
         {"url": "http://test.com/files/file.zip", "subdirs": ["subdir"]},