catalyst-cooperative · e-belfer · Jan 16, 2025 · Jan 7, 2025 · Jan 8, 2025 · Jan 8, 2025
diff --git a/README.md b/README.md
@@ -90,7 +90,29 @@ There are also four optional flags available:
 
 ## Adding a new dataset
 
-### Step 1: Implement archiver interface
+### Step 1: Define the dataset's metadata
+For each dataset we archive, we record information about the title, a description, who
+contributed to archiving the dataset, the segments into which the data files are
+partitioned, its license and keywords. This
+information is used to communicate about the dataset's usage and provenance to any
+future users.
+
+* Title: The title of your dataset should clearly contain the agency publishing the data and a non-abbreviated title (e.g., EIA Manufacturing Energy Consumption Survey, not EIA MECS).
+* Path: The link to the dataset's "homepage", where information about the dataset and the path to download it can be found.
+* Working partitions: A dictionary where the key is the name of the partition (e.g., month, year, form), and the values are the actual available partitions (e.g., 2002-2020).
+* License: We only archive data with an open source license (e.g., US Government Works or a Creative Commons License), so make sure any data you're archiving is licensed for re-distribution.
+* Keywords: Words that someone might use to search for this dataset. These are used to help people find our data on Zenodo.
+
+If your dataset will be integrated directly into
+[PUDL](https://github.com/catalyst-cooperative/pudl), you'll need to add the metadata
+for the dataset into the PUDL repository in the `SOURCES` dictionary in
+`src/pudl/metadata/sources.py`.
+
+If you aren't sure, or you're archiving data that won't go into PUDL, you'll want to
+add your metadata as an entry into the `NON_PUDL_SOURCES` dictionary in
+`src/pudl_archiver/metadata/sources.py`.
+
+### Step 2: Implement archiver interface
 
 All of the archivers inherit from the `AbstractDatasetArchiver` base class (defined
 in `src/pudl_archiver/archiver/classes.py`. There is only a single method that each
@@ -130,7 +152,7 @@ hyperlinks matching the pattern on the page pointed to by the URL. This is usefu
 there's a page containing links to a series of data resources that have somewhat
 structured names.
 
-### Step 2: Run --initialize command
+### Step 3: Run --initialize command
 
 You will need to run the initialize command to create a new zenodo deposition, and
 update the config file with the new DOI:
@@ -145,7 +167,7 @@ require you to create your own
 [Zenodo validation credentials](https://zenodo.org/account/settings/applications/tokens/new/)
 if you are not a core Catalyst developer.
 
-### Step 3: Manually review your archive before publication.
+### Step 4: Manually review your archive before publication.
 
 If the archiver run is successful, it will produce a link to the draft archive. Though
 many of the validation steps are automated, it is worthwhile manually reviewing archives

diff --git a/src/pudl_archiver/archivers/eia/mecs.py → src/pudl_archiver/archivers/eia/eiamecs.py b/src/pudl_archiver/archivers/eia/mecs.py → src/pudl_archiver/archivers/eia/eiamecs.py
@@ -16,7 +16,7 @@
 class EiaMECSArchiver(AbstractDatasetArchiver):
     """EIA MECS archiver."""
 
-    name = "mecs"
+    name = "eiamecs"
 
     async def get_resources(self) -> ArchiveAwaitable:
         """Download EIA-MECS resources."""

diff --git a/src/pudl_archiver/depositors/zenodo/entities.py b/src/pudl_archiver/depositors/zenodo/entities.py
@@ -9,9 +9,11 @@
 from typing import Annotated, Literal
 
 from pudl.metadata.classes import Contributor, DataSource
+from pudl.metadata.sources import SOURCES
 from pydantic import BaseModel, Field, StringConstraints, field_validator
 
 from pudl_archiver.depositors.depositor import DepositionState
+from pudl_archiver.metadata.sources import NON_PUDL_SOURCES
 from pudl_archiver.utils import Url
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
@@ -112,7 +114,9 @@ def check_empty_string(cls, doi: str):  # noqa: N805
     @classmethod
     def from_data_source(cls, data_source_id: str) -> "DepositionMetadata":
         """Construct deposition metadata object from PUDL DataSource model."""
-        data_source = DataSource.from_id(data_source_id)
+        # Identify whether metadata originates from PUDL or archiver repo
+        sources = SOURCES if data_source_id in SOURCES else NON_PUDL_SOURCES
+        data_source = DataSource.from_id(data_source_id, sources=sources)
         creators = [
             DepositionCreator.from_contributor(contributor)
             for contributor in data_source.contributors

diff --git a/src/pudl_archiver/frictionless.py b/src/pudl_archiver/frictionless.py
@@ -9,8 +9,10 @@
 
 from pudl.metadata.classes import Contributor, DataSource, License
 from pudl.metadata.constants import CONTRIBUTORS
+from pudl.metadata.sources import SOURCES
 from pydantic import BaseModel, Field, field_serializer
 
+from pudl_archiver.metadata.sources import NON_PUDL_SOURCES
 from pudl_archiver.utils import Url
 
 MEDIA_TYPES: dict[str, str] = {
@@ -149,9 +151,13 @@ def new_datapackage(
                 containing the local path to the resource, and its working partitions.
             version: Version string for current deposition version.
         """
-        if name == "mecs":
-            return cls.mecs(resources=resources, version=version)
-        return cls.from_pudl_metadata(name=name, resources=resources, version=version)
+        if name in SOURCES:  # If data source in PUDL source metadata
+            return cls.from_pudl_metadata(
+                name=name, resources=resources, version=version
+            )
+        return cls.from_non_pudl_metadata(
+            name=name, resources=resources, version=version
+        )
 
     @classmethod
     def from_pudl_metadata(
@@ -161,7 +167,7 @@ def from_pudl_metadata(
         version: str | None,
     ) -> "DataPackage":
         """Create a datapackage using PUDL metadata associated with ``name``."""
-        data_source = DataSource.from_id(name)
+        data_source = DataSource.from_id(name, sources=SOURCES)
 
         return DataPackage(
             name=f"pudl-raw-{data_source.name}",
@@ -177,30 +183,25 @@ def from_pudl_metadata(
         )
 
     @classmethod
-    def mecs(cls, resources: Iterable[Resource], version: str | None):
-        """Hack method to create a Datapackage for EIA MECS data not in PUDL metadata."""
+    def from_non_pudl_metadata(
+        cls,
+        name: str,
+        resources: Iterable[Resource],
+        version: str | None,
+    ):
+        """Create a datapackage for sources that won't end up in PUDL."""
+        data_source = DataSource.from_id(name, sources=NON_PUDL_SOURCES)
+
         return DataPackage(
-            name="MECS",
-            title="EIA MECS data",
-            sources=[
-                {
-                    "title": "EIA MECS data",
-                    "path": "https://www.eia.gov/consumption/manufacturing/data/2018/",
-                }
-            ],
-            licenses=[
-                License(
-                    **{
-                        "name": "CC-BY-4.0",
-                        "title": "Creative Commons Attribution 4.0",
-                        "path": "https://creativecommons.org/licenses/by/4.0",
-                    }
-                )
-            ],
+            name=data_source.name,
+            title=data_source.title,
+            sources=[{"title": data_source.title, "path": str(data_source.path)}],
+            licenses=[data_source.license_raw],
             resources=sorted(resources, key=lambda x: x.name),  # Sort by filename
-            contributors=[CONTRIBUTORS["catalyst-cooperative"]],
-            created=str(datetime.datetime.now(tz=datetime.UTC).isoformat()),
-            keywords=["MECS"],
-            description="According to the EIA, the Manufacturing Energy Consumption Survey (MECS) is a national sample survey that collects information on the stock of U.S. manufacturing establishment, their energy-related building characteristics, and their energy consumption and expenditures.",
+            # Make it easier to add contributors directly into source dictionary
+            contributors=data_source.contributors,
+            created=datetime.datetime.now(tz=datetime.UTC).isoformat(),
+            keywords=data_source.keywords,
+            description=data_source.description,
             version=version,
         )
diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
@@ -0,0 +1,47 @@
+"""Metadata and operational constants."""
+
+from typing import Any
+
+from pudl.metadata.constants import CONTRIBUTORS, LICENSES
+
+# To add a new contributor, add an entry to the ADDL_CONTRIBUTORS dictionary below,
+# formatted like this:
+#     "name-shorthand": {
+#         "title": "Catalyst Cooperative",
+#         "email": "pudl@catalyst.coop",
+#         "path": "https://catalyst.coop",
+#         "role": "publisher",
+#         "zenodo_role": "distributor",
+#         "organization": "Catalyst Cooperative",
+#         "orcid": "0000-1234-5678-9101"
+#     }
+# Note that the only required fields are title (your name) and path
+# (e.g., a link to your Github account, your ORCID site or a personal webpage), but
+# filling other fields is strongly encouraged!
+ADDL_CONTRIBUTORS: dict[str, dict[str, str]] = {}
+
+NON_PUDL_SOURCES: dict[str, Any] = {
+    "eiamecs": {
+        "title": "EIA Manufacturing Energy Consumption Survey",
+        "path": "https://www.eia.gov/consumption/manufacturing/data/2018/",
+        "description": (
+            "EIA Form 846 A and B is more commonly known as the Manufacturing Energy "
+            "Consumption Survey (MECS). MECS is a national sample survey that collects "
+            "information on the stock of U.S. manufacturing establishment, their "
+            "energy-related building characteristics, and their energy consumption "
+            "and expenditures. MECS is conducted every four years."
+        ),
+        "working_partitions": {
+            "years": [1991, 1994, 1998, 2002, 2006, 2010, 2014, 2018]
+        },  # One partition per MECS survey year.
+        "keywords": sorted(
+            {
+                "manufacturing",
+                "MECS",
+            }
+        ),
+        "license_raw": LICENSES["us-govt"],
+        "license_pudl": LICENSES["cc-by-4.0"],
+        "contributors": [CONTRIBUTORS["catalyst-cooperative"]],
+    },
+}