From e724bd7819fb78da2cf57b563116def5a68d4a1d Mon Sep 17 00:00:00 2001
From: e-belfer <ella.belfer@catalyst.coop>
Date: Tue, 7 Jan 2025 10:37:47 -0500
Subject: [PATCH 1/4] Move MECS to metadata sources dictionary, add
 non-pudl-metadata method

---
 src/pudl_archiver/frictionless.py     | 53 ++++++++++++++-------------
 src/pudl_archiver/metadata/sources.py | 38 +++++++++++++++++++
 2 files changed, 65 insertions(+), 26 deletions(-)
 create mode 100644 src/pudl_archiver/metadata/sources.py

diff --git a/src/pudl_archiver/frictionless.py b/src/pudl_archiver/frictionless.py
index 34796d9f..31554bef 100644
--- a/src/pudl_archiver/frictionless.py
+++ b/src/pudl_archiver/frictionless.py
@@ -9,8 +9,10 @@
 
 from pudl.metadata.classes import Contributor, DataSource, License
 from pudl.metadata.constants import CONTRIBUTORS
+from pudl.metadata.sources import SOURCES
 from pydantic import BaseModel, Field, field_serializer
 
+from pudl_archiver.metadata.sources import NON_PUDL_SOURCES
 from pudl_archiver.utils import Url
 
 MEDIA_TYPES: dict[str, str] = {
@@ -149,9 +151,13 @@ def new_datapackage(
                 containing the local path to the resource, and its working partitions.
             version: Version string for current deposition version.
         """
-        if name == "mecs":
-            return cls.mecs(resources=resources, version=version)
-        return cls.from_pudl_metadata(name=name, resources=resources, version=version)
+        if name in SOURCES:  # If data source in PUDL source metadata
+            return cls.from_pudl_metadata(
+                name=name, resources=resources, version=version
+            )
+        return cls.from_non_pudl_metadata(
+            name=name, resources=resources, version=version
+        )
 
     @classmethod
     def from_pudl_metadata(
@@ -177,30 +183,25 @@ def from_pudl_metadata(
         )
 
     @classmethod
-    def mecs(cls, resources: Iterable[Resource], version: str | None):
-        """Hack method to create a Datapackage for EIA MECS data not in PUDL metadata."""
+    def from_non_pudl_metadata(
+        cls,
+        name: str,
+        resources: Iterable[Resource],
+        version: str | None,
+    ):
+        """Create a datapackage for sources that won't end up in PUDL."""
+        data_source = DataSource.from_id(name, sources=NON_PUDL_SOURCES)
+
         return DataPackage(
-            name="MECS",
-            title="EIA MECS data",
-            sources=[
-                {
-                    "title": "EIA MECS data",
-                    "path": "https://www.eia.gov/consumption/manufacturing/data/2018/",
-                }
-            ],
-            licenses=[
-                License(
-                    **{
-                        "name": "CC-BY-4.0",
-                        "title": "Creative Commons Attribution 4.0",
-                        "path": "https://creativecommons.org/licenses/by/4.0",
-                    }
-                )
-            ],
+            name=data_source.name,
+            title=data_source.title,
+            sources=[{"title": data_source.title, "path": str(data_source.path)}],
+            licenses=[data_source.license_raw],
             resources=sorted(resources, key=lambda x: x.name),  # Sort by filename
-            contributors=[CONTRIBUTORS["catalyst-cooperative"]],
-            created=str(datetime.datetime.now(tz=datetime.UTC).isoformat()),
-            keywords=["MECS"],
-            description="According to the EIA, the Manufacturing Energy Consumption Survey (MECS) is a national sample survey that collects information on the stock of U.S. manufacturing establishment, their energy-related building characteristics, and their energy consumption and expenditures.",
+            # Contributors are read from the source entry so they can be added there
+            contributors=data_source.contributors,
+            created=datetime.datetime.now(tz=datetime.UTC).isoformat(),
+            keywords=data_source.keywords,
+            description=data_source.description,
             version=version,
         )
diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
new file mode 100644
index 00000000..49f5fdab
--- /dev/null
+++ b/src/pudl_archiver/metadata/sources.py
@@ -0,0 +1,38 @@
+"""Metadata and operational constants."""
+
+from typing import Any
+
+from pudl.metadata.constants import CONTRIBUTORS, LICENSES
+
+NON_PUDL_SOURCES: dict[str, Any] = {
+    "eiamecs": {
+        "title": "EIA Manufacturing Energy Consumption Survey",
+        "path": "https://www.eia.gov/consumption/manufacturing/data/2018/",
+        "description": (
+            "EIA Form 846 A and B is more commonly known as the Manufacturing Energy",
+            "Consumption Survey (MECS). MECS is a national sample survey that collects",
+            "information on the stock of U.S. manufacturing establishment, their",
+            "energy-related building characteristics, and their energy consumption",
+            "and expenditures. MECS is conducted every four years.",
+        ),
+        "working_partitions": {
+            1991,
+            1994,
+            1998,
+            2002,
+            2006,
+            2010,
+            2014,
+            2018,
+        },  # Census DP1 is monolithic.
+        "keywords": sorted(
+            {
+                "manufacturing",
+                "MECS",
+            }
+        ),
+        "license_raw": LICENSES["us-govt"],
+        "license_pudl": LICENSES["cc-by-4.0"],
+        "contributors": [CONTRIBUTORS["catalyst-cooperative"]],
+    },
+}

From e2a4fb50d5f3bbd33f8ba500ac483ff94bb7f13d Mon Sep 17 00:00:00 2001
From: e-belfer <ella.belfer@catalyst.coop>
Date: Wed, 8 Jan 2025 11:48:18 -0500
Subject: [PATCH 2/4] Make archiving flow work, rename to eiamecs, update
 README

---
 README.md                                     | 28 +++++++++++++--
 .../archivers/eia/{mecs.py => eiamecs.py}     |  2 +-
 .../depositors/zenodo/entities.py             | 10 +++++-
 src/pudl_archiver/frictionless.py             |  6 +++-
 src/pudl_archiver/metadata/sources.py         | 35 ++++++++++++-------
 5 files changed, 62 insertions(+), 19 deletions(-)
 rename src/pudl_archiver/archivers/eia/{mecs.py => eiamecs.py} (98%)

diff --git a/README.md b/README.md
index d648eb61..595e1382 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,29 @@ There are also four optional flags available:
 
 ## Adding a new dataset
 
-### Step 1: Implement archiver interface
+### Step 1: Define the dataset's metadata
+For each dataset we archive, we record its title, a description, the
+people who contributed to archiving it, the partitions into which its
+data files are divided, its license, and its keywords. This information
+communicates the dataset's usage and provenance to any future users of
+the archive.
+
+* Title: The title of your dataset should clearly contain the agency publishing the data and a non-abbreviated title (e.g., EIA Manufacturing Energy Consumption Survey, not EIA MECS).
+* Path: The link to the dataset's "homepage", where information about the dataset and the path to download it can be found.
+* Working partitions: A dictionary where the key is the name of the partition (e.g., month, year, form), and the values are the actual available partitions (e.g., 2002-2020).
+* License: We only archive data with an open source license (e.g., US Government Works or a Creative Commons License), so make sure any data you're archiving is licensed for re-distribution.
+* Keywords: Words that someone might use to search for this dataset. These are used to help people find our data on Zenodo.
+
+If your dataset will be integrated directly into
+[PUDL](https://github.com/catalyst-cooperative/pudl), you'll need to add the metadata
+for the dataset into the PUDL repository in the `SOURCES` dictionary in
+`src/pudl/metadata/sources.py`.
+
+If you aren't sure, or you're archiving data that won't go into PUDL, you'll want to
+add your metadata as an entry into the `NON_PUDL_SOURCES` dictionary in
+`src/pudl_archiver/metadata/sources.py`.
+
+### Step 2: Implement archiver interface
 
 All of the archivers inherit from the `AbstractDatasetArchiver` base class (defined
 in `src/pudl_archiver/archiver/classes.py`. There is only a single method that each
@@ -130,7 +152,7 @@ hyperlinks matching the pattern on the page pointed to by the URL. This is usefu
 there's a page containing links to a series of data resources that have somewhat
 structured names.
 
-### Step 2: Run --initialize command
+### Step 3: Run --initialize command
 
 You will need to run the initialize command to create a new zenodo deposition, and
 update the config file with the new DOI:
@@ -145,7 +167,7 @@ require you to create your own
 [Zenodo validation credentials](https://zenodo.org/account/settings/applications/tokens/new/)
 if you are not a core Catalyst developer.
 
-### Step 3: Manually review your archive before publication.
+### Step 4: Manually review your archive before publication.
 
 If the archiver run is successful, it will produce a link to the draft archive. Though
 many of the validation steps are automated, it is worthwhile manually reviewing archives
diff --git a/src/pudl_archiver/archivers/eia/mecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py
similarity index 98%
rename from src/pudl_archiver/archivers/eia/mecs.py
rename to src/pudl_archiver/archivers/eia/eiamecs.py
index cb2b5d41..c3ef0fbf 100644
--- a/src/pudl_archiver/archivers/eia/mecs.py
+++ b/src/pudl_archiver/archivers/eia/eiamecs.py
@@ -16,7 +16,7 @@
 class EiaMECSArchiver(AbstractDatasetArchiver):
     """EIA MECS archiver."""
 
-    name = "mecs"
+    name = "eiamecs"
 
     async def get_resources(self) -> ArchiveAwaitable:
         """Download EIA-MECS resources."""
diff --git a/src/pudl_archiver/depositors/zenodo/entities.py b/src/pudl_archiver/depositors/zenodo/entities.py
index e3bddbee..2b04ee71 100644
--- a/src/pudl_archiver/depositors/zenodo/entities.py
+++ b/src/pudl_archiver/depositors/zenodo/entities.py
@@ -9,9 +9,11 @@
 from typing import Annotated, Literal
 
 from pudl.metadata.classes import Contributor, DataSource
+from pudl.metadata.sources import SOURCES
 from pydantic import BaseModel, Field, StringConstraints, field_validator
 
 from pudl_archiver.depositors.depositor import DepositionState
+from pudl_archiver.metadata.sources import NON_PUDL_SOURCES
 from pudl_archiver.utils import Url
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
@@ -112,7 +114,13 @@ def check_empty_string(cls, doi: str):  # noqa: N805
     @classmethod
     def from_data_source(cls, data_source_id: str) -> "DepositionMetadata":
         """Construct deposition metadata object from PUDL DataSource model."""
-        data_source = DataSource.from_id(data_source_id)
+        # Identify whether metadata originates from PUDL or archiver repo
+        sources = SOURCES if data_source_id in SOURCES else NON_PUDL_SOURCES
+        # TODO: This is a hacky workaround to having to update from_id to take sources
+        # in PUDL - should I just fix this at the source to make it more legible?
+        data_source = DataSource(
+            **DataSource.dict_from_id(x=data_source_id, sources=sources)
+        )
         creators = [
             DepositionCreator.from_contributor(contributor)
             for contributor in data_source.contributors
diff --git a/src/pudl_archiver/frictionless.py b/src/pudl_archiver/frictionless.py
index 31554bef..c5912b6e 100644
--- a/src/pudl_archiver/frictionless.py
+++ b/src/pudl_archiver/frictionless.py
@@ -190,7 +190,11 @@ def from_non_pudl_metadata(
         version: str | None,
     ):
         """Create a datapackage for sources that won't end up in PUDL."""
-        data_source = DataSource.from_id(name, sources=NON_PUDL_SOURCES)
+        # TODO: This is a slightly ugly workaround to avoid having to add sources into
+        # the from_id method - should I just fix this at the source?
+        data_source = DataSource(
+            **DataSource.dict_from_id(x=name, sources=NON_PUDL_SOURCES)
+        )
 
         return DataPackage(
             name=data_source.name,
diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py
index 49f5fdab..db3bc8c9 100644
--- a/src/pudl_archiver/metadata/sources.py
+++ b/src/pudl_archiver/metadata/sources.py
@@ -4,26 +4,35 @@
 
 from pudl.metadata.constants import CONTRIBUTORS, LICENSES
 
+# To add a new contributor, add an entry to the ADDL_CONTRIBUTORS dictionary below,
+# formatted like this:
+#     "name-shorthand": {
+#         "title": "Catalyst Cooperative",
+#         "email": "pudl@catalyst.coop",
+#         "path": "https://catalyst.coop",
+#         "role": "publisher",
+#         "zenodo_role": "distributor",
+#         "organization": "Catalyst Cooperative",
+#         "orcid": "0000-1234-5678-9101"
+#     }
+# Note that the only required fields are title (your name) and path
+# (e.g., a link to your Github account, your ORCID site or a personal webpage), but
+# filling other fields is strongly encouraged!
+ADDL_CONTRIBUTORS: dict[str, dict[str, str]] = {}
+
 NON_PUDL_SOURCES: dict[str, Any] = {
     "eiamecs": {
         "title": "EIA Manufacturing Energy Consumption Survey",
         "path": "https://www.eia.gov/consumption/manufacturing/data/2018/",
         "description": (
-            "EIA Form 846 A and B is more commonly known as the Manufacturing Energy",
-            "Consumption Survey (MECS). MECS is a national sample survey that collects",
-            "information on the stock of U.S. manufacturing establishment, their",
-            "energy-related building characteristics, and their energy consumption",
-            "and expenditures. MECS is conducted every four years.",
+            "EIA Form 846 A and B is more commonly known as the Manufacturing Energy "
+            "Consumption Survey (MECS). MECS is a national sample survey that collects "
+            "information on the stock of U.S. manufacturing establishment, their "
+            "energy-related building characteristics, and their energy consumption "
+            "and expenditures. MECS is conducted every four years."
         ),
         "working_partitions": {
-            1991,
-            1994,
-            1998,
-            2002,
-            2006,
-            2010,
-            2014,
-            2018,
+            "years": [1991, 1994, 1998, 2002, 2006, 2010, 2014, 2018]
         },  # Census DP1 is monolithic.
         "keywords": sorted(
             {

From d4f0acbcec2a958bafa0fc64fb5ffa0d7449a743 Mon Sep 17 00:00:00 2001
From: e-belfer <ella.belfer@catalyst.coop>
Date: Tue, 14 Jan 2025 18:03:34 -0500
Subject: [PATCH 3/4] Restore from_id() method

---
 src/pudl_archiver/depositors/zenodo/entities.py | 6 +-----
 src/pudl_archiver/frictionless.py               | 6 +-----
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/pudl_archiver/depositors/zenodo/entities.py b/src/pudl_archiver/depositors/zenodo/entities.py
index 2b04ee71..4b91a10d 100644
--- a/src/pudl_archiver/depositors/zenodo/entities.py
+++ b/src/pudl_archiver/depositors/zenodo/entities.py
@@ -116,11 +116,7 @@ def from_data_source(cls, data_source_id: str) -> "DepositionMetadata":
         """Construct deposition metadata object from PUDL DataSource model."""
         # Identify whether metadata originates from PUDL or archiver repo
         sources = SOURCES if data_source_id in SOURCES else NON_PUDL_SOURCES
-        # TODO: This is a hacky workaround to having to update from_id to take sources
-        # in PUDL - should I just fix this at the source to make it more legible?
-        data_source = DataSource(
-            **DataSource.dict_from_id(x=data_source_id, sources=sources)
-        )
+        data_source = DataSource.from_id(data_source_id, sources=sources)
         creators = [
             DepositionCreator.from_contributor(contributor)
             for contributor in data_source.contributors
diff --git a/src/pudl_archiver/frictionless.py b/src/pudl_archiver/frictionless.py
index c5912b6e..31554bef 100644
--- a/src/pudl_archiver/frictionless.py
+++ b/src/pudl_archiver/frictionless.py
@@ -190,11 +190,7 @@ def from_non_pudl_metadata(
         version: str | None,
     ):
         """Create a datapackage for sources that won't end up in PUDL."""
-        # TODO: This is a slightly ugly workaround to avoid having to add sources into
-        # the from_id method - should I just fix this at the source?
-        data_source = DataSource(
-            **DataSource.dict_from_id(x=name, sources=NON_PUDL_SOURCES)
-        )
+        data_source = DataSource.from_id(name, sources=NON_PUDL_SOURCES)
 
         return DataPackage(
             name=data_source.name,

From ef9180e34b0cd57b6fa2b379941ef6f002846947 Mon Sep 17 00:00:00 2001
From: e-belfer <ella.belfer@catalyst.coop>
Date: Tue, 14 Jan 2025 18:05:32 -0500
Subject: [PATCH 4/4] Make from_pudl_metadata more obvious by specifying
 sources

---
 src/pudl_archiver/frictionless.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pudl_archiver/frictionless.py b/src/pudl_archiver/frictionless.py
index 31554bef..6ea62b13 100644
--- a/src/pudl_archiver/frictionless.py
+++ b/src/pudl_archiver/frictionless.py
@@ -167,7 +167,7 @@ def from_pudl_metadata(
         version: str | None,
     ) -> "DataPackage":
         """Create a datapackage using PUDL metadata associated with ``name``."""
-        data_source = DataSource.from_id(name)
+        data_source = DataSource.from_id(name, sources=SOURCES)
 
         return DataPackage(
             name=f"pudl-raw-{data_source.name}",