From 93bdbe76d946045bb0c0f5c515b45fa6d724b1cf Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 10:24:14 -0500 Subject: [PATCH 01/19] Basic refactoring: * Create explicit "main" method * Extract logic to functions * Add detailed docstrings * Add some comments --- source_collectors/muckrock/muck_get.py | 94 +++++++++++++++----------- 1 file changed, 54 insertions(+), 40 deletions(-) diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index 20c29338..4c154f36 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -1,6 +1,6 @@ """ -muck_get.py - +A straightforward standalone script for downloading data from MuckRock +and searching for it with a specific search string. """ import requests @@ -9,53 +9,67 @@ # Define the base API endpoint base_url = "https://www.muckrock.com/api_v1/foia/" -# Define the search string -search_string = "use of force" -per_page = 100 -page = 1 -all_results = [] -max_count = 20 +def dump_list(all_results: list[dict], search_string: str) -> None: + """ + Dumps a list of dictionaries into a JSON file. + """ + json_out_file = search_string.replace(" ", "_") + ".json" + with open(json_out_file, "w") as json_file: + json.dump(all_results, json_file) -while True: + print(f"List dumped into {json_out_file}") - # Make the GET request with the search string as a query parameter - response = requests.get( - base_url, params={"page": page, "page_size": per_page, "format": "json"} - ) +def search_for_foia(search_string: str, per_page: int = 100, max_count: int = 20) -> list[dict]: + """ + Search for FOIA data based on a search string. + :param search_string: The search string to use. + :param per_page: The number of results to retrieve per page. + :param max_count: The maximum number of results to retrieve. Search stops once this number is reached or exceeded. + """ + page = 1 + all_results = [] - # Check if the request was successful - if response.status_code == 200: - # Parse the JSON response - data = response.json() + while True: - if not data["results"]: - break + # Make the GET request with the search string as a query parameter + response = requests.get( + base_url, params={"page": page, "page_size": per_page, "format": "json"} + ) - filtered_results = [ - item - for item in data["results"] - if search_string.lower() in item["title"].lower() - ] + # Check if the request was successful + if response.status_code == 200: + # Parse the JSON response + data = response.json() - all_results.extend(filtered_results) + if not data["results"]: + break - if len(filtered_results) > 0: - num_results = len(filtered_results) - print(f"found {num_results} more matching result(s)...") - if len(all_results) >= max_count: - print("max count reached... exiting") - break + # Filter results according to whether the search string is in the title + filtered_results = [ + item + for item in data["results"] + if search_string.lower() in item["title"].lower() + ] - page += 1 + all_results.extend(filtered_results) - else: - print(f"Error: {response.status_code}") - break + if len(filtered_results) > 0: + num_results = len(filtered_results) + print(f"found {num_results} more matching result(s)...") -# Dump list into a JSON file -json_out_file = search_string.replace(" ", "_") + ".json" -with open(json_out_file, "w") as json_file: - json.dump(all_results, json_file) + if len(all_results) >= max_count: + print(f"max count ({max_count}) reached... 
exiting") + break + + page += 1 + + else: + print(f"Error: {response.status_code}") + break + return all_results -print(f"List dumped into {json_out_file}") +if __name__ == "__main__": + search_string = "use of force" + all_results = search_for_foia(search_string) + dump_list(all_results, search_string) From 0fe043f542926973fea7e2cba75ef7b0ada274bb Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 10:36:54 -0500 Subject: [PATCH 02/19] Basic refactoring: * Create explicit "main" method * Extract logic to functions * Add detailed docstrings * Add some comments and TODOs --- .../muckrock/muckrock_ml_labeler.py | 115 +++++++++++------- 1 file changed, 71 insertions(+), 44 deletions(-) diff --git a/source_collectors/muckrock/muckrock_ml_labeler.py b/source_collectors/muckrock/muckrock_ml_labeler.py index b313c045..e3cb5cc7 100644 --- a/source_collectors/muckrock/muckrock_ml_labeler.py +++ b/source_collectors/muckrock/muckrock_ml_labeler.py @@ -1,6 +1,5 @@ """ -muckrock_ml_labeler.py - +Utilizes a fine-tuned model to label a dataset of URLs. """ from transformers import AutoTokenizer, AutoModelForSequenceClassification @@ -8,45 +7,73 @@ import pandas as pd import argparse -# Load the tokenizer and model -model_name = "PDAP/fine-url-classifier" -tokenizer = AutoTokenizer.from_pretrained(model_name) -model = AutoModelForSequenceClassification.from_pretrained(model_name) -model.eval() - -# Load the dataset from command line argument -parser = argparse.ArgumentParser(description="Load CSV file into a pandas DataFrame.") -parser.add_argument("--csv_file", type=str, required=True, help="Path to the CSV file") -args = parser.parse_args() -df = pd.read_csv(args.csv_file) - -# Combine multiple columns (e.g., 'url', 'html_title', 'h1') into a single text field for each row -columns_to_combine = [ - "url_path", - "html_title", - "h1", -] # Add other columns here as needed -df["combined_text"] = df[columns_to_combine].apply( - lambda row: " ".join(row.values.astype(str)), axis=1 -) - -# Convert the combined text into a list -texts = df["combined_text"].tolist() - -# Tokenize the inputs -inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") - -# Perform inference -with torch.no_grad(): - outputs = model(**inputs) - -# Get the predicted labels -predictions = torch.argmax(outputs.logits, dim=-1) - -# Map predictions to labels -labels = model.config.id2label -predicted_labels = [labels[int(pred)] for pred in predictions] - -# Add the predicted labels to the dataframe and save -df["predicted_label"] = predicted_labels -df.to_csv("labeled_muckrock_dataset.csv", index=False) + +def load_dataset_from_command_line() -> pd.DataFrame: + parser = argparse.ArgumentParser(description="Load CSV file into a pandas DataFrame.") + parser.add_argument("--csv_file", type=str, required=True, help="Path to the CSV file") + args = parser.parse_args() + return pd.read_csv(args.csv_file) + + +def create_combined_text_column(df: pd.DataFrame) -> None: + # Combine multiple columns (e.g., 'url', 'html_title', 'h1') into a single text field for each row + columns_to_combine = [ + "url_path", + "html_title", + "h1", + ] # Add other columns here as needed + df["combined_text"] = df[columns_to_combine].apply( + lambda row: " ".join(row.values.astype(str)), axis=1 + ) + + +def get_list_of_combined_texts(df: pd.DataFrame) -> list[str]: + # Convert the combined text into a list + return df["combined_text"].tolist() + + +def save_labeled_muckrock_dataset_to_csv(): + 
df.to_csv("labeled_muckrock_dataset.csv", index=False) + + +def create_predicted_labels_column(df: pd.DataFrame, predicted_labels: list[str]) -> None: + df["predicted_label"] = predicted_labels + + +def map_predictions_to_labels(model, predictions) -> list[str]: + labels = model.config.id2label + return [labels[int(pred)] for pred in predictions] + + +def get_predicted_labels(texts: list[str]) -> list[str]: + # Load the tokenizer and model + model_name = "PDAP/fine-url-classifier" + tokenizer = AutoTokenizer.from_pretrained(model_name) + + model = AutoModelForSequenceClassification.from_pretrained(model_name) + model.eval() + # Tokenize the inputs + inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") + # Perform inference + with torch.no_grad(): + outputs = model(**inputs) + # Get the predicted labels + predictions = torch.argmax(outputs.logits, dim=-1) + # Map predictions to labels + predicted_labels = map_predictions_to_labels(model=model, predictions=predictions) + + return predicted_labels + + +if __name__ == "__main__": + df = load_dataset_from_command_line() + # TODO: Check for existence of required columns prior to further processing + create_combined_text_column(df=df) + + texts = get_list_of_combined_texts(df=df) + + predicted_labels = get_predicted_labels(texts=texts) + # Add the predicted labels to the dataframe and save + create_predicted_labels_column(df=df, predicted_labels=predicted_labels) + + save_labeled_muckrock_dataset_to_csv() \ No newline at end of file From 4f5cc516e29307353f1b250a6aab00d3ce0b7603 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 10:48:21 -0500 Subject: [PATCH 03/19] Basic refactoring: * Add detailed docstrings * Add some comments and TODOs --- source_collectors/muckrock/get_allegheny_foias.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index a559f67f..bf62ba33 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -1,5 +1,6 @@ """ -get_allegheny_foias.py +Get Allegheny County FOIA requests +and save them to a JSON file """ import requests @@ -47,9 +48,12 @@ def fetch_foia_data(jurisdiction_ids): """ all_data = [] for name, id_ in jurisdiction_ids.items(): + # TODO: The muckrock api should be centralized in a `constants.py` folder + # and the url should be constructed in a function or class url = f"https://www.muckrock.com/api_v1/foia/?status=done&jurisdiction={id_}" while url: response = requests.get(url) + # TODO: If logic similar to `fetch_jurisdiction_ids` and should be generalized if response.status_code == 200: data = response.json() all_data.extend(data.get("results", [])) @@ -66,6 +70,7 @@ def fetch_foia_data(jurisdiction_ids): break # Save the combined data to a JSON file + # TODO: Generalize this logic with similar logic in `muck_get.py` to function with open("foia_data_combined.json", "w") as json_file: json.dump(all_data, json_file, indent=4) From 9294fb05e6a2b9764f8abb318ffd066f420cd884 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 10:57:54 -0500 Subject: [PATCH 04/19] Basic refactoring: * Create explicit main function and `__main__` section * Add detailed docstrings * Add some comments and TODOs --- .../generate_detailed_muckrock_csv.py | 221 ++++++++++-------- 1 file changed, 118 insertions(+), 103 deletions(-) diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py 
b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index a077dbc7..2fac3bcd 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -1,3 +1,9 @@ +""" +Converts JSON file of MuckRock FOIA requests to CSV for further processing +""" + +# TODO: Look into linking up this logic with other components in pipeline. + import json import argparse import csv @@ -5,17 +11,6 @@ import time from utils import format_filename_json_to_csv -# Load the JSON data -parser = argparse.ArgumentParser(description="Parse JSON from a file.") -parser.add_argument( - "--json_file", type=str, required=True, help="Path to the JSON file" -) - -args = parser.parse_args() - -with open(args.json_file, "r") as f: - json_data = json.load(f) - # Define the CSV headers headers = [ "name", @@ -54,7 +49,7 @@ def get_agency(agency_id): """ - Function to get agency_described + Get agency data from the MuckRock API via agency ID """ if agency_id: agency_url = f"https://www.muckrock.com/api_v1/agency/{agency_id}/" @@ -71,7 +66,7 @@ def get_agency(agency_id): def get_jurisdiction(jurisdiction_id): """ - Function to get jurisdiction_described + Get jurisdiction data from the MuckRock API via jurisdiction ID """ if jurisdiction_id: jurisdiction_url = ( @@ -87,96 +82,116 @@ def get_jurisdiction(jurisdiction_id): else: print("Jurisdiction ID not found in item") +def main(): + # Load the JSON data + parser = argparse.ArgumentParser(description="Parse JSON from a file.") + parser.add_argument( + "--json_file", type=str, required=True, help="Path to the JSON file" + ) + + args = parser.parse_args() + + # TODO: Generalize logic + with open(args.json_file, "r") as f: + json_data = json.load(f) + + output_csv = format_filename_json_to_csv(args.json_file) + # Open a CSV file for writing + + # TODO: CSV writing and composition logic is tightly coupled -- separate + with open(output_csv, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=headers) -output_csv = format_filename_json_to_csv(args.json_file) -# Open a CSV file for writing -with open(output_csv, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=headers) - - # Write the header row - writer.writeheader() - - # Iterate through the JSON data - for item in json_data: - print(f"Writing data for {item.get('title')}") - agency_data = get_agency(item.get("agency")) - time.sleep(1) - jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) - - jurisdiction_level = jurisdiction_data.get("level") - # federal jurisduction level - if jurisdiction_level == "f": - state = "" - county = "" - municipality = "" - juris_type = "federal" - # state jurisdiction level - if jurisdiction_level == "s": - state = jurisdiction_data.get("name") - county = "" - municipality = "" - juris_type = "state" - # local jurisdiction level - if jurisdiction_level == "l": - parent_juris_data = get_jurisdiction(jurisdiction_data.get("parent")) - state = parent_juris_data.get("abbrev") - if "County" in jurisdiction_data.get("name"): - county = jurisdiction_data.get("name") + # Write the header row + writer.writeheader() + + # Iterate through the JSON data + for item in json_data: + print(f"Writing data for {item.get('title')}") + agency_data = get_agency(item.get("agency")) + time.sleep(1) + jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) + + jurisdiction_level = jurisdiction_data.get("level") + # federal jurisduction level + if jurisdiction_level 
== "f": + state = "" + county = "" municipality = "" - juris_type = "county" - else: + juris_type = "federal" + # state jurisdiction level + if jurisdiction_level == "s": + state = jurisdiction_data.get("name") county = "" - municipality = jurisdiction_data.get("name") - juris_type = "local" - - if "Police" in agency_data.get("types"): - agency_type = "law enforcement/police" - else: - agency_type = "" - - source_url = "" - absolute_url = item.get("absolute_url") - access_type = "" - for comm in item["communications"]: - if comm["files"]: - source_url = absolute_url + "#files" - access_type = "Web page,Download,API" - break - - # Extract the relevant fields from the JSON object - csv_row = { - "name": item.get("title", ""), - "agency_described": agency_data.get("name", "") + " - " + state, - "record_type": "", - "description": "", - "source_url": source_url, - "readme_url": absolute_url, - "scraper_url": "", - "state": state, - "county": county, - "municipality": municipality, - "agency_type": agency_type, - "jurisdiction_type": juris_type, - "View Archive": "", - "agency_aggregation": "", - "agency_supplied": "no", - "supplying_entity": "MuckRock", - "agency_originated": "yes", - "originating_agency": agency_data.get("name", ""), - "coverage_start": "", - "source_last_updated": "", - "coverage_end": "", - "number_of_records_available": "", - "size": "", - "access_type": access_type, - "data_portal_type": "MuckRock", - "access_notes": "", - "record_format": "", - "update_frequency": "", - "update_method": "", - "retention_schedule": "", - "detail_level": "", - } - - # Write the extracted row to the CSV file - writer.writerow(csv_row) + municipality = "" + juris_type = "state" + # local jurisdiction level + if jurisdiction_level == "l": + parent_juris_data = get_jurisdiction(jurisdiction_data.get("parent")) + state = parent_juris_data.get("abbrev") + if "County" in jurisdiction_data.get("name"): + county = jurisdiction_data.get("name") + municipality = "" + juris_type = "county" + else: + county = "" + municipality = jurisdiction_data.get("name") + juris_type = "local" + + if "Police" in agency_data.get("types"): + agency_type = "law enforcement/police" + else: + agency_type = "" + + source_url = "" + absolute_url = item.get("absolute_url") + access_type = "" + for comm in item["communications"]: + if comm["files"]: + source_url = absolute_url + "#files" + access_type = "Web page,Download,API" + break + + # Extract the relevant fields from the JSON object + # TODO: I question the utility of creating columns that are then left blank until later + # and possibly in a different file entirely. 
+ csv_row = { + "name": item.get("title", ""), + "agency_described": agency_data.get("name", "") + " - " + state, + "record_type": "", + "description": "", + "source_url": source_url, + "readme_url": absolute_url, + "scraper_url": "", + "state": state, + "county": county, + "municipality": municipality, + "agency_type": agency_type, + "jurisdiction_type": juris_type, + "View Archive": "", + "agency_aggregation": "", + "agency_supplied": "no", + "supplying_entity": "MuckRock", + "agency_originated": "yes", + "originating_agency": agency_data.get("name", ""), + "coverage_start": "", + "source_last_updated": "", + "coverage_end": "", + "number_of_records_available": "", + "size": "", + "access_type": access_type, + "data_portal_type": "MuckRock", + "access_notes": "", + "record_format": "", + "update_frequency": "", + "update_method": "", + "retention_schedule": "", + "detail_level": "", + } + + # Write the extracted row to the CSV file + writer.writerow(csv_row) + + +if __name__ == "__main__": + main() \ No newline at end of file From dd3ee80b47c7ec17a79b0d855d35547514b9b5c9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 11:14:52 -0500 Subject: [PATCH 05/19] Refactor: Add FOIAFetcher * Extract logic from `muck_get.py` and `download_muckrock_foia.py` * Create constants for base muckrock api url and foia extension of base url --- source_collectors/muckrock/FOIAFetcher.py | 34 +++++++++++++ source_collectors/muckrock/constants.py | 3 ++ .../muckrock/download_muckrock_foia.py | 32 +++--------- source_collectors/muckrock/muck_get.py | 49 ++++++++----------- 4 files changed, 66 insertions(+), 52 deletions(-) create mode 100644 source_collectors/muckrock/FOIAFetcher.py create mode 100644 source_collectors/muckrock/constants.py diff --git a/source_collectors/muckrock/FOIAFetcher.py b/source_collectors/muckrock/FOIAFetcher.py new file mode 100644 index 00000000..566df2cf --- /dev/null +++ b/source_collectors/muckrock/FOIAFetcher.py @@ -0,0 +1,34 @@ +import requests + +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL + +FOIA_BASE_URL = f"{BASE_MUCKROCK_URL}/foia" + +class FOIAFetcher: + + def __init__(self, start_page: int = 1, per_page: int = 100): + """ + Constructor for the FOIAFetcher class. + + Args: + start_page (int): The page number to start fetching from (default is 1). + per_page (int): The number of results to fetch per page (default is 100). + """ + self.current_page = start_page + self.per_page = per_page + + def fetch_next_page(self) -> dict | None: + """ + Fetches data from a specific page of the MuckRock FOIA API. 
+ """ + page = self.current_page + self.current_page += 1 + response = requests.get( + FOIA_BASE_URL, params={"page": page, "page_size": self.per_page, "format": "json"} + ) + if response.status_code == 200: + return response.json() + # TODO: Look into raising error instead of returning None + print(f"Error fetching page {page}: {response.status_code}") + return None + diff --git a/source_collectors/muckrock/constants.py b/source_collectors/muckrock/constants.py new file mode 100644 index 00000000..7109847f --- /dev/null +++ b/source_collectors/muckrock/constants.py @@ -0,0 +1,3 @@ + + +BASE_MUCKROCK_URL = "https://www.muckrock.com/api_v1" \ No newline at end of file diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py index 0abd527d..1e73c65a 100644 --- a/source_collectors/muckrock/download_muckrock_foia.py +++ b/source_collectors/muckrock/download_muckrock_foia.py @@ -7,49 +7,33 @@ """ + +# TODO: Logic redundant with `muck_get.py`. Generalize + import requests import csv import time import json -# Define the base API endpoint -base_url = "https://www.muckrock.com/api_v1/foia/" +from source_collectors.muckrock.FOIAFetcher import FOIAFetcher # Set initial parameters -page = 1 -per_page = 100 all_data = [] output_file = "foia_data.json" - -def fetch_page(page): - """ - Fetches data from a specific page of the MuckRock FOIA API. - """ - response = requests.get( - base_url, params={"page": page, "page_size": per_page, "format": "json"} - ) - if response.status_code == 200: - return response.json() - else: - print(f"Error fetching page {page}: {response.status_code}") - return None - - # Fetch and store data from all pages +fetcher = FOIAFetcher() while True: - print(f"Fetching page {page}...") - data = fetch_page(page) + print(f"Fetching page {fetcher.current_page}...") + data = fetcher.fetch_next_page() if data is None: - print(f"Skipping page {page}...") - page += 1 + print(f"Skipping page {fetcher.current_page}...") continue all_data.extend(data["results"]) if not data["next"]: break - page += 1 # Write data to CSV with open(output_file, mode="w", encoding="utf-8") as json_file: diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index 4c154f36..f9fc218b 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -6,6 +6,8 @@ import requests import json +from source_collectors.muckrock.FOIAFetcher import FOIAFetcher + # Define the base API endpoint base_url = "https://www.muckrock.com/api_v1/foia/" @@ -26,47 +28,38 @@ def search_for_foia(search_string: str, per_page: int = 100, max_count: int = 20 :param per_page: The number of results to retrieve per page. :param max_count: The maximum number of results to retrieve. Search stops once this number is reached or exceeded. 
""" - page = 1 + fetcher = FOIAFetcher(per_page=per_page) all_results = [] while True: - # Make the GET request with the search string as a query parameter - response = requests.get( - base_url, params={"page": page, "page_size": per_page, "format": "json"} - ) + data = fetcher.fetch_next_page() - # Check if the request was successful - if response.status_code == 200: - # Parse the JSON response - data = response.json() + if data is None: + break - if not data["results"]: - break + if not data["results"]: + break - # Filter results according to whether the search string is in the title - filtered_results = [ - item - for item in data["results"] - if search_string.lower() in item["title"].lower() - ] + # Filter results according to whether the search string is in the title + filtered_results = [ + item + for item in data["results"] + if search_string.lower() in item["title"].lower() + ] - all_results.extend(filtered_results) + all_results.extend(filtered_results) - if len(filtered_results) > 0: - num_results = len(filtered_results) - print(f"found {num_results} more matching result(s)...") + num_results = len(filtered_results) + if num_results > 0: + print(f"found {num_results} more matching result(s)...") - if len(all_results) >= max_count: - print(f"max count ({max_count}) reached... exiting") - break + if len(all_results) >= max_count: + print(f"max count ({max_count}) reached... exiting") + break - page += 1 - else: - print(f"Error: {response.status_code}") - break return all_results if __name__ == "__main__": From 435b090ae7f93565726638160a4043ea859de2fb Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 11:33:48 -0500 Subject: [PATCH 06/19] Refactor: Add utility functions * Extract logic for loading from and saving to json files to separate functions * Add TODOs --- source_collectors/muckrock/create_foia_data_db.py | 1 + source_collectors/muckrock/download_muckrock_foia.py | 4 ++-- .../muckrock/generate_detailed_muckrock_csv.py | 6 ++---- source_collectors/muckrock/get_allegheny_foias.py | 7 +++---- source_collectors/muckrock/muck_get.py | 5 ++--- source_collectors/muckrock/search_local_foia_json.py | 8 ++++---- source_collectors/muckrock/utils.py | 10 ++++++++++ 7 files changed, 24 insertions(+), 17 deletions(-) diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py index 4adc5556..85c7fd4b 100644 --- a/source_collectors/muckrock/create_foia_data_db.py +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -112,6 +112,7 @@ def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: - None: If there is an error other than 404. 
""" + # TODO: Refactor to use FOIA Fetcher per_page = 100 response = requests.get( base_url, params={"page": page, "page_size": per_page, "format": "json"} diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py index 1e73c65a..4018053e 100644 --- a/source_collectors/muckrock/download_muckrock_foia.py +++ b/source_collectors/muckrock/download_muckrock_foia.py @@ -16,6 +16,7 @@ import json from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.utils import save_json_file # Set initial parameters all_data = [] @@ -36,7 +37,6 @@ # Write data to CSV -with open(output_file, mode="w", encoding="utf-8") as json_file: - json.dump(all_data, json_file, indent=4) +save_json_file(file_path=output_file, data=all_data) print(f"Data written to {output_file}") diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index 2fac3bcd..d17b7415 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -4,12 +4,11 @@ # TODO: Look into linking up this logic with other components in pipeline. -import json import argparse import csv import requests import time -from utils import format_filename_json_to_csv +from utils import format_filename_json_to_csv, load_json_file # Define the CSV headers headers = [ @@ -92,8 +91,7 @@ def main(): args = parser.parse_args() # TODO: Generalize logic - with open(args.json_file, "r") as f: - json_data = json.load(f) + json_data = load_json_file(args.json_file) output_csv = format_filename_json_to_csv(args.json_file) # Open a CSV file for writing diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index bf62ba33..3aac1d6f 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -7,6 +7,8 @@ import json import time +from source_collectors.muckrock.utils import save_json_file + def fetch_jurisdiction_ids(town_file, base_url): """ @@ -70,10 +72,7 @@ def fetch_foia_data(jurisdiction_ids): break # Save the combined data to a JSON file - # TODO: Generalize this logic with similar logic in `muck_get.py` to function - with open("foia_data_combined.json", "w") as json_file: - json.dump(all_data, json_file, indent=4) - + save_json_file(file_path="foia_data_combined.json", data=all_data) print(f"Saved {len(all_data)} records to foia_data_combined.json") diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index f9fc218b..cbd6e407 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -7,6 +7,7 @@ import json from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.utils import save_json_file # Define the base API endpoint base_url = "https://www.muckrock.com/api_v1/foia/" @@ -16,9 +17,7 @@ def dump_list(all_results: list[dict], search_string: str) -> None: Dumps a list of dictionaries into a JSON file. 
""" json_out_file = search_string.replace(" ", "_") + ".json" - with open(json_out_file, "w") as json_file: - json.dump(all_results, json_file) - + save_json_file(file_path=json_out_file, data=all_results) print(f"List dumped into {json_out_file}") def search_for_foia(search_string: str, per_page: int = 100, max_count: int = 20) -> list[dict]: diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py index 562c4bae..3010f42d 100644 --- a/source_collectors/muckrock/search_local_foia_json.py +++ b/source_collectors/muckrock/search_local_foia_json.py @@ -7,13 +7,14 @@ import json +from source_collectors.muckrock.utils import load_json_file, save_json_file + # Specify the JSON file path json_file = "foia_data.json" search_string = "use of force" # Load the JSON data -with open(json_file, "r", encoding="utf-8") as file: - data = json.load(file) +data = load_json_file(json_file) # List to store matching entries matching_entries = [] @@ -47,7 +48,6 @@ def search_entry(entry): ) # Optionally, write matching entries to a new JSON file -with open("matching_entries.json", "w", encoding="utf-8") as file: - json.dump(matching_entries, file, indent=4) +save_json_file(file_path="matching_entries.json", data=matching_entries) print("Matching entries written to 'matching_entries.json'") diff --git a/source_collectors/muckrock/utils.py b/source_collectors/muckrock/utils.py index 3d8b63db..3c7eba28 100644 --- a/source_collectors/muckrock/utils.py +++ b/source_collectors/muckrock/utils.py @@ -8,6 +8,7 @@ """ import re +import json def format_filename_json_to_csv(json_filename: str) -> str: @@ -24,3 +25,12 @@ def format_filename_json_to_csv(json_filename: str) -> str: csv_filename = re.sub(r"_(?=[^.]*$)", "-", json_filename[:-5]) + ".csv" return csv_filename + +def load_json_file(file_path: str) -> dict: + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + return data + +def save_json_file(file_path: str, data: dict | list[dict]): + with open(file_path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=4) \ No newline at end of file From 7dd7d0ccea3d619e55b431a9d9ccac47df126e6a Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 18:37:46 -0500 Subject: [PATCH 07/19] Refactor: Create FOIASearcher * Extract `muck_get.py` logic to FOIA searcher * Remove deprecated `download_muckrock_foia.py` --- source_collectors/muckrock/FOIASearcher.py | 58 ++++++++++++++++ source_collectors/muckrock/README.md | 2 +- .../muckrock/download_muckrock_foia.py | 42 ------------ source_collectors/muckrock/muck_get.py | 67 +++---------------- 4 files changed, 67 insertions(+), 102 deletions(-) create mode 100644 source_collectors/muckrock/FOIASearcher.py delete mode 100644 source_collectors/muckrock/download_muckrock_foia.py diff --git a/source_collectors/muckrock/FOIASearcher.py b/source_collectors/muckrock/FOIASearcher.py new file mode 100644 index 00000000..d42a6439 --- /dev/null +++ b/source_collectors/muckrock/FOIASearcher.py @@ -0,0 +1,58 @@ +from typing import Optional + +from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from tqdm import tqdm + +class FOIASearcher: + """ + Used for searching FOIA data from MuckRock + """ + + def __init__(self, fetcher: FOIAFetcher, search_term: Optional[str] = None): + self.fetcher = fetcher + self.search_term = search_term + + def fetch_page(self) -> dict | None: + """ + Fetches the next page of results using the fetcher. 
+ """ + data = self.fetcher.fetch_next_page() + if data is None or data.get("results") is None: + return None + return data + + def filter_results(self, results: list[dict]) -> list[dict]: + """ + Filters the results based on the search term. + Override or modify as needed for custom filtering logic. + """ + if self.search_term: + return [result for result in results if self.search_term.lower() in result["title"].lower()] + return results + + def update_progress(self, pbar: tqdm, results: list[dict]) -> int: + """ + Updates the progress bar and returns the count of results processed. + """ + num_results = len(results) + pbar.update(num_results) + return num_results + + def search_to_count(self, max_count: int) -> list[dict]: + """ + Fetches and processes results up to a maximum count. + """ + count = max_count + all_results = [] + with tqdm(total=max_count, desc="Fetching results", unit="result") as pbar: + while count > 0: + data = self.fetch_page() + if not data: + break + + results = self.filter_results(data["results"]) + all_results.extend(results) + count -= self.update_progress(pbar, results) + + return all_results + diff --git a/source_collectors/muckrock/README.md b/source_collectors/muckrock/README.md index d74b77f0..d24b0cef 100644 --- a/source_collectors/muckrock/README.md +++ b/source_collectors/muckrock/README.md @@ -56,7 +56,7 @@ pip install -r requirements.txt ### 2. Clone Muckrock database & search locally -~~- `download_muckrock_foia.py` `search_local_foia_json.py`~~ (deprecated) +~~- `search_local_foia_json.py`~~ (deprecated) - scripts to clone the MuckRock foia requests collection for fast local querying (total size <2GB at present) diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py deleted file mode 100644 index 4018053e..00000000 --- a/source_collectors/muckrock/download_muckrock_foia.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -***DEPRECATED*** - -download_muckrock_foia.py - -This script fetches data from the MuckRock FOIA API and stores the results in a JSON file. - -""" - - -# TODO: Logic redundant with `muck_get.py`. Generalize - -import requests -import csv -import time -import json - -from source_collectors.muckrock.FOIAFetcher import FOIAFetcher -from source_collectors.muckrock.utils import save_json_file - -# Set initial parameters -all_data = [] -output_file = "foia_data.json" - -# Fetch and store data from all pages -fetcher = FOIAFetcher() -while True: - print(f"Fetching page {fetcher.current_page}...") - data = fetcher.fetch_next_page() - if data is None: - print(f"Skipping page {fetcher.current_page}...") - continue - - all_data.extend(data["results"]) - if not data["next"]: - break - - -# Write data to CSV -save_json_file(file_path=output_file, data=all_data) - -print(f"Data written to {output_file}") diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index cbd6e407..1401cd93 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -2,66 +2,15 @@ A straightforward standalone script for downloading data from MuckRock and searching for it with a specific search string. 
""" - -import requests -import json - from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.FOIASearcher import FOIASearcher from source_collectors.muckrock.utils import save_json_file -# Define the base API endpoint -base_url = "https://www.muckrock.com/api_v1/foia/" - -def dump_list(all_results: list[dict], search_string: str) -> None: - """ - Dumps a list of dictionaries into a JSON file. - """ - json_out_file = search_string.replace(" ", "_") + ".json" - save_json_file(file_path=json_out_file, data=all_results) - print(f"List dumped into {json_out_file}") - -def search_for_foia(search_string: str, per_page: int = 100, max_count: int = 20) -> list[dict]: - """ - Search for FOIA data based on a search string. - :param search_string: The search string to use. - :param per_page: The number of results to retrieve per page. - :param max_count: The maximum number of results to retrieve. Search stops once this number is reached or exceeded. - """ - fetcher = FOIAFetcher(per_page=per_page) - all_results = [] - - while True: - - data = fetcher.fetch_next_page() - - if data is None: - break - - if not data["results"]: - break - - - # Filter results according to whether the search string is in the title - filtered_results = [ - item - for item in data["results"] - if search_string.lower() in item["title"].lower() - ] - - all_results.extend(filtered_results) - - num_results = len(filtered_results) - if num_results > 0: - print(f"found {num_results} more matching result(s)...") - - if len(all_results) >= max_count: - print(f"max count ({max_count}) reached... exiting") - break - - - return all_results - if __name__ == "__main__": - search_string = "use of force" - all_results = search_for_foia(search_string) - dump_list(all_results, search_string) + search_term = "use of force" + fetcher = FOIAFetcher() + searcher = FOIASearcher(fetcher=fetcher, search_term=search_term) + results = searcher.search_to_count(20) + json_out_file = search_term.replace(" ", "_") + ".json" + save_json_file(file_path=json_out_file, data=results) + print(f"List dumped into {json_out_file}") From dd3f0a290a7bb7b27acb1545f808d6808b62ccd1 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 18:46:19 -0500 Subject: [PATCH 08/19] Remove `search_local_foia_json.py` --- source_collectors/muckrock/README.md | 2 - .../muckrock/search_local_foia_json.py | 53 ------------------- 2 files changed, 55 deletions(-) delete mode 100644 source_collectors/muckrock/search_local_foia_json.py diff --git a/source_collectors/muckrock/README.md b/source_collectors/muckrock/README.md index d24b0cef..43bae80d 100644 --- a/source_collectors/muckrock/README.md +++ b/source_collectors/muckrock/README.md @@ -56,8 +56,6 @@ pip install -r requirements.txt ### 2. Clone Muckrock database & search locally -~~- `search_local_foia_json.py`~~ (deprecated) - - scripts to clone the MuckRock foia requests collection for fast local querying (total size <2GB at present) - `create_foia_data_db.py` creates and populates a SQLite database (`foia_data.db`) with all MuckRock foia requests. Various errors outside the scope of this script may occur; a counter (`last_page_fetched.txt`) is created to keep track of the most recent page fetched and inserted into the database. If the program exits prematurely, simply run `create_foia_data_db.py` again to continue where you left off. A log file is created to capture errors for later reference. 
diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py deleted file mode 100644 index 3010f42d..00000000 --- a/source_collectors/muckrock/search_local_foia_json.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -***DEPRECATED*** - -search_local_foia_json.py - -""" - -import json - -from source_collectors.muckrock.utils import load_json_file, save_json_file - -# Specify the JSON file path -json_file = "foia_data.json" -search_string = "use of force" - -# Load the JSON data -data = load_json_file(json_file) - -# List to store matching entries -matching_entries = [] - - -def search_entry(entry): - """ - search within an entry - """ - # Check if 'status' is 'done' - if entry.get("status") != "done": - return False - - # Check if 'title' or 'tags' field contains the search string - title_match = "title" in entry and search_string.lower() in entry["title"].lower() - tags_match = "tags" in entry and any( - search_string.lower() in tag.lower() for tag in entry["tags"] - ) - - return title_match or tags_match - - -# Iterate through the data and collect matching entries -for entry in data: - if search_entry(entry): - matching_entries.append(entry) - -# Output the results -print( - f"Found {len(matching_entries)} entries containing '{search_string}' in the title or tags." -) - -# Optionally, write matching entries to a new JSON file -save_json_file(file_path="matching_entries.json", data=matching_entries) - -print("Matching entries written to 'matching_entries.json'") From 01d5f6bb343551c1edc8a7d613bdae700b17e237 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 19:16:01 -0500 Subject: [PATCH 09/19] Refactor: Create MuckrockFetchers * Create MuckrockFetcher base class * Implement in FOIAFetcher * Create JurisdictionFetcher and AgencyFetcher * Replace relevant logic in `generate_detailed_muckrock_csv.py` --- source_collectors/__init__.py | 0 source_collectors/muckrock/FOIASearcher.py | 2 +- source_collectors/muckrock/__init__.py | 0 .../generate_detailed_muckrock_csv.py | 56 ++++++------------- source_collectors/muckrock/muck_get.py | 2 +- .../muckrock_fetchers/AgencyFetcher.py | 14 +++++ .../{ => muckrock_fetchers}/FOIAFetcher.py | 24 ++++---- .../muckrock_fetchers/JurisdictionFetcher.py | 14 +++++ .../muckrock_fetchers/MuckrockFetcher.py | 28 ++++++++++ .../muckrock/muckrock_fetchers/__init__.py | 0 10 files changed, 88 insertions(+), 52 deletions(-) create mode 100644 source_collectors/__init__.py create mode 100644 source_collectors/muckrock/__init__.py create mode 100644 source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py rename source_collectors/muckrock/{ => muckrock_fetchers}/FOIAFetcher.py (60%) create mode 100644 source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py create mode 100644 source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py create mode 100644 source_collectors/muckrock/muckrock_fetchers/__init__.py diff --git a/source_collectors/__init__.py b/source_collectors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/source_collectors/muckrock/FOIASearcher.py b/source_collectors/muckrock/FOIASearcher.py index d42a6439..9d6116b7 100644 --- a/source_collectors/muckrock/FOIASearcher.py +++ b/source_collectors/muckrock/FOIASearcher.py @@ -1,6 +1,6 @@ from typing import Optional -from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher from tqdm import tqdm class FOIASearcher: diff 
--git a/source_collectors/muckrock/__init__.py b/source_collectors/muckrock/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index d17b7415..207d2118 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -6,8 +6,11 @@ import argparse import csv -import requests import time + +from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher, AgencyFetchRequest +from source_collectors.muckrock.muckrock_fetchers.JurisdictionFetcher import JurisdictionFetcher, \ + JurisdictionFetchRequest from utils import format_filename_json_to_csv, load_json_file # Define the CSV headers @@ -46,41 +49,6 @@ ] -def get_agency(agency_id): - """ - Get agency data from the MuckRock API via agency ID - """ - if agency_id: - agency_url = f"https://www.muckrock.com/api_v1/agency/{agency_id}/" - response = requests.get(agency_url) - - if response.status_code == 200: - agency_data = response.json() - return agency_data - else: - return "" - else: - print("Agency ID not found in item") - - -def get_jurisdiction(jurisdiction_id): - """ - Get jurisdiction data from the MuckRock API via jurisdiction ID - """ - if jurisdiction_id: - jurisdiction_url = ( - f"https://www.muckrock.com/api_v1/jurisdiction/{jurisdiction_id}/" - ) - response = requests.get(jurisdiction_url) - - if response.status_code == 200: - jurisdiction_data = response.json() - return jurisdiction_data - else: - return "" - else: - print("Jurisdiction ID not found in item") - def main(): # Load the JSON data parser = argparse.ArgumentParser(description="Parse JSON from a file.") @@ -103,12 +71,19 @@ def main(): # Write the header row writer.writeheader() + a_fetcher = AgencyFetcher() + j_fetcher = JurisdictionFetcher() + # Iterate through the JSON data for item in json_data: print(f"Writing data for {item.get('title')}") - agency_data = get_agency(item.get("agency")) + agency_data = a_fetcher.get_agency(agency_id=item.get("agency")) + # agency_data = get_agency(item.get("agency")) time.sleep(1) - jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) + jurisdiction_data = j_fetcher.get_jurisdiction( + jurisdiction_id=agency_data.get("jurisdiction") + ) + # jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) jurisdiction_level = jurisdiction_data.get("level") # federal jurisduction level @@ -125,7 +100,10 @@ def main(): juris_type = "state" # local jurisdiction level if jurisdiction_level == "l": - parent_juris_data = get_jurisdiction(jurisdiction_data.get("parent")) + parent_juris_data = j_fetcher.get_jurisdiction( + jurisdiction_id=jurisdiction_data.get("parent") + ) + state = parent_juris_data.get("abbrev") if "County" in jurisdiction_data.get("name"): county = jurisdiction_data.get("name") diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index 1401cd93..b1a51022 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -2,7 +2,7 @@ A straightforward standalone script for downloading data from MuckRock and searching for it with a specific search string. 
""" -from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher from source_collectors.muckrock.FOIASearcher import FOIASearcher from source_collectors.muckrock.utils import save_json_file diff --git a/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py b/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py new file mode 100644 index 00000000..2e36ce31 --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py @@ -0,0 +1,14 @@ +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher + + +class AgencyFetchRequest(FetchRequest): + agency_id: int + +class AgencyFetcher(MuckrockFetcher): + + def build_url(self, request: AgencyFetchRequest) -> str: + return f"{BASE_MUCKROCK_URL}/agency/{request.agency_id}/" + + def get_agency(self, agency_id: int): + return self.fetch(AgencyFetchRequest(agency_id=agency_id)) \ No newline at end of file diff --git a/source_collectors/muckrock/FOIAFetcher.py b/source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py similarity index 60% rename from source_collectors/muckrock/FOIAFetcher.py rename to source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py index 566df2cf..5b780a99 100644 --- a/source_collectors/muckrock/FOIAFetcher.py +++ b/source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py @@ -1,10 +1,15 @@ -import requests - +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import MuckrockFetcher, FetchRequest from source_collectors.muckrock.constants import BASE_MUCKROCK_URL FOIA_BASE_URL = f"{BASE_MUCKROCK_URL}/foia" -class FOIAFetcher: + +class FOIAFetchRequest(FetchRequest): + page: int + page_size: int + + +class FOIAFetcher(MuckrockFetcher): def __init__(self, start_page: int = 1, per_page: int = 100): """ @@ -17,18 +22,15 @@ def __init__(self, start_page: int = 1, per_page: int = 100): self.current_page = start_page self.per_page = per_page + def build_url(self, request: FOIAFetchRequest) -> str: + return f"{FOIA_BASE_URL}?page={request.page}&page_size={request.page_size}&format=json" + def fetch_next_page(self) -> dict | None: """ Fetches data from a specific page of the MuckRock FOIA API. 
""" page = self.current_page self.current_page += 1 - response = requests.get( - FOIA_BASE_URL, params={"page": page, "page_size": self.per_page, "format": "json"} - ) - if response.status_code == 200: - return response.json() - # TODO: Look into raising error instead of returning None - print(f"Error fetching page {page}: {response.status_code}") - return None + request = FOIAFetchRequest(page=page, page_size=self.per_page) + return self.fetch(request) diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py b/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py new file mode 100644 index 00000000..b52ce735 --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py @@ -0,0 +1,14 @@ +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher + + +class JurisdictionFetchRequest(FetchRequest): + jurisdiction_id: int + +class JurisdictionFetcher(MuckrockFetcher): + + def build_url(self, request: JurisdictionFetchRequest) -> str: + return f"{BASE_MUCKROCK_URL}/jurisdiction/{request.jurisdiction_id}/" + + def get_jurisdiction(self, jurisdiction_id: int) -> dict: + return self.fetch(request=JurisdictionFetchRequest(jurisdiction_id=jurisdiction_id)) diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py b/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py new file mode 100644 index 00000000..33bba21d --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py @@ -0,0 +1,28 @@ +import abc +from abc import ABC +from dataclasses import dataclass + +import requests +from pydantic import BaseModel + + +class FetchRequest(BaseModel): + pass + +class MuckrockFetcher(ABC): + + def fetch(self, request: FetchRequest): + url = self.build_url(request) + response = requests.get(url) + try: + response.raise_for_status() + except requests.exceptions.HTTPError as e: + print(f"Failed to get records on request `{url}`: {e}") + return None + + return response.json() + + @abc.abstractmethod + def build_url(self, request: FetchRequest) -> str: + pass + diff --git a/source_collectors/muckrock/muckrock_fetchers/__init__.py b/source_collectors/muckrock/muckrock_fetchers/__init__.py new file mode 100644 index 00000000..e69de29b From cc5b20d2297c412ce3926cfb74837f304375bb20 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 20:19:16 -0500 Subject: [PATCH 10/19] Refactor: Modularize Logic * Create Enum Class * Simplify Agency Info data creation * Extract logic to separate functions --- .../generate_detailed_muckrock_csv.py | 288 +++++++++--------- 1 file changed, 143 insertions(+), 145 deletions(-) diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index 207d2118..f7a65e3b 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -7,167 +7,165 @@ import argparse import csv import time +from enum import Enum +from typing import Optional -from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher, AgencyFetchRequest -from source_collectors.muckrock.muckrock_fetchers.JurisdictionFetcher import JurisdictionFetcher, \ - JurisdictionFetchRequest -from utils import format_filename_json_to_csv, load_json_file - -# Define the CSV headers -headers = [ - "name", - "agency_described", - 
"record_type", - "description", - "source_url", - "readme_url", - "scraper_url", - "state", - "county", - "municipality", - "agency_type", - "jurisdiction_type", - "View Archive", - "agency_aggregation", - "agency_supplied", - "supplying_entity", - "agency_originated", - "originating_agency", - "coverage_start", - "source_last_updated", - "coverage_end", - "number_of_records_available", - "size", - "access_type", - "data_portal_type", - "access_notes", - "record_format", - "update_frequency", - "update_method", - "retention_schedule", - "detail_level", -] +from pydantic import BaseModel +from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher +from source_collectors.muckrock.muckrock_fetchers.JurisdictionFetcher import JurisdictionFetcher +from utils import format_filename_json_to_csv, load_json_file -def main(): - # Load the JSON data - parser = argparse.ArgumentParser(description="Parse JSON from a file.") - parser.add_argument( - "--json_file", type=str, required=True, help="Path to the JSON file" - ) - args = parser.parse_args() +class JurisdictionType(Enum): + FEDERAL = "federal" + STATE = "state" + COUNTY = "county" + LOCAL = "local" + + +class AgencyInfo(BaseModel): + name: Optional[str] = "" + agency_described: Optional[str] = "" + record_type: Optional[str] = "" + description: Optional[str] = "" + source_url: Optional[str] = "" + readme_url: Optional[str] = "" + scraper_url: Optional[str] = "" + state: Optional[str] = "" + county: Optional[str] = "" + municipality: Optional[str] = "" + agency_type: Optional[str] = "" + jurisdiction_type: Optional[JurisdictionType] = None + agency_aggregation: Optional[str] = "" + agency_supplied: Optional[bool] = False + supplying_entity: Optional[str] = "MuckRock" + agency_originated: Optional[bool] = True + originating_agency: Optional[str] = "" + coverage_start: Optional[str] = "" + source_last_updated: Optional[str] = "" + coverage_end: Optional[str] = "" + number_of_records_available: Optional[str] = "" + size: Optional[str] = "" + access_type: Optional[str] = "" + data_portal_type: Optional[str] = "MuckRock" + access_notes: Optional[str] = "" + record_format: Optional[str] = "" + update_frequency: Optional[str] = "" + update_method: Optional[str] = "" + retention_schedule: Optional[str] = "" + detail_level: Optional[str] = "" + + + def model_dump(self, *args, **kwargs): + original_dict = super().model_dump(*args, **kwargs) + original_dict['View Archive'] = '' + return {key: (value.value if isinstance(value, Enum) else value) + for key, value in original_dict.items()} + + def keys(self) -> list[str]: + return list(self.model_dump().keys()) - # TODO: Generalize logic - json_data = load_json_file(args.json_file) - output_csv = format_filename_json_to_csv(args.json_file) +def main(): + json_filename = get_json_filename() + json_data = load_json_file(json_filename) + output_csv = format_filename_json_to_csv(json_filename) + agency_infos = get_agency_infos(json_data) + write_to_csv(agency_infos, output_csv) + + +def get_agency_infos(json_data): + a_fetcher = AgencyFetcher() + j_fetcher = JurisdictionFetcher() + agency_infos = [] + # Iterate through the JSON data + for item in json_data: + print(f"Writing data for {item.get('title')}") + agency_data = a_fetcher.get_agency(agency_id=item.get("agency")) + time.sleep(1) + jurisdiction_data = j_fetcher.get_jurisdiction( + jurisdiction_id=agency_data.get("jurisdiction") + ) + agency_name = agency_data.get("name", "") + agency_info = AgencyInfo( + name=item.get("title", ""), 
+ originating_agency=agency_name, + agency_described=agency_name + ) + jurisdiction_level = jurisdiction_data.get("level") + add_locational_info(agency_info, j_fetcher, jurisdiction_data, jurisdiction_level) + optionally_add_agency_type(agency_data, agency_info) + optionally_add_access_info(agency_info, item) + + # Extract the relevant fields from the JSON object + # TODO: I question the utility of creating columns that are then left blank until later + # and possibly in a different file entirely. + agency_infos.append(agency_info) + return agency_infos + + +def write_to_csv(agency_infos, output_csv): # Open a CSV file for writing - - # TODO: CSV writing and composition logic is tightly coupled -- separate with open(output_csv, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=headers) + writer = csv.DictWriter(csvfile, fieldnames=AgencyInfo().keys()) # Write the header row writer.writeheader() - a_fetcher = AgencyFetcher() - j_fetcher = JurisdictionFetcher() - - # Iterate through the JSON data - for item in json_data: - print(f"Writing data for {item.get('title')}") - agency_data = a_fetcher.get_agency(agency_id=item.get("agency")) - # agency_data = get_agency(item.get("agency")) - time.sleep(1) - jurisdiction_data = j_fetcher.get_jurisdiction( - jurisdiction_id=agency_data.get("jurisdiction") - ) - # jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) - - jurisdiction_level = jurisdiction_data.get("level") - # federal jurisduction level - if jurisdiction_level == "f": - state = "" - county = "" - municipality = "" - juris_type = "federal" - # state jurisdiction level - if jurisdiction_level == "s": - state = jurisdiction_data.get("name") - county = "" - municipality = "" - juris_type = "state" - # local jurisdiction level - if jurisdiction_level == "l": - parent_juris_data = j_fetcher.get_jurisdiction( - jurisdiction_id=jurisdiction_data.get("parent") - ) - - state = parent_juris_data.get("abbrev") - if "County" in jurisdiction_data.get("name"): - county = jurisdiction_data.get("name") - municipality = "" - juris_type = "county" - else: - county = "" - municipality = jurisdiction_data.get("name") - juris_type = "local" - - if "Police" in agency_data.get("types"): - agency_type = "law enforcement/police" - else: - agency_type = "" - - source_url = "" - absolute_url = item.get("absolute_url") - access_type = "" - for comm in item["communications"]: - if comm["files"]: - source_url = absolute_url + "#files" - access_type = "Web page,Download,API" - break - - # Extract the relevant fields from the JSON object - # TODO: I question the utility of creating columns that are then left blank until later - # and possibly in a different file entirely. 
- csv_row = { - "name": item.get("title", ""), - "agency_described": agency_data.get("name", "") + " - " + state, - "record_type": "", - "description": "", - "source_url": source_url, - "readme_url": absolute_url, - "scraper_url": "", - "state": state, - "county": county, - "municipality": municipality, - "agency_type": agency_type, - "jurisdiction_type": juris_type, - "View Archive": "", - "agency_aggregation": "", - "agency_supplied": "no", - "supplying_entity": "MuckRock", - "agency_originated": "yes", - "originating_agency": agency_data.get("name", ""), - "coverage_start": "", - "source_last_updated": "", - "coverage_end": "", - "number_of_records_available": "", - "size": "", - "access_type": access_type, - "data_portal_type": "MuckRock", - "access_notes": "", - "record_format": "", - "update_frequency": "", - "update_method": "", - "retention_schedule": "", - "detail_level": "", - } + for agency_info in agency_infos: + csv_row = agency_info.model_dump() # Write the extracted row to the CSV file writer.writerow(csv_row) +def get_json_filename(): + # Load the JSON data + parser = argparse.ArgumentParser(description="Parse JSON from a file.") + parser.add_argument( + "--json_file", type=str, required=True, help="Path to the JSON file" + ) + args = parser.parse_args() + json_filename = args.json_file + return json_filename + + +def add_locational_info(agency_info, j_fetcher, jurisdiction_data, jurisdiction_level): + # federal jurisdiction level + if jurisdiction_level == "f": + agency_info.jurisdiction_type = JurisdictionType.FEDERAL + # state jurisdiction level + if jurisdiction_level == "s": + agency_info.jurisdiction_type = JurisdictionType.STATE + agency_info.state = jurisdiction_data.get("name") + # local jurisdiction level + if jurisdiction_level == "l": + parent_juris_data = j_fetcher.get_jurisdiction( + jurisdiction_id=jurisdiction_data.get("parent") + ) + agency_info.state = parent_juris_data.get("abbrev") + if "County" in jurisdiction_data.get("name"): + agency_info.county = jurisdiction_data.get("name") + agency_info.jurisdiction_type = JurisdictionType.COUNTY + else: + agency_info.municipality = jurisdiction_data.get("name") + agency_info.jurisdiction_type = JurisdictionType.LOCAL + + +def optionally_add_access_info(agency_info, item): + absolute_url = item.get("absolute_url") + for comm in item["communications"]: + if comm["files"]: + agency_info.source_url = absolute_url + "#files" + agency_info.access_type = "Web page,Download,API" + break + + +def optionally_add_agency_type(agency_data, agency_info): + if "Police" in agency_data.get("types"): + agency_info.agency_type = "law enforcement/police" + + if __name__ == "__main__": main() \ No newline at end of file From 56062d2a8170e76655fb3d79f4b6c0dacff6c168 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 20:22:29 -0500 Subject: [PATCH 11/19] Refactor: Modularize Logic * Create Enum Class * Simplify Agency Info data creation * Extract logic to separate functions --- .../generate_detailed_muckrock_csv.py | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index f7a65e3b..df4a0832 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -132,25 +132,23 @@ def get_json_filename(): def add_locational_info(agency_info, j_fetcher, jurisdiction_data, jurisdiction_level): - # 
federal jurisdiction level - if jurisdiction_level == "f": - agency_info.jurisdiction_type = JurisdictionType.FEDERAL - # state jurisdiction level - if jurisdiction_level == "s": - agency_info.jurisdiction_type = JurisdictionType.STATE - agency_info.state = jurisdiction_data.get("name") - # local jurisdiction level - if jurisdiction_level == "l": - parent_juris_data = j_fetcher.get_jurisdiction( - jurisdiction_id=jurisdiction_data.get("parent") - ) - agency_info.state = parent_juris_data.get("abbrev") - if "County" in jurisdiction_data.get("name"): - agency_info.county = jurisdiction_data.get("name") - agency_info.jurisdiction_type = JurisdictionType.COUNTY - else: - agency_info.municipality = jurisdiction_data.get("name") - agency_info.jurisdiction_type = JurisdictionType.LOCAL + match jurisdiction_level: + case "f": # federal jurisdiction level + agency_info.jurisdiction_type = JurisdictionType.FEDERAL + case "s": # state jurisdiction level + agency_info.jurisdiction_type = JurisdictionType.STATE + agency_info.state = jurisdiction_data.get("name") + case "l": # local jurisdiction level + parent_juris_data = j_fetcher.get_jurisdiction( + jurisdiction_id=jurisdiction_data.get("parent") + ) + agency_info.state = parent_juris_data.get("abbrev") + if "County" in jurisdiction_data.get("name"): + agency_info.county = jurisdiction_data.get("name") + agency_info.jurisdiction_type = JurisdictionType.COUNTY + else: + agency_info.municipality = jurisdiction_data.get("name") + agency_info.jurisdiction_type = JurisdictionType.LOCAL def optionally_add_access_info(agency_info, item): From 62f5a50f897c4a0aea670a556b7226abe2ce5714 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 16 Dec 2024 10:31:39 -0500 Subject: [PATCH 12/19] Refactor get_allegheny_foias.py * Create LoopFetcher classes * Implement in `get_allegheny_foias` --- .../generate_detailed_muckrock_csv.py | 4 +- .../muckrock/get_allegheny_foias.py | 72 ++++++------------- .../muckrock_fetchers/FOIALoopFetcher.py | 31 ++++++++ ...nFetcher.py => JurisdictionByIDFetcher.py} | 8 +-- .../JurisdictionLoopFetcher.py | 47 ++++++++++++ .../muckrock_fetchers/MuckrockLoopFetcher.py | 41 +++++++++++ 6 files changed, 145 insertions(+), 58 deletions(-) create mode 100644 source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py rename source_collectors/muckrock/muckrock_fetchers/{JurisdictionFetcher.py => JurisdictionByIDFetcher.py} (56%) create mode 100644 source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py create mode 100644 source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index df4a0832..cf3c439d 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -13,7 +13,7 @@ from pydantic import BaseModel from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher -from source_collectors.muckrock.muckrock_fetchers.JurisdictionFetcher import JurisdictionFetcher +from source_collectors.muckrock.muckrock_fetchers.JurisdictionByIDFetcher import JurisdictionByIDFetcher from utils import format_filename_json_to_csv, load_json_file @@ -77,7 +77,7 @@ def main(): def get_agency_infos(json_data): a_fetcher = AgencyFetcher() - j_fetcher = JurisdictionFetcher() + j_fetcher = JurisdictionByIDFetcher() agency_infos = [] # Iterate through the JSON data for item in json_data: diff --git 
a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index 3aac1d6f..bddeffad 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -3,45 +3,28 @@ and save them to a JSON file """ -import requests -import json -import time +from source_collectors.muckrock.muckrock_fetchers.FOIALoopFetcher import FOIALoopFetchRequest, FOIALoopFetcher +from source_collectors.muckrock.muckrock_fetchers.JurisdictionLoopFetcher import JurisdictionLoopFetchRequest, \ + JurisdictionLoopFetcher from source_collectors.muckrock.utils import save_json_file -def fetch_jurisdiction_ids(town_file, base_url): +def fetch_jurisdiction_ids(town_file, level="l", parent=126): """ fetch jurisdiction IDs based on town names from a text file """ with open(town_file, "r") as file: town_names = [line.strip() for line in file] - jurisdiction_ids = {} - url = base_url - - while url: - response = requests.get(url) - if response.status_code == 200: - data = response.json() - for item in data.get("results", []): - if item["name"] in town_names: - jurisdiction_ids[item["name"]] = item["id"] - - url = data.get("next") - print( - f"Processed page, found {len(jurisdiction_ids)}/{len(town_names)} jurisdictions so far..." - ) - time.sleep(1) # To respect the rate limit + request = JurisdictionLoopFetchRequest( + level=level, parent=parent, town_names=town_names + ) - elif response.status_code == 503: - print("Error 503: Skipping page") - break - else: - print(f"Error fetching data: {response.status_code}") - break + fetcher = JurisdictionLoopFetcher(request) + fetcher.loop_fetch() + return fetcher.jurisdictions - return jurisdiction_ids def fetch_foia_data(jurisdiction_ids): @@ -50,26 +33,11 @@ def fetch_foia_data(jurisdiction_ids): """ all_data = [] for name, id_ in jurisdiction_ids.items(): - # TODO: The muckrock api should be centralized in a `constants.py` folder - # and the url should be constructed in a function or class - url = f"https://www.muckrock.com/api_v1/foia/?status=done&jurisdiction={id_}" - while url: - response = requests.get(url) - # TODO: If logic similar to `fetch_jurisdiction_ids` and should be generalized - if response.status_code == 200: - data = response.json() - all_data.extend(data.get("results", [])) - url = data.get("next") - print( - f"Fetching records for {name}, {len(all_data)} total records so far..." 
- ) - time.sleep(1) # To respect the rate limit - elif response.status_code == 503: - print(f"Error 503: Skipping page for {name}") - break - else: - print(f"Error fetching data: {response.status_code} for {name}") - break + print(f"\nFetching records for {name}...") + request = FOIALoopFetchRequest(jurisdiction=id_) + fetcher = FOIALoopFetcher(request) + fetcher.loop_fetch() + all_data.extend(fetcher.results) # Save the combined data to a JSON file save_json_file(file_path="foia_data_combined.json", data=all_data) @@ -81,12 +49,12 @@ def main(): Execute the script """ town_file = "allegheny-county-towns.txt" - jurisdiction_url = ( - "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126" - ) - # Fetch jurisdiction IDs based on town names - jurisdiction_ids = fetch_jurisdiction_ids(town_file, jurisdiction_url) + jurisdiction_ids = fetch_jurisdiction_ids( + town_file, + level="l", + parent=126 + ) print(f"Jurisdiction IDs fetched: {jurisdiction_ids}") # Fetch FOIA data for each jurisdiction ID diff --git a/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py b/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py new file mode 100644 index 00000000..2af65c1e --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py @@ -0,0 +1,31 @@ +from datasets import tqdm + +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest +from source_collectors.muckrock.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher + +class FOIALoopFetchRequest(FetchRequest): + jurisdiction: int + +class FOIALoopFetcher(MuckrockLoopFetcher): + + def __init__(self, initial_request: FOIALoopFetchRequest): + super().__init__(initial_request) + self.pbar_records = tqdm( + desc="Fetching FOIA records", + unit="record", + ) + self.num_found = 0 + self.results = [] + + def process_results(self, results: list[dict]): + self.results.extend(results) + + def build_url(self, request: FOIALoopFetchRequest): + return f"{BASE_MUCKROCK_URL}/foia/?status=done&jurisdiction={request.jurisdiction}" + + def report_progress(self): + old_num_found = self.num_found + self.num_found = len(self.results) + difference = self.num_found - old_num_found + self.pbar_records.update(difference) diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py b/source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py similarity index 56% rename from source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py rename to source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py index b52ce735..60cb0c2e 100644 --- a/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py +++ b/source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py @@ -2,13 +2,13 @@ from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher -class JurisdictionFetchRequest(FetchRequest): +class JurisdictionByIDFetchRequest(FetchRequest): jurisdiction_id: int -class JurisdictionFetcher(MuckrockFetcher): +class JurisdictionByIDFetcher(MuckrockFetcher): - def build_url(self, request: JurisdictionFetchRequest) -> str: + def build_url(self, request: JurisdictionByIDFetchRequest) -> str: return f"{BASE_MUCKROCK_URL}/jurisdiction/{request.jurisdiction_id}/" def get_jurisdiction(self, jurisdiction_id: int) -> dict: - return self.fetch(request=JurisdictionFetchRequest(jurisdiction_id=jurisdiction_id)) + return 
self.fetch(request=JurisdictionByIDFetchRequest(jurisdiction_id=jurisdiction_id)) diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py b/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py new file mode 100644 index 00000000..816f4b59 --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py @@ -0,0 +1,47 @@ +from tqdm import tqdm + +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher +from source_collectors.muckrock.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher + + +class JurisdictionLoopFetchRequest(FetchRequest): + level: str + parent: int + town_names: list + +class JurisdictionLoopFetcher(MuckrockLoopFetcher): + + def __init__(self, initial_request: JurisdictionLoopFetchRequest): + super().__init__(initial_request) + self.town_names = initial_request.town_names + self.pbar_jurisdictions = tqdm( + total=len(self.town_names), + desc="Fetching jurisdictions", + unit="jurisdiction", + position=0, + leave=False + ) + self.pbar_page = tqdm( + desc="Processing pages", + unit="page", + position=1, + leave=False + ) + self.num_found = 0 + self.jurisdictions = {} + + def build_url(self, request: JurisdictionLoopFetchRequest) -> str: + return f"{BASE_MUCKROCK_URL}/jurisdiction/?level={request.level}&parent={request.parent}" + + def process_results(self, results: list[dict]): + for item in results: + if item["name"] in self.town_names: + self.jurisdictions[item["name"]] = item["id"] + + def report_progress(self): + old_num_found = self.num_found + self.num_found = len(self.jurisdictions) + difference = self.num_found - old_num_found + self.pbar_jurisdictions.update(difference) + self.pbar_page.update(1) diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py b/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py new file mode 100644 index 00000000..49011df3 --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py @@ -0,0 +1,41 @@ +from abc import ABC, abstractmethod +from time import sleep + +import requests + +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest + + +class MuckrockLoopFetcher(ABC): + + + def __init__(self, initial_request: FetchRequest): + self.initial_request = initial_request + + def loop_fetch(self): + url = self.build_url(self.initial_request) + while url is not None: + response = requests.get(url) + try: + response.raise_for_status() + except requests.exceptions.HTTPError as e: + print(f"Failed to get records on request `{url}`: {e}") + return None + + data = response.json() + self.process_results(data["results"]) + self.report_progress() + url = data["next"] + sleep(1) + + @abstractmethod + def process_results(self, results: list[dict]): + pass + + @abstractmethod + def build_url(self, request: FetchRequest) -> str: + pass + + @abstractmethod + def report_progress(self): + pass From b6b30a416a3081a25a167c6ae3bcd7222a64fc63 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 16 Dec 2024 16:48:37 -0500 Subject: [PATCH 13/19] Refactor create_foia_data_db.py * Create SQLClient classes * Add custom exception handling to Muckrock Fetcher. * Clean up comments * Extract some logic to separate functions. 
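A minimal usage sketch of the new SQLClient classes (as defined in the SQLiteClient.py diff below); the database name, table, and rows are illustrative only:

from source_collectors.muckrock.SQLiteClient import SQLClientError, SQLiteClientContextManager

with SQLiteClientContextManager("example.db") as client:
    try:
        # execute_query commits on success and rolls back (raising SQLClientError) on sqlite3 errors
        client.execute_query("CREATE TABLE IF NOT EXISTS demo (id INTEGER, title TEXT)")
        client.execute_query(
            "INSERT INTO demo (id, title) VALUES (?, ?)",
            many=[(1, "first"), (2, "second")],
        )
    except SQLClientError as e:
        print(f"Query failed: {e}")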
--- source_collectors/muckrock/SQLiteClient.py | 38 ++++ .../muckrock/create_foia_data_db.py | 176 ++++++++---------- .../muckrock_fetchers/MuckrockFetcher.py | 14 ++ 3 files changed, 130 insertions(+), 98 deletions(-) create mode 100644 source_collectors/muckrock/SQLiteClient.py diff --git a/source_collectors/muckrock/SQLiteClient.py b/source_collectors/muckrock/SQLiteClient.py new file mode 100644 index 00000000..96a59d82 --- /dev/null +++ b/source_collectors/muckrock/SQLiteClient.py @@ -0,0 +1,38 @@ +import logging +import sqlite3 + + +class SQLClientError(Exception): + pass + + +class SQLiteClient: + + def __init__(self, db_path: str) -> None: + self.conn = sqlite3.connect(db_path) + + def execute_query(self, query: str, many=None): + + try: + if many is not None: + self.conn.executemany(query, many) + else: + self.conn.execute(query) + self.conn.commit() + except sqlite3.Error as e: + print(f"SQLite error: {e}") + error_msg = f"Failed to execute query due to SQLite error: {e}" + logging.error(error_msg) + self.conn.rollback() + raise SQLClientError(error_msg) + +class SQLiteClientContextManager: + + def __init__(self, db_path: str) -> None: + self.client = SQLiteClient(db_path) + + def __enter__(self): + return self.client + + def __exit__(self, exc_type, exc_value, traceback): + self.client.conn.close() \ No newline at end of file diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py index 85c7fd4b..6bff13f7 100644 --- a/source_collectors/muckrock/create_foia_data_db.py +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -19,20 +19,24 @@ and/or printed to the console. """ -import requests -import sqlite3 import logging import os import json import time -from typing import List, Tuple, Dict, Any, Union, Literal +from typing import List, Tuple, Dict, Any + +from tqdm import tqdm + +from source_collectors.muckrock.SQLiteClient import SQLiteClientContextManager, SQLClientError +from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import MuckrockNoMoreDataError logging.basicConfig( filename="errors.log", level=logging.ERROR, format="%(levelname)s: %(message)s" ) +# TODO: Why are we pulling every single FOIA request? -base_url = "https://www.muckrock.com/api_v1/foia/" last_page_fetched = "last_page_fetched.txt" NO_MORE_DATA = -1 # flag for program exit @@ -83,70 +87,32 @@ def create_db() -> bool: bool: True, if database is successfully created; False otherwise. Raises: - sqlite3.Error: If the table creation operation fails, prints error and returns False. - """ - - try: - with sqlite3.connect("foia_data.db") as conn: - conn.execute(create_table_query) - conn.commit() - print("Successfully created foia_data.db!") - return True - except sqlite3.Error as e: - print(f"SQLite error: {e}.") - logging.error(f"Failed to create foia_data.db due to SQLite error: {e}") - return False - - -def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: + sqlite3.Error: If the table creation operation fails, + prints error and returns False. """ - Fetches a page of 100 results from the MuckRock FOIA API. - - Args: - page (int): The page number to fetch from the API. - - Returns: - Union[JSON, None, Literal[NO_MORE_DATA]]: - - JSON Dict[str, Any]: The response's JSON data, if the request is successful. - - NO_MORE_DATA (int = -1): A constant, if there are no more pages to fetch (indicated by a 404 response). 
- - None: If there is an error other than 404. - """ - - # TODO: Refactor to use FOIA Fetcher - per_page = 100 - response = requests.get( - base_url, params={"page": page, "page_size": per_page, "format": "json"} - ) - - if response.status_code == 200: - return response.json() - elif response.status_code == 404: - print("No more pages to fetch") - return NO_MORE_DATA # Typically 404 response will mean there are no more pages to fetch - elif 500 <= response.status_code < 600: - logging.error(f"Server error {response.status_code} on page {page}") - page = page + 1 - return fetch_page(page) - else: - print(f"Error fetching page {page}: {response.status_code}") - logging.error( - f"Fetching page {page} failed with response code: { - response.status_code}" - ) - return None - + with SQLiteClientContextManager("foia_data.db") as client: + try: + client.execute_query(create_table_query) + return True + except SQLClientError as e: + print(f"SQLite error: {e}.") + logging.error(f"Failed to create foia_data.db due to SQLite error: {e}") + return False def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: """ - Transforms the data recieved from the MuckRock FOIA API into a structured format for insertion into a database with `populate_db()`. + Transforms the data received from the MuckRock FOIA API + into a structured format for insertion into a database with `populate_db()`. - Transforms JSON input into a list of tuples, as well as serializes the nested `tags` and `communications` fields into JSON strings. + Transforms JSON input into a list of tuples, + as well as serializes the nested `tags` and `communications` fields + into JSON strings. Args: - data_to_transform (JSON: Dict[str, Any]): The JSON data from the API response. - + data_to_transform: The JSON data from the API response. Returns: - transformed_data (List[Tuple[Any, ...]]: A list of tuples, where each tuple contains the fields of a single FOIA request. + A list of tuples, where each tuple contains the fields + of a single FOIA request. """ transformed_data = [] @@ -198,39 +164,40 @@ def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: sqlite3.Error: If the insertion operation fails, attempts to retry operation (max_retries = 2). If retries are exhausted, logs error and exits. """ - - with sqlite3.connect("foia_data.db") as conn: - + with SQLiteClientContextManager("foia_data.db") as client: retries = 0 max_retries = 2 while retries < max_retries: try: - conn.executemany(foia_insert_query, transformed_data) - conn.commit() + client.execute_query(foia_insert_query, many=transformed_data) print("Successfully inserted data!") return - except sqlite3.Error as e: - print(f"SQLite error: {e}. Retrying...") - conn.rollback() + except SQLClientError as e: + print(f"{e}. Retrying...") retries += 1 time.sleep(1) if retries == max_retries: - print( - f"Failed to insert data from page {page} after { - max_retries} attempts. Skipping to next page." - ) - logging.error( - f"Failed to insert data from page {page} after { - max_retries} attempts." - ) + report_max_retries_error(max_retries, page) + + +def report_max_retries_error(max_retries, page): + print( + f"Failed to insert data from page {page} after { + max_retries} attempts. Skipping to next page." + ) + logging.error( + f"Failed to insert data from page {page} after { + max_retries} attempts." + ) def main() -> None: """ Main entry point for create_foia_data_db.py. 
- This function orchestrates the process of fetching FOIA requests data from the MuckRock FOIA API, transforming it, + This function orchestrates the process of fetching + FOIA requests data from the MuckRock FOIA API, transforming it, and storing it in a SQLite database. """ @@ -241,33 +208,46 @@ def main() -> None: print("Failed to create foia_data.db") return - if os.path.exists(last_page_fetched): - with open(last_page_fetched, mode="r") as file: - page = int(file.read()) + 1 - else: - page = 1 - - while True: + start_page = get_start_page() + fetcher = FOIAFetcher( + start_page=start_page + ) - print(f"Fetching page {page}...") - page_data = fetch_page(page) + with tqdm(initial=start_page, unit="page") as pbar: + while True: - if page_data == NO_MORE_DATA: - break # Exit program because no more data exixts - if page_data is None: - print(f"Skipping page {page}...") - page += 1 - continue + # TODO: Replace with TQDM + try: + pbar.update() + page_data = fetcher.fetch_next_page() + except MuckrockNoMoreDataError: + # Exit program because no more data exists + break + if page_data is None: + continue + transformed_data = transform_page_data(page_data) + populate_db(transformed_data, fetcher.current_page) + + with open(last_page_fetched, mode="w") as file: + file.write(str(fetcher.current_page)) - transformed_data = transform_page_data(page_data) + print("create_foia_data_db.py run finished") - populate_db(transformed_data, page) - with open(last_page_fetched, mode="w") as file: - file.write(str(page)) - page += 1 +def get_start_page(): + """ + Returns the page number to start fetching from. - print("create_foia_data_db.py run finished") + If the file `last_page_fetched` exists, + reads the page number from the file and returns it + 1. + Otherwise, returns 1. 
+ """ + if os.path.exists(last_page_fetched): + with open(last_page_fetched, mode="r") as file: + page = int(file.read()) + 1 + else: + page = 1 + return page if __name__ == "__main__": diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py b/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py index 33bba21d..e7a1dff5 100644 --- a/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py +++ b/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py @@ -5,6 +5,11 @@ import requests from pydantic import BaseModel +class MuckrockNoMoreDataError(Exception): + pass + +class MuckrockServerError(Exception): + pass class FetchRequest(BaseModel): pass @@ -18,6 +23,15 @@ def fetch(self, request: FetchRequest): response.raise_for_status() except requests.exceptions.HTTPError as e: print(f"Failed to get records on request `{url}`: {e}") + # If code is 404, raise NoMoreData error + if e.response.status_code == 404: + raise MuckrockNoMoreDataError + if 500 <= e.response.status_code < 600: + raise MuckrockServerError + + + + return None return response.json() From ee4a854845ebe58a1f36c76710b55317314272a5 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 16 Dec 2024 17:41:54 -0500 Subject: [PATCH 14/19] Refactor search_foia_data_db.py * Create FOIA DB Searcher class, incorporate into module * Extract logic to functions --- source_collectors/muckrock/FOIADBSearcher.py | 65 +++++++++++++++++ source_collectors/muckrock/constants.py | 3 +- .../muckrock/search_foia_data_db.py | 72 ++++--------------- 3 files changed, 80 insertions(+), 60 deletions(-) create mode 100644 source_collectors/muckrock/FOIADBSearcher.py diff --git a/source_collectors/muckrock/FOIADBSearcher.py b/source_collectors/muckrock/FOIADBSearcher.py new file mode 100644 index 00000000..391f7a8d --- /dev/null +++ b/source_collectors/muckrock/FOIADBSearcher.py @@ -0,0 +1,65 @@ +import os +import sqlite3 + +import pandas as pd + +from source_collectors.muckrock.constants import FOIA_DATA_DB + +check_results_table_query = """ + SELECT name FROM sqlite_master + WHERE (type = 'table') + AND (name = 'results') + """ + +search_foia_query = """ + SELECT * FROM results + WHERE (title LIKE ? OR tags LIKE ?) + AND (status = 'done') + """ + + +class FOIADBSearcher: + + def __init__(self, db_path = FOIA_DATA_DB): + self.db_path = db_path + if not os.path.exists(self.db_path): + raise FileNotFoundError("foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it.") + + + def search(self, search_string: str) -> pd.DataFrame | None: + """ + Searches the foia_data.db database for FOIA request entries matching the provided search string. + + Args: + search_string (str): The string to search for in the `title` and `tags` of the `results` table. + + Returns: + Union[pandas.DataFrame, None]: + - pandas.DataFrame: A DataFrame containing the matching entries from the database. + - None: If an error occurs during the database operation. + + Raises: + sqlite3.Error: If any database operation fails, prints error and returns None. + Exception: If any unexpected error occurs, prints error and returns None. 
+ """ + try: + with sqlite3.connect(self.db_path) as conn: + results_table = pd.read_sql_query(check_results_table_query, conn) + if results_table.empty: + print("The `results` table does not exist in the database.") + return None + + df = pd.read_sql_query( + sql=search_foia_query, + con=conn, + params=[f"%{search_string}%", f"%{search_string}%"] + ) + + except sqlite3.Error as e: + print(f"Sqlite error: {e}") + return None + except Exception as e: + print(f"An unexpected error occurred: {e}") + return None + + return df \ No newline at end of file diff --git a/source_collectors/muckrock/constants.py b/source_collectors/muckrock/constants.py index 7109847f..07dca8f4 100644 --- a/source_collectors/muckrock/constants.py +++ b/source_collectors/muckrock/constants.py @@ -1,3 +1,4 @@ -BASE_MUCKROCK_URL = "https://www.muckrock.com/api_v1" \ No newline at end of file +BASE_MUCKROCK_URL = "https://www.muckrock.com/api_v1" +FOIA_DATA_DB = "foia_data.db" \ No newline at end of file diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py index e7550608..7820540d 100644 --- a/source_collectors/muckrock/search_foia_data_db.py +++ b/source_collectors/muckrock/search_foia_data_db.py @@ -25,17 +25,7 @@ import os from typing import Union, List, Dict -check_results_table_query = """ - SELECT name FROM sqlite_master - WHERE (type = 'table') - AND (name = 'results') - """ - -search_foia_query = """ - SELECT * FROM results - WHERE (title LIKE ? OR tags LIKE ?) - AND (status = 'done') - """ +from source_collectors.muckrock.FOIADBSearcher import FOIADBSearcher def parser_init() -> argparse.ArgumentParser: @@ -61,45 +51,8 @@ def parser_init() -> argparse.ArgumentParser: def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]: - """ - Searches the foia_data.db database for FOIA request entries matching the provided search string. - - Args: - search_string (str): The string to search for in the `title` and `tags` of the `results` table. - - Returns: - Union[pandas.DataFrame, None]: - - pandas.DataFrame: A DataFrame containing the matching entries from the database. - - None: If an error occurs during the database operation. - - Raises: - sqlite3.Error: If any database operation fails, prints error and returns None. - Exception: If any unexpected error occurs, prints error and returns None. - """ - - print(f'Searching foia_data.db for "{search_string}"...') - - try: - with sqlite3.connect("foia_data.db") as conn: - - results_table = pd.read_sql_query(check_results_table_query, conn) - - if results_table.empty: - print("The `results` table does not exist in the database.") - return None - - params = [f"%{search_string}%", f"%{search_string}%"] - - df = pd.read_sql_query(search_foia_query, conn, params=params) - - except sqlite3.Error as e: - print(f"Sqlite error: {e}") - return None - except Exception as e: - print(f"An unexpected error occurred: {e}") - return None - - return df + searcher = FOIADBSearcher() + return searcher.search(search_string) def parse_communications_column(communications) -> List[Dict]: @@ -164,24 +117,25 @@ def main() -> None: args = parser.parse_args() search_string = args.search_for - if not os.path.exists("foia_data.db"): - print( - "foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it." 
- ) - return - df = search_foia_db(search_string) if df is None: return + update_communications_column(df) - if not df["communications"].empty: - df["communications"] = df["communications"].apply(parse_communications_column) + announce_matching_entries(df, search_string) + generate_json(df, search_string) + + +def announce_matching_entries(df, search_string): print( f'Found {df.shape[0]} matching entries containing "{search_string}" in the title or tags' ) - generate_json(df, search_string) + +def update_communications_column(df): + if not df["communications"].empty: + df["communications"] = df["communications"].apply(parse_communications_column) if __name__ == "__main__": From ee76173177b14351615b4cb8407526a44bc04e45 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 16 Dec 2024 17:43:56 -0500 Subject: [PATCH 15/19] Refactor Directory * Move all class files into `/classes` module --- source_collectors/muckrock/{ => classes}/FOIADBSearcher.py | 0 source_collectors/muckrock/{ => classes}/FOIASearcher.py | 2 +- source_collectors/muckrock/{ => classes}/SQLiteClient.py | 0 .../muckrock/{muckrock_fetchers => classes}/__init__.py | 0 .../{ => classes}/muckrock_fetchers/AgencyFetcher.py | 2 +- .../muckrock/{ => classes}/muckrock_fetchers/FOIAFetcher.py | 2 +- .../{ => classes}/muckrock_fetchers/FOIALoopFetcher.py | 4 ++-- .../muckrock_fetchers/JurisdictionByIDFetcher.py | 2 +- .../muckrock_fetchers/JurisdictionLoopFetcher.py | 4 ++-- .../{ => classes}/muckrock_fetchers/MuckrockFetcher.py | 0 .../{ => classes}/muckrock_fetchers/MuckrockLoopFetcher.py | 2 +- .../muckrock/classes/muckrock_fetchers/__init__.py | 0 source_collectors/muckrock/create_foia_data_db.py | 6 +++--- .../muckrock/generate_detailed_muckrock_csv.py | 4 ++-- source_collectors/muckrock/get_allegheny_foias.py | 4 ++-- source_collectors/muckrock/muck_get.py | 4 ++-- source_collectors/muckrock/search_foia_data_db.py | 4 +--- 17 files changed, 19 insertions(+), 21 deletions(-) rename source_collectors/muckrock/{ => classes}/FOIADBSearcher.py (100%) rename source_collectors/muckrock/{ => classes}/FOIASearcher.py (95%) rename source_collectors/muckrock/{ => classes}/SQLiteClient.py (100%) rename source_collectors/muckrock/{muckrock_fetchers => classes}/__init__.py (100%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/AgencyFetcher.py (78%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/FOIAFetcher.py (90%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/FOIALoopFetcher.py (82%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/JurisdictionByIDFetcher.py (81%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/JurisdictionLoopFetcher.py (87%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/MuckrockFetcher.py (100%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/MuckrockLoopFetcher.py (91%) create mode 100644 source_collectors/muckrock/classes/muckrock_fetchers/__init__.py diff --git a/source_collectors/muckrock/FOIADBSearcher.py b/source_collectors/muckrock/classes/FOIADBSearcher.py similarity index 100% rename from source_collectors/muckrock/FOIADBSearcher.py rename to source_collectors/muckrock/classes/FOIADBSearcher.py diff --git a/source_collectors/muckrock/FOIASearcher.py b/source_collectors/muckrock/classes/FOIASearcher.py similarity index 95% rename from source_collectors/muckrock/FOIASearcher.py rename to source_collectors/muckrock/classes/FOIASearcher.py index 9d6116b7..f88f8242 100644 --- 
a/source_collectors/muckrock/FOIASearcher.py +++ b/source_collectors/muckrock/classes/FOIASearcher.py @@ -1,6 +1,6 @@ from typing import Optional -from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher from tqdm import tqdm class FOIASearcher: diff --git a/source_collectors/muckrock/SQLiteClient.py b/source_collectors/muckrock/classes/SQLiteClient.py similarity index 100% rename from source_collectors/muckrock/SQLiteClient.py rename to source_collectors/muckrock/classes/SQLiteClient.py diff --git a/source_collectors/muckrock/muckrock_fetchers/__init__.py b/source_collectors/muckrock/classes/__init__.py similarity index 100% rename from source_collectors/muckrock/muckrock_fetchers/__init__.py rename to source_collectors/muckrock/classes/__init__.py diff --git a/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py similarity index 78% rename from source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py index 2e36ce31..b70c07e0 100644 --- a/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py @@ -1,5 +1,5 @@ from source_collectors.muckrock.constants import BASE_MUCKROCK_URL -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher class AgencyFetchRequest(FetchRequest): diff --git a/source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py similarity index 90% rename from source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py index 5b780a99..619b92ae 100644 --- a/source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py @@ -1,4 +1,4 @@ -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import MuckrockFetcher, FetchRequest +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import MuckrockFetcher, FetchRequest from source_collectors.muckrock.constants import BASE_MUCKROCK_URL FOIA_BASE_URL = f"{BASE_MUCKROCK_URL}/foia" diff --git a/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/FOIALoopFetcher.py similarity index 82% rename from source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/FOIALoopFetcher.py index 2af65c1e..ad78f0b6 100644 --- a/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/FOIALoopFetcher.py @@ -1,8 +1,8 @@ from datasets import tqdm from source_collectors.muckrock.constants import BASE_MUCKROCK_URL -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest -from source_collectors.muckrock.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher class 
FOIALoopFetchRequest(FetchRequest): jurisdiction: int diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py similarity index 81% rename from source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py index 60cb0c2e..a038418c 100644 --- a/source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py @@ -1,5 +1,5 @@ from source_collectors.muckrock.constants import BASE_MUCKROCK_URL -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher class JurisdictionByIDFetchRequest(FetchRequest): diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionLoopFetcher.py similarity index 87% rename from source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionLoopFetcher.py index 816f4b59..46c1bbf6 100644 --- a/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionLoopFetcher.py @@ -1,8 +1,8 @@ from tqdm import tqdm from source_collectors.muckrock.constants import BASE_MUCKROCK_URL -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher -from source_collectors.muckrock.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher class JurisdictionLoopFetchRequest(FetchRequest): diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py similarity index 100% rename from source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py similarity index 91% rename from source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py index 49011df3..2b3d0149 100644 --- a/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py @@ -3,7 +3,7 @@ import requests -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest class MuckrockLoopFetcher(ABC): diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/__init__.py b/source_collectors/muckrock/classes/muckrock_fetchers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py index 6bff13f7..f012f5d3 100644 --- 
a/source_collectors/muckrock/create_foia_data_db.py +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -27,9 +27,9 @@ from tqdm import tqdm -from source_collectors.muckrock.SQLiteClient import SQLiteClientContextManager, SQLClientError -from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import MuckrockNoMoreDataError +from source_collectors.muckrock.classes.SQLiteClient import SQLiteClientContextManager, SQLClientError +from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import MuckrockNoMoreDataError logging.basicConfig( filename="errors.log", level=logging.ERROR, format="%(levelname)s: %(message)s" diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index cf3c439d..3cb884c0 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -12,8 +12,8 @@ from pydantic import BaseModel -from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher -from source_collectors.muckrock.muckrock_fetchers.JurisdictionByIDFetcher import JurisdictionByIDFetcher +from source_collectors.muckrock.classes.muckrock_fetchers import AgencyFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.JurisdictionByIDFetcher import JurisdictionByIDFetcher from utils import format_filename_json_to_csv, load_json_file diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index bddeffad..b269ff18 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -4,8 +4,8 @@ """ -from source_collectors.muckrock.muckrock_fetchers.FOIALoopFetcher import FOIALoopFetchRequest, FOIALoopFetcher -from source_collectors.muckrock.muckrock_fetchers.JurisdictionLoopFetcher import JurisdictionLoopFetchRequest, \ +from source_collectors.muckrock.classes.muckrock_fetchers.FOIALoopFetcher import FOIALoopFetchRequest, FOIALoopFetcher +from source_collectors.muckrock.classes.muckrock_fetchers import JurisdictionLoopFetchRequest, \ JurisdictionLoopFetcher from source_collectors.muckrock.utils import save_json_file diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index b1a51022..f51bf9e0 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -2,8 +2,8 @@ A straightforward standalone script for downloading data from MuckRock and searching for it with a specific search string. """ -from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher -from source_collectors.muckrock.FOIASearcher import FOIASearcher +from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher +from source_collectors.muckrock.classes.FOIASearcher import FOIASearcher from source_collectors.muckrock.utils import save_json_file if __name__ == "__main__": diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py index 7820540d..51357663 100644 --- a/source_collectors/muckrock/search_foia_data_db.py +++ b/source_collectors/muckrock/search_foia_data_db.py @@ -18,14 +18,12 @@ Errors encountered during database operations, JSON parsing, or file writing are printed to the console. 
""" -import sqlite3 import pandas as pd import json import argparse -import os from typing import Union, List, Dict -from source_collectors.muckrock.FOIADBSearcher import FOIADBSearcher +from source_collectors.muckrock.classes.FOIADBSearcher import FOIADBSearcher def parser_init() -> argparse.ArgumentParser: From 147a786b9211b068bcb43c69a4fe256720c682db Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 18 Dec 2024 08:12:50 -0500 Subject: [PATCH 16/19] Begin draft of PDAP client --- pdap_api_client/PDAPClient.py | 76 +++++++++++++++++++++++++++++++++++ pdap_api_client/__init__.py | 0 2 files changed, 76 insertions(+) create mode 100644 pdap_api_client/PDAPClient.py create mode 100644 pdap_api_client/__init__.py diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py new file mode 100644 index 00000000..96b9a343 --- /dev/null +++ b/pdap_api_client/PDAPClient.py @@ -0,0 +1,76 @@ +from urllib import parse +from enum import Enum +from typing import Optional + +import requests +from requests.models import PreparedRequest + +API_URL = "https://data-sources-v2.pdap.dev/api" + +class Namespaces(Enum): + AUTH = "auth" + + +class RequestManager: + """ + Handles making requests and managing the responses + """ + + + + +class URLBuilder: + + def __init__(self): + self.base_url = API_URL + + def build_url( + self, + namespace: Namespaces, + subdomains: Optional[list[str]] = None, + query_parameters: Optional[dict] = None + ): + url = f"{self.base_url}/{namespace.value}" + if subdomains is not None: + url = f"{url}/{'/'.join(subdomains)}" + if query_parameters is None: + return url + req = PreparedRequest() + req.prepare_url(url, params=query_parameters) + return req.url + + + +class AccessManager: + """ + Manages login, api key, access and refresh tokens + """ + def __init__(self, email: str, password: str): + self.url_builder = URLBuilder() + + def login(self, email: str, password: str): + url = self.url_builder.build_url( + namespace=Namespaces.AUTH, + subdomains=["login"] + ) + response = requests.post( + url=url, + json={ + "email": email, + "password": password + } + ) + response.raise_for_status() + # TODO: Finish + + +class PDAPClient: + + def __init__(self): + pass + + def match_agency(self): + pass + + def check_for_unique_source_url(self, url: str): + pass \ No newline at end of file diff --git a/pdap_api_client/__init__.py b/pdap_api_client/__init__.py new file mode 100644 index 00000000..e69de29b From 82d8c5be812fad3dd631a8744d8a814d74b7bd3a Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 18 Dec 2024 12:00:48 -0500 Subject: [PATCH 17/19] Continue draft --- pdap_api_client/DTOs.py | 6 ++ pdap_api_client/PDAPClient.py | 148 ++++++++++++++++++++++++++++------ 2 files changed, 131 insertions(+), 23 deletions(-) create mode 100644 pdap_api_client/DTOs.py diff --git a/pdap_api_client/DTOs.py b/pdap_api_client/DTOs.py new file mode 100644 index 00000000..b85511b3 --- /dev/null +++ b/pdap_api_client/DTOs.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class MatchAgencyInfo(BaseModel): + submitted_name: str + id: str \ No newline at end of file diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index 96b9a343..a0763fec 100644 --- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -1,20 +1,57 @@ +from http import HTTPStatus from urllib import parse from enum import Enum -from typing import Optional +from typing import Optional, List import requests +from pydantic import BaseModel from requests.models import PreparedRequest 
+from pdap_api_client.DTOs import MatchAgencyInfo + API_URL = "https://data-sources-v2.pdap.dev/api" class Namespaces(Enum): AUTH = "auth" - - -class RequestManager: - """ - Handles making requests and managing the responses - """ + MATCH = "match" + +class RequestType(Enum): + POST = "POST" + PUT = "PUT" + GET = "GET" + DELETE = "DELETE" + +class RequestInfo(BaseModel): + type_: RequestType + url: str + json: Optional[dict] = None + headers: Optional[dict] = None + params: Optional[dict] = None + timeout: Optional[int] = None + +class ResponseInfo(BaseModel): + status_code: HTTPStatus + data: Optional[dict] + +request_methods = { + RequestType.POST: requests.post, + RequestType.PUT: requests.put, + RequestType.GET: requests.get, + RequestType.DELETE: requests.delete, +} +def make_request(ri: RequestInfo) -> ResponseInfo: + response = request_methods[ri.type_]( + ri.url, + json=ri.json, + headers=ri.headers, + params=ri.params, + timeout=ri.timeout + ) + response.raise_for_status() + return ResponseInfo( + status_code=response.status_code, + data=response.json() + ) @@ -28,49 +65,114 @@ def build_url( self, namespace: Namespaces, subdomains: Optional[list[str]] = None, - query_parameters: Optional[dict] = None ): url = f"{self.base_url}/{namespace.value}" if subdomains is not None: url = f"{url}/{'/'.join(subdomains)}" - if query_parameters is None: - return url - req = PreparedRequest() - req.prepare_url(url, params=query_parameters) - return req.url - + return url +def build_url( + namespace: Namespaces, + subdomains: Optional[list[str]] = None +): + url = f"{API_URL}/{namespace.value}" + if subdomains is not None: + url = f"{url}/{'/'.join(subdomains)}" + return url class AccessManager: """ Manages login, api key, access and refresh tokens """ - def __init__(self, email: str, password: str): + def __init__(self, email: str, password: str, api_key: Optional[str]): self.url_builder = URLBuilder() + self.access_token = None + self.refresh_token = None + self.api_key = None + self.login(email=email, password=password) + + # TODO: Add means to refresh if token expired. 
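# Illustrative sketch (not part of the patched file): how the request plumbing added above
# composes. The request body values are placeholders; the endpoint path mirrors the
# match_agency call defined later in this patch.
from pdap_api_client.PDAPClient import (
    Namespaces, RequestInfo, RequestType, build_url, make_request,
)

url = build_url(namespace=Namespaces.MATCH, subdomains=["agency"])
# -> "https://data-sources-v2.pdap.dev/api/match/agency"

request_info = RequestInfo(
    type_=RequestType.POST,
    url=url,
    json={"name": "Example Police Department", "state": "Pennsylvania", "county": "", "locality": ""},
)
response_info = make_request(request_info)  # raise_for_status() propagates non-2xx responses
print(response_info.status_code, response_info.data)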
+ + def load_api_key(self): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["api-key"] + ) + request_info = RequestInfo( + url=url, + headers=self.jwt_header() + ) + response_info = make_request(request_info) + self.api_key = response_info.data["api_key"] def login(self, email: str, password: str): - url = self.url_builder.build_url( + url = build_url( namespace=Namespaces.AUTH, subdomains=["login"] ) - response = requests.post( + request_info = RequestInfo( url=url, json={ "email": email, "password": password } ) - response.raise_for_status() - # TODO: Finish + response_info = make_request(request_info) + data = response_info.data + self.access_token = data["access_token"] + self.refresh_token = data["refresh_token"] + + + def jwt_header(self) -> dict: + """ + Retrieve JWT header + Returns: Dictionary of Bearer Authorization with JWT key + """ + return { + "Authorization": f"Bearer {self.access_token}" + } + + def api_key_header(self): + """ + Retrieve API key header + Returns: Dictionary of Basic Authorization with API key + + """ + if self.api_key is None: + self.load_api_key() + return { + "Authorization": f"Basic {self.api_key}" + } class PDAPClient: - def __init__(self): - pass + def __init__(self, access_manager: AccessManager): + self.access_manager = access_manager + + def match_agency( + self, + name: str, + state: str, + county: str, + locality: str + ) -> List[MatchAgencyInfo]: + url = build_url( + namespace=Namespaces.MATCH, + subdomains=["agency"] + ) + request_info = RequestInfo( + url=url, + json={ + "name": name, + "state": state, + "county": county, + "locality": locality + } + ) + response_info = make_request(request_info) + return [MatchAgencyInfo(**agency) for agency in response_info.data["agencies"]] - def match_agency(self): - pass def check_for_unique_source_url(self, url: str): pass \ No newline at end of file From 55695fb212880f50b0fe59a1447018e41ffba691 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 18 Dec 2024 16:20:27 -0500 Subject: [PATCH 18/19] Continue draft of PDAP client --- pdap_api_client/DTOs.py | 50 ++++++++++++- pdap_api_client/PDAPClient.py | 136 ++++++++++++++++++++-------------- 2 files changed, 129 insertions(+), 57 deletions(-) diff --git a/pdap_api_client/DTOs.py b/pdap_api_client/DTOs.py index b85511b3..31c8c2cf 100644 --- a/pdap_api_client/DTOs.py +++ b/pdap_api_client/DTOs.py @@ -1,6 +1,54 @@ +from enum import Enum +from http import HTTPStatus +from typing import Optional + from pydantic import BaseModel class MatchAgencyInfo(BaseModel): submitted_name: str - id: str \ No newline at end of file + id: str + +class ApprovalStatus(Enum): + APPROVED = "approved" + REJECTED = "rejected" + PENDING = "pending" + NEEDS_IDENTIFICATION = "needs identification" + + + +class UniqueURLDuplicateInfo(BaseModel): + original_url: str + approval_status: ApprovalStatus + rejection_note: str + +class UniqueURLResponseInfo(BaseModel): + is_unique: bool + duplicates: list[UniqueURLDuplicateInfo] + + +class Namespaces(Enum): + AUTH = "auth" + MATCH = "match" + CHECK = "check" + + +class RequestType(Enum): + POST = "POST" + PUT = "PUT" + GET = "GET" + DELETE = "DELETE" + + +class RequestInfo(BaseModel): + type_: RequestType + url: str + json: Optional[dict] = None + headers: Optional[dict] = None + params: Optional[dict] = None + timeout: Optional[int] = 10 + + +class ResponseInfo(BaseModel): + status_code: HTTPStatus + data: Optional[dict] diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index a0763fec..bdac2e05 100644 
--- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -1,75 +1,42 @@ from http import HTTPStatus -from urllib import parse -from enum import Enum from typing import Optional, List import requests -from pydantic import BaseModel -from requests.models import PreparedRequest -from pdap_api_client.DTOs import MatchAgencyInfo +from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, UniqueURLResponseInfo, Namespaces, \ + RequestType, RequestInfo, ResponseInfo API_URL = "https://data-sources-v2.pdap.dev/api" -class Namespaces(Enum): - AUTH = "auth" - MATCH = "match" - -class RequestType(Enum): - POST = "POST" - PUT = "PUT" - GET = "GET" - DELETE = "DELETE" - -class RequestInfo(BaseModel): - type_: RequestType - url: str - json: Optional[dict] = None - headers: Optional[dict] = None - params: Optional[dict] = None - timeout: Optional[int] = None - -class ResponseInfo(BaseModel): - status_code: HTTPStatus - data: Optional[dict] - request_methods = { RequestType.POST: requests.post, RequestType.PUT: requests.put, RequestType.GET: requests.get, RequestType.DELETE: requests.delete, } -def make_request(ri: RequestInfo) -> ResponseInfo: - response = request_methods[ri.type_]( - ri.url, - json=ri.json, - headers=ri.headers, - params=ri.params, - timeout=ri.timeout - ) - response.raise_for_status() - return ResponseInfo( - status_code=response.status_code, - data=response.json() - ) +class CustomHTTPException(Exception): + pass -class URLBuilder: - - def __init__(self): - self.base_url = API_URL +def make_request(ri: RequestInfo) -> ResponseInfo: + try: + response = request_methods[ri.type_]( + ri.url, + json=ri.json, + headers=ri.headers, + params=ri.params, + timeout=ri.timeout + ) + response.raise_for_status() + except requests.RequestException as e: + raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") + return ResponseInfo( + status_code=HTTPStatus(response.status_code), + data=response.json() + ) - def build_url( - self, - namespace: Namespaces, - subdomains: Optional[list[str]] = None, - ): - url = f"{self.base_url}/{namespace.value}" - if subdomains is not None: - url = f"{url}/{'/'.join(subdomains)}" - return url def build_url( namespace: Namespaces, @@ -85,7 +52,6 @@ class AccessManager: Manages login, api key, access and refresh tokens """ def __init__(self, email: str, password: str, api_key: Optional[str]): - self.url_builder = URLBuilder() self.access_token = None self.refresh_token = None self.api_key = None @@ -99,18 +65,49 @@ def load_api_key(self): subdomains=["api-key"] ) request_info = RequestInfo( + type_ = RequestType.POST, url=url, headers=self.jwt_header() ) response_info = make_request(request_info) self.api_key = response_info.data["api_key"] + def refresh_access_token(self): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["refresh-session"], + ) + raise NotImplementedError("Waiting on https://github.com/Police-Data-Accessibility-Project/data-sources-app/issues/566") + + def make_request(self, ri: RequestInfo) -> ResponseInfo: + try: + response = request_methods[ri.type_]( + ri.url, + json=ri.json, + headers=ri.headers, + params=ri.params, + timeout=ri.timeout + ) + response.raise_for_status() + except requests.RequestException as e: + # TODO: Precise string matching here is brittle. Consider changing later. + if e.response.json().message == "Token is expired. 
Please request a new token.": + self.refresh_access_token() + return make_request(ri) + else: + raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") + return ResponseInfo( + status_code=HTTPStatus(response.status_code), + data=response.json() + ) + def login(self, email: str, password: str): url = build_url( namespace=Namespaces.AUTH, subdomains=["login"] ) request_info = RequestInfo( + type_=RequestType.POST, url=url, json={ "email": email, @@ -157,11 +154,15 @@ def match_agency( county: str, locality: str ) -> List[MatchAgencyInfo]: + """ + Returns agencies, if any, that match or partially match the search criteria + """ url = build_url( namespace=Namespaces.MATCH, subdomains=["agency"] ) request_info = RequestInfo( + type_=RequestType.POST, url=url, json={ "name": name, @@ -174,5 +175,28 @@ def match_agency( return [MatchAgencyInfo(**agency) for agency in response_info.data["agencies"]] - def check_for_unique_source_url(self, url: str): - pass \ No newline at end of file + def is_url_unique( + self, + url_to_check: str + ) -> UniqueURLResponseInfo: + """ + Check if a URL is unique. Returns duplicate info otherwise + """ + url = build_url( + namespace=Namespaces.CHECK, + subdomains=["unique-url"] + ) + request_info = RequestInfo( + type_=RequestType.GET, + url=url, + params={ + "url": url_to_check + } + ) + response_info = make_request(request_info) + duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]] + is_unique = (len(duplicates) == 0) + return UniqueURLResponseInfo( + is_unique=is_unique, + duplicates=duplicates + ) From e8d599eb87b6d9f33d8b4e7a443e62500831519f Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 18 Dec 2024 16:23:11 -0500 Subject: [PATCH 19/19] Refactor: Move AccessManager to separate file --- pdap_api_client/AccessManager.py | 123 ++++++++++++++++++++++++++ pdap_api_client/PDAPClient.py | 147 ++----------------------------- 2 files changed, 128 insertions(+), 142 deletions(-) create mode 100644 pdap_api_client/AccessManager.py diff --git a/pdap_api_client/AccessManager.py b/pdap_api_client/AccessManager.py new file mode 100644 index 00000000..87877466 --- /dev/null +++ b/pdap_api_client/AccessManager.py @@ -0,0 +1,123 @@ +from http import HTTPStatus +from typing import Optional + +import requests + +from pdap_api_client.DTOs import RequestType, Namespaces, RequestInfo, ResponseInfo + +API_URL = "https://data-sources-v2.pdap.dev/api" +request_methods = { + RequestType.POST: requests.post, + RequestType.PUT: requests.put, + RequestType.GET: requests.get, + RequestType.DELETE: requests.delete, +} + + +class CustomHTTPException(Exception): + pass + + +def build_url( + namespace: Namespaces, + subdomains: Optional[list[str]] = None +): + url = f"{API_URL}/{namespace.value}" + if subdomains is not None: + url = f"{url}/{'/'.join(subdomains)}" + return url + + +class AccessManager: + """ + Manages login, api key, access and refresh tokens + """ + def __init__(self, email: str, password: str, api_key: Optional[str] = None): + self.access_token = None + self.refresh_token = None + self.api_key = api_key + self.login(email=email, password=password) + + # TODO: Add means to refresh if token expired. 
+ + def load_api_key(self): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["api-key"] + ) + request_info = RequestInfo( + type_ = RequestType.POST, + url=url, + headers=self.jwt_header() + ) + response_info = self.make_request(request_info) + self.api_key = response_info.data["api_key"] + + def refresh_access_token(self): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["refresh-session"], + ) + raise NotImplementedError("Waiting on https://github.com/Police-Data-Accessibility-Project/data-sources-app/issues/566") + + def make_request(self, ri: RequestInfo) -> ResponseInfo: + try: + response = request_methods[ri.type_]( + ri.url, + json=ri.json, + headers=ri.headers, + params=ri.params, + timeout=ri.timeout + ) + response.raise_for_status() + except requests.RequestException as e: + # TODO: Precise string matching here is brittle. Consider changing later. + if e.response.json().message == "Token is expired. Please request a new token.": + self.refresh_access_token() + return self.make_request(ri) + else: + raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") + return ResponseInfo( + status_code=HTTPStatus(response.status_code), + data=response.json() + ) + + def login(self, email: str, password: str): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["login"] + ) + request_info = RequestInfo( + type_=RequestType.POST, + url=url, + json={ + "email": email, + "password": password + } + ) + response_info = self.make_request(request_info) + data = response_info.data + self.access_token = data["access_token"] + self.refresh_token = data["refresh_token"] + + + def jwt_header(self) -> dict: + """ + Retrieve JWT header + Returns: Dictionary of Bearer Authorization with JWT key + """ + return { + "Authorization": f"Bearer {self.access_token}" + } + + def api_key_header(self): + """ + Retrieve API key header + Returns: Dictionary of Basic Authorization with API key + + """ + if self.api_key is None: + self.load_api_key() + return { + "Authorization": f"Basic {self.api_key}" + } diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index bdac2e05..6c03ce0f 100644 --- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -1,145 +1,8 @@ -from http import HTTPStatus -from typing import Optional, List - -import requests +from typing import List +from pdap_api_client.AccessManager import build_url, AccessManager from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, UniqueURLResponseInfo, Namespaces, \ - RequestType, RequestInfo, ResponseInfo - -API_URL = "https://data-sources-v2.pdap.dev/api" - -request_methods = { - RequestType.POST: requests.post, - RequestType.PUT: requests.put, - RequestType.GET: requests.get, - RequestType.DELETE: requests.delete, -} - - -class CustomHTTPException(Exception): - pass - - -def make_request(ri: RequestInfo) -> ResponseInfo: - try: - response = request_methods[ri.type_]( - ri.url, - json=ri.json, - headers=ri.headers, - params=ri.params, - timeout=ri.timeout - ) - response.raise_for_status() - except requests.RequestException as e: - raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") - return ResponseInfo( - status_code=HTTPStatus(response.status_code), - data=response.json() - ) - - -def build_url( - namespace: Namespaces, - subdomains: Optional[list[str]] = None -): - url = f"{API_URL}/{namespace.value}" - if subdomains is not None: - url = f"{url}/{'/'.join(subdomains)}" - return url - -class AccessManager: - """ - 
Manages login, api key, access and refresh tokens - """ - def __init__(self, email: str, password: str, api_key: Optional[str]): - self.access_token = None - self.refresh_token = None - self.api_key = None - self.login(email=email, password=password) - - # TODO: Add means to refresh if token expired. - - def load_api_key(self): - url = build_url( - namespace=Namespaces.AUTH, - subdomains=["api-key"] - ) - request_info = RequestInfo( - type_ = RequestType.POST, - url=url, - headers=self.jwt_header() - ) - response_info = make_request(request_info) - self.api_key = response_info.data["api_key"] - - def refresh_access_token(self): - url = build_url( - namespace=Namespaces.AUTH, - subdomains=["refresh-session"], - ) - raise NotImplementedError("Waiting on https://github.com/Police-Data-Accessibility-Project/data-sources-app/issues/566") - - def make_request(self, ri: RequestInfo) -> ResponseInfo: - try: - response = request_methods[ri.type_]( - ri.url, - json=ri.json, - headers=ri.headers, - params=ri.params, - timeout=ri.timeout - ) - response.raise_for_status() - except requests.RequestException as e: - # TODO: Precise string matching here is brittle. Consider changing later. - if e.response.json().message == "Token is expired. Please request a new token.": - self.refresh_access_token() - return make_request(ri) - else: - raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") - return ResponseInfo( - status_code=HTTPStatus(response.status_code), - data=response.json() - ) - - def login(self, email: str, password: str): - url = build_url( - namespace=Namespaces.AUTH, - subdomains=["login"] - ) - request_info = RequestInfo( - type_=RequestType.POST, - url=url, - json={ - "email": email, - "password": password - } - ) - response_info = make_request(request_info) - data = response_info.data - self.access_token = data["access_token"] - self.refresh_token = data["refresh_token"] - - - def jwt_header(self) -> dict: - """ - Retrieve JWT header - Returns: Dictionary of Bearer Authorization with JWT key - """ - return { - "Authorization": f"Bearer {self.access_token}" - } - - def api_key_header(self): - """ - Retrieve API key header - Returns: Dictionary of Basic Authorization with API key - - """ - if self.api_key is None: - self.load_api_key() - return { - "Authorization": f"Basic {self.api_key}" - } + RequestType, RequestInfo class PDAPClient: @@ -171,7 +34,7 @@ def match_agency( "locality": locality } ) - response_info = make_request(request_info) + response_info = self.access_manager.make_request(request_info) return [MatchAgencyInfo(**agency) for agency in response_info.data["agencies"]] @@ -193,7 +56,7 @@ def is_url_unique( "url": url_to_check } ) - response_info = make_request(request_info) + response_info = self.access_manager.make_request(request_info) duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]] is_unique = (len(duplicates) == 0) return UniqueURLResponseInfo(
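
For reference, a minimal usage sketch of the client as drafted in this series. It assumes the classes behave as defined in the patches above, that the dev API at https://data-sources-v2.pdap.dev is reachable, and that valid credentials are available; the environment variable names and the example agency and URL values below are purely illustrative, not part of the patch series.

    # Illustrative sketch only; assumes the drafted AccessManager/PDAPClient API above.
    import os

    from pdap_api_client.AccessManager import AccessManager
    from pdap_api_client.PDAPClient import PDAPClient

    # Login happens in the constructor; PDAP_EMAIL/PDAP_PASSWORD are hypothetical env var names.
    access_manager = AccessManager(
        email=os.environ["PDAP_EMAIL"],
        password=os.environ["PDAP_PASSWORD"],
    )
    client = PDAPClient(access_manager=access_manager)

    # Look for agencies that match or partially match the search criteria.
    matches = client.match_agency(
        name="Example Police Department",
        state="PA",
        county="Allegheny",
        locality="Pittsburgh",
    )
    for match in matches:
        print(match.submitted_name, match.id)

    # Check whether a source URL is already known; duplicates are returned otherwise.
    result = client.is_url_unique("https://example.com/use-of-force-records")
    print(result.is_unique)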