From 93bdbe76d946045bb0c0f5c515b45fa6d724b1cf Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 10:24:14 -0500 Subject: [PATCH 01/19] Basic refactoring: * Create explicit "main" method * Extract logic to functions * Add detailed docstrings * Add some comments --- source_collectors/muckrock/muck_get.py | 94 +++++++++++++++----------- 1 file changed, 54 insertions(+), 40 deletions(-) diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index 20c29338..4c154f36 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -1,6 +1,6 @@ """ -muck_get.py - +A straightforward standalone script for downloading data from MuckRock +and searching for it with a specific search string. """ import requests @@ -9,53 +9,67 @@ # Define the base API endpoint base_url = "https://www.muckrock.com/api_v1/foia/" -# Define the search string -search_string = "use of force" -per_page = 100 -page = 1 -all_results = [] -max_count = 20 +def dump_list(all_results: list[dict], search_string: str) -> None: + """ + Dumps a list of dictionaries into a JSON file. + """ + json_out_file = search_string.replace(" ", "_") + ".json" + with open(json_out_file, "w") as json_file: + json.dump(all_results, json_file) -while True: + print(f"List dumped into {json_out_file}") - # Make the GET request with the search string as a query parameter - response = requests.get( - base_url, params={"page": page, "page_size": per_page, "format": "json"} - ) +def search_for_foia(search_string: str, per_page: int = 100, max_count: int = 20) -> list[dict]: + """ + Search for FOIA data based on a search string. + :param search_string: The search string to use. + :param per_page: The number of results to retrieve per page. + :param max_count: The maximum number of results to retrieve. Search stops once this number is reached or exceeded. + """ + page = 1 + all_results = [] - # Check if the request was successful - if response.status_code == 200: - # Parse the JSON response - data = response.json() + while True: - if not data["results"]: - break + # Make the GET request with the search string as a query parameter + response = requests.get( + base_url, params={"page": page, "page_size": per_page, "format": "json"} + ) - filtered_results = [ - item - for item in data["results"] - if search_string.lower() in item["title"].lower() - ] + # Check if the request was successful + if response.status_code == 200: + # Parse the JSON response + data = response.json() - all_results.extend(filtered_results) + if not data["results"]: + break - if len(filtered_results) > 0: - num_results = len(filtered_results) - print(f"found {num_results} more matching result(s)...") - if len(all_results) >= max_count: - print("max count reached... exiting") - break + # Filter results according to whether the search string is in the title + filtered_results = [ + item + for item in data["results"] + if search_string.lower() in item["title"].lower() + ] - page += 1 + all_results.extend(filtered_results) - else: - print(f"Error: {response.status_code}") - break + if len(filtered_results) > 0: + num_results = len(filtered_results) + print(f"found {num_results} more matching result(s)...") -# Dump list into a JSON file -json_out_file = search_string.replace(" ", "_") + ".json" -with open(json_out_file, "w") as json_file: - json.dump(all_results, json_file) + if len(all_results) >= max_count: + print(f"max count ({max_count}) reached... 
exiting") + break + + page += 1 + + else: + print(f"Error: {response.status_code}") + break + return all_results -print(f"List dumped into {json_out_file}") +if __name__ == "__main__": + search_string = "use of force" + all_results = search_for_foia(search_string) + dump_list(all_results, search_string) From 0fe043f542926973fea7e2cba75ef7b0ada274bb Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 10:36:54 -0500 Subject: [PATCH 02/19] Basic refactoring: * Create explicit "main" method * Extract logic to functions * Add detailed docstrings * Add some comments and TODOs --- .../muckrock/muckrock_ml_labeler.py | 115 +++++++++++------- 1 file changed, 71 insertions(+), 44 deletions(-) diff --git a/source_collectors/muckrock/muckrock_ml_labeler.py b/source_collectors/muckrock/muckrock_ml_labeler.py index b313c045..e3cb5cc7 100644 --- a/source_collectors/muckrock/muckrock_ml_labeler.py +++ b/source_collectors/muckrock/muckrock_ml_labeler.py @@ -1,6 +1,5 @@ """ -muckrock_ml_labeler.py - +Utilizes a fine-tuned model to label a dataset of URLs. """ from transformers import AutoTokenizer, AutoModelForSequenceClassification @@ -8,45 +7,73 @@ import pandas as pd import argparse -# Load the tokenizer and model -model_name = "PDAP/fine-url-classifier" -tokenizer = AutoTokenizer.from_pretrained(model_name) -model = AutoModelForSequenceClassification.from_pretrained(model_name) -model.eval() - -# Load the dataset from command line argument -parser = argparse.ArgumentParser(description="Load CSV file into a pandas DataFrame.") -parser.add_argument("--csv_file", type=str, required=True, help="Path to the CSV file") -args = parser.parse_args() -df = pd.read_csv(args.csv_file) - -# Combine multiple columns (e.g., 'url', 'html_title', 'h1') into a single text field for each row -columns_to_combine = [ - "url_path", - "html_title", - "h1", -] # Add other columns here as needed -df["combined_text"] = df[columns_to_combine].apply( - lambda row: " ".join(row.values.astype(str)), axis=1 -) - -# Convert the combined text into a list -texts = df["combined_text"].tolist() - -# Tokenize the inputs -inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") - -# Perform inference -with torch.no_grad(): - outputs = model(**inputs) - -# Get the predicted labels -predictions = torch.argmax(outputs.logits, dim=-1) - -# Map predictions to labels -labels = model.config.id2label -predicted_labels = [labels[int(pred)] for pred in predictions] - -# Add the predicted labels to the dataframe and save -df["predicted_label"] = predicted_labels -df.to_csv("labeled_muckrock_dataset.csv", index=False) + +def load_dataset_from_command_line() -> pd.DataFrame: + parser = argparse.ArgumentParser(description="Load CSV file into a pandas DataFrame.") + parser.add_argument("--csv_file", type=str, required=True, help="Path to the CSV file") + args = parser.parse_args() + return pd.read_csv(args.csv_file) + + +def create_combined_text_column(df: pd.DataFrame) -> None: + # Combine multiple columns (e.g., 'url', 'html_title', 'h1') into a single text field for each row + columns_to_combine = [ + "url_path", + "html_title", + "h1", + ] # Add other columns here as needed + df["combined_text"] = df[columns_to_combine].apply( + lambda row: " ".join(row.values.astype(str)), axis=1 + ) + + +def get_list_of_combined_texts(df: pd.DataFrame) -> list[str]: + # Convert the combined text into a list + return df["combined_text"].tolist() + + +def save_labeled_muckrock_dataset_to_csv(): + 
df.to_csv("labeled_muckrock_dataset.csv", index=False) + + +def create_predicted_labels_column(df: pd.DataFrame, predicted_labels: list[str]) -> None: + df["predicted_label"] = predicted_labels + + +def map_predictions_to_labels(model, predictions) -> list[str]: + labels = model.config.id2label + return [labels[int(pred)] for pred in predictions] + + +def get_predicted_labels(texts: list[str]) -> list[str]: + # Load the tokenizer and model + model_name = "PDAP/fine-url-classifier" + tokenizer = AutoTokenizer.from_pretrained(model_name) + + model = AutoModelForSequenceClassification.from_pretrained(model_name) + model.eval() + # Tokenize the inputs + inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") + # Perform inference + with torch.no_grad(): + outputs = model(**inputs) + # Get the predicted labels + predictions = torch.argmax(outputs.logits, dim=-1) + # Map predictions to labels + predicted_labels = map_predictions_to_labels(model=model, predictions=predictions) + + return predicted_labels + + +if __name__ == "__main__": + df = load_dataset_from_command_line() + # TODO: Check for existence of required columns prior to further processing + create_combined_text_column(df=df) + + texts = get_list_of_combined_texts(df=df) + + predicted_labels = get_predicted_labels(texts=texts) + # Add the predicted labels to the dataframe and save + create_predicted_labels_column(df=df, predicted_labels=predicted_labels) + + save_labeled_muckrock_dataset_to_csv() \ No newline at end of file From 4f5cc516e29307353f1b250a6aab00d3ce0b7603 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 10:48:21 -0500 Subject: [PATCH 03/19] Basic refactoring: * Add detailed docstrings * Add some comments and TODOs --- source_collectors/muckrock/get_allegheny_foias.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index a559f67f..bf62ba33 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -1,5 +1,6 @@ """ -get_allegheny_foias.py +Get Allegheny County FOIA requests +and save them to a JSON file """ import requests @@ -47,9 +48,12 @@ def fetch_foia_data(jurisdiction_ids): """ all_data = [] for name, id_ in jurisdiction_ids.items(): + # TODO: The muckrock api should be centralized in a `constants.py` folder + # and the url should be constructed in a function or class url = f"https://www.muckrock.com/api_v1/foia/?status=done&jurisdiction={id_}" while url: response = requests.get(url) + # TODO: If logic similar to `fetch_jurisdiction_ids` and should be generalized if response.status_code == 200: data = response.json() all_data.extend(data.get("results", [])) @@ -66,6 +70,7 @@ def fetch_foia_data(jurisdiction_ids): break # Save the combined data to a JSON file + # TODO: Generalize this logic with similar logic in `muck_get.py` to function with open("foia_data_combined.json", "w") as json_file: json.dump(all_data, json_file, indent=4) From 9294fb05e6a2b9764f8abb318ffd066f420cd884 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 10:57:54 -0500 Subject: [PATCH 04/19] Basic refactoring: * Create explicit main function and `__main__` section * Add detailed docstrings * Add some comments and TODOs --- .../generate_detailed_muckrock_csv.py | 221 ++++++++++-------- 1 file changed, 118 insertions(+), 103 deletions(-) diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py 
b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index a077dbc7..2fac3bcd 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -1,3 +1,9 @@ +""" +Converts JSON file of MuckRock FOIA requests to CSV for further processing +""" + +# TODO: Look into linking up this logic with other components in pipeline. + import json import argparse import csv @@ -5,17 +11,6 @@ import time from utils import format_filename_json_to_csv -# Load the JSON data -parser = argparse.ArgumentParser(description="Parse JSON from a file.") -parser.add_argument( - "--json_file", type=str, required=True, help="Path to the JSON file" -) - -args = parser.parse_args() - -with open(args.json_file, "r") as f: - json_data = json.load(f) - # Define the CSV headers headers = [ "name", @@ -54,7 +49,7 @@ def get_agency(agency_id): """ - Function to get agency_described + Get agency data from the MuckRock API via agency ID """ if agency_id: agency_url = f"https://www.muckrock.com/api_v1/agency/{agency_id}/" @@ -71,7 +66,7 @@ def get_agency(agency_id): def get_jurisdiction(jurisdiction_id): """ - Function to get jurisdiction_described + Get jurisdiction data from the MuckRock API via jurisdiction ID """ if jurisdiction_id: jurisdiction_url = ( @@ -87,96 +82,116 @@ def get_jurisdiction(jurisdiction_id): else: print("Jurisdiction ID not found in item") +def main(): + # Load the JSON data + parser = argparse.ArgumentParser(description="Parse JSON from a file.") + parser.add_argument( + "--json_file", type=str, required=True, help="Path to the JSON file" + ) + + args = parser.parse_args() + + # TODO: Generalize logic + with open(args.json_file, "r") as f: + json_data = json.load(f) + + output_csv = format_filename_json_to_csv(args.json_file) + # Open a CSV file for writing + + # TODO: CSV writing and composition logic is tightly coupled -- separate + with open(output_csv, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=headers) -output_csv = format_filename_json_to_csv(args.json_file) -# Open a CSV file for writing -with open(output_csv, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=headers) - - # Write the header row - writer.writeheader() - - # Iterate through the JSON data - for item in json_data: - print(f"Writing data for {item.get('title')}") - agency_data = get_agency(item.get("agency")) - time.sleep(1) - jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) - - jurisdiction_level = jurisdiction_data.get("level") - # federal jurisduction level - if jurisdiction_level == "f": - state = "" - county = "" - municipality = "" - juris_type = "federal" - # state jurisdiction level - if jurisdiction_level == "s": - state = jurisdiction_data.get("name") - county = "" - municipality = "" - juris_type = "state" - # local jurisdiction level - if jurisdiction_level == "l": - parent_juris_data = get_jurisdiction(jurisdiction_data.get("parent")) - state = parent_juris_data.get("abbrev") - if "County" in jurisdiction_data.get("name"): - county = jurisdiction_data.get("name") + # Write the header row + writer.writeheader() + + # Iterate through the JSON data + for item in json_data: + print(f"Writing data for {item.get('title')}") + agency_data = get_agency(item.get("agency")) + time.sleep(1) + jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) + + jurisdiction_level = jurisdiction_data.get("level") + # federal jurisduction level + if jurisdiction_level 
== "f": + state = "" + county = "" municipality = "" - juris_type = "county" - else: + juris_type = "federal" + # state jurisdiction level + if jurisdiction_level == "s": + state = jurisdiction_data.get("name") county = "" - municipality = jurisdiction_data.get("name") - juris_type = "local" - - if "Police" in agency_data.get("types"): - agency_type = "law enforcement/police" - else: - agency_type = "" - - source_url = "" - absolute_url = item.get("absolute_url") - access_type = "" - for comm in item["communications"]: - if comm["files"]: - source_url = absolute_url + "#files" - access_type = "Web page,Download,API" - break - - # Extract the relevant fields from the JSON object - csv_row = { - "name": item.get("title", ""), - "agency_described": agency_data.get("name", "") + " - " + state, - "record_type": "", - "description": "", - "source_url": source_url, - "readme_url": absolute_url, - "scraper_url": "", - "state": state, - "county": county, - "municipality": municipality, - "agency_type": agency_type, - "jurisdiction_type": juris_type, - "View Archive": "", - "agency_aggregation": "", - "agency_supplied": "no", - "supplying_entity": "MuckRock", - "agency_originated": "yes", - "originating_agency": agency_data.get("name", ""), - "coverage_start": "", - "source_last_updated": "", - "coverage_end": "", - "number_of_records_available": "", - "size": "", - "access_type": access_type, - "data_portal_type": "MuckRock", - "access_notes": "", - "record_format": "", - "update_frequency": "", - "update_method": "", - "retention_schedule": "", - "detail_level": "", - } - - # Write the extracted row to the CSV file - writer.writerow(csv_row) + municipality = "" + juris_type = "state" + # local jurisdiction level + if jurisdiction_level == "l": + parent_juris_data = get_jurisdiction(jurisdiction_data.get("parent")) + state = parent_juris_data.get("abbrev") + if "County" in jurisdiction_data.get("name"): + county = jurisdiction_data.get("name") + municipality = "" + juris_type = "county" + else: + county = "" + municipality = jurisdiction_data.get("name") + juris_type = "local" + + if "Police" in agency_data.get("types"): + agency_type = "law enforcement/police" + else: + agency_type = "" + + source_url = "" + absolute_url = item.get("absolute_url") + access_type = "" + for comm in item["communications"]: + if comm["files"]: + source_url = absolute_url + "#files" + access_type = "Web page,Download,API" + break + + # Extract the relevant fields from the JSON object + # TODO: I question the utility of creating columns that are then left blank until later + # and possibly in a different file entirely. 
+ csv_row = { + "name": item.get("title", ""), + "agency_described": agency_data.get("name", "") + " - " + state, + "record_type": "", + "description": "", + "source_url": source_url, + "readme_url": absolute_url, + "scraper_url": "", + "state": state, + "county": county, + "municipality": municipality, + "agency_type": agency_type, + "jurisdiction_type": juris_type, + "View Archive": "", + "agency_aggregation": "", + "agency_supplied": "no", + "supplying_entity": "MuckRock", + "agency_originated": "yes", + "originating_agency": agency_data.get("name", ""), + "coverage_start": "", + "source_last_updated": "", + "coverage_end": "", + "number_of_records_available": "", + "size": "", + "access_type": access_type, + "data_portal_type": "MuckRock", + "access_notes": "", + "record_format": "", + "update_frequency": "", + "update_method": "", + "retention_schedule": "", + "detail_level": "", + } + + # Write the extracted row to the CSV file + writer.writerow(csv_row) + + +if __name__ == "__main__": + main() \ No newline at end of file From dd3ee80b47c7ec17a79b0d855d35547514b9b5c9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 11:14:52 -0500 Subject: [PATCH 05/19] Refactor: Add FOIAFetcher * Extract logic from `muck_get.py` and `download_muckrock_foia.py` * Create constants for base muckrock api url and foia extension of base url --- source_collectors/muckrock/FOIAFetcher.py | 34 +++++++++++++ source_collectors/muckrock/constants.py | 3 ++ .../muckrock/download_muckrock_foia.py | 32 +++--------- source_collectors/muckrock/muck_get.py | 49 ++++++++----------- 4 files changed, 66 insertions(+), 52 deletions(-) create mode 100644 source_collectors/muckrock/FOIAFetcher.py create mode 100644 source_collectors/muckrock/constants.py diff --git a/source_collectors/muckrock/FOIAFetcher.py b/source_collectors/muckrock/FOIAFetcher.py new file mode 100644 index 00000000..566df2cf --- /dev/null +++ b/source_collectors/muckrock/FOIAFetcher.py @@ -0,0 +1,34 @@ +import requests + +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL + +FOIA_BASE_URL = f"{BASE_MUCKROCK_URL}/foia" + +class FOIAFetcher: + + def __init__(self, start_page: int = 1, per_page: int = 100): + """ + Constructor for the FOIAFetcher class. + + Args: + start_page (int): The page number to start fetching from (default is 1). + per_page (int): The number of results to fetch per page (default is 100). + """ + self.current_page = start_page + self.per_page = per_page + + def fetch_next_page(self) -> dict | None: + """ + Fetches data from a specific page of the MuckRock FOIA API. 
+ """ + page = self.current_page + self.current_page += 1 + response = requests.get( + FOIA_BASE_URL, params={"page": page, "page_size": self.per_page, "format": "json"} + ) + if response.status_code == 200: + return response.json() + # TODO: Look into raising error instead of returning None + print(f"Error fetching page {page}: {response.status_code}") + return None + diff --git a/source_collectors/muckrock/constants.py b/source_collectors/muckrock/constants.py new file mode 100644 index 00000000..7109847f --- /dev/null +++ b/source_collectors/muckrock/constants.py @@ -0,0 +1,3 @@ + + +BASE_MUCKROCK_URL = "https://www.muckrock.com/api_v1" \ No newline at end of file diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py index 0abd527d..1e73c65a 100644 --- a/source_collectors/muckrock/download_muckrock_foia.py +++ b/source_collectors/muckrock/download_muckrock_foia.py @@ -7,49 +7,33 @@ """ + +# TODO: Logic redundant with `muck_get.py`. Generalize + import requests import csv import time import json -# Define the base API endpoint -base_url = "https://www.muckrock.com/api_v1/foia/" +from source_collectors.muckrock.FOIAFetcher import FOIAFetcher # Set initial parameters -page = 1 -per_page = 100 all_data = [] output_file = "foia_data.json" - -def fetch_page(page): - """ - Fetches data from a specific page of the MuckRock FOIA API. - """ - response = requests.get( - base_url, params={"page": page, "page_size": per_page, "format": "json"} - ) - if response.status_code == 200: - return response.json() - else: - print(f"Error fetching page {page}: {response.status_code}") - return None - - # Fetch and store data from all pages +fetcher = FOIAFetcher() while True: - print(f"Fetching page {page}...") - data = fetch_page(page) + print(f"Fetching page {fetcher.current_page}...") + data = fetcher.fetch_next_page() if data is None: - print(f"Skipping page {page}...") - page += 1 + print(f"Skipping page {fetcher.current_page}...") continue all_data.extend(data["results"]) if not data["next"]: break - page += 1 # Write data to CSV with open(output_file, mode="w", encoding="utf-8") as json_file: diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index 4c154f36..f9fc218b 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -6,6 +6,8 @@ import requests import json +from source_collectors.muckrock.FOIAFetcher import FOIAFetcher + # Define the base API endpoint base_url = "https://www.muckrock.com/api_v1/foia/" @@ -26,47 +28,38 @@ def search_for_foia(search_string: str, per_page: int = 100, max_count: int = 20 :param per_page: The number of results to retrieve per page. :param max_count: The maximum number of results to retrieve. Search stops once this number is reached or exceeded. 
""" - page = 1 + fetcher = FOIAFetcher(per_page=per_page) all_results = [] while True: - # Make the GET request with the search string as a query parameter - response = requests.get( - base_url, params={"page": page, "page_size": per_page, "format": "json"} - ) + data = fetcher.fetch_next_page() - # Check if the request was successful - if response.status_code == 200: - # Parse the JSON response - data = response.json() + if data is None: + break - if not data["results"]: - break + if not data["results"]: + break - # Filter results according to whether the search string is in the title - filtered_results = [ - item - for item in data["results"] - if search_string.lower() in item["title"].lower() - ] + # Filter results according to whether the search string is in the title + filtered_results = [ + item + for item in data["results"] + if search_string.lower() in item["title"].lower() + ] - all_results.extend(filtered_results) + all_results.extend(filtered_results) - if len(filtered_results) > 0: - num_results = len(filtered_results) - print(f"found {num_results} more matching result(s)...") + num_results = len(filtered_results) + if num_results > 0: + print(f"found {num_results} more matching result(s)...") - if len(all_results) >= max_count: - print(f"max count ({max_count}) reached... exiting") - break + if len(all_results) >= max_count: + print(f"max count ({max_count}) reached... exiting") + break - page += 1 - else: - print(f"Error: {response.status_code}") - break return all_results if __name__ == "__main__": From 435b090ae7f93565726638160a4043ea859de2fb Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 11:33:48 -0500 Subject: [PATCH 06/19] Refactor: Add utility functions * Extract logic for loading from and saving to json files to separate functions * Add TODOs --- source_collectors/muckrock/create_foia_data_db.py | 1 + source_collectors/muckrock/download_muckrock_foia.py | 4 ++-- .../muckrock/generate_detailed_muckrock_csv.py | 6 ++---- source_collectors/muckrock/get_allegheny_foias.py | 7 +++---- source_collectors/muckrock/muck_get.py | 5 ++--- source_collectors/muckrock/search_local_foia_json.py | 8 ++++---- source_collectors/muckrock/utils.py | 10 ++++++++++ 7 files changed, 24 insertions(+), 17 deletions(-) diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py index 4adc5556..85c7fd4b 100644 --- a/source_collectors/muckrock/create_foia_data_db.py +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -112,6 +112,7 @@ def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: - None: If there is an error other than 404. 
""" + # TODO: Refactor to use FOIA Fetcher per_page = 100 response = requests.get( base_url, params={"page": page, "page_size": per_page, "format": "json"} diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py index 1e73c65a..4018053e 100644 --- a/source_collectors/muckrock/download_muckrock_foia.py +++ b/source_collectors/muckrock/download_muckrock_foia.py @@ -16,6 +16,7 @@ import json from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.utils import save_json_file # Set initial parameters all_data = [] @@ -36,7 +37,6 @@ # Write data to CSV -with open(output_file, mode="w", encoding="utf-8") as json_file: - json.dump(all_data, json_file, indent=4) +save_json_file(file_path=output_file, data=all_data) print(f"Data written to {output_file}") diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index 2fac3bcd..d17b7415 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -4,12 +4,11 @@ # TODO: Look into linking up this logic with other components in pipeline. -import json import argparse import csv import requests import time -from utils import format_filename_json_to_csv +from utils import format_filename_json_to_csv, load_json_file # Define the CSV headers headers = [ @@ -92,8 +91,7 @@ def main(): args = parser.parse_args() # TODO: Generalize logic - with open(args.json_file, "r") as f: - json_data = json.load(f) + json_data = load_json_file(args.json_file) output_csv = format_filename_json_to_csv(args.json_file) # Open a CSV file for writing diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index bf62ba33..3aac1d6f 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -7,6 +7,8 @@ import json import time +from source_collectors.muckrock.utils import save_json_file + def fetch_jurisdiction_ids(town_file, base_url): """ @@ -70,10 +72,7 @@ def fetch_foia_data(jurisdiction_ids): break # Save the combined data to a JSON file - # TODO: Generalize this logic with similar logic in `muck_get.py` to function - with open("foia_data_combined.json", "w") as json_file: - json.dump(all_data, json_file, indent=4) - + save_json_file(file_path="foia_data_combined.json", data=all_data) print(f"Saved {len(all_data)} records to foia_data_combined.json") diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index f9fc218b..cbd6e407 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -7,6 +7,7 @@ import json from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.utils import save_json_file # Define the base API endpoint base_url = "https://www.muckrock.com/api_v1/foia/" @@ -16,9 +17,7 @@ def dump_list(all_results: list[dict], search_string: str) -> None: Dumps a list of dictionaries into a JSON file. 
""" json_out_file = search_string.replace(" ", "_") + ".json" - with open(json_out_file, "w") as json_file: - json.dump(all_results, json_file) - + save_json_file(file_path=json_out_file, data=all_results) print(f"List dumped into {json_out_file}") def search_for_foia(search_string: str, per_page: int = 100, max_count: int = 20) -> list[dict]: diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py index 562c4bae..3010f42d 100644 --- a/source_collectors/muckrock/search_local_foia_json.py +++ b/source_collectors/muckrock/search_local_foia_json.py @@ -7,13 +7,14 @@ import json +from source_collectors.muckrock.utils import load_json_file, save_json_file + # Specify the JSON file path json_file = "foia_data.json" search_string = "use of force" # Load the JSON data -with open(json_file, "r", encoding="utf-8") as file: - data = json.load(file) +data = load_json_file(json_file) # List to store matching entries matching_entries = [] @@ -47,7 +48,6 @@ def search_entry(entry): ) # Optionally, write matching entries to a new JSON file -with open("matching_entries.json", "w", encoding="utf-8") as file: - json.dump(matching_entries, file, indent=4) +save_json_file(file_path="matching_entries.json", data=matching_entries) print("Matching entries written to 'matching_entries.json'") diff --git a/source_collectors/muckrock/utils.py b/source_collectors/muckrock/utils.py index 3d8b63db..3c7eba28 100644 --- a/source_collectors/muckrock/utils.py +++ b/source_collectors/muckrock/utils.py @@ -8,6 +8,7 @@ """ import re +import json def format_filename_json_to_csv(json_filename: str) -> str: @@ -24,3 +25,12 @@ def format_filename_json_to_csv(json_filename: str) -> str: csv_filename = re.sub(r"_(?=[^.]*$)", "-", json_filename[:-5]) + ".csv" return csv_filename + +def load_json_file(file_path: str) -> dict: + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + return data + +def save_json_file(file_path: str, data: dict | list[dict]): + with open(file_path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=4) \ No newline at end of file From 7dd7d0ccea3d619e55b431a9d9ccac47df126e6a Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 18:37:46 -0500 Subject: [PATCH 07/19] Refactor: Create FOIASearcher * Extract `muck_get.py` logic to FOIA searcher * Remove deprecated `download_muckrock_foia.py` --- source_collectors/muckrock/FOIASearcher.py | 58 ++++++++++++++++ source_collectors/muckrock/README.md | 2 +- .../muckrock/download_muckrock_foia.py | 42 ------------ source_collectors/muckrock/muck_get.py | 67 +++---------------- 4 files changed, 67 insertions(+), 102 deletions(-) create mode 100644 source_collectors/muckrock/FOIASearcher.py delete mode 100644 source_collectors/muckrock/download_muckrock_foia.py diff --git a/source_collectors/muckrock/FOIASearcher.py b/source_collectors/muckrock/FOIASearcher.py new file mode 100644 index 00000000..d42a6439 --- /dev/null +++ b/source_collectors/muckrock/FOIASearcher.py @@ -0,0 +1,58 @@ +from typing import Optional + +from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from tqdm import tqdm + +class FOIASearcher: + """ + Used for searching FOIA data from MuckRock + """ + + def __init__(self, fetcher: FOIAFetcher, search_term: Optional[str] = None): + self.fetcher = fetcher + self.search_term = search_term + + def fetch_page(self) -> dict | None: + """ + Fetches the next page of results using the fetcher. 
+ """ + data = self.fetcher.fetch_next_page() + if data is None or data.get("results") is None: + return None + return data + + def filter_results(self, results: list[dict]) -> list[dict]: + """ + Filters the results based on the search term. + Override or modify as needed for custom filtering logic. + """ + if self.search_term: + return [result for result in results if self.search_term.lower() in result["title"].lower()] + return results + + def update_progress(self, pbar: tqdm, results: list[dict]) -> int: + """ + Updates the progress bar and returns the count of results processed. + """ + num_results = len(results) + pbar.update(num_results) + return num_results + + def search_to_count(self, max_count: int) -> list[dict]: + """ + Fetches and processes results up to a maximum count. + """ + count = max_count + all_results = [] + with tqdm(total=max_count, desc="Fetching results", unit="result") as pbar: + while count > 0: + data = self.fetch_page() + if not data: + break + + results = self.filter_results(data["results"]) + all_results.extend(results) + count -= self.update_progress(pbar, results) + + return all_results + diff --git a/source_collectors/muckrock/README.md b/source_collectors/muckrock/README.md index d74b77f0..d24b0cef 100644 --- a/source_collectors/muckrock/README.md +++ b/source_collectors/muckrock/README.md @@ -56,7 +56,7 @@ pip install -r requirements.txt ### 2. Clone Muckrock database & search locally -~~- `download_muckrock_foia.py` `search_local_foia_json.py`~~ (deprecated) +~~- `search_local_foia_json.py`~~ (deprecated) - scripts to clone the MuckRock foia requests collection for fast local querying (total size <2GB at present) diff --git a/source_collectors/muckrock/download_muckrock_foia.py b/source_collectors/muckrock/download_muckrock_foia.py deleted file mode 100644 index 4018053e..00000000 --- a/source_collectors/muckrock/download_muckrock_foia.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -***DEPRECATED*** - -download_muckrock_foia.py - -This script fetches data from the MuckRock FOIA API and stores the results in a JSON file. - -""" - - -# TODO: Logic redundant with `muck_get.py`. Generalize - -import requests -import csv -import time -import json - -from source_collectors.muckrock.FOIAFetcher import FOIAFetcher -from source_collectors.muckrock.utils import save_json_file - -# Set initial parameters -all_data = [] -output_file = "foia_data.json" - -# Fetch and store data from all pages -fetcher = FOIAFetcher() -while True: - print(f"Fetching page {fetcher.current_page}...") - data = fetcher.fetch_next_page() - if data is None: - print(f"Skipping page {fetcher.current_page}...") - continue - - all_data.extend(data["results"]) - if not data["next"]: - break - - -# Write data to CSV -save_json_file(file_path=output_file, data=all_data) - -print(f"Data written to {output_file}") diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index cbd6e407..1401cd93 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -2,66 +2,15 @@ A straightforward standalone script for downloading data from MuckRock and searching for it with a specific search string. 
""" - -import requests -import json - from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.FOIASearcher import FOIASearcher from source_collectors.muckrock.utils import save_json_file -# Define the base API endpoint -base_url = "https://www.muckrock.com/api_v1/foia/" - -def dump_list(all_results: list[dict], search_string: str) -> None: - """ - Dumps a list of dictionaries into a JSON file. - """ - json_out_file = search_string.replace(" ", "_") + ".json" - save_json_file(file_path=json_out_file, data=all_results) - print(f"List dumped into {json_out_file}") - -def search_for_foia(search_string: str, per_page: int = 100, max_count: int = 20) -> list[dict]: - """ - Search for FOIA data based on a search string. - :param search_string: The search string to use. - :param per_page: The number of results to retrieve per page. - :param max_count: The maximum number of results to retrieve. Search stops once this number is reached or exceeded. - """ - fetcher = FOIAFetcher(per_page=per_page) - all_results = [] - - while True: - - data = fetcher.fetch_next_page() - - if data is None: - break - - if not data["results"]: - break - - - # Filter results according to whether the search string is in the title - filtered_results = [ - item - for item in data["results"] - if search_string.lower() in item["title"].lower() - ] - - all_results.extend(filtered_results) - - num_results = len(filtered_results) - if num_results > 0: - print(f"found {num_results} more matching result(s)...") - - if len(all_results) >= max_count: - print(f"max count ({max_count}) reached... exiting") - break - - - return all_results - if __name__ == "__main__": - search_string = "use of force" - all_results = search_for_foia(search_string) - dump_list(all_results, search_string) + search_term = "use of force" + fetcher = FOIAFetcher() + searcher = FOIASearcher(fetcher=fetcher, search_term=search_term) + results = searcher.search_to_count(20) + json_out_file = search_term.replace(" ", "_") + ".json" + save_json_file(file_path=json_out_file, data=results) + print(f"List dumped into {json_out_file}") From dd3f0a290a7bb7b27acb1545f808d6808b62ccd1 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 18:46:19 -0500 Subject: [PATCH 08/19] Remove `search_local_foia_json.py` --- source_collectors/muckrock/README.md | 2 - .../muckrock/search_local_foia_json.py | 53 ------------------- 2 files changed, 55 deletions(-) delete mode 100644 source_collectors/muckrock/search_local_foia_json.py diff --git a/source_collectors/muckrock/README.md b/source_collectors/muckrock/README.md index d24b0cef..43bae80d 100644 --- a/source_collectors/muckrock/README.md +++ b/source_collectors/muckrock/README.md @@ -56,8 +56,6 @@ pip install -r requirements.txt ### 2. Clone Muckrock database & search locally -~~- `search_local_foia_json.py`~~ (deprecated) - - scripts to clone the MuckRock foia requests collection for fast local querying (total size <2GB at present) - `create_foia_data_db.py` creates and populates a SQLite database (`foia_data.db`) with all MuckRock foia requests. Various errors outside the scope of this script may occur; a counter (`last_page_fetched.txt`) is created to keep track of the most recent page fetched and inserted into the database. If the program exits prematurely, simply run `create_foia_data_db.py` again to continue where you left off. A log file is created to capture errors for later reference. 
diff --git a/source_collectors/muckrock/search_local_foia_json.py b/source_collectors/muckrock/search_local_foia_json.py deleted file mode 100644 index 3010f42d..00000000 --- a/source_collectors/muckrock/search_local_foia_json.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -***DEPRECATED*** - -search_local_foia_json.py - -""" - -import json - -from source_collectors.muckrock.utils import load_json_file, save_json_file - -# Specify the JSON file path -json_file = "foia_data.json" -search_string = "use of force" - -# Load the JSON data -data = load_json_file(json_file) - -# List to store matching entries -matching_entries = [] - - -def search_entry(entry): - """ - search within an entry - """ - # Check if 'status' is 'done' - if entry.get("status") != "done": - return False - - # Check if 'title' or 'tags' field contains the search string - title_match = "title" in entry and search_string.lower() in entry["title"].lower() - tags_match = "tags" in entry and any( - search_string.lower() in tag.lower() for tag in entry["tags"] - ) - - return title_match or tags_match - - -# Iterate through the data and collect matching entries -for entry in data: - if search_entry(entry): - matching_entries.append(entry) - -# Output the results -print( - f"Found {len(matching_entries)} entries containing '{search_string}' in the title or tags." -) - -# Optionally, write matching entries to a new JSON file -save_json_file(file_path="matching_entries.json", data=matching_entries) - -print("Matching entries written to 'matching_entries.json'") From 01d5f6bb343551c1edc8a7d613bdae700b17e237 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 19:16:01 -0500 Subject: [PATCH 09/19] Refactor: Create MuckrockFetchers * Create MuckrockFetcher base class * Implement in FOIAFetcher * Create JurisdictionFetcher and AgencyFetcher * Replace relevant logic in `generate_detailed_muckrock_csv.py` --- source_collectors/__init__.py | 0 source_collectors/muckrock/FOIASearcher.py | 2 +- source_collectors/muckrock/__init__.py | 0 .../generate_detailed_muckrock_csv.py | 56 ++++++------------- source_collectors/muckrock/muck_get.py | 2 +- .../muckrock_fetchers/AgencyFetcher.py | 14 +++++ .../{ => muckrock_fetchers}/FOIAFetcher.py | 24 ++++---- .../muckrock_fetchers/JurisdictionFetcher.py | 14 +++++ .../muckrock_fetchers/MuckrockFetcher.py | 28 ++++++++++ .../muckrock/muckrock_fetchers/__init__.py | 0 10 files changed, 88 insertions(+), 52 deletions(-) create mode 100644 source_collectors/__init__.py create mode 100644 source_collectors/muckrock/__init__.py create mode 100644 source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py rename source_collectors/muckrock/{ => muckrock_fetchers}/FOIAFetcher.py (60%) create mode 100644 source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py create mode 100644 source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py create mode 100644 source_collectors/muckrock/muckrock_fetchers/__init__.py diff --git a/source_collectors/__init__.py b/source_collectors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/source_collectors/muckrock/FOIASearcher.py b/source_collectors/muckrock/FOIASearcher.py index d42a6439..9d6116b7 100644 --- a/source_collectors/muckrock/FOIASearcher.py +++ b/source_collectors/muckrock/FOIASearcher.py @@ -1,6 +1,6 @@ from typing import Optional -from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher from tqdm import tqdm class FOIASearcher: diff 
--git a/source_collectors/muckrock/__init__.py b/source_collectors/muckrock/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index d17b7415..207d2118 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -6,8 +6,11 @@ import argparse import csv -import requests import time + +from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher, AgencyFetchRequest +from source_collectors.muckrock.muckrock_fetchers.JurisdictionFetcher import JurisdictionFetcher, \ + JurisdictionFetchRequest from utils import format_filename_json_to_csv, load_json_file # Define the CSV headers @@ -46,41 +49,6 @@ ] -def get_agency(agency_id): - """ - Get agency data from the MuckRock API via agency ID - """ - if agency_id: - agency_url = f"https://www.muckrock.com/api_v1/agency/{agency_id}/" - response = requests.get(agency_url) - - if response.status_code == 200: - agency_data = response.json() - return agency_data - else: - return "" - else: - print("Agency ID not found in item") - - -def get_jurisdiction(jurisdiction_id): - """ - Get jurisdiction data from the MuckRock API via jurisdiction ID - """ - if jurisdiction_id: - jurisdiction_url = ( - f"https://www.muckrock.com/api_v1/jurisdiction/{jurisdiction_id}/" - ) - response = requests.get(jurisdiction_url) - - if response.status_code == 200: - jurisdiction_data = response.json() - return jurisdiction_data - else: - return "" - else: - print("Jurisdiction ID not found in item") - def main(): # Load the JSON data parser = argparse.ArgumentParser(description="Parse JSON from a file.") @@ -103,12 +71,19 @@ def main(): # Write the header row writer.writeheader() + a_fetcher = AgencyFetcher() + j_fetcher = JurisdictionFetcher() + # Iterate through the JSON data for item in json_data: print(f"Writing data for {item.get('title')}") - agency_data = get_agency(item.get("agency")) + agency_data = a_fetcher.get_agency(agency_id=item.get("agency")) + # agency_data = get_agency(item.get("agency")) time.sleep(1) - jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) + jurisdiction_data = j_fetcher.get_jurisdiction( + jurisdiction_id=agency_data.get("jurisdiction") + ) + # jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) jurisdiction_level = jurisdiction_data.get("level") # federal jurisduction level @@ -125,7 +100,10 @@ def main(): juris_type = "state" # local jurisdiction level if jurisdiction_level == "l": - parent_juris_data = get_jurisdiction(jurisdiction_data.get("parent")) + parent_juris_data = j_fetcher.get_jurisdiction( + jurisdiction_id=jurisdiction_data.get("parent") + ) + state = parent_juris_data.get("abbrev") if "County" in jurisdiction_data.get("name"): county = jurisdiction_data.get("name") diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index 1401cd93..b1a51022 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -2,7 +2,7 @@ A straightforward standalone script for downloading data from MuckRock and searching for it with a specific search string. 
""" -from source_collectors.muckrock.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher from source_collectors.muckrock.FOIASearcher import FOIASearcher from source_collectors.muckrock.utils import save_json_file diff --git a/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py b/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py new file mode 100644 index 00000000..2e36ce31 --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py @@ -0,0 +1,14 @@ +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher + + +class AgencyFetchRequest(FetchRequest): + agency_id: int + +class AgencyFetcher(MuckrockFetcher): + + def build_url(self, request: AgencyFetchRequest) -> str: + return f"{BASE_MUCKROCK_URL}/agency/{request.agency_id}/" + + def get_agency(self, agency_id: int): + return self.fetch(AgencyFetchRequest(agency_id=agency_id)) \ No newline at end of file diff --git a/source_collectors/muckrock/FOIAFetcher.py b/source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py similarity index 60% rename from source_collectors/muckrock/FOIAFetcher.py rename to source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py index 566df2cf..5b780a99 100644 --- a/source_collectors/muckrock/FOIAFetcher.py +++ b/source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py @@ -1,10 +1,15 @@ -import requests - +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import MuckrockFetcher, FetchRequest from source_collectors.muckrock.constants import BASE_MUCKROCK_URL FOIA_BASE_URL = f"{BASE_MUCKROCK_URL}/foia" -class FOIAFetcher: + +class FOIAFetchRequest(FetchRequest): + page: int + page_size: int + + +class FOIAFetcher(MuckrockFetcher): def __init__(self, start_page: int = 1, per_page: int = 100): """ @@ -17,18 +22,15 @@ def __init__(self, start_page: int = 1, per_page: int = 100): self.current_page = start_page self.per_page = per_page + def build_url(self, request: FOIAFetchRequest) -> str: + return f"{FOIA_BASE_URL}?page={request.page}&page_size={request.page_size}&format=json" + def fetch_next_page(self) -> dict | None: """ Fetches data from a specific page of the MuckRock FOIA API. 
""" page = self.current_page self.current_page += 1 - response = requests.get( - FOIA_BASE_URL, params={"page": page, "page_size": self.per_page, "format": "json"} - ) - if response.status_code == 200: - return response.json() - # TODO: Look into raising error instead of returning None - print(f"Error fetching page {page}: {response.status_code}") - return None + request = FOIAFetchRequest(page=page, page_size=self.per_page) + return self.fetch(request) diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py b/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py new file mode 100644 index 00000000..b52ce735 --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py @@ -0,0 +1,14 @@ +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher + + +class JurisdictionFetchRequest(FetchRequest): + jurisdiction_id: int + +class JurisdictionFetcher(MuckrockFetcher): + + def build_url(self, request: JurisdictionFetchRequest) -> str: + return f"{BASE_MUCKROCK_URL}/jurisdiction/{request.jurisdiction_id}/" + + def get_jurisdiction(self, jurisdiction_id: int) -> dict: + return self.fetch(request=JurisdictionFetchRequest(jurisdiction_id=jurisdiction_id)) diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py b/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py new file mode 100644 index 00000000..33bba21d --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py @@ -0,0 +1,28 @@ +import abc +from abc import ABC +from dataclasses import dataclass + +import requests +from pydantic import BaseModel + + +class FetchRequest(BaseModel): + pass + +class MuckrockFetcher(ABC): + + def fetch(self, request: FetchRequest): + url = self.build_url(request) + response = requests.get(url) + try: + response.raise_for_status() + except requests.exceptions.HTTPError as e: + print(f"Failed to get records on request `{url}`: {e}") + return None + + return response.json() + + @abc.abstractmethod + def build_url(self, request: FetchRequest) -> str: + pass + diff --git a/source_collectors/muckrock/muckrock_fetchers/__init__.py b/source_collectors/muckrock/muckrock_fetchers/__init__.py new file mode 100644 index 00000000..e69de29b From cc5b20d2297c412ce3926cfb74837f304375bb20 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 20:19:16 -0500 Subject: [PATCH 10/19] Refactor: Modularize Logic * Create Enum Class * Simplify Agency Info data creation * Extract logic to separate functions --- .../generate_detailed_muckrock_csv.py | 288 +++++++++--------- 1 file changed, 143 insertions(+), 145 deletions(-) diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index 207d2118..f7a65e3b 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -7,167 +7,165 @@ import argparse import csv import time +from enum import Enum +from typing import Optional -from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher, AgencyFetchRequest -from source_collectors.muckrock.muckrock_fetchers.JurisdictionFetcher import JurisdictionFetcher, \ - JurisdictionFetchRequest -from utils import format_filename_json_to_csv, load_json_file - -# Define the CSV headers -headers = [ - "name", - "agency_described", - 
"record_type", - "description", - "source_url", - "readme_url", - "scraper_url", - "state", - "county", - "municipality", - "agency_type", - "jurisdiction_type", - "View Archive", - "agency_aggregation", - "agency_supplied", - "supplying_entity", - "agency_originated", - "originating_agency", - "coverage_start", - "source_last_updated", - "coverage_end", - "number_of_records_available", - "size", - "access_type", - "data_portal_type", - "access_notes", - "record_format", - "update_frequency", - "update_method", - "retention_schedule", - "detail_level", -] +from pydantic import BaseModel +from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher +from source_collectors.muckrock.muckrock_fetchers.JurisdictionFetcher import JurisdictionFetcher +from utils import format_filename_json_to_csv, load_json_file -def main(): - # Load the JSON data - parser = argparse.ArgumentParser(description="Parse JSON from a file.") - parser.add_argument( - "--json_file", type=str, required=True, help="Path to the JSON file" - ) - args = parser.parse_args() +class JurisdictionType(Enum): + FEDERAL = "federal" + STATE = "state" + COUNTY = "county" + LOCAL = "local" + + +class AgencyInfo(BaseModel): + name: Optional[str] = "" + agency_described: Optional[str] = "" + record_type: Optional[str] = "" + description: Optional[str] = "" + source_url: Optional[str] = "" + readme_url: Optional[str] = "" + scraper_url: Optional[str] = "" + state: Optional[str] = "" + county: Optional[str] = "" + municipality: Optional[str] = "" + agency_type: Optional[str] = "" + jurisdiction_type: Optional[JurisdictionType] = None + agency_aggregation: Optional[str] = "" + agency_supplied: Optional[bool] = False + supplying_entity: Optional[str] = "MuckRock" + agency_originated: Optional[bool] = True + originating_agency: Optional[str] = "" + coverage_start: Optional[str] = "" + source_last_updated: Optional[str] = "" + coverage_end: Optional[str] = "" + number_of_records_available: Optional[str] = "" + size: Optional[str] = "" + access_type: Optional[str] = "" + data_portal_type: Optional[str] = "MuckRock" + access_notes: Optional[str] = "" + record_format: Optional[str] = "" + update_frequency: Optional[str] = "" + update_method: Optional[str] = "" + retention_schedule: Optional[str] = "" + detail_level: Optional[str] = "" + + + def model_dump(self, *args, **kwargs): + original_dict = super().model_dump(*args, **kwargs) + original_dict['View Archive'] = '' + return {key: (value.value if isinstance(value, Enum) else value) + for key, value in original_dict.items()} + + def keys(self) -> list[str]: + return list(self.model_dump().keys()) - # TODO: Generalize logic - json_data = load_json_file(args.json_file) - output_csv = format_filename_json_to_csv(args.json_file) +def main(): + json_filename = get_json_filename() + json_data = load_json_file(json_filename) + output_csv = format_filename_json_to_csv(json_filename) + agency_infos = get_agency_infos(json_data) + write_to_csv(agency_infos, output_csv) + + +def get_agency_infos(json_data): + a_fetcher = AgencyFetcher() + j_fetcher = JurisdictionFetcher() + agency_infos = [] + # Iterate through the JSON data + for item in json_data: + print(f"Writing data for {item.get('title')}") + agency_data = a_fetcher.get_agency(agency_id=item.get("agency")) + time.sleep(1) + jurisdiction_data = j_fetcher.get_jurisdiction( + jurisdiction_id=agency_data.get("jurisdiction") + ) + agency_name = agency_data.get("name", "") + agency_info = AgencyInfo( + name=item.get("title", ""), 
+ originating_agency=agency_name, + agency_described=agency_name + ) + jurisdiction_level = jurisdiction_data.get("level") + add_locational_info(agency_info, j_fetcher, jurisdiction_data, jurisdiction_level) + optionally_add_agency_type(agency_data, agency_info) + optionally_add_access_info(agency_info, item) + + # Extract the relevant fields from the JSON object + # TODO: I question the utility of creating columns that are then left blank until later + # and possibly in a different file entirely. + agency_infos.append(agency_info) + return agency_infos + + +def write_to_csv(agency_infos, output_csv): # Open a CSV file for writing - - # TODO: CSV writing and composition logic is tightly coupled -- separate with open(output_csv, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=headers) + writer = csv.DictWriter(csvfile, fieldnames=AgencyInfo().keys()) # Write the header row writer.writeheader() - a_fetcher = AgencyFetcher() - j_fetcher = JurisdictionFetcher() - - # Iterate through the JSON data - for item in json_data: - print(f"Writing data for {item.get('title')}") - agency_data = a_fetcher.get_agency(agency_id=item.get("agency")) - # agency_data = get_agency(item.get("agency")) - time.sleep(1) - jurisdiction_data = j_fetcher.get_jurisdiction( - jurisdiction_id=agency_data.get("jurisdiction") - ) - # jurisdiction_data = get_jurisdiction(agency_data.get("jurisdiction")) - - jurisdiction_level = jurisdiction_data.get("level") - # federal jurisduction level - if jurisdiction_level == "f": - state = "" - county = "" - municipality = "" - juris_type = "federal" - # state jurisdiction level - if jurisdiction_level == "s": - state = jurisdiction_data.get("name") - county = "" - municipality = "" - juris_type = "state" - # local jurisdiction level - if jurisdiction_level == "l": - parent_juris_data = j_fetcher.get_jurisdiction( - jurisdiction_id=jurisdiction_data.get("parent") - ) - - state = parent_juris_data.get("abbrev") - if "County" in jurisdiction_data.get("name"): - county = jurisdiction_data.get("name") - municipality = "" - juris_type = "county" - else: - county = "" - municipality = jurisdiction_data.get("name") - juris_type = "local" - - if "Police" in agency_data.get("types"): - agency_type = "law enforcement/police" - else: - agency_type = "" - - source_url = "" - absolute_url = item.get("absolute_url") - access_type = "" - for comm in item["communications"]: - if comm["files"]: - source_url = absolute_url + "#files" - access_type = "Web page,Download,API" - break - - # Extract the relevant fields from the JSON object - # TODO: I question the utility of creating columns that are then left blank until later - # and possibly in a different file entirely. 
- csv_row = { - "name": item.get("title", ""), - "agency_described": agency_data.get("name", "") + " - " + state, - "record_type": "", - "description": "", - "source_url": source_url, - "readme_url": absolute_url, - "scraper_url": "", - "state": state, - "county": county, - "municipality": municipality, - "agency_type": agency_type, - "jurisdiction_type": juris_type, - "View Archive": "", - "agency_aggregation": "", - "agency_supplied": "no", - "supplying_entity": "MuckRock", - "agency_originated": "yes", - "originating_agency": agency_data.get("name", ""), - "coverage_start": "", - "source_last_updated": "", - "coverage_end": "", - "number_of_records_available": "", - "size": "", - "access_type": access_type, - "data_portal_type": "MuckRock", - "access_notes": "", - "record_format": "", - "update_frequency": "", - "update_method": "", - "retention_schedule": "", - "detail_level": "", - } + for agency_info in agency_infos: + csv_row = agency_info.model_dump() # Write the extracted row to the CSV file writer.writerow(csv_row) +def get_json_filename(): + # Load the JSON data + parser = argparse.ArgumentParser(description="Parse JSON from a file.") + parser.add_argument( + "--json_file", type=str, required=True, help="Path to the JSON file" + ) + args = parser.parse_args() + json_filename = args.json_file + return json_filename + + +def add_locational_info(agency_info, j_fetcher, jurisdiction_data, jurisdiction_level): + # federal jurisdiction level + if jurisdiction_level == "f": + agency_info.jurisdiction_type = JurisdictionType.FEDERAL + # state jurisdiction level + if jurisdiction_level == "s": + agency_info.jurisdiction_type = JurisdictionType.STATE + agency_info.state = jurisdiction_data.get("name") + # local jurisdiction level + if jurisdiction_level == "l": + parent_juris_data = j_fetcher.get_jurisdiction( + jurisdiction_id=jurisdiction_data.get("parent") + ) + agency_info.state = parent_juris_data.get("abbrev") + if "County" in jurisdiction_data.get("name"): + agency_info.county = jurisdiction_data.get("name") + agency_info.jurisdiction_type = JurisdictionType.COUNTY + else: + agency_info.municipality = jurisdiction_data.get("name") + agency_info.jurisdiction_type = JurisdictionType.LOCAL + + +def optionally_add_access_info(agency_info, item): + absolute_url = item.get("absolute_url") + for comm in item["communications"]: + if comm["files"]: + agency_info.source_url = absolute_url + "#files" + agency_info.access_type = "Web page,Download,API" + break + + +def optionally_add_agency_type(agency_data, agency_info): + if "Police" in agency_data.get("types"): + agency_info.agency_type = "law enforcement/police" + + if __name__ == "__main__": main() \ No newline at end of file From 56062d2a8170e76655fb3d79f4b6c0dacff6c168 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 15 Dec 2024 20:22:29 -0500 Subject: [PATCH 11/19] Refactor: Modularize Logic * Create Enum Class * Simplify Agency Info data creation * Extract logic to separate functions --- .../generate_detailed_muckrock_csv.py | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index f7a65e3b..df4a0832 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -132,25 +132,23 @@ def get_json_filename(): def add_locational_info(agency_info, j_fetcher, jurisdiction_data, jurisdiction_level): - # 
federal jurisdiction level - if jurisdiction_level == "f": - agency_info.jurisdiction_type = JurisdictionType.FEDERAL - # state jurisdiction level - if jurisdiction_level == "s": - agency_info.jurisdiction_type = JurisdictionType.STATE - agency_info.state = jurisdiction_data.get("name") - # local jurisdiction level - if jurisdiction_level == "l": - parent_juris_data = j_fetcher.get_jurisdiction( - jurisdiction_id=jurisdiction_data.get("parent") - ) - agency_info.state = parent_juris_data.get("abbrev") - if "County" in jurisdiction_data.get("name"): - agency_info.county = jurisdiction_data.get("name") - agency_info.jurisdiction_type = JurisdictionType.COUNTY - else: - agency_info.municipality = jurisdiction_data.get("name") - agency_info.jurisdiction_type = JurisdictionType.LOCAL + match jurisdiction_level: + case "f": # federal jurisdiction level + agency_info.jurisdiction_type = JurisdictionType.FEDERAL + case "s": # state jurisdiction level + agency_info.jurisdiction_type = JurisdictionType.STATE + agency_info.state = jurisdiction_data.get("name") + case "l": # local jurisdiction level + parent_juris_data = j_fetcher.get_jurisdiction( + jurisdiction_id=jurisdiction_data.get("parent") + ) + agency_info.state = parent_juris_data.get("abbrev") + if "County" in jurisdiction_data.get("name"): + agency_info.county = jurisdiction_data.get("name") + agency_info.jurisdiction_type = JurisdictionType.COUNTY + else: + agency_info.municipality = jurisdiction_data.get("name") + agency_info.jurisdiction_type = JurisdictionType.LOCAL def optionally_add_access_info(agency_info, item): From 62f5a50f897c4a0aea670a556b7226abe2ce5714 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 16 Dec 2024 10:31:39 -0500 Subject: [PATCH 12/19] Refactor get_allegheny_foias.py * Create LoopFetcher classes * Implement in `get_allegheny_foias` --- .../generate_detailed_muckrock_csv.py | 4 +- .../muckrock/get_allegheny_foias.py | 72 ++++++------------- .../muckrock_fetchers/FOIALoopFetcher.py | 31 ++++++++ ...nFetcher.py => JurisdictionByIDFetcher.py} | 8 +-- .../JurisdictionLoopFetcher.py | 47 ++++++++++++ .../muckrock_fetchers/MuckrockLoopFetcher.py | 41 +++++++++++ 6 files changed, 145 insertions(+), 58 deletions(-) create mode 100644 source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py rename source_collectors/muckrock/muckrock_fetchers/{JurisdictionFetcher.py => JurisdictionByIDFetcher.py} (56%) create mode 100644 source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py create mode 100644 source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index df4a0832..cf3c439d 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -13,7 +13,7 @@ from pydantic import BaseModel from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher -from source_collectors.muckrock.muckrock_fetchers.JurisdictionFetcher import JurisdictionFetcher +from source_collectors.muckrock.muckrock_fetchers.JurisdictionByIDFetcher import JurisdictionByIDFetcher from utils import format_filename_json_to_csv, load_json_file @@ -77,7 +77,7 @@ def main(): def get_agency_infos(json_data): a_fetcher = AgencyFetcher() - j_fetcher = JurisdictionFetcher() + j_fetcher = JurisdictionByIDFetcher() agency_infos = [] # Iterate through the JSON data for item in json_data: diff --git 
a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index 3aac1d6f..bddeffad 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -3,45 +3,28 @@ and save them to a JSON file """ -import requests -import json -import time +from source_collectors.muckrock.muckrock_fetchers.FOIALoopFetcher import FOIALoopFetchRequest, FOIALoopFetcher +from source_collectors.muckrock.muckrock_fetchers.JurisdictionLoopFetcher import JurisdictionLoopFetchRequest, \ + JurisdictionLoopFetcher from source_collectors.muckrock.utils import save_json_file -def fetch_jurisdiction_ids(town_file, base_url): +def fetch_jurisdiction_ids(town_file, level="l", parent=126): """ fetch jurisdiction IDs based on town names from a text file """ with open(town_file, "r") as file: town_names = [line.strip() for line in file] - jurisdiction_ids = {} - url = base_url - - while url: - response = requests.get(url) - if response.status_code == 200: - data = response.json() - for item in data.get("results", []): - if item["name"] in town_names: - jurisdiction_ids[item["name"]] = item["id"] - - url = data.get("next") - print( - f"Processed page, found {len(jurisdiction_ids)}/{len(town_names)} jurisdictions so far..." - ) - time.sleep(1) # To respect the rate limit + request = JurisdictionLoopFetchRequest( + level=level, parent=parent, town_names=town_names + ) - elif response.status_code == 503: - print("Error 503: Skipping page") - break - else: - print(f"Error fetching data: {response.status_code}") - break + fetcher = JurisdictionLoopFetcher(request) + fetcher.loop_fetch() + return fetcher.jurisdictions - return jurisdiction_ids def fetch_foia_data(jurisdiction_ids): @@ -50,26 +33,11 @@ def fetch_foia_data(jurisdiction_ids): """ all_data = [] for name, id_ in jurisdiction_ids.items(): - # TODO: The muckrock api should be centralized in a `constants.py` folder - # and the url should be constructed in a function or class - url = f"https://www.muckrock.com/api_v1/foia/?status=done&jurisdiction={id_}" - while url: - response = requests.get(url) - # TODO: If logic similar to `fetch_jurisdiction_ids` and should be generalized - if response.status_code == 200: - data = response.json() - all_data.extend(data.get("results", [])) - url = data.get("next") - print( - f"Fetching records for {name}, {len(all_data)} total records so far..." 
- ) - time.sleep(1) # To respect the rate limit - elif response.status_code == 503: - print(f"Error 503: Skipping page for {name}") - break - else: - print(f"Error fetching data: {response.status_code} for {name}") - break + print(f"\nFetching records for {name}...") + request = FOIALoopFetchRequest(jurisdiction=id_) + fetcher = FOIALoopFetcher(request) + fetcher.loop_fetch() + all_data.extend(fetcher.results) # Save the combined data to a JSON file save_json_file(file_path="foia_data_combined.json", data=all_data) @@ -81,12 +49,12 @@ def main(): Execute the script """ town_file = "allegheny-county-towns.txt" - jurisdiction_url = ( - "https://www.muckrock.com/api_v1/jurisdiction/?level=l&parent=126" - ) - # Fetch jurisdiction IDs based on town names - jurisdiction_ids = fetch_jurisdiction_ids(town_file, jurisdiction_url) + jurisdiction_ids = fetch_jurisdiction_ids( + town_file, + level="l", + parent=126 + ) print(f"Jurisdiction IDs fetched: {jurisdiction_ids}") # Fetch FOIA data for each jurisdiction ID diff --git a/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py b/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py new file mode 100644 index 00000000..2af65c1e --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py @@ -0,0 +1,31 @@ +from datasets import tqdm + +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest +from source_collectors.muckrock.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher + +class FOIALoopFetchRequest(FetchRequest): + jurisdiction: int + +class FOIALoopFetcher(MuckrockLoopFetcher): + + def __init__(self, initial_request: FOIALoopFetchRequest): + super().__init__(initial_request) + self.pbar_records = tqdm( + desc="Fetching FOIA records", + unit="record", + ) + self.num_found = 0 + self.results = [] + + def process_results(self, results: list[dict]): + self.results.extend(results) + + def build_url(self, request: FOIALoopFetchRequest): + return f"{BASE_MUCKROCK_URL}/foia/?status=done&jurisdiction={request.jurisdiction}" + + def report_progress(self): + old_num_found = self.num_found + self.num_found = len(self.results) + difference = self.num_found - old_num_found + self.pbar_records.update(difference) diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py b/source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py similarity index 56% rename from source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py rename to source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py index b52ce735..60cb0c2e 100644 --- a/source_collectors/muckrock/muckrock_fetchers/JurisdictionFetcher.py +++ b/source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py @@ -2,13 +2,13 @@ from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher -class JurisdictionFetchRequest(FetchRequest): +class JurisdictionByIDFetchRequest(FetchRequest): jurisdiction_id: int -class JurisdictionFetcher(MuckrockFetcher): +class JurisdictionByIDFetcher(MuckrockFetcher): - def build_url(self, request: JurisdictionFetchRequest) -> str: + def build_url(self, request: JurisdictionByIDFetchRequest) -> str: return f"{BASE_MUCKROCK_URL}/jurisdiction/{request.jurisdiction_id}/" def get_jurisdiction(self, jurisdiction_id: int) -> dict: - return self.fetch(request=JurisdictionFetchRequest(jurisdiction_id=jurisdiction_id)) + return 
self.fetch(request=JurisdictionByIDFetchRequest(jurisdiction_id=jurisdiction_id)) diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py b/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py new file mode 100644 index 00000000..816f4b59 --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py @@ -0,0 +1,47 @@ +from tqdm import tqdm + +from source_collectors.muckrock.constants import BASE_MUCKROCK_URL +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher +from source_collectors.muckrock.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher + + +class JurisdictionLoopFetchRequest(FetchRequest): + level: str + parent: int + town_names: list + +class JurisdictionLoopFetcher(MuckrockLoopFetcher): + + def __init__(self, initial_request: JurisdictionLoopFetchRequest): + super().__init__(initial_request) + self.town_names = initial_request.town_names + self.pbar_jurisdictions = tqdm( + total=len(self.town_names), + desc="Fetching jurisdictions", + unit="jurisdiction", + position=0, + leave=False + ) + self.pbar_page = tqdm( + desc="Processing pages", + unit="page", + position=1, + leave=False + ) + self.num_found = 0 + self.jurisdictions = {} + + def build_url(self, request: JurisdictionLoopFetchRequest) -> str: + return f"{BASE_MUCKROCK_URL}/jurisdiction/?level={request.level}&parent={request.parent}" + + def process_results(self, results: list[dict]): + for item in results: + if item["name"] in self.town_names: + self.jurisdictions[item["name"]] = item["id"] + + def report_progress(self): + old_num_found = self.num_found + self.num_found = len(self.jurisdictions) + difference = self.num_found - old_num_found + self.pbar_jurisdictions.update(difference) + self.pbar_page.update(1) diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py b/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py new file mode 100644 index 00000000..49011df3 --- /dev/null +++ b/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py @@ -0,0 +1,41 @@ +from abc import ABC, abstractmethod +from time import sleep + +import requests + +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest + + +class MuckrockLoopFetcher(ABC): + + + def __init__(self, initial_request: FetchRequest): + self.initial_request = initial_request + + def loop_fetch(self): + url = self.build_url(self.initial_request) + while url is not None: + response = requests.get(url) + try: + response.raise_for_status() + except requests.exceptions.HTTPError as e: + print(f"Failed to get records on request `{url}`: {e}") + return None + + data = response.json() + self.process_results(data["results"]) + self.report_progress() + url = data["next"] + sleep(1) + + @abstractmethod + def process_results(self, results: list[dict]): + pass + + @abstractmethod + def build_url(self, request: FetchRequest) -> str: + pass + + @abstractmethod + def report_progress(self): + pass From b6b30a416a3081a25a167c6ae3bcd7222a64fc63 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 16 Dec 2024 16:48:37 -0500 Subject: [PATCH 13/19] Refactor create_foia_data_db.py * Create SQLClient classes * Add custom exception handling to Muckrock Fetcher. * Clean up comments * Extract some logic to separate functions. 
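A minimal usage sketch of the new SQLClient classes (as defined in the SQLiteClient.py diff below); the database name, table, and rows are illustrative only:

from source_collectors.muckrock.SQLiteClient import SQLClientError, SQLiteClientContextManager

with SQLiteClientContextManager("example.db") as client:
    try:
        # execute_query commits on success and rolls back (raising SQLClientError) on sqlite3 errors
        client.execute_query("CREATE TABLE IF NOT EXISTS demo (id INTEGER, title TEXT)")
        client.execute_query(
            "INSERT INTO demo (id, title) VALUES (?, ?)",
            many=[(1, "first"), (2, "second")],
        )
    except SQLClientError as e:
        print(f"Query failed: {e}")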
--- source_collectors/muckrock/SQLiteClient.py | 38 ++++ .../muckrock/create_foia_data_db.py | 176 ++++++++---------- .../muckrock_fetchers/MuckrockFetcher.py | 14 ++ 3 files changed, 130 insertions(+), 98 deletions(-) create mode 100644 source_collectors/muckrock/SQLiteClient.py diff --git a/source_collectors/muckrock/SQLiteClient.py b/source_collectors/muckrock/SQLiteClient.py new file mode 100644 index 00000000..96a59d82 --- /dev/null +++ b/source_collectors/muckrock/SQLiteClient.py @@ -0,0 +1,38 @@ +import logging +import sqlite3 + + +class SQLClientError(Exception): + pass + + +class SQLiteClient: + + def __init__(self, db_path: str) -> None: + self.conn = sqlite3.connect(db_path) + + def execute_query(self, query: str, many=None): + + try: + if many is not None: + self.conn.executemany(query, many) + else: + self.conn.execute(query) + self.conn.commit() + except sqlite3.Error as e: + print(f"SQLite error: {e}") + error_msg = f"Failed to execute query due to SQLite error: {e}" + logging.error(error_msg) + self.conn.rollback() + raise SQLClientError(error_msg) + +class SQLiteClientContextManager: + + def __init__(self, db_path: str) -> None: + self.client = SQLiteClient(db_path) + + def __enter__(self): + return self.client + + def __exit__(self, exc_type, exc_value, traceback): + self.client.conn.close() \ No newline at end of file diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py index 85c7fd4b..6bff13f7 100644 --- a/source_collectors/muckrock/create_foia_data_db.py +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -19,20 +19,24 @@ and/or printed to the console. """ -import requests -import sqlite3 import logging import os import json import time -from typing import List, Tuple, Dict, Any, Union, Literal +from typing import List, Tuple, Dict, Any + +from tqdm import tqdm + +from source_collectors.muckrock.SQLiteClient import SQLiteClientContextManager, SQLClientError +from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import MuckrockNoMoreDataError logging.basicConfig( filename="errors.log", level=logging.ERROR, format="%(levelname)s: %(message)s" ) +# TODO: Why are we pulling every single FOIA request? -base_url = "https://www.muckrock.com/api_v1/foia/" last_page_fetched = "last_page_fetched.txt" NO_MORE_DATA = -1 # flag for program exit @@ -83,70 +87,32 @@ def create_db() -> bool: bool: True, if database is successfully created; False otherwise. Raises: - sqlite3.Error: If the table creation operation fails, prints error and returns False. - """ - - try: - with sqlite3.connect("foia_data.db") as conn: - conn.execute(create_table_query) - conn.commit() - print("Successfully created foia_data.db!") - return True - except sqlite3.Error as e: - print(f"SQLite error: {e}.") - logging.error(f"Failed to create foia_data.db due to SQLite error: {e}") - return False - - -def fetch_page(page: int) -> Union[JSON, Literal[NO_MORE_DATA], None]: + sqlite3.Error: If the table creation operation fails, + prints error and returns False. """ - Fetches a page of 100 results from the MuckRock FOIA API. - - Args: - page (int): The page number to fetch from the API. - - Returns: - Union[JSON, None, Literal[NO_MORE_DATA]]: - - JSON Dict[str, Any]: The response's JSON data, if the request is successful. - - NO_MORE_DATA (int = -1): A constant, if there are no more pages to fetch (indicated by a 404 response). 
- - None: If there is an error other than 404. - """ - - # TODO: Refactor to use FOIA Fetcher - per_page = 100 - response = requests.get( - base_url, params={"page": page, "page_size": per_page, "format": "json"} - ) - - if response.status_code == 200: - return response.json() - elif response.status_code == 404: - print("No more pages to fetch") - return NO_MORE_DATA # Typically 404 response will mean there are no more pages to fetch - elif 500 <= response.status_code < 600: - logging.error(f"Server error {response.status_code} on page {page}") - page = page + 1 - return fetch_page(page) - else: - print(f"Error fetching page {page}: {response.status_code}") - logging.error( - f"Fetching page {page} failed with response code: { - response.status_code}" - ) - return None - + with SQLiteClientContextManager("foia_data.db") as client: + try: + client.execute_query(create_table_query) + return True + except SQLClientError as e: + print(f"SQLite error: {e}.") + logging.error(f"Failed to create foia_data.db due to SQLite error: {e}") + return False def transform_page_data(data_to_transform: JSON) -> List[Tuple[Any, ...]]: """ - Transforms the data recieved from the MuckRock FOIA API into a structured format for insertion into a database with `populate_db()`. + Transforms the data received from the MuckRock FOIA API + into a structured format for insertion into a database with `populate_db()`. - Transforms JSON input into a list of tuples, as well as serializes the nested `tags` and `communications` fields into JSON strings. + Transforms JSON input into a list of tuples, + as well as serializes the nested `tags` and `communications` fields + into JSON strings. Args: - data_to_transform (JSON: Dict[str, Any]): The JSON data from the API response. - + data_to_transform: The JSON data from the API response. Returns: - transformed_data (List[Tuple[Any, ...]]: A list of tuples, where each tuple contains the fields of a single FOIA request. + A list of tuples, where each tuple contains the fields + of a single FOIA request. """ transformed_data = [] @@ -198,39 +164,40 @@ def populate_db(transformed_data: List[Tuple[Any, ...]], page: int) -> None: sqlite3.Error: If the insertion operation fails, attempts to retry operation (max_retries = 2). If retries are exhausted, logs error and exits. """ - - with sqlite3.connect("foia_data.db") as conn: - + with SQLiteClientContextManager("foia_data.db") as client: retries = 0 max_retries = 2 while retries < max_retries: try: - conn.executemany(foia_insert_query, transformed_data) - conn.commit() + client.execute_query(foia_insert_query, many=transformed_data) print("Successfully inserted data!") return - except sqlite3.Error as e: - print(f"SQLite error: {e}. Retrying...") - conn.rollback() + except SQLClientError as e: + print(f"{e}. Retrying...") retries += 1 time.sleep(1) if retries == max_retries: - print( - f"Failed to insert data from page {page} after { - max_retries} attempts. Skipping to next page." - ) - logging.error( - f"Failed to insert data from page {page} after { - max_retries} attempts." - ) + report_max_retries_error(max_retries, page) + + +def report_max_retries_error(max_retries, page): + print( + f"Failed to insert data from page {page} after { + max_retries} attempts. Skipping to next page." + ) + logging.error( + f"Failed to insert data from page {page} after { + max_retries} attempts." + ) def main() -> None: """ Main entry point for create_foia_data_db.py. 
- This function orchestrates the process of fetching FOIA requests data from the MuckRock FOIA API, transforming it, + This function orchestrates the process of fetching + FOIA requests data from the MuckRock FOIA API, transforming it, and storing it in a SQLite database. """ @@ -241,33 +208,46 @@ def main() -> None: print("Failed to create foia_data.db") return - if os.path.exists(last_page_fetched): - with open(last_page_fetched, mode="r") as file: - page = int(file.read()) + 1 - else: - page = 1 - - while True: + start_page = get_start_page() + fetcher = FOIAFetcher( + start_page=start_page + ) - print(f"Fetching page {page}...") - page_data = fetch_page(page) + with tqdm(initial=start_page, unit="page") as pbar: + while True: - if page_data == NO_MORE_DATA: - break # Exit program because no more data exixts - if page_data is None: - print(f"Skipping page {page}...") - page += 1 - continue + # TODO: Replace with TQDM + try: + pbar.update() + page_data = fetcher.fetch_next_page() + except MuckrockNoMoreDataError: + # Exit program because no more data exists + break + if page_data is None: + continue + transformed_data = transform_page_data(page_data) + populate_db(transformed_data, fetcher.current_page) + + with open(last_page_fetched, mode="w") as file: + file.write(str(fetcher.current_page)) - transformed_data = transform_page_data(page_data) + print("create_foia_data_db.py run finished") - populate_db(transformed_data, page) - with open(last_page_fetched, mode="w") as file: - file.write(str(page)) - page += 1 +def get_start_page(): + """ + Returns the page number to start fetching from. - print("create_foia_data_db.py run finished") + If the file `last_page_fetched` exists, + reads the page number from the file and returns it + 1. + Otherwise, returns 1. 
+ """ + if os.path.exists(last_page_fetched): + with open(last_page_fetched, mode="r") as file: + page = int(file.read()) + 1 + else: + page = 1 + return page if __name__ == "__main__": diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py b/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py index 33bba21d..e7a1dff5 100644 --- a/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py +++ b/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py @@ -5,6 +5,11 @@ import requests from pydantic import BaseModel +class MuckrockNoMoreDataError(Exception): + pass + +class MuckrockServerError(Exception): + pass class FetchRequest(BaseModel): pass @@ -18,6 +23,15 @@ def fetch(self, request: FetchRequest): response.raise_for_status() except requests.exceptions.HTTPError as e: print(f"Failed to get records on request `{url}`: {e}") + # If code is 404, raise NoMoreData error + if e.response.status_code == 404: + raise MuckrockNoMoreDataError + if 500 <= e.response.status_code < 600: + raise MuckrockServerError + + + + return None return response.json() From ee4a854845ebe58a1f36c76710b55317314272a5 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 16 Dec 2024 17:41:54 -0500 Subject: [PATCH 14/19] Refactor search_foia_data_db.py * Create FOIA DB Searcher class, incorporate into module * Extract logic to functions --- source_collectors/muckrock/FOIADBSearcher.py | 65 +++++++++++++++++ source_collectors/muckrock/constants.py | 3 +- .../muckrock/search_foia_data_db.py | 72 ++++--------------- 3 files changed, 80 insertions(+), 60 deletions(-) create mode 100644 source_collectors/muckrock/FOIADBSearcher.py diff --git a/source_collectors/muckrock/FOIADBSearcher.py b/source_collectors/muckrock/FOIADBSearcher.py new file mode 100644 index 00000000..391f7a8d --- /dev/null +++ b/source_collectors/muckrock/FOIADBSearcher.py @@ -0,0 +1,65 @@ +import os +import sqlite3 + +import pandas as pd + +from source_collectors.muckrock.constants import FOIA_DATA_DB + +check_results_table_query = """ + SELECT name FROM sqlite_master + WHERE (type = 'table') + AND (name = 'results') + """ + +search_foia_query = """ + SELECT * FROM results + WHERE (title LIKE ? OR tags LIKE ?) + AND (status = 'done') + """ + + +class FOIADBSearcher: + + def __init__(self, db_path = FOIA_DATA_DB): + self.db_path = db_path + if not os.path.exists(self.db_path): + raise FileNotFoundError("foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it.") + + + def search(self, search_string: str) -> pd.DataFrame | None: + """ + Searches the foia_data.db database for FOIA request entries matching the provided search string. + + Args: + search_string (str): The string to search for in the `title` and `tags` of the `results` table. + + Returns: + Union[pandas.DataFrame, None]: + - pandas.DataFrame: A DataFrame containing the matching entries from the database. + - None: If an error occurs during the database operation. + + Raises: + sqlite3.Error: If any database operation fails, prints error and returns None. + Exception: If any unexpected error occurs, prints error and returns None. 
+ """ + try: + with sqlite3.connect(self.db_path) as conn: + results_table = pd.read_sql_query(check_results_table_query, conn) + if results_table.empty: + print("The `results` table does not exist in the database.") + return None + + df = pd.read_sql_query( + sql=search_foia_query, + con=conn, + params=[f"%{search_string}%", f"%{search_string}%"] + ) + + except sqlite3.Error as e: + print(f"Sqlite error: {e}") + return None + except Exception as e: + print(f"An unexpected error occurred: {e}") + return None + + return df \ No newline at end of file diff --git a/source_collectors/muckrock/constants.py b/source_collectors/muckrock/constants.py index 7109847f..07dca8f4 100644 --- a/source_collectors/muckrock/constants.py +++ b/source_collectors/muckrock/constants.py @@ -1,3 +1,4 @@ -BASE_MUCKROCK_URL = "https://www.muckrock.com/api_v1" \ No newline at end of file +BASE_MUCKROCK_URL = "https://www.muckrock.com/api_v1" +FOIA_DATA_DB = "foia_data.db" \ No newline at end of file diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py index e7550608..7820540d 100644 --- a/source_collectors/muckrock/search_foia_data_db.py +++ b/source_collectors/muckrock/search_foia_data_db.py @@ -25,17 +25,7 @@ import os from typing import Union, List, Dict -check_results_table_query = """ - SELECT name FROM sqlite_master - WHERE (type = 'table') - AND (name = 'results') - """ - -search_foia_query = """ - SELECT * FROM results - WHERE (title LIKE ? OR tags LIKE ?) - AND (status = 'done') - """ +from source_collectors.muckrock.FOIADBSearcher import FOIADBSearcher def parser_init() -> argparse.ArgumentParser: @@ -61,45 +51,8 @@ def parser_init() -> argparse.ArgumentParser: def search_foia_db(search_string: str) -> Union[pd.DataFrame, None]: - """ - Searches the foia_data.db database for FOIA request entries matching the provided search string. - - Args: - search_string (str): The string to search for in the `title` and `tags` of the `results` table. - - Returns: - Union[pandas.DataFrame, None]: - - pandas.DataFrame: A DataFrame containing the matching entries from the database. - - None: If an error occurs during the database operation. - - Raises: - sqlite3.Error: If any database operation fails, prints error and returns None. - Exception: If any unexpected error occurs, prints error and returns None. - """ - - print(f'Searching foia_data.db for "{search_string}"...') - - try: - with sqlite3.connect("foia_data.db") as conn: - - results_table = pd.read_sql_query(check_results_table_query, conn) - - if results_table.empty: - print("The `results` table does not exist in the database.") - return None - - params = [f"%{search_string}%", f"%{search_string}%"] - - df = pd.read_sql_query(search_foia_query, conn, params=params) - - except sqlite3.Error as e: - print(f"Sqlite error: {e}") - return None - except Exception as e: - print(f"An unexpected error occurred: {e}") - return None - - return df + searcher = FOIADBSearcher() + return searcher.search(search_string) def parse_communications_column(communications) -> List[Dict]: @@ -164,24 +117,25 @@ def main() -> None: args = parser.parse_args() search_string = args.search_for - if not os.path.exists("foia_data.db"): - print( - "foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it." 
- ) - return - df = search_foia_db(search_string) if df is None: return + update_communications_column(df) - if not df["communications"].empty: - df["communications"] = df["communications"].apply(parse_communications_column) + announce_matching_entries(df, search_string) + generate_json(df, search_string) + + +def announce_matching_entries(df, search_string): print( f'Found {df.shape[0]} matching entries containing "{search_string}" in the title or tags' ) - generate_json(df, search_string) + +def update_communications_column(df): + if not df["communications"].empty: + df["communications"] = df["communications"].apply(parse_communications_column) if __name__ == "__main__": From ee76173177b14351615b4cb8407526a44bc04e45 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 16 Dec 2024 17:43:56 -0500 Subject: [PATCH 15/19] Refactor Directory * Move all class files into `/classes` module --- source_collectors/muckrock/{ => classes}/FOIADBSearcher.py | 0 source_collectors/muckrock/{ => classes}/FOIASearcher.py | 2 +- source_collectors/muckrock/{ => classes}/SQLiteClient.py | 0 .../muckrock/{muckrock_fetchers => classes}/__init__.py | 0 .../{ => classes}/muckrock_fetchers/AgencyFetcher.py | 2 +- .../muckrock/{ => classes}/muckrock_fetchers/FOIAFetcher.py | 2 +- .../{ => classes}/muckrock_fetchers/FOIALoopFetcher.py | 4 ++-- .../muckrock_fetchers/JurisdictionByIDFetcher.py | 2 +- .../muckrock_fetchers/JurisdictionLoopFetcher.py | 4 ++-- .../{ => classes}/muckrock_fetchers/MuckrockFetcher.py | 0 .../{ => classes}/muckrock_fetchers/MuckrockLoopFetcher.py | 2 +- .../muckrock/classes/muckrock_fetchers/__init__.py | 0 source_collectors/muckrock/create_foia_data_db.py | 6 +++--- .../muckrock/generate_detailed_muckrock_csv.py | 4 ++-- source_collectors/muckrock/get_allegheny_foias.py | 4 ++-- source_collectors/muckrock/muck_get.py | 4 ++-- source_collectors/muckrock/search_foia_data_db.py | 4 +--- 17 files changed, 19 insertions(+), 21 deletions(-) rename source_collectors/muckrock/{ => classes}/FOIADBSearcher.py (100%) rename source_collectors/muckrock/{ => classes}/FOIASearcher.py (95%) rename source_collectors/muckrock/{ => classes}/SQLiteClient.py (100%) rename source_collectors/muckrock/{muckrock_fetchers => classes}/__init__.py (100%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/AgencyFetcher.py (78%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/FOIAFetcher.py (90%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/FOIALoopFetcher.py (82%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/JurisdictionByIDFetcher.py (81%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/JurisdictionLoopFetcher.py (87%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/MuckrockFetcher.py (100%) rename source_collectors/muckrock/{ => classes}/muckrock_fetchers/MuckrockLoopFetcher.py (91%) create mode 100644 source_collectors/muckrock/classes/muckrock_fetchers/__init__.py diff --git a/source_collectors/muckrock/FOIADBSearcher.py b/source_collectors/muckrock/classes/FOIADBSearcher.py similarity index 100% rename from source_collectors/muckrock/FOIADBSearcher.py rename to source_collectors/muckrock/classes/FOIADBSearcher.py diff --git a/source_collectors/muckrock/FOIASearcher.py b/source_collectors/muckrock/classes/FOIASearcher.py similarity index 95% rename from source_collectors/muckrock/FOIASearcher.py rename to source_collectors/muckrock/classes/FOIASearcher.py index 9d6116b7..f88f8242 100644 --- 
a/source_collectors/muckrock/FOIASearcher.py +++ b/source_collectors/muckrock/classes/FOIASearcher.py @@ -1,6 +1,6 @@ from typing import Optional -from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher +from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher from tqdm import tqdm class FOIASearcher: diff --git a/source_collectors/muckrock/SQLiteClient.py b/source_collectors/muckrock/classes/SQLiteClient.py similarity index 100% rename from source_collectors/muckrock/SQLiteClient.py rename to source_collectors/muckrock/classes/SQLiteClient.py diff --git a/source_collectors/muckrock/muckrock_fetchers/__init__.py b/source_collectors/muckrock/classes/__init__.py similarity index 100% rename from source_collectors/muckrock/muckrock_fetchers/__init__.py rename to source_collectors/muckrock/classes/__init__.py diff --git a/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py similarity index 78% rename from source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py index 2e36ce31..b70c07e0 100644 --- a/source_collectors/muckrock/muckrock_fetchers/AgencyFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/AgencyFetcher.py @@ -1,5 +1,5 @@ from source_collectors.muckrock.constants import BASE_MUCKROCK_URL -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher class AgencyFetchRequest(FetchRequest): diff --git a/source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py similarity index 90% rename from source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py index 5b780a99..619b92ae 100644 --- a/source_collectors/muckrock/muckrock_fetchers/FOIAFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/FOIAFetcher.py @@ -1,4 +1,4 @@ -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import MuckrockFetcher, FetchRequest +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import MuckrockFetcher, FetchRequest from source_collectors.muckrock.constants import BASE_MUCKROCK_URL FOIA_BASE_URL = f"{BASE_MUCKROCK_URL}/foia" diff --git a/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/FOIALoopFetcher.py similarity index 82% rename from source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/FOIALoopFetcher.py index 2af65c1e..ad78f0b6 100644 --- a/source_collectors/muckrock/muckrock_fetchers/FOIALoopFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/FOIALoopFetcher.py @@ -1,8 +1,8 @@ from datasets import tqdm from source_collectors.muckrock.constants import BASE_MUCKROCK_URL -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest -from source_collectors.muckrock.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher class 
FOIALoopFetchRequest(FetchRequest): jurisdiction: int diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py similarity index 81% rename from source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py index 60cb0c2e..a038418c 100644 --- a/source_collectors/muckrock/muckrock_fetchers/JurisdictionByIDFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionByIDFetcher.py @@ -1,5 +1,5 @@ from source_collectors.muckrock.constants import BASE_MUCKROCK_URL -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher class JurisdictionByIDFetchRequest(FetchRequest): diff --git a/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionLoopFetcher.py similarity index 87% rename from source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionLoopFetcher.py index 816f4b59..46c1bbf6 100644 --- a/source_collectors/muckrock/muckrock_fetchers/JurisdictionLoopFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/JurisdictionLoopFetcher.py @@ -1,8 +1,8 @@ from tqdm import tqdm from source_collectors.muckrock.constants import BASE_MUCKROCK_URL -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest, MuckrockFetcher -from source_collectors.muckrock.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockLoopFetcher import MuckrockLoopFetcher class JurisdictionLoopFetchRequest(FetchRequest): diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py similarity index 100% rename from source_collectors/muckrock/muckrock_fetchers/MuckrockFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/MuckrockFetcher.py diff --git a/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py similarity index 91% rename from source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py rename to source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py index 49011df3..2b3d0149 100644 --- a/source_collectors/muckrock/muckrock_fetchers/MuckrockLoopFetcher.py +++ b/source_collectors/muckrock/classes/muckrock_fetchers/MuckrockLoopFetcher.py @@ -3,7 +3,7 @@ import requests -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import FetchRequest +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import FetchRequest class MuckrockLoopFetcher(ABC): diff --git a/source_collectors/muckrock/classes/muckrock_fetchers/__init__.py b/source_collectors/muckrock/classes/muckrock_fetchers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/source_collectors/muckrock/create_foia_data_db.py b/source_collectors/muckrock/create_foia_data_db.py index 6bff13f7..f012f5d3 100644 --- 
a/source_collectors/muckrock/create_foia_data_db.py +++ b/source_collectors/muckrock/create_foia_data_db.py @@ -27,9 +27,9 @@ from tqdm import tqdm -from source_collectors.muckrock.SQLiteClient import SQLiteClientContextManager, SQLClientError -from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher -from source_collectors.muckrock.muckrock_fetchers.MuckrockFetcher import MuckrockNoMoreDataError +from source_collectors.muckrock.classes.SQLiteClient import SQLiteClientContextManager, SQLClientError +from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.MuckrockFetcher import MuckrockNoMoreDataError logging.basicConfig( filename="errors.log", level=logging.ERROR, format="%(levelname)s: %(message)s" diff --git a/source_collectors/muckrock/generate_detailed_muckrock_csv.py b/source_collectors/muckrock/generate_detailed_muckrock_csv.py index cf3c439d..3cb884c0 100644 --- a/source_collectors/muckrock/generate_detailed_muckrock_csv.py +++ b/source_collectors/muckrock/generate_detailed_muckrock_csv.py @@ -12,8 +12,8 @@ from pydantic import BaseModel -from source_collectors.muckrock.muckrock_fetchers.AgencyFetcher import AgencyFetcher -from source_collectors.muckrock.muckrock_fetchers.JurisdictionByIDFetcher import JurisdictionByIDFetcher +from source_collectors.muckrock.classes.muckrock_fetchers import AgencyFetcher +from source_collectors.muckrock.classes.muckrock_fetchers.JurisdictionByIDFetcher import JurisdictionByIDFetcher from utils import format_filename_json_to_csv, load_json_file diff --git a/source_collectors/muckrock/get_allegheny_foias.py b/source_collectors/muckrock/get_allegheny_foias.py index bddeffad..b269ff18 100644 --- a/source_collectors/muckrock/get_allegheny_foias.py +++ b/source_collectors/muckrock/get_allegheny_foias.py @@ -4,8 +4,8 @@ """ -from source_collectors.muckrock.muckrock_fetchers.FOIALoopFetcher import FOIALoopFetchRequest, FOIALoopFetcher -from source_collectors.muckrock.muckrock_fetchers.JurisdictionLoopFetcher import JurisdictionLoopFetchRequest, \ +from source_collectors.muckrock.classes.muckrock_fetchers.FOIALoopFetcher import FOIALoopFetchRequest, FOIALoopFetcher +from source_collectors.muckrock.classes.muckrock_fetchers import JurisdictionLoopFetchRequest, \ JurisdictionLoopFetcher from source_collectors.muckrock.utils import save_json_file diff --git a/source_collectors/muckrock/muck_get.py b/source_collectors/muckrock/muck_get.py index b1a51022..f51bf9e0 100644 --- a/source_collectors/muckrock/muck_get.py +++ b/source_collectors/muckrock/muck_get.py @@ -2,8 +2,8 @@ A straightforward standalone script for downloading data from MuckRock and searching for it with a specific search string. """ -from source_collectors.muckrock.muckrock_fetchers.FOIAFetcher import FOIAFetcher -from source_collectors.muckrock.FOIASearcher import FOIASearcher +from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher +from source_collectors.muckrock.classes.FOIASearcher import FOIASearcher from source_collectors.muckrock.utils import save_json_file if __name__ == "__main__": diff --git a/source_collectors/muckrock/search_foia_data_db.py b/source_collectors/muckrock/search_foia_data_db.py index 7820540d..51357663 100644 --- a/source_collectors/muckrock/search_foia_data_db.py +++ b/source_collectors/muckrock/search_foia_data_db.py @@ -18,14 +18,12 @@ Errors encountered during database operations, JSON parsing, or file writing are printed to the console. 
""" -import sqlite3 import pandas as pd import json import argparse -import os from typing import Union, List, Dict -from source_collectors.muckrock.FOIADBSearcher import FOIADBSearcher +from source_collectors.muckrock.classes.FOIADBSearcher import FOIADBSearcher def parser_init() -> argparse.ArgumentParser: From 147a786b9211b068bcb43c69a4fe256720c682db Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 18 Dec 2024 08:12:50 -0500 Subject: [PATCH 16/19] Begin draft of PDAP client --- pdap_api_client/PDAPClient.py | 76 +++++++++++++++++++++++++++++++++++ pdap_api_client/__init__.py | 0 2 files changed, 76 insertions(+) create mode 100644 pdap_api_client/PDAPClient.py create mode 100644 pdap_api_client/__init__.py diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py new file mode 100644 index 00000000..96b9a343 --- /dev/null +++ b/pdap_api_client/PDAPClient.py @@ -0,0 +1,76 @@ +from urllib import parse +from enum import Enum +from typing import Optional + +import requests +from requests.models import PreparedRequest + +API_URL = "https://data-sources-v2.pdap.dev/api" + +class Namespaces(Enum): + AUTH = "auth" + + +class RequestManager: + """ + Handles making requests and managing the responses + """ + + + + +class URLBuilder: + + def __init__(self): + self.base_url = API_URL + + def build_url( + self, + namespace: Namespaces, + subdomains: Optional[list[str]] = None, + query_parameters: Optional[dict] = None + ): + url = f"{self.base_url}/{namespace.value}" + if subdomains is not None: + url = f"{url}/{'/'.join(subdomains)}" + if query_parameters is None: + return url + req = PreparedRequest() + req.prepare_url(url, params=query_parameters) + return req.url + + + +class AccessManager: + """ + Manages login, api key, access and refresh tokens + """ + def __init__(self, email: str, password: str): + self.url_builder = URLBuilder() + + def login(self, email: str, password: str): + url = self.url_builder.build_url( + namespace=Namespaces.AUTH, + subdomains=["login"] + ) + response = requests.post( + url=url, + json={ + "email": email, + "password": password + } + ) + response.raise_for_status() + # TODO: Finish + + +class PDAPClient: + + def __init__(self): + pass + + def match_agency(self): + pass + + def check_for_unique_source_url(self, url: str): + pass \ No newline at end of file diff --git a/pdap_api_client/__init__.py b/pdap_api_client/__init__.py new file mode 100644 index 00000000..e69de29b From 82d8c5be812fad3dd631a8744d8a814d74b7bd3a Mon Sep 17 00:00:00 2001 From: maxachis Date: Wed, 18 Dec 2024 12:00:48 -0500 Subject: [PATCH 17/19] Continue draft --- pdap_api_client/DTOs.py | 6 ++ pdap_api_client/PDAPClient.py | 148 ++++++++++++++++++++++++++++------ 2 files changed, 131 insertions(+), 23 deletions(-) create mode 100644 pdap_api_client/DTOs.py diff --git a/pdap_api_client/DTOs.py b/pdap_api_client/DTOs.py new file mode 100644 index 00000000..b85511b3 --- /dev/null +++ b/pdap_api_client/DTOs.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class MatchAgencyInfo(BaseModel): + submitted_name: str + id: str \ No newline at end of file diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index 96b9a343..a0763fec 100644 --- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -1,20 +1,57 @@ +from http import HTTPStatus from urllib import parse from enum import Enum -from typing import Optional +from typing import Optional, List import requests +from pydantic import BaseModel from requests.models import PreparedRequest 
+from pdap_api_client.DTOs import MatchAgencyInfo + API_URL = "https://data-sources-v2.pdap.dev/api" class Namespaces(Enum): AUTH = "auth" - - -class RequestManager: - """ - Handles making requests and managing the responses - """ + MATCH = "match" + +class RequestType(Enum): + POST = "POST" + PUT = "PUT" + GET = "GET" + DELETE = "DELETE" + +class RequestInfo(BaseModel): + type_: RequestType + url: str + json: Optional[dict] = None + headers: Optional[dict] = None + params: Optional[dict] = None + timeout: Optional[int] = None + +class ResponseInfo(BaseModel): + status_code: HTTPStatus + data: Optional[dict] + +request_methods = { + RequestType.POST: requests.post, + RequestType.PUT: requests.put, + RequestType.GET: requests.get, + RequestType.DELETE: requests.delete, +} +def make_request(ri: RequestInfo) -> ResponseInfo: + response = request_methods[ri.type_]( + ri.url, + json=ri.json, + headers=ri.headers, + params=ri.params, + timeout=ri.timeout + ) + response.raise_for_status() + return ResponseInfo( + status_code=response.status_code, + data=response.json() + ) @@ -28,49 +65,114 @@ def build_url( self, namespace: Namespaces, subdomains: Optional[list[str]] = None, - query_parameters: Optional[dict] = None ): url = f"{self.base_url}/{namespace.value}" if subdomains is not None: url = f"{url}/{'/'.join(subdomains)}" - if query_parameters is None: - return url - req = PreparedRequest() - req.prepare_url(url, params=query_parameters) - return req.url - + return url +def build_url( + namespace: Namespaces, + subdomains: Optional[list[str]] = None +): + url = f"{API_URL}/{namespace.value}" + if subdomains is not None: + url = f"{url}/{'/'.join(subdomains)}" + return url class AccessManager: """ Manages login, api key, access and refresh tokens """ - def __init__(self, email: str, password: str): + def __init__(self, email: str, password: str, api_key: Optional[str]): self.url_builder = URLBuilder() + self.access_token = None + self.refresh_token = None + self.api_key = None + self.login(email=email, password=password) + + # TODO: Add means to refresh if token expired. 
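# Illustrative sketch (not part of the patched file): how the request plumbing added above
# composes. The request body values are placeholders; the endpoint path mirrors the
# match_agency call defined later in this patch.
from pdap_api_client.PDAPClient import (
    Namespaces, RequestInfo, RequestType, build_url, make_request,
)

url = build_url(namespace=Namespaces.MATCH, subdomains=["agency"])
# -> "https://data-sources-v2.pdap.dev/api/match/agency"

request_info = RequestInfo(
    type_=RequestType.POST,
    url=url,
    json={"name": "Example Police Department", "state": "Pennsylvania", "county": "", "locality": ""},
)
response_info = make_request(request_info)  # raise_for_status() propagates non-2xx responses
print(response_info.status_code, response_info.data)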
+ + def load_api_key(self): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["api-key"] + ) + request_info = RequestInfo( + url=url, + headers=self.jwt_header() + ) + response_info = make_request(request_info) + self.api_key = response_info.data["api_key"] def login(self, email: str, password: str): - url = self.url_builder.build_url( + url = build_url( namespace=Namespaces.AUTH, subdomains=["login"] ) - response = requests.post( + request_info = RequestInfo( url=url, json={ "email": email, "password": password } ) - response.raise_for_status() - # TODO: Finish + response_info = make_request(request_info) + data = response_info.data + self.access_token = data["access_token"] + self.refresh_token = data["refresh_token"] + + + def jwt_header(self) -> dict: + """ + Retrieve JWT header + Returns: Dictionary of Bearer Authorization with JWT key + """ + return { + "Authorization": f"Bearer {self.access_token}" + } + + def api_key_header(self): + """ + Retrieve API key header + Returns: Dictionary of Basic Authorization with API key + + """ + if self.api_key is None: + self.load_api_key() + return { + "Authorization": f"Basic {self.api_key}" + } class PDAPClient: - def __init__(self): - pass + def __init__(self, access_manager: AccessManager): + self.access_manager = access_manager + + def match_agency( + self, + name: str, + state: str, + county: str, + locality: str + ) -> List[MatchAgencyInfo]: + url = build_url( + namespace=Namespaces.MATCH, + subdomains=["agency"] + ) + request_info = RequestInfo( + url=url, + json={ + "name": name, + "state": state, + "county": county, + "locality": locality + } + ) + response_info = make_request(request_info) + return [MatchAgencyInfo(**agency) for agency in response_info.data["agencies"]] - def match_agency(self): - pass def check_for_unique_source_url(self, url: str): pass \ No newline at end of file From 55695fb212880f50b0fe59a1447018e41ffba691 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 18 Dec 2024 16:20:27 -0500 Subject: [PATCH 18/19] Continue draft of PDAP client --- pdap_api_client/DTOs.py | 50 ++++++++++++- pdap_api_client/PDAPClient.py | 136 ++++++++++++++++++++-------------- 2 files changed, 129 insertions(+), 57 deletions(-) diff --git a/pdap_api_client/DTOs.py b/pdap_api_client/DTOs.py index b85511b3..31c8c2cf 100644 --- a/pdap_api_client/DTOs.py +++ b/pdap_api_client/DTOs.py @@ -1,6 +1,54 @@ +from enum import Enum +from http import HTTPStatus +from typing import Optional + from pydantic import BaseModel class MatchAgencyInfo(BaseModel): submitted_name: str - id: str \ No newline at end of file + id: str + +class ApprovalStatus(Enum): + APPROVED = "approved" + REJECTED = "rejected" + PENDING = "pending" + NEEDS_IDENTIFICATION = "needs identification" + + + +class UniqueURLDuplicateInfo(BaseModel): + original_url: str + approval_status: ApprovalStatus + rejection_note: str + +class UniqueURLResponseInfo(BaseModel): + is_unique: bool + duplicates: list[UniqueURLDuplicateInfo] + + +class Namespaces(Enum): + AUTH = "auth" + MATCH = "match" + CHECK = "check" + + +class RequestType(Enum): + POST = "POST" + PUT = "PUT" + GET = "GET" + DELETE = "DELETE" + + +class RequestInfo(BaseModel): + type_: RequestType + url: str + json: Optional[dict] = None + headers: Optional[dict] = None + params: Optional[dict] = None + timeout: Optional[int] = 10 + + +class ResponseInfo(BaseModel): + status_code: HTTPStatus + data: Optional[dict] diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index a0763fec..bdac2e05 100644 
--- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -1,75 +1,42 @@ from http import HTTPStatus -from urllib import parse -from enum import Enum from typing import Optional, List import requests -from pydantic import BaseModel -from requests.models import PreparedRequest -from pdap_api_client.DTOs import MatchAgencyInfo +from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, UniqueURLResponseInfo, Namespaces, \ + RequestType, RequestInfo, ResponseInfo API_URL = "https://data-sources-v2.pdap.dev/api" -class Namespaces(Enum): - AUTH = "auth" - MATCH = "match" - -class RequestType(Enum): - POST = "POST" - PUT = "PUT" - GET = "GET" - DELETE = "DELETE" - -class RequestInfo(BaseModel): - type_: RequestType - url: str - json: Optional[dict] = None - headers: Optional[dict] = None - params: Optional[dict] = None - timeout: Optional[int] = None - -class ResponseInfo(BaseModel): - status_code: HTTPStatus - data: Optional[dict] - request_methods = { RequestType.POST: requests.post, RequestType.PUT: requests.put, RequestType.GET: requests.get, RequestType.DELETE: requests.delete, } -def make_request(ri: RequestInfo) -> ResponseInfo: - response = request_methods[ri.type_]( - ri.url, - json=ri.json, - headers=ri.headers, - params=ri.params, - timeout=ri.timeout - ) - response.raise_for_status() - return ResponseInfo( - status_code=response.status_code, - data=response.json() - ) +class CustomHTTPException(Exception): + pass -class URLBuilder: - - def __init__(self): - self.base_url = API_URL +def make_request(ri: RequestInfo) -> ResponseInfo: + try: + response = request_methods[ri.type_]( + ri.url, + json=ri.json, + headers=ri.headers, + params=ri.params, + timeout=ri.timeout + ) + response.raise_for_status() + except requests.RequestException as e: + raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") + return ResponseInfo( + status_code=HTTPStatus(response.status_code), + data=response.json() + ) - def build_url( - self, - namespace: Namespaces, - subdomains: Optional[list[str]] = None, - ): - url = f"{self.base_url}/{namespace.value}" - if subdomains is not None: - url = f"{url}/{'/'.join(subdomains)}" - return url def build_url( namespace: Namespaces, @@ -85,7 +52,6 @@ class AccessManager: Manages login, api key, access and refresh tokens """ def __init__(self, email: str, password: str, api_key: Optional[str]): - self.url_builder = URLBuilder() self.access_token = None self.refresh_token = None self.api_key = None @@ -99,18 +65,49 @@ def load_api_key(self): subdomains=["api-key"] ) request_info = RequestInfo( + type_ = RequestType.POST, url=url, headers=self.jwt_header() ) response_info = make_request(request_info) self.api_key = response_info.data["api_key"] + def refresh_access_token(self): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["refresh-session"], + ) + raise NotImplementedError("Waiting on https://github.com/Police-Data-Accessibility-Project/data-sources-app/issues/566") + + def make_request(self, ri: RequestInfo) -> ResponseInfo: + try: + response = request_methods[ri.type_]( + ri.url, + json=ri.json, + headers=ri.headers, + params=ri.params, + timeout=ri.timeout + ) + response.raise_for_status() + except requests.RequestException as e: + # TODO: Precise string matching here is brittle. Consider changing later. + if e.response.json().message == "Token is expired. 
Please request a new token.": + self.refresh_access_token() + return make_request(ri) + else: + raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") + return ResponseInfo( + status_code=HTTPStatus(response.status_code), + data=response.json() + ) + def login(self, email: str, password: str): url = build_url( namespace=Namespaces.AUTH, subdomains=["login"] ) request_info = RequestInfo( + type_=RequestType.POST, url=url, json={ "email": email, @@ -157,11 +154,15 @@ def match_agency( county: str, locality: str ) -> List[MatchAgencyInfo]: + """ + Returns agencies, if any, that match or partially match the search criteria + """ url = build_url( namespace=Namespaces.MATCH, subdomains=["agency"] ) request_info = RequestInfo( + type_=RequestType.POST, url=url, json={ "name": name, @@ -174,5 +175,28 @@ def match_agency( return [MatchAgencyInfo(**agency) for agency in response_info.data["agencies"]] - def check_for_unique_source_url(self, url: str): - pass \ No newline at end of file + def is_url_unique( + self, + url_to_check: str + ) -> UniqueURLResponseInfo: + """ + Check if a URL is unique. Returns duplicate info otherwise + """ + url = build_url( + namespace=Namespaces.CHECK, + subdomains=["unique-url"] + ) + request_info = RequestInfo( + type_=RequestType.GET, + url=url, + params={ + "url": url_to_check + } + ) + response_info = make_request(request_info) + duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]] + is_unique = (len(duplicates) == 0) + return UniqueURLResponseInfo( + is_unique=is_unique, + duplicates=duplicates + ) From e8d599eb87b6d9f33d8b4e7a443e62500831519f Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 18 Dec 2024 16:23:11 -0500 Subject: [PATCH 19/19] Refactor: Move AccessManager to separate file --- pdap_api_client/AccessManager.py | 123 ++++++++++++++++++++++++++ pdap_api_client/PDAPClient.py | 147 ++----------------------------- 2 files changed, 128 insertions(+), 142 deletions(-) create mode 100644 pdap_api_client/AccessManager.py diff --git a/pdap_api_client/AccessManager.py b/pdap_api_client/AccessManager.py new file mode 100644 index 00000000..87877466 --- /dev/null +++ b/pdap_api_client/AccessManager.py @@ -0,0 +1,123 @@ +from http import HTTPStatus +from typing import Optional + +import requests + +from pdap_api_client.DTOs import RequestType, Namespaces, RequestInfo, ResponseInfo + +API_URL = "https://data-sources-v2.pdap.dev/api" +request_methods = { + RequestType.POST: requests.post, + RequestType.PUT: requests.put, + RequestType.GET: requests.get, + RequestType.DELETE: requests.delete, +} + + +class CustomHTTPException(Exception): + pass + + +def build_url( + namespace: Namespaces, + subdomains: Optional[list[str]] = None +): + url = f"{API_URL}/{namespace.value}" + if subdomains is not None: + url = f"{url}/{'/'.join(subdomains)}" + return url + + +class AccessManager: + """ + Manages login, api key, access and refresh tokens + """ + def __init__(self, email: str, password: str, api_key: Optional[str] = None): + self.access_token = None + self.refresh_token = None + self.api_key = api_key + self.login(email=email, password=password) + + # TODO: Add means to refresh if token expired. 
+ + def load_api_key(self): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["api-key"] + ) + request_info = RequestInfo( + type_ = RequestType.POST, + url=url, + headers=self.jwt_header() + ) + response_info = self.make_request(request_info) + self.api_key = response_info.data["api_key"] + + def refresh_access_token(self): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["refresh-session"], + ) + raise NotImplementedError("Waiting on https://github.com/Police-Data-Accessibility-Project/data-sources-app/issues/566") + + def make_request(self, ri: RequestInfo) -> ResponseInfo: + try: + response = request_methods[ri.type_]( + ri.url, + json=ri.json, + headers=ri.headers, + params=ri.params, + timeout=ri.timeout + ) + response.raise_for_status() + except requests.RequestException as e: + # TODO: Precise string matching here is brittle. Consider changing later. + if e.response.json().message == "Token is expired. Please request a new token.": + self.refresh_access_token() + return self.make_request(ri) + else: + raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") + return ResponseInfo( + status_code=HTTPStatus(response.status_code), + data=response.json() + ) + + def login(self, email: str, password: str): + url = build_url( + namespace=Namespaces.AUTH, + subdomains=["login"] + ) + request_info = RequestInfo( + type_=RequestType.POST, + url=url, + json={ + "email": email, + "password": password + } + ) + response_info = self.make_request(request_info) + data = response_info.data + self.access_token = data["access_token"] + self.refresh_token = data["refresh_token"] + + + def jwt_header(self) -> dict: + """ + Retrieve JWT header + Returns: Dictionary of Bearer Authorization with JWT key + """ + return { + "Authorization": f"Bearer {self.access_token}" + } + + def api_key_header(self): + """ + Retrieve API key header + Returns: Dictionary of Basic Authorization with API key + + """ + if self.api_key is None: + self.load_api_key() + return { + "Authorization": f"Basic {self.api_key}" + } diff --git a/pdap_api_client/PDAPClient.py b/pdap_api_client/PDAPClient.py index bdac2e05..6c03ce0f 100644 --- a/pdap_api_client/PDAPClient.py +++ b/pdap_api_client/PDAPClient.py @@ -1,145 +1,8 @@ -from http import HTTPStatus -from typing import Optional, List - -import requests +from typing import List +from pdap_api_client.AccessManager import build_url, AccessManager from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, UniqueURLResponseInfo, Namespaces, \ - RequestType, RequestInfo, ResponseInfo - -API_URL = "https://data-sources-v2.pdap.dev/api" - -request_methods = { - RequestType.POST: requests.post, - RequestType.PUT: requests.put, - RequestType.GET: requests.get, - RequestType.DELETE: requests.delete, -} - - -class CustomHTTPException(Exception): - pass - - -def make_request(ri: RequestInfo) -> ResponseInfo: - try: - response = request_methods[ri.type_]( - ri.url, - json=ri.json, - headers=ri.headers, - params=ri.params, - timeout=ri.timeout - ) - response.raise_for_status() - except requests.RequestException as e: - raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") - return ResponseInfo( - status_code=HTTPStatus(response.status_code), - data=response.json() - ) - - -def build_url( - namespace: Namespaces, - subdomains: Optional[list[str]] = None -): - url = f"{API_URL}/{namespace.value}" - if subdomains is not None: - url = f"{url}/{'/'.join(subdomains)}" - return url - -class AccessManager: - """ - 
Manages login, api key, access and refresh tokens - """ - def __init__(self, email: str, password: str, api_key: Optional[str]): - self.access_token = None - self.refresh_token = None - self.api_key = None - self.login(email=email, password=password) - - # TODO: Add means to refresh if token expired. - - def load_api_key(self): - url = build_url( - namespace=Namespaces.AUTH, - subdomains=["api-key"] - ) - request_info = RequestInfo( - type_ = RequestType.POST, - url=url, - headers=self.jwt_header() - ) - response_info = make_request(request_info) - self.api_key = response_info.data["api_key"] - - def refresh_access_token(self): - url = build_url( - namespace=Namespaces.AUTH, - subdomains=["refresh-session"], - ) - raise NotImplementedError("Waiting on https://github.com/Police-Data-Accessibility-Project/data-sources-app/issues/566") - - def make_request(self, ri: RequestInfo) -> ResponseInfo: - try: - response = request_methods[ri.type_]( - ri.url, - json=ri.json, - headers=ri.headers, - params=ri.params, - timeout=ri.timeout - ) - response.raise_for_status() - except requests.RequestException as e: - # TODO: Precise string matching here is brittle. Consider changing later. - if e.response.json().message == "Token is expired. Please request a new token.": - self.refresh_access_token() - return make_request(ri) - else: - raise CustomHTTPException(f"Error making {ri.type_} request to {ri.url}: {e}") - return ResponseInfo( - status_code=HTTPStatus(response.status_code), - data=response.json() - ) - - def login(self, email: str, password: str): - url = build_url( - namespace=Namespaces.AUTH, - subdomains=["login"] - ) - request_info = RequestInfo( - type_=RequestType.POST, - url=url, - json={ - "email": email, - "password": password - } - ) - response_info = make_request(request_info) - data = response_info.data - self.access_token = data["access_token"] - self.refresh_token = data["refresh_token"] - - - def jwt_header(self) -> dict: - """ - Retrieve JWT header - Returns: Dictionary of Bearer Authorization with JWT key - """ - return { - "Authorization": f"Bearer {self.access_token}" - } - - def api_key_header(self): - """ - Retrieve API key header - Returns: Dictionary of Basic Authorization with API key - - """ - if self.api_key is None: - self.load_api_key() - return { - "Authorization": f"Basic {self.api_key}" - } + RequestType, RequestInfo class PDAPClient: @@ -171,7 +34,7 @@ def match_agency( "locality": locality } ) - response_info = make_request(request_info) + response_info = self.access_manager.make_request(request_info) return [MatchAgencyInfo(**agency) for agency in response_info.data["agencies"]] @@ -193,7 +56,7 @@ def is_url_unique( "url": url_to_check } ) - response_info = make_request(request_info) + response_info = self.access_manager.make_request(request_info) duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]] is_unique = (len(duplicates) == 0) return UniqueURLResponseInfo(
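
For reference, a minimal usage sketch of the client as drafted in this series. It assumes the classes behave as defined in the patches above, that the dev API at https://data-sources-v2.pdap.dev is reachable, and that valid credentials are available; the environment variable names and the example agency and URL values below are purely illustrative, not part of the patch series.

    # Illustrative sketch only; assumes the drafted AccessManager/PDAPClient API above.
    import os

    from pdap_api_client.AccessManager import AccessManager
    from pdap_api_client.PDAPClient import PDAPClient

    # Login happens in the constructor; PDAP_EMAIL/PDAP_PASSWORD are hypothetical env var names.
    access_manager = AccessManager(
        email=os.environ["PDAP_EMAIL"],
        password=os.environ["PDAP_PASSWORD"],
    )
    client = PDAPClient(access_manager=access_manager)

    # Look for agencies that match or partially match the search criteria.
    matches = client.match_agency(
        name="Example Police Department",
        state="PA",
        county="Allegheny",
        locality="Pittsburgh",
    )
    for match in matches:
        print(match.submitted_name, match.id)

    # Check whether a source URL is already known; duplicates are returned otherwise.
    result = client.is_url_unique("https://example.com/use-of-force-records")
    print(result.is_unique)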