Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions pdap_api_client/AccessManager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from http import HTTPStatus

Check warning on line 1 in pdap_api_client/AccessManager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] pdap_api_client/AccessManager.py#L1 <100>

Missing docstring in public module
Raw output
./pdap_api_client/AccessManager.py:1:1: D100 Missing docstring in public module
from typing import Optional

import requests

from pdap_api_client.DTOs import RequestType, Namespaces, RequestInfo, ResponseInfo

# Root of the PDAP data-sources API; build_url() prefixes every endpoint with it.
API_URL = "https://data-sources-v2.pdap.dev/api"
# Dispatch table mapping each RequestType member to the `requests` function
# that performs that HTTP verb.
request_methods = {
    RequestType.POST: requests.post,
    RequestType.PUT: requests.put,
    RequestType.GET: requests.get,
    RequestType.DELETE: requests.delete,
}


class CustomHTTPException(Exception):
    """Raised by ``AccessManager.make_request`` when an HTTP request fails."""


def build_url(
    namespace: Namespaces,
    subdomains: Optional[list[str]] = None
) -> str:
    """
    Build the full URL of an API endpoint.

    Args:
        namespace: API namespace; its enum value becomes the first path
            segment after ``API_URL``.
        subdomains: Optional further path segments, joined with "/" and
            appended after the namespace. ``None`` or an empty list adds
            nothing.

    Returns:
        The endpoint URL as a string.
    """
    url = f"{API_URL}/{namespace.value}"
    # Truthiness (rather than `is not None`) so an empty list does not leave
    # a dangling trailing "/" on the URL.
    if subdomains:
        url = f"{url}/{'/'.join(subdomains)}"
    return url


class AccessManager:
    """
    Manages login, api key, access and refresh tokens
    """

    def __init__(self, email: str, password: str, api_key: Optional[str] = None):
        """
        Store the optional API key and log in immediately.

        Args:
            email: Email sent to the login endpoint.
            password: Password sent to the login endpoint.
            api_key: Existing API key, if any. When omitted, one is requested
                the first time ``api_key_header`` is called.
        """
        self.access_token = None
        self.refresh_token = None
        self.api_key = api_key
        self.login(email=email, password=password)

    # TODO: Add means to refresh if token expired.

    def load_api_key(self):
        """Request an API key, authenticating with the JWT header, and store it."""
        url = build_url(
            namespace=Namespaces.AUTH,
            subdomains=["api-key"]
        )
        request_info = RequestInfo(
            type_=RequestType.POST,
            url=url,
            headers=self.jwt_header()
        )
        response_info = self.make_request(request_info)
        self.api_key = response_info.data["api_key"]

    def refresh_access_token(self):
        """
        Refresh the access token. Not implemented yet.

        Raises:
            NotImplementedError: Always.
        """
        # The intended endpoint is
        # build_url(namespace=Namespaces.AUTH, subdomains=["refresh-session"]);
        # it is not built here because nothing can use it yet.
        raise NotImplementedError("Waiting on https://github.com/Police-Data-Accessibility-Project/data-sources-app/issues/566")

    def make_request(self, ri: RequestInfo) -> ResponseInfo:
        """
        Send the request described by ``ri`` and wrap the JSON response.

        If the server reports an expired token, the access token is refreshed
        and the request is sent again.

        Args:
            ri: Method, URL, JSON body, headers, query params and timeout.

        Returns:
            ResponseInfo holding the HTTP status and the parsed JSON body.

        Raises:
            CustomHTTPException: If the request fails for any reason other
                than an expired token.
        """
        try:
            response = request_methods[ri.type_](
                ri.url,
                json=ri.json,
                headers=ri.headers,
                params=ri.params,
                timeout=ri.timeout
            )
            response.raise_for_status()
        except requests.RequestException as e:
            # TODO: Precise string matching here is brittle. Consider changing later.
            if self._error_message(e) == "Token is expired. Please request a new token.":
                self.refresh_access_token()
                # NOTE(review): `ri.headers` is re-sent unchanged, so a header
                # built from the old token will need rebuilding once refresh
                # is implemented.
                return self.make_request(ri)
            # Chain the original error so the underlying traceback is kept.
            raise CustomHTTPException(
                f"Error making {ri.type_} request to {ri.url}: {e}"
            ) from e
        return ResponseInfo(
            status_code=HTTPStatus(response.status_code),
            data=response.json()
        )

    @staticmethod
    def _error_message(error) -> Optional[str]:
        """Return the ``message`` field of the error response's JSON body, or None."""
        response = error.response
        if response is None:
            # Connection errors and timeouts carry no response at all.
            return None
        try:
            body = response.json()
        except ValueError:
            # The error body is not JSON (e.g. an HTML error page).
            return None
        if isinstance(body, dict):
            return body.get("message")
        return None

    def login(self, email: str, password: str):
        """
        Log in and store the returned access and refresh tokens.

        Args:
            email: Email sent in the login request body.
            password: Password sent in the login request body.
        """
        url = build_url(
            namespace=Namespaces.AUTH,
            subdomains=["login"]
        )
        request_info = RequestInfo(
            type_=RequestType.POST,
            url=url,
            json={
                "email": email,
                "password": password
            }
        )
        response_info = self.make_request(request_info)
        data = response_info.data
        self.access_token = data["access_token"]
        self.refresh_token = data["refresh_token"]

    def jwt_header(self) -> dict:
        """
        Retrieve JWT header
        Returns: Dictionary of Bearer Authorization with JWT key
        """
        return {
            "Authorization": f"Bearer {self.access_token}"
        }

    def api_key_header(self):
        """
        Retrieve API key header, requesting an API key first if none is stored
        Returns: Dictionary of Basic Authorization with API key
        """
        if self.api_key is None:
            self.load_api_key()
        return {
            "Authorization": f"Basic {self.api_key}"
        }
54 changes: 54 additions & 0 deletions pdap_api_client/DTOs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from enum import Enum

Check warning on line 1 in pdap_api_client/DTOs.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] pdap_api_client/DTOs.py#L1 <100>

Missing docstring in public module
Raw output
./pdap_api_client/DTOs.py:1:1: D100 Missing docstring in public module
from http import HTTPStatus
from typing import Optional

from pydantic import BaseModel


class MatchAgencyInfo(BaseModel):
    """One agency entry, as returned by ``PDAPClient.match_agency``."""
    submitted_name: str
    id: str

class ApprovalStatus(Enum):
    """Approval status values carried by ``UniqueURLDuplicateInfo``."""
    APPROVED = "approved"
    REJECTED = "rejected"
    PENDING = "pending"
    NEEDS_IDENTIFICATION = "needs identification"



class UniqueURLDuplicateInfo(BaseModel):
    """One duplicate entry reported when checking whether a URL is unique."""
    original_url: str
    approval_status: ApprovalStatus
    rejection_note: str

class UniqueURLResponseInfo(BaseModel):
    """Result of a URL uniqueness check: a flag plus the duplicates found."""
    is_unique: bool
    duplicates: list[UniqueURLDuplicateInfo]


class Namespaces(Enum):
    """API namespaces; each value is used as a URL path segment."""
    AUTH = "auth"
    MATCH = "match"
    CHECK = "check"


class RequestType(Enum):
    """HTTP methods a request can use."""
    POST = "POST"
    PUT = "PUT"
    GET = "GET"
    DELETE = "DELETE"


class RequestInfo(BaseModel):
    """Everything needed to issue one HTTP request."""
    # HTTP method to use.
    type_: RequestType
    # Full URL of the endpoint.
    url: str
    # JSON request body, if any.
    # NOTE(review): the name `json` shadows an attribute of pydantic's
    # BaseModel - confirm the installed pydantic version tolerates this.
    json: Optional[dict] = None
    # Request headers, if any.
    headers: Optional[dict] = None
    # Query-string parameters, if any.
    params: Optional[dict] = None
    # Timeout handed to the HTTP call; defaults to 10.
    timeout: Optional[int] = 10


class ResponseInfo(BaseModel):
    """HTTP status and body data of a completed request."""
    status_code: HTTPStatus
    data: Optional[dict]
65 changes: 65 additions & 0 deletions pdap_api_client/PDAPClient.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from typing import List

Check warning on line 1 in pdap_api_client/PDAPClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] pdap_api_client/PDAPClient.py#L1 <100>

Missing docstring in public module
Raw output
./pdap_api_client/PDAPClient.py:1:1: D100 Missing docstring in public module

from pdap_api_client.AccessManager import build_url, AccessManager
from pdap_api_client.DTOs import MatchAgencyInfo, UniqueURLDuplicateInfo, UniqueURLResponseInfo, Namespaces, \
RequestType, RequestInfo


class PDAPClient:
    """Client for PDAP API endpoints; sends its requests through an AccessManager."""

    def __init__(self, access_manager: AccessManager):
        """
        Args:
            access_manager: Object whose ``make_request`` performs the HTTP calls.
        """
        self.access_manager = access_manager

    def match_agency(
        self,
        name: str,
        state: str,
        county: str,
        locality: str
    ) -> List[MatchAgencyInfo]:
        """
        Returns agencies, if any, that match or partially match the search criteria
        """
        url = build_url(
            namespace=Namespaces.MATCH,
            subdomains=["agency"]
        )
        request_info = RequestInfo(
            type_=RequestType.POST,
            url=url,
            json={
                "name": name,
                "state": state,
                "county": county,
                "locality": locality
            }
        )
        response_info = self.access_manager.make_request(request_info)
        return [MatchAgencyInfo(**agency) for agency in response_info.data["agencies"]]

    def is_url_unique(
        self,
        url_to_check: str
    ) -> UniqueURLResponseInfo:
        """
        Check if a URL is unique. Returns duplicate info otherwise
        """
        url = build_url(
            namespace=Namespaces.CHECK,
            subdomains=["unique-url"]
        )
        request_info = RequestInfo(
            type_=RequestType.GET,
            url=url,
            params={
                "url": url_to_check
            }
        )
        response_info = self.access_manager.make_request(request_info)
        duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]]
        # The URL is unique exactly when no duplicates were reported.
        return UniqueURLResponseInfo(
            is_unique=not duplicates,
            duplicates=duplicates
        )
Empty file added pdap_api_client/__init__.py
Empty file.
Empty file added source_collectors/__init__.py
Empty file.
2 changes: 0 additions & 2 deletions source_collectors/muckrock/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ pip install -r requirements.txt

### 2. Clone Muckrock database & search locally

~~- `download_muckrock_foia.py` `search_local_foia_json.py`~~ (deprecated)

- scripts to clone the MuckRock foia requests collection for fast local querying (total size <2GB at present)

- `create_foia_data_db.py` creates and populates a SQLite database (`foia_data.db`) with all MuckRock foia requests. Various errors outside the scope of this script may occur; a counter (`last_page_fetched.txt`) is created to keep track of the most recent page fetched and inserted into the database. If the program exits prematurely, simply run `create_foia_data_db.py` again to continue where you left off. A log file is created to capture errors for later reference.
Expand Down
Empty file.
65 changes: 65 additions & 0 deletions source_collectors/muckrock/classes/FOIADBSearcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os

Check warning on line 1 in source_collectors/muckrock/classes/FOIADBSearcher.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] source_collectors/muckrock/classes/FOIADBSearcher.py#L1 <100>

Missing docstring in public module
Raw output
./source_collectors/muckrock/classes/FOIADBSearcher.py:1:1: D100 Missing docstring in public module
import sqlite3

import pandas as pd

from source_collectors.muckrock.constants import FOIA_DATA_DB

# Looks up a table named `results` in sqlite_master; an empty result set
# means the table does not exist.
check_results_table_query = """
SELECT name FROM sqlite_master
WHERE (type = 'table')
AND (name = 'results')
"""

# Selects rows of `results` with status 'done' whose title or tags match a
# LIKE pattern. Takes two positional parameters, one per `?`.
search_foia_query = """
SELECT * FROM results
WHERE (title LIKE ? OR tags LIKE ?)
AND (status = 'done')
"""


class FOIADBSearcher:
    """Searches the `results` table of a local SQLite FOIA database."""

    def __init__(self, db_path=FOIA_DATA_DB):
        """
        Args:
            db_path: Path to the SQLite database file.

        Raises:
            FileNotFoundError: If nothing exists at ``db_path``.
        """
        self.db_path = db_path
        if not os.path.exists(self.db_path):
            raise FileNotFoundError("foia_data.db does not exist.\nRun create_foia_data_db.py first to create and populate it.")

    def search(self, search_string: str) -> pd.DataFrame | None:
        """
        Searches the foia_data.db database for FOIA request entries matching the provided search string.

        Args:
            search_string (str): The string to search for in the `title` and `tags` of the `results` table.

        Returns:
            Union[pandas.DataFrame, None]:
                - pandas.DataFrame: A DataFrame containing the matching entries from the database.
                - None: If the `results` table is missing or an error occurs during the
                  database operation. Errors are printed, not raised.
        """
        try:
            # sqlite3's own context manager only commits or rolls back; it does
            # not close the connection, so close it explicitly.
            conn = sqlite3.connect(self.db_path)
            try:
                results_table = pd.read_sql_query(check_results_table_query, conn)
                if results_table.empty:
                    print("The `results` table does not exist in the database.")
                    return None

                # The same wildcard pattern is matched against title and tags.
                pattern = f"%{search_string}%"
                df = pd.read_sql_query(
                    sql=search_foia_query,
                    con=conn,
                    params=[pattern, pattern]
                )
            finally:
                conn.close()

        except sqlite3.Error as e:
            print(f"Sqlite error: {e}")
            return None
        except Exception as e:
            # Deliberate best-effort: anything else is reported and
            # turned into None as well.
            print(f"An unexpected error occurred: {e}")
            return None

        return df

Check warning on line 65 in source_collectors/muckrock/classes/FOIADBSearcher.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] source_collectors/muckrock/classes/FOIADBSearcher.py#L65 <292>

no newline at end of file
Raw output
./source_collectors/muckrock/classes/FOIADBSearcher.py:65:18: W292 no newline at end of file
58 changes: 58 additions & 0 deletions source_collectors/muckrock/classes/FOIASearcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from typing import Optional

Check warning on line 1 in source_collectors/muckrock/classes/FOIASearcher.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] source_collectors/muckrock/classes/FOIASearcher.py#L1 <100>

Missing docstring in public module
Raw output
./source_collectors/muckrock/classes/FOIASearcher.py:1:1: D100 Missing docstring in public module

from source_collectors.muckrock.classes.muckrock_fetchers import FOIAFetcher
from tqdm import tqdm

class FOIASearcher:
    """
    Used for searching FOIA data from MuckRock
    """

    def __init__(self, fetcher: FOIAFetcher, search_term: Optional[str] = None):
        """
        Args:
            fetcher: Source of result pages; its ``fetch_next_page`` is called
                once per page.
            search_term: Optional text to look for in result titles
                (case-insensitive). When empty or None, nothing is filtered out.
        """
        self.fetcher = fetcher
        self.search_term = search_term

    def fetch_page(self) -> dict | None:
        """
        Fetches the next page of results using the fetcher.

        Returns None when the fetcher yields nothing or the page has no
        "results" entry.
        """
        data = self.fetcher.fetch_next_page()
        if data is None or data.get("results") is None:
            return None
        return data

    def filter_results(self, results: list[dict]) -> list[dict]:
        """
        Filters the results based on the search term.
        Override or modify as needed for custom filtering logic.
        """
        if not self.search_term:
            return results
        # Lower-case the term once instead of once per result.
        needle = self.search_term.lower()
        return [result for result in results if needle in result["title"].lower()]

    def update_progress(self, pbar: tqdm, results: list[dict]) -> int:
        """
        Updates the progress bar and returns the count of results processed.
        """
        num_results = len(results)
        pbar.update(num_results)
        return num_results

    def search_to_count(self, max_count: int) -> list[dict]:
        """
        Fetches and processes results up to a maximum count.

        Pages are fetched until ``max_count`` matching results are collected
        or no further page is available. At most ``max_count`` results are
        returned.
        """
        remaining = max_count
        all_results = []
        with tqdm(total=max_count, desc="Fetching results", unit="result") as pbar:
            while remaining > 0:
                data = self.fetch_page()
                if not data:
                    break

                # Keep only as many matches as still fit, so the last page
                # cannot push the total (or the progress bar) past max_count.
                results = self.filter_results(data["results"])[:remaining]
                all_results.extend(results)
                remaining -= self.update_progress(pbar, results)

        return all_results

Check warning on line 58 in source_collectors/muckrock/classes/FOIASearcher.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] source_collectors/muckrock/classes/FOIASearcher.py#L58 <391>

blank line at end of file
Raw output
./source_collectors/muckrock/classes/FOIASearcher.py:58:1: W391 blank line at end of file
Loading
Loading