Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Add manual strategy to Batch strategy enum

Revision ID: 028565b77b9e
Revises: e285e6e7cf71
Create Date: 2025-05-03 09:56:51.134406

"""
from typing import Sequence, Union

from alembic import op

Check warning on line 10 in alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py#L10 <401>

'alembic.op' imported but unused
Raw output
./alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py:10:1: F401 'alembic.op' imported but unused
import sqlalchemy as sa

Check warning on line 11 in alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py#L11 <401>

'sqlalchemy as sa' imported but unused
Raw output
./alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py:11:1: F401 'sqlalchemy as sa' imported but unused

from util.alembic_helpers import switch_enum_type

# revision identifiers, used by Alembic.
revision: str = '028565b77b9e'
down_revision: Union[str, None] = 'e285e6e7cf71'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:

Check warning on line 22 in alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py#L22 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py:22:1: D103 Missing docstring in public function
switch_enum_type(
table_name="batches",
column_name="strategy",
enum_name="batch_strategy",
new_enum_values=[
"example",
"ckan",
"muckrock_county_search",
"auto_googler",
"muckrock_all_search",
"muckrock_simple_search",
"common_crawler",
"manual"
],
)


def downgrade() -> None:

Check warning on line 40 in alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py#L40 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py:40:1: D103 Missing docstring in public function
switch_enum_type(
table_name="batches",
column_name="strategy",
enum_name="batch_strategy",
new_enum_values=[
"example",
"ckan",
"muckrock_county_search",
"auto_googler",
"muckrock_all_search",
"muckrock_simple_search",
"common_crawler"
],
)
16 changes: 16 additions & 0 deletions api/routes/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from collector_manager.enums import CollectorType
from core.AsyncCore import AsyncCore
from core.DTOs.CollectorStartInfo import CollectorStartInfo
from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO
from security_manager.SecurityManager import AccessInfo, get_access_info
from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO
from source_collectors.ckan.DTOs import CKANInputDTO
Expand Down Expand Up @@ -122,4 +124,18 @@ async def start_muckrock_all_foia_collector(
collector_type=CollectorType.MUCKROCK_ALL_SEARCH,
dto=dto,
user_id=access_info.user_id
)

@collector_router.post("/manual")
async def upload_manual_collector(
dto: ManualBatchInputDTO,
core: AsyncCore = Depends(get_async_core),
access_info: AccessInfo = Depends(get_access_info),
) -> ManualBatchResponseDTO:
"""
Uploads a manual "collector" with existing data
"""
return await core.upload_manual_batch(
dto=dto,
user_id=access_info.user_id
)
58 changes: 57 additions & 1 deletion collector_db/AsyncDatabaseClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo
from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseErrorInfo, \
GetURLsResponseInnerInfo
from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO
from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo
Expand Down Expand Up @@ -561,7 +563,7 @@
async def get_urls(self, session: AsyncSession, page: int, errors: bool) -> GetURLsResponseInfo:
statement = select(URL).options(
selectinload(URL.error_info)
)
).order_by(URL.id)
if errors:
# Only return URLs with errors
statement = statement.where(
Expand Down Expand Up @@ -1719,6 +1721,60 @@
)
session.add(agency_suggestion)

@session_manager
async def upload_manual_batch(

Check warning on line 1725 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L1725 <102>

Missing docstring in public method
Raw output
./collector_db/AsyncDatabaseClient.py:1725:1: D102 Missing docstring in public method
self,
session: AsyncSession,
user_id: int,
dto: ManualBatchInputDTO
) -> ManualBatchResponseDTO:
batch = Batch(
strategy=CollectorType.MANUAL.value,
status=BatchStatus.READY_TO_LABEL.value,
parameters={
"name": dto.name
},
user_id=user_id
)
session.add(batch)
await session.flush()

batch_id = batch.id
url_ids = []
duplicate_urls = []

for entry in dto.entries:
url = URL(
url=entry.url,
name=entry.name,
description=entry.description,
batch_id=batch_id,
collector_metadata=entry.collector_metadata,
outcome=URLStatus.PENDING.value,
record_type=entry.record_type.value if entry.record_type is not None else None,
)

async with session.begin_nested():
try:
session.add(url)
await session.flush()
except IntegrityError:
duplicate_urls.append(entry.url)
continue
await session.flush()
optional_metadata = URLOptionalDataSourceMetadata(
url_id=url.id,
record_formats=entry.record_formats,
data_portal_type=entry.data_portal_type,
supplying_entity=entry.supplying_entity,
)
session.add(optional_metadata)
url_ids.append(url.id)


return ManualBatchResponseDTO(

Check failure on line 1775 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L1775 <303>

too many blank lines (2)
Raw output
./collector_db/AsyncDatabaseClient.py:1775:9: E303 too many blank lines (2)
batch_id=batch_id,
urls=url_ids,
duplicate_urls=duplicate_urls
)

13 changes: 7 additions & 6 deletions collector_db/DatabaseClient.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from datetime import datetime, timedelta
from functools import wraps
from typing import Optional, List

from sqlalchemy import create_engine, Row
from sqlalchemy import create_engine
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import sessionmaker, scoped_session, aliased
from sqlalchemy.ext.asyncio import AsyncSession

Check warning on line 6 in collector_db/DatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/DatabaseClient.py#L6 <401>

'sqlalchemy.ext.asyncio.AsyncSession' imported but unused
Raw output
./collector_db/DatabaseClient.py:6:1: F401 'sqlalchemy.ext.asyncio.AsyncSession' imported but unused
from sqlalchemy.orm import sessionmaker, scoped_session

from collector_db.ConfigManager import ConfigManager
from collector_db.DTOs.BatchInfo import BatchInfo
from collector_db.DTOs.DuplicateInfo import DuplicateInfo, DuplicateInsertInfo
from collector_db.DTOs.DuplicateInfo import DuplicateInsertInfo
from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo
from collector_db.DTOs.LogInfo import LogInfo, LogOutputInfo
from collector_db.DTOs.LogInfo import LogInfo
from collector_db.DTOs.URLInfo import URLInfo
from collector_db.DTOs.URLMapping import URLMapping
from collector_db.helper_functions import get_postgres_connection_string
from collector_db.models import Base, Batch, URL, Log, Duplicate
from collector_manager.enums import CollectorType
from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO

Check warning on line 18 in collector_db/DatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/DatabaseClient.py#L18 <401>

'core.DTOs.ManualBatchInputDTO.ManualBatchInputDTO' imported but unused
Raw output
./collector_db/DatabaseClient.py:18:1: F401 'core.DTOs.ManualBatchInputDTO.ManualBatchInputDTO' imported but unused
from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO

Check warning on line 19 in collector_db/DatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/DatabaseClient.py#L19 <401>

'core.DTOs.ManualBatchResponseDTO.ManualBatchResponseDTO' imported but unused
Raw output
./collector_db/DatabaseClient.py:19:1: F401 'core.DTOs.ManualBatchResponseDTO.ManualBatchResponseDTO' imported but unused
from core.EnvVarManager import EnvVarManager
from core.enums import BatchStatus

Expand Down
1 change: 1 addition & 0 deletions collector_db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Batch(Base):
'muckrock_all_search',
'muckrock_simple_search',
'common_crawler',
'manual',
name='batch_strategy'),
nullable=False)
user_id = Column(Integer, nullable=False)
Expand Down
1 change: 1 addition & 0 deletions collector_manager/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class CollectorType(Enum):
MUCKROCK_COUNTY_SEARCH = "muckrock_county_search"
MUCKROCK_ALL_SEARCH = "muckrock_all_search"
CKAN = "ckan"
MANUAL = "manual"

class URLStatus(Enum):
PENDING = "pending"
Expand Down
12 changes: 11 additions & 1 deletion core/AsyncCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from core.DTOs.GetTasksResponse import GetTasksResponse
from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse
from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo
from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO
from core.DTOs.MessageResponse import MessageResponse
from core.TaskManager import TaskManager
from core.enums import BatchStatus, RecordType
Expand Down Expand Up @@ -270,5 +272,13 @@
user_id=access_info.user_id
)


async def upload_manual_batch(

Check warning on line 275 in core/AsyncCore.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/AsyncCore.py#L275 <102>

Missing docstring in public method
Raw output
./core/AsyncCore.py:275:1: D102 Missing docstring in public method
self,
dto: ManualBatchInputDTO,
user_id: int
) -> ManualBatchResponseDTO:
return await self.adb_client.upload_manual_batch(
user_id=user_id,
dto=dto
)

24 changes: 24 additions & 0 deletions core/DTOs/ManualBatchInputDTO.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import Optional

Check warning on line 1 in core/DTOs/ManualBatchInputDTO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/ManualBatchInputDTO.py#L1 <100>

Missing docstring in public module
Raw output
./core/DTOs/ManualBatchInputDTO.py:1:1: D100 Missing docstring in public module

from pydantic import BaseModel, Field

from core.enums import RecordType


class ManualBatchInnerInputDTO(BaseModel):

Check warning on line 8 in core/DTOs/ManualBatchInputDTO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/ManualBatchInputDTO.py#L8 <101>

Missing docstring in public class
Raw output
./core/DTOs/ManualBatchInputDTO.py:8:1: D101 Missing docstring in public class
url: str
name: Optional[str] = None
description: Optional[str] = None
collector_metadata: Optional[dict] = None
record_type: Optional[RecordType] = None
record_formats: Optional[list[str]] = None
data_portal_type: Optional[str] = None
supplying_entity: Optional[str] = None


class ManualBatchInputDTO(BaseModel):

Check warning on line 19 in core/DTOs/ManualBatchInputDTO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/ManualBatchInputDTO.py#L19 <101>

Missing docstring in public class
Raw output
./core/DTOs/ManualBatchInputDTO.py:19:1: D101 Missing docstring in public class
name: str
entries: list[ManualBatchInnerInputDTO] = Field(
min_length=1,
max_length=1000
)

Check warning on line 24 in core/DTOs/ManualBatchInputDTO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/ManualBatchInputDTO.py#L24 <292>

no newline at end of file
Raw output
./core/DTOs/ManualBatchInputDTO.py:24:6: W292 no newline at end of file
7 changes: 7 additions & 0 deletions core/DTOs/ManualBatchResponseDTO.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from pydantic import BaseModel

Check warning on line 1 in core/DTOs/ManualBatchResponseDTO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/ManualBatchResponseDTO.py#L1 <100>

Missing docstring in public module
Raw output
./core/DTOs/ManualBatchResponseDTO.py:1:1: D100 Missing docstring in public module


class ManualBatchResponseDTO(BaseModel):

Check warning on line 4 in core/DTOs/ManualBatchResponseDTO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/ManualBatchResponseDTO.py#L4 <101>

Missing docstring in public class
Raw output
./core/DTOs/ManualBatchResponseDTO.py:4:1: D101 Missing docstring in public class
batch_id: int
urls: list[int]
duplicate_urls: list[str]

Check warning on line 7 in core/DTOs/ManualBatchResponseDTO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/ManualBatchResponseDTO.py#L7 <292>

no newline at end of file
Raw output
./core/DTOs/ManualBatchResponseDTO.py:7:30: W292 no newline at end of file
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[project]
name="source-collector"
version="0.1.0"
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from http import HTTPStatus
from typing import Optional, Annotated

from fastapi import HTTPException
from pydantic import BaseModel
from starlette.testclient import TestClient

Expand All @@ -24,6 +25,8 @@
from core.DTOs.GetTasksResponse import GetTasksResponse
from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse
from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo
from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO
from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO
from core.DTOs.MessageResponse import MessageResponse
from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo
from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo
Expand All @@ -32,7 +35,11 @@


class ExpectedResponseInfo(BaseModel):
status_code: Annotated[HTTPStatus, "The expected status code"] = HTTPStatus.OK
status_code: Annotated[
HTTPStatus,
"The expected status code"
] = HTTPStatus.OK
message: Optional[str] = None

class RequestValidator:
"""
Expand Down Expand Up @@ -64,6 +71,31 @@
assert response.status_code == expected_response.status_code, response.text
return response.json()

def open_v2(
self,
method: str,
url: str,
params: Optional[dict] = None,
**kwargs
) -> dict:
"""
Variation on open that raises an exception rather than check the status code
"""
if params:
kwargs["params"] = params
response = self.client.request(
method=method,
url=url,
headers={"Authorization": "Bearer token"}, # Fake authentication that is overridden during testing
**kwargs
)
if response.status_code != HTTPStatus.OK:
raise HTTPException(
status_code=response.status_code,
detail=response.json()
)
return response.json()

def get(
self,
url: str,
Expand Down Expand Up @@ -94,6 +126,20 @@
**kwargs
)

def post_v2(

Check warning on line 129 in tests/test_automated/integration/api/helpers/RequestValidator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/test_automated/integration/api/helpers/RequestValidator.py#L129 <102>

Missing docstring in public method
Raw output
./tests/test_automated/integration/api/helpers/RequestValidator.py:129:1: D102 Missing docstring in public method
self,
url: str,
params: Optional[dict] = None,
**kwargs
) -> dict:
return self.open_v2(
method="POST",
url=url,
params=params,
**kwargs
)


def put(
self,
url: str,
Expand Down Expand Up @@ -329,4 +375,14 @@
params=params,
json=all_annotations_post_info.model_dump(mode='json')
)
return GetNextURLForAllAnnotationResponse(**data)
return GetNextURLForAllAnnotationResponse(**data)

async def submit_manual_batch(

Check warning on line 380 in tests/test_automated/integration/api/helpers/RequestValidator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/test_automated/integration/api/helpers/RequestValidator.py#L380 <102>

Missing docstring in public method
Raw output
./tests/test_automated/integration/api/helpers/RequestValidator.py:380:1: D102 Missing docstring in public method
self,
dto: ManualBatchInputDTO,
) -> ManualBatchResponseDTO:
data = self.post_v2(
url="/collector/manual",
json=dto.model_dump(mode='json'),
)
return ManualBatchResponseDTO(**data)

Check warning on line 388 in tests/test_automated/integration/api/helpers/RequestValidator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/test_automated/integration/api/helpers/RequestValidator.py#L388 <292>

no newline at end of file
Raw output
./tests/test_automated/integration/api/helpers/RequestValidator.py:388:46: W292 no newline at end of file
Loading