Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Create url_checked_for_duplicate table

Revision ID: 864107b703ae
Revises: 9d4002437ebe
Create Date: 2025-05-13 07:04:22.592396

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from util.alembic_helpers import switch_enum_type

# revision identifiers, used by Alembic.
revision: str = '864107b703ae'
down_revision: Union[str, None] = '9d4002437ebe'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:

Check warning on line 22 in alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py#L22 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py:22:1: D103 Missing docstring in public function
op.create_table(
'url_checked_for_duplicate',
sa.Column(
'id',
sa.Integer(),
primary_key=True
),
sa.Column(
'url_id',
sa.Integer(),
sa.ForeignKey(
'urls.id',
ondelete='CASCADE'
),
nullable=False
),
sa.Column(
'created_at',
sa.DateTime(),
nullable=False,
server_default=sa.text('now()')
),
)

switch_enum_type(
table_name='tasks',
column_name='task_type',
enum_name='task_type',
new_enum_values=[
"HTML",
"Relevancy",
"Record Type",
"Agency Identification",
"Misc Metadata",
"Submit Approved URLs",
"Duplicate Detection"
]
)


def downgrade() -> None:

Check warning on line 63 in alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py#L63 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_05_13_0704-864107b703ae_create_url_checked_for_duplicate_table.py:63:1: D103 Missing docstring in public function
op.drop_table('url_checked_for_duplicate')

switch_enum_type(
table_name='tasks',
column_name='task_type',
enum_name='task_type',
new_enum_values=[
"HTML",
"Relevancy",
"Record Type",
"Agency Identification",
"Misc Metadata",
"Submit Approved URLs",
]
)
47 changes: 46 additions & 1 deletion collector_db/AsyncDatabaseClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \
UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \
UserRecordTypeSuggestion, ReviewingUserURL, URLOptionalDataSourceMetadata, ConfirmedURLAgency, Duplicate, Log, \
BacklogSnapshot, URLDataSource
BacklogSnapshot, URLDataSource, URLCheckedForDuplicate
from collector_manager.enums import URLStatus, CollectorType
from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo
Expand Down Expand Up @@ -60,6 +60,7 @@
from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo
from core.DTOs.task_data_objects.URLDuplicateTDO import URLDuplicateTDO
from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo
from core.EnvVarManager import EnvVarManager
from core.enums import BatchStatus, SuggestionType, RecordType
Expand Down Expand Up @@ -2224,4 +2225,48 @@

session.add(snapshot)

@session_manager
async def has_pending_urls_not_checked_for_duplicates(self, session: AsyncSession) -> bool:

Check warning on line 2229 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L2229 <102>

Missing docstring in public method
Raw output
./collector_db/AsyncDatabaseClient.py:2229:1: D102 Missing docstring in public method
query = (select(
URL.id
).outerjoin(
URLCheckedForDuplicate,
URL.id == URLCheckedForDuplicate.url_id
).where(
URL.outcome == URLStatus.PENDING.value,
URLCheckedForDuplicate.id == None

Check failure on line 2237 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L2237 <711>

comparison to None should be 'if cond is None:'
Raw output
./collector_db/AsyncDatabaseClient.py:2237:39: E711 comparison to None should be 'if cond is None:'
).limit(1)
)

raw_result = await session.execute(query)
result = raw_result.one_or_none()
return result is not None

@session_manager
async def get_pending_urls_not_checked_for_duplicates(self, session: AsyncSession) -> List[URLDuplicateTDO]:

Check warning on line 2246 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L2246 <102>

Missing docstring in public method
Raw output
./collector_db/AsyncDatabaseClient.py:2246:1: D102 Missing docstring in public method
query = (select(
URL
).outerjoin(
URLCheckedForDuplicate,
URL.id == URLCheckedForDuplicate.url_id
).where(
URL.outcome == URLStatus.PENDING.value,
URLCheckedForDuplicate.id == None

Check failure on line 2254 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L2254 <711>

comparison to None should be 'if cond is None:'
Raw output
./collector_db/AsyncDatabaseClient.py:2254:39: E711 comparison to None should be 'if cond is None:'
).limit(100)
)

raw_result = await session.execute(query)
urls = raw_result.scalars().all()
return [URLDuplicateTDO(url=url.url, url_id=url.id) for url in urls]


@session_manager

Check failure on line 2263 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L2263 <303>

too many blank lines (2)
Raw output
./collector_db/AsyncDatabaseClient.py:2263:5: E303 too many blank lines (2)
async def mark_all_as_duplicates(self, session: AsyncSession, url_ids: List[int]):

Check warning on line 2264 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L2264 <102>

Missing docstring in public method
Raw output
./collector_db/AsyncDatabaseClient.py:2264:1: D102 Missing docstring in public method
query = update(URL).where(URL.id.in_(url_ids)).values(outcome=URLStatus.DUPLICATE.value)
await session.execute(query)

@session_manager
async def mark_as_checked_for_duplicates(self, session: AsyncSession, url_ids: list[int]):

Check warning on line 2269 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L2269 <102>

Missing docstring in public method
Raw output
./collector_db/AsyncDatabaseClient.py:2269:1: D102 Missing docstring in public method
for url_id in url_ids:
url_checked_for_duplicate = URLCheckedForDuplicate(url_id=url_id)
session.add(url_checked_for_duplicate)
1 change: 1 addition & 0 deletions collector_db/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class TaskType(PyEnum):
AGENCY_IDENTIFICATION = "Agency Identification"
MISC_METADATA = "Misc Metadata"
SUBMIT_APPROVED = "Submit Approved URLs"
DUPLICATE_DETECTION = "Duplicate Detection"
IDLE = "Idle"

class PGEnum(TypeDecorator):
Expand Down
14 changes: 14 additions & 0 deletions collector_db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,21 @@
back_populates="url",
uselist=False
)
checked_for_duplicate = relationship(
"URLCheckedForDuplicate",
uselist=False,
back_populates="url"
)

class URLCheckedForDuplicate(Base):

Check warning on line 150 in collector_db/models.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/models.py#L150 <101>

Missing docstring in public class
Raw output
./collector_db/models.py:150:1: D101 Missing docstring in public class
__tablename__ = 'url_checked_for_duplicate'

id = Column(Integer, primary_key=True)
url_id = Column(Integer, ForeignKey('urls.id'), nullable=False)
created_at = get_created_at_column()

# Relationships
url = relationship("URL", uselist=False, back_populates="checked_for_duplicate")

class URLOptionalDataSourceMetadata(Base):
__tablename__ = 'url_optional_data_source_metadata'
Expand Down
9 changes: 9 additions & 0 deletions core/DTOs/task_data_objects/URLDuplicateTDO.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from typing import Optional

Check warning on line 1 in core/DTOs/task_data_objects/URLDuplicateTDO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/task_data_objects/URLDuplicateTDO.py#L1 <100>

Missing docstring in public module
Raw output
./core/DTOs/task_data_objects/URLDuplicateTDO.py:1:1: D100 Missing docstring in public module

from pydantic import BaseModel


class URLDuplicateTDO(BaseModel):

Check warning on line 6 in core/DTOs/task_data_objects/URLDuplicateTDO.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/DTOs/task_data_objects/URLDuplicateTDO.py#L6 <101>

Missing docstring in public class
Raw output
./core/DTOs/task_data_objects/URLDuplicateTDO.py:6:1: D101 Missing docstring in public class
url_id: int
url: str
is_duplicate: Optional[bool] = None
9 changes: 9 additions & 0 deletions core/TaskManager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging

from core.classes.task_operators.URLDuplicateTaskOperator import URLDuplicateTaskOperator
from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface
from collector_db.AsyncDatabaseClient import AsyncDatabaseClient
from collector_db.DTOs.TaskInfo import TaskInfo
Expand Down Expand Up @@ -96,9 +97,17 @@
)
return operator

async def get_url_duplicate_task_operator(self):

Check warning on line 100 in core/TaskManager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/TaskManager.py#L100 <102>

Missing docstring in public method
Raw output
./core/TaskManager.py:100:1: D102 Missing docstring in public method
operator = URLDuplicateTaskOperator(
adb_client=self.adb_client,
pdap_client=self.pdap_client
)
return operator

async def get_task_operators(self) -> list[TaskOperatorBase]:
return [
await self.get_url_html_task_operator(),
await self.get_url_duplicate_task_operator(),
# await self.get_url_relevance_huggingface_task_operator(),
await self.get_url_record_type_task_operator(),
await self.get_agency_identification_task_operator(),
Expand Down
33 changes: 33 additions & 0 deletions core/classes/task_operators/URLDuplicateTaskOperator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from collector_db.AsyncDatabaseClient import AsyncDatabaseClient

Check warning on line 1 in core/classes/task_operators/URLDuplicateTaskOperator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/classes/task_operators/URLDuplicateTaskOperator.py#L1 <100>

Missing docstring in public module
Raw output
./core/classes/task_operators/URLDuplicateTaskOperator.py:1:1: D100 Missing docstring in public module
from collector_db.enums import TaskType
from core.DTOs.task_data_objects.URLDuplicateTDO import URLDuplicateTDO
from core.classes.task_operators.TaskOperatorBase import TaskOperatorBase
from pdap_api_client.PDAPClient import PDAPClient


class URLDuplicateTaskOperator(TaskOperatorBase):

Check warning on line 8 in core/classes/task_operators/URLDuplicateTaskOperator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/classes/task_operators/URLDuplicateTaskOperator.py#L8 <101>

Missing docstring in public class
Raw output
./core/classes/task_operators/URLDuplicateTaskOperator.py:8:1: D101 Missing docstring in public class

def __init__(

Check warning on line 10 in core/classes/task_operators/URLDuplicateTaskOperator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/classes/task_operators/URLDuplicateTaskOperator.py#L10 <107>

Missing docstring in __init__
Raw output
./core/classes/task_operators/URLDuplicateTaskOperator.py:10:1: D107 Missing docstring in __init__
self,
adb_client: AsyncDatabaseClient,
pdap_client: PDAPClient
):
super().__init__(adb_client)
self.pdap_client = pdap_client

@property
def task_type(self):

Check warning on line 19 in core/classes/task_operators/URLDuplicateTaskOperator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/classes/task_operators/URLDuplicateTaskOperator.py#L19 <102>

Missing docstring in public method
Raw output
./core/classes/task_operators/URLDuplicateTaskOperator.py:19:1: D102 Missing docstring in public method
return TaskType.DUPLICATE_DETECTION

async def meets_task_prerequisites(self):

Check warning on line 22 in core/classes/task_operators/URLDuplicateTaskOperator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/classes/task_operators/URLDuplicateTaskOperator.py#L22 <102>

Missing docstring in public method
Raw output
./core/classes/task_operators/URLDuplicateTaskOperator.py:22:1: D102 Missing docstring in public method
return await self.adb_client.has_pending_urls_not_checked_for_duplicates()

async def inner_task_logic(self):

Check warning on line 25 in core/classes/task_operators/URLDuplicateTaskOperator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] core/classes/task_operators/URLDuplicateTaskOperator.py#L25 <102>

Missing docstring in public method
Raw output
./core/classes/task_operators/URLDuplicateTaskOperator.py:25:1: D102 Missing docstring in public method
tdos: list[URLDuplicateTDO] = await self.adb_client.get_pending_urls_not_checked_for_duplicates()
url_ids = [tdo.url_id for tdo in tdos]
await self.link_urls_to_task(url_ids=url_ids)
for tdo in tdos:
tdo.is_duplicate = await self.pdap_client.is_url_duplicate(tdo.url)
duplicate_url_ids = [tdo.url_id for tdo in tdos if tdo.is_duplicate]
await self.adb_client.mark_all_as_duplicates(duplicate_url_ids)
await self.adb_client.mark_as_checked_for_duplicates(url_ids)
2 changes: 1 addition & 1 deletion pdap_api_client/DTOs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class ApprovalStatus(Enum):
class UniqueURLDuplicateInfo(BaseModel):
original_url: str
approval_status: ApprovalStatus
rejection_note: str
rejection_note: Optional[str] = None

class UniqueURLResponseInfo(BaseModel):
is_unique: bool
Expand Down
11 changes: 4 additions & 7 deletions pdap_api_client/PDAPClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@
)


async def is_url_unique(
async def is_url_duplicate(

Check failure on line 62 in pdap_api_client/PDAPClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] pdap_api_client/PDAPClient.py#L62 <303>

too many blank lines (2)
Raw output
./pdap_api_client/PDAPClient.py:62:5: E303 too many blank lines (2)
self,
url_to_check: str
) -> UniqueURLResponseInfo:
) -> bool:
"""
Check if a URL is unique. Returns duplicate info otherwise
"""
Expand All @@ -79,11 +79,8 @@
)
response_info = await self.access_manager.make_request(request_info)
duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]]
is_unique = (len(duplicates) == 0)
return UniqueURLResponseInfo(
is_unique=is_unique,
duplicates=duplicates
)
is_duplicate = (len(duplicates) != 0)
return is_duplicate

async def submit_urls(
self,
Expand Down
20 changes: 20 additions & 0 deletions tests/test_automated/integration/tasks/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from unittest.mock import MagicMock, AsyncMock

Check warning on line 1 in tests/test_automated/integration/tasks/conftest.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/test_automated/integration/tasks/conftest.py#L1 <100>

Missing docstring in public module
Raw output
./tests/test_automated/integration/tasks/conftest.py:1:1: D100 Missing docstring in public module

import pytest

from pdap_api_client.AccessManager import AccessManager
from pdap_api_client.PDAPClient import PDAPClient


@pytest.fixture
def mock_pdap_client() -> PDAPClient:

Check warning on line 10 in tests/test_automated/integration/tasks/conftest.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/test_automated/integration/tasks/conftest.py#L10 <103>

Missing docstring in public function
Raw output
./tests/test_automated/integration/tasks/conftest.py:10:1: D103 Missing docstring in public function
mock_access_manager = MagicMock(
spec=AccessManager
)
mock_access_manager.jwt_header = AsyncMock(
return_value={"Authorization": "Bearer token"}
)
pdap_client = PDAPClient(
access_manager=mock_access_manager
)
return pdap_client

Check warning on line 20 in tests/test_automated/integration/tasks/conftest.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/test_automated/integration/tasks/conftest.py#L20 <292>

no newline at end of file
Raw output
./tests/test_automated/integration/tasks/conftest.py:20:23: W292 no newline at end of file
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,7 @@ def mock_make_request(pdap_client: PDAPClient, urls: list[str]):
)
)

@pytest.fixture
def mock_pdap_client() -> PDAPClient:
mock_access_manager = MagicMock(
spec=AccessManager
)
mock_access_manager.jwt_header = AsyncMock(
return_value={"Authorization": "Bearer token"}
)
pdap_client = PDAPClient(
access_manager=mock_access_manager
)
return pdap_client


async def setup_validated_urls(db_data_creator: DBDataCreator) -> list[str]:
creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls(
Expand Down
Loading