diff --git a/alembic/versions/2025_12_31_1106-759ce7d0772b_remove_url_status_attribute.py b/alembic/versions/2025_12_31_1106-759ce7d0772b_remove_url_status_attribute.py new file mode 100644 index 00000000..379c045a --- /dev/null +++ b/alembic/versions/2025_12_31_1106-759ce7d0772b_remove_url_status_attribute.py @@ -0,0 +1,31 @@ +"""Remove URL Status attribute + +Revision ID: 759ce7d0772b +Revises: 42933d84aa52 +Create Date: 2025-12-31 11:06:39.037486 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '759ce7d0772b' +down_revision: Union[str, None] = '42933d84aa52' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.drop_column( + table_name="urls", + column_name="status" + ) + + op.execute("""DROP type url_status""") + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py index 5a56cf32..b9fcc935 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py +++ b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py @@ -4,7 +4,6 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo -from src.collectors.enums import URLStatus from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase @@ -42,7 +41,6 @@ async def run( ) common_where_clause = [ - URL.status == URLStatus.OK.value, LinkBatchURL.batch_id == self.batch_id, ] diff --git a/src/api/endpoints/annotate/_shared/queries/helper.py b/src/api/endpoints/annotate/_shared/queries/helper.py index 76def5c1..57370c36 100644 --- 
a/src/api/endpoints/annotate/_shared/queries/helper.py +++ b/src/api/endpoints/annotate/_shared/queries/helper.py @@ -5,7 +5,6 @@ from sqlalchemy import Select, case, CTE, ColumnElement from sqlalchemy.orm import joinedload -from src.collectors.enums import URLStatus from src.db.helpers.query import exists_url, not_exists_url from src.db.models.impl.flag.url_suspended.sqlalchemy import FlagURLSuspended from src.db.models.impl.url.core.enums import URLSource @@ -33,7 +32,6 @@ def add_common_where_conditions( query: Select, ) -> Select: return query.where( - URL.status == URLStatus.OK.value, not_exists_url( FlagURLSuspended ), diff --git a/src/api/endpoints/batch/dtos/get/summaries/counts.py b/src/api/endpoints/batch/dtos/get/summaries/counts.py index 0ce4e468..0faaa20b 100644 --- a/src/api/endpoints/batch/dtos/get/summaries/counts.py +++ b/src/api/endpoints/batch/dtos/get/summaries/counts.py @@ -4,7 +4,6 @@ class BatchSummaryURLCounts(BaseModel): total: int pending: int - duplicate: int not_relevant: int submitted: int errored: int diff --git a/src/api/endpoints/batch/routes.py b/src/api/endpoints/batch/routes.py index 87839fb7..4dfbbbfc 100644 --- a/src/api/endpoints/batch/routes.py +++ b/src/api/endpoints/batch/routes.py @@ -10,7 +10,7 @@ from src.api.endpoints.batch.urls.dto import GetURLsByBatchResponse from src.collectors.enums import CollectorType from src.core.core import AsyncCore -from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum +from src.db.models.materialized_views.batch_url_status.enums import BatchURLStatusViewEnum from src.security.dtos.access_info import AccessInfo from src.security.manager import get_access_info @@ -27,7 +27,7 @@ async def get_batch_status( description="Filter by collector type", default=None ), - status: BatchURLStatusEnum | None = Query( + status: BatchURLStatusViewEnum | None = Query( description="Filter by status", default=None ), diff --git a/src/api/endpoints/collector/manual/query.py 
b/src/api/endpoints/collector/manual/query.py index 5ebe0e4b..8216b10b 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -3,7 +3,7 @@ from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO -from src.collectors.enums import CollectorType, URLStatus +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL @@ -53,7 +53,6 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: name=entry.name, description=entry.description, collector_metadata=entry.collector_metadata, - status=URLStatus.OK.value, source=URLSource.MANUAL, trailing_slash=url_and_scheme.url.endswith('/'), ) diff --git a/src/api/endpoints/metrics/batches/aggregated/query/core.py b/src/api/endpoints/metrics/batches/aggregated/query/core.py index cc6259de..07015c1d 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/core.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/core.py @@ -1,29 +1,15 @@ -from sqlalchemy import case, select +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.sql.functions import coalesce, func +from sqlalchemy.sql.functions import func from src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO, \ GetMetricsBatchesAggregatedInnerResponseDTO -from src.api.endpoints.metrics.batches.aggregated.query.all_urls.query import CountAllURLsByBatchStrategyQueryBuilder -from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.query import \ - BatchStatusByBatchStrategyQueryBuilder from src.api.endpoints.metrics.batches.aggregated.query.requester_.requester import \ GetBatchesAggregatedMetricsQueryRequester -from 
src.api.endpoints.metrics.batches.aggregated.query.submitted_.query import \ - CountSubmittedByBatchStrategyQueryBuilder -from src.api.endpoints.metrics.batches.aggregated.query.url_error.query import URLErrorByBatchStrategyQueryBuilder -from src.api.endpoints.metrics.batches.aggregated.query.validated_.query import \ - ValidatedURLCountByBatchStrategyQueryBuilder -from src.collectors.enums import URLStatus, CollectorType +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer class GetBatchesAggregatedMetricsQueryBuilder(QueryBuilderBase): diff --git a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py index a7b9e27a..6712c76d 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py @@ -4,7 +4,6 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse -from src.collectors.enums import URLStatus from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch @@ -28,7 +27,7 @@ async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse] .where( exists_url(URLTaskError) ) - .group_by(Batch.strategy, URL.status) + 
.group_by(Batch.strategy) ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) diff --git a/src/api/endpoints/metrics/batches/breakdown/query.py b/src/api/endpoints/metrics/batches/breakdown/query.py index 5847e309..d46a01b9 100644 --- a/src/api/endpoints/metrics/batches/breakdown/query.py +++ b/src/api/endpoints/metrics/batches/breakdown/query.py @@ -1,4 +1,4 @@ -from sqlalchemy import select, case, Column +from sqlalchemy import select, Column from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.sql.functions import coalesce @@ -11,11 +11,9 @@ from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE from src.api.endpoints.metrics.batches.breakdown.total.cte_ import TOTAL_CTE from src.api.endpoints.metrics.batches.breakdown.validated.cte_ import VALIDATED_CTE -from src.collectors.enums import URLStatus, CollectorType +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/metrics/dtos/get/urls/aggregated/core.py b/src/api/endpoints/metrics/dtos/get/urls/aggregated/core.py index 7dbbc48a..1c8ba860 100644 --- a/src/api/endpoints/metrics/dtos/get/urls/aggregated/core.py +++ b/src/api/endpoints/metrics/dtos/get/urls/aggregated/core.py @@ -4,7 +4,7 @@ from src.core.enums import RecordType from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.views.url_status.enums import URLStatusViewEnum +from src.db.models.materialized_views.url_status.enums import URLStatusViewEnum class GetMetricsURLValidatedOldestPendingURL(BaseModel): url_id: int diff --git a/src/api/endpoints/metrics/urls/aggregated/query/core.py 
b/src/api/endpoints/metrics/urls/aggregated/query/core.py index c6dbc29f..880c7e3b 100644 --- a/src/api/endpoints/metrics/urls/aggregated/query/core.py +++ b/src/api/endpoints/metrics/urls/aggregated/query/core.py @@ -11,7 +11,7 @@ from src.core.enums import RecordType from src.db.helpers.session import session_helper as sh from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.views.url_status.enums import URLStatusViewEnum +from src.db.models.materialized_views.url_status.enums import URLStatusViewEnum from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py index e086b752..f8a8f571 100644 --- a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py @@ -3,11 +3,10 @@ from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLValidatedOldestPendingURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.views.url_status.core import URLStatusMatView -from src.db.models.views.url_status.enums import URLStatusViewEnum +from src.db.models.materialized_views.url_status.sqlalchemy import URLStatusMaterializedView +from src.db.models.materialized_views.url_status.enums import URLStatusViewEnum from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh class GetOldestPendingURLQueryBuilder(QueryBuilderBase): @@ -18,14 +17,14 @@ async def run( query = ( select( - URLStatusMatView.url_id, + URLStatusMaterializedView.url_id, URL.created_at ) .join( URL, - URLStatusMatView.url_id == URL.id + URLStatusMaterializedView.url_id == URL.id ).where( - URLStatusMatView.status.not_in( + URLStatusMaterializedView.status.not_in( [ URLStatusViewEnum.SUBMITTED.value, 
URLStatusViewEnum.ACCEPTED.value, diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/status.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/status.py index 05813ce0..6f369b32 100644 --- a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/status.py +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/status.py @@ -4,8 +4,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.db.helpers.session import session_helper as sh -from src.db.models.views.url_status.core import URLStatusMatView -from src.db.models.views.url_status.enums import URLStatusViewEnum +from src.db.models.materialized_views.url_status.sqlalchemy import URLStatusMaterializedView +from src.db.models.materialized_views.url_status.enums import URLStatusViewEnum from src.db.queries.base.builder import QueryBuilderBase @@ -18,13 +18,13 @@ async def run( query = ( select( - URLStatusMatView.status, + URLStatusMaterializedView.status, func.count( - URLStatusMatView.url_id + URLStatusMaterializedView.url_id ).label("count") ) .group_by( - URLStatusMatView.status + URLStatusMaterializedView.status ) ) diff --git a/src/api/endpoints/metrics/urls/breakdown/query/core.py b/src/api/endpoints/metrics/urls/breakdown/query/core.py index c214b169..df521497 100644 --- a/src/api/endpoints/metrics/urls/breakdown/query/core.py +++ b/src/api/endpoints/metrics/urls/breakdown/query/core.py @@ -3,12 +3,11 @@ from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseInnerDTO, \ GetMetricsURLsBreakdownPendingResponseDTO -from src.collectors.enums import URLStatus from src.db.models.impl.annotation.agency.user.sqlalchemy import AnnotationAgencyUser -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from 
src.db.models.impl.annotation.url_type.user.sqlalchemy import AnnotationURLTypeUser +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase @@ -63,8 +62,7 @@ async def run(self, session: AsyncSession) -> GetMetricsURLsBreakdownPendingResp FlagURLValidated.url_id == URL.id ) .where( - FlagURLValidated.url_id.is_(None), - URL.status == URLStatus.OK + FlagURLValidated.url_id.is_(None) ) .group_by(month) .order_by(month.asc()) diff --git a/src/api/endpoints/review/reject/query.py b/src/api/endpoints/review/reject/query.py index 1f9dfe91..ed444bfb 100644 --- a/src/api/endpoints/review/reject/query.py +++ b/src/api/endpoints/review/reject/query.py @@ -4,7 +4,6 @@ from starlette.status import HTTP_400_BAD_REQUEST from src.api.endpoints.review.enums import RejectionReason -from src.collectors.enums import URLStatus from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL diff --git a/src/api/endpoints/submit/data_source/models/response/duplicate.py b/src/api/endpoints/submit/data_source/models/response/duplicate.py index 12367372..f1414b8f 100644 --- a/src/api/endpoints/submit/data_source/models/response/duplicate.py +++ b/src/api/endpoints/submit/data_source/models/response/duplicate.py @@ -1,11 +1,11 @@ from pydantic import BaseModel -from src.collectors.enums import URLStatus from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.materialized_views.url_status.enums import URLStatusViewEnum class SubmitDataSourceURLDuplicateSubmissionResponse(BaseModel): message: str url_id: int url_type: URLType | None - url_status: URLStatus \ No newline at end of file + url_status: URLStatusViewEnum \ No newline at end of file diff --git 
a/src/api/endpoints/submit/data_source/queries/core.py b/src/api/endpoints/submit/data_source/queries/core.py index 77c33dca..aec2e821 100644 --- a/src/api/endpoints/submit/data_source/queries/core.py +++ b/src/api/endpoints/submit/data_source/queries/core.py @@ -1,12 +1,9 @@ import uuid -from typing import Any -from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest -from src.collectors.enums import URLStatus from src.core.enums import BatchStatus from src.db.models.impl.annotation.agency.anon.sqlalchemy import AnnotationAgencyAnon from src.db.models.impl.annotation.location.anon.sqlalchemy import AnnotationLocationAnon @@ -44,7 +41,6 @@ async def run( trailing_slash=full_url.has_trailing_slash, name=self.request.name, description=self.request.description, - status=URLStatus.OK, source=URLSource.MANUAL, ) diff --git a/src/api/endpoints/submit/data_source/queries/duplicate.py b/src/api/endpoints/submit/data_source/queries/duplicate.py index 75346cf6..d4409e91 100644 --- a/src/api/endpoints/submit/data_source/queries/duplicate.py +++ b/src/api/endpoints/submit/data_source/queries/duplicate.py @@ -8,6 +8,7 @@ SubmitDataSourceURLDuplicateSubmissionResponse from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.materialized_views.url_status.sqlalchemy import URLStatusMaterializedView from src.db.queries.base.builder import QueryBuilderBase @@ -29,13 +30,17 @@ async def run(self, session: AsyncSession) -> None: query = ( select( URL.id, - URL.status, + URLStatusMaterializedView.status, FlagURLValidated.type ) .outerjoin( FlagURLValidated, FlagURLValidated.url_id == URL.id ) + .outerjoin( + URLStatusMaterializedView, + 
URLStatusMaterializedView.url_id == URL.id + ) .where( URL.url == self.url ) @@ -48,7 +53,7 @@ async def run(self, session: AsyncSession) -> None: model = SubmitDataSourceURLDuplicateSubmissionResponse( message="Duplicate URL found", url_id=mapping[URL.id], - url_status=mapping[URL.status], + url_status=mapping[URLStatusMaterializedView.status], url_type=mapping[FlagURLValidated.type] ) raise HTTPException( diff --git a/src/api/endpoints/submit/url/queries/core.py b/src/api/endpoints/submit/url/queries/core.py index 54ab5439..49e56a98 100644 --- a/src/api/endpoints/submit/url/queries/core.py +++ b/src/api/endpoints/submit/url/queries/core.py @@ -7,16 +7,15 @@ from src.api.endpoints.submit.url.queries.convert import convert_invalid_url_to_url_response, \ convert_duplicate_urls_to_url_response from src.api.endpoints.submit.url.queries.dedupe import DeduplicateURLQueryBuilder -from src.collectors.enums import URLStatus from src.db.models.impl.annotation.agency.user.sqlalchemy import AnnotationAgencyUser from src.db.models.impl.annotation.location.user.sqlalchemy import AnnotationLocationUser from src.db.models.impl.annotation.name.suggestion.enums import NameSuggestionSource from src.db.models.impl.annotation.name.suggestion.sqlalchemy import AnnotationNameSuggestion from src.db.models.impl.annotation.name.user.sqlalchemy import AnnotationNameUserEndorsement +from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from src.db.models.impl.link.user_suggestion_not_found.users_submitted_url.sqlalchemy import LinkUserSubmittedURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from src.db.queries.base.builder import QueryBuilderBase from src.util.models.url_and_scheme import URLAndScheme from src.util.url import clean_url, get_url_and_scheme, is_valid_url @@ -61,7 +60,6 @@ async def 
run(self, session: AsyncSession) -> URLSubmissionResponse: url=url_and_scheme.url, scheme=url_and_scheme.scheme, source=URLSource.MANUAL, - status=URLStatus.OK, description=self.request.description, trailing_slash=url_and_scheme.url.endswith('/'), ) diff --git a/src/api/endpoints/task/by_id/query.py b/src/api/endpoints/task/by_id/query.py index 92487327..f1ea5adb 100644 --- a/src/api/endpoints/task/by_id/query.py +++ b/src/api/endpoints/task/by_id/query.py @@ -1,9 +1,8 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload, joinedload +from sqlalchemy.orm import selectinload from src.api.endpoints.task.by_id.dto import TaskInfo -from src.collectors.enums import URLStatus from src.db.enums import TaskType from src.db.models.impl.task.core import Task from src.db.models.impl.task.enums import TaskStatus @@ -35,6 +34,7 @@ async def run(self, session: AsyncSession) -> TaskInfo: error = task.errors[0].error if len(task.errors) > 0 else None # Get error info if any # Get URLs + # TODO: Revise to include URL Status from URL Web metadata urls = task.urls url_infos = [] for url in urls: @@ -43,7 +43,6 @@ async def run(self, session: AsyncSession) -> TaskInfo: batch_id=url.batch.id, url=url.url, collector_metadata=url.collector_metadata, - status=URLStatus(url.status), updated_at=url.updated_at ) url_infos.append(url_info) diff --git a/src/api/endpoints/url/get/dto.py b/src/api/endpoints/url/get/dto.py index a4616d7e..0e10c6e9 100644 --- a/src/api/endpoints/url/get/dto.py +++ b/src/api/endpoints/url/get/dto.py @@ -1,10 +1,9 @@ import datetime -from typing import Optional from pydantic import BaseModel -from src.collectors.enums import URLStatus from src.db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource, TaskType +from src.db.models.materialized_views.url_status.enums import URLStatusViewEnum class GetURLsResponseErrorInfo(BaseModel): @@ -25,7 +24,7 @@ class 
GetURLsResponseInnerInfo(BaseModel): id: int batch_id: int | None url: str - status: URLStatus + status: URLStatusViewEnum | None collector_metadata: dict | None updated_at: datetime.datetime created_at: datetime.datetime diff --git a/src/api/endpoints/url/get/query.py b/src/api/endpoints/url/get/query.py index 6885ef64..a11bbd64 100644 --- a/src/api/endpoints/url/get/query.py +++ b/src/api/endpoints/url/get/query.py @@ -1,12 +1,17 @@ -from sqlalchemy import select, exists +from typing import Sequence + +from sqlalchemy import select, exists, RowMapping, func +from sqlalchemy.dialects.postgresql import aggregate_order_by from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import selectinload from src.api.endpoints.url.get.dto import GetURLsResponseInfo, GetURLsResponseErrorInfo, GetURLsResponseInnerInfo -from src.collectors.enums import URLStatus from src.db.client.helpers import add_standard_limit_and_offset +from src.db.models.impl import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError +from src.db.models.materialized_views.url_status.sqlalchemy import URLStatusMaterializedView from src.db.queries.base.builder import QueryBuilderBase @@ -22,39 +27,86 @@ def __init__( self.errors = errors async def run(self, session: AsyncSession) -> GetURLsResponseInfo: - statement = select(URL).options( - selectinload(URL.task_errors), - selectinload(URL.batch) - ).order_by(URL.id) + + error_cte = ( + select( + URLTaskError.url_id, + func.array_agg( + aggregate_order_by( + func.jsonb_build_object( + "task_type", URLTaskError.task_type, + "error", URLTaskError.error, + "created_at", URLTaskError.created_at + ), + URLTaskError.created_at, + ) + ).label("error_array") + ) + .group_by( + URLTaskError.url_id + ) + .cte("errors") + ) + + + query = ( + select( + URL.id, + LinkBatchURL.batch_id, + URL.full_url, 
+ URL.collector_metadata, + URLStatusMaterializedView.status, + URL.created_at, + URL.updated_at, + URL.name, + error_cte.c.error_array + ) + .outerjoin( + LinkBatchURL + ) + .outerjoin( + URLStatusMaterializedView, + URLStatusMaterializedView.url_id == URL.id + ) + .outerjoin( + error_cte, + error_cte.c.url_id == URL.id + ) + .outerjoin( + URLScrapeInfo + ) + .order_by(URL.id) + ) if self.errors: # Only return URLs with errors - statement = statement.where( + query = query.where( exists( select(URLTaskError).where(URLTaskError.url_id == URL.id) ) ) - add_standard_limit_and_offset(statement, self.page) - execute_result = await session.execute(statement) - all_results = execute_result.scalars().all() + add_standard_limit_and_offset(query, self.page) + mappings: Sequence[RowMapping] = await self.sh.mappings(session, query) + final_results = [] - for result in all_results: + for mapping in mappings: error_results = [] - for error in result.task_errors: + error_array = mapping["error_array"] or [] + for error in error_array: error_result = GetURLsResponseErrorInfo( - task=error.task_type, - error=error.error, - updated_at=error.created_at + task=error["task_type"], + error=error["error"], + updated_at=error["created_at"] ) error_results.append(error_result) final_results.append( GetURLsResponseInnerInfo( - id=result.id, - batch_id=result.batch.id if result.batch is not None else None, - url=result.full_url, - status=URLStatus(result.status), - collector_metadata=result.collector_metadata, - updated_at=result.updated_at, - created_at=result.created_at, + id=mapping[URL.id], + batch_id=mapping[LinkBatchURL.batch_id], + url=mapping["full_url"], + collector_metadata=mapping[URL.collector_metadata], + status=mapping[URLStatusMaterializedView.status], + created_at=mapping[URL.created_at], + updated_at=mapping[URL.updated_at], errors=error_results, ) ) diff --git a/src/collectors/enums.py b/src/collectors/enums.py index 16711a0c..2e5f6239 100644 --- a/src/collectors/enums.py 
+++ b/src/collectors/enums.py @@ -9,9 +9,3 @@ class CollectorType(Enum): MUCKROCK_ALL_SEARCH = "muckrock_all_search" CKAN = "ckan" MANUAL = "manual" - -class URLStatus(Enum): - OK = "ok" - ERROR = "error" - DUPLICATE = "duplicate" - BROKEN = "broken" diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py index 60f39a2c..3b21d210 100644 --- a/src/collectors/queries/insert/url.py +++ b/src/collectors/queries/insert/url.py @@ -22,7 +22,6 @@ async def run(self, session: AsyncSession) -> int: url=url_and_scheme.url.rstrip('/'), scheme=url_and_scheme.scheme, collector_metadata=self.url_info.collector_metadata, - status=self.url_info.status.value, source=self.url_info.source, trailing_slash=url_and_scheme.url.endswith('/'), ) diff --git a/src/core/core.py b/src/core/core.py index ad2f20d5..cbee2d84 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -31,7 +31,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.db.models.impl.batch.pydantic.info import BatchInfo -from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum +from src.db.models.materialized_views.batch_url_status.enums import BatchURLStatusViewEnum class AsyncCore: @@ -81,7 +81,7 @@ async def get_duplicate_urls_by_batch(self, batch_id: int, page: int = 1) -> Get async def get_batch_statuses( self, collector_type: CollectorType | None, - status: BatchURLStatusEnum | None, + status: BatchURLStatusViewEnum | None, page: int ) -> GetBatchSummariesResponse: results = await self.adb_client.get_batch_summaries( diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py index 0621ee52..ebef8b45 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py @@ -1,4 +1,3 @@ -from src.collectors.enums import URLStatus from src.core.enums import 
RecordType from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py index 1eaa306d..3abadbf5 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py @@ -1,19 +1,11 @@ from datetime import datetime -from operator import or_ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.sql.functions import count -from src.collectors.enums import URLStatus from src.core.tasks.scheduled.impl.huggingface.queries.cte import HuggingfacePrereqCTEContainer -from src.db.enums import TaskType -from src.db.helpers.query import not_exists_url, no_url_task_error, exists_url from src.db.helpers.session import session_helper as sh -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.state.huggingface import HuggingFaceUploadState -from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.impl.url.core.sqlalchemy import URL class CheckValidURLsUpdatedRequester: diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py index 04710ba6..487850dd 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py @@ -12,6 +12,8 @@ from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.db.models.impl.url.web_metadata.sqlalchemy import 
URLWebMetadata +from src.db.models.materialized_views.url_status.sqlalchemy import URLStatusMaterializedView from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.enums import DataSourcesURLStatus from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel @@ -40,7 +42,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: # Required URL.full_url, URL.name, - URL.status, + URLWebMetadata.status_code, URLRecordType.record_type, agency_id_cte.c.agency_ids, # Optional @@ -72,6 +74,10 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: URLOptionalDataSourceMetadata, URL.id == URLOptionalDataSourceMetadata.url_id, ) + .outerjoin( + URLWebMetadata, + URL.id == URLWebMetadata.url_id + ) .outerjoin( URLInternetArchivesProbeMetadata, URL.id == URLInternetArchivesProbeMetadata.url_id, @@ -118,8 +124,9 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [], + # TODO: Change to convert web metadata result to URL Status url_status=convert_sm_url_status_to_ds_url_status( - sm_url_status=mapping[URL.status], + mapping[URLWebMetadata.status_code], ), internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py index a710b6f7..8b23f339 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -12,6 +12,7 @@ from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata from 
src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.enums import DataSourcesURLStatus from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel @@ -41,7 +42,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: # Required URL.full_url, URL.name, - URL.status, + URLWebMetadata.status_code, URLRecordType.record_type, agency_id_cte.c.agency_ids, # Optional @@ -82,6 +83,10 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: URLRecordType, URLRecordType.url_id == URL.id, ) + .outerjoin( + URLWebMetadata, + URLWebMetadata.url_id == URL.id, + ) .outerjoin( agency_id_cte, cte.url_id == agency_id_cte.c.url_id @@ -122,7 +127,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [], data_portal_type_other=mapping[URLOptionalDataSourceMetadata.data_portal_type_other], url_status=convert_sm_url_status_to_ds_url_status( - sm_url_status=mapping[URL.status], + mapping[URLWebMetadata.status_code], ), internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py index 5a784295..02ff8c8f 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py @@ -10,6 +10,7 @@ from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from 
src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest, AddMetaURLsInnerRequest @@ -36,7 +37,7 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: select( cte.url_id, URL.full_url, - URL.status, + URLWebMetadata.status_code, URLInternetArchivesProbeMetadata.archive_url, agency_id_cte.c.agency_ids ) @@ -47,6 +48,10 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: URL, URL.id == cte.url_id, ) + .join( + URLWebMetadata, + URL.id == URLWebMetadata.url_id, + ) .outerjoin( URLInternetArchivesProbeMetadata, URL.id == URLInternetArchivesProbeMetadata.url_id, @@ -73,7 +78,7 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: agency_ids=mapping["agency_ids"], internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, url_status=convert_sm_url_status_to_ds_url_status( - sm_url_status=mapping[URL.status], + mapping[URLWebMetadata.status_code], ), ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py index 8cdb8ed6..c73909dc 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py @@ -10,6 +10,7 @@ from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from 
src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest @@ -35,7 +36,7 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: select( cte.ds_meta_url_id, URL.full_url, - URL.status, + URLWebMetadata.status_code, agency_id_cte.c.agency_ids, URLInternetArchivesProbeMetadata.archive_url, ) @@ -50,6 +51,10 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: URLInternetArchivesProbeMetadata, URL.id == URLInternetArchivesProbeMetadata.url_id, ) + .outerjoin( + URLWebMetadata, + URL.id == URLWebMetadata.url_id, + ) .outerjoin( agency_id_cte, cte.url_id == agency_id_cte.c.url_id @@ -72,7 +77,7 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: agency_ids=mapping["agency_ids"] or [], internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, url_status=convert_sm_url_status_to_ds_url_status( - sm_url_status=mapping[URL.status], + mapping[URLWebMetadata.status_code], ), ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py b/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py index 3f586b20..3de3e502 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py @@ -1,14 +1,11 @@ -from src.collectors.enums import URLStatus from src.external.pdap.enums import DataSourcesURLStatus def convert_sm_url_status_to_ds_url_status( - sm_url_status: URLStatus + status_code: int ) -> DataSourcesURLStatus: - match sm_url_status: - case URLStatus.OK: + match status_code: + case 200: return DataSourcesURLStatus.OK - case URLStatus.BROKEN: - return DataSourcesURLStatus.BROKEN case _: - raise ValueError(f"URL status has no corresponding DS Status: {sm_url_status}") \ No newline at end of 
file + return DataSourcesURLStatus.BROKEN \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/update_url_status/operator.py b/src/core/tasks/scheduled/impl/update_url_status/operator.py deleted file mode 100644 index 82285996..00000000 --- a/src/core/tasks/scheduled/impl/update_url_status/operator.py +++ /dev/null @@ -1,15 +0,0 @@ -from src.core.tasks.scheduled.impl.update_url_status.query import UpdateURLStatusQueryBuilder -from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase -from src.db.enums import TaskType - - -class UpdateURLStatusOperator(ScheduledTaskOperatorBase): - - @property - def task_type(self) -> TaskType: - return TaskType.UPDATE_URL_STATUS - - async def inner_task_logic(self) -> None: - await self.adb_client.run_query_builder( - UpdateURLStatusQueryBuilder() - ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/update_url_status/query.py b/src/core/tasks/scheduled/impl/update_url_status/query.py deleted file mode 100644 index 963405b6..00000000 --- a/src/core/tasks/scheduled/impl/update_url_status/query.py +++ /dev/null @@ -1,49 +0,0 @@ -from sqlalchemy import update, exists, select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.enums import URLStatus -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata -from src.db.queries.base.builder import QueryBuilderBase - - -class UpdateURLStatusQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> None: - - # Update broken URLs to nonbroken if their status is not 404 - query_broken_to_ok = ( - update(URL) - .values( - status=URLStatus.OK - ) - .where( - exists( - select(1).where( - URLWebMetadata.url_id == URL.id, # <-- correlate - URLWebMetadata.status_code != 404, - URL.status == URLStatus.BROKEN - ) - ) - ) - ) - - # Update ok URLs to broken if their status is 404 - query_ok_to_broken = ( - update(URL) - 
.values( - status=URLStatus.BROKEN - ) - .where( - exists( - select(1).where( - URLWebMetadata.url_id == URL.id, # <-- correlate - URLWebMetadata.status_code == 404, - URL.status == URLStatus.OK - ) - ) - ) - ) - - await session.execute(query_broken_to_ok) - await session.execute(query_ok_to_broken) \ No newline at end of file diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index d2e96cc1..38ebced3 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -25,7 +25,6 @@ from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.core import DSAppSyncMetaURLsDeleteTaskOperator from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator from src.core.tasks.scheduled.impl.task_cleanup.operator import TaskCleanupOperator -from src.core.tasks.scheduled.impl.update_url_status.operator import UpdateURLStatusOperator from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.hub.client import HuggingFaceHubClient @@ -230,13 +229,4 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval_minutes=IntervalEnum.HOURLY.value, enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") ), - ### URL - ScheduledTaskEntry( - operator=UpdateURLStatusOperator( - adb_client=self.adb_client - ), - interval_minutes=IntervalEnum.DAILY.value, - enabled=self.setup_flag("UPDATE_URL_STATUS_TASK_FLAG") - ), - ] diff --git a/src/core/tasks/url/operators/auto_relevant/queries/cte.py b/src/core/tasks/url/operators/auto_relevant/queries/cte.py index 354e4bd5..a4e14b2d 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/cte.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/cte.py @@ -1,12 +1,11 @@ from sqlalchemy import select, CTE from sqlalchemy.orm import aliased -from src.collectors.enums import URLStatus from src.db.enums import 
TaskType from src.db.helpers.query import not_exists_url, no_url_task_error +from src.db.models.impl.annotation.url_type.auto.sqlalchemy import AnnotationAutoURLType from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.impl.annotation.url_type.auto.sqlalchemy import AnnotationAutoURLType class AutoRelevantPrerequisitesCTEContainer: @@ -21,7 +20,6 @@ def __init__(self): URL.id == URLCompressedHTML.url_id ) .where( - URL.status == URLStatus.OK.value, not_exists_url(AnnotationAutoURLType), no_url_task_error(TaskType.RELEVANCY) ).cte("auto_relevant_prerequisites") diff --git a/src/core/tasks/url/operators/html/core.py b/src/core/tasks/url/operators/html/core.py index 26f70cdb..5983ab69 100644 --- a/src/core/tasks/url/operators/html/core.py +++ b/src/core/tasks/url/operators/html/core.py @@ -1,6 +1,7 @@ from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.core.tasks.url.operators.html.filter import filter_just_urls, filter_404_subset from src.core.tasks.url.operators.html.queries.insert.query import InsertURLHTMLInfoQueryBuilder +from src.core.tasks.url.operators.html.queries.prerequisites import PendingURLsWithoutHTMLDataPrerequisitesQueryBuilder from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO from src.db.client.async_ import AsyncDatabaseClient @@ -26,7 +27,9 @@ def task_type(self) -> TaskType: return TaskType.HTML async def meets_task_prerequisites(self) -> bool: - return await self.adb_client.has_non_errored_urls_without_html_data() + return await self.run_query_builder( + PendingURLsWithoutHTMLDataPrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: tdos = await self._get_non_errored_urls_without_html_data() diff --git a/src/core/tasks/url/operators/html/queries/get.py b/src/core/tasks/url/operators/html/queries/get.py deleted 
file mode 100644 index a6cbe4a8..00000000 --- a/src/core/tasks/url/operators/html/queries/get.py +++ /dev/null @@ -1,31 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.db.models.impl.url.core.pydantic.info import URLInfo -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer - - -class GetPendingURLsWithoutHTMLDataQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> list[URLInfo]: - statement = StatementComposer.has_non_errored_urls_without_html_data() - statement = statement.limit(100).order_by(URL.id) - scalar_result = await session.scalars(statement) - url_results: list[URL] = scalar_result.all() - - final_results = [] - for url in url_results: - url_info = URLInfo( - id=url.id, - batch_id=url.batch.id if url.batch is not None else None, - url=url.full_url, - collector_metadata=url.collector_metadata, - status=url.status, - created_at=url.created_at, - updated_at=url.updated_at, - name=url.name - ) - final_results.append(url_info) - - return final_results diff --git a/src/core/tasks/scheduled/impl/update_url_status/__init__.py b/src/core/tasks/url/operators/html/queries/get/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/update_url_status/__init__.py rename to src/core/tasks/url/operators/html/queries/get/__init__.py diff --git a/src/core/tasks/url/operators/html/queries/get/query.py b/src/core/tasks/url/operators/html/queries/get/query.py new file mode 100644 index 00000000..a4088157 --- /dev/null +++ b/src/core/tasks/url/operators/html/queries/get/query.py @@ -0,0 +1,32 @@ +from sqlalchemy import RowMapping, Sequence +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.html.queries.helpers import has_non_errored_urls_without_html_data +from src.db.models.impl import LinkBatchURL +from src.db.models.impl.url.core.pydantic.info import 
URLInfo +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.materialized_views.url_status.sqlalchemy import URLStatusMaterializedView +from src.db.queries.base.builder import QueryBuilderBase +from src.db.statement_composer import StatementComposer + + +class GetPendingURLsWithoutHTMLDataQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[URLInfo]: + query = ( + has_non_errored_urls_without_html_data() + .limit(100) + .order_by(URL.id) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings(session, query) + + final_results: list[URLInfo] = [] + for mapping in mappings: + url_info = URLInfo( + id=mapping[URL.id], + url=mapping["full_url"], + ) + final_results.append(url_info) + + return final_results diff --git a/src/core/tasks/url/operators/html/queries/helpers.py b/src/core/tasks/url/operators/html/queries/helpers.py new file mode 100644 index 00000000..4c7eb89c --- /dev/null +++ b/src/core/tasks/url/operators/html/queries/helpers.py @@ -0,0 +1,51 @@ +from sqlalchemy import ColumnElement, exists, select, Select + +from src.db.enums import TaskType +from src.db.models.impl import LinkBatchURL +from src.db.models.impl.link.task_url import LinkTaskURL +from src.db.models.impl.task.core import Task +from src.db.models.impl.task.enums import TaskStatus +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.materialized_views.url_status.sqlalchemy import URLStatusMaterializedView + + +def _exclude_completed_html_task_subquery() -> ColumnElement[bool]: + return ~exists( + select(1) + .select_from( + LinkTaskURL + ) + .join( + Task, + LinkTaskURL.task_id == Task.id + ) + .where( + LinkTaskURL.url_id == URL.id, + Task.task_type == TaskType.HTML.value, + Task.task_status == TaskStatus.COMPLETE.value + ) + ) + +def 
has_non_errored_urls_without_html_data() -> Select: + query = ( + select( + URL.id, + URL.full_url, + ) + .join( + URLWebMetadata, + URLWebMetadata.url_id == URL.id + ) + .outerjoin( + URLScrapeInfo + ) + .where( + URLScrapeInfo.url_id == None, + _exclude_completed_html_task_subquery, + URLWebMetadata.status_code == 200, + URLWebMetadata.content_type.like("%html%"), + ) + ) + return query diff --git a/src/core/tasks/url/operators/html/queries/prerequisites.py b/src/core/tasks/url/operators/html/queries/prerequisites.py new file mode 100644 index 00000000..5fa0c94a --- /dev/null +++ b/src/core/tasks/url/operators/html/queries/prerequisites.py @@ -0,0 +1,13 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.html.queries.helpers import has_non_errored_urls_without_html_data +from src.db.queries.base.builder import QueryBuilderBase + + +class PendingURLsWithoutHTMLDataPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + statement = has_non_errored_urls_without_html_data() + statement = statement.limit(1) + scalar_result = await session.scalars(statement) + return bool(scalar_result.first()) \ No newline at end of file diff --git a/src/db/client/async_.py b/src/db/client/async_.py index e30c13bf..6377fa60 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -36,13 +36,13 @@ from src.api.endpoints.task.dtos.get.tasks import GetTasksResponse, GetTasksResponseTaskInfo from src.api.endpoints.url.get.dto import GetURLsResponseInfo from src.api.endpoints.url.get.query import GetURLsQueryBuilder -from src.collectors.enums import URLStatus, CollectorType +from src.collectors.enums import CollectorType from src.collectors.queries.insert.urls.query import InsertURLsQueryBuilder from src.core.enums import BatchStatus, RecordType from src.core.env_var_manager import EnvVarManager from src.core.tasks.scheduled.impl.huggingface.queries.state import 
SetHuggingFaceUploadStateQueryBuilder from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.html.queries.get import \ +from src.core.tasks.url.operators.html.queries.get.query import \ GetPendingURLsWithoutHTMLDataQueryBuilder from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.db.client.helpers import add_standard_limit_and_offset @@ -83,7 +83,7 @@ from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.models.templates_.base import Base -from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum +from src.db.models.materialized_views.batch_url_status.enums import BatchURLStatusViewEnum from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.recent_batch_summaries.builder import GetRecentBatchSummariesQueryBuilder from src.db.queries.implementations.core.metrics.urls.aggregated.pending import \ @@ -321,14 +321,6 @@ async def add_user_record_type_suggestion( # endregion record_type - - @session_manager - async def has_non_errored_urls_without_html_data(self, session: AsyncSession) -> bool: - statement = self.statement_composer.has_non_errored_urls_without_html_data() - statement = statement.limit(1) - scalar_result = await session.scalars(statement) - return bool(scalar_result.first()) - @session_manager async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URLMiscellaneousMetadataTDO]): updates = [] @@ -700,7 +692,7 @@ async def get_batch_summaries( session, page: int, collector_type: CollectorType | None = None, - status: BatchURLStatusEnum | None = None, + status: BatchURLStatusViewEnum | None = None, ) -> GetBatchSummariesResponse: # Get only the batch_id, collector_type, status, and created_at builder = GetRecentBatchSummariesQueryBuilder( @@ -831,7 
+823,6 @@ async def populate_backlog_snapshot( ) .outerjoin(FlagURLValidated, URL.id == FlagURLValidated.url_id) .where( - URL.status == URLStatus.OK.value, FlagURLValidated.url_id.is_(None), ) ) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index e29909cf..c5d90167 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -124,7 +124,6 @@ def insert_url(self, session, url_info: URLInfo) -> int: url=url_and_scheme.url, scheme=url_and_scheme.scheme, collector_metadata=url_info.collector_metadata, - status=url_info.status, name=url_info.name, trailing_slash=url_and_scheme.url.endswith('/'), source=url_info.source diff --git a/src/db/models/impl/url/core/pydantic/info.py b/src/db/models/impl/url/core/pydantic/info.py index 0985b3fc..74082427 100644 --- a/src/db/models/impl/url/core/pydantic/info.py +++ b/src/db/models/impl/url/core/pydantic/info.py @@ -1,9 +1,7 @@ import datetime -from typing import Optional from pydantic import BaseModel -from src.collectors.enums import URLStatus from src.db.models.impl.url.core.enums import URLSource @@ -12,7 +10,6 @@ class URLInfo(BaseModel): batch_id: int | None= None url: str collector_metadata: dict | None = None - status: URLStatus = URLStatus.OK updated_at: datetime.datetime | None = None created_at: datetime.datetime | None = None name: str | None = None diff --git a/src/db/models/impl/url/core/pydantic/insert.py b/src/db/models/impl/url/core/pydantic/insert.py index ed73b6c1..643cab15 100644 --- a/src/db/models/impl/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -1,5 +1,3 @@ -from src.collectors.enums import URLStatus -from src.core.enums import RecordType from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.templates_.base import Base @@ -17,6 +15,5 @@ def sa_model(cls) -> type[Base]: scheme: str | None = None collector_metadata: dict | None = None name: str | None = None - status: 
URLStatus = URLStatus.OK source: URLSource trailing_slash: bool \ No newline at end of file diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 45e8b45b..b9eedc5c 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -2,7 +2,6 @@ from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import relationship, Mapped -from src.collectors.enums import URLStatus from src.db.models.helpers import enum_column from src.db.models.impl.annotation.agency.anon.sqlalchemy import AnnotationAgencyAnon from src.db.models.impl.annotation.agency.auto.subtask.sqlalchemy import AnnotationAgencyAutoSubtask @@ -10,19 +9,18 @@ from src.db.models.impl.annotation.location.anon.sqlalchemy import AnnotationLocationAnon from src.db.models.impl.annotation.location.auto.subtask.sqlalchemy import AnnotationLocationAutoSubtask from src.db.models.impl.annotation.location.user.sqlalchemy import AnnotationLocationUser -from src.db.models.impl.annotation.name.anon.sqlalchemy import AnnotationNameAnonEndorsement from src.db.models.impl.annotation.name.suggestion.sqlalchemy import AnnotationNameSuggestion from src.db.models.impl.annotation.record_type.anon.sqlalchemy import AnnotationRecordTypeAnon +from src.db.models.impl.annotation.record_type.auto.sqlalchemy import AnnotationAutoRecordType +from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from src.db.models.impl.annotation.url_type.anon.sqlalchemy import AnnotationURLTypeAnon +from src.db.models.impl.annotation.url_type.auto.sqlalchemy import AnnotationAutoURLType +from src.db.models.impl.annotation.url_type.user.sqlalchemy import AnnotationURLTypeUser from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.impl.url.core.enums import 
URLSource from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType -from src.db.models.impl.annotation.record_type.auto.sqlalchemy import AnnotationAutoRecordType -from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser -from src.db.models.impl.annotation.url_type.auto.sqlalchemy import AnnotationAutoURLType -from src.db.models.impl.annotation.url_type.user.sqlalchemy import AnnotationURLTypeUser from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin from src.db.models.templates_.with_id import WithIDBase @@ -39,11 +37,6 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): # The metadata from the collector collector_metadata = Column(JSON) # The outcome of the URL: submitted, human_labeling, rejected, duplicate, etc. - status: Mapped[URLStatus] = enum_column( - URLStatus, - name='url_status', - nullable=False - ) trailing_slash = Column(Boolean, nullable=False) @hybrid_property diff --git a/src/db/models/views/batch_url_status/__init__.py b/src/db/models/materialized_views/batch_url_status/__init__.py similarity index 100% rename from src/db/models/views/batch_url_status/__init__.py rename to src/db/models/materialized_views/batch_url_status/__init__.py diff --git a/src/db/models/views/batch_url_status/core.py b/src/db/models/materialized_views/batch_url_status/core.py similarity index 98% rename from src/db/models/views/batch_url_status/core.py rename to src/db/models/materialized_views/batch_url_status/core.py index 1ec0711d..12d2872e 100644 --- a/src/db/models/views/batch_url_status/core.py +++ b/src/db/models/materialized_views/batch_url_status/core.py @@ -66,7 +66,7 @@ from src.db.models.templates_.base import Base -class BatchURLStatusMatView( +class BatchURLStatusMaterializedView( Base, ViewMixin, BatchDependentMixin diff --git 
a/src/db/models/views/batch_url_status/enums.py b/src/db/models/materialized_views/batch_url_status/enums.py similarity index 81% rename from src/db/models/views/batch_url_status/enums.py rename to src/db/models/materialized_views/batch_url_status/enums.py index 2f524de4..2ce74325 100644 --- a/src/db/models/views/batch_url_status/enums.py +++ b/src/db/models/materialized_views/batch_url_status/enums.py @@ -1,7 +1,7 @@ from enum import Enum -class BatchURLStatusEnum(Enum): +class BatchURLStatusViewEnum(Enum): ERROR = "Error" NO_URLS = "No URLs" LABELING_COMPLETE = "Labeling Complete" diff --git a/src/db/models/views/url_status/__init__.py b/src/db/models/materialized_views/url_status/__init__.py similarity index 100% rename from src/db/models/views/url_status/__init__.py rename to src/db/models/materialized_views/url_status/__init__.py diff --git a/src/db/models/views/url_status/enums.py b/src/db/models/materialized_views/url_status/enums.py similarity index 100% rename from src/db/models/views/url_status/enums.py rename to src/db/models/materialized_views/url_status/enums.py diff --git a/src/db/models/materialized_views/url_status/sqlalchemy.py b/src/db/models/materialized_views/url_status/sqlalchemy.py new file mode 100644 index 00000000..fe6c2466 --- /dev/null +++ b/src/db/models/materialized_views/url_status/sqlalchemy.py @@ -0,0 +1,15 @@ +from sqlalchemy.orm import Mapped + +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + + +class URLStatusMaterializedView( + Base, + URLDependentViewMixin +): + + __tablename__ = "url_status_mat_view" + + status: Mapped[str] + code: Mapped[int] \ No newline at end of file diff --git a/src/db/models/views/url_status/core.py b/src/db/models/views/url_status/core.py deleted file mode 100644 index be771fe5..00000000 --- a/src/db/models/views/url_status/core.py +++ /dev/null @@ -1,72 +0,0 @@ -""" - CREATE MATERIALIZED VIEW url_status_mat_view AS - with - 
urls_with_relevant_errors as ( - select - ute.url_id - from - url_task_error ute - where - ute.task_type in ( - 'Screenshot', - 'HTML', - 'URL Probe' - ) - ) - select - u.id as url_id, - case - when ( - -- Validated as not relevant, individual record, or not found - fuv.type in ('not relevant', 'individual record', 'not found') - -- Has Meta URL in data sources app - OR udmu.url_id is not null - -- Has data source in data sources app - OR uds.url_id is not null - ) Then 'Submitted/Pipeline Complete' - when fuv.type is not null THEN 'Accepted' - when ( - -- Has compressed HTML - uch.url_id is not null - AND - -- Has web metadata - uwm.url_id is not null - AND - -- Has screenshot - us.url_id is not null - ) THEN 'Community Labeling' - when uwre.url_id is not null then 'Error' - ELSE 'Intake' - END as status - - from - urls u - left join urls_with_relevant_errors uwre - on u.id = uwre.url_id - left join url_screenshot us - on u.id = us.url_id - left join url_compressed_html uch - on u.id = uch.url_id - left join url_web_metadata uwm - on u.id = uwm.url_id - left join flag_url_validated fuv - on u.id = fuv.url_id - left join url_ds_meta_url udmu - on u.id = udmu.url_id - left join url_data_source uds - on u.id = uds.url_id -""" -from sqlalchemy import String, Column - -from src.db.models.helpers import url_id_primary_key_constraint -from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin -from src.db.models.templates_.base import Base - - -class URLStatusMatView( - Base, - URLDependentViewMixin -): - __tablename__ = "url_status_mat_view" - - status = Column(String) \ No newline at end of file diff --git a/src/db/queries/implementations/core/common/annotation_exists_/core.py b/src/db/queries/implementations/core/common/annotation_exists_/core.py index 53e8bcf6..4c7328a2 100644 --- a/src/db/queries/implementations/core/common/annotation_exists_/core.py +++ b/src/db/queries/implementations/core/common/annotation_exists_/core.py @@ -16,12 
+16,11 @@ from sqlalchemy import case, func, Select, select -from src.collectors.enums import URLStatus -from src.db.queries.implementations.core.common.annotation_exists_.constants import ALL_ANNOTATION_MODELS from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase +from src.db.queries.implementations.core.common.annotation_exists_.constants import ALL_ANNOTATION_MODELS class AnnotationExistsCTEQueryBuilder(QueryBuilderBase): @@ -73,7 +72,6 @@ async def build(self) -> Any: FlagURLValidated.url_id == URL.id ) anno_exists_query = anno_exists_query.where( - URL.status == URLStatus.OK.value, FlagURLValidated.url_id.is_(None) ) anno_exists_query = anno_exists_query.group_by(URL.id).cte("annotations_exist") diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py index 5de2eb55..f5696e7e 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py @@ -5,10 +5,9 @@ from src.api.endpoints.batch.dtos.get.summaries.counts import BatchSummaryURLCounts from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.collectors.enums import CollectorType -from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.views.batch_url_status.core import BatchURLStatusMatView -from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum +from src.db.models.materialized_views.batch_url_status.core import BatchURLStatusMaterializedView +from src.db.models.materialized_views.batch_url_status.enums import BatchURLStatusViewEnum from src.db.queries.base.builder import QueryBuilderBase from 
src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.builder import URLCountsCTEQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels @@ -20,7 +19,7 @@ def __init__( self, page: int = 1, collector_type: CollectorType | None = None, - status: BatchURLStatusEnum | None = None, + status: BatchURLStatusViewEnum | None = None, batch_id: int | None = None, ): super().__init__() @@ -41,7 +40,7 @@ async def run(self, session: AsyncSession) -> list[BatchSummary]: *builder.get_all(), Batch.strategy, Batch.status, - BatchURLStatusMatView.batch_url_status, + BatchURLStatusMaterializedView.batch_url_status, Batch.parameters, Batch.user_id, Batch.compute_time, @@ -50,8 +49,8 @@ async def run(self, session: AsyncSession) -> list[BatchSummary]: builder.query, builder.get(count_labels.batch_id) == Batch.id, ).outerjoin( - BatchURLStatusMatView, - BatchURLStatusMatView.batch_id == Batch.id, + BatchURLStatusMaterializedView, + BatchURLStatusMaterializedView.batch_id == Batch.id, ).order_by( Batch.id.asc() ) @@ -75,7 +74,6 @@ async def run(self, session: AsyncSession) -> list[BatchSummary]: date_generated=row.date_generated, url_counts=BatchSummaryURLCounts( total=row[count_labels.total], - duplicate=row[count_labels.duplicate], not_relevant=row[count_labels.not_relevant], submitted=row[count_labels.submitted], errored=row[count_labels.error], diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 27240b7d..7192f1fa 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -1,20 +1,13 @@ -from sqlalchemy import Select, case, Label, and_, exists -from sqlalchemy.sql.functions import count, coalesce, func +from sqlalchemy import 
Select +from sqlalchemy.sql.functions import func -from src.collectors.enums import URLStatus, CollectorType -from src.core.enums import BatchStatus -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL +from src.collectors.enums import CollectorType from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource -from src.db.models.views.batch_url_status.core import BatchURLStatusMatView -from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum +from src.db.models.materialized_views.batch_url_status.core import BatchURLStatusMaterializedView +from src.db.models.materialized_views.batch_url_status.enums import BatchURLStatusViewEnum from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.helpers import add_page_offset from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.all import ALL_CTE -from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.duplicate import DUPLICATE_CTE from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.error import ERROR_CTE from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.not_relevant import NOT_RELEVANT_CTE from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.pending import PENDING_CTE @@ -28,7 +21,7 @@ def __init__( self, page: int = 1, collector_type: CollectorType | None = None, - status: BatchURLStatusEnum | None = None, + status: BatchURLStatusViewEnum | None = None, batch_id: int | None = None ): super().__init__(URLCountsLabels()) @@ -43,7 +36,6 @@ def get_core_query(self): query = ( Select( Batch.id.label(labels.batch_id), - func.coalesce(DUPLICATE_CTE.count, 
0).label(labels.duplicate), func.coalesce(SUBMITTED_CTE.count, 0).label(labels.submitted), func.coalesce(PENDING_CTE.count, 0).label(labels.pending), func.coalesce(ALL_CTE.count, 0).label(labels.total), @@ -52,11 +44,11 @@ def get_core_query(self): ) .select_from(Batch) .join( - BatchURLStatusMatView, - BatchURLStatusMatView.batch_id == Batch.id, + BatchURLStatusMaterializedView, + BatchURLStatusMaterializedView.batch_id == Batch.id, ) ) - for cte in [DUPLICATE_CTE, SUBMITTED_CTE, PENDING_CTE, ALL_CTE, NOT_RELEVANT_CTE, ERROR_CTE]: + for cte in [SUBMITTED_CTE, PENDING_CTE, ALL_CTE, NOT_RELEVANT_CTE, ERROR_CTE]: query = query.outerjoin( cte.cte, Batch.id == cte.batch_id @@ -86,4 +78,4 @@ def apply_collector_type_filter(self, query: Select): def apply_status_filter(self, query: Select): if self.status is None: return query - return query.where(BatchURLStatusMatView.batch_url_status == self.status.value) + return query.where(BatchURLStatusMaterializedView.batch_url_status == self.status.value) diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py deleted file mode 100644 index 906dd49c..00000000 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py +++ /dev/null @@ -1,29 +0,0 @@ -from sqlalchemy import select, func - -from src.collectors.enums import URLStatus -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ - URLCountsCTEContainer - -DUPLICATE_CTE = URLCountsCTEContainer( - select( - Batch.id, - func.count(URL.id).label("duplicate_count") - ) - .join( - LinkBatchURL, - LinkBatchURL.batch_id == Batch.id, - ) - .join( - URL, - URL.id == 
LinkBatchURL.url_id, - ) - .where( - URL.status == URLStatus.DUPLICATE - ) - .group_by( - Batch.id - ).cte("duplicate_count") -) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py index 953a5c0d..2109588b 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py @@ -1,6 +1,5 @@ from sqlalchemy import select, func -from src.collectors.enums import URLStatus from src.db.helpers.query import exists_url from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/labels.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/labels.py index c55d8f45..72806c13 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/labels.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/labels.py @@ -11,6 +11,5 @@ class URLCountsLabels(LabelsBase): submitted: str = "count_submitted" not_relevant: str = "count_not_relevant" error: str = "count_error" - duplicate: str = "count_duplicate" diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index c224fa40..e95726bf 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -4,15 +4,15 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.metrics.dtos.get.urls.aggregated.pending import GetMetricsURLsAggregatedPendingResponseDTO -from src.collectors.enums import URLStatus from 
src.db.models.impl.annotation.agency.user.sqlalchemy import AnnotationAgencyUser -from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from src.db.models.impl.annotation.url_type.user.sqlalchemy import AnnotationURLTypeUser +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder + class PendingAnnotationExistsCTEQueryBuilder(AnnotationExistsCTEQueryBuilder): @property @@ -43,9 +43,7 @@ async def build(self) -> Any: URL, URL.id == self.url_id ) - .where( - URL.status == URLStatus.OK.value - ).cte("pending") + .cte("pending") ) diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index faa965a8..d3e90b8b 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -1,20 +1,9 @@ -from http import HTTPStatus -from typing import Any +from sqlalchemy import Select, select, exists, func, Subquery, not_, ColumnElement -from sqlalchemy import Select, select, exists, func, Subquery, and_, not_, ColumnElement -from sqlalchemy.orm import selectinload - -from src.collectors.enums import URLStatus -from src.db.enums import TaskType from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.link.task_url import LinkTaskURL -from src.db.models.impl.task.core import Task -from src.db.models.impl.task.enums import TaskStatus from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata -from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo -from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.types import 
UserSuggestionType @@ -23,46 +12,6 @@ class StatementComposer: Assists in the composition of SQLAlchemy statements """ - @staticmethod - def has_non_errored_urls_without_html_data() -> Select: - exclude_subquery = ( - select(1). - select_from(LinkTaskURL). - join(Task, LinkTaskURL.task_id == Task.id). - where(LinkTaskURL.url_id == URL.id). - where(Task.task_type == TaskType.HTML.value). - where(Task.task_status == TaskStatus.COMPLETE.value) - ) - query = ( - select(URL) - .join(URLWebMetadata) - .outerjoin(URLScrapeInfo) - .where( - URLScrapeInfo.url_id == None, - ~exists(exclude_subquery), - URLWebMetadata.status_code == HTTPStatus.OK.value, - URLWebMetadata.content_type.like("%html%"), - ) - .options( - selectinload(URL.batch) - ) - ) - return query - - @staticmethod - def exclude_urls_with_extant_model( - statement: Select, - model: Any - ): - return (statement.where( - ~exists( - select(model.id). - where( - model.url_id == URL.id - ) - ) - )) - @staticmethod def simple_count_subquery(model, attribute: str, label: str) -> Subquery: attr_value = getattr(model, attribute) @@ -74,12 +23,9 @@ def simple_count_subquery(model, attribute: str, label: str) -> Subquery: @staticmethod def pending_urls_missing_miscellaneous_metadata_query() -> Select: query = select(URL).where( - and_( - URL.status == URLStatus.OK.value, URL.name == None, URL.description == None, URLOptionalDataSourceMetadata.url_id == None - ) ).outerjoin( URLOptionalDataSourceMetadata ).join( diff --git a/tests/alembic/helpers.py b/tests/alembic/helpers.py index a284e0fc..0e19d035 100644 --- a/tests/alembic/helpers.py +++ b/tests/alembic/helpers.py @@ -1,5 +1,3 @@ -from typing import Optional - from sqlalchemy import text from sqlalchemy.orm import Session diff --git a/tests/automated/integration/api/_helpers/RequestValidator.py b/tests/automated/integration/api/_helpers/RequestValidator.py index 0db00cb3..b1bfbf20 100644 --- a/tests/automated/integration/api/_helpers/RequestValidator.py +++ 
b/tests/automated/integration/api/_helpers/RequestValidator.py @@ -10,7 +10,6 @@ from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary -from src.api.shared.models.message_response import MessageResponse from src.api.endpoints.batch.duplicates.dto import GetDuplicatesByBatchResponse from src.api.endpoints.batch.urls.dto import GetURLsByBatchResponse from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO @@ -32,11 +31,12 @@ from src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo from src.api.endpoints.task.dtos.get.tasks import GetTasksResponse from src.api.endpoints.url.get.dto import GetURLsResponseInfo +from src.api.shared.models.message_response import MessageResponse from src.collectors.enums import CollectorType from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.core.enums import BatchStatus from src.db.enums import TaskType -from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum +from src.db.models.materialized_views.batch_url_status.enums import BatchURLStatusViewEnum from src.util.helper_functions import update_if_not_none @@ -268,7 +268,7 @@ def delete( def get_batch_statuses( self, collector_type: CollectorType | None = None, - status: BatchURLStatusEnum | None = None, + status: BatchURLStatusViewEnum | None = None, ) -> GetBatchSummariesResponse: params = {} update_if_not_none( diff --git a/tests/automated/integration/api/annotate/all/test_anon_count.py b/tests/automated/integration/api/annotate/all/test_anon_count.py index 16fe728b..05975236 100644 --- a/tests/automated/integration/api/annotate/all/test_anon_count.py +++ b/tests/automated/integration/api/annotate/all/test_anon_count.py @@ -1,6 +1,7 @@ -import pytest import uuid +import pytest + from 
src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse from src.core.enums import RecordType from src.db.models.impl.annotation.agency.anon.sqlalchemy import AnnotationAgencyAnon diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index a356fa56..8a62c3e8 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -11,10 +11,10 @@ from src.db.models.impl.annotation.agency.user.sqlalchemy import AnnotationAgencyUser from src.db.models.impl.annotation.location.user.sqlalchemy import AnnotationLocationUser from src.db.models.impl.annotation.name.suggestion.sqlalchemy import AnnotationNameSuggestion -from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.annotation.name.user.sqlalchemy import AnnotationNameUserEndorsement from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from src.db.models.impl.annotation.url_type.user.sqlalchemy import AnnotationURLTypeUser +from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/api/annotate/all/test_sorting.py b/tests/automated/integration/api/annotate/all/test_sorting.py index 1a81dc89..2f9f7b2a 100644 --- a/tests/automated/integration/api/annotate/all/test_sorting.py +++ b/tests/automated/integration/api/annotate/all/test_sorting.py @@ -1,7 +1,6 @@ import pytest from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.location__user_follow import LinkLocationUserFollow from src.db.models.impl.link.location_batch.sqlalchemy 
import LinkLocationBatch from src.db.models.impl.url.core.enums import URLSource diff --git a/tests/automated/integration/api/annotate/anonymous/test_core.py b/tests/automated/integration/api/annotate/anonymous/test_core.py index e977accb..65f18965 100644 --- a/tests/automated/integration/api/annotate/anonymous/test_core.py +++ b/tests/automated/integration/api/annotate/anonymous/test_core.py @@ -15,6 +15,7 @@ from src.db.models.impl.annotation.agency.user.sqlalchemy import AnnotationAgencyUser from src.db.models.impl.annotation.location.anon.sqlalchemy import AnnotationLocationAnon from src.db.models.impl.annotation.location.user.sqlalchemy import AnnotationLocationUser +from src.db.models.impl.annotation.name.anon.sqlalchemy import AnnotationNameAnonEndorsement from src.db.models.impl.annotation.name.suggestion.sqlalchemy import AnnotationNameSuggestion from src.db.models.impl.annotation.name.user.sqlalchemy import AnnotationNameUserEndorsement from src.db.models.impl.annotation.record_type.anon.sqlalchemy import AnnotationRecordTypeAnon @@ -22,7 +23,6 @@ from src.db.models.impl.annotation.url_type.anon.sqlalchemy import AnnotationURLTypeAnon from src.db.models.impl.annotation.url_type.user.sqlalchemy import AnnotationURLTypeUser from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.annotation.name.anon.sqlalchemy import AnnotationNameAnonEndorsement from src.db.models.mixins import URLDependentMixin from tests.automated.integration.api.annotate.anonymous.helper import get_next_url_for_anonymous_annotation, \ post_and_get_next_url_for_anonymous_annotation diff --git a/tests/automated/integration/api/batch/summaries/test_happy_path.py b/tests/automated/integration/api/batch/summaries/test_happy_path.py index 6af9ce2b..126f1118 100644 --- a/tests/automated/integration/api/batch/summaries/test_happy_path.py +++ b/tests/automated/integration/api/batch/summaries/test_happy_path.py @@ -68,7 +68,6 @@ async def 
test_get_batch_summaries(api_test_helper): assert counts_1.pending == 1 assert counts_1.submitted == 2 assert counts_1.not_relevant == 0 - assert counts_1.duplicate == 0 assert counts_1.errored == 0 result_2 = results[1] @@ -79,7 +78,6 @@ async def test_get_batch_summaries(api_test_helper): assert counts_2.errored == 0 assert counts_2.pending == 0 assert counts_2.submitted == 0 - assert counts_2.duplicate == 0 result_3 = results[2] assert result_3.id == batch_3_id @@ -89,4 +87,3 @@ async def test_get_batch_summaries(api_test_helper): assert counts_3.errored == 0 assert counts_3.pending == 7 assert counts_3.submitted == 1 - assert counts_3.duplicate == 7 diff --git a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py index f4181629..7ebc4ccf 100644 --- a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py +++ b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py @@ -3,7 +3,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.dtos.url.mapping_.simple import SimpleURLMapping -from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum +from src.db.models.materialized_views.batch_url_status.enums import BatchURLStatusViewEnum from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator @@ -52,7 +52,7 @@ async def test_get_batch_summaries_pending_url_filter(api_test_helper): # Test filter for pending URLs and only retrieve the second batch pending_urls_results = ath.request_validator.get_batch_statuses( - status=BatchURLStatusEnum.HAS_UNLABELED_URLS + status=BatchURLStatusViewEnum.HAS_UNLABELED_URLS ) assert len(pending_urls_results.results) == 1 diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py 
index 3d84d6d7..00936d15 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -1,6 +1,6 @@ import pytest -from src.collectors.enums import CollectorType, URLStatus +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient from src.db.dtos.url.mapping_.simple import SimpleURLMapping @@ -25,12 +25,10 @@ async def test_get_batches_aggregated_metrics( ) url_mappings_broken: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, - status=URLStatus.BROKEN, count=4, ) url_mappings_ok: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, - status=URLStatus.OK, count=11, ) url_mappings_all: list[SimpleURLMapping] = url_mappings_broken + url_mappings_ok diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index 6921c3c1..71b7c96b 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -2,7 +2,7 @@ import pytest -from src.collectors.enums import CollectorType, URLStatus +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient from src.db.dtos.url.mapping_.simple import SimpleURLMapping diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index 181c295e..a6de442e 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -1,7 +1,6 @@ import pendulum import pytest -from src.collectors.enums import URLStatus from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from 
tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index e203b722..5dc163c7 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -2,7 +2,7 @@ import pytest -from src.collectors.enums import CollectorType, URLStatus +from src.collectors.enums import CollectorType from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters @@ -46,7 +46,7 @@ async def test_get_urls_aggregated_metrics(api_test_helper): batch_2: int = await ddc.create_batch( strategy=CollectorType.AUTO_GOOGLER, ) - url_mappings_2_ok: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) + url_mappings_2_ok: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=4) url_mappings_2_validated: list[SimpleURLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLType.DATA_SOURCE) url_mappings_2_not_relevant: list[SimpleURLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLType.NOT_RELEVANT) url_ids_2_validated: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_validated] diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py index d0a25ab1..a9a52d2e 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py @@ -1,7 +1,7 @@ import pendulum import pytest -from src.collectors.enums import CollectorType, URLStatus +from src.collectors.enums import CollectorType from 
tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters diff --git a/tests/automated/integration/api/proposals/test_agencies.py b/tests/automated/integration/api/proposals/test_agencies.py index d1a2d2ab..354481f1 100644 --- a/tests/automated/integration/api/proposals/test_agencies.py +++ b/tests/automated/integration/api/proposals/test_agencies.py @@ -3,9 +3,9 @@ from src.api.endpoints.proposals.agencies.by_id.approve.response import ProposalAgencyApproveResponse from src.api.endpoints.proposals.agencies.by_id.locations.get.response import ProposalAgencyGetLocationsOuterResponse from src.api.endpoints.proposals.agencies.by_id.put.request import ProposalAgencyPutRequest -from src.api.endpoints.proposals.agencies.root.get.response import ProposalAgencyGetOuterResponse from src.api.endpoints.proposals.agencies.by_id.reject.request import ProposalAgencyRejectRequestModel from src.api.endpoints.proposals.agencies.by_id.reject.response import ProposalAgencyRejectResponse +from src.api.endpoints.proposals.agencies.root.get.response import ProposalAgencyGetOuterResponse from src.api.endpoints.submit.agency.enums import AgencyProposalRequestStatus from src.api.endpoints.submit.agency.request import SubmitAgencyRequestModel from src.api.endpoints.submit.agency.response import SubmitAgencyProposalResponse diff --git a/tests/automated/integration/api/submit/data_source/test_core.py b/tests/automated/integration/api/submit/data_source/test_core.py index 51bbc93b..bf339bfd 100644 --- a/tests/automated/integration/api/submit/data_source/test_core.py +++ b/tests/automated/integration/api/submit/data_source/test_core.py @@ -4,7 +4,6 @@ import pytest from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest -from src.collectors.enums import URLStatus from 
src.core.enums import RecordType, BatchStatus from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.annotation.agency.anon.sqlalchemy import AnnotationAgencyAnon @@ -78,7 +77,6 @@ async def test_submit_data_source( assert url.scheme == "https" assert url.trailing_slash == True assert url.source == URLSource.MANUAL - assert url.status == URLStatus.OK assert url.description == "Example description" # Check for Batch diff --git a/tests/automated/integration/api/submit/data_source/test_duplicate.py b/tests/automated/integration/api/submit/data_source/test_duplicate.py index ea16e1ec..87dd21a7 100644 --- a/tests/automated/integration/api/submit/data_source/test_duplicate.py +++ b/tests/automated/integration/api/submit/data_source/test_duplicate.py @@ -1,12 +1,13 @@ import pytest from fastapi import HTTPException -from src.api.endpoints.submit.data_source.models.response.duplicate import SubmitDataSourceURLDuplicateSubmissionResponse +from src.api.endpoints.submit.data_source.models.response.duplicate import \ + SubmitDataSourceURLDuplicateSubmissionResponse from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest -from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.materialized_views.url_status.enums import URLStatusViewEnum from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo @@ -18,6 +19,7 @@ async def test_submit_data_source_duplicate( pittsburgh_locality: LocalityCreationInfo, test_url_data_source_mapping: SimpleURLMapping ): + await api_test_helper.adb_client().refresh_materialized_views() ath = api_test_helper try: @@ -34,5 +36,5 @@ async def test_submit_data_source_duplicate( model = SubmitDataSourceURLDuplicateSubmissionResponse(**response) assert 
model.url_id == test_url_data_source_mapping.url_id assert model.url_type == URLType.DATA_SOURCE - assert model.url_status == URLStatus.OK + assert model.url_status == URLStatusViewEnum.AWAITING_SUBMISSION assert model.message == "Duplicate URL found" diff --git a/tests/automated/integration/api/submit/test_url_maximal.py b/tests/automated/integration/api/submit/test_url_maximal.py index 1d458c98..5e9f0ec4 100644 --- a/tests/automated/integration/api/submit/test_url_maximal.py +++ b/tests/automated/integration/api/submit/test_url_maximal.py @@ -10,9 +10,9 @@ from src.db.models.impl.annotation.name.suggestion.enums import NameSuggestionSource from src.db.models.impl.annotation.name.suggestion.sqlalchemy import AnnotationNameSuggestion from src.db.models.impl.annotation.name.user.sqlalchemy import AnnotationNameUserEndorsement +from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from src.db.models.impl.link.user_suggestion_not_found.users_submitted_url.sqlalchemy import LinkUserSubmittedURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index fa3f7884..ad8bfe3f 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -2,12 +2,12 @@ import pytest from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO -from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata -from 
src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.batch.sqlalchemy import Batch from src.collectors.enums import CollectorType from src.core.enums import RecordType +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata @pytest.mark.asyncio diff --git a/tests/automated/integration/api/url/by_id/delete/test_any_url.py b/tests/automated/integration/api/url/by_id/delete/test_any_url.py index 2711c103..d61f1553 100644 --- a/tests/automated/integration/api/url/by_id/delete/test_any_url.py +++ b/tests/automated/integration/api/url/by_id/delete/test_any_url.py @@ -19,8 +19,13 @@ from src.db.models.impl.annotation.location.user.sqlalchemy import AnnotationLocationUser from src.db.models.impl.annotation.name.suggestion.enums import NameSuggestionSource from src.db.models.impl.annotation.name.suggestion.sqlalchemy import AnnotationNameSuggestion +from src.db.models.impl.annotation.name.user.sqlalchemy import AnnotationNameUserEndorsement from src.db.models.impl.annotation.record_type.anon.sqlalchemy import AnnotationRecordTypeAnon +from src.db.models.impl.annotation.record_type.auto.sqlalchemy import AnnotationAutoRecordType +from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from src.db.models.impl.annotation.url_type.anon.sqlalchemy import AnnotationURLTypeAnon +from src.db.models.impl.annotation.url_type.auto.sqlalchemy import AnnotationAutoURLType +from src.db.models.impl.annotation.url_type.user.sqlalchemy import AnnotationURLTypeUser from src.db.models.impl.change_log import ChangeLog from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL @@ -29,7 +34,6 @@ from 
src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL -from src.db.models.impl.annotation.name.user.sqlalchemy import AnnotationNameUserEndorsement from src.db.models.impl.link.user_suggestion_not_found.agency.sqlalchemy import LinkUserSuggestionAgencyNotFound from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound from src.db.models.impl.link.user_suggestion_not_found.users_submitted_url.sqlalchemy import LinkUserSubmittedURL @@ -40,10 +44,6 @@ from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata from src.db.models.impl.url.internet_archives.save.sqlalchemy import URLInternetArchivesSaveMetadata from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot -from src.db.models.impl.annotation.record_type.auto.sqlalchemy import AnnotationAutoRecordType -from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser -from src.db.models.impl.annotation.url_type.auto.sqlalchemy import AnnotationAutoURLType -from src.db.models.impl.annotation.url_type.user.sqlalchemy import AnnotationURLTypeUser from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.queries.implementations.anonymous_session import MakeAnonymousSessionQueryBuilder diff --git a/tests/automated/integration/api/url/by_id/snapshot/test_not_found.py b/tests/automated/integration/api/url/by_id/snapshot/test_not_found.py index cce84649..155b56d7 100644 --- a/tests/automated/integration/api/url/by_id/snapshot/test_not_found.py +++ b/tests/automated/integration/api/url/by_id/snapshot/test_not_found.py @@ -1,7 +1,8 @@ import pytest +from fastapi import Response from tests.helpers.api_test_helper import 
APITestHelper -from fastapi import Response + @pytest.mark.asyncio async def test_get_url_screenshot_not_found(api_test_helper: APITestHelper): diff --git a/tests/automated/integration/api/url/test_get.py b/tests/automated/integration/api/url/test_get.py index 8c95c670..d1607f7c 100644 --- a/tests/automated/integration/api/url/test_get.py +++ b/tests/automated/integration/api/url/test_get.py @@ -28,6 +28,7 @@ async def test_get_urls(api_test_helper: APITestHelper): # Add errors await db_data_creator.task_errors(url_ids=url_ids) + await api_test_helper.adb_client().refresh_materialized_views() data: GetURLsResponseInfo = api_test_helper.request_validator.get_urls() assert data.count == 3 diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 22537d20..8a9a8569 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -6,7 +6,6 @@ from starlette.testclient import TestClient from src.api.main import app -from src.collectors.enums import URLStatus from src.collectors.manager import AsyncCollectorManager from src.core.core import AsyncCore from src.core.enums import RecordType @@ -245,21 +244,9 @@ async def test_url_id( url="example.com", source=URLSource.COLLECTOR, trailing_slash=False, - status=URLStatus.OK ) return await db_data_creator.adb_client.add(url, return_id=True) -@pytest_asyncio.fixture -async def test_url_id_2( - db_data_creator: DBDataCreator, -) -> int: - url = URL( - url="example.com/2", - source=URLSource.COLLECTOR, - trailing_slash=False, - status=URLStatus.OK - ) - return await db_data_creator.adb_client.add(url, return_id=True) @pytest_asyncio.fixture diff --git a/tests/automated/integration/core/async_/conclude_task/test_error.py b/tests/automated/integration/core/async_/conclude_task/test_error.py index 1a31b87e..a747aa3a 100644 --- a/tests/automated/integration/core/async_/conclude_task/test_error.py +++ 
b/tests/automated/integration/core/async_/conclude_task/test_error.py @@ -1,6 +1,5 @@ import pytest -from src.core.enums import BatchStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.db.models.impl.task.enums import TaskStatus from tests.automated.integration.core.async_.conclude_task.helpers import setup_run_info diff --git a/tests/automated/integration/core/async_/conclude_task/test_success.py b/tests/automated/integration/core/async_/conclude_task/test_success.py index 03cc5b52..eb0e8988 100644 --- a/tests/automated/integration/core/async_/conclude_task/test_success.py +++ b/tests/automated/integration/core/async_/conclude_task/test_success.py @@ -1,6 +1,5 @@ import pytest -from src.core.enums import BatchStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.db.models.impl.task.enums import TaskStatus from tests.automated.integration.core.async_.conclude_task.helpers import setup_run_info diff --git a/tests/automated/integration/core/async_/run_task/test_break_loop.py b/tests/automated/integration/core/async_/run_task/test_break_loop.py index 71b5704f..0235bc08 100644 --- a/tests/automated/integration/core/async_/run_task/test_break_loop.py +++ b/tests/automated/integration/core/async_/run_task/test_break_loop.py @@ -4,10 +4,10 @@ import pytest from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.enums import TaskType -from src.core.tasks.url.enums import TaskOperatorOutcome from tests.automated.integration.core.async_.helpers import setup_async_core from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/core/async_/run_task/test_prereq_met.py b/tests/automated/integration/core/async_/run_task/test_prereq_met.py index e5425fd9..8d68034f 100644 --- 
a/tests/automated/integration/core/async_/run_task/test_prereq_met.py +++ b/tests/automated/integration/core/async_/run_task/test_prereq_met.py @@ -3,13 +3,11 @@ import pytest -from src.core.enums import BatchStatus from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.enums import TaskType -from src.db.models.impl.task.core import Task from tests.automated.integration.core.async_.helpers import setup_async_core from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py b/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py index c419fb70..a91873a7 100644 --- a/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py +++ b/tests/automated/integration/db/client/annotate_url/test_agency_not_in_db.py @@ -2,8 +2,8 @@ from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency -from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency @pytest.mark.asyncio diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index 9421c1b7..76150283 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -1,7 +1,6 @@ import pytest from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo -from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from 
src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency @@ -9,8 +8,8 @@ from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.reviewing_user import ReviewingUserURL -from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review @pytest.mark.asyncio @@ -42,7 +41,6 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): assert len(urls) == 1 url = urls[0] assert url.id == url_mapping.url_id - assert url.status == URLStatus.OK assert url.name == "Test Name" assert url.description == "Test Description" diff --git a/tests/automated/integration/db/client/approve_url/test_error.py b/tests/automated/integration/db/client/approve_url/test_error.py index f358a74b..c8e33547 100644 --- a/tests/automated/integration/db/client/approve_url/test_error.py +++ b/tests/automated/integration/db/client/approve_url/test_error.py @@ -2,9 +2,8 @@ from starlette.exceptions import HTTPException from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo -from src.core.enums import RecordType -from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review @pytest.mark.asyncio diff --git a/tests/automated/integration/db/client/test_get_next_url_for_annotation_batch_filtering.py b/tests/automated/integration/db/client/test_get_next_url_for_annotation_batch_filtering.py index 86d4a3ee..c32441f3 100644 --- a/tests/automated/integration/db/client/test_get_next_url_for_annotation_batch_filtering.py +++ 
b/tests/automated/integration/db/client/test_get_next_url_for_annotation_batch_filtering.py @@ -2,8 +2,8 @@ from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse from src.core.enums import SuggestionType -from tests.helpers.setup.annotation.core import setup_for_get_next_url_for_annotation from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.setup.annotation.core import setup_for_get_next_url_for_annotation @pytest.mark.asyncio diff --git a/tests/automated/integration/db/structure/test_updated_at.py b/tests/automated/integration/db/structure/test_updated_at.py index 0a4c18a4..d65c44c3 100644 --- a/tests/automated/integration/db/structure/test_updated_at.py +++ b/tests/automated/integration/db/structure/test_updated_at.py @@ -1,9 +1,7 @@ -import asyncio from datetime import datetime import pytest -from src.collectors.enums import URLStatus from src.db.models.impl.url.core.pydantic.upsert import URLUpsertModel from src.db.models.impl.url.core.sqlalchemy import URL from tests.helpers.data_creator.core import DBDataCreator @@ -14,7 +12,6 @@ async def test_updated_at(db_data_creator: DBDataCreator): _ = await db_data_creator.create_urls( count=1, - status=URLStatus.OK ) urls: list[URL] = await db_data_creator.adb_client.get_all(URL) diff --git a/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py b/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py index 16c30869..2abab495 100644 --- a/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py +++ b/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py @@ -3,6 +3,7 @@ from src.api.endpoints.data_source.get.response import DataSourceGetResponse from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + @pytest.mark.asyncio async def test_get_by_id(readonly_helper: ReadOnlyTestHelper): raw_json: dict = readonly_helper.api_test_helper.request_validator.get_v3( diff 
--git a/tests/automated/integration/readonly/setup/annotations.py b/tests/automated/integration/readonly/setup/annotations.py index 9e701f62..6829e714 100644 --- a/tests/automated/integration/readonly/setup/annotations.py +++ b/tests/automated/integration/readonly/setup/annotations.py @@ -4,10 +4,10 @@ from src.db.models.impl.annotation.location.user.sqlalchemy import AnnotationLocationUser from src.db.models.impl.annotation.name.suggestion.enums import NameSuggestionSource from src.db.models.impl.annotation.name.suggestion.sqlalchemy import AnnotationNameSuggestion -from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.annotation.name.user.sqlalchemy import AnnotationNameUserEndorsement from src.db.models.impl.annotation.record_type.user.user import AnnotationRecordTypeUser from src.db.models.impl.annotation.url_type.user.sqlalchemy import AnnotationURLTypeUser +from src.db.models.impl.flag.url_validated.enums import URLType async def add_full_data_sources_annotations( diff --git a/tests/automated/integration/readonly/setup/data_source.py b/tests/automated/integration/readonly/setup/data_source.py index e22929ee..d5984c06 100644 --- a/tests/automated/integration/readonly/setup/data_source.py +++ b/tests/automated/integration/readonly/setup/data_source.py @@ -1,6 +1,5 @@ from datetime import date -from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.flag.url_validated.enums import URLType @@ -27,7 +26,6 @@ async def add_maximal_data_source( collector_metadata={ "url": "https://read-only.com/" }, - status=URLStatus.OK, source=URLSource.COLLECTOR, ) url_id: int = await adb_client.add(url, return_id=True) @@ -82,7 +80,6 @@ async def add_minimal_data_source( name="Minimal name", trailing_slash=False, collector_metadata={}, - status=URLStatus.OK, source=URLSource.ROOT_URL, ) url_id: int = await adb_client.add(url, return_id=True) 
diff --git a/tests/automated/integration/readonly/setup/meta_url.py b/tests/automated/integration/readonly/setup/meta_url.py index 837274bb..d5ea9da4 100644 --- a/tests/automated/integration/readonly/setup/meta_url.py +++ b/tests/automated/integration/readonly/setup/meta_url.py @@ -1,4 +1,3 @@ -from src.collectors.enums import URLStatus from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.core.enums import URLSource @@ -20,7 +19,6 @@ async def add_meta_url( collector_metadata={ "url": "https://read-only-meta-url.com/" }, - status=URLStatus.OK, source=URLSource.REDIRECT, ) url_id: int = await adb_client.add(url, return_id=True) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index 1d1085a5..f8fb2351 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -1,6 +1,5 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.collectors.enums import URLStatus from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL @@ -38,7 +37,6 @@ async def run(self, session: AsyncSession) -> list[int]: url = URL( url=get_test_url(i), scheme=None, - status=URLStatus.OK, name=name, description=description, source=URLSource.COLLECTOR, diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py index 25c4d09d..9c767f71 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py +++ 
b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py @@ -3,7 +3,6 @@ from src.core.enums import RecordType from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator -from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse from src.db.client.async_ import AsyncDatabaseClient from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_not_called from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py index b4abc0ee..d4c9d4c8 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py @@ -1,6 +1,5 @@ import pytest -from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator @@ -9,11 +8,11 @@ from src.db.client.async_ import AsyncDatabaseClient from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_results_called from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import generate_expected_outputs +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ TestPushToHuggingFaceURLSetupEntryInput -from 
tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ - PushToHuggingFaceTestSetupStatusEnum from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py index 4ca89aa1..4ac74f4e 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py @@ -1,6 +1,5 @@ import pytest -from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator @@ -9,11 +8,11 @@ from src.db.client.async_ import AsyncDatabaseClient from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_results_called from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import generate_expected_outputs +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ TestPushToHuggingFaceURLSetupEntryInput -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ - PushToHuggingFaceTestSetupStatusEnum from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/test_entry_not_found.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/test_entry_not_found.py index 8a2157ed..80e6c129 
100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/test_entry_not_found.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/test_entry_not_found.py @@ -4,8 +4,8 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.scheduled.impl.internet_archives.probe.setup import add_urls +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error @pytest.mark.asyncio diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/test_happy_path.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/test_happy_path.py index 90131605..96174e6b 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/test_happy_path.py @@ -6,9 +6,9 @@ from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata from src.external.internet_archives.models.capture import IACapture -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.scheduled.impl.internet_archives.probe.constants import TEST_URL_1, TEST_URL_2 from tests.automated.integration.tasks.scheduled.impl.internet_archives.probe.setup import add_urls +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error @pytest.mark.asyncio diff --git 
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py index fa31dc40..2e57e042 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py @@ -5,7 +5,6 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.external.pdap.client import PDAPClient -from src.external.pdap.enums import DataSourcesURLStatus from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseModel, \ @@ -79,7 +78,6 @@ async def test_add( assert content.access_notes is None assert content.access_types == [] assert content.data_portal_type_other is None - assert content.url_status == DataSourcesURLStatus.OK assert content.agency_ids == [test_agency_id] diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py index 94273019..6d52afc2 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py @@ -1,7 +1,5 @@ from datetime import date -from sqlalchemy import update - from src.api.shared.models.message_response import MessageResponse from src.core.enums import RecordType from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ diff --git 
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py index e63e1496..dcdfb56b 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py @@ -22,6 +22,11 @@ async def test_add( mock_pdap_client: PDAPClient, test_agency_id: int ): + await db_data_creator.create_web_metadata( + url_ids=[test_url_meta_url_id] + ) + + await db_data_creator.adb_client.refresh_materialized_views() operator = DSAppSyncMetaURLsAddTaskOperator( adb_client=adb_client_test, pdap_client=mock_pdap_client @@ -46,7 +51,6 @@ async def test_add( # Run task and confirm runs without error await run_task_and_confirm_success(operator) - # Confirm expected method was called with expected parameters request: AddMetaURLsOuterRequest = extract_and_validate_sync_request( mock_pdap_client, diff --git a/tests/automated/integration/tasks/scheduled/impl/update_url_status/__init__.py b/tests/automated/integration/tasks/scheduled/impl/update_url_status/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py b/tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py deleted file mode 100644 index 6b06fe31..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py +++ /dev/null @@ -1,77 +0,0 @@ -import pytest -from sqlalchemy import update - -from src.collectors.enums import URLStatus -from src.core.tasks.scheduled.impl.update_url_status.operator import UpdateURLStatusOperator -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata -from tests.helpers.data_creator.core import DBDataCreator - - 
-@pytest.mark.asyncio -async def test_update_url_status_task( - test_url_data_source_id: int, - test_url_meta_url_id: int, - adb_client_test: AsyncDatabaseClient, - db_data_creator: DBDataCreator -): - - # Create Operator - operator = UpdateURLStatusOperator( - adb_client=adb_client_test, - ) - - # Add web metadata to URLs - ## Data Source URL: Add 404 - await db_data_creator.create_web_metadata( - url_ids=[test_url_data_source_id], - status_code=404 - ) - - ## Meta URL: Add 200 - await db_data_creator.create_web_metadata( - url_ids=[test_url_meta_url_id], - status_code=200 - ) - - # Run Task - await operator.run_task() - - # Check URLs - urls: list[URL] = await adb_client_test.get_all(URL) - id_status_set_tuple: set[tuple[int, URLStatus]] = { - (url.id, url.status) - for url in urls - } - ## Data Source URL: Status should now be broken - ## Meta URL: Status should be unchanged - assert id_status_set_tuple == { - (test_url_data_source_id, URLStatus.BROKEN), - (test_url_meta_url_id, URLStatus.OK) - } - - # Update Web Metadata for Data Source URL to be 404 - statement = update(URLWebMetadata).where( - URLWebMetadata.url_id == test_url_data_source_id, - ).values( - status_code=200 - ) - await adb_client_test.execute(statement) - - # Run Task - await operator.run_task() - - # Check URLs - urls: list[URL] = await adb_client_test.get_all(URL) - id_status_set_tuple: set[tuple[int, URLStatus]] = { - (url.id, url.status) - for url in urls - } - ## Data Source URL: Status should now be ok - ## Meta URL: Status should be unchanged - assert id_status_set_tuple == { - (test_url_data_source_id, URLStatus.OK), - (test_url_meta_url_id, URLStatus.OK) - } - diff --git a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py index cb70ff8c..4e5bb551 100644 --- a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py +++ 
b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -NUMBER_OF_ENTRIES = 22 +NUMBER_OF_ENTRIES = 21 @pytest.mark.asyncio async def test_happy_path( diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py b/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py index 74e31306..feeba3bd 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py @@ -5,6 +5,7 @@ from src.db.models.impl.annotation.agency.auto.subtask.enum import AutoAgencyIDSubtaskType from tests.helpers.data_creator.core import DBDataCreator + @pytest.mark.asyncio async def test_survey_flag( operator: AgencyIdentificationTaskOperator, diff --git a/tests/automated/integration/tasks/url/impl/html/check/__init__.py b/tests/automated/integration/tasks/url/impl/html/check/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/url/impl/html/check/manager.py b/tests/automated/integration/tasks/url/impl/html/check/manager.py deleted file mode 100644 index deb0fa11..00000000 --- a/tests/automated/integration/tasks/url/impl/html/check/manager.py +++ /dev/null @@ -1,68 +0,0 @@ -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo -from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata -from tests.automated.integration.tasks.url.impl.html.setup.models.record import TestURLHTMLTaskSetupRecord - - -class TestURLHTMLTaskCheckManager: - - def __init__( - self, - adb_client: AsyncDatabaseClient, - records: 
list[TestURLHTMLTaskSetupRecord] - ): - self.adb_client = adb_client - self.records = records - self._id_to_entry = {record.url_id: record.entry for record in records} - - async def check(self): - await self._check_has_html() - await self._check_scrape_status() - await self._check_has_same_url_status() - await self._check_marked_as_404() - - async def _check_has_html(self) -> None: - urls_with_html = [ - record.url_id - for record in self.records - if record.entry.expected_result.has_html - ] - - compressed_html_list: list[URLCompressedHTML] = await self.adb_client.get_all(URLCompressedHTML) - assert len(compressed_html_list) == len(urls_with_html) - for compressed_html in compressed_html_list: - assert compressed_html.url_id in urls_with_html - - async def _check_scrape_status(self) -> None: - urls_with_scrape_status = [ - record.url_id - for record in self.records - if record.entry.expected_result.scrape_status is not None - ] - - url_scrape_info_list: list[URLScrapeInfo] = await self.adb_client.get_all(URLScrapeInfo) - assert len(url_scrape_info_list) == len(urls_with_scrape_status) - for url_scrape_info in url_scrape_info_list: - assert url_scrape_info.url_id in urls_with_scrape_status - entry = self._id_to_entry[url_scrape_info.url_id] - expected_scrape_status = entry.expected_result.scrape_status - assert url_scrape_info.status == expected_scrape_status - - async def _check_has_same_url_status(self): - urls: list[URL] = await self.adb_client.get_all(URL) - for url in urls: - entry = self._id_to_entry[url.id] - if entry.expected_result.web_metadata_status_marked_404: - continue - assert url.status == entry.url_info.status, f"URL {url.url} has outcome {url.status} instead of {entry.url_info.status}" - - async def _check_marked_as_404(self): - web_metadata_list: list[URLWebMetadata] = await self.adb_client.get_all( - URLWebMetadata - ) - for web_metadata in web_metadata_list: - entry = self._id_to_entry[web_metadata.url_id] - if 
entry.expected_result.web_metadata_status_marked_404: - assert web_metadata.status_code == 404, f"URL {entry.url_info.url} has status code {web_metadata.status_code} instead of 404" diff --git a/tests/automated/integration/tasks/url/impl/html/conftest.py b/tests/automated/integration/tasks/url/impl/html/conftest.py new file mode 100644 index 00000000..b73a93e5 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/html/conftest.py @@ -0,0 +1,28 @@ +import types + +import pytest + +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.db.client.async_ import AsyncDatabaseClient +from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.automated.integration.tasks.url.impl.html.mocks.methods import mock_parse + + +class _MockURLRequestInterface: + + async def make_requests_with_html(self, urls: list[str]) -> list[URLResponseInfo]: + return [] + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient +) -> URLHTMLTaskOperator: + html_parser = HTMLResponseParser() + html_parser.parse = types.MethodType(mock_parse, html_parser) + operator = URLHTMLTaskOperator( + adb_client=adb_client_test, + url_request_interface=_MockURLRequestInterface(), + html_parser=html_parser + ) + return operator \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/html/mocks/methods.py b/tests/automated/integration/tasks/url/impl/html/mocks/methods.py index d6799eea..0e0c5657 100644 --- a/tests/automated/integration/tasks/url/impl/html/mocks/methods.py +++ b/tests/automated/integration/tasks/url/impl/html/mocks/methods.py @@ -1,5 +1,3 @@ -from typing import Optional - from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo @@ -10,6 +8,3 @@ async def mock_parse(self, url: str, html_content: str, content_type: str) -> Re description="fake description", ) - -async def 
mock_get_from_cache(self, url: str) -> Optional[str]: - return None diff --git a/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/__init__.py b/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/core.py b/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/core.py deleted file mode 100644 index 49e6b1f3..00000000 --- a/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/core.py +++ /dev/null @@ -1,11 +0,0 @@ -from src.external.url_request.dtos.url_response import URLResponseInfo -from tests.automated.integration.tasks.url.impl.html.mocks.url_request_interface.setup import setup_url_to_response_info - - -class MockURLRequestInterface: - - def __init__(self): - self._url_to_response_info: dict[str, URLResponseInfo] = setup_url_to_response_info() - - async def make_requests_with_html(self, urls: list[str]) -> list[URLResponseInfo]: - return [self._url_to_response_info[url] for url in urls] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py b/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py deleted file mode 100644 index c0dbef6a..00000000 --- a/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py +++ /dev/null @@ -1,57 +0,0 @@ -from http import HTTPStatus - -from src.external.url_request.dtos.url_response import URLResponseInfo -from tests.automated.integration.tasks.url.impl.html.setup.data import TEST_ENTRIES -from tests.automated.integration.tasks.url.impl.html.setup.models.entry import TestURLHTMLTaskSetupEntry, TestErrorType - - -def _get_success( - entry: TestURLHTMLTaskSetupEntry -) -> bool: - if entry.give_error is not None: - return False - return True - -def 
get_http_status( - entry: TestURLHTMLTaskSetupEntry -) -> HTTPStatus: - if entry.give_error is None: - return HTTPStatus.OK - if entry.give_error == TestErrorType.HTTP_404: - return HTTPStatus.NOT_FOUND - return HTTPStatus.INTERNAL_SERVER_ERROR - -def _get_content_type( - entry: TestURLHTMLTaskSetupEntry -) -> str | None: - if entry.give_error is not None: - return None - return "text/html" - -def _generate_test_html() -> str: - return """ - - -
-This is an example of HTML content.
- - - """ - -def setup_url_to_response_info( -) -> dict[str, URLResponseInfo]: - d = {} - for entry in TEST_ENTRIES: - response_info = URLResponseInfo( - success=_get_success(entry), - status=get_http_status(entry), - html=_generate_test_html() if _get_success(entry) else None, - content_type=_get_content_type(entry), - exception=None if _get_success(entry) else "Error" - ) - d[entry.url_info.url] = response_info - return d diff --git a/tests/automated/integration/tasks/url/impl/html/setup/__init__.py b/tests/automated/integration/tasks/url/impl/html/setup/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py deleted file mode 100644 index a3a43f8b..00000000 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ /dev/null @@ -1,94 +0,0 @@ -from http import HTTPStatus - -from src.collectors.enums import URLStatus -from src.db.models.impl.url.scrape_info.enums import ScrapeStatus -from tests.automated.integration.tasks.url.impl.html.setup.models.entry import TestURLHTMLTaskSetupEntry, TestURLInfo, \ - TestWebMetadataInfo, ExpectedResult, TestErrorType - -TEST_ENTRIES = [ - # URLs that give 200s should be updated with the appropriate scrape status - # and their html should be stored - TestURLHTMLTaskSetupEntry( - url_info=TestURLInfo( - url="happy-path.com/pending", - status=URLStatus.OK - ), - web_metadata_info=TestWebMetadataInfo( - accessed=True, - content_type="text/html", - response_code=HTTPStatus.OK, - error_message=None - ), - expected_result=ExpectedResult( - has_html=True, # Test for both compressed HTML and content metadata - scrape_status=ScrapeStatus.SUCCESS - ) - ), - # URLs that give 404s should be updated with the appropriate scrape status - # and their web metadata status should be updated to 404 - TestURLHTMLTaskSetupEntry( - url_info=TestURLInfo( - url="not-found-path.com/submitted", - 
status=URLStatus.OK - ), - web_metadata_info=TestWebMetadataInfo( - accessed=True, - content_type="text/html", - response_code=HTTPStatus.OK, - error_message=None - ), - give_error=TestErrorType.HTTP_404, - expected_result=ExpectedResult( - has_html=False, - scrape_status=ScrapeStatus.ERROR, - web_metadata_status_marked_404=True - ) - ), - # URLs that give errors should be updated with the appropriate scrape status - TestURLHTMLTaskSetupEntry( - url_info=TestURLInfo( - url="error-path.com/submitted", - status=URLStatus.OK - ), - web_metadata_info=TestWebMetadataInfo( - accessed=True, - content_type="text/html", - response_code=HTTPStatus.OK, - error_message=None - ), - give_error=TestErrorType.SCRAPER, - expected_result=ExpectedResult( - has_html=False, - scrape_status=ScrapeStatus.ERROR - ) - ), - # URLs with non-200 web metadata should not be processed - TestURLHTMLTaskSetupEntry( - url_info=TestURLInfo( - url="not-200-path.com/submitted", - status=URLStatus.OK - ), - web_metadata_info=TestWebMetadataInfo( - accessed=True, - content_type="text/html", - response_code=HTTPStatus.PERMANENT_REDIRECT, - error_message=None - ), - expected_result=ExpectedResult( - has_html=False, - scrape_status=None - ) - ), - # URLs with no web metadata should not be processed - TestURLHTMLTaskSetupEntry( - url_info=TestURLInfo( - url="no-web-metadata.com/submitted", - status=URLStatus.OK - ), - web_metadata_info=None, - expected_result=ExpectedResult( - has_html=False, - scrape_status=None - ) - ) -] \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/html/setup/manager.py b/tests/automated/integration/tasks/url/impl/html/setup/manager.py deleted file mode 100644 index e01f7b6d..00000000 --- a/tests/automated/integration/tasks/url/impl/html/setup/manager.py +++ /dev/null @@ -1,79 +0,0 @@ -import types - -from src.core.enums import RecordType -from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator -from 
src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.pydantic.insert import URLInsertModel -from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic -from tests.automated.integration.tasks.url.impl.html.mocks.methods import mock_parse -from tests.automated.integration.tasks.url.impl.html.mocks.url_request_interface.core import MockURLRequestInterface -from tests.automated.integration.tasks.url.impl.html.setup.data import TEST_ENTRIES -from tests.automated.integration.tasks.url.impl.html.setup.models.record import TestURLHTMLTaskSetupRecord - - -class TestURLHTMLTaskSetupManager: - - def __init__(self, adb_client: AsyncDatabaseClient): - self.adb_client = adb_client - - - async def setup(self) -> list[TestURLHTMLTaskSetupRecord]: - - records = await self._setup_urls() - await self.setup_web_metadata(records) - return records - - async def _setup_urls(self) -> list[TestURLHTMLTaskSetupRecord]: - url_insert_models: list[URLInsertModel] = [] - for entry in TEST_ENTRIES: - url_insert_model = URLInsertModel( - status=entry.url_info.status, - url=entry.url_info.url, - name=f"Test for {entry.url_info.url}", - record_type=RecordType.RESOURCES, - source=URLSource.COLLECTOR, - trailing_slash=False - ) - url_insert_models.append(url_insert_model) - url_ids = await self.adb_client.bulk_insert(url_insert_models, return_ids=True) - - records = [] - for url_id, entry in zip(url_ids, TEST_ENTRIES): - record = TestURLHTMLTaskSetupRecord( - url_id=url_id, - entry=entry - ) - records.append(record) - return records - - async def setup_web_metadata( - self, - records: list[TestURLHTMLTaskSetupRecord] - ) -> None: - models = [] - for record in records: - entry = record.entry - web_metadata_info = entry.web_metadata_info - if web_metadata_info is None: - continue - model = URLWebMetadataPydantic( - 
url_id=record.url_id, - accessed=web_metadata_info.accessed, - status_code=web_metadata_info.response_code.value, - content_type=web_metadata_info.content_type, - error_message=web_metadata_info.error_message - ) - models.append(model) - await self.adb_client.bulk_insert(models) - -async def setup_operator() -> URLHTMLTaskOperator: - html_parser = HTMLResponseParser() - html_parser.parse = types.MethodType(mock_parse, html_parser) - operator = URLHTMLTaskOperator( - adb_client=AsyncDatabaseClient(), - url_request_interface=MockURLRequestInterface(), - html_parser=html_parser - ) - return operator diff --git a/tests/automated/integration/tasks/url/impl/html/setup/models/__init__.py b/tests/automated/integration/tasks/url/impl/html/setup/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/url/impl/html/setup/models/entry.py b/tests/automated/integration/tasks/url/impl/html/setup/models/entry.py deleted file mode 100644 index 287bb52c..00000000 --- a/tests/automated/integration/tasks/url/impl/html/setup/models/entry.py +++ /dev/null @@ -1,34 +0,0 @@ -from enum import Enum -from http import HTTPStatus - -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from src.db.models.impl.url.scrape_info.enums import ScrapeStatus - - -class TestErrorType(Enum): - SCRAPER = "scraper" - HTTP_404 = "http-404" - - -class TestWebMetadataInfo(BaseModel): - accessed: bool - content_type: str | None - response_code: HTTPStatus - error_message: str | None - -class TestURLInfo(BaseModel): - url: str - status: URLStatus - -class ExpectedResult(BaseModel): - has_html: bool - scrape_status: ScrapeStatus | None # Does not have scrape info if none - web_metadata_status_marked_404: bool = False - -class TestURLHTMLTaskSetupEntry(BaseModel): - url_info: TestURLInfo - web_metadata_info: TestWebMetadataInfo | None - give_error: TestErrorType | None = None - expected_result: ExpectedResult \ No newline at end 
of file diff --git a/tests/automated/integration/tasks/url/impl/html/setup/models/record.py b/tests/automated/integration/tasks/url/impl/html/setup/models/record.py deleted file mode 100644 index 022c9639..00000000 --- a/tests/automated/integration/tasks/url/impl/html/setup/models/record.py +++ /dev/null @@ -1,8 +0,0 @@ -from pydantic import BaseModel - -from tests.automated.integration.tasks.url.impl.html.setup.models.entry import TestURLHTMLTaskSetupEntry - - -class TestURLHTMLTaskSetupRecord(BaseModel): - url_id: int - entry: TestURLHTMLTaskSetupEntry \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/html/test_200.py b/tests/automated/integration/tasks/url/impl/html/test_200.py new file mode 100644 index 00000000..cdfbd2fe --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/html/test_200.py @@ -0,0 +1,80 @@ +from http import HTTPStatus + +import pytest + +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.scrape_info.enums import ScrapeStatus +from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.automated.integration.tasks.url.impl.asserts import assert_prereqs_met, assert_task_ran_without_error +from tests.helpers.data_creator.core import DBDataCreator + +MOCK_HTML_CONTENT = """ + + + +This is an example of HTML content.
+ + +""" + +class _MockURLRequestInterface: + + async def make_requests_with_html(self, urls: list[str]) -> list[URLResponseInfo]: + assert len(urls) == 1 + return [ + URLResponseInfo( + success=True, + status=HTTPStatus.OK, + exception=None, + html=MOCK_HTML_CONTENT, + content_type="text/html" + ) + ] + + +@pytest.mark.asyncio +async def test_200( + adb_client_test: AsyncDatabaseClient, + db_data_creator: DBDataCreator, + operator: URLHTMLTaskOperator, + test_url_id: int +): + """ + URLs that give 200s should be updated with the appropriate scrape status + and their html should be stored + """ + + await db_data_creator.create_web_metadata( + url_ids=[test_url_id], + status_code=200 + ) + + # Adjust Mock Request Interface to return a 200 with HTML content + operator.url_request_interface = _MockURLRequestInterface() + + await assert_prereqs_met(operator) + + run_info = await operator.run_task() + assert_task_ran_without_error(run_info) + + # Check for the presence of Compressed HTML Data + results: list[URLCompressedHTML] = await adb_client_test.get_all(URLCompressedHTML) + assert len(results) == 1 + assert results[0].url_id == test_url_id + assert results[0].compressed_html is not None + + # Web Metadata should be unchanged + web_metadata: URLWebMetadata = (await adb_client_test.get_all(URLWebMetadata))[0] + assert web_metadata.status_code == 200 + + # Check that URLScrapeInfo is updated + scrape_info: URLScrapeInfo = (await adb_client_test.get_all(URLScrapeInfo))[0] + assert scrape_info.status == ScrapeStatus.SUCCESS \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/html/test_404.py b/tests/automated/integration/tasks/url/impl/html/test_404.py new file mode 100644 index 00000000..51589277 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/html/test_404.py @@ -0,0 +1,66 @@ +from http import HTTPStatus + +import pytest + +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.db.client.async_ import
AsyncDatabaseClient +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.scrape_info.enums import ScrapeStatus +from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.automated.integration.tasks.url.impl.asserts import assert_prereqs_met, assert_task_ran_without_error +from tests.helpers.data_creator.core import DBDataCreator + + +class _MockURLRequestInterface: + + async def make_requests_with_html(self, urls: list[str]) -> list[URLResponseInfo]: + assert len(urls) == 1 + return [ + URLResponseInfo( + success=False, + status=HTTPStatus.NOT_FOUND, + exception="Not Found" + ) + ] + + + +@pytest.mark.asyncio +async def test_404( + adb_client_test: AsyncDatabaseClient, + db_data_creator: DBDataCreator, + operator: URLHTMLTaskOperator, + test_url_id: int +): + """ + URLs that give 404s should be updated with the appropriate scrape status + and their web metadata status should be updated to 404 + """ + await db_data_creator.create_web_metadata( + url_ids=[test_url_id], + status_code=200 + ) + + + # Adjust Mock Request Interface to return a 404 + operator.url_request_interface = _MockURLRequestInterface() + + await assert_prereqs_met(operator) + + run_info = await operator.run_task() + assert_task_ran_without_error(run_info) + + + # Check for the absence of Compressed HTML Data + results: list[URLCompressedHTML] = await adb_client_test.get_all(URLCompressedHTML) + assert len(results) == 0 + + # Web Metadata status code should be updated from 200 to 404 + web_metadata: URLWebMetadata = (await adb_client_test.get_all(URLWebMetadata))[0] + assert web_metadata.status_code == 404 + + # Check that URLScrapeInfo is updated + scrape_info: URLScrapeInfo = (await adb_client_test.get_all(URLScrapeInfo))[0] + assert scrape_info.status == ScrapeStatus.ERROR \ No newline at end of
file diff --git a/tests/automated/integration/tasks/url/impl/html/test_error.py b/tests/automated/integration/tasks/url/impl/html/test_error.py new file mode 100644 index 00000000..1290460f --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/html/test_error.py @@ -0,0 +1,63 @@ +from http import HTTPStatus + +import pytest + +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.scrape_info.enums import ScrapeStatus +from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.automated.integration.tasks.url.impl.asserts import assert_prereqs_met, assert_task_ran_without_error +from tests.helpers.data_creator.core import DBDataCreator + + +class _MockURLRequestInterface: + + async def make_requests_with_html(self, urls: list[str]) -> list[URLResponseInfo]: + assert len(urls) == 1 + return [ + URLResponseInfo( + success=False, + status=HTTPStatus.INTERNAL_SERVER_ERROR, + exception="Mock Exception" + ) + ] + +@pytest.mark.asyncio +async def test_error( + adb_client_test: AsyncDatabaseClient, + db_data_creator: DBDataCreator, + operator: URLHTMLTaskOperator, + test_url_id: int +): + """ + URLs that give errors should be updated with the appropriate scrape status + """ + await db_data_creator.create_web_metadata( + url_ids=[test_url_id], + status_code=200 + ) + + + # Adjust Mock Request Interface to return a 500 Internal Server Error + operator.url_request_interface = _MockURLRequestInterface() + + await assert_prereqs_met(operator) + + run_info = await operator.run_task() + assert_task_ran_without_error(run_info) + + + # Check for the absence of Compressed HTML Data + results: list[URLCompressedHTML] = await
adb_client_test.get_all(URLCompressedHTML) + assert len(results) == 0 + + # Web Metadata should be unchanged + web_metadata: URLWebMetadata = (await adb_client_test.get_all(URLWebMetadata))[0] + assert web_metadata.status_code == 200 + + # Check that URLScrapeInfo is updated + scrape_info: URLScrapeInfo = (await adb_client_test.get_all(URLScrapeInfo))[0] + assert scrape_info.status == ScrapeStatus.ERROR \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/html/test_no_web_metadata.py b/tests/automated/integration/tasks/url/impl/html/test_no_web_metadata.py new file mode 100644 index 00000000..06442164 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/html/test_no_web_metadata.py @@ -0,0 +1,26 @@ +import pytest + +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from tests.automated.integration.tasks.url.impl.asserts import assert_prereqs_not_met, assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_no_web_metadata( + adb_client_test: AsyncDatabaseClient, + operator: URLHTMLTaskOperator, + test_url_id: int +): + """ + URLs with no web metadata should not be processed + """ + await assert_prereqs_not_met(operator) + + run_info = await operator.run_task() + assert_task_ran_without_error(run_info) + + # Check for the absence of Compressed HTML Data + results: list[URLCompressedHTML] = await adb_client_test.get_all(URLCompressedHTML) + assert len(results) == 0 + diff --git a/tests/automated/integration/tasks/url/impl/html/test_non_200.py b/tests/automated/integration/tasks/url/impl/html/test_non_200.py new file mode 100644 index 00000000..0b80ba86 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/html/test_non_200.py @@ -0,0 +1,32 @@ +import pytest + +from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator +from 
src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from tests.automated.integration.tasks.url.impl.asserts import assert_prereqs_not_met, assert_task_ran_without_error +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_non_200( + adb_client_test: AsyncDatabaseClient, + db_data_creator: DBDataCreator, + operator: URLHTMLTaskOperator, + test_url_id: int +): + """ + URLs with non-200 web metadata should not be processed + """ + await db_data_creator.create_web_metadata( + url_ids=[test_url_id], + status_code=500 + ) + + await assert_prereqs_not_met(operator) + + run_info = await operator.run_task() + assert_task_ran_without_error(run_info) + + # Check for the absence of Compressed HTML Data + results: list[URLCompressedHTML] = await adb_client_test.get_all(URLCompressedHTML) + assert len(results) == 0 \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/html/test_task.py b/tests/automated/integration/tasks/url/impl/html/test_task.py deleted file mode 100644 index e7462e65..00000000 --- a/tests/automated/integration/tasks/url/impl/html/test_task.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import TaskType -from tests.automated.integration.tasks.url.impl.asserts import assert_prereqs_not_met, assert_prereqs_met, \ - assert_task_ran_without_error -from tests.automated.integration.tasks.url.impl.html.check.manager import TestURLHTMLTaskCheckManager -from tests.automated.integration.tasks.url.impl.html.setup.manager import setup_operator, \ - TestURLHTMLTaskSetupManager - - -@pytest.mark.asyncio -async def test_url_html_task(adb_client_test: AsyncDatabaseClient): - setup = TestURLHTMLTaskSetupManager(adb_client_test) - - operator = await setup_operator() - - # No URLs were created, the prereqs should not be met - await 
assert_prereqs_not_met(operator) - - records = await setup.setup() - await assert_prereqs_met(operator) - - run_info = await operator.run_task() - assert_task_ran_without_error(run_info) - - checker = TestURLHTMLTaskCheckManager( - adb_client=adb_client_test, - records=records - ) - await checker.check() - - await assert_prereqs_not_met(operator) diff --git a/tests/automated/integration/tasks/url/impl/probe/check/manager.py b/tests/automated/integration/tasks/url/impl/probe/check/manager.py index 200f428a..10505920 100644 --- a/tests/automated/integration/tasks/url/impl/probe/check/manager.py +++ b/tests/automated/integration/tasks/url/impl/probe/check/manager.py @@ -1,10 +1,10 @@ from sqlalchemy import select -from src.collectors.enums import URLStatus from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL -from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.models.materialized_views.url_status.enums import URLStatusViewEnum +from src.db.models.materialized_views.url_status.sqlalchemy import URLStatusMaterializedView class TestURLProbeCheckManager: @@ -18,13 +18,13 @@ def __init__( async def check_url( self, url_id: int, - expected_status: URLStatus + expected_status: URLStatusViewEnum ): - url: URL = await self.adb_client.one_or_none( - statement=select(URL).where(URL.id == url_id) + url: URLStatusMaterializedView = await self.adb_client.one_or_none( + statement=select(URLStatusMaterializedView).where(URLStatusMaterializedView.url_id == url_id) ) assert url is not None - assert url.status == expected_status + assert url.status == expected_status.value async def check_web_metadata( self, diff --git a/tests/automated/integration/tasks/url/impl/probe/models/__init__.py b/tests/automated/integration/tasks/url/impl/probe/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git 
a/tests/automated/integration/tasks/url/impl/probe/models/entry.py b/tests/automated/integration/tasks/url/impl/probe/models/entry.py deleted file mode 100644 index 810f40ea..00000000 --- a/tests/automated/integration/tasks/url/impl/probe/models/entry.py +++ /dev/null @@ -1,10 +0,0 @@ -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper - - -class TestURLProbeTaskEntry(BaseModel): - url: str - url_status: URLStatus - planned_response: URLProbeResponseOuterWrapper \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py index 85dd71f5..787d0d33 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py @@ -1,8 +1,6 @@ import pytest -from src.collectors.enums import URLStatus from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager @@ -32,19 +30,15 @@ async def test_url_probe_task_error( ) ) assert not await operator.meets_task_prerequisites() - url_id: int = await setup_manager.setup_url(URLStatus.OK) + url_id: int = await setup_manager.setup_url() await db_data_creator.create_validated_flags([url_id], validation_type=URLType.DATA_SOURCE) await db_data_creator.create_url_data_sources([url_id]) assert await operator.meets_task_prerequisites() + run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await 
operator.meets_task_prerequisites() - await check_manager.check_url( - url_id=url_id, - expected_status=URLStatus.OK - ) - await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py index 31216e23..866e7533 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py @@ -1,6 +1,5 @@ import pytest -from src.collectors.enums import URLStatus from src.db.models.impl.flag.url_validated.enums import URLType from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager @@ -32,16 +31,13 @@ async def test_url_probe_task_not_found( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.OK) + url_id = await setup_manager.setup_url() await db_data_creator.create_validated_flags([url_id], validation_type=URLType.NOT_RELEVANT) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() - await check_manager.check_url( - url_id=url_id, - expected_status=URLStatus.OK - ) + await check_manager.check_web_metadata( url_id=url_id, status_code=404, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py index ecaec084..dca1349e 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py @@ -1,6 +1,5 @@ import pytest -from src.collectors.enums import URLStatus from 
tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager @@ -28,15 +27,12 @@ async def test_url_probe_task_no_redirect_ok( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.OK) + url_id = await setup_manager.setup_url() assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() - await check_manager.check_url( - url_id=url_id, - expected_status=URLStatus.OK - ) + await check_manager.check_web_metadata( url_id=url_id, status_code=200, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py index c3b0c6c4..d628ea53 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py @@ -1,6 +1,5 @@ import pytest -from src.collectors.enums import URLStatus from src.db.models.impl.url.core.sqlalchemy import URL from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager @@ -31,8 +30,8 @@ async def test_two_urls( ] ) assert not await operator.meets_task_prerequisites() - url_id_1 = await setup_manager.setup_url(URLStatus.OK, url=url_1) - url_id_2 = await setup_manager.setup_url(URLStatus.OK, url=url_2) + url_id_1 = await setup_manager.setup_url(url_1) + url_id_2 = await setup_manager.setup_url(url_2) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() 
assert_task_ran_without_error(run_info) diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py b/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py index df695021..f7f9cb6e 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py @@ -1,6 +1,5 @@ import pytest -from src.collectors.enums import URLStatus from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager @@ -28,13 +27,10 @@ async def test_url_probe_task_redirect_dest_new_ok( dest_error=None ) ) - source_url_id = await setup_manager.setup_url(URLStatus.OK) + source_url_id = await setup_manager.setup_url() run_info = await operator.run_task() assert_task_ran_without_error(run_info) - await check_manager.check_url( - url_id=source_url_id, - expected_status=URLStatus.OK - ) + await check_manager.check_web_metadata( url_id=source_url_id, status_code=301, @@ -43,10 +39,7 @@ async def test_url_probe_task_redirect_dest_new_ok( accessed=True ) dest_url_id = await check_manager.check_redirect(source_url_id) - await check_manager.check_url( - url_id=dest_url_id, - expected_status=URLStatus.OK - ) + await check_manager.check_web_metadata( url_id=dest_url_id, status_code=200, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py index 7aeeb1f8..92729102 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py @@ -1,6 +1,5 @@ import pytest 
-from src.collectors.enums import URLStatus from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager @@ -29,8 +28,8 @@ async def test_url_probe_task_redirect_dest_exists_in_db( dest_error=None ) ) - source_url_id = await setup_manager.setup_url(URLStatus.OK) - dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL.replace("https://", "")) + source_url_id = await setup_manager.setup_url() + dest_url_id = await setup_manager.setup_url(TEST_DEST_URL.replace("https://", "")) # Add web metadata for destination URL, to prevent it from being pulled web_metadata = URLWebMetadataPydantic( url_id=dest_url_id, @@ -42,14 +41,6 @@ async def test_url_probe_task_redirect_dest_exists_in_db( await setup_manager.adb_client.bulk_insert([web_metadata]) run_info = await operator.run_task() assert_task_ran_without_error(run_info) - await check_manager.check_url( - url_id=source_url_id, - expected_status=URLStatus.OK - ) - await check_manager.check_url( - url_id=dest_url_id, - expected_status=URLStatus.OK - ) await check_manager.check_web_metadata( url_id=source_url_id, status_code=302, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py index a8cb51f7..cbf59b20 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py @@ -1,6 +1,5 @@ import pytest -from src.collectors.enums import URLStatus from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.util.models.full_url import FullURL @@ -31,7 +30,7 @@ async 
def test_url_probe_task_functional_equivalent( redirect_url=FullURL(TEST_URL + "/") ) ) - url_id = await setup_manager.setup_url(URLStatus.OK) + url_id = await setup_manager.setup_url() await run_task_and_confirm_success(operator) urls: list[URL] = await setup_manager.adb_client.get_all(URL) diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py index 1dcd98d9..e8216f17 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py @@ -1,6 +1,5 @@ import pytest -from src.collectors.enums import URLStatus from src.util.models.full_url import FullURL from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager @@ -35,18 +34,11 @@ async def test_url_probe_task_redirect_two_urls_same_dest( ), ] ) - source_url_id_1 = await setup_manager.setup_url(URLStatus.OK) - source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="example.com/2") + source_url_id_1 = await setup_manager.setup_url() + source_url_id_2 = await setup_manager.setup_url("example.com/2") run_info = await operator.run_task() assert_task_ran_without_error(run_info) - await check_manager.check_url( - url_id=source_url_id_1, - expected_status=URLStatus.OK - ) - await check_manager.check_url( - url_id=source_url_id_2, - expected_status=URLStatus.OK - ) + redirect_url_id_1 = await check_manager.check_redirect( source_url_id=source_url_id_1 ) diff --git a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py index 44b5bd54..bf65e9f6 100644 --- a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py +++ 
b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py @@ -1,6 +1,5 @@ from typing import cast, Literal -from src.collectors.enums import URLStatus from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.core.pydantic.insert import URLInsertModel @@ -23,12 +22,10 @@ def __init__( async def setup_url( self, - url_status: URLStatus, url: str = TEST_URL ) -> int: url_insert_model = URLInsertModel( url=url, - status=url_status, source=TEST_SOURCE, trailing_slash=False ) diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py index 8a40a476..384966a8 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py @@ -1,7 +1,6 @@ import pytest from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator -from src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL from src.db.models.impl.url.core.enums import URLSource diff --git a/tests/automated/integration/tasks/url/impl/test_example_task.py b/tests/automated/integration/tasks/url/impl/test_example_task.py index 00ec7c34..c54425f7 100644 --- a/tests/automated/integration/tasks/url/impl/test_example_task.py +++ b/tests/automated/integration/tasks/url/impl/test_example_task.py @@ -2,12 +2,13 @@ import pytest -from src.db.enums import TaskType from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.db.enums import TaskType from 
src.db.models.impl.link.task_url import LinkTaskURL from tests.helpers.data_creator.core import DBDataCreator + class ExampleTaskOperator( URLTaskOperatorBase, ): diff --git a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py index bc3f240d..8a907fdc 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py @@ -2,11 +2,11 @@ import pytest -from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator -from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata -from src.db.models.impl.url.core.sqlalchemy import URL from src.collectors.enums import CollectorType from src.core.tasks.url.enums import TaskOperatorOutcome +from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from tests.helpers.data_creator.core import DBDataCreator diff --git a/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py b/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py index d9f1de4f..84471a70 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py @@ -2,14 +2,15 @@ import pytest -from src.db.enums import TaskType -from src.db.models.impl.annotation.record_type.auto.sqlalchemy import AnnotationAutoRecordType +from src.core.enums import RecordType from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator -from src.core.enums import RecordType +from 
src.core.tasks.url.operators.record_type.llm_api.record_classifier.deepseek import DeepSeekRecordClassifier +from src.db.enums import TaskType +from src.db.models.impl.annotation.record_type.auto.sqlalchemy import AnnotationAutoRecordType from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from tests.helpers.data_creator.core import DBDataCreator -from src.core.tasks.url.operators.record_type.llm_api.record_classifier.deepseek import DeepSeekRecordClassifier + @pytest.mark.asyncio async def test_url_record_type_task(db_data_creator: DBDataCreator): diff --git a/tests/automated/integration/tasks/url/impl/validate/test_data_source.py b/tests/automated/integration/tasks/url/impl/validate/test_data_source.py index 95d636c2..b17f726e 100644 --- a/tests/automated/integration/tasks/url/impl/validate/test_data_source.py +++ b/tests/automated/integration/tasks/url/impl/validate/test_data_source.py @@ -14,10 +14,10 @@ from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator from src.db.models.impl.annotation.agency.anon.sqlalchemy import AnnotationAgencyAnon from src.db.models.impl.annotation.location.anon.sqlalchemy import AnnotationLocationAnon +from src.db.models.impl.annotation.name.anon.sqlalchemy import AnnotationNameAnonEndorsement from src.db.models.impl.annotation.record_type.anon.sqlalchemy import AnnotationRecordTypeAnon from src.db.models.impl.annotation.url_type.anon.sqlalchemy import AnnotationURLTypeAnon from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.annotation.name.anon.sqlalchemy import AnnotationNameAnonEndorsement from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper, DEFAULT_RECORD_TYPE from tests.helpers.run import run_task_and_confirm_success diff --git a/tests/automated/unit/core/test_core_logger.py b/tests/automated/unit/core/test_core_logger.py index 6c4f0375..01dae052 100644 --- a/tests/automated/unit/core/test_core_logger.py 
+++ b/tests/automated/unit/core/test_core_logger.py @@ -3,8 +3,8 @@ import pytest -from src.db.models.impl.log.pydantic.info import LogInfo from src.core.logger import AsyncCoreLogger +from src.db.models.impl.log.pydantic.info import LogInfo @pytest.mark.asyncio diff --git a/tests/automated/unit/security_manager/test_security_manager.py b/tests/automated/unit/security_manager/test_security_manager.py index 66399d7f..ae58ed6e 100644 --- a/tests/automated/unit/security_manager/test_security_manager.py +++ b/tests/automated/unit/security_manager/test_security_manager.py @@ -4,9 +4,9 @@ from fastapi import HTTPException from jwt import InvalidTokenError -from src.security.manager import SecurityManager, get_access_info from src.security.dtos.access_info import AccessInfo from src.security.enums import Permissions +from src.security.manager import SecurityManager, get_access_info SECRET_KEY = "test_secret_key" VALID_TOKEN = "valid_token" diff --git a/tests/automated/unit/source_collectors/test_autogoogler_collector.py b/tests/automated/unit/source_collectors/test_autogoogler_collector.py index cc191dc3..e4e617a1 100644 --- a/tests/automated/unit/source_collectors/test_autogoogler_collector.py +++ b/tests/automated/unit/source_collectors/test_autogoogler_collector.py @@ -2,11 +2,11 @@ import pytest -from src.collectors.impl.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO +from src.collectors.impl.auto_googler.collector import AutoGooglerCollector from src.collectors.impl.auto_googler.dtos.input import AutoGooglerInputDTO -from src.db.client.async_ import AsyncDatabaseClient +from src.collectors.impl.auto_googler.dtos.query_results import GoogleSearchQueryResultsInnerDTO from src.core.logger import AsyncCoreLogger -from src.collectors.impl.auto_googler.collector import AutoGooglerCollector +from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info 
import URLInfo diff --git a/tests/automated/unit/source_collectors/test_common_crawl_collector.py b/tests/automated/unit/source_collectors/test_common_crawl_collector.py index 0a10680f..c76bad38 100644 --- a/tests/automated/unit/source_collectors/test_common_crawl_collector.py +++ b/tests/automated/unit/source_collectors/test_common_crawl_collector.py @@ -2,10 +2,10 @@ import pytest +from src.collectors.impl.common_crawler.collector import CommonCrawlerCollector from src.collectors.impl.common_crawler.input import CommonCrawlerInputDTO -from src.db.client.async_ import AsyncDatabaseClient from src.core.logger import AsyncCoreLogger -from src.collectors.impl.common_crawler.collector import CommonCrawlerCollector +from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info import URLInfo diff --git a/tests/automated/unit/source_collectors/test_example_collector.py b/tests/automated/unit/source_collectors/test_example_collector.py index 632a6293..c99217b0 100644 --- a/tests/automated/unit/source_collectors/test_example_collector.py +++ b/tests/automated/unit/source_collectors/test_example_collector.py @@ -1,9 +1,9 @@ from unittest.mock import AsyncMock -from src.db.client.sync import DatabaseClient -from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.impl.example.core import ExampleCollector +from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.core.logger import AsyncCoreLogger +from src.db.client.sync import DatabaseClient def test_example_collector(): diff --git a/tests/automated/unit/source_collectors/test_muckrock_collectors.py b/tests/automated/unit/source_collectors/test_muckrock_collectors.py index 6c845b8e..009e550a 100644 --- a/tests/automated/unit/source_collectors/test_muckrock_collectors.py +++ b/tests/automated/unit/source_collectors/test_muckrock_collectors.py @@ -4,12 +4,12 @@ import pytest from 
src.collectors.impl.muckrock.collectors.county.core import MuckrockCountyLevelSearchCollector -from src.collectors.impl.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector -from src.db.client.async_ import AsyncDatabaseClient -from src.core.logger import AsyncCoreLogger from src.collectors.impl.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO +from src.collectors.impl.muckrock.collectors.simple.core import MuckrockSimpleSearchCollector from src.collectors.impl.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO from src.collectors.impl.muckrock.fetch_requests.foia import FOIAFetchRequest +from src.core.logger import AsyncCoreLogger +from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info import URLInfo diff --git a/tests/helpers/batch_creation_parameters/core.py b/tests/helpers/batch_creation_parameters/core.py index 4562cbdf..3719dae0 100644 --- a/tests/helpers/batch_creation_parameters/core.py +++ b/tests/helpers/batch_creation_parameters/core.py @@ -1,5 +1,4 @@ import datetime -from typing import Optional from pydantic import BaseModel, model_validator diff --git a/tests/helpers/data_creator/commands/impl/html_data.py b/tests/helpers/data_creator/commands/impl/html_data.py index 38ecb4bd..dbfe39f1 100644 --- a/tests/helpers/data_creator/commands/impl/html_data.py +++ b/tests/helpers/data_creator/commands/impl/html_data.py @@ -1,11 +1,9 @@ -from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.dtos.url.raw_html import RawHTMLInfo +from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.scrape_info.enums import ScrapeStatus from src.db.models.impl.url.scrape_info.pydantic import 
URLScrapeInfoInsertModel from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase -from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer class HTMLDataCreatorCommand(DBDataCreatorCommandBase): diff --git a/tests/helpers/data_creator/commands/impl/suggestion/agency_confirmed.py b/tests/helpers/data_creator/commands/impl/suggestion/agency_confirmed.py index e096d15e..0a293e71 100644 --- a/tests/helpers/data_creator/commands/impl/suggestion/agency_confirmed.py +++ b/tests/helpers/data_creator/commands/impl/suggestion/agency_confirmed.py @@ -7,6 +7,7 @@ from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.agency import AgencyCommand + @final class AgencyConfirmedSuggestionCommand(DBDataCreatorCommandBase): diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py index ab29a817..e714714d 100644 --- a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py +++ b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py @@ -10,6 +10,7 @@ from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.agency import AgencyCommand + @final class AgencyAutoSuggestionsCommand(DBDataCreatorCommandBase): diff --git a/tests/helpers/data_creator/commands/impl/urls_/convert.py b/tests/helpers/data_creator/commands/impl/urls_/convert.py index c1e2db31..a7d2bdd1 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/convert.py +++ b/tests/helpers/data_creator/commands/impl/urls_/convert.py @@ -1,25 +1,6 @@ -from src.collectors.enums import URLStatus from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.batch_creation_parameters.enums import URLCreationEnum - -def convert_url_creation_enum_to_url_status(url_creation_enum: URLCreationEnum) 
-> URLStatus: - match url_creation_enum: - case URLCreationEnum.OK: - return URLStatus.OK - case URLCreationEnum.SUBMITTED: - return URLStatus.OK - case URLCreationEnum.VALIDATED: - return URLStatus.OK - case URLCreationEnum.NOT_RELEVANT: - return URLStatus.OK - case URLCreationEnum.ERROR: - raise ValueError("Invalid URL Status") - case URLCreationEnum.DUPLICATE: - return URLStatus.DUPLICATE - case _: - raise ValueError(f"Unknown URLCreationEnum: {url_creation_enum}") - def convert_url_creation_enum_to_validated_type( url_creation_enum: URLCreationEnum ) -> URLType: diff --git a/tests/helpers/data_creator/commands/impl/urls_/query.py b/tests/helpers/data_creator/commands/impl/urls_/query.py index c4fddad4..fd40834d 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/query.py +++ b/tests/helpers/data_creator/commands/impl/urls_/query.py @@ -1,13 +1,12 @@ from datetime import datetime -from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource -from tests.helpers.data_creator.commands.impl.urls_.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info import URLInfo +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase -from tests.helpers.data_creator.commands.impl.urls_.convert import convert_url_creation_enum_to_url_status +from tests.helpers.data_creator.commands.impl.urls_.tdo import SubmittedURLInfo from tests.helpers.simple_test_data_functions import generate_test_urls @@ -40,7 +39,6 @@ def run_sync(self) -> InsertURLsInfo: url_infos.append( URLInfo( url=url, - status=convert_url_creation_enum_to_url_status(self.status), name="Test Name" if self.status in ( URLCreationEnum.VALIDATED, URLCreationEnum.SUBMITTED, diff --git 
a/tests/helpers/data_creator/commands/impl/urls_/tdo.py b/tests/helpers/data_creator/commands/impl/urls_/tdo.py index a8991dcd..fdb5a1cc 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/tdo.py +++ b/tests/helpers/data_creator/commands/impl/urls_/tdo.py @@ -2,8 +2,6 @@ from pydantic import BaseModel -from src.core.enums import RecordType - class SubmittedURLInfo(BaseModel): url_id: int diff --git a/tests/helpers/data_creator/commands/impl/urls_v2/core.py b/tests/helpers/data_creator/commands/impl/urls_v2/core.py index f7042720..20edd618 100644 --- a/tests/helpers/data_creator/commands/impl/urls_v2/core.py +++ b/tests/helpers/data_creator/commands/impl/urls_v2/core.py @@ -1,16 +1,13 @@ from datetime import datetime from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.annotate import AnnotateCommand from tests.helpers.data_creator.commands.impl.html_data import HTMLDataCreatorCommand -from tests.helpers.data_creator.commands.impl.urls_.convert import convert_url_creation_enum_to_validated_type from tests.helpers.data_creator.commands.impl.urls_.query import URLsDBDataCreatorCommand from tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response -from tests.helpers.data_creator.generate import generate_validated_flags from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo diff --git a/tests/helpers/data_creator/commands/impl/urls_v2/response.py b/tests/helpers/data_creator/commands/impl/urls_v2/response.py index 74aa8e20..935785e2 100644 --- a/tests/helpers/data_creator/commands/impl/urls_v2/response.py +++ 
b/tests/helpers/data_creator/commands/impl/urls_v2/response.py @@ -1,6 +1,5 @@ from pydantic import BaseModel -from src.collectors.enums import URLStatus from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index d3f6c924..c1e27ae3 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -3,7 +3,7 @@ from typing import Optional, Any from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.collectors.enums import CollectorType, URLStatus +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus, SuggestionType, RecordType from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO @@ -20,13 +20,13 @@ from src.db.models.impl.annotation.location.user.sqlalchemy import AnnotationLocationUser from src.db.models.impl.annotation.name.suggestion.enums import NameSuggestionSource from src.db.models.impl.annotation.name.suggestion.sqlalchemy import AnnotationNameSuggestion +from src.db.models.impl.annotation.name.user.sqlalchemy import AnnotationNameUserEndorsement from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL -from src.db.models.impl.annotation.name.user.sqlalchemy import AnnotationNameUserEndorsement from src.db.models.impl.link.user_suggestion_not_found.agency.sqlalchemy import 
LinkUserSuggestionAgencyNotFound from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound from src.db.models.impl.url.core.enums import URLSource @@ -439,7 +439,6 @@ async def create_submitted_urls( async def create_urls( self, - status: URLStatus = URLStatus.OK, source: URLSource = URLSource.COLLECTOR, record_type: RecordType | None = RecordType.RESOURCES, collector_metadata: dict | None = None, @@ -449,7 +448,6 @@ async def create_urls( url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=self.adb_client, - status=status, source=source, record_type=record_type, collector_metadata=collector_metadata, diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index 57c9f9da..1c2073fd 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -1,6 +1,6 @@ from datetime import datetime -from src.collectors.enums import CollectorType, URLStatus +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus, RecordType from src.db import County, Locality, USState from src.db.client.async_ import AsyncDatabaseClient @@ -32,14 +32,12 @@ async def create_batch( async def create_urls( adb_client: AsyncDatabaseClient, - status: URLStatus = URLStatus.OK, source: URLSource = URLSource.COLLECTOR, record_type: RecordType | None = RecordType.RESOURCES, collector_metadata: dict | None = None, count: int = 1 ) -> list[SimpleURLMapping]: urls: list[URLInsertModel] = generate_urls( - status=status, source=source, collector_metadata=collector_metadata, count=count, diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py index b447888d..aa63b202 100644 --- a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -1,11 +1,10 @@ from datetime import datetime -from src.collectors.enums import URLStatus, CollectorType -from src.core.enums import 
BatchStatus, RecordType +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus from src.db.models.impl.batch.pydantic.insert import BatchInsertModel from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel @@ -39,7 +38,6 @@ def generate_batch_url_links( ] def generate_urls( - status: URLStatus = URLStatus.OK, source: URLSource = URLSource.COLLECTOR, collector_metadata: dict | None = None, count: int = 1 @@ -50,7 +48,6 @@ def generate_urls( results.append(URLInsertModel( url=f"example.com/{val}", scheme="https", - status=status, source=source, name=f"Example {val}", collector_metadata=collector_metadata, diff --git a/tests/helpers/setup/annotation/core.py b/tests/helpers/setup/annotation/core.py index 70123cb9..10bc67b7 100644 --- a/tests/helpers/setup/annotation/core.py +++ b/tests/helpers/setup/annotation/core.py @@ -1,4 +1,3 @@ -from src.collectors.enums import URLStatus from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.setup.annotation.model import AnnotationSetupInfo diff --git a/tests/helpers/setup/final_review/core.py b/tests/helpers/setup/final_review/core.py index c474fe2c..20c0f8df 100644 --- a/tests/helpers/setup/final_review/core.py +++ b/tests/helpers/setup/final_review/core.py @@ -1,5 +1,3 @@ -from typing import Optional - from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo from src.core.enums import RecordType from src.db.models.impl.flag.url_validated.enums import URLType diff --git a/tests/helpers/setup/wipe.py 
b/tests/helpers/setup/wipe.py index f6cd3582..7d4f0672 100644 --- a/tests/helpers/setup/wipe.py +++ b/tests/helpers/setup/wipe.py @@ -1,4 +1,4 @@ -from sqlalchemy import create_engine, Engine +from sqlalchemy import Engine from src.db.models.templates_.base import Base diff --git a/tests/manual/api/test_contributions.py b/tests/manual/api/test_contributions.py index 90d8e8de..6689ffdf 100644 --- a/tests/manual/api/test_contributions.py +++ b/tests/manual/api/test_contributions.py @@ -1,9 +1,9 @@ import pytest -from src.api.endpoints.contributions.leaderboard.query import GetContributionsLeaderboardQueryBuilder from src.api.endpoints.contributions.user.queries.core import GetUserContributionsQueryBuilder from src.db.client.async_ import AsyncDatabaseClient + # 72 = Max # 17 = Josh diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index 22203910..6eedb7f0 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -2,9 +2,9 @@ import dotenv -from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors.enums import CollectorType from src.core.enums import BatchStatus +from src.db.models.impl.batch.pydantic.info import BatchInfo def test_auto_googler_collector_lifecycle(test_core): diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py index 66020a92..85bfca55 100644 --- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py +++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py @@ -1,8 +1,9 @@ -from src.db.models.impl.batch.pydantic.info import BatchInfo +from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion + from src.collectors import CollectorType -from src.core.enums import BatchStatus from src.collectors.impl.ckan import group_search, package_search, 
organization_search -from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion +from src.core.enums import BatchStatus +from src.db.models.impl.batch.pydantic.info import BatchInfo def test_ckan_lifecycle(test_core): diff --git a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py index 216638dc..c78a8199 100644 --- a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py +++ b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py @@ -1,9 +1,10 @@ -from src.db.models.impl.batch.pydantic.info import BatchInfo -from src.collectors import CollectorType -from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion from test_automated.integration.core.helpers.constants import ALLEGHENY_COUNTY_MUCKROCK_ID, ALLEGHENY_COUNTY_TOWN_NAMES +from src.collectors import CollectorType +from src.core.enums import BatchStatus +from src.db.models.impl.batch.pydantic.info import BatchInfo + def test_muckrock_simple_search_collector_lifecycle(test_core): ci = test_core diff --git a/tests/manual/core/tasks/scheduled/test_push_to_huggingface.py b/tests/manual/core/tasks/scheduled/test_push_to_huggingface.py index a091ff5c..3a864bae 100644 --- a/tests/manual/core/tasks/scheduled/test_push_to_huggingface.py +++ b/tests/manual/core/tasks/scheduled/test_push_to_huggingface.py @@ -1,8 +1,6 @@ import pytest - from environs import Env -from src.core.env_var_manager import EnvVarManager from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.hub.client import HuggingFaceHubClient diff --git a/tests/manual/external/huggingface/inference/test_relevancy.py b/tests/manual/external/huggingface/inference/test_relevancy.py index e001d864..abe4296b 100644 --- 
a/tests/manual/external/huggingface/inference/test_relevancy.py +++ b/tests/manual/external/huggingface/inference/test_relevancy.py @@ -1,12 +1,11 @@ import pytest from aiohttp import ClientSession +from environs import Env from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.huggingface.inference.models.input import BasicInput from tests.manual.external.huggingface.inference.constants import EXAMPLE_WEBSITE -from environs import Env - @pytest.mark.asyncio async def test_huggingface_inference_relevancy_annotation(): diff --git a/tests/manual/external/internet_archive/test_search.py b/tests/manual/external/internet_archive/test_search.py index 930d0304..41dcee1f 100644 --- a/tests/manual/external/internet_archive/test_search.py +++ b/tests/manual/external/internet_archive/test_search.py @@ -2,7 +2,6 @@ from aiohttp import ClientSession from src.external.internet_archives.client import InternetArchivesClient -from src.external.internet_archives.models.capture import IACapture # BASE_URL = "nola.gov/getattachment/NOPD/Policies/Chapter-12-1-Department-Operations-Manual-EFFECTIVE-1-14-18.pdf/" BASE_URL = "example.com" diff --git a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py index f26f2a6f..0ec3ba16 100644 --- a/tests/manual/llm_api_logic/test_deepseek_record_classifier.py +++ b/tests/manual/llm_api_logic/test_deepseek_record_classifier.py @@ -1,7 +1,7 @@ import pytest -from src.db.dtos.url.html_content import URLHTMLContentInfo from src.core.tasks.url.operators.record_type.llm_api.record_classifier.deepseek import DeepSeekRecordClassifier +from src.db.dtos.url.html_content import URLHTMLContentInfo @pytest.mark.asyncio diff --git a/tests/manual/llm_api_logic/test_openai_record_classifier.py b/tests/manual/llm_api_logic/test_openai_record_classifier.py index 3b3ec08b..25208b63 100644 --- 
a/tests/manual/llm_api_logic/test_openai_record_classifier.py +++ b/tests/manual/llm_api_logic/test_openai_record_classifier.py @@ -1,7 +1,7 @@ import pytest -from src.db.dtos.url.html_content import URLHTMLContentInfo from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier +from src.db.dtos.url.html_content import URLHTMLContentInfo @pytest.mark.asyncio diff --git a/tests/manual/source_collectors/test_autogoogler_collector.py b/tests/manual/source_collectors/test_autogoogler_collector.py index 39d1f8e7..e0f609cb 100644 --- a/tests/manual/source_collectors/test_autogoogler_collector.py +++ b/tests/manual/source_collectors/test_autogoogler_collector.py @@ -1,13 +1,14 @@ from unittest.mock import AsyncMock import pytest +from environs import Env +from src.collectors.impl.auto_googler.collector import AutoGooglerCollector from src.collectors.impl.auto_googler.dtos.input import AutoGooglerInputDTO from src.core.env_var_manager import EnvVarManager from src.core.logger import AsyncCoreLogger -from src.collectors.impl.auto_googler.collector import AutoGooglerCollector from src.db.client.async_ import AsyncDatabaseClient -from environs import Env + @pytest.mark.asyncio async def test_autogoogler_collector(monkeypatch): diff --git a/tests/manual/source_collectors/test_ckan_collector.py b/tests/manual/source_collectors/test_ckan_collector.py index 9b5edc9f..753c8a30 100644 --- a/tests/manual/source_collectors/test_ckan_collector.py +++ b/tests/manual/source_collectors/test_ckan_collector.py @@ -4,9 +4,8 @@ from marshmallow import Schema, fields from src.collectors.impl.ckan.collector import CKANCollector -from src.core.logger import AsyncCoreLogger -from src.collectors.impl.ckan import collector from src.collectors.impl.ckan.dtos.input import CKANInputDTO +from src.core.logger import AsyncCoreLogger class CKANSchema(Schema): diff --git a/tests/manual/source_collectors/test_common_crawler_collector.py 
b/tests/manual/source_collectors/test_common_crawler_collector.py index e508c2ac..61e6fdbc 100644 --- a/tests/manual/source_collectors/test_common_crawler_collector.py +++ b/tests/manual/source_collectors/test_common_crawler_collector.py @@ -3,9 +3,8 @@ import pytest from marshmallow import Schema, fields -from src.core.logger import AsyncCoreLogger -from src.collectors.impl.common_crawler import collector from src.collectors.impl.common_crawler import CommonCrawlerInputDTO +from src.core.logger import AsyncCoreLogger class CommonCrawlerSchema(Schema): diff --git a/tests/manual/source_collectors/test_muckrock_collectors.py b/tests/manual/source_collectors/test_muckrock_collectors.py index d8153c6b..0a69cfc0 100644 --- a/tests/manual/source_collectors/test_muckrock_collectors.py +++ b/tests/manual/source_collectors/test_muckrock_collectors.py @@ -3,16 +3,17 @@ import pytest from marshmallow import Schema, fields -from src.core.logger import AsyncCoreLogger +from src.collectors.impl import MuckrockSimpleSearchCollector, \ + MuckrockCountyLevelSearchCollector, MuckrockAllFOIARequestsCollector from src.collectors.impl.muckrock.collectors.all_foia.dto import MuckrockAllFOIARequestsCollectorInputDTO from src.collectors.impl.muckrock.collectors.county.dto import MuckrockCountySearchCollectorInputDTO from src.collectors.impl.muckrock.collectors.simple.dto import MuckrockSimpleSearchCollectorInputDTO -from src.collectors.impl import MuckrockSimpleSearchCollector, \ - MuckrockCountyLevelSearchCollector, MuckrockAllFOIARequestsCollector +from src.core.logger import AsyncCoreLogger from src.db.client.async_ import AsyncDatabaseClient from tests.automated.integration.core.helpers.constants import ALLEGHENY_COUNTY_MUCKROCK_ID, \ ALLEGHENY_COUNTY_TOWN_NAMES + class MuckrockURLInfoSchema(Schema): url = fields.String(required=True) metadata = fields.Dict(required=True) diff --git a/tests/manual/unsorted/test_common_crawler_integration.py 
b/tests/manual/unsorted/test_common_crawler_integration.py index 4b79893a..d458079d 100644 --- a/tests/manual/unsorted/test_common_crawler_integration.py +++ b/tests/manual/unsorted/test_common_crawler_integration.py @@ -1,10 +1,7 @@ import csv -import datetime -import json import os import shutil import tempfile -from unittest.mock import patch import pytest from common_crawler.cache import CommonCrawlerCacheManager