diff --git a/.secrets.baseline b/.secrets.baseline index 1dcdba13..fc44cabf 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -110,13 +110,13 @@ { "hashed_secret": "4dcba4ad1d671981e2d211ebe56da8a5b40f14ef", "is_verified": false, - "line_number": 225, + "line_number": 231, "type": "Hex High Entropy String" }, { "hashed_secret": "ecdb6b62dc6de954dbbef8185029415aecae5e5a", "is_verified": false, - "line_number": 291, + "line_number": 289, "type": "Hex High Entropy String" } ] diff --git a/docs/openapi.yaml b/docs/openapi.yaml index e27f9a19..bc4e7767 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -685,7 +685,7 @@ paths: If `merge` is True, then any aliases that are not in the new data will be kept.' - operationId: update_metadata_alias_metadata__guid__aliases_put + operationId: update_metadata_aliases_metadata__guid__aliases_put parameters: - in: path name: guid @@ -721,7 +721,7 @@ paths: security: - HTTPBasic: [] - HTTPBearer: [] - summary: Update Metadata Alias + summary: Update Metadata Aliases tags: - Aliases /metadata/{guid}/aliases/{alias}: @@ -1026,6 +1026,367 @@ paths: summary: Get Object Latest tags: - Object + /semi-structured/{guid}: + delete: + description: Delete the record with the specified GUID or alias permanently. + If possible, AVOID deletion as it reduces reproducibility of research using + this data. + operationId: delete_semi_structured_data_semi_structured__guid__delete + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Delete Semi Structured Data + tags: + - Semi-Structured + get: + description: Get the semi-structured data record associated with the specified + GUID or alias. 
+ operationId: get_semi_structured_data_semi_structured__guid__get + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Semi Structured Data + tags: + - Semi-Structured + post: + description: Create a brand new record with the specified GUID or alias. + operationId: create_semi_structured_data_semi_structured__guid__post + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + requestBody: + content: + application/json: + schema: + title: Data + type: object + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Create Semi Structured Data + tags: + - Semi-Structured + put: + description: Create new version of existing record with specified GUID or alias; + if no existing record, then create brand new record. Either way, return the + final object created which may or may not have the same GUID specified above. 
+ operationId: update_semi_structured_data_semi_structured__guid__put + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + requestBody: + content: + application/json: + schema: + title: Data + type: object + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Update Semi Structured Data + tags: + - Semi-Structured + /semi-structured/{guid}/aliases: + delete: + description: Delete all metadata_aliases of the GUID. + operationId: delete_all_metadata_aliases_semi_structured__guid__aliases_delete + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Delete All Metadata Aliases + tags: + - Aliases + get: + description: Get the aliases for the provided GUID + operationId: get_metadata_aliases_semi_structured__guid__aliases_get + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Metadata Aliases + tags: + - Query + post: + description: Create metadata aliases for the GUID. 
+ operationId: create_metadata_aliases_semi_structured__guid__aliases_post + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/AliasObjInput' + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Create Metadata Aliases + tags: + - Aliases + put: + description: 'Update the metadata aliases of the GUID. + + + If `merge` is True, then any aliases that are not in the new data will be + + kept.' + operationId: update_metadata_aliases_semi_structured__guid__aliases_put + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + - in: query + name: merge + required: false + schema: + default: false + title: Merge + type: boolean + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/AliasObjInput' + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Update Metadata Aliases + tags: + - Aliases + /semi-structured/{guid}/aliases/{alias}: + delete: + description: Delete the specified metadata_alias of the GUID. 
+ operationId: delete_metadata_alias_semi_structured__guid__aliases__alias__delete + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + - in: path + name: alias + required: true + schema: + title: Alias + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + security: + - HTTPBasic: [] + - HTTPBearer: [] + summary: Delete Metadata Alias + tags: + - Aliases + /semi-structured/{guid}/latest: + get: + description: Get latest version of uniquely identified semi-structured data. + operationId: get_semi_structured_data_latest_semi_structured__guid__latest_get + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Semi Structured Data Latest + tags: + - Semi-Structured + /semi-structured/{guid}/versions: + get: + description: Get all versions of uniquely identified semi-structured data. + operationId: get_semi_structured_data_versions_semi_structured__guid__versions_get + parameters: + - in: path + name: guid + required: true + schema: + title: Guid + type: string + - description: Switch to returning a list of GUIDs (false), or GUIDs mapping + to their metadata (true). + in: query + name: data + required: false + schema: + default: false + description: Switch to returning a list of GUIDs (false), or GUIDs mapping + to their metadata (true). 
+ title: Data + type: boolean + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Semi Structured Data Versions + tags: + - Semi-Structured /version: get: description: '' diff --git a/migrations/versions/1f19cdfc4d64_add_baseid_and_created_date_columns.py b/migrations/versions/1f19cdfc4d64_add_baseid_and_created_date_columns.py new file mode 100644 index 00000000..d0e9d20b --- /dev/null +++ b/migrations/versions/1f19cdfc4d64_add_baseid_and_created_date_columns.py @@ -0,0 +1,28 @@ +"""Add baseid and created_date columns + +Revision ID: 1f19cdfc4d64 +Revises: 6819874e85b9 +Create Date: 2022-09-16 03:16:36.805415 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "1f19cdfc4d64" +down_revision = "6819874e85b9" # pragma: allowlist secret +branch_labels = None +depends_on = None + + +def upgrade(): + """Add nullable baseid and created_date columns to support versioning.""" + op.add_column("metadata", sa.Column("baseid", sa.Unicode())) + op.add_column("metadata", sa.Column("created_date", sa.DateTime)) + + +def downgrade(): + """Remove baseid and created_date columns.""" + op.drop_column("metadata", "baseid") + op.drop_column("metadata", "created_date") diff --git a/poetry.lock b/poetry.lock index f1a49b9f..19eec60b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -673,6 +673,17 @@ toml = "*" [package.extras] testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] +[[package]] +name = "pytest-dependency" +version = "0.5.1" +description = "Manage dependencies of tests" +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +pytest = ">=3.6.0" + [[package]] name = "python-dotenv" version = "0.20.0" @@ -928,7 +939,7 @@ python-versions = ">=3.4" [metadata] 
lock-version = "1.1" python-versions = "^3.9" -content-hash = "39c8b44d529241d5a9f71b28f18d21fa67438217d1ff4e088204cf2939e53fc0" +content-hash = "6f3cf63cdf7e85d86ba88a7f45a57d15822f3b63246c1a0cac0621f71209a35c" [metadata.files] alembic = [ @@ -1457,6 +1468,9 @@ pytest-cov = [ {file = "pytest-cov-2.12.1.tar.gz", hash = "sha256:261ceeb8c227b726249b376b8526b600f38667ee314f910353fa318caa01f4d7"}, {file = "pytest_cov-2.12.1-py2.py3-none-any.whl", hash = "sha256:261bb9e47e65bd099c89c3edf92972865210c36813f80ede5277dceb77a4a62a"}, ] +pytest-dependency = [ + {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, +] python-dotenv = [ {file = "python-dotenv-0.20.0.tar.gz", hash = "sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f"}, {file = "python_dotenv-0.20.0-py3-none-any.whl", hash = "sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938"}, diff --git a/pyproject.toml b/pyproject.toml index 428d0c5d..f1834755 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,9 +42,14 @@ nest-asyncio = "^1.5.1" "maintain" = "mds.maintain" "index" = "mds.index" "objects" = "mds.objects" +"semi_structured" = "mds.semi_structured" "agg_mds" = "mds.agg_mds" "aliases" = "mds.aliases" + +[tool.poetry.group.dev.dependencies] +pytest-dependency = "^0.5.1" + [build-system] requires = ["poetry>=0.12"] build-backend = "poetry.masonry.api" diff --git a/src/mds/aliases.py b/src/mds/aliases.py index eec71f3b..1a0a4fd6 100644 --- a/src/mds/aliases.py +++ b/src/mds/aliases.py @@ -6,6 +6,9 @@ for naming blobs. However, in cases where you want multiple identifiers to point to the same blob, aliases allow that without duplicating the actual blob. 
+ +Note: Must ensure that the /semi-structured endpoints here load before those in semi_structured.py +I think it happens now because aliases.py is lexicographically sorted before semi_structured.py """ import json import re @@ -47,6 +50,7 @@ class AliasObjInput(BaseModel): @mod.post("/metadata/{guid:path}/aliases") +@mod.post("/semi-structured/{guid:path}/aliases") async def create_metadata_aliases( guid: str, body: AliasObjInput, @@ -63,11 +67,21 @@ async def create_metadata_aliases( input_body_aliases = body.aliases or [] aliases = list(set(input_body_aliases)) - metadata_aliases = await MetadataAlias.query.where( + existing_metadata_guids = await Metadata.query.where( + Metadata.guid.in_(aliases) + ).gino.all() + + if existing_metadata_guids: + raise HTTPException( + HTTP_409_CONFLICT, + f"GUIDs with following names already exist: {[metadata.guid for metadata in existing_metadata_guids]}.", + ) + + existing_metadata_aliases = await MetadataAlias.query.where( MetadataAlias.guid == guid ).gino.all() - if metadata_aliases: + if existing_metadata_aliases: raise HTTPException( HTTP_409_CONFLICT, f"Aliases already exist for {guid}. 
" "Use PUT to overwrite.", @@ -96,7 +110,8 @@ async def create_metadata_aliases( @mod.put("/metadata/{guid:path}/aliases") -async def update_metadata_alias( +@mod.put("/semi-structured/{guid:path}/aliases") +async def update_metadata_aliases( guid: str, body: AliasObjInput, request: Request, @@ -119,6 +134,16 @@ async def update_metadata_alias( requested_aliases = set(input_body_aliases) logger.debug(f"requested_aliases: {requested_aliases}") + existing_metadata_guids = await Metadata.query.where( + Metadata.guid.in_(requested_aliases) + ).gino.all() + + if existing_metadata_guids: + raise HTTPException( + HTTP_409_CONFLICT, + f"GUIDs with following names already exist: {[metadata.guid for metadata in existing_metadata_guids]}.", + ) + existing_metadata_aliases = await MetadataAlias.query.where( MetadataAlias.guid == guid ).gino.all() @@ -162,6 +187,7 @@ async def update_metadata_alias( @mod.delete("/metadata/{guid:path}/aliases/{alias:path}") +@mod.delete("/semi-structured/{guid:path}/aliases/{alias:path}") async def delete_metadata_alias( guid: str, alias: str, @@ -191,6 +217,7 @@ async def delete_metadata_alias( @mod.delete("/metadata/{guid:path}/aliases") +@mod.delete("/semi-structured/{guid:path}/aliases") async def delete_all_metadata_aliases( guid: str, request: Request, diff --git a/src/mds/maintain.py b/src/mds/maintain.py index 1c7b6f21..782fac87 100644 --- a/src/mds/maintain.py +++ b/src/mds/maintain.py @@ -38,9 +38,13 @@ async def batch_create_metadata( data = bindparam("data") stmt = await conn.prepare( insert(Metadata) - .values(guid=bindparam("guid"), data=data, authz=authz) + .values( + guid=bindparam("guid"), data=data, authz=authz, created_date=None + ) .on_conflict_do_update( - index_elements=[Metadata.guid], set_=dict(data=data) + index_elements=[Metadata.guid], + set_=dict(data=data), + where=(Metadata.created_date == None), ) .returning(db.text("xmax")) ) @@ -49,12 +53,17 @@ async def batch_create_metadata( bad_input.append(data["guid"]) 
- elif await stmt.scalar(data) == 0: + elif (xmax := await stmt.scalar(data)) == 0: # run the upsert exactly once; xmax is reused by the next arm created.append(data["guid"]) + elif xmax is None: # no row returned: the conflicting row has created_date set, so the guarded update was skipped + bad_input.append(data["guid"]) else: updated.append(data["guid"]) else: stmt = await conn.prepare( insert(Metadata).values( - guid=bindparam("guid"), data=bindparam("data"), authz=authz + guid=bindparam("guid"), + data=bindparam("data"), + authz=authz, + created_date=None, ) ) for data in data_list: @@ -83,23 +92,32 @@ async def create_metadata(guid, data: dict, overwrite: bool = False): # POST /api/v1/objects/{GUID or ALIAS} and POST /api/v1/objects/upload endpoints if guid in FORBIDDEN_IDS: raise HTTPException( - HTTP_400_BAD_REQUEST, "GUID cannot have value: {FORBIDDEN_IDS}" + HTTP_400_BAD_REQUEST, f"GUID cannot have value: {FORBIDDEN_IDS}" ) if overwrite: rv = await db.first( insert(Metadata) - .values(guid=guid, data=data, authz=authz) - .on_conflict_do_update(index_elements=[Metadata.guid], set_=dict(data=data)) + .values(guid=guid, data=data, authz=authz, created_date=None) + .on_conflict_do_update( + index_elements=[Metadata.guid], + set_=dict(data=data), + where=(Metadata.created_date == None), + ) .returning(Metadata.data, db.text("xmax")) ) + if rv is None: + raise HTTPException( + HTTP_400_BAD_REQUEST, + f"Cannot overwrite {guid}, which corresponds to existing semi-structured data record. Use /semi-structured endpoint instead", + ) if rv["xmax"] != 0: created = False else: try: rv = ( await Metadata.insert() - .values(guid=guid, data=data, authz=authz) + .values(guid=guid, data=data, authz=authz, created_date=None) .returning(*Metadata) .gino.first() ) @@ -120,17 +138,17 @@ async def update_metadata(guid, data: dict, merge: bool = False): is also known as the shallow merge. The metadata service currently doesn't support deep merge. 
""" - # TODO PUT should create if it doesn't exist... 
metadata = ( await Metadata.update.values(data=(Metadata.data + data) if merge else data) .where(Metadata.guid == guid) + .where(Metadata.created_date == None) # exclude semi-structured data record .returning(*Metadata) .gino.first() ) if metadata: return metadata.data else: - raise HTTPException(HTTP_404_NOT_FOUND, f"Not found: {guid}") + return await create_metadata(guid, data, overwrite=True) @mod.delete("/metadata/{guid:path}") @@ -138,6 +156,7 @@ async def delete_metadata(guid): """Delete the metadata of the GUID.""" metadata = ( await Metadata.delete.where(Metadata.guid == guid) + .where(Metadata.created_date == None) # exclude semi-structured data record .returning(*Metadata) .gino.first() ) diff --git a/src/mds/models.py b/src/mds/models.py index 4cd99175..3948b616 100644 --- a/src/mds/models.py +++ b/src/mds/models.py @@ -1,4 +1,6 @@ from gino.ext.starlette import Gino +import datetime +from sqlalchemy import DateTime from sqlalchemy.dialects.postgresql import JSONB from . import config @@ -19,7 +21,10 @@ class Metadata(db.Model): __tablename__ = "metadata" guid = db.Column(db.Unicode(), primary_key=True) + baseid = db.Column(db.Unicode()) data = db.Column(JSONB()) + # Note: default function needs to be defined as lambda in order to support function mocking for unit testing + created_date = db.Column(DateTime, default=lambda: datetime.datetime.utcnow()) authz = db.Column(JSONB(), nullable=False) diff --git a/src/mds/objects.py b/src/mds/objects.py index 32555dc4..99ad92cf 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -463,7 +463,11 @@ async def delete_object( # Recreate data in metadata table in case of any exception if metadata_obj: await Metadata.create( - guid=metadata_obj.guid, data=metadata_obj.data, authz=metadata_obj.authz + guid=metadata_obj.guid, + baseid=metadata_obj.baseid, + data=metadata_obj.data, + created_date=metadata_obj.created_date, + authz=metadata_obj.authz, ) status_code = ( err.response.status_code if err.response else 
HTTP_500_INTERNAL_SERVER_ERROR @@ -696,7 +700,7 @@ async def _add_metadata(blank_guid: str, metadata: dict, authz: dict, uploader: try: rv = ( await Metadata.insert() - .values(guid=blank_guid, data=metadata, authz=authz) + .values(guid=blank_guid, data=metadata, authz=authz, created_date=None) .returning(*Metadata) .gino.first() ) diff --git a/src/mds/query.py b/src/mds/query.py index c2365427..acc6dda3 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -104,6 +104,7 @@ def add_filter(query): @mod.get("/metadata/{guid:path}/aliases") +@mod.get("/semi-structured/{guid:path}/aliases") async def get_metadata_aliases( guid: str, ) -> JSONResponse: diff --git a/src/mds/semi_structured.py b/src/mds/semi_structured.py new file mode 100644 index 00000000..6813d44b --- /dev/null +++ b/src/mds/semi_structured.py @@ -0,0 +1,345 @@ +""" +Note: Must ensure that the /semi-structured endpoints in aliases.py load before those here. +I think it happens now because aliases.py is lexicographically sorted before semi_structured.py +""" +import uuid +import json +import logging +from asyncpg import UniqueViolationError +from fastapi import HTTPException, Query, APIRouter, Depends +import httpx +from starlette.responses import JSONResponse +from starlette.status import ( + HTTP_200_OK, + HTTP_201_CREATED, + HTTP_204_NO_CONTENT, + HTTP_400_BAD_REQUEST, + HTTP_404_NOT_FOUND, + HTTP_409_CONFLICT, + HTTP_500_INTERNAL_SERVER_ERROR, +) + +from .admin_login import admin_required +from .models import Metadata, MetadataAlias +from . import config + +mod = APIRouter() + + +@mod.get("/semi-structured/{guid:path}/versions") +async def get_semi_structured_data_versions( + guid: str, + data: bool = Query( + False, + description="Switch to returning a list of GUIDs (false), " + "or GUIDs mapping to their metadata (true).", + ), +) -> JSONResponse: + """ + Get all versions of uniquely identified semi-structured data. 
+ + Args: + guid (str): GUID or alias + data (bool): output full metadata else just GUIDs (default false) + + Returns: + 200: if successfully get versions + 404: if record with specified GUID not found + """ + # resolve alias if exists + alias_record = await MetadataAlias.get(guid) + if alias_record: + guid = alias_record.guid + + existing_record = await Metadata.get(guid) + + if not existing_record: + raise HTTPException(HTTP_404_NOT_FOUND, f"Not found: {guid}") + + if existing_record.baseid: + versions = await Metadata.query.where( + Metadata.baseid == existing_record.baseid + ).gino.all() + else: + versions = [existing_record] + + response = {"versions": []} + for version in versions: + output = {"guid": version.guid} + if data: + output |= {"guid_type": "semi-structured"} + if version.created_date: + output |= {"created_date": version.created_date.isoformat()} + output |= version.data + response["versions"].append(output) + + return JSONResponse(response, HTTP_200_OK) + + +@mod.get("/semi-structured/{guid:path}/latest") +async def get_semi_structured_data_latest(guid: str) -> JSONResponse: + """ + Get latest version of uniquely identified semi-structured data. 
+ + Args: + guid (str): GUID or alias + + Returns: + 200: if successfully get latest version + 404: if record with specified GUID not found + """ + # resolve alias if exists + alias_record = await MetadataAlias.get(guid) + if alias_record: + guid = alias_record.guid + + existing_record = await Metadata.get(guid) + + if not existing_record: + raise HTTPException(HTTP_404_NOT_FOUND, f"Not found: {guid}") + + if not existing_record.created_date: + raise HTTPException( + HTTP_400_BAD_REQUEST, f"Cannot call /latest on metadata record" + ) + + if existing_record.baseid: + latest = ( + await Metadata.query.where(Metadata.baseid == existing_record.baseid) + .order_by(Metadata.created_date.desc()) + .gino.first() + ) + else: + latest = existing_record + + response = { + "guid": latest.guid, + "guid_type": "semi-structured", + } + if latest.created_date: + response |= {"created_date": latest.created_date.isoformat()} + response |= latest.data + + return JSONResponse(response, HTTP_200_OK) + + +@mod.get("/semi-structured/{guid:path}") +async def get_semi_structured_data(guid: str) -> JSONResponse: + """ + Get the semi-structured data record associated with the specified GUID or alias. + + Args: + guid (str): GUID or alias + + Returns: + 200: { "guid": guid, "guid_type": "semi-structured", ...data... 
} + 404: if record with specified GUID not found + 500: if alias record exists but corresponding GUID not found + """ + # resolve alias if exists + alias_record = await MetadataAlias.get(guid) + if alias_record: + guid = alias_record.guid + + existing_record = await Metadata.get(guid) + if not existing_record: + raise HTTPException(HTTP_404_NOT_FOUND, f"Not found: {guid}") + + response = {"guid": guid} + if existing_record.created_date: + response |= {"guid_type": "semi-structured"} + response |= existing_record.data + return JSONResponse(response, HTTP_200_OK) + + +@mod.post("/semi-structured/{guid:path}", dependencies=[Depends(admin_required)]) +async def create_semi_structured_data(guid: str, data: dict) -> JSONResponse: + """ + Create a brand new record with the specified GUID or alias. + + Args: + guid (str): GUID or alias + data (dict): semi-structured data + + Returns: + 201: { "guid": guid, "guid_type": "semi-structured", ...data... } + 400: if invalid data, e.g. data specifies guid inconsistent with query parameter guid + 403: if authorization insufficient for requested action + 409: if record with specified GUID already exists + """ + # resolve alias if exists + alias_record = await MetadataAlias.get(guid) + if alias_record: # should cause 409 + guid = alias_record.guid + + # guid_type must be "semi-structured" + if "guid_type" in data and data["guid_type"] != "semi-structured": + raise HTTPException( + HTTP_400_BAD_REQUEST, + f"Query param data['guid_type'] must equal 'semi-structured'", + ) + + # guid must equal data['guid'], if it exists + if "guid" in data and data["guid"] != guid: + raise HTTPException( + HTTP_400_BAD_REQUEST, + f"Query param guid='{guid}' inconsistent with data['guid']='{data['guid']}'", + ) + + # data attribute should not contain "guid" or "guid_type" if exists + data.pop("guid", None) + data.pop("guid_type", None) + + try: + record = ( + await Metadata.insert() + .values( + guid=guid, + data=data, + 
authz=json.loads(config.DEFAULT_AUTHZ_STR), + ) + .returning(*Metadata) + .gino.first() + ) + except UniqueViolationError: + raise HTTPException(HTTP_409_CONFLICT, f"Conflict: {guid}") + + response = { + "guid": guid, + "guid_type": "semi-structured", + } | record["data"] + + return JSONResponse(response, HTTP_201_CREATED) + + +@mod.put("/semi-structured/{guid:path}", dependencies=[Depends(admin_required)]) +async def update_semi_structured_data(guid: str, data: dict) -> JSONResponse: + """ + Create new version of existing record with specified GUID or alias; if no existing record, then create brand new record. Either way, return the final object created which may or may not have the same GUID specified above. + + Args: + guid (str): GUID or alias + data (dict): semi-structured data + + Returns: + 201: { "guid": guid, "guid_type": "semi-structured", ...data... } + 400: if invalid data, e.g. data specifies guid_type other than "semi-structured" or data doesn't specify updated guid despite existing record + 403: if authorization insufficient for requested action + 409: if record with specified GUID already exists + 500: if data['guid'] matches alias but corresponding GUID not found + """ + # resolve alias if exists + alias_record = await MetadataAlias.get(guid) + if alias_record: + guid = alias_record.guid + + # guid_type must be "semi-structured" + if "guid_type" in data and data["guid_type"] != "semi-structured": + raise HTTPException( + HTTP_400_BAD_REQUEST, + f"Query param data['guid_type'] must equal 'semi-structured'", + ) + + existing_record = await Metadata.get(guid) + + # if specifying new_guid=data['guid'], then record with old_guid must already exist + if "guid" in data and data["guid"] != guid and not existing_record: + raise HTTPException( + HTTP_400_BAD_REQUEST, + f"Query param guid='{guid}' different from data['guid']='{data['guid']}' yet no record to update", + ) + + # if updating existing record, then data must specify the updated guid + if 
existing_record and "guid" not in data: + raise HTTPException( + HTTP_400_BAD_REQUEST, + f"Query param data doesn't specify updated guid", + ) + + # cannot update metadata record using this endpoint + if existing_record and not existing_record.created_date: + raise HTTPException( + HTTP_400_BAD_REQUEST, + f"Cannot update metadata record. Use /metadata endpoint instead", + ) + + # set baseid + if existing_record: + if existing_record.baseid: + baseid = existing_record.baseid + else: + baseid = str(uuid.uuid4()) + await existing_record.update(baseid=baseid).apply() + else: + baseid = None + + # data attribute should not contain "guid" or "guid_type" if exists + guid = data.pop("guid", guid) + data.pop("guid_type", None) + + # resolve data['guid'] as alias if exists + alias_record = await MetadataAlias.get(guid) + if alias_record: # should cause either 409 or 500 error + guid = alias_record.guid + + try: + record = ( + await Metadata.insert() + .values( + guid=guid, + data=data, + authz=json.loads(config.DEFAULT_AUTHZ_STR), + baseid=baseid, + ) + .returning(*Metadata) + .gino.first() + ) + except UniqueViolationError: + raise HTTPException(HTTP_409_CONFLICT, f"Conflict: {guid}") + + # if data['guid'] matches existing alias but corresponding guid not found + if alias_record: + message = f"Alias record exists but GUID not found: {guid}" + logging.error(message) + raise HTTPException(HTTP_500_INTERNAL_SERVER_ERROR, message) + + response = { + "guid": guid, + "guid_type": "semi-structured", + } | record["data"] + + return JSONResponse(response, HTTP_201_CREATED) + + +@mod.delete("/semi-structured/{guid:path}", dependencies=[Depends(admin_required)]) +async def delete_semi_structured_data(guid: str) -> JSONResponse: + """ + Delete the record with the specified GUID or alias permanently. If possible, AVOID deletion as it reduces reproducibility of research using this data. 
+ + Args: + guid (str): GUID or alias + + Returns: + 204: if record is successfully deleted + 403: if authorization insufficient for requested action + 404: if record with specified GUID not found + """ + # resolve alias if exists + alias_record = await MetadataAlias.get(guid) + if alias_record: + guid = alias_record.guid + + existing_record = ( + await Metadata.delete.where(Metadata.guid == guid) + .where(Metadata.created_date != None) + .returning(*Metadata) + .gino.first() + ) + if not existing_record: + raise HTTPException(HTTP_404_NOT_FOUND, f"Not found: {guid}") + + return JSONResponse({}, HTTP_204_NO_CONTENT) + + +def init_app(app): + app.include_router(mod, tags=["Semi-Structured"]) diff --git a/tests/test_aliases.py b/tests/test_aliases.py index 243b763a..a2506fe8 100644 --- a/tests/test_aliases.py +++ b/tests/test_aliases.py @@ -233,6 +233,56 @@ def test_update_already_created_aliases(guid, aliases, updates, merge, client): client.delete(f"/metadata/{guid}").raise_for_status() +@pytest.mark.parametrize( + "guid1,guid2,aliases", + [ + ( + "test_get_aliases", + "dg.1234/test_get_aliases", + ["test_get_aliases", "alias_a"], + ), + ( + "test_get_aliases", + "dg.1234/test_get_aliases", + ["dg.1234/test_get_aliases", "alias_b"], + ), + ], +) +def test_alias_matches_existing_guid(guid1, guid2, aliases, client): + """ + Ensure non-successful response when trying to POST/PUT an alias which matches an existing guid + """ + data = dict(a=1, b=2) + client.post(f"/metadata/{guid1}", json=data).raise_for_status() + try: + client.post(f"/metadata/{guid2}", json=data).raise_for_status() + try: + response = client.post( + f"/metadata/{guid1}/aliases", json={"aliases": aliases} + ) + assert response.status_code == 409 + + response = client.post( + f"/metadata/{guid2}/aliases", json={"aliases": aliases} + ) + assert response.status_code == 409 + + response = client.put( + f"/metadata/{guid1}/aliases", json={"aliases": aliases} + ) + assert response.status_code == 409 + + 
response = client.put( + f"/metadata/{guid2}/aliases", json={"aliases": aliases} + ) + assert response.status_code == 409 + + finally: + client.delete(f"/metadata/{guid2}").raise_for_status() + finally: + client.delete(f"/metadata/{guid1}").raise_for_status() + + @pytest.mark.parametrize( "guid,aliases", [ diff --git a/tests/test_maintain.py b/tests/test_maintain.py index 2ac56af8..842c46de 100644 --- a/tests/test_maintain.py +++ b/tests/test_maintain.py @@ -41,6 +41,18 @@ def test_create_forbidden_guid(client, forbidden_id): assert resp.json().get("detail") +@pytest.mark.parametrize("key", ["test_create", "dg.1234/test_create"]) +def test_overwrite_semi_structured_data(client, key): + data = dict(a=1, b=2) + client.post(f"/semi-structured/{key}", json=data).raise_for_status() + try: + resp = client.post(f"/metadata/{key}?overwrite=true", json=data) + assert resp.status_code == 400 + + finally: + client.delete(f"/semi-structured/{key}").raise_for_status() + + def test_batch_create(client): data = dict(a=1, b=2) try: @@ -122,24 +134,78 @@ def test_batch_create_forbidden_guid(client, forbidden_id): client.delete(f"/metadata/tbc_{i}") +def test_batch_create_semi_structured_data(client): + data = dict(a=1, b=2) + semi_structured_guid = "foobar" + client.post( + f"/semi-structured/{semi_structured_guid}", json=data + ).raise_for_status() + try: + batch_data = [dict(guid=f"tbc_{i}", data=data) for i in range(64)] + batch_data.append({"guid": semi_structured_guid, "data": data}) + try: + resp = client.post("/metadata", json=batch_data) + resp.raise_for_status() + assert len(resp.json()["created"]) == 64 + assert len(resp.json()["updated"]) == 0 + assert len(resp.json()["conflict"]) == 0 + assert len(resp.json()["bad_input"]) == 1 + + batch_data = [dict(guid=f"tbc_{i}", data=data) for i in range(32, 96)] + batch_data.append({"guid": semi_structured_guid, "data": data}) + resp = client.post("/metadata", json=batch_data) + resp.raise_for_status() + assert 
len(resp.json()["created"]) == 32 + assert len(resp.json()["updated"]) == 32 + assert len(resp.json()["conflict"]) == 0 + assert len(resp.json()["bad_input"]) == 1 + + batch_data = [dict(guid=f"tbc_{i}", data=data) for i in range(64, 128)] + batch_data.append({"guid": semi_structured_guid, "data": data}) + resp = client.post( + "/metadata?overwrite=false", + json=batch_data, + ) + resp.raise_for_status() + assert len(resp.json()["created"]) == 32 + assert len(resp.json()["updated"]) == 0 + assert len(resp.json()["conflict"]) == 33 + assert len(resp.json()["bad_input"]) == 0 + + finally: + for i in range(128): + client.delete(f"/metadata/tbc_{i}") + finally: + client.delete(f"/semi-structured/{semi_structured_guid}") + + @pytest.mark.parametrize("key", ["test_update", "dg.1234/test_update"]) def test_update(client, key): data = dict(a=1, b=2) - client.post("/metadata/" + key, json={}).raise_for_status() + client.put(f"/metadata/{key}", json={}).raise_for_status() try: - resp = client.put("/metadata/" + key, json=data) + resp = client.put(f"/metadata/{key}", json=data) resp.raise_for_status() assert resp.json() == data - resp = client.get("/metadata/" + key) + resp = client.get(f"/metadata/{key}") resp.raise_for_status() assert resp.json() == data - resp = client.put("/metadata/{}_not_existing".format(key), json=data) - assert resp.status_code == 404 + finally: + client.delete(f"/metadata/{key}") + + +@pytest.mark.parametrize("key", ["test_update", "dg.1234/test_update"]) +def test_update_semi_structured_data(client, key): + data = dict(a=1, b=2) + client.post(f"/semi-structured/{key}", json={}).raise_for_status() + try: + resp = client.put(f"/metadata/{key}", json=data) + assert resp.status_code == 400 finally: - client.delete("/metadata/" + key) + client.delete(f"/semi-structured/{key}") @pytest.mark.parametrize("merge", [True, False]) @@ -173,3 +239,14 @@ def test_delete(client, key): client.delete("/metadata/" + key).raise_for_status() resp = 
client.delete("/metadata/" + key) assert resp.status_code == 404 + + +@pytest.mark.parametrize("key", ["test_delete", "dg.1234/test_delete"]) +def test_delete_semi_structured_data(client, key): + client.post(f"/semi-structured/{key}", json={}).raise_for_status() + try: + resp = client.delete(f"/metadata/{key}") + assert resp.status_code == 404 + + finally: + client.delete(f"/semi-structured/{key}").raise_for_status() diff --git a/tests/test_migrations.py b/tests/test_migrations.py index 0348d994..3b6b45d2 100644 --- a/tests/test_migrations.py +++ b/tests/test_migrations.py @@ -116,7 +116,13 @@ async def test_4d93784a25e5_downgrade( db.text(f"SELECT * FROM metadata WHERE guid = '{fake_guid}'") ) row = {k: v for k, v in data[0].items()} - assert row == {"guid": fake_guid, "data": new_metadata, "authz": authz_data} + assert row == { + "guid": fake_guid, + "data": new_metadata, + "authz": authz_data, + "baseid": None, + "created_date": None, + } # downgrade to before "add_authz_column" migration alembic_main(["--raiseerr", "downgrade", "f96cb3b2c523"]) diff --git a/tests/test_semi_structured.py b/tests/test_semi_structured.py new file mode 100644 index 00000000..068dd6a0 --- /dev/null +++ b/tests/test_semi_structured.py @@ -0,0 +1,347 @@ +import datetime +import pytest + +from unittest.mock import patch, Mock +from mds import config + +DETERMINISTIC_START_TIME = datetime.datetime(1970, 1, 1) # Unix epoch + + +def reset_time(): + """ + Reset time returned by mocked_utcnow to DETERMINISTIC_START_TIME. + Used for resetting mocked time between test runs. + """ + global TIME_DELTA + TIME_DELTA = datetime.timedelta(days=-1) + + +def mocked_utcnow(): + """ + Begin at DETERMINISTIC_START_TIME and increment one day every time this function is called. + Mocks datetime.datetime.utcnow(), which is used as default `created_date` in `models` module. + Used for mocking endpoints which return `created_date` attribute. 
+ """ + global DETERMINISTIC_START_TIME, TIME_DELTA + TIME_DELTA += datetime.timedelta(days=1) + return DETERMINISTIC_START_TIME + TIME_DELTA + + +@pytest.mark.parametrize( + "key", ["test_semi_structured", "dg.1234/test_semi_structured"] +) +@pytest.mark.dependency(name="test_delete") +def test_delete(client, key): + """ + Try to DELETE a semi-structured data record. + Ensure 404 if record doesn't exist or trying to delete metadata record. + """ + client.post(f"/semi-structured/{key}", json={}).raise_for_status() + + resp = client.delete(f"/semi-structured/{key}") + assert resp.status_code == 204 + + resp = client.delete(f"/semi-structured/{key}") + assert resp.status_code == 404 + + client.post(f"/metadata/{key}", json={}).raise_for_status() + try: + resp = client.delete(f"/semi-structured/{key}") + assert resp.status_code == 404 + finally: + client.delete(f"/metadata/{key}") + + +@pytest.mark.parametrize( + "key,guid,guid_type", + [ + ("test_semi_structured", None, None), + ("dg.1234/test_semi_structured", None, None), + ("dg.2345/test_semi_structured", "dg.2345/test_semi_structured", None), + ("dg.3456/test_semi_structured", "anything else", None), + ("dg.4567/test_semi_structured", None, "semi-structured"), + ("dg.5678/test_semi_structured", None, "anything else"), + ], +) +@pytest.mark.parametrize("is_post", [True, False]) +@pytest.mark.dependency(depends=["test_delete"]) +def test_create_get(client, key, guid, guid_type, is_post): + """ + Create a semi-structured data record with either POST or PUT. + Ensure that if either guid or guid_type are specified in data that they are set appropriately. + Ensure that able to GET newly created record. + Ensure that duplicated creation causes 409 and doesn't alter existing record. 
+ """ + + data = dict(a=1, b=2) + if guid is not None: + data["guid"] = guid + if guid_type is not None: + data["guid_type"] = guid_type + + expected_resp = { + "guid": key, + "guid_type": "semi-structured", + } | data + + if is_post: + resp = client.post(f"/semi-structured/{key}", json=data) + else: + # use PUT instead of POST, should result in same behavior for new keys + resp = client.put(f"/semi-structured/{key}", json=data) + + if (guid and guid != key) or (guid_type and guid_type != "semi-structured"): + assert resp.status_code == 400 + else: + assert resp.status_code == 201 + try: + assert resp.json() == expected_resp + + resp = client.post(f"/semi-structured/{key}", json=data) + assert resp.status_code == 409 + + # check that GET works + that 409 didn't alter existing record + resp = client.get(f"/semi-structured/{key}") + assert resp.status_code == 200 + assert resp.json() == expected_resp + + finally: + client.delete(f"/semi-structured/{key}").raise_for_status() + + +@pytest.mark.parametrize( + "old_key,new_key", + [ + ("test_semi_structured", None), + ("test_semi_structured", "test_semi_structured"), + ("dg.1234/test_semi_structured", "dg.2345/test_semi_structured"), + ], +) +@pytest.mark.dependency(depends=["test_delete"]) +def test_update(client, old_key, new_key): + """ + Create a semi-structured data record with POST then try to create new updated record with PUT. + Ensure that old record is not changed by update. 
+ """ + old_data = dict(a=1, b=2) + new_data = dict(c=3) + if new_key is not None: + new_data["guid"] = new_key + + old_expected_resp = { + "guid": old_key, + "guid_type": "semi-structured", + } | old_data + + new_expected_resp = { + "guid": new_key, + "guid_type": "semi-structured", + } | new_data + + # Do not support updating metadata record + client.post(f"/metadata/{old_key}", json=old_data).raise_for_status() + try: + resp = client.put(f"/semi-structured/{old_key}", json=new_data) + assert resp.status_code == 400 + finally: + client.delete(f"/metadata/{old_key}").raise_for_status() + + client.post(f"/semi-structured/{old_key}", json=old_data).raise_for_status() + try: + resp = client.put(f"/semi-structured/{old_key}", json=new_data) + if not new_key: + assert resp.status_code == 400 + elif new_key == old_key: + assert resp.status_code == 409 + else: + assert resp.status_code == 201 + try: + assert resp.json() == new_expected_resp + + resp = client.get(f"/semi-structured/{old_key}") + resp.raise_for_status() + assert resp.json() == old_expected_resp + + resp = client.get(f"/semi-structured/{new_key}") + resp.raise_for_status() + assert resp.json() == new_expected_resp + + finally: + client.delete(f"/semi-structured/{new_key}").raise_for_status() + finally: + client.delete(f"/semi-structured/{old_key}").raise_for_status() + + +@pytest.mark.parametrize( + "key1,key2,key3", + [ + ("test_semi_structured1", "test_semi_structured2", "test_semi_structured3"), + ( + "dg.1234/test_semi_structured", + "dg.2345/test_semi_structured", + "dg.3456/test_semi_structured", + ), + ], +) +@pytest.mark.parametrize("data", [False, True]) +@patch("datetime.datetime", Mock(utcnow=mocked_utcnow)) +@pytest.mark.dependency(depends=["test_delete"]) +def test_versions(client, key1, key2, key3, data): + """ + Perform several updates and verify integrity of version history. + Check both types of output by setting `data` test parameter. 
+ Ensure 404 if record with specified key doesn't exist. + """ + reset_time() # resets mocked time to DETERMINISTIC_START_TIME between test runs + + data1 = dict(guid=key1, a=1, b=2) + data2 = dict(guid=key2, c=3) + data3 = dict(guid=key3, d=4, e=5, f=6) + + if data: + expected_resp = dict( + versions=[ + data1 + | { + "guid_type": "semi-structured", + "created_date": "1970-01-01T00:00:00", + }, + data2 + | { + "guid_type": "semi-structured", + "created_date": "1970-01-02T00:00:00", + }, + data3 + | { + "guid_type": "semi-structured", + "created_date": "1970-01-03T00:00:00", + }, + ] + ) + else: + expected_resp = dict( + versions=[ + {"guid": key1}, + {"guid": key2}, + {"guid": key3}, + ] + ) + + # 404 if no existing record + resp = client.get(f"/semi-structured/{key1}/versions") + assert resp.status_code == 404 + + client.post(f"/semi-structured/{key1}", json=data1).raise_for_status() + try: + # support for even just one version + client.get(f"/semi-structured/{key1}/versions").raise_for_status() + + client.put(f"/semi-structured/{key1}", json=data2).raise_for_status() + try: + client.put(f"/semi-structured/{key1}", json=data3).raise_for_status() + try: + resp = client.get(f"/semi-structured/{key1}/versions?data={data}") + assert resp.status_code == 200 + assert resp.json() == expected_resp + + resp = client.get(f"/semi-structured/{key2}/versions?data={data}") + assert resp.status_code == 200 + assert resp.json() == expected_resp + + resp = client.get(f"/semi-structured/{key3}/versions?data={data}") + assert resp.status_code == 200 + assert resp.json() == expected_resp + + finally: + client.delete(f"/semi-structured/{key3}").raise_for_status() + finally: + client.delete(f"/semi-structured/{key2}").raise_for_status() + finally: + client.delete(f"/semi-structured/{key1}").raise_for_status() + + +@pytest.mark.parametrize( + "key1,key2,key3", + [ + ("test_semi_structured1", "test_semi_structured2", "test_semi_structured3"), + ( + "dg.1234/test_semi_structured", + 
"dg.2345/test_semi_structured", + "dg.3456/test_semi_structured", + ), + ], +) +@patch("datetime.datetime", Mock(utcnow=mocked_utcnow)) +@pytest.mark.dependency(depends=["test_delete"]) +def test_latest(client, key1, key2, key3): + """ + Perform several updates and verify latest record. + Ensure 400 if calling on metadata record. + Ensure 404 if record with specified key doesn't exist + """ + reset_time() # resets mocked time to DETERMINISTIC_START_TIME between test runs + + data1 = dict(guid=key1, a=1, b=2) + data2 = dict(guid=key2, c=3) + data3 = dict(guid=key3, d=4, e=5, f=6) + + expected_resp = { + "guid_type": "semi-structured", + "created_date": "1970-01-03T00:00:00", + } | data3 + + # 400 if metadata record + client.post(f"/metadata/{key1}", json=data1).raise_for_status() + try: + resp = client.get(f"/semi-structured/{key1}/latest") + assert resp.status_code == 400 + finally: + client.delete(f"/metadata/{key1}").raise_for_status() + + # 404 if no existing record + resp = client.get(f"/semi-structured/{key1}/latest") + assert resp.status_code == 404 + + client.post(f"/semi-structured/{key1}", json=data1).raise_for_status() + try: + # support for even just one version + client.get(f"/semi-structured/{key1}/latest").raise_for_status() + + client.put(f"/semi-structured/{key1}", json=data2).raise_for_status() + try: + client.put(f"/semi-structured/{key1}", json=data3).raise_for_status() + try: + resp = client.get(f"/semi-structured/{key1}/latest") + assert resp.status_code == 200 + assert resp.json() == expected_resp + + resp = client.get(f"/semi-structured/{key2}/latest") + assert resp.status_code == 200 + assert resp.json() == expected_resp + + resp = client.get(f"/semi-structured/{key3}/latest") + assert resp.status_code == 200 + assert resp.json() == expected_resp + + finally: + client.delete(f"/semi-structured/{key3}").raise_for_status() + finally: + client.delete(f"/semi-structured/{key2}").raise_for_status() + finally: + 
client.delete(f"/semi-structured/{key1}").raise_for_status() + + +def test_authorization(client): + """ + Ensure that create, update, and delete endpoints are protected by admin authorization. + """ + tmp = config.DEBUG + config.DEBUG = False + + try: + assert client.post(f"/semi-structured/foobar", json={}).status_code == 403 + assert client.put(f"/semi-structured/foobar", json={}).status_code == 403 + assert client.delete(f"/semi-structured/foobar", json={}).status_code == 403 + + finally: + config.DEBUG = tmp