From 2d7666a829f7dc35212f1432ada024833c7ce35f Mon Sep 17 00:00:00 2001 From: John McCann Date: Wed, 18 Nov 2020 22:05:01 -0800 Subject: [PATCH 01/39] feat(endpoint): add GET /objects --- src/mds/objects.py | 62 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index 8cc27b29..bddb55a5 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -3,7 +3,7 @@ from authutils.token.fastapi import access_token from asyncpg import UniqueViolationError -from fastapi import HTTPException, APIRouter, Security +from fastapi import HTTPException, APIRouter, Security, Query from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials import httpx from pydantic import BaseModel @@ -22,7 +22,7 @@ from . import config, logger from .models import Metadata -from .query import get_metadata +from .query import search_metadata, get_metadata mod = APIRouter() @@ -226,6 +226,64 @@ async def create_object_for_id( return JSONResponse(response, HTTP_201_CREATED) +@mod.get("/objects") +async def get_objects( + request: Request, + data: bool = Query( + False, + description="Switch to returning a list of GUIDs (false), " + "or GUIDs mapping to their metadata (true).", + ), + limit: int = Query( + 10, description="Maximum number of records returned. (max: 2000)" + ), + offset: int = Query(0, description="Return results at this given offset."), + _request_paths: str = Query( + "*", description="Return results at this given offset." + ), +) -> JSONResponse: + """ + XXX comments + """ + # XXX just pass kwargs? 
+ metadata_objects = await search_metadata( + request, data=data, limit=limit, offset=offset + ) + + try: + endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + f"/index" + response = await request.app.async_client.get( + endpoint, params=request.query_params + ) + response.raise_for_status() + + records = response.json() + except httpx.HTTPError as err: + logger.debug(err) + if err.response and err.response.status_code == 404: + # XXX better error message + msg = f"Got a 404 from indexd's /index endpoint :(" + logger.debug(msg) + raise HTTPException( + HTTP_404_NOT_FOUND, + msg, + ) + else: + # XXX better message + msg = f"Unable to get successful response from indexd's /index endpoint" + logger.error(f"{msg}\nException:\n{err}", exc_info=True) + raise + records = {r["did"]: r for r in records["records"]} + + # import pdb; pdb.set_trace() + response = { + guid: {"record": records[guid], "metadata": o} + for guid, o in metadata_objects.items() + if guid in records + } + return response + + @mod.get("/objects/{guid:path}/download") async def get_object_signed_download_url( guid: str, From fe95adfe998453b2ccfd39750cbb935ed3f1339b Mon Sep 17 00:00:00 2001 From: John McCann Date: Wed, 2 Dec 2020 17:04:30 -0800 Subject: [PATCH 02/39] feat(GET /objects): use search_metadata_helper --- src/mds/objects.py | 32 ++++++++++++++++++++++++++------ src/mds/query.py | 26 +++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index bddb55a5..fec79fe7 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -7,6 +7,7 @@ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials import httpx from pydantic import BaseModel +from starlette.datastructures import QueryParams from starlette.requests import Request from starlette.responses import JSONResponse from starlette.status import ( @@ -22,7 +23,7 @@ from . 
import config, logger from .models import Metadata -from .query import search_metadata, get_metadata +from .query import search_metadata_helper, get_metadata mod = APIRouter() @@ -229,6 +230,7 @@ async def create_object_for_id( @mod.get("/objects") async def get_objects( request: Request, + # XXX how to handle this being False data: bool = Query( False, description="Switch to returning a list of GUIDs (false), " @@ -245,19 +247,36 @@ async def get_objects( """ XXX comments """ + + # import pdb; pdb.set_trace() + mds_query_params = { + k[len("metadata.") :]: v + for k, v in request.query_params.items() + if k.startswith("metadata.") + } + # mds_query_params = {} + # for k, v in request.query_params.multi_items(): + # queries.setdefault(key, []).append(value) + # XXX just pass kwargs? - metadata_objects = await search_metadata( - request, data=data, limit=limit, offset=offset + metadata_objects = await search_metadata_helper( + mds_query_params, data=data, limit=limit, offset=offset ) try: endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + f"/index" + indexd_query_params = { + k[len("record.") :]: v + for k, v in request.query_params.items() + if k.startswith("record.") + } + indexd_query_params["limit"] = limit response = await request.app.async_client.get( - endpoint, params=request.query_params + endpoint, params=indexd_query_params ) response.raise_for_status() - records = response.json() + records = {r["did"]: r for r in response.json()["records"]} except httpx.HTTPError as err: logger.debug(err) if err.response and err.response.status_code == 404: @@ -272,9 +291,10 @@ async def get_objects( # XXX better message msg = f"Unable to get successful response from indexd's /index endpoint" logger.error(f"{msg}\nException:\n{err}", exc_info=True) + # XXX would 400-range error be more appropriate? 
raise - records = {r["did"]: r for r in records["records"]} + # XXX need to handle data=False # import pdb; pdb.set_trace() response = { guid: {"record": records[guid], "metadata": o} diff --git a/src/mds/query.py b/src/mds/query.py index c5013b78..edf291b4 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -19,6 +19,28 @@ async def search_metadata( 10, description="Maximum number of records returned. (max: 2000)" ), offset: int = Query(0, description="Return results at this given offset."), +): + """ + XXX comments + """ + return search_metadata_helper( + request.query_params, data=data, limit=limit, offset=offset + ) + + +# XXX rename/update docstring +async def search_metadata_helper( + # request: Request, + query_params: dict, + data: bool = Query( + False, + description="Switch to returning a list of GUIDs (false), " + "or GUIDs mapping to their metadata (true).", + ), + limit: int = Query( + 10, description="Maximum number of records returned. (max: 2000)" + ), + offset: int = Query(0, description="Return results at this given offset."), ): """Search the metadata. 
@@ -60,12 +82,14 @@ async def search_metadata( """ limit = min(limit, 2000) queries = {} - for key, value in request.query_params.multi_items(): + # for key, value in request.query_params.multi_items(): + for key, value in query_params.items(): if key not in {"data", "limit", "offset"}: queries.setdefault(key, []).append(value) def add_filter(query): for path, values in queries.items(): + # import pdb; pdb.set_trace() query = query.where( db.or_(Metadata.data[list(path.split("."))].astext == v for v in values) ) From da7f330dbdb5bb7f156076b38ec05c57f7613b7b Mon Sep 17 00:00:00 2001 From: John McCann Date: Mon, 7 Dec 2020 23:46:08 -0800 Subject: [PATCH 03/39] feat(GET /objects): filter resource paths LIKE val --- src/mds/objects.py | 41 +++++++++++++++++++-------------- src/mds/query.py | 56 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index fec79fe7..ca613dbc 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -6,7 +6,7 @@ from fastapi import HTTPException, APIRouter, Security, Query from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials import httpx -from pydantic import BaseModel +from pydantic import BaseModel, Json from starlette.datastructures import QueryParams from starlette.requests import Request from starlette.responses import JSONResponse @@ -240,27 +240,31 @@ async def get_objects( 10, description="Maximum number of records returned. (max: 2000)" ), offset: int = Query(0, description="Return results at this given offset."), - _request_paths: str = Query( - "*", description="Return results at this given offset." 
- ), + # XXX description + filter: Json = Query(Json(), description="The filters!"), ) -> JSONResponse: """ XXX comments """ # import pdb; pdb.set_trace() - mds_query_params = { - k[len("metadata.") :]: v - for k, v in request.query_params.items() - if k.startswith("metadata.") - } - # mds_query_params = {} + # mds_query_params = { + # k[len("metadata.") :]: v + # for k, v in request.query_params.items() + # if k.startswith("metadata.") + # } + # queries = {} # for k, v in request.query_params.multi_items(): # queries.setdefault(key, []).append(value) # XXX just pass kwargs? metadata_objects = await search_metadata_helper( - mds_query_params, data=data, limit=limit, offset=offset + # mds_query_params, data=data, limit=limit, offset=offset + data=data, + limit=limit, + offset=offset, + filter=filter + # data=data, limit=limit, offset=offset, filters=filter ) try: @@ -291,16 +295,19 @@ async def get_objects( # XXX better message msg = f"Unable to get successful response from indexd's /index endpoint" logger.error(f"{msg}\nException:\n{err}", exc_info=True) - # XXX would 400-range error be more appropriate? + # XXX would 400-range error be more appropriate? or maybe only return mds objects? 
raise # XXX need to handle data=False # import pdb; pdb.set_trace() - response = { - guid: {"record": records[guid], "metadata": o} - for guid, o in metadata_objects.items() - if guid in records - } + if type(metadata_objects) is dict: + response = { + guid: {"record": records[guid], "metadata": o} + for guid, o in metadata_objects.items() + if guid in records + } + else: + response = metadata_objects return response diff --git a/src/mds/query.py b/src/mds/query.py index edf291b4..cb59764c 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -1,4 +1,8 @@ +import operator + from fastapi import HTTPException, Query, APIRouter +from pydantic import Json +from sqlalchemy.sql import column from starlette.requests import Request from starlette.status import HTTP_404_NOT_FOUND @@ -8,6 +12,7 @@ @mod.get("/metadata") +# XXX fix tests async def search_metadata( request: Request, data: bool = Query( @@ -23,15 +28,17 @@ async def search_metadata( """ XXX comments """ - return search_metadata_helper( + # XXX return await + response = await search_metadata_helper( request.query_params, data=data, limit=limit, offset=offset ) + return response # XXX rename/update docstring async def search_metadata_helper( # request: Request, - query_params: dict, + # query_params: dict, data: bool = Query( False, description="Switch to returning a list of GUIDs (false), " @@ -41,6 +48,8 @@ async def search_metadata_helper( 10, description="Maximum number of records returned. (max: 2000)" ), offset: int = Query(0, description="Return results at this given offset."), + # XXX description + filter: Json = Query(Json(), description="The filters!"), ): """Search the metadata. 
@@ -81,18 +90,43 @@ async def search_metadata_helper( """ limit = min(limit, 2000) - queries = {} + # queries = {} # for key, value in request.query_params.multi_items(): - for key, value in query_params.items(): - if key not in {"data", "limit", "offset"}: - queries.setdefault(key, []).append(value) + # for key, value in query_params.items(): + # if key not in {"data", "limit", "offset"}: + # queries.setdefault(key, []).append(value) + filter_operators = {"eq": operator.eq, "in": operator.contains} + # XXX should maybe default query def add_filter(query): - for path, values in queries.items(): - # import pdb; pdb.set_trace() - query = query.where( - db.or_(Metadata.data[list(path.split("."))].astext == v for v in values) - ) + # for path, values in queries.items(): + # query = query.where( + # db.or_(Metadata.data[list(path.split("."))].astext == v for v in values) + # ) + for _filter in filter: + if _filter["op"] == "eq": + query = query.where( + Metadata.data[list(_filter["name"].split("."))].astext + == _filter["val"] + ) + elif _filter["op"] == "has": + query = query.where( + Metadata.data[list(_filter["name"].split("."))].has_key( + _filter["val"] + ) + # XXX from sqlalchemy for line below: NotImplementedError: Operator 'contains' is not supported on this expression + # _filter['val'] in Metadata.data[list(_filter['name'].split("."))] + ) + elif _filter["op"] == "any": + array_func = db.func.jsonb_array_elements_text( + Metadata.data[list(_filter["name"].split("."))] + ).alias() + query = ( + db.select(Metadata) + .select_from(Metadata) + .select_from(array_func) + .where(column(array_func.name).like(_filter["val"])) + ) return query.offset(offset).limit(limit) if data: From d639b2b72cc7969f44d140fc1c8d155748cc45f8 Mon Sep 17 00:00:00 2001 From: John McCann Date: Wed, 9 Dec 2020 23:36:07 -0800 Subject: [PATCH 04/39] feat(GET /objects): map filter operators w/ dict --- src/mds/query.py | 74 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 
insertions(+), 24 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index cb59764c..9bcb96fd 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -2,7 +2,9 @@ from fastapi import HTTPException, Query, APIRouter from pydantic import Json -from sqlalchemy.sql import column +import sqlalchemy +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.sql.operators import ColumnOperators from starlette.requests import Request from starlette.status import HTTP_404_NOT_FOUND @@ -95,7 +97,18 @@ async def search_metadata_helper( # for key, value in query_params.items(): # if key not in {"data", "limit", "offset"}: # queries.setdefault(key, []).append(value) - filter_operators = {"eq": operator.eq, "in": operator.contains} + filter_operators = { + "any": (None, None), + "eq": ColumnOperators.__eq__, + "ne": ColumnOperators.__ne__, + # "gte": ColumnOperators.__gte__, + # XXX check how mongoDB filters for null (does it use it, if an object doesn't have a given key, is that considered null, etc.) 
+ "is": ColumnOperators.is_, + "is_not": ColumnOperators.isnot, + "like": (ColumnOperators.like, str), + # "like": (ColumnOperators.like, str, + } + # import pdb; pdb.set_trace() # XXX should maybe default query def add_filter(query): @@ -104,29 +117,42 @@ def add_filter(query): # db.or_(Metadata.data[list(path.split("."))].astext == v for v in values) # ) for _filter in filter: - if _filter["op"] == "eq": - query = query.where( - Metadata.data[list(_filter["name"].split("."))].astext - == _filter["val"] - ) - elif _filter["op"] == "has": - query = query.where( - Metadata.data[list(_filter["name"].split("."))].has_key( - _filter["val"] + json_object = Metadata.data[list(_filter["name"].split("."))] + operator = filter_operators[_filter["op"]] + other = sqlalchemy.cast(_filter["val"], JSONB) + if type(operator) is tuple: + if operator[0] is None: + # XXX not necessarily going to always be text + # json_object = db.func.jsonb_array_elements_text(Metadata.data[list(_filter["name"].split("."))]).alias() + # XXX going to have to do this recursively + # operator = filter_operators[_filter["val"]["op"]] + # operator = filter_operators[_filter["val"]["op"]][0] + # other = _filter["val"]["val"] + + # XXX handle duplicates + # XXX any and has have to be used with other scalar operation + # (i.e. (_resource_paths,:any,(,:like,"/programs/*")). 
whether + # there is a key for the scalar comparator might determine + # whether to use jsonb_array_elements + array_func = db.func.jsonb_array_elements_text( + Metadata.data[list(_filter["name"].split("."))] + ).alias() + query = ( + db.select(Metadata) + .select_from(Metadata) + .select_from(array_func) + .where(sqlalchemy.column(array_func.name).like(_filter["val"])) ) - # XXX from sqlalchemy for line below: NotImplementedError: Operator 'contains' is not supported on this expression - # _filter['val'] in Metadata.data[list(_filter['name'].split("."))] - ) - elif _filter["op"] == "any": - array_func = db.func.jsonb_array_elements_text( - Metadata.data[list(_filter["name"].split("."))] - ).alias() - query = ( - db.select(Metadata) - .select_from(Metadata) - .select_from(array_func) - .where(column(array_func.name).like(_filter["val"])) - ) + return query.offset(offset).limit(limit) + + else: + if len(operator) > 1 and operator[1] is str: + json_object = json_object.astext + operator = operator[0] + other = _filter["val"] + + query = query.where(operator(json_object, other)) + return query.offset(offset).limit(limit) if data: From 6a6093f8cc6d36bdf04195fc04a304da82167ca0 Mon Sep 17 00:00:00 2001 From: John McCann Date: Fri, 11 Dec 2020 21:01:31 -0800 Subject: [PATCH 05/39] feat(GET /objects): filter all of array where op --- src/mds/query.py | 128 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 92 insertions(+), 36 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index 9bcb96fd..4e3c01a2 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -3,7 +3,7 @@ from fastapi import HTTPException, Query, APIRouter from pydantic import Json import sqlalchemy -from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.dialects.postgresql import ARRAY, JSONB from sqlalchemy.sql.operators import ColumnOperators from starlette.requests import Request from starlette.status import HTTP_404_NOT_FOUND @@ -97,18 +97,31 @@ async def 
search_metadata_helper( # for key, value in query_params.items(): # if key not in {"data", "limit", "offset"}: # queries.setdefault(key, []).append(value) + # XXX using __op__? filter_operators = { - "any": (None, None), + # "all": (None, None), + # "any": (None, None), + "all": None, + "any": None, "eq": ColumnOperators.__eq__, "ne": ColumnOperators.__ne__, - # "gte": ColumnOperators.__gte__, + "gt": ColumnOperators.__gt__, + "gte": ColumnOperators.__ge__, + "lt": ColumnOperators.__lt__, + "lte": ColumnOperators.__le__, # XXX check how mongoDB filters for null (does it use it, if an object doesn't have a given key, is that considered null, etc.) + # XXX in is probably unnecessary (i.e. instead of (score, :in, [1, 2]) do (OR (score, :eq, 1), (score, :eq, 2))) + # "in": ColumnOperators.in_, + # XXX what does SQL IS mean? "is": ColumnOperators.is_, "is_not": ColumnOperators.isnot, - "like": (ColumnOperators.like, str), - # "like": (ColumnOperators.like, str, + # "like": (ColumnOperators.like, str), + "like": ColumnOperators.like, + # "like": (ColumnOperators.like, str), } - # import pdb; pdb.set_trace() + textual_operators = ["like"] + other_not_to_be_cast = ["like", "in"] + other_type = {"in": ARRAY} # XXX should maybe default query def add_filter(query): @@ -116,40 +129,83 @@ def add_filter(query): # query = query.where( # db.or_(Metadata.data[list(path.split("."))].astext == v for v in values) # ) + + # XXX make this an optional url query param for _filter in filter: json_object = Metadata.data[list(_filter["name"].split("."))] operator = filter_operators[_filter["op"]] other = sqlalchemy.cast(_filter["val"], JSONB) - if type(operator) is tuple: - if operator[0] is None: - # XXX not necessarily going to always be text - # json_object = db.func.jsonb_array_elements_text(Metadata.data[list(_filter["name"].split("."))]).alias() - # XXX going to have to do this recursively - # operator = filter_operators[_filter["val"]["op"]] - # operator = 
filter_operators[_filter["val"]["op"]][0] - # other = _filter["val"]["val"] - - # XXX handle duplicates - # XXX any and has have to be used with other scalar operation - # (i.e. (_resource_paths,:any,(,:like,"/programs/*")). whether - # there is a key for the scalar comparator might determine - # whether to use jsonb_array_elements - array_func = db.func.jsonb_array_elements_text( - Metadata.data[list(_filter["name"].split("."))] - ).alias() - query = ( - db.select(Metadata) - .select_from(Metadata) - .select_from(array_func) - .where(sqlalchemy.column(array_func.name).like(_filter["val"])) - ) - return query.offset(offset).limit(limit) - - else: - if len(operator) > 1 and operator[1] is str: - json_object = json_object.astext - operator = operator[0] - other = _filter["val"] + # XXX this could be used for has_key (e.g. for does have bears key: (teams, :any, (,:eq, "bears"))) + if _filter["op"] == "any": + + # XXX handle duplicates + # XXX any and has have to be used with other scalar operation + # (i.e. (_resource_paths,:any,(,:like,"/programs/*")). 
whether + # there is a key for the scalar comparator might determine + # whether to use jsonb_array_elements + + # e = sqlalchemy.exists(query.select('*').select_from(f).where(f.like('%'))) + # e = sqlalchemy.exists(query.select(f.name).select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) + # e = sqlalchemy.exists(query.select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) + # e = sqlalchemy.exists(query.select(sqlalchemy.text(f.name)).select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) + # e = sqlalchemy.exists(db.all().select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) + # e = sqlalchemy.exists(db.select(f.name).select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) + + # works + # e = sqlalchemy.exists(db.select('*').select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) + + other = sqlalchemy.cast(_filter["val"]["val"], JSONB) + f = db.func.jsonb_array_elements + if _filter["val"]["op"] in textual_operators: + f = db.func.jsonb_array_elements_text + + if _filter["val"]["op"] in other_not_to_be_cast: + other = _filter["val"]["val"] + + # if _filter["val"]["op"] in other_type: + # other = sqlalchemy.cast(_filter["val"]["val"], other_type[_filter["val"]["op"]]) + + f = f(Metadata.data[list(_filter["name"].split("."))]).alias() + operator = filter_operators[_filter["val"]["op"]] + + e = sqlalchemy.exists( + db.select("*") + .select_from(f) + .where(operator(sqlalchemy.column(f.name), other)) + ) + + query = query.where(e) + + return query.offset(offset).limit(limit) + + elif _filter["op"] == "all": + count_f = db.func.jsonb_array_length(json_object) + f = db.func.jsonb_array_elements + + if _filter["val"]["op"] in textual_operators: + f = db.func.jsonb_array_elements_text + f = f(json_object).alias() + + operator = filter_operators[_filter["val"]["op"]] + + if _filter["val"]["op"] in other_not_to_be_cast: + other = _filter["val"]["val"] + + sq = ( + 
db.select([db.func.count()]) + .select_from(f) + .where(operator(sqlalchemy.column(f.name), other)) + ) + query = query.where(count_f == sq) + + return query.offset(offset).limit(limit) + + # elif _filter["op"] == "is": + + if _filter["op"] == "like": + json_object = json_object.astext + operator = operator[0] + other = _filter["val"] query = query.where(operator(json_object, other)) From 683ed5d6aa26ed4a981e320a16fb077c903b478f Mon Sep 17 00:00:00 2001 From: John McCann Date: Sat, 12 Dec 2020 23:26:50 -0800 Subject: [PATCH 06/39] feat(GET /objects): POST /bulk/documents in indexd --- src/mds/objects.py | 47 +++++++++++++++++++++------------------------- src/mds/query.py | 3 +-- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index ca613dbc..c7fd3757 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -241,6 +241,7 @@ async def get_objects( ), offset: int = Query(0, description="Return results at this given offset."), # XXX description + # XXX how to name this variable something other than filter filter: Json = Query(Json(), description="The filters!"), ) -> JSONResponse: """ @@ -267,44 +268,38 @@ async def get_objects( # data=data, limit=limit, offset=offset, filters=filter ) + records = {} try: - endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + f"/index" - indexd_query_params = { - k[len("record.") :]: v - for k, v in request.query_params.items() - if k.startswith("record.") - } - indexd_query_params["limit"] = limit - response = await request.app.async_client.get( - endpoint, params=indexd_query_params + endpoint_path = "/bulk/documents" + full_endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path + response = await request.app.async_client.post( + full_endpoint, json=list(metadata_objects.keys()) ) response.raise_for_status() - - records = {r["did"]: r for r in response.json()["records"]} + records = {r["did"]: r for r in response.json()} except httpx.HTTPError as err: - 
logger.debug(err) - if err.response and err.response.status_code == 404: - # XXX better error message - msg = f"Got a 404 from indexd's /index endpoint :(" - logger.debug(msg) - raise HTTPException( - HTTP_404_NOT_FOUND, - msg, + logger.debug(err, exc_info=True) + # logger.debug(err) + if err.response: + # logger.error(f"indexd `POST /bulk/documents` endpoint returned a {err.response.status_code} HTTP status code") + logger.error( + "indexd `POST %s` endpoint returned a %s HTTP status code", + endpoint_path, + err.response.status_code, ) else: - # XXX better message - msg = f"Unable to get successful response from indexd's /index endpoint" - logger.error(f"{msg}\nException:\n{err}", exc_info=True) - # XXX would 400-range error be more appropriate? or maybe only return mds objects? - raise + logger.error( + "Unable to get a response from indexd `POST %s` endpoint", endpoint_path + ) + # XXX exception info needed? + # logger.error(f"{msg}\nException:\n{err}", exc_info=True) # XXX need to handle data=False # import pdb; pdb.set_trace() if type(metadata_objects) is dict: response = { - guid: {"record": records[guid], "metadata": o} + guid: {"record": records[guid] if guid in records else {}, "metadata": o} for guid, o in metadata_objects.items() - if guid in records } else: response = metadata_objects diff --git a/src/mds/query.py b/src/mds/query.py index 4e3c01a2..c3e57ce2 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -98,9 +98,8 @@ async def search_metadata_helper( # if key not in {"data", "limit", "offset"}: # queries.setdefault(key, []).append(value) # XXX using __op__? + # XXX aggregate functions (e.g size, max, etc.) 
filter_operators = { - # "all": (None, None), - # "any": (None, None), "all": None, "any": None, "eq": ColumnOperators.__eq__, From 05c879cb27eb93aba051783499a08b338258162a Mon Sep 17 00:00:00 2001 From: John McCann Date: Sun, 13 Dec 2020 16:58:47 -0800 Subject: [PATCH 07/39] feat(GET /objects): filter w/ () syntax (x,:eq,42) --- src/mds/objects.py | 11 ++-- src/mds/query.py | 154 ++++++++++++++++++++++----------------------- 2 files changed, 81 insertions(+), 84 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index c7fd3757..b285ef15 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -7,7 +7,6 @@ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials import httpx from pydantic import BaseModel, Json -from starlette.datastructures import QueryParams from starlette.requests import Request from starlette.responses import JSONResponse from starlette.status import ( @@ -23,7 +22,7 @@ from . import config, logger from .models import Metadata -from .query import search_metadata_helper, get_metadata +from .query import get_metadata, search_metadata_helper mod = APIRouter() @@ -231,6 +230,7 @@ async def create_object_for_id( async def get_objects( request: Request, # XXX how to handle this being False + # XXX what is Query? 
data: bool = Query( False, description="Switch to returning a list of GUIDs (false), " @@ -242,7 +242,8 @@ async def get_objects( offset: int = Query(0, description="Return results at this given offset."), # XXX description # XXX how to name this variable something other than filter - filter: Json = Query(Json(), description="The filters!"), + # filter: Json = Query(Json(), description="The filters!"), + filter: str = Query("", description="The filters!"), ) -> JSONResponse: """ XXX comments @@ -279,9 +280,7 @@ async def get_objects( records = {r["did"]: r for r in response.json()} except httpx.HTTPError as err: logger.debug(err, exc_info=True) - # logger.debug(err) if err.response: - # logger.error(f"indexd `POST /bulk/documents` endpoint returned a {err.response.status_code} HTTP status code") logger.error( "indexd `POST %s` endpoint returned a %s HTTP status code", endpoint_path, @@ -291,8 +290,6 @@ async def get_objects( logger.error( "Unable to get a response from indexd `POST %s` endpoint", endpoint_path ) - # XXX exception info needed? - # logger.error(f"{msg}\nException:\n{err}", exc_info=True) # XXX need to handle data=False # import pdb; pdb.set_trace() diff --git a/src/mds/query.py b/src/mds/query.py index c3e57ce2..521110a5 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -1,9 +1,9 @@ -import operator +import json from fastapi import HTTPException, Query, APIRouter from pydantic import Json -import sqlalchemy -from sqlalchemy.dialects.postgresql import ARRAY, JSONB +from sqlalchemy import cast, column, exists +from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.sql.operators import ColumnOperators from starlette.requests import Request from starlette.status import HTTP_404_NOT_FOUND @@ -15,6 +15,7 @@ @mod.get("/metadata") # XXX fix tests +# XXX maybe translate old way to use filter argument here? 
async def search_metadata( request: Request, data: bool = Query( @@ -26,32 +27,6 @@ async def search_metadata( 10, description="Maximum number of records returned. (max: 2000)" ), offset: int = Query(0, description="Return results at this given offset."), -): - """ - XXX comments - """ - # XXX return await - response = await search_metadata_helper( - request.query_params, data=data, limit=limit, offset=offset - ) - return response - - -# XXX rename/update docstring -async def search_metadata_helper( - # request: Request, - # query_params: dict, - data: bool = Query( - False, - description="Switch to returning a list of GUIDs (false), " - "or GUIDs mapping to their metadata (true).", - ), - limit: int = Query( - 10, description="Maximum number of records returned. (max: 2000)" - ), - offset: int = Query(0, description="Return results at this given offset."), - # XXX description - filter: Json = Query(Json(), description="The filters!"), ): """Search the metadata. @@ -90,52 +65,90 @@ async def search_metadata_helper( {"a": {"b": {"d": 5}}} {"a": {"b": {"c": "333", "d": 4}}} + """ + return await search_metadata_helper( + # request.query_params, data=data, limit=limit, offset=offset + data=data, + limit=limit, + offset=offset, + filter={}, + ) + + +# XXX rename/update docstring +async def search_metadata_helper( + data: bool = Query( + False, + description="Switch to returning a list of GUIDs (false), " + "or GUIDs mapping to their metadata (true).", + ), + limit: int = Query( + 10, description="Maximum number of records returned. 
(max: 2000)" + ), + offset: int = Query(0, description="Return results at this given offset."), + # XXX description + # XXX how to name this variable something other than filter + # filter: Json = Query(Json(), description="The filters!"), + filter: str = Query("", description="The filters!"), +): + """ + XXX comments """ limit = min(limit, 2000) - # queries = {} - # for key, value in request.query_params.multi_items(): - # for key, value in query_params.items(): - # if key not in {"data", "limit", "offset"}: - # queries.setdefault(key, []).append(value) # XXX using __op__? # XXX aggregate functions (e.g size, max, etc.) filter_operators = { - "all": None, - "any": None, - "eq": ColumnOperators.__eq__, - "ne": ColumnOperators.__ne__, - "gt": ColumnOperators.__gt__, - "gte": ColumnOperators.__ge__, - "lt": ColumnOperators.__lt__, - "lte": ColumnOperators.__le__, + ":all": None, + ":any": None, + ":eq": ColumnOperators.__eq__, + ":ne": ColumnOperators.__ne__, + ":gt": ColumnOperators.__gt__, + ":gte": ColumnOperators.__ge__, + ":lt": ColumnOperators.__lt__, + ":lte": ColumnOperators.__le__, # XXX check how mongoDB filters for null (does it use it, if an object doesn't have a given key, is that considered null, etc.) # XXX in is probably unnecessary (i.e. instead of (score, :in, [1, 2]) do (OR (score, :eq, 1), (score, :eq, 2))) - # "in": ColumnOperators.in_, + # ":in": ColumnOperators.in_, # XXX what does SQL IS mean? 
- "is": ColumnOperators.is_, - "is_not": ColumnOperators.isnot, - # "like": (ColumnOperators.like, str), - "like": ColumnOperators.like, - # "like": (ColumnOperators.like, str), + ":is": ColumnOperators.is_, + ":is_not": ColumnOperators.isnot, + ":like": ColumnOperators.like, } - textual_operators = ["like"] - other_not_to_be_cast = ["like", "in"] - other_type = {"in": ARRAY} + textual_operators = [":like"] + other_not_to_be_cast = [":like"] + + # XXX comments + def parantheses_to_json(s): + if not s: + return {} + # XXX what if only one is a parantheses? + if s[0] != "(" and s[-1] != ")": + try: + return json.loads(s) + except: + return s + + f = s[1:-1].split(",", 2) + # f = {"name": f[0], "op": f[1].strip(":"), "val": parantheses_to_json(f[2])} + f = {"name": f[0], "op": f[1], "val": parantheses_to_json(f[2])} + return f + + filter = [parantheses_to_json(filter)] # XXX should maybe default query + # XXX comments def add_filter(query): - # for path, values in queries.items(): - # query = query.where( - # db.or_(Metadata.data[list(path.split("."))].astext == v for v in values) - # ) # XXX make this an optional url query param for _filter in filter: + if not _filter: + continue + json_object = Metadata.data[list(_filter["name"].split("."))] operator = filter_operators[_filter["op"]] - other = sqlalchemy.cast(_filter["val"], JSONB) + other = cast(_filter["val"], JSONB) # XXX this could be used for has_key (e.g. 
for does have bears key: (teams, :any, (,:eq, "bears"))) - if _filter["op"] == "any": + if _filter["op"] == ":any": # XXX handle duplicates # XXX any and has have to be used with other scalar operation @@ -143,17 +156,7 @@ def add_filter(query): # there is a key for the scalar comparator might determine # whether to use jsonb_array_elements - # e = sqlalchemy.exists(query.select('*').select_from(f).where(f.like('%'))) - # e = sqlalchemy.exists(query.select(f.name).select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) - # e = sqlalchemy.exists(query.select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) - # e = sqlalchemy.exists(query.select(sqlalchemy.text(f.name)).select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) - # e = sqlalchemy.exists(db.all().select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) - # e = sqlalchemy.exists(db.select(f.name).select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) - - # works - # e = sqlalchemy.exists(db.select('*').select_from(f).where(sqlalchemy.column(f.name).like(_filter['val']))) - - other = sqlalchemy.cast(_filter["val"]["val"], JSONB) + other = cast(_filter["val"]["val"], JSONB) f = db.func.jsonb_array_elements if _filter["val"]["op"] in textual_operators: f = db.func.jsonb_array_elements_text @@ -161,23 +164,21 @@ def add_filter(query): if _filter["val"]["op"] in other_not_to_be_cast: other = _filter["val"]["val"] - # if _filter["val"]["op"] in other_type: - # other = sqlalchemy.cast(_filter["val"]["val"], other_type[_filter["val"]["op"]]) - f = f(Metadata.data[list(_filter["name"].split("."))]).alias() operator = filter_operators[_filter["val"]["op"]] - e = sqlalchemy.exists( + e = exists( + # XXX select 1 db.select("*") .select_from(f) - .where(operator(sqlalchemy.column(f.name), other)) + .where(operator(column(f.name), other)) ) query = query.where(e) return query.offset(offset).limit(limit) - elif _filter["op"] == "all": + elif 
_filter["op"] == ":all": count_f = db.func.jsonb_array_length(json_object) f = db.func.jsonb_array_elements @@ -193,7 +194,7 @@ def add_filter(query): sq = ( db.select([db.func.count()]) .select_from(f) - .where(operator(sqlalchemy.column(f.name), other)) + .where(operator(column(f.name), other)) ) query = query.where(count_f == sq) @@ -201,9 +202,8 @@ def add_filter(query): # elif _filter["op"] == "is": - if _filter["op"] == "like": + if _filter["op"] in textual_operators: json_object = json_object.astext - operator = operator[0] other = _filter["val"] query = query.where(operator(json_object, other)) From bf3a6f68cadd1c5d85a3a1980cab120355c627a7 Mon Sep 17 00:00:00 2001 From: John McCann Date: Mon, 14 Dec 2020 18:48:19 -0800 Subject: [PATCH 08/39] feat(GET /objects): parse filter with parsimonious --- poetry.lock | 16 ++++- pyproject.toml | 1 + src/mds/query.py | 171 +++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 173 insertions(+), 15 deletions(-) diff --git a/poetry.lock b/poetry.lock index 63044c08..03055cbc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -447,6 +447,17 @@ version = "20.4" pyparsing = ">=2.0.2" six = "*" +[[package]] +category = "main" +description = "(Soon to be) the fastest pure-Python PEG parser I could muster" +name = "parsimonious" +optional = false +python-versions = "*" +version = "0.8.1" + +[package.dependencies] +six = ">=1.9.0" + [[package]] category = "dev" description = "plugin and hook calling mechanisms for python" @@ -760,7 +771,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] testing = ["pytest (>=3.5,<3.7.3 || >3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "jaraco.test (>=3.2.0)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] [metadata] -content-hash = "3f374a92fd69decbe68583c9b06f0cfee76137bd8a76d5f93c1cd081ab070c1f" +content-hash = "c371a8caa85a575895030ef7106791216179d84c54d875d06fd56b1088ec7002" python-versions = "^3.7" 
[metadata.files] @@ -1060,6 +1071,9 @@ packaging = [ {file = "packaging-20.4-py2.py3-none-any.whl", hash = "sha256:998416ba6962ae7fbd6596850b80e17859a5753ba17c32284f67bfff33784181"}, {file = "packaging-20.4.tar.gz", hash = "sha256:4357f74f47b9c12db93624a82154e9b120fa8293699949152b22065d556079f8"}, ] +parsimonious = [ + {file = "parsimonious-0.8.1.tar.gz", hash = "sha256:3add338892d580e0cb3b1a39e4a1b427ff9f687858fdd61097053742391a9f6b"}, +] pluggy = [ {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, diff --git a/pyproject.toml b/pyproject.toml index e9a22925..39c35857 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ indexclient = "^2.1.0" httpx = "^0.12.1" authutils = "^5.0.4" cdislogging = "^1.0.0" +parsimonious = "0.8.1" [tool.poetry.dev-dependencies] pytest = "^5.3" diff --git a/src/mds/query.py b/src/mds/query.py index 521110a5..4c3eee6d 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -8,6 +8,8 @@ from starlette.requests import Request from starlette.status import HTTP_404_NOT_FOUND +from parsimonious.grammar import Grammar + from .models import db, Metadata mod = APIRouter() @@ -66,12 +68,47 @@ async def search_metadata( {"a": {"b": {"c": "333", "d": 4}}} """ + limit = min(limit, 2000) + queries = {} + for key, value in request.query_params.multi_items(): + if key not in {"data", "limit", "offset"}: + queries.setdefault(key, []).append(value) + + # def add_filter(query): + def add_filters(): + filter_param = "(and" + for path, values in queries.items(): + filter_param += ",(or" + for v in values: + filter_param += f",({path},:like,{v})" + # query = query.where( + # db.or_(Metadata.data[list(path.split("."))].astext == v for v in values) + # db.or_(Metadata.data[list(path.split("."))].astext == v) + # ) + filter_param += ")" + filter_param += 
")" + return filter_param + # return query.offset(offset).limit(limit) + + # if data: + # return { + # metadata.guid: metadata.data + # for metadata in await add_filter(Metadata.query).gino.all() + # } + # else: + # return [ + # row[0] + # for row in await add_filter(db.select([Metadata.guid])) + # .gino.return_model(False) + # .all() + # ] + return await search_metadata_helper( # request.query_params, data=data, limit=limit, offset=offset data=data, limit=limit, offset=offset, - filter={}, + filter=add_filters(), ) @@ -94,6 +131,8 @@ async def search_metadata_helper( """ XXX comments """ + # import pdb; pdb.set_trace() + limit = min(limit, 2000) # XXX using __op__? # XXX aggregate functions (e.g size, max, etc.) @@ -117,21 +156,112 @@ async def search_metadata_helper( textual_operators = [":like"] other_not_to_be_cast = [":like"] + def fun_parser(s): + # grammar = Grammar( + # """ + # bold_text = bold_open text bold_close + # text = ~"[A-Z 0-9]*"i + # bold_open = "((" + # bold_close = "))" + # """) + + # '(and,(or,(fun_fact,:like,rhinos_rock),(fun_fact,:like,cats_rock)),(or,(_uploader_id,:like,57)))' + # (_uploader_id,:eq,“57”) + # (or,(_uploader_id,:like,57)) + + # compound_filter = open boolean separator scalar_filter close + # boolean = "and" + # json_value = "57" / "rhinosrock" / "catsrock" + # json_value = ~"[A-Z 0-9_]*"i + grammar = Grammar( + """ + boolean_filter = scalar_filter / (open boolean (separator (scalar_filter / boolean_filter))+ close) + boolean = "and" / "or" + scalar_filter = open json_key separator operator separator (json_value / scalar_filter) close + json_key = ~"[A-Z 0-9_]*"i + # json_value = "57" / "rhinosrock" / "catsrock" + operator = ":eq" / ":any" + open = "(" + close = ")" + separator = "," + + json_value = true_false_null / number / doubleString + # json_value = true_false_null / number + true_false_null = "true" / "false" / "null" + number = (int frac exp) / (int exp) / (int frac) / int + int = "-"? 
((digit1to9 digits) / digit) + frac = "." digits + exp = e digits + digits = digit+ + e = "e+" / "e-" / "e" / "E+" / "E-" / "E" + digit1to9 = ~"[1-9]" + digit = ~"[0-9]" + + # string = ~"\"[^\"]*\"" + # string = ~"[\".*\"]" + # string = ~"\".*\"" + # doubleString = ~'"([^"]|(\"))*"' + doubleString = ~'"([^"])*"' + """ + ) + # o = grammar.parse('(fun_fact,:eq,"rhinos_rock")') + # o = grammar.parse('(and,(or,(fun_fact,:eq,"rhinosrock"),(funfact,:eq,"catsrock")),(or,(uploaderid,:eq,57)))') + # import pdb; pdb.set_trace() + + # json_value = "\"57\"" + # print(grammar.parse('((bold stuff))')) + # print(grammar.parse('(_uploader_id,:eq,57)')) + # print(grammar.parse('(_uploader_id,:eq,true)')) + # print(grammar.parse('(_uploader_id,:eq,false)')) + # print(grammar.parse('(_uploader_id,:eq,null)')) + # print(grammar.parse('(fun_fact,:eq,"rhinos_rock")')) + # print(grammar.parse('(uploader,:eq,"57")')) + # print(grammar.parse('(uploader,:eq,"57")')) + # print(grammar.parse('(paths,:any,(,:eq,57))')) + # print(grammar.parse('(and,(uploader,:eq,57))')) + # print(grammar.parse('(or,(uploader,:eq,57))')) + # print(grammar.parse('(and,(or,(funfact,:eq,rhinosrock),(funfact,:eq,catsrock)),(or,(uploaderid,:eq,57)))')) + print( + grammar.parse( + '(and,(or,(fun_fact,:eq,"rhinosrock"),(funfact,:eq,"catsrock")),(or,(uploaderid,:eq,57)))' + ) + ) + + # print(grammar.parse('(paths,:eq,(,:eq,57))')) + # print(grammar.parse('(and,(uploader,:eq,57))')) + # print(grammar.parse('(,:eq,57)')) + + fun_parser(filter) + # XXX comments def parantheses_to_json(s): - if not s: - return {} - # XXX what if only one is a parantheses? - if s[0] != "(" and s[-1] != ")": - try: - return json.loads(s) - except: - return s - - f = s[1:-1].split(",", 2) - # f = {"name": f[0], "op": f[1].strip(":"), "val": parantheses_to_json(f[2])} - f = {"name": f[0], "op": f[1], "val": parantheses_to_json(f[2])} - return f + def scalar_to_json(s): + if not s: + return {} + # XXX what if only one is a parantheses? 
+ if s[0] != "(" and s[-1] != ")": + try: + return json.loads(s) + except: + return s + + f = s[1:-1].split(",", 2) + f = {"name": f[0], "op": f[1], "val": parantheses_to_json(f[2])} + return f + + # XXX DRY with boolean_operators = ["or", "and"] + # scalar_operator_pattern = r'\({json_key_pattern}*,{operator_pattern},{json_value_pattern}\)' + # compund_operator_pattern = r'\({json_key_pattern}*,{operator_pattern},({scalar_value_pattern}|{json_value_pattern})\)' + # # filter_pattern = r'\({{and|or}{,filter_pattern}+}|{compund_operator}\)' + # boolean_filters_pattern = r'\({{and|or}{,compund_operator}+}\)' + # filter_pattern = r'' + # r‘\((and|or)(,\([.+]]\))\)’ + # if s.startswith('(or'): + # return {"or": [parantheses_to_json(s[3:-1])]} + # if s.startswith('(and'): + # return {"and": parantheses_to_json(s[4:-1])} + + return scalar_to_json(s) filter = [parantheses_to_json(filter)] @@ -207,6 +337,19 @@ def add_filter(query): other = _filter["val"] query = query.where(operator(json_object, other)) + # query = query.where(db.or_(operator(json_object, other), ColumnOperators.__eq__(json_object, cast("cats_rock", JSONB)))) + # query = query.where(db.and_(operator(json_object, other), ColumnOperators.__eq__(json_object, cast("cats_rock", JSONB)))) + # query = query.where( + # db.and_( + # db.or_( + # operator(json_object, other), + # operator(json_object, "cats_rock"), + # ), + # db.or_( + # operator(json_object, "%"), + # ) + # ) + # ) return query.offset(offset).limit(limit) From 06cc2f0a9584ed3c13da9980a500976eae038251 Mon Sep 17 00:00:00 2001 From: John McCann Date: Mon, 14 Dec 2020 23:50:35 -0800 Subject: [PATCH 09/39] feat(GET /objects): DRY up sqlalchemy clauses --- src/mds/query.py | 203 +++++++++++++++++++++-------------------------- 1 file changed, 92 insertions(+), 111 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index 4c3eee6d..0c10a475 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -112,7 +112,6 @@ def add_filters(): ) -# XXX 
rename/update docstring async def search_metadata_helper( data: bool = Query( False, @@ -131,30 +130,81 @@ async def search_metadata_helper( """ XXX comments """ - # import pdb; pdb.set_trace() - limit = min(limit, 2000) - # XXX using __op__? - # XXX aggregate functions (e.g size, max, etc.) - filter_operators = { - ":all": None, - ":any": None, - ":eq": ColumnOperators.__eq__, - ":ne": ColumnOperators.__ne__, - ":gt": ColumnOperators.__gt__, - ":gte": ColumnOperators.__ge__, - ":lt": ColumnOperators.__lt__, - ":lte": ColumnOperators.__le__, - # XXX check how mongoDB filters for null (does it use it, if an object doesn't have a given key, is that considered null, etc.) - # XXX in is probably unnecessary (i.e. instead of (score, :in, [1, 2]) do (OR (score, :eq, 1), (score, :eq, 2))) - # ":in": ColumnOperators.in_, + + # XXX get_any_sql_clause could possibly be used for a :has_key operation + # w/ a little more work (e.g. does teams object {} have "bears" key: + # (teams,:any,(,:eq,"bears"))) (jsonb_object_keys + # https://www.postgresql.org/docs/9.5/functions-json.html#FUNCTIONS-JSON-OP-TABLE) + def get_any_sql_clause(target_json_object, scalar_operator_name, scalar_other): + if ( + "type" in operators[scalar_operator_name] + and operators[scalar_operator_name]["type"] == "text" + ): + f = db.func.jsonb_array_elements_text + else: + f = db.func.jsonb_array_elements + scalar_other = cast(scalar_other, JSONB) + + f = f(target_json_object).alias() + + return exists( + # XXX select 1 + db.select("*") + .select_from(f) + .where( + operators[scalar_operator_name]["sql_comparator"]( + column(f.name), scalar_other + ) + ) + ) + + def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): + if ( + "type" in operators[scalar_operator_name] + and operators[scalar_operator_name]["type"] == "text" + ): + f = db.func.jsonb_array_elements_text + else: + f = db.func.jsonb_array_elements + scalar_other = cast(scalar_other, JSONB) + + f = 
f(target_json_object).alias() + count_f = db.func.jsonb_array_length(target_json_object) + + return ColumnOperators.__eq__( + count_f, + db.select([db.func.count()]) + .select_from(f) + .where( + operators[scalar_operator_name]["sql_comparator"]( + column(f.name), scalar_other + ) + ), + ) + + # XXX aggregate operators might be nice (e.g size, max, etc.) + operators = { + ":eq": {"sql_comparator": ColumnOperators.__eq__}, + ":ne": {"sql_comparator": ColumnOperators.__ne__}, + ":gt": {"sql_comparator": ColumnOperators.__gt__}, + ":gte": {"sql_comparator": ColumnOperators.__ge__}, + ":lt": {"sql_comparator": ColumnOperators.__lt__}, + ":lte": {"sql_comparator": ColumnOperators.__le__}, + # XXX not working # XXX what does SQL IS mean? - ":is": ColumnOperators.is_, - ":is_not": ColumnOperators.isnot, - ":like": ColumnOperators.like, + # XXX check how mongoDB filters for null (does it use it?, if an object + # doesn't have a given key, is that considered null? etc.) + ":is": {"sql_comparator": ColumnOperators.is_}, + # XXX in is probably unnecessary (i.e. instead of (score, :in, [1, 2]) do (or(score,:eq,1),(score,:eq,2))) + # ":in": ColumnOperators.in_, + ":like": { + "type": "text", + "sql_comparator": ColumnOperators.like, + }, + ":all": {"sql_clause": get_all_sql_clause}, + ":any": {"sql_clause": get_any_sql_clause}, } - textual_operators = [":like"] - other_not_to_be_cast = [":like"] def fun_parser(s): # grammar = Grammar( @@ -265,99 +315,30 @@ def scalar_to_json(s): filter = [parantheses_to_json(filter)] - # XXX should maybe default query - # XXX comments - def add_filter(query): - - # XXX make this an optional url query param - for _filter in filter: - if not _filter: - continue - - json_object = Metadata.data[list(_filter["name"].split("."))] - operator = filter_operators[_filter["op"]] - other = cast(_filter["val"], JSONB) - # XXX this could be used for has_key (e.g. 
for does have bears key: (teams, :any, (,:eq, "bears"))) - if _filter["op"] == ":any": - - # XXX handle duplicates - # XXX any and has have to be used with other scalar operation - # (i.e. (_resource_paths,:any,(,:like,"/programs/*")). whether - # there is a key for the scalar comparator might determine - # whether to use jsonb_array_elements - - other = cast(_filter["val"]["val"], JSONB) - f = db.func.jsonb_array_elements - if _filter["val"]["op"] in textual_operators: - f = db.func.jsonb_array_elements_text - - if _filter["val"]["op"] in other_not_to_be_cast: - other = _filter["val"]["val"] - - f = f(Metadata.data[list(_filter["name"].split("."))]).alias() - operator = filter_operators[_filter["val"]["op"]] - - e = exists( - # XXX select 1 - db.select("*") - .select_from(f) - .where(operator(column(f.name), other)) - ) - - query = query.where(e) - - return query.offset(offset).limit(limit) - - elif _filter["op"] == ":all": - count_f = db.func.jsonb_array_length(json_object) - f = db.func.jsonb_array_elements - - if _filter["val"]["op"] in textual_operators: - f = db.func.jsonb_array_elements_text - f = f(json_object).alias() - - operator = filter_operators[_filter["val"]["op"]] - - if _filter["val"]["op"] in other_not_to_be_cast: - other = _filter["val"]["val"] - - sq = ( - db.select([db.func.count()]) - .select_from(f) - .where(operator(column(f.name), other)) - ) - query = query.where(count_f == sq) - - return query.offset(offset).limit(limit) - - # elif _filter["op"] == "is": - - if _filter["op"] in textual_operators: - json_object = json_object.astext - other = _filter["val"] + def add_filter(target_key, operator_name, other): + json_object = Metadata.data[list(target_key.split("."))] + if type(other) is dict: + return operators[operator_name]["sql_clause"]( + json_object, other["op"], other["val"] + ) - query = query.where(operator(json_object, other)) - # query = query.where(db.or_(operator(json_object, other), ColumnOperators.__eq__(json_object, 
cast("cats_rock", JSONB)))) - # query = query.where(db.and_(operator(json_object, other), ColumnOperators.__eq__(json_object, cast("cats_rock", JSONB)))) - # query = query.where( - # db.and_( - # db.or_( - # operator(json_object, other), - # operator(json_object, "cats_rock"), - # ), - # db.or_( - # operator(json_object, "%"), - # ) - # ) - # ) + if ( + "type" in operators[operator_name] + and operators[operator_name]["type"] == "text" + ): + json_object = json_object.astext + else: + other = cast(other, JSONB) - return query.offset(offset).limit(limit) + return operators[operator_name]["sql_comparator"](json_object, other) if data: - return { - metadata.guid: metadata.data - for metadata in await add_filter(Metadata.query).gino.all() - } + query = Metadata.query + if filter[0]: + query = query.where( + add_filter(filter[0]["name"], filter[0]["op"], filter[0]["val"]) + ) + return {metadata.guid: metadata.data for metadata in await query.gino.all()} else: return [ row[0] From a8d24e56539ccc701cb4a67e97777929109717f9 Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 15 Dec 2020 07:24:03 -0800 Subject: [PATCH 10/39] feat(GET /objects): remove unnecessary comments --- src/mds/query.py | 150 ++++++++++++++++++++--------------------------- 1 file changed, 63 insertions(+), 87 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index 0c10a475..ff7138ea 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -1,7 +1,6 @@ import json from fastapi import HTTPException, Query, APIRouter -from pydantic import Json from sqlalchemy import cast, column, exists from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.sql.operators import ColumnOperators @@ -9,6 +8,7 @@ from starlette.status import HTTP_404_NOT_FOUND from parsimonious.grammar import Grammar +from parsimonious.nodes import NodeVisitor from .models import db, Metadata @@ -17,7 +17,6 @@ @mod.get("/metadata") # XXX fix tests -# XXX maybe translate old way to use filter argument here? 
async def search_metadata( request: Request, data: bool = Query( @@ -74,37 +73,20 @@ async def search_metadata( if key not in {"data", "limit", "offset"}: queries.setdefault(key, []).append(value) - # def add_filter(query): def add_filters(): filter_param = "(and" for path, values in queries.items(): filter_param += ",(or" for v in values: - filter_param += f",({path},:like,{v})" - # query = query.where( - # db.or_(Metadata.data[list(path.split("."))].astext == v for v in values) - # db.or_(Metadata.data[list(path.split("."))].astext == v) - # ) + # to maintain backwards compatibility, use :like because it's + # a textual operator. + # XXX will need to escape % + filter_param += f',({path},:like,"{v}")' filter_param += ")" filter_param += ")" return filter_param - # return query.offset(offset).limit(limit) - - # if data: - # return { - # metadata.guid: metadata.data - # for metadata in await add_filter(Metadata.query).gino.all() - # } - # else: - # return [ - # row[0] - # for row in await add_filter(db.select([Metadata.guid])) - # .gino.return_model(False) - # .all() - # ] return await search_metadata_helper( - # request.query_params, data=data, limit=limit, offset=offset data=data, limit=limit, offset=offset, @@ -124,7 +106,6 @@ async def search_metadata_helper( offset: int = Query(0, description="Return results at this given offset."), # XXX description # XXX how to name this variable something other than filter - # filter: Json = Query(Json(), description="The filters!"), filter: str = Query("", description="The filters!"), ): """ @@ -206,31 +187,19 @@ def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): ":any": {"sql_clause": get_any_sql_clause}, } - def fun_parser(s): - # grammar = Grammar( - # """ - # bold_text = bold_open text bold_close - # text = ~"[A-Z 0-9]*"i - # bold_open = "((" - # bold_close = "))" - # """) - - # '(and,(or,(fun_fact,:like,rhinos_rock),(fun_fact,:like,cats_rock)),(or,(_uploader_id,:like,57)))' - # 
(_uploader_id,:eq,“57”) - # (or,(_uploader_id,:like,57)) - - # compound_filter = open boolean separator scalar_filter close - # boolean = "and" - # json_value = "57" / "rhinosrock" / "catsrock" - # json_value = ~"[A-Z 0-9_]*"i + def parse_filter(filter_string): + if not filter_string: + return + + # https://github.com/erikrose/parsimonious/pull/23/files grammar = Grammar( """ boolean_filter = scalar_filter / (open boolean (separator (scalar_filter / boolean_filter))+ close) boolean = "and" / "or" scalar_filter = open json_key separator operator separator (json_value / scalar_filter) close - json_key = ~"[A-Z 0-9_]*"i + json_key = ~"[A-Z 0-9_.]*"i # json_value = "57" / "rhinosrock" / "catsrock" - operator = ":eq" / ":any" + operator = ":eq" / ":ne" / ":gt" / ":gte" / ":lt" / ":lte" / ":is" / ":like" / ":all" / ":any" open = "(" close = ")" separator = "," @@ -247,41 +216,41 @@ def fun_parser(s): digit1to9 = ~"[1-9]" digit = ~"[0-9]" - # string = ~"\"[^\"]*\"" - # string = ~"[\".*\"]" - # string = ~"\".*\"" - # doubleString = ~'"([^"]|(\"))*"' doubleString = ~'"([^"])*"' """ ) - # o = grammar.parse('(fun_fact,:eq,"rhinos_rock")') - # o = grammar.parse('(and,(or,(fun_fact,:eq,"rhinosrock"),(funfact,:eq,"catsrock")),(or,(uploaderid,:eq,57)))') - # import pdb; pdb.set_trace() - - # json_value = "\"57\"" - # print(grammar.parse('((bold stuff))')) - # print(grammar.parse('(_uploader_id,:eq,57)')) - # print(grammar.parse('(_uploader_id,:eq,true)')) - # print(grammar.parse('(_uploader_id,:eq,false)')) - # print(grammar.parse('(_uploader_id,:eq,null)')) - # print(grammar.parse('(fun_fact,:eq,"rhinos_rock")')) - # print(grammar.parse('(uploader,:eq,"57")')) - # print(grammar.parse('(uploader,:eq,"57")')) - # print(grammar.parse('(paths,:any,(,:eq,57))')) - # print(grammar.parse('(and,(uploader,:eq,57))')) - # print(grammar.parse('(or,(uploader,:eq,57))')) - # print(grammar.parse('(and,(or,(funfact,:eq,rhinosrock),(funfact,:eq,catsrock)),(or,(uploaderid,:eq,57)))')) - print( - 
grammar.parse( - '(and,(or,(fun_fact,:eq,"rhinosrock"),(funfact,:eq,"catsrock")),(or,(uploaderid,:eq,57)))' - ) - ) - - # print(grammar.parse('(paths,:eq,(,:eq,57))')) - # print(grammar.parse('(and,(uploader,:eq,57))')) - # print(grammar.parse('(,:eq,57)')) - - fun_parser(filter) + # print( + # grammar.parse(filter_string) + # ) + return grammar.parse(filter_string) + + class FilterVisitor(NodeVisitor): + def visit_scalar_filter(self, node, visited_children): + # import pdb; pdb.set_trace() + ( + _, + json_key, + _, + operator, + _, + json_value_or_scalar_filter, + _, + ) = visited_children + # print(json_key, operator, json_value_or_scalar_filter) + # for node in (json_key, operator, json_value_or_scalar_filter): + # print(node.text) + print(json_key) + return (json_key, operator, json_value_or_scalar_filter) + + def generic_visit(self, node, visited_children): + """ The generic visit method. """ + # return visited_children or node # return section.text + return visited_children or node + + filter_tree = parse_filter(filter) + if filter_tree: + filter_tree_visitor = FilterVisitor() + filter_tree_visitor.visit(filter_tree) # XXX comments def parantheses_to_json(s): @@ -299,18 +268,6 @@ def scalar_to_json(s): f = {"name": f[0], "op": f[1], "val": parantheses_to_json(f[2])} return f - # XXX DRY with boolean_operators = ["or", "and"] - # scalar_operator_pattern = r'\({json_key_pattern}*,{operator_pattern},{json_value_pattern}\)' - # compund_operator_pattern = r'\({json_key_pattern}*,{operator_pattern},({scalar_value_pattern}|{json_value_pattern})\)' - # # filter_pattern = r'\({{and|or}{,filter_pattern}+}|{compund_operator}\)' - # boolean_filters_pattern = r'\({{and|or}{,compund_operator}+}\)' - # filter_pattern = r'' - # r‘\((and|or)(,\([.+]]\))\)’ - # if s.startswith('(or'): - # return {"or": [parantheses_to_json(s[3:-1])]} - # if s.startswith('(and'): - # return {"and": parantheses_to_json(s[4:-1])} - return scalar_to_json(s) filter = [parantheses_to_json(filter)] 
@@ -332,12 +289,31 @@ def add_filter(target_key, operator_name, other): return operators[operator_name]["sql_comparator"](json_object, other) + def get_clause(node): + if node is None: + return + + # boolean = getattr("boolean", node) + boolean = getattr(node, "boolean") + if boolean is not None: + if boolean == "and": + return db.and_(get_clause(c) for c in node.children) + elif boolean == "or": + return db.or_(get_clause(c) for c in node.children) + raise + + return add_filter(node.json_key, node.operator, node.json_value) + if data: query = Metadata.query if filter[0]: query = query.where( add_filter(filter[0]["name"], filter[0]["op"], filter[0]["val"]) ) + + # if filter_tree: + # query = query.where(get_clause(filter_tree)) + return {metadata.guid: metadata.data for metadata in await query.gino.all()} else: return [ From f86f0367d85bbbb44232bec0bc3023706f129f06 Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 15 Dec 2020 17:26:49 -0800 Subject: [PATCH 11/39] feat(GET /objects): support boolean SQL clauses --- src/mds/query.py | 139 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 107 insertions(+), 32 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index ff7138ea..6fb0f3b5 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -113,6 +113,23 @@ async def search_metadata_helper( """ limit = min(limit, 2000) + def add_filter(target_key, operator_name, other): + json_object = Metadata.data[list(target_key.split("."))] + if type(other) is dict: + return operators[operator_name]["sql_clause"]( + json_object, other["op"], other["val"] + ) + + if ( + "type" in operators[operator_name] + and operators[operator_name]["type"] == "text" + ): + json_object = json_object.astext + else: + other = cast(other, JSONB) + + return operators[operator_name]["sql_comparator"](json_object, other) + # XXX get_any_sql_clause could possibly be used for a :has_key operation # w/ a little more work (e.g. 
does teams object {} have "bears" key: # (teams,:any,(,:eq,"bears"))) (jsonb_object_keys @@ -192,20 +209,24 @@ def parse_filter(filter_string): return # https://github.com/erikrose/parsimonious/pull/23/files + # parantheses get their own list, even if they only have one element grammar = Grammar( """ - boolean_filter = scalar_filter / (open boolean (separator (scalar_filter / boolean_filter))+ close) - boolean = "and" / "or" - scalar_filter = open json_key separator operator separator (json_value / scalar_filter) close + filter = scalar_filter / boolean_filter + boolean_filter = open boolean boolean_operands close + boolean = "and" / "or" + boolean_operands = boolean_operand+ + boolean_operand = separator scalar_filter_or_boolean_filter + scalar_filter_or_boolean_filter = scalar_filter / boolean_filter + scalar_filter = open json_key separator operator separator json_value_or_scalar_filter close + json_value_or_scalar_filter = json_value / scalar_filter json_key = ~"[A-Z 0-9_.]*"i - # json_value = "57" / "rhinosrock" / "catsrock" operator = ":eq" / ":ne" / ":gt" / ":gte" / ":lt" / ":lte" / ":is" / ":like" / ":all" / ":any" open = "(" close = ")" separator = "," json_value = true_false_null / number / doubleString - # json_value = true_false_null / number true_false_null = "true" / "false" / "null" number = (int frac exp) / (int exp) / (int frac) / int int = "-"? 
((digit1to9 digits) / digit) @@ -225,6 +246,39 @@ def parse_filter(filter_string): return grammar.parse(filter_string) class FilterVisitor(NodeVisitor): + # def visit_boolean_filter(self, node, visited_children): + # pass + + def visit_filter(self, node, visited_children): + return visited_children[0] + + def visit_boolean_filter(self, node, visited_children): + _, boolean, boolean_operands, _ = visited_children + # return { boolean: boolean_operands } + # import pdb; pdb.set_trace() + # return db.and_(c for c in visited_children if c.expr_name == "scalar_filter") + # return db.and_(c for c in visited_children) + # return db.and_(c for c in visited_children[0]) + + if boolean == "and": + return db.and_(clause for clause in boolean_operands) + elif boolean == "or": + return db.or_(clause for clause in boolean_operands) + else: + raise + + def visit_boolean(self, node, visited_children): + return node.text + + def visit_boolean_operand(self, node, visited_children): + # return node.text + _, scalar_filter_or_boolean_filter = visited_children + return scalar_filter_or_boolean_filter + + def visit_scalar_filter_or_boolean_filter(self, node, visited_children): + # import pdb; pdb.set_trace() + return visited_children[0] + def visit_scalar_filter(self, node, visited_children): # import pdb; pdb.set_trace() ( @@ -236,21 +290,55 @@ def visit_scalar_filter(self, node, visited_children): json_value_or_scalar_filter, _, ) = visited_children + + # return node.text # print(json_key, operator, json_value_or_scalar_filter) # for node in (json_key, operator, json_value_or_scalar_filter): # print(node.text) - print(json_key) - return (json_key, operator, json_value_or_scalar_filter) + # print(json_key) + + # import pdb; pdb.set_trace() + + # return (json_key, operator, json_value_or_scalar_filter) + # return (json_key.text, operator[0].text, json_value_or_scalar_filter[0][0].text) + + # XXX need try around json.loads() ? 
+ # import pdb; pdb.set_trace() + # return { "name": json_key.text, "op": operator[0].text, "val": json.loads(json_value_or_scalar_filter[0][0].text) } + # import pdb; pdb.set_trace() + # return { "name": json_key.text, "op": operator.text, "val": json.loads(json_value_or_scalar_filter.text) } + + # return { "name": json_key, "op": operator, "val": json_value_or_scalar_filter } + + return add_filter(json_key, operator, json_value_or_scalar_filter) + + def visit_json_value_or_scalar_filter(self, node, visited_children): + return visited_children[0] + + def visit_json_key(self, node, visited_children): + return node.text + + def visit_operator(self, node, visited_children): + return node.text + + def visit_json_value(self, node, visited_children): + return json.loads(node.text) def generic_visit(self, node, visited_children): """ The generic visit method. """ # return visited_children or node # return section.text + # import pdb; pdb.set_trace() + # return visited_children or node if node.expr_name not in ["open", "close", "separator"] else None return visited_children or node + # return node filter_tree = parse_filter(filter) + # import pdb; pdb.set_trace() if filter_tree: + # pass filter_tree_visitor = FilterVisitor() - filter_tree_visitor.visit(filter_tree) + root_sql_clause = filter_tree_visitor.visit(filter_tree) + # import pdb; pdb.set_trace() # XXX comments def parantheses_to_json(s): @@ -272,29 +360,14 @@ def scalar_to_json(s): filter = [parantheses_to_json(filter)] - def add_filter(target_key, operator_name, other): - json_object = Metadata.data[list(target_key.split("."))] - if type(other) is dict: - return operators[operator_name]["sql_clause"]( - json_object, other["op"], other["val"] - ) - - if ( - "type" in operators[operator_name] - and operators[operator_name]["type"] == "text" - ): - json_object = json_object.astext - else: - other = cast(other, JSONB) - - return operators[operator_name]["sql_comparator"](json_object, other) - def get_clause(node): - 
if node is None: + if not node: return # boolean = getattr("boolean", node) - boolean = getattr(node, "boolean") + expression_name = getattr(node, "expr_name") + # if expression_name + if boolean is not None: if boolean == "and": return db.and_(get_clause(c) for c in node.children) @@ -305,11 +378,13 @@ def get_clause(node): return add_filter(node.json_key, node.operator, node.json_value) if data: - query = Metadata.query - if filter[0]: - query = query.where( - add_filter(filter[0]["name"], filter[0]["op"], filter[0]["val"]) - ) + # query = Metadata.query + # if filter[0]: + # query = query.where( + # add_filter(filter[0]["name"], filter[0]["op"], filter[0]["val"]) + # ) + + query = Metadata.query.where(root_sql_clause) # if filter_tree: # query = query.where(get_clause(filter_tree)) From 4e0191d73b205fa9f20b4f785b00a60903f408a8 Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 15 Dec 2020 19:02:47 -0800 Subject: [PATCH 12/39] feat(GET /objects): use filter_dict --- src/mds/query.py | 62 +++++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index 6fb0f3b5..4d3777d4 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -254,18 +254,18 @@ def visit_filter(self, node, visited_children): def visit_boolean_filter(self, node, visited_children): _, boolean, boolean_operands, _ = visited_children - # return { boolean: boolean_operands } + return {boolean: boolean_operands} # import pdb; pdb.set_trace() # return db.and_(c for c in visited_children if c.expr_name == "scalar_filter") # return db.and_(c for c in visited_children) # return db.and_(c for c in visited_children[0]) - if boolean == "and": - return db.and_(clause for clause in boolean_operands) - elif boolean == "or": - return db.or_(clause for clause in boolean_operands) - else: - raise + # if boolean == "and": + # return db.and_(clause for clause in boolean_operands) + # elif boolean == "or": + # return 
db.or_(clause for clause in boolean_operands) + # else: + # raise def visit_boolean(self, node, visited_children): return node.text @@ -308,9 +308,13 @@ def visit_scalar_filter(self, node, visited_children): # import pdb; pdb.set_trace() # return { "name": json_key.text, "op": operator.text, "val": json.loads(json_value_or_scalar_filter.text) } - # return { "name": json_key, "op": operator, "val": json_value_or_scalar_filter } + return { + "name": json_key, + "op": operator, + "val": json_value_or_scalar_filter, + } - return add_filter(json_key, operator, json_value_or_scalar_filter) + # return add_filter(json_key, operator, json_value_or_scalar_filter) def visit_json_value_or_scalar_filter(self, node, visited_children): return visited_children[0] @@ -333,12 +337,14 @@ def generic_visit(self, node, visited_children): # return node filter_tree = parse_filter(filter) + filter_dict = {} # import pdb; pdb.set_trace() if filter_tree: # pass filter_tree_visitor = FilterVisitor() - root_sql_clause = filter_tree_visitor.visit(filter_tree) - # import pdb; pdb.set_trace() + # root_sql_clause = filter_tree_visitor.visit(filter_tree) + filter_dict = filter_tree_visitor.visit(filter_tree) + # import pdb; pdb.set_trace() # XXX comments def parantheses_to_json(s): @@ -360,34 +366,46 @@ def scalar_to_json(s): filter = [parantheses_to_json(filter)] - def get_clause(node): - if not node: + def get_clause(filter_dict): + if not filter_dict: return # boolean = getattr("boolean", node) - expression_name = getattr(node, "expr_name") + # expression_name = getattr(node, "expr_name") # if expression_name - if boolean is not None: + # if boolean is not None: + # if boolean == "and": + # return db.and_(get_clause(c) for c in node.children) + # elif boolean == "or": + # return db.or_(get_clause(c) for c in node.children) + # raise + + if len(filter_dict) == 1: + # import pdb; pdb.set_trace() + boolean = list(filter_dict.keys())[0] + + boolean_operand_clauses = (get_clause(c) for c in 
filter_dict[boolean]) if boolean == "and": - return db.and_(get_clause(c) for c in node.children) + return db.and_(boolean_operand_clauses) elif boolean == "or": - return db.or_(get_clause(c) for c in node.children) + return db.or_(boolean_operand_clauses) raise - return add_filter(node.json_key, node.operator, node.json_value) + # return add_filter(node.json_key, node.operator, node.json_value) + return add_filter(filter_dict["name"], filter_dict["op"], filter_dict["val"]) if data: - # query = Metadata.query + query = Metadata.query # if filter[0]: # query = query.where( # add_filter(filter[0]["name"], filter[0]["op"], filter[0]["val"]) # ) - query = Metadata.query.where(root_sql_clause) + # query = Metadata.query.where(root_sql_clause) - # if filter_tree: - # query = query.where(get_clause(filter_tree)) + if filter_dict: + query = query.where(get_clause(filter_dict)) return {metadata.guid: metadata.data for metadata in await query.gino.all()} else: From f48a0150523836d85e31933df105787b0b5fb287 Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 15 Dec 2020 19:59:25 -0800 Subject: [PATCH 13/39] feat(GET /objects): account for empty filter param --- src/mds/query.py | 74 ++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index 4d3777d4..38db640f 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -74,16 +74,20 @@ async def search_metadata( queries.setdefault(key, []).append(value) def add_filters(): - filter_param = "(and" - for path, values in queries.items(): - filter_param += ",(or" - for v in values: - # to maintain backwards compatibility, use :like because it's - # a textual operator. 
- # XXX will need to escape % - filter_param += f',({path},:like,"{v}")' + filter_param = "" + + if queries: + filter_param = "(and" + for path, values in queries.items(): + filter_param += ",(or" + for v in values: + # to maintain backwards compatibility, use :like because it's + # a textual operator. + # XXX will need to escape % + filter_param += f',({path},:like,"{v}")' + filter_param += ")" filter_param += ")" - filter_param += ")" + return filter_param return await search_metadata_helper( @@ -111,6 +115,7 @@ async def search_metadata_helper( """ XXX comments """ + # import pdb; pdb.set_trace() limit = min(limit, 2000) def add_filter(target_key, operator_name, other): @@ -205,6 +210,7 @@ def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): } def parse_filter(filter_string): + # import pdb; pdb.set_trace() if not filter_string: return @@ -347,24 +353,24 @@ def generic_visit(self, node, visited_children): # import pdb; pdb.set_trace() # XXX comments - def parantheses_to_json(s): - def scalar_to_json(s): - if not s: - return {} - # XXX what if only one is a parantheses? - if s[0] != "(" and s[-1] != ")": - try: - return json.loads(s) - except: - return s + # def parantheses_to_json(s): + # def scalar_to_json(s): + # if not s: + # return {} + # # XXX what if only one is a parantheses? 
+ # if s[0] != "(" and s[-1] != ")": + # try: + # return json.loads(s) + # except: + # return s - f = s[1:-1].split(",", 2) - f = {"name": f[0], "op": f[1], "val": parantheses_to_json(f[2])} - return f + # f = s[1:-1].split(",", 2) + # f = {"name": f[0], "op": f[1], "val": parantheses_to_json(f[2])} + # return f - return scalar_to_json(s) + # return scalar_to_json(s) - filter = [parantheses_to_json(filter)] + # filter = [parantheses_to_json(filter)] def get_clause(filter_dict): if not filter_dict: @@ -382,7 +388,6 @@ def get_clause(filter_dict): # raise if len(filter_dict) == 1: - # import pdb; pdb.set_trace() boolean = list(filter_dict.keys())[0] boolean_operand_clauses = (get_clause(c) for c in filter_dict[boolean]) @@ -392,29 +397,18 @@ def get_clause(filter_dict): return db.or_(boolean_operand_clauses) raise - # return add_filter(node.json_key, node.operator, node.json_value) return add_filter(filter_dict["name"], filter_dict["op"], filter_dict["val"]) if data: query = Metadata.query - # if filter[0]: - # query = query.where( - # add_filter(filter[0]["name"], filter[0]["op"], filter[0]["val"]) - # ) - - # query = Metadata.query.where(root_sql_clause) - if filter_dict: query = query.where(get_clause(filter_dict)) - return {metadata.guid: metadata.data for metadata in await query.gino.all()} else: - return [ - row[0] - for row in await add_filter(db.select([Metadata.guid])) - .gino.return_model(False) - .all() - ] + query = db.select([Metadata.guid]) + if filter_dict: + query = query.where(get_clause(filter_dict)) + return [row[0] for row in await query.gino.return_model(False).all()] @mod.get("/metadata/{guid:path}") From e77418963cf22bbd1f0d8a2e0d905ece5b6d9bff Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 15 Dec 2020 22:06:05 -0800 Subject: [PATCH 14/39] feat(GET /objects): remove commented code --- src/mds/objects.py | 40 ++++++---------- src/mds/query.py | 117 ++++++++------------------------------------- 2 files changed, 35 insertions(+), 122 
deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index b285ef15..f1fc82ed 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -229,8 +229,8 @@ async def create_object_for_id( @mod.get("/objects") async def get_objects( request: Request, - # XXX how to handle this being False - # XXX what is Query? + # XXX would be nice to be able to specify whether to return indexd records + # (e.g GET objects?data=metadata would only return mds objects) data: bool = Query( False, description="Switch to returning a list of GUIDs (false), " @@ -241,41 +241,31 @@ async def get_objects( ), offset: int = Query(0, description="Return results at this given offset."), # XXX description - # XXX how to name this variable something other than filter - # filter: Json = Query(Json(), description="The filters!"), - filter: str = Query("", description="The filters!"), + # XXX how to name this python variable something other than filter but + # still have client use "filter" as URL query param? (bc filter is already + # built in to Python) + filter: str = Query("", description="Filters to apply."), ) -> JSONResponse: """ XXX comments """ - # import pdb; pdb.set_trace() - # mds_query_params = { - # k[len("metadata.") :]: v - # for k, v in request.query_params.items() - # if k.startswith("metadata.") - # } - # queries = {} - # for k, v in request.query_params.multi_items(): - # queries.setdefault(key, []).append(value) - - # XXX just pass kwargs? 
metadata_objects = await search_metadata_helper( - # mds_query_params, data=data, limit=limit, offset=offset - data=data, - limit=limit, - offset=offset, - filter=filter - # data=data, limit=limit, offset=offset, filters=filter + data=data, limit=limit, offset=offset, filter=filter ) records = {} try: endpoint_path = "/bulk/documents" full_endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path - response = await request.app.async_client.post( - full_endpoint, json=list(metadata_objects.keys()) + guids = ( + list(metadata_objects.keys()) + if hasattr(metadata_objects, "keys") + else metadata_objects ) + # XXX /bulk/documents endpoint in indexd currently doesn't support + # filters + response = await request.app.async_client.post(full_endpoint, json=guids) response.raise_for_status() records = {r["did"]: r for r in response.json()} except httpx.HTTPError as err: @@ -291,8 +281,6 @@ async def get_objects( "Unable to get a response from indexd `POST %s` endpoint", endpoint_path ) - # XXX need to handle data=False - # import pdb; pdb.set_trace() if type(metadata_objects) is dict: response = { guid: {"record": records[guid] if guid in records else {}, "metadata": o} diff --git a/src/mds/query.py b/src/mds/query.py index 38db640f..00c26e54 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -16,7 +16,6 @@ @mod.get("/metadata") -# XXX fix tests async def search_metadata( request: Request, data: bool = Query( @@ -81,9 +80,9 @@ def add_filters(): for path, values in queries.items(): filter_param += ",(or" for v in values: - # to maintain backwards compatibility, use :like because it's - # a textual operator. - # XXX will need to escape % + # to maintain backwards compatibility for this endpoint, + # use :like instead of :eq because SQL LIKE is a textual + # operator. 
XXX will need to escape % filter_param += f',({path},:like,"{v}")' filter_param += ")" filter_param += ")" @@ -109,18 +108,22 @@ async def search_metadata_helper( ), offset: int = Query(0, description="Return results at this given offset."), # XXX description - # XXX how to name this variable something other than filter - filter: str = Query("", description="The filters!"), + # XXX how to name this python variable something other than filter but + # still have client use "filter" as URL query param? + filter: str = Query("", description="Filters to apply."), ): """ XXX comments """ - # import pdb; pdb.set_trace() limit = min(limit, 2000) def add_filter(target_key, operator_name, other): json_object = Metadata.data[list(target_key.split("."))] + if type(other) is dict: + # since other isn't a primitive type, it means that this is a compound + # operator + # (e.g. GET objects?filter=(_resource_paths,:any,(,:like,"/programs/%")) ) return operators[operator_name]["sql_clause"]( json_object, other["op"], other["val"] ) @@ -163,6 +166,8 @@ def get_any_sql_clause(target_json_object, scalar_operator_name, scalar_other): ) def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): + # XXX should DRY duplication between get_any_sql_clause and + # get_all_sql_clause functions if ( "type" in operators[scalar_operator_name] and operators[scalar_operator_name]["type"] == "text" @@ -194,12 +199,12 @@ def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): ":gte": {"sql_comparator": ColumnOperators.__ge__}, ":lt": {"sql_comparator": ColumnOperators.__lt__}, ":lte": {"sql_comparator": ColumnOperators.__le__}, - # XXX not working + # XXX :is is not working # XXX what does SQL IS mean? # XXX check how mongoDB filters for null (does it use it?, if an object # doesn't have a given key, is that considered null? etc.) ":is": {"sql_comparator": ColumnOperators.is_}, - # XXX in is probably unnecessary (i.e. 
instead of (score, :in, [1, 2]) do (or(score,:eq,1),(score,:eq,2))) + # XXX :in is probably unnecessary (i.e. instead of (score, :in, [1, 2]) do (or(score,:eq,1),(score,:eq,2))) # ":in": ColumnOperators.in_, ":like": { "type": "text", @@ -210,12 +215,10 @@ def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): } def parse_filter(filter_string): - # import pdb; pdb.set_trace() if not filter_string: return # https://github.com/erikrose/parsimonious/pull/23/files - # parantheses get their own list, even if they only have one element grammar = Grammar( """ filter = scalar_filter / boolean_filter @@ -246,47 +249,27 @@ def parse_filter(filter_string): doubleString = ~'"([^"])*"' """ ) - # print( - # grammar.parse(filter_string) - # ) return grammar.parse(filter_string) class FilterVisitor(NodeVisitor): - # def visit_boolean_filter(self, node, visited_children): - # pass - def visit_filter(self, node, visited_children): return visited_children[0] def visit_boolean_filter(self, node, visited_children): _, boolean, boolean_operands, _ = visited_children return {boolean: boolean_operands} - # import pdb; pdb.set_trace() - # return db.and_(c for c in visited_children if c.expr_name == "scalar_filter") - # return db.and_(c for c in visited_children) - # return db.and_(c for c in visited_children[0]) - - # if boolean == "and": - # return db.and_(clause for clause in boolean_operands) - # elif boolean == "or": - # return db.or_(clause for clause in boolean_operands) - # else: - # raise def visit_boolean(self, node, visited_children): return node.text def visit_boolean_operand(self, node, visited_children): - # return node.text _, scalar_filter_or_boolean_filter = visited_children return scalar_filter_or_boolean_filter def visit_scalar_filter_or_boolean_filter(self, node, visited_children): - # import pdb; pdb.set_trace() return visited_children[0] def visit_scalar_filter(self, node, visited_children): - # import pdb; pdb.set_trace() ( _, json_key, @@ 
-297,31 +280,12 @@ def visit_scalar_filter(self, node, visited_children): _, ) = visited_children - # return node.text - # print(json_key, operator, json_value_or_scalar_filter) - # for node in (json_key, operator, json_value_or_scalar_filter): - # print(node.text) - # print(json_key) - - # import pdb; pdb.set_trace() - - # return (json_key, operator, json_value_or_scalar_filter) - # return (json_key.text, operator[0].text, json_value_or_scalar_filter[0][0].text) - - # XXX need try around json.loads() ? - # import pdb; pdb.set_trace() - # return { "name": json_key.text, "op": operator[0].text, "val": json.loads(json_value_or_scalar_filter[0][0].text) } - # import pdb; pdb.set_trace() - # return { "name": json_key.text, "op": operator.text, "val": json.loads(json_value_or_scalar_filter.text) } - return { "name": json_key, "op": operator, "val": json_value_or_scalar_filter, } - # return add_filter(json_key, operator, json_value_or_scalar_filter) - def visit_json_value_or_scalar_filter(self, node, visited_children): return visited_children[0] @@ -335,62 +299,17 @@ def visit_json_value(self, node, visited_children): return json.loads(node.text) def generic_visit(self, node, visited_children): - """ The generic visit method. """ - # return visited_children or node # return section.text - # import pdb; pdb.set_trace() - # return visited_children or node if node.expr_name not in ["open", "close", "separator"] else None return visited_children or node - # return node - - filter_tree = parse_filter(filter) - filter_dict = {} - # import pdb; pdb.set_trace() - if filter_tree: - # pass - filter_tree_visitor = FilterVisitor() - # root_sql_clause = filter_tree_visitor.visit(filter_tree) - filter_dict = filter_tree_visitor.visit(filter_tree) - # import pdb; pdb.set_trace() - - # XXX comments - # def parantheses_to_json(s): - # def scalar_to_json(s): - # if not s: - # return {} - # # XXX what if only one is a parantheses? 
- # if s[0] != "(" and s[-1] != ")": - # try: - # return json.loads(s) - # except: - # return s - - # f = s[1:-1].split(",", 2) - # f = {"name": f[0], "op": f[1], "val": parantheses_to_json(f[2])} - # return f - - # return scalar_to_json(s) - - # filter = [parantheses_to_json(filter)] def get_clause(filter_dict): if not filter_dict: return - # boolean = getattr("boolean", node) - # expression_name = getattr(node, "expr_name") - # if expression_name - - # if boolean is not None: - # if boolean == "and": - # return db.and_(get_clause(c) for c in node.children) - # elif boolean == "or": - # return db.or_(get_clause(c) for c in node.children) - # raise - if len(filter_dict) == 1: boolean = list(filter_dict.keys())[0] boolean_operand_clauses = (get_clause(c) for c in filter_dict[boolean]) + # XXX define these boolean operators in one place if boolean == "and": return db.and_(boolean_operand_clauses) elif boolean == "or": @@ -399,6 +318,12 @@ def get_clause(filter_dict): return add_filter(filter_dict["name"], filter_dict["op"], filter_dict["val"]) + filter_tree = parse_filter(filter) + filter_dict = {} + if filter_tree: + filter_tree_visitor = FilterVisitor() + filter_dict = filter_tree_visitor.visit(filter_tree) + if data: query = Metadata.query if filter_dict: From a87bfaaba1900eeab3429826b8834a91c57e4b1e Mon Sep 17 00:00:00 2001 From: John McCann Date: Wed, 16 Dec 2020 01:54:50 -0800 Subject: [PATCH 15/39] feat(GET /objects): document examples --- src/mds/objects.py | 52 +++++++++-------- src/mds/query.py | 139 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 158 insertions(+), 33 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index f1fc82ed..43de7289 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -6,7 +6,7 @@ from fastapi import HTTPException, APIRouter, Security, Query from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials import httpx -from pydantic import BaseModel, Json +from pydantic import BaseModel 
from starlette.requests import Request from starlette.responses import JSONResponse from starlette.status import ( @@ -255,31 +255,33 @@ async def get_objects( ) records = {} - try: - endpoint_path = "/bulk/documents" - full_endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path - guids = ( - list(metadata_objects.keys()) - if hasattr(metadata_objects, "keys") - else metadata_objects - ) - # XXX /bulk/documents endpoint in indexd currently doesn't support - # filters - response = await request.app.async_client.post(full_endpoint, json=guids) - response.raise_for_status() - records = {r["did"]: r for r in response.json()} - except httpx.HTTPError as err: - logger.debug(err, exc_info=True) - if err.response: - logger.error( - "indexd `POST %s` endpoint returned a %s HTTP status code", - endpoint_path, - err.response.status_code, - ) - else: - logger.error( - "Unable to get a response from indexd `POST %s` endpoint", endpoint_path + if metadata_objects: + try: + endpoint_path = "/bulk/documents" + full_endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path + guids = ( + list(metadata_objects.keys()) + if hasattr(metadata_objects, "keys") + else metadata_objects ) + # XXX /bulk/documents endpoint in indexd currently doesn't support + # filters + response = await request.app.async_client.post(full_endpoint, json=guids) + response.raise_for_status() + records = {r["did"]: r for r in response.json()} + except httpx.HTTPError as err: + logger.debug(err, exc_info=True) + if err.response: + logger.error( + "indexd `POST %s` endpoint returned a %s HTTP status code", + endpoint_path, + err.response.status_code, + ) + else: + logger.error( + "Unable to get a response from indexd `POST %s` endpoint", + endpoint_path, + ) if type(metadata_objects) is dict: response = { diff --git a/src/mds/query.py b/src/mds/query.py index 00c26e54..cd561d52 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -113,11 +113,117 @@ async def 
search_metadata_helper( filter: str = Query("", description="Filters to apply."), ): """ - XXX comments + Helper to search for metadata objects based on filters provided in filter + param. + + The filtering functionality was primarily driven by the requirement that a + user be able to get all objects having an authz resource matching a + user-supplied pattern at any index in the "_resource_paths" array. For + example, given the following metadata objects: + + { + "0": { + "message": "hello", + "_uploader_id": "100", + "_resource_paths": [ + "/programs/a", + "/programs/b" + ], + "pet": "dog" + }, + "1": { + "message": "greetings", + "_uploader_id": "101", + "_resource_paths": [ + "/open", + "/programs/c/projects/a" + ], + "pet": "ferret", + "sport": "soccer" + }, + "2": { + "message": "morning", + "_uploader_id": "102", + "_resource_paths": [ + "/programs/d", + "/programs/e" + ], + "counts": [42, 42, 42], + "pet": "ferret", + "sport": "soccer" + }, + "3": { + "message": "evening", + "_uploader_id": "103", + "_resource_paths": [ + "/programs/f/projects/a", + "/admin" + ], + "counts": [1, 3, 5], + "pet": "ferret", + "sport": "basketball" + } + } + + how do we design a filtering interface that allows the user to get all + objects having an authz string matching the pattern + "/programs/%/projects/%" at any index in its "_resource_paths" array? (% + has been used as the wildcard so far because that's what Postgres uses as + the wildcard for LIKE) In this case, the "1" and "3" objects should be + returned. + + The filter syntax that was arrived at ending up following the syntax + specified by a Node JS implementation + (https://www.npmjs.com/package/json-api#filtering) of the JSON:API + specification (https://jsonapi.org/). 
+ + The format for this syntax is filter=(field_name,operator,value), in which + the field_name is a json key without quotes, operator is one of :eq, :ne, + :gt, :gte, :lt, :lte, :like, :all, :any (see operators dict), and value is + a typed json value against which the operator is run. + + Examples: + + - GET /objects?filter=(message,:eq,"morning") returns "2" + - GET /objects?filter=(counts.1,:eq,3) returns "3" + + Compound expressions are supported: + + - GET /objects?filter=(_resource_paths,:any,(,:like,"/programs/%/projects/%")) + returns "1" and "3" + - GET /objects?filter=(counts,:all,(,:eq,42)) + returns "2" + + Boolean expressions are also supported: + + - GET /objects?filter=(or,(_uploader_id,:eq,"101"),(_uploader_id,:eq,"102")) + returns "1" and "2" + - GET /objects?filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello")) + returns "0", "1", and "2" + """ limit = min(limit, 2000) def add_filter(target_key, operator_name, other): + """ + XXX need a better name for other variable. with a scalar filter, other + is the value that the target value is compared against + + e.g. for the following requests, the arguments to add_filter: + * `GET /objects?filter=(_uploader_id,:eq,"57")` + - target_key is 'uploader_id' + - operator_name is ':eq' + - other is '57' + * `GET /objects?filter=(count,:gte,16)` + - target_key is 'count' + - operator_name is ':gte' + - other is 16 (note this is a python int) + * `GET /objects?filter=(_resource_paths,:any,(,:like,"/programs/foo/%"))` + - target_key is '_resource_paths' + - operator_name is ':any' + - other is {'name': '', 'op': ':like', 'val': '/programs/foo/%'} + """ + json_object = Metadata.data[list(target_key.split("."))] if type(other) is dict: @@ -134,13 +240,15 @@ def add_filter(target_key, operator_name, other): ): json_object = json_object.astext else: + # this is necessary to differentiate between strings, booleans, numbers, + # etc. 
in JSON other = cast(other, JSONB) return operators[operator_name]["sql_comparator"](json_object, other) # XXX get_any_sql_clause could possibly be used for a :has_key operation # w/ a little more work (e.g. does teams object {} have "bears" key: - # (teams,:any,(,:eq,"bears"))) (jsonb_object_keys + # (teams,:any,(,:eq,"bears"))) (would need to use jsonb_object_keys # https://www.postgresql.org/docs/9.5/functions-json.html#FUNCTIONS-JSON-OP-TABLE) def get_any_sql_clause(target_json_object, scalar_operator_name, scalar_other): if ( @@ -218,7 +326,12 @@ def parse_filter(filter_string): if not filter_string: return - # https://github.com/erikrose/parsimonious/pull/23/files + # XXX invalid filter param can result in a 500 + # e.g. `GET objects?filter=(_uploader_id,:eq"57")` (missing second comma) + + # json_value and below taken from https://github.com/erikrose/parsimonious/pull/23/files + # XXX need to allow for escaped strings + # XXX need some cleaning up grammar = Grammar( """ filter = scalar_filter / boolean_filter @@ -230,12 +343,12 @@ def parse_filter(filter_string): scalar_filter = open json_key separator operator separator json_value_or_scalar_filter close json_value_or_scalar_filter = json_value / scalar_filter json_key = ~"[A-Z 0-9_.]*"i - operator = ":eq" / ":ne" / ":gt" / ":gte" / ":lt" / ":lte" / ":is" / ":like" / ":all" / ":any" + operator = ":eq" / ":ne" / ":gte" / ":gt" / ":lte" / ":lt" / ":is" / ":like" / ":all" / ":any" open = "(" close = ")" separator = "," - json_value = true_false_null / number / doubleString + json_value = true_false_null / number / double_string true_false_null = "true" / "false" / "null" number = (int frac exp) / (int exp) / (int frac) / int int = "-"? 
((digit1to9 digits) / digit) @@ -246,7 +359,7 @@ def parse_filter(filter_string): digit1to9 = ~"[1-9]" digit = ~"[0-9]" - doubleString = ~'"([^"])*"' + double_string = ~'"([^"])*"' """ ) return grammar.parse(filter_string) @@ -301,6 +414,7 @@ def visit_json_value(self, node, visited_children): def generic_visit(self, node, visited_children): return visited_children or node + # XXX better name def get_clause(filter_dict): if not filter_dict: return @@ -328,12 +442,21 @@ def get_clause(filter_dict): query = Metadata.query if filter_dict: query = query.where(get_clause(filter_dict)) - return {metadata.guid: metadata.data for metadata in await query.gino.all()} + return { + metadata.guid: metadata.data + for metadata in await query.offset(offset).limit(limit).gino.all() + } else: query = db.select([Metadata.guid]) if filter_dict: query = query.where(get_clause(filter_dict)) - return [row[0] for row in await query.gino.return_model(False).all()] + return [ + row[0] + for row in await query.offset(offset) + .limit(limit) + .gino.return_model(False) + .all() + ] @mod.get("/metadata/{guid:path}") From a75914d878a9b3ca09c99435d8a0d20e56c5b7de Mon Sep 17 00:00:00 2001 From: johnfrancismccann Date: Wed, 16 Dec 2020 15:07:43 +0000 Subject: [PATCH 16/39] Apply automatic documentation changes --- docs/openapi.yaml | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 9cd760f6..700c855c 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -409,6 +409,63 @@ paths: tags: - Index /objects: + get: + description: XXX comments + operationId: get_objects_objects_get + parameters: + - description: Switch to returning a list of GUIDs (false), or GUIDs mapping + to their metadata (true). + in: query + name: data + required: false + schema: + default: false + description: Switch to returning a list of GUIDs (false), or GUIDs mapping + to their metadata (true). 
+ title: Data + type: boolean + - description: 'Maximum number of records returned. (max: 2000)' + in: query + name: limit + required: false + schema: + default: 10 + description: 'Maximum number of records returned. (max: 2000)' + title: Limit + type: integer + - description: Return results at this given offset. + in: query + name: offset + required: false + schema: + default: 0 + description: Return results at this given offset. + title: Offset + type: integer + - description: Filters to apply. + in: query + name: filter + required: false + schema: + default: '' + description: Filters to apply. + title: Filter + type: string + responses: + '200': + content: + application/json: + schema: {} + description: Successful Response + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + summary: Get Objects + tags: + - Object post: description: "Create object placeholder and attach metadata, return Upload url\ \ to the user.\n\nArgs:\n body (CreateObjInput): input body for create\ From 71506e64481dc8710bec9efb314b2e4f76375437 Mon Sep 17 00:00:00 2001 From: John McCann Date: Mon, 8 Mar 2021 13:26:53 -0800 Subject: [PATCH 17/39] feat(GET /objects): use data param --- src/mds/objects.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index 43de7289..c2538494 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -255,15 +255,11 @@ async def get_objects( ) records = {} - if metadata_objects: + if data and metadata_objects: try: endpoint_path = "/bulk/documents" full_endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path - guids = ( - list(metadata_objects.keys()) - if hasattr(metadata_objects, "keys") - else metadata_objects - ) + guids = list(metadata_objects.keys()) # XXX /bulk/documents endpoint in indexd currently doesn't support # filters response = await request.app.async_client.post(full_endpoint, 
json=guids) @@ -283,7 +279,7 @@ async def get_objects( endpoint_path, ) - if type(metadata_objects) is dict: + if data: response = { guid: {"record": records[guid] if guid in records else {}, "metadata": o} for guid, o in metadata_objects.items() From 79e2c7da3c63fdb9c43b0df5b7c7176a8f90ee2a Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 9 Mar 2021 15:25:24 -0800 Subject: [PATCH 18/39] feat(GET /objects): return items list in response --- src/mds/objects.py | 8 +++++--- src/mds/query.py | 27 +++++++++++++++------------ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index c2538494..f5a46d6b 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -259,7 +259,7 @@ async def get_objects( try: endpoint_path = "/bulk/documents" full_endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path - guids = list(metadata_objects.keys()) + guids = list(guid for guid, _ in metadata_objects) # XXX /bulk/documents endpoint in indexd currently doesn't support # filters response = await request.app.async_client.post(full_endpoint, json=guids) @@ -281,8 +281,10 @@ async def get_objects( if data: response = { - guid: {"record": records[guid] if guid in records else {}, "metadata": o} - for guid, o in metadata_objects.items() + "items": [ + {"record": records[guid] if guid in records else {}, "metadata": o} + for guid, o in metadata_objects + ] } else: response = metadata_objects diff --git a/src/mds/query.py b/src/mds/query.py index cd561d52..c2dfe4c2 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -5,10 +5,11 @@ from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.sql.operators import ColumnOperators from starlette.requests import Request -from starlette.status import HTTP_404_NOT_FOUND +from starlette.status import HTTP_400_BAD_REQUEST, HTTP_404_NOT_FOUND from parsimonious.grammar import Grammar from parsimonious.nodes import NodeVisitor +from parsimonious.exceptions import 
IncompleteParseError, ParseError, VisitationError from .models import db, Metadata @@ -263,7 +264,6 @@ def get_any_sql_clause(target_json_object, scalar_operator_name, scalar_other): f = f(target_json_object).alias() return exists( - # XXX select 1 db.select("*") .select_from(f) .where( @@ -274,8 +274,6 @@ def get_any_sql_clause(target_json_object, scalar_operator_name, scalar_other): ) def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): - # XXX should DRY duplication between get_any_sql_clause and - # get_all_sql_clause functions if ( "type" in operators[scalar_operator_name] and operators[scalar_operator_name]["type"] == "text" @@ -432,20 +430,25 @@ def get_clause(filter_dict): return add_filter(filter_dict["name"], filter_dict["op"], filter_dict["val"]) - filter_tree = parse_filter(filter) - filter_dict = {} - if filter_tree: - filter_tree_visitor = FilterVisitor() - filter_dict = filter_tree_visitor.visit(filter_tree) + try: + filter_tree = parse_filter(filter) + filter_dict = {} + if filter_tree: + filter_tree_visitor = FilterVisitor() + filter_dict = filter_tree_visitor.visit(filter_tree) + except (IncompleteParseError, ParseError, VisitationError): + raise HTTPException( + HTTP_400_BAD_REQUEST, f"filter URL query param syntax is invalid" + ) if data: query = Metadata.query if filter_dict: query = query.where(get_clause(filter_dict)) - return { - metadata.guid: metadata.data + return [ + (metadata.guid, metadata.data) for metadata in await query.offset(offset).limit(limit).gino.all() - } + ] else: query = db.select([Metadata.guid]) if filter_dict: From 80a12d59a41fa4cf2623e8b202bc9a51ea2239fb Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 9 Mar 2021 15:38:24 -0800 Subject: [PATCH 19/39] feat(GET /objects): restore search_metadata --- src/mds/query.py | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index c2dfe4c2..70fb3c72 
100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -73,29 +73,25 @@ async def search_metadata( if key not in {"data", "limit", "offset"}: queries.setdefault(key, []).append(value) - def add_filters(): - filter_param = "" - - if queries: - filter_param = "(and" - for path, values in queries.items(): - filter_param += ",(or" - for v in values: - # to maintain backwards compatibility for this endpoint, - # use :like instead of :eq because SQL LIKE is a textual - # operator. XXX will need to escape % - filter_param += f',({path},:like,"{v}")' - filter_param += ")" - filter_param += ")" - - return filter_param - - return await search_metadata_helper( - data=data, - limit=limit, - offset=offset, - filter=add_filters(), - ) + def add_filter(query): + for path, values in queries.items(): + query = query.where( + db.or_(Metadata.data[list(path.split("."))].astext == v for v in values) + ) + return query.offset(offset).limit(limit) + + if data: + return { + metadata.guid: metadata.data + for metadata in await add_filter(Metadata.query).gino.all() + } + else: + return [ + row[0] + for row in await add_filter(db.select([Metadata.guid])) + .gino.return_model(False) + .all() + ] async def search_metadata_helper( From 879e64a8028c46d31561793b0d32dc58698de095 Mon Sep 17 00:00:00 2001 From: John McCann Date: Thu, 11 Mar 2021 15:56:12 -0800 Subject: [PATCH 20/39] feat(GET /objects): clean up parsing grammar --- src/mds/query.py | 115 ++++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index 70fb3c72..19cb1308 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -105,8 +105,6 @@ async def search_metadata_helper( ), offset: int = Query(0, description="Return results at this given offset."), # XXX description - # XXX how to name this python variable something other than filter but - # still have client use "filter" as URL query param? 
filter: str = Query("", description="Filters to apply."), ): """ @@ -201,34 +199,34 @@ async def search_metadata_helper( """ limit = min(limit, 2000) - def add_filter(target_key, operator_name, other): + def add_filter(target_key, operator_name, right_operand): """ - XXX need a better name for other variable. with a scalar filter, other - is the value that the target value is compared against + with a scalar filter, right_operand is the value that the target value + is compared against e.g. for the following requests, the arguments to add_filter: * `GET /objects?filter=(_uploader_id,:eq,"57")` - target_key is 'uploader_id' - operator_name is ':eq' - - other is '57' + - right_operand is '57' * `GET /objects?filter=(count,:gte,16)` - target_key is 'count' - operator_name is ':gte' - - other is 16 (note this is a python int) + - right_operand is 16 (note this is a python int) * `GET /objects?filter=(_resource_paths,:any,(,:like,"/programs/foo/%"))` - target_key is '_resource_paths' - operator_name is ':any' - - other is {'name': '', 'op': ':like', 'val': '/programs/foo/%'} + - right_operand is {'name': '', 'op': ':like', 'val': '/programs/foo/%'} """ json_object = Metadata.data[list(target_key.split("."))] - if type(other) is dict: - # since other isn't a primitive type, it means that this is a compound + if type(right_operand) is dict: + # since right_operand isn't a primitive type, it means that this is a compound # operator # (e.g. GET objects?filter=(_resource_paths,:any,(,:like,"/programs/%")) ) return operators[operator_name]["sql_clause"]( - json_object, other["op"], other["val"] + json_object, right_operand["op"], right_operand["val"] ) if ( @@ -239,15 +237,18 @@ def add_filter(target_key, operator_name, other): else: # this is necessary to differentiate between strings, booleans, numbers, # etc. 
in JSON - other = cast(other, JSONB) + right_operand = cast(right_operand, JSONB) - return operators[operator_name]["sql_comparator"](json_object, other) + return operators[operator_name]["sql_comparator"](json_object, right_operand) - # XXX get_any_sql_clause could possibly be used for a :has_key operation - # w/ a little more work (e.g. does teams object {} have "bears" key: - # (teams,:any,(,:eq,"bears"))) (would need to use jsonb_object_keys - # https://www.postgresql.org/docs/9.5/functions-json.html#FUNCTIONS-JSON-OP-TABLE) + # XXX docstring needs more detail def get_any_sql_clause(target_json_object, scalar_operator_name, scalar_other): + """ + get_any_sql_clause could possibly be used for a :has_key operation w/ a + little more work (e.g. does teams object {} have "bears" key: + (teams,:any,(,:eq,"bears"))) (would need to use jsonb_object_keys + https://www.postgresql.org/docs/9.5/functions-json.html#FUNCTIONS-JSON-OP-TABLE) + """ if ( "type" in operators[scalar_operator_name] and operators[scalar_operator_name]["type"] == "text" @@ -293,7 +294,6 @@ def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): ), ) - # XXX aggregate operators might be nice (e.g size, max, etc.) operators = { ":eq": {"sql_comparator": ColumnOperators.__eq__}, ":ne": {"sql_comparator": ColumnOperators.__ne__}, @@ -301,13 +301,6 @@ def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): ":gte": {"sql_comparator": ColumnOperators.__ge__}, ":lt": {"sql_comparator": ColumnOperators.__lt__}, ":lte": {"sql_comparator": ColumnOperators.__le__}, - # XXX :is is not working - # XXX what does SQL IS mean? - # XXX check how mongoDB filters for null (does it use it?, if an object - # doesn't have a given key, is that considered null? etc.) - ":is": {"sql_comparator": ColumnOperators.is_}, - # XXX :in is probably unnecessary (i.e. 
instead of (score, :in, [1, 2]) do (or(score,:eq,1),(score,:eq,2))) - # ":in": ColumnOperators.in_, ":like": { "type": "text", "sql_comparator": ColumnOperators.like, @@ -320,31 +313,33 @@ def parse_filter(filter_string): if not filter_string: return - # XXX invalid filter param can result in a 500 - # e.g. `GET objects?filter=(_uploader_id,:eq"57")` (missing second comma) - - # json_value and below taken from https://github.com/erikrose/parsimonious/pull/23/files - # XXX need to allow for escaped strings - # XXX need some cleaning up + # json_value and below was taken from + # https://github.com/erikrose/parsimonious/pull/23/files grammar = Grammar( """ - filter = scalar_filter / boolean_filter + filter = scalar_filter / compound_filter / boolean_filter boolean_filter = open boolean boolean_operands close boolean = "and" / "or" boolean_operands = boolean_operand+ - boolean_operand = separator scalar_filter_or_boolean_filter - scalar_filter_or_boolean_filter = scalar_filter / boolean_filter - scalar_filter = open json_key separator operator separator json_value_or_scalar_filter close + boolean_operand = separator specific_filter + specific_filter = scalar_filter / compound_filter / boolean_filter + scalar_filter = open json_key separator scalar_operator separator json_value close + compound_filter = open json_key separator compound_operator separator scalar_filter close json_value_or_scalar_filter = json_value / scalar_filter + json_key = ~"[A-Z 0-9_.]*"i - operator = ":eq" / ":ne" / ":gte" / ":gt" / ":lte" / ":lt" / ":is" / ":like" / ":all" / ":any" + scalar_operator = ":eq" / ":ne" / ":gte" / ":gt" / ":lte" / ":lt" / ":like" + compound_operator = ":all" / ":any" open = "(" close = ")" separator = "," - json_value = true_false_null / number / double_string + json_value = double_string / true_false_null / number + + double_string = ~'"([^"])*"' true_false_null = "true" / "false" / "null" number = (int frac exp) / (int exp) / (int frac) / int + int = "-"? 
((digit1to9 digits) / digit) frac = "." digits exp = e digits @@ -352,8 +347,6 @@ def parse_filter(filter_string): e = "e+" / "e-" / "e" / "E+" / "E-" / "E" digit1to9 = ~"[1-9]" digit = ~"[0-9]" - - double_string = ~'"([^"])*"' """ ) return grammar.parse(filter_string) @@ -370,10 +363,10 @@ def visit_boolean(self, node, visited_children): return node.text def visit_boolean_operand(self, node, visited_children): - _, scalar_filter_or_boolean_filter = visited_children - return scalar_filter_or_boolean_filter + _, specific_filter = visited_children + return specific_filter - def visit_scalar_filter_or_boolean_filter(self, node, visited_children): + def visit_specific_filter(self, node, visited_children): return visited_children[0] def visit_scalar_filter(self, node, visited_children): @@ -383,14 +376,31 @@ def visit_scalar_filter(self, node, visited_children): _, operator, _, - json_value_or_scalar_filter, + json_value, + _, + ) = visited_children + + return { + "name": json_key, + "op": operator, + "val": json_value, + } + + def visit_compound_filter(self, node, visited_children): + ( + _, + json_key, + _, + operator, + _, + scalar_filter, _, ) = visited_children return { "name": json_key, "op": operator, - "val": json_value_or_scalar_filter, + "val": scalar_filter, } def visit_json_value_or_scalar_filter(self, node, visited_children): @@ -399,7 +409,10 @@ def visit_json_value_or_scalar_filter(self, node, visited_children): def visit_json_key(self, node, visited_children): return node.text - def visit_operator(self, node, visited_children): + def visit_scalar_operator(self, node, visited_children): + return node.text + + def visit_compound_operator(self, node, visited_children): return node.text def visit_json_value(self, node, visited_children): @@ -408,16 +421,16 @@ def visit_json_value(self, node, visited_children): def generic_visit(self, node, visited_children): return visited_children or node - # XXX better name - def get_clause(filter_dict): + def 
get_sqlalchemy_clause(filter_dict): if not filter_dict: return if len(filter_dict) == 1: boolean = list(filter_dict.keys())[0] - boolean_operand_clauses = (get_clause(c) for c in filter_dict[boolean]) - # XXX define these boolean operators in one place + boolean_operand_clauses = ( + get_sqlalchemy_clause(c) for c in filter_dict[boolean] + ) if boolean == "and": return db.and_(boolean_operand_clauses) elif boolean == "or": @@ -440,7 +453,7 @@ def get_clause(filter_dict): if data: query = Metadata.query if filter_dict: - query = query.where(get_clause(filter_dict)) + query = query.where(get_sqlalchemy_clause(filter_dict)) return [ (metadata.guid, metadata.data) for metadata in await query.offset(offset).limit(limit).gino.all() @@ -448,7 +461,7 @@ def get_clause(filter_dict): else: query = db.select([Metadata.guid]) if filter_dict: - query = query.where(get_clause(filter_dict)) + query = query.where(get_sqlalchemy_clause(filter_dict)) return [ row[0] for row in await query.offset(offset) From 1b00631f334dd59dc0e92ea4a6126958a131a48b Mon Sep 17 00:00:00 2001 From: John McCann Date: Fri, 12 Mar 2021 15:47:21 -0800 Subject: [PATCH 21/39] test(GET /objects): add filter tests --- tests/sample_objects.json | 43 +++++++++++++++ tests/test_objects.py | 110 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 tests/sample_objects.json diff --git a/tests/sample_objects.json b/tests/sample_objects.json new file mode 100644 index 00000000..0c3412a5 --- /dev/null +++ b/tests/sample_objects.json @@ -0,0 +1,43 @@ +{ + "0": { + "message": "hello", + "_uploader_id": "100", + "_resource_paths": [ + "/programs/a", + "/programs/b" + ], + "pet": "dog" + }, + "1": { + "message": "greetings", + "_uploader_id": "101", + "_resource_paths": [ + "/open", + "/programs/c/projects/a" + ], + "pet": "ferret", + "sport": "soccer" + }, + "2": { + "message": "morning", + "_uploader_id": "102", + "_resource_paths": [ + "/programs/d", + "/programs/e" + ], + 
"counts": [42, 42, 42], + "pet": "ferret", + "sport": "soccer" + }, + "3": { + "message": "evening", + "_uploader_id": "103", + "_resource_paths": [ + "/programs/f/projects/a", + "/admin" + ], + "counts": [1, 3, 5], + "pet": "ferret", + "sport": "basketball" + } +} diff --git a/tests/test_objects.py b/tests/test_objects.py index f7da81f9..bfe94cc2 100644 --- a/tests/test_objects.py +++ b/tests/test_objects.py @@ -1,4 +1,5 @@ import pytest +import json import httpx import respx @@ -1019,3 +1020,112 @@ def test_get_object_latest_when_indexd_returns_404_and_guid_not_in_mds( resp = client.get(latest_setup["mds_latest_endpoint_with_non_mds_guid"]) assert get_indexd_latest_request_mock.called assert resp.status_code == 404, resp.text + + +def setup_metadata_objects(client_fixture): + with open("tests/sample_objects.json") as json_file: + data = json.load(json_file) + + for guid, mds_object in data.items(): + client_fixture.post("/metadata/" + guid, json=mds_object).raise_for_status() + + +def teardown_metadata_objects(client_fixture): + with open("tests/sample_objects.json") as json_file: + data = json.load(json_file) + + for guid in data.keys(): + client_fixture.delete("/metadata/" + guid).raise_for_status() + + +def test_get_objects_filter_by_message_equals(client): + + try: + setup_metadata_objects(client) + + response = client.get( + '/objects?data=true&filter=(message,:eq,"morning")' + ).json() + assert len(response["items"]) == 1 + assert response["items"][0]["metadata"]["message"] == "morning" + finally: + teardown_metadata_objects(client) + + +def test_get_objects_filter_by_counts_index_equals(client): + + try: + setup_metadata_objects(client) + + response = client.get("/objects?data=true&filter=(counts.1,:eq,3)").json() + assert len(response["items"]) == 1 + assert response["items"][0]["metadata"]["counts"][1] == 3 + finally: + teardown_metadata_objects(client) + + +def test_get_objects_filter_by_any_and_like_with_resource_paths(client): + + try: + 
setup_metadata_objects(client) + + response = client.get( + '/objects?data=true&filter=(_resource_paths,:any,(,:like,"/programs/%/projects/%"))' + ).json() + response["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + + assert len(response["items"]) == 2 + assert response["items"][0]["metadata"]["_uploader_id"] == "101" + assert response["items"][1]["metadata"]["_uploader_id"] == "103" + finally: + teardown_metadata_objects(client) + + +def test_get_objects_filter_by_counts_all_equal_42(client): + + try: + setup_metadata_objects(client) + + response = client.get( + "/objects?data=true&filter=(counts,:all,(,:eq,42))" + ).json() + + assert len(response["items"]) == 1 + assert response["items"][0]["metadata"]["_uploader_id"] == "102" + finally: + teardown_metadata_objects(client) + + +def test_get_objects_filter_by_or_two_different_uploader_ids(client): + + try: + setup_metadata_objects(client) + + response = client.get( + '/objects?data=true&filter=(or,(_uploader_id,:eq,"101"),(_uploader_id,:eq,"102"))' + ).json() + response["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + + assert len(response["items"]) == 2 + assert response["items"][0]["metadata"]["_uploader_id"] == "101" + assert response["items"][1]["metadata"]["_uploader_id"] == "102" + finally: + teardown_metadata_objects(client) + + +def test_get_objects_filter_by_compound_boolean_filter(client): + + try: + setup_metadata_objects(client) + + response = client.get( + '/objects?data=true&filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello"))' + ).json() + response["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + + assert len(response["items"]) == 3 + assert response["items"][0]["metadata"]["_uploader_id"] == "100" + assert response["items"][1]["metadata"]["_uploader_id"] == "101" + assert response["items"][2]["metadata"]["_uploader_id"] == "102" + finally: + teardown_metadata_objects(client) From 9a350c4ceddbadb0d605ac4f836f3efbf858cef5 Mon Sep 17 
00:00:00 2001 From: John McCann Date: Sun, 14 Mar 2021 15:05:02 -0700 Subject: [PATCH 22/39] test(GET /objects): use resp_json variable --- tests/test_objects.py | 99 ++++++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 34 deletions(-) diff --git a/tests/test_objects.py b/tests/test_objects.py index bfe94cc2..302aa579 100644 --- a/tests/test_objects.py +++ b/tests/test_objects.py @@ -1043,11 +1043,12 @@ def test_get_objects_filter_by_message_equals(client): try: setup_metadata_objects(client) - response = client.get( - '/objects?data=true&filter=(message,:eq,"morning")' - ).json() - assert len(response["items"]) == 1 - assert response["items"][0]["metadata"]["message"] == "morning" + resp = client.get('/objects?data=true&filter=(message,:eq,"morning")') + resp_json = resp.json() + + assert resp.status_code == 200 + assert len(resp_json["items"]) == 1 + assert resp_json["items"][0]["metadata"]["message"] == "morning" finally: teardown_metadata_objects(client) @@ -1057,9 +1058,12 @@ def test_get_objects_filter_by_counts_index_equals(client): try: setup_metadata_objects(client) - response = client.get("/objects?data=true&filter=(counts.1,:eq,3)").json() - assert len(response["items"]) == 1 - assert response["items"][0]["metadata"]["counts"][1] == 3 + resp = client.get("/objects?data=true&filter=(counts.1,:eq,3)") + resp_json = resp.json() + + assert resp.status_code == 200 + assert len(resp_json["items"]) == 1 + assert resp_json["items"][0]["metadata"]["counts"][1] == 3 finally: teardown_metadata_objects(client) @@ -1069,14 +1073,16 @@ def test_get_objects_filter_by_any_and_like_with_resource_paths(client): try: setup_metadata_objects(client) - response = client.get( + resp = client.get( '/objects?data=true&filter=(_resource_paths,:any,(,:like,"/programs/%/projects/%"))' - ).json() - response["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) - - assert len(response["items"]) == 2 - assert 
response["items"][0]["metadata"]["_uploader_id"] == "101" - assert response["items"][1]["metadata"]["_uploader_id"] == "103" + ) + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + + assert resp.status_code == 200 + assert len(resp_json["items"]) == 2 + assert resp_json["items"][0]["metadata"]["_uploader_id"] == "101" + assert resp_json["items"][1]["metadata"]["_uploader_id"] == "103" finally: teardown_metadata_objects(client) @@ -1086,12 +1092,12 @@ def test_get_objects_filter_by_counts_all_equal_42(client): try: setup_metadata_objects(client) - response = client.get( - "/objects?data=true&filter=(counts,:all,(,:eq,42))" - ).json() + resp = client.get("/objects?data=true&filter=(counts,:all,(,:eq,42))") + resp_json = resp.json() - assert len(response["items"]) == 1 - assert response["items"][0]["metadata"]["_uploader_id"] == "102" + assert resp.status_code == 200 + assert len(resp_json["items"]) == 1 + assert resp_json["items"][0]["metadata"]["_uploader_id"] == "102" finally: teardown_metadata_objects(client) @@ -1101,14 +1107,16 @@ def test_get_objects_filter_by_or_two_different_uploader_ids(client): try: setup_metadata_objects(client) - response = client.get( + resp = client.get( '/objects?data=true&filter=(or,(_uploader_id,:eq,"101"),(_uploader_id,:eq,"102"))' - ).json() - response["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) - - assert len(response["items"]) == 2 - assert response["items"][0]["metadata"]["_uploader_id"] == "101" - assert response["items"][1]["metadata"]["_uploader_id"] == "102" + ) + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + + assert resp.status_code == 200 + assert len(resp_json["items"]) == 2 + assert resp_json["items"][0]["metadata"]["_uploader_id"] == "101" + assert resp_json["items"][1]["metadata"]["_uploader_id"] == "102" finally: teardown_metadata_objects(client) @@ -1118,14 +1126,37 @@ def 
test_get_objects_filter_by_compound_boolean_filter(client): try: setup_metadata_objects(client) - response = client.get( + resp = client.get( '/objects?data=true&filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello"))' - ).json() - response["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + ) + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + + assert resp.status_code == 200 + assert len(resp_json["items"]) == 3 + assert resp_json["items"][0]["metadata"]["_uploader_id"] == "100" + assert resp_json["items"][1]["metadata"]["_uploader_id"] == "101" + assert resp_json["items"][2]["metadata"]["_uploader_id"] == "102" + finally: + teardown_metadata_objects(client) + + +def test_get_objects_with_data_param_equal_false(client): - assert len(response["items"]) == 3 - assert response["items"][0]["metadata"]["_uploader_id"] == "100" - assert response["items"][1]["metadata"]["_uploader_id"] == "101" - assert response["items"][2]["metadata"]["_uploader_id"] == "102" + try: + setup_metadata_objects(client) + + resp = client.get("/objects?data=false") + resp_json = resp.json() + resp_json.sort() + + assert resp.status_code == 200 + assert resp_json == ["0", "1", "2", "3"] finally: teardown_metadata_objects(client) + + +def test_get_objects_raises_a_400_for_invalid_filter(client): + + resp = client.get('/objects?data=true&filter=(message,"morning")') + assert resp.status_code == 400 From 7c9009643bdd26bdd4b1b46e23a54c343f76aaa9 Mon Sep 17 00:00:00 2001 From: John McCann Date: Mon, 15 Mar 2021 16:30:31 -0700 Subject: [PATCH 23/39] docs(GET /objects): add function docstrings --- src/mds/objects.py | 10 +-- src/mds/query.py | 151 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 125 insertions(+), 36 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index f5a46d6b..f4118598 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -229,21 +229,17 @@ async def 
create_object_for_id( @mod.get("/objects") async def get_objects( request: Request, - # XXX would be nice to be able to specify whether to return indexd records - # (e.g GET objects?data=metadata would only return mds objects) data: bool = Query( False, description="Switch to returning a list of GUIDs (false), " "or GUIDs mapping to their metadata (true).", ), + # XXX description + page: int = Query(0, description="paginate"), limit: int = Query( 10, description="Maximum number of records returned. (max: 2000)" ), - offset: int = Query(0, description="Return results at this given offset."), # XXX description - # XXX how to name this python variable something other than filter but - # still have client use "filter" as URL query param? (bc filter is already - # built in to Python) filter: str = Query("", description="Filters to apply."), ) -> JSONResponse: """ @@ -251,7 +247,7 @@ async def get_objects( """ metadata_objects = await search_metadata_helper( - data=data, limit=limit, offset=offset, filter=filter + data=data, page=page, limit=limit, filter=filter ) records = {} diff --git a/src/mds/query.py b/src/mds/query.py index 19cb1308..07a332c3 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -94,18 +94,12 @@ def add_filter(query): ] +# XXX move long docstring to objects.py async def search_metadata_helper( - data: bool = Query( - False, - description="Switch to returning a list of GUIDs (false), " - "or GUIDs mapping to their metadata (true).", - ), - limit: int = Query( - 10, description="Maximum number of records returned. 
(max: 2000)" - ), - offset: int = Query(0, description="Return results at this given offset."), - # XXX description - filter: str = Query("", description="Filters to apply."), + data: bool = False, + page: int = 0, + limit: int = 10, + filter: str = "", ): """ Helper to search for metadata objects based on filters provided in filter @@ -197,16 +191,27 @@ async def search_metadata_helper( returns "0", "1", and "2" """ + # XXX change to 1024 limit = min(limit, 2000) def add_filter(target_key, operator_name, right_operand): """ - with a scalar filter, right_operand is the value that the target value - is compared against + Wrapper around operators dict. Decides if a scalar or compound operator + needs to be instantiated based on right_operand. + + Args: + target_key (str): the json key which we are filtering based on + + operator_name (str): the name of the operator (leading colon included) + + right_operand (str, bool, int, float, dict): With a scalar filter (i.e. + when it's not a dict), the value that the target value is compared + against. With a compound filter (i.e. when it's a dict), the nested + filter that's run on the target key. e.g. 
for the following requests, the arguments to add_filter: * `GET /objects?filter=(_uploader_id,:eq,"57")` - - target_key is 'uploader_id' + - target_key is '_uploader_id' - operator_name is ':eq' - right_operand is '57' * `GET /objects?filter=(count,:gte,16)` @@ -217,6 +222,9 @@ def add_filter(target_key, operator_name, right_operand): - target_key is '_resource_paths' - operator_name is ':any' - right_operand is {'name': '', 'op': ':like', 'val': '/programs/foo/%'} + + Returns: + SQLAlchemy object """ json_object = Metadata.data[list(target_key.split("."))] @@ -241,12 +249,26 @@ def add_filter(target_key, operator_name, right_operand): return operators[operator_name]["sql_comparator"](json_object, right_operand) - # XXX docstring needs more detail - def get_any_sql_clause(target_json_object, scalar_operator_name, scalar_other): + def get_any_sql_clause(target_json_key, scalar_operator_name, scalar_right_operand): """ - get_any_sql_clause could possibly be used for a :has_key operation w/ a - little more work (e.g. does teams object {} have "bears" key: - (teams,:any,(,:eq,"bears"))) (would need to use jsonb_object_keys + Generate a SQLAlchemy clause that corresponds to the :any operation. + + Args: + target_json_key (str): the json key pointing to the array which we + are filtering based on + + scalar_operator_name (str): the name of the operator (leading colon + included) (see operators dict for a list of valid operators) + + scalar_right_operand (str, bool, int, float, dict): the value that + each element in the target_json_key array is compared against + + Returns: + SQLAlchemy clause corresponding to the :any operation. + + Note: get_any_sql_clause could possibly be used for a :has_key + operation w/ a little more work (e.g. 
does teams object {} have "bears" + key: (teams,:any,(,:eq,"bears"))) (would need to use jsonb_object_keys https://www.postgresql.org/docs/9.5/functions-json.html#FUNCTIONS-JSON-OP-TABLE) """ if ( @@ -256,21 +278,37 @@ def get_any_sql_clause(target_json_object, scalar_operator_name, scalar_other): f = db.func.jsonb_array_elements_text else: f = db.func.jsonb_array_elements - scalar_other = cast(scalar_other, JSONB) + scalar_right_operand = cast(scalar_right_operand, JSONB) - f = f(target_json_object).alias() + f = f(target_json_key).alias() return exists( db.select("*") .select_from(f) .where( operators[scalar_operator_name]["sql_comparator"]( - column(f.name), scalar_other + column(f.name), scalar_right_operand ) ) ) - def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): + def get_all_sql_clause(target_json_key, scalar_operator_name, scalar_right_operand): + """ + Generate a SQLAlchemy clause that corresponds to the :all operation. + + Args: + target_json_key (str): the json key pointing to the array which we + are filtering based on + + scalar_operator_name (str): the name of the operator (leading colon + included) (see operators dict for a list of valid operators) + + scalar_right_operand (str, bool, int, float, dict): the value that + each element in the target_json_key array is compared against + + Returns: + SQLAlchemy clause corresponding to the :all operation. 
+ """ if ( "type" in operators[scalar_operator_name] and operators[scalar_operator_name]["type"] == "text" @@ -278,10 +316,10 @@ def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): f = db.func.jsonb_array_elements_text else: f = db.func.jsonb_array_elements - scalar_other = cast(scalar_other, JSONB) + scalar_right_operand = cast(scalar_right_operand, JSONB) - f = f(target_json_object).alias() - count_f = db.func.jsonb_array_length(target_json_object) + f = f(target_json_key).alias() + count_f = db.func.jsonb_array_length(target_json_key) return ColumnOperators.__eq__( count_f, @@ -289,7 +327,7 @@ def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): .select_from(f) .where( operators[scalar_operator_name]["sql_comparator"]( - column(f.name), scalar_other + column(f.name), scalar_right_operand ) ), ) @@ -310,6 +348,17 @@ def get_all_sql_clause(target_json_object, scalar_operator_name, scalar_other): } def parse_filter(filter_string): + """ + Parse filter_string and return an abstract syntax tree representation. + + Args: + filter_string (str): the filter string to parse + (e.g. '(_uploader_id,:eq,"57")' ) + + Returns: + A parsimonious abstract syntax tree representation of + filter_string. + """ if not filter_string: return @@ -403,6 +452,7 @@ def visit_compound_filter(self, node, visited_children): "val": scalar_filter, } + # XXX no longer used, remove def visit_json_value_or_scalar_filter(self, node, visited_children): return visited_children[0] @@ -422,6 +472,49 @@ def generic_visit(self, node, visited_children): return visited_children or node def get_sqlalchemy_clause(filter_dict): + """ + Return a SQLAlchemy WHERE clause representing filter_dict. 
+ + Args: + filter_dict(dict): + + when filter_dict is a scalar: + { + "name": "_resource_paths.1", + "op": ":like", + "val": "/programs/%" + } + + when filter_dict is a compound: + { + "name": "_resource_paths", + "op": ":any", + "val": { + "name": "", + "op": ":like", + "val": "/programs/%" + } + } + + when filter_dict is a boolean compound: + { + "or": [ + { + "name": "_uploader_id", + "op": ":eq", + "val": "101" + }, + { + "name": "_uploader_id", + "op": ":eq", + "val": "102" + } + ] + } + + Returns: + A SQLAlchemy WHERE clause representing filter_dict. + """ if not filter_dict: return @@ -456,7 +549,7 @@ def get_sqlalchemy_clause(filter_dict): query = query.where(get_sqlalchemy_clause(filter_dict)) return [ (metadata.guid, metadata.data) - for metadata in await query.offset(offset).limit(limit).gino.all() + for metadata in await query.offset(page * limit).limit(limit).gino.all() ] else: query = db.select([Metadata.guid]) @@ -464,7 +557,7 @@ def get_sqlalchemy_clause(filter_dict): query = query.where(get_sqlalchemy_clause(filter_dict)) return [ row[0] - for row in await query.offset(offset) + for row in await query.offset(page * limit) .limit(limit) .gino.return_model(False) .all() From 20f00ff02af476e4e271a5d97bd512d64a7153d2 Mon Sep 17 00:00:00 2001 From: johnfrancismccann Date: Mon, 15 Mar 2021 23:31:43 +0000 Subject: [PATCH 24/39] Apply automatic documentation changes --- docs/openapi.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 700c855c..e2c9fe31 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -424,6 +424,15 @@ paths: to their metadata (true). title: Data type: boolean + - description: paginate + in: query + name: page + required: false + schema: + default: 0 + description: paginate + title: Page + type: integer - description: 'Maximum number of records returned. 
(max: 2000)' in: query name: limit @@ -433,15 +442,6 @@ paths: description: 'Maximum number of records returned. (max: 2000)' title: Limit type: integer - - description: Return results at this given offset. - in: query - name: offset - required: false - schema: - default: 0 - description: Return results at this given offset. - title: Offset - type: integer - description: Filters to apply. in: query name: filter From 81aedb8c451ac04a509cd5f5b754bb1e648ab864 Mon Sep 17 00:00:00 2001 From: John McCann Date: Mon, 15 Mar 2021 21:49:39 -0700 Subject: [PATCH 25/39] feat(GET /objects): use advanced_search_metadata --- src/mds/objects.py | 109 ++++++++++++++++++++++++++++++++++++++++----- src/mds/query.py | 103 +++++++----------------------------------- 2 files changed, 116 insertions(+), 96 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index f4118598..36e93e68 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -22,7 +22,7 @@ from . import config, logger from .models import Metadata -from .query import get_metadata, search_metadata_helper +from .query import get_metadata, advanced_search_metadata mod = APIRouter() @@ -230,23 +230,112 @@ async def create_object_for_id( async def get_objects( request: Request, data: bool = Query( - False, + True, description="Switch to returning a list of GUIDs (false), " - "or GUIDs mapping to their metadata (true).", + "or metadata objects (true).", + ), + page: int = Query( + 0, + description="The offset for what objects are returned " + "(zero-indexed). The exact offset will be equal to " + "page*limit (e.g. with page=1, limit=15, 15 objects " + "beginning at index 15 will be returned).", ), - # XXX description - page: int = Query(0, description="paginate"), limit: int = Query( - 10, description="Maximum number of records returned. (max: 2000)" + 10, + description="Maximum number of records returned (max: 2000). 
" + "Also used with page to determine page size.", + ), + filter: str = Query( + "", + description="The filter(s) that will be applied to the " + "result (more detail in the docstring).", ), - # XXX description - filter: str = Query("", description="Filters to apply."), ) -> JSONResponse: """ - XXX comments + The filtering functionality was primarily driven by the requirement that a + user be able to get all objects having an authz resource matching a + user-supplied pattern at any index in the "_resource_paths" array. + + For example, given the following metadata objects: + + { + "0": { + "message": "hello", + "_uploader_id": "100", + "_resource_paths": [ + "/programs/a", + "/programs/b" + ], + "pet": "dog" + }, + "1": { + "message": "greetings", + "_uploader_id": "101", + "_resource_paths": [ + "/open", + "/programs/c/projects/a" + ], + "pet": "ferret", + "sport": "soccer" + }, + "2": { + "message": "morning", + "_uploader_id": "102", + "_resource_paths": [ + "/programs/d", + "/programs/e" + ], + "counts": [42, 42, 42], + "pet": "ferret", + "sport": "soccer" + }, + "3": { + "message": "evening", + "_uploader_id": "103", + "_resource_paths": [ + "/programs/f/projects/a", + "/admin" + ], + "counts": [1, 3, 5], + "pet": "ferret", + "sport": "basketball" + } + } + + how do we design a filtering interface that allows the user to get all + objects having an authz string matching the pattern + "/programs/%/projects/%" at any index in its "_resource_paths" array? (% + has been used as the wildcard so far because that's what Postgres uses as + the wildcard for LIKE) In this case, the "1" and "3" objects should be + returned. + + The filter syntax that was arrived at ending up following the syntax + specified by a [Node JS implementation](https://www.npmjs.com/package/json-api#filtering) of the [JSON:API + specification](https://jsonapi.org/). 
+ + The format for this syntax is filter=(field_name,operator,value), in which + the field_name is a json key without quotes, operator is one of :eq, :ne, + :gt, :gte, :lt, :lte, :like, :all, :any (see operators dict), and value is + a typed json value against which the operator is run. + + Examples: + + GET /objects?filter=(message,:eq,"morning") returns "2" + GET /objects?filter=(counts.1,:eq,3) returns "3" + + Compound expressions are supported: + + GET /objects?filter=(_resource_paths,:any,(,:like,"/programs/%/projects/%")) returns "1" and "3" + GET /objects?filter=(counts,:all,(,:eq,42)) returns "2" + + Boolean expressions are also supported: + + GET /objects?filter=(or,(_uploader_id,:eq,"101"),(_uploader_id,:eq,"102")) returns "1" and "2" + GET /objects?filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello")) returns "0", "1", and "2" """ - metadata_objects = await search_metadata_helper( + metadata_objects = await advanced_search_metadata( data=data, page=page, limit=limit, filter=filter ) diff --git a/src/mds/query.py b/src/mds/query.py index 07a332c3..dcdd20cb 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -94,102 +94,33 @@ def add_filter(query): ] -# XXX move long docstring to objects.py -async def search_metadata_helper( - data: bool = False, +async def advanced_search_metadata( + data: bool = True, page: int = 0, limit: int = 10, filter: str = "", ): """ - Helper to search for metadata objects based on filters provided in filter - param. - - The filtering functionality was primarily driven by the requirement that a - user be able to get all objects having an authz resource matching a - user-supplied pattern at any index in the "_resource_paths" array. 
For - example, given the following metadata objects: - - { - "0": { - "message": "hello", - "_uploader_id": "100", - "_resource_paths": [ - "/programs/a", - "/programs/b" - ], - "pet": "dog" - }, - "1": { - "message": "greetings", - "_uploader_id": "101", - "_resource_paths": [ - "/open", - "/programs/c/projects/a" - ], - "pet": "ferret", - "sport": "soccer" - }, - "2": { - "message": "morning", - "_uploader_id": "102", - "_resource_paths": [ - "/programs/d", - "/programs/e" - ], - "counts": [42, 42, 42], - "pet": "ferret", - "sport": "soccer" - }, - "3": { - "message": "evening", - "_uploader_id": "103", - "_resource_paths": [ - "/programs/f/projects/a", - "/admin" - ], - "counts": [1, 3, 5], - "pet": "ferret", - "sport": "basketball" - } - } - - how do we design a filtering interface that allows the user to get all - objects having an authz string matching the pattern - "/programs/%/projects/%" at any index in its "_resource_paths" array? (% - has been used as the wildcard so far because that's what Postgres uses as - the wildcard for LIKE) In this case, the "1" and "3" objects should be - returned. - - The filter syntax that was arrived at ending up following the syntax - specified by a Node JS implementation - (https://www.npmjs.com/package/json-api#filtering) of the JSON:API - specification (https://jsonapi.org/). - - The format for this syntax is filter=(field_name,operator,value), in which - the field_name is a json key without quotes, operator is one of :eq, :ne, - :gt, :gte, :lt, :lte, :like, :all, :any (see operators dict), and value is - a typed json value against which the operator is run. - - Examples: - - - GET /objects?filter=(message,:eq,"morning") returns "2" - - GET /objects?filter=(counts.1,:eq,3) returns "3" + Provides for more advanced searching of metadata based on filter param. + Please see get_objects function for more documentation of how filter param + works. 
- Compound expressions are supported: + Args: + data (bool): Switch to returning a list of GUIDs (false), or metadata + objects (true). - - GET /objects?filter=(_resource_paths,:any,(,:like,"/programs/%/projects/%")) - returns "1" and "3" - - GET /objects?filter=(counts,:all,(,:eq,42)) - returns "2" + page (int): The offset for what objects are returned (zero-indexed). + The exact offset will be equal to page*limit (e.g. with page=1, + limit=15, 15 objects beginning at index 15 will be returned). - Boolean expressions are also supported: + limit (int): Maximum number of records returned (max: 2000). Also used + with page to determine page size. - - GET /objects?filter=(or,(_uploader_id,:eq,"101"),(_uploader_id,:eq,"102")) - returns "1" and "2" - - GET /objects?filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello")) - returns "0", "1", and "2" + filter (str): The filter(s) that will be applied to the result. + Returns: + if data=True, a list of (guid, metadata object) tuples + if data=False, a list of guids """ # XXX change to 1024 limit = min(limit, 2000) From aec9b9e163e7622cbb65ed03241c4554738903bd Mon Sep 17 00:00:00 2001 From: johnfrancismccann Date: Tue, 16 Mar 2021 04:51:02 +0000 Subject: [PATCH 26/39] Apply automatic documentation changes --- docs/openapi.yaml | 70 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index e2c9fe31..7e7c174e 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -410,45 +410,91 @@ paths: - Index /objects: get: - description: XXX comments + description: "The filtering functionality was primarily driven by the requirement\ + \ that a\nuser be able to get all objects having an authz resource matching\ + \ a\nuser-supplied pattern at any index in the \"_resource_paths\" array.\n\ + \nFor example, given the following metadata objects:\n\n {\n \"\ + 0\": {\n \"message\": \"hello\",\n \"_uploader_id\"\ + : 
\"100\",\n \"_resource_paths\": [\n \"/programs/a\"\ + ,\n \"/programs/b\"\n ],\n \"pet\": \"\ + dog\"\n },\n \"1\": {\n \"message\": \"greetings\"\ + ,\n \"_uploader_id\": \"101\",\n \"_resource_paths\"\ + : [\n \"/open\",\n \"/programs/c/projects/a\"\ + \n ],\n \"pet\": \"ferret\",\n \"sport\"\ + : \"soccer\"\n },\n \"2\": {\n \"message\": \"morning\"\ + ,\n \"_uploader_id\": \"102\",\n \"_resource_paths\"\ + : [\n \"/programs/d\",\n \"/programs/e\"\n \ + \ ],\n \"counts\": [42, 42, 42],\n \"pet\"\ + : \"ferret\",\n \"sport\": \"soccer\"\n },\n \"3\"\ + : {\n \"message\": \"evening\",\n \"_uploader_id\":\ + \ \"103\",\n \"_resource_paths\": [\n \"/programs/f/projects/a\"\ + ,\n \"/admin\"\n ],\n \"counts\": [1,\ + \ 3, 5],\n \"pet\": \"ferret\",\n \"sport\": \"basketball\"\ + \n }\n }\n\nhow do we design a filtering interface that allows the\ + \ user to get all\nobjects having an authz string matching the pattern\n\"\ + /programs/%/projects/%\" at any index in its \"_resource_paths\" array? 
(%\n\ + has been used as the wildcard so far because that's what Postgres uses as\n\ + the wildcard for LIKE) In this case, the \"1\" and \"3\" objects should be\n\ + returned.\n\nThe filter syntax that was arrived at ending up following the\ + \ syntax\nspecified by a [Node JS implementation](https://www.npmjs.com/package/json-api#filtering)\ + \ of the [JSON:API\nspecification](https://jsonapi.org/).\n\nThe format for\ + \ this syntax is filter=(field_name,operator,value), in which\nthe field_name\ + \ is a json key without quotes, operator is one of :eq, :ne,\n:gt, :gte, :lt,\ + \ :lte, :like, :all, :any (see operators dict), and value is\na typed json\ + \ value against which the operator is run.\n\nExamples:\n\n GET /objects?filter=(message,:eq,\"\ + morning\") returns \"2\"\n GET /objects?filter=(counts.1,:eq,3) returns\ + \ \"3\"\n\nCompound expressions are supported:\n\n GET /objects?filter=(_resource_paths,:any,(,:like,\"\ + /programs/%/projects/%\")) returns \"1\" and \"3\"\n GET /objects?filter=(counts,:all,(,:eq,42))\ + \ returns \"2\"\n\nBoolean expressions are also supported:\n\n GET /objects?filter=(or,(_uploader_id,:eq,\"\ + 101\"),(_uploader_id,:eq,\"102\")) returns \"1\" and \"2\"\n GET /objects?filter=(or,(and,(pet,:eq,\"\ + ferret\"),(sport,:eq,\"soccer\")),(message,:eq,\"hello\")) returns \"0\",\ + \ \"1\", and \"2\"" operationId: get_objects_objects_get parameters: - - description: Switch to returning a list of GUIDs (false), or GUIDs mapping - to their metadata (true). + - description: Switch to returning a list of GUIDs (false), or metadata objects + (true). in: query name: data required: false schema: - default: false - description: Switch to returning a list of GUIDs (false), or GUIDs mapping - to their metadata (true). + default: true + description: Switch to returning a list of GUIDs (false), or metadata objects + (true). 
title: Data type: boolean - - description: paginate + - description: The offset for what objects are returned (zero-indexed). The + exact offset will be equal to page*limit (e.g. with page=1, limit=15, 15 + objects beginning at index 15 will be returned). in: query name: page required: false schema: default: 0 - description: paginate + description: The offset for what objects are returned (zero-indexed). The + exact offset will be equal to page*limit (e.g. with page=1, limit=15, + 15 objects beginning at index 15 will be returned). title: Page type: integer - - description: 'Maximum number of records returned. (max: 2000)' + - description: 'Maximum number of records returned (max: 2000). Also used with + page to determine page size.' in: query name: limit required: false schema: default: 10 - description: 'Maximum number of records returned. (max: 2000)' + description: 'Maximum number of records returned (max: 2000). Also used + with page to determine page size.' title: Limit type: integer - - description: Filters to apply. + - description: The filter(s) that will be applied to the result (more detail + in the docstring). in: query name: filter required: false schema: default: '' - description: Filters to apply. + description: The filter(s) that will be applied to the result (more detail + in the docstring). title: Filter type: string responses: From 167d39a264f7dd104220c051870cdb2ca82c0002 Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 16 Mar 2021 09:01:05 -0700 Subject: [PATCH 27/39] feat(GET /objects): change limit from 2000 to 1024 --- src/mds/objects.py | 2 +- src/mds/query.py | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index 36e93e68..c83712e5 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -243,7 +243,7 @@ async def get_objects( ), limit: int = Query( 10, - description="Maximum number of records returned (max: 2000). 
" + description="Maximum number of objects returned (max: 1024). " "Also used with page to determine page size.", ), filter: str = Query( diff --git a/src/mds/query.py b/src/mds/query.py index dcdd20cb..441aa5dc 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -113,7 +113,7 @@ async def advanced_search_metadata( The exact offset will be equal to page*limit (e.g. with page=1, limit=15, 15 objects beginning at index 15 will be returned). - limit (int): Maximum number of records returned (max: 2000). Also used + limit (int): Maximum number of objects returned (max: 1024). Also used with page to determine page size. filter (str): The filter(s) that will be applied to the result. @@ -122,8 +122,7 @@ async def advanced_search_metadata( if data=True, a list of (guid, metadata object) tuples if data=False, a list of guids """ - # XXX change to 1024 - limit = min(limit, 2000) + limit = min(limit, 1024) def add_filter(target_key, operator_name, right_operand): """ @@ -305,7 +304,6 @@ def parse_filter(filter_string): specific_filter = scalar_filter / compound_filter / boolean_filter scalar_filter = open json_key separator scalar_operator separator json_value close compound_filter = open json_key separator compound_operator separator scalar_filter close - json_value_or_scalar_filter = json_value / scalar_filter json_key = ~"[A-Z 0-9_.]*"i scalar_operator = ":eq" / ":ne" / ":gte" / ":gt" / ":lte" / ":lt" / ":like" @@ -383,10 +381,6 @@ def visit_compound_filter(self, node, visited_children): "val": scalar_filter, } - # XXX no longer used, remove - def visit_json_value_or_scalar_filter(self, node, visited_children): - return visited_children[0] - def visit_json_key(self, node, visited_children): return node.text From 60b9be54691255eeb1dbda8b9009cea29b2e88c4 Mon Sep 17 00:00:00 2001 From: johnfrancismccann Date: Tue, 16 Mar 2021 16:02:42 +0000 Subject: [PATCH 28/39] Apply automatic documentation changes --- docs/openapi.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 7e7c174e..2264985b 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -475,14 +475,14 @@ paths: 15 objects beginning at index 15 will be returned). title: Page type: integer - - description: 'Maximum number of records returned (max: 2000). Also used with + - description: 'Maximum number of objects returned (max: 1024). Also used with page to determine page size.' in: query name: limit required: false schema: default: 10 - description: 'Maximum number of records returned (max: 2000). Also used + description: 'Maximum number of objects returned (max: 1024). Also used with page to determine page size.' title: Limit type: integer From 4ff094d513500dcd264f3145112cb085d92251c7 Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 16 Mar 2021 13:28:06 -0700 Subject: [PATCH 29/39] docs(GET /objects): add docstrings to tests --- src/mds/query.py | 8 ++--- tests/test_objects.py | 79 ++++++++++++++++++++++++++++++------------- 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index 441aa5dc..5a6e5f0b 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -190,8 +190,8 @@ def get_any_sql_clause(target_json_key, scalar_operator_name, scalar_right_opera scalar_operator_name (str): the name of the operator (leading colon included) (see operators dict for a list of valid operators) - scalar_right_operand (str, bool, int, float, dict): the value that - each element in the target_json_key array is compared against + scalar_right_operand (str, bool, int, float): the value that each + element in the target_json_key array is compared against Returns: SQLAlchemy clause corresponding to the :any operation. 
@@ -233,8 +233,8 @@ def get_all_sql_clause(target_json_key, scalar_operator_name, scalar_right_opera scalar_operator_name (str): the name of the operator (leading colon included) (see operators dict for a list of valid operators) - scalar_right_operand (str, bool, int, float, dict): the value that - each element in the target_json_key array is compared against + scalar_right_operand (str, bool, int, float): the value that each + element in the target_json_key array is compared against Returns: SQLAlchemy clause corresponding to the :all operation. diff --git a/tests/test_objects.py b/tests/test_objects.py index 302aa579..893c5c12 100644 --- a/tests/test_objects.py +++ b/tests/test_objects.py @@ -1022,7 +1022,10 @@ def test_get_object_latest_when_indexd_returns_404_and_guid_not_in_mds( assert resp.status_code == 404, resp.text -def setup_metadata_objects(client_fixture): +def set_up_metadata_objects(client_fixture): + """ + Create metadata objects from tests/sample_objects.json file. + """ with open("tests/sample_objects.json") as json_file: data = json.load(json_file) @@ -1030,7 +1033,11 @@ def setup_metadata_objects(client_fixture): client_fixture.post("/metadata/" + guid, json=mds_object).raise_for_status() -def teardown_metadata_objects(client_fixture): +def tear_down_metadata_objects(client_fixture): + """ + Delete metadata objects that were created from tests/sample_objects.json + file. + """ with open("tests/sample_objects.json") as json_file: data = json.load(json_file) @@ -1039,9 +1046,12 @@ def teardown_metadata_objects(client_fixture): def test_get_objects_filter_by_message_equals(client): - + """ + Test that the GET /objects endpoint filters correctly when a message-equal + filter is applied. 
+ """ try: - setup_metadata_objects(client) + set_up_metadata_objects(client) resp = client.get('/objects?data=true&filter=(message,:eq,"morning")') resp_json = resp.json() @@ -1050,13 +1060,16 @@ def test_get_objects_filter_by_message_equals(client): assert len(resp_json["items"]) == 1 assert resp_json["items"][0]["metadata"]["message"] == "morning" finally: - teardown_metadata_objects(client) + tear_down_metadata_objects(client) def test_get_objects_filter_by_counts_index_equals(client): - + """ + Test that the GET /objects endpoint filters correctly when an array + index-equal filter is applied + """ try: - setup_metadata_objects(client) + set_up_metadata_objects(client) resp = client.get("/objects?data=true&filter=(counts.1,:eq,3)") resp_json = resp.json() @@ -1065,13 +1078,16 @@ def test_get_objects_filter_by_counts_index_equals(client): assert len(resp_json["items"]) == 1 assert resp_json["items"][0]["metadata"]["counts"][1] == 3 finally: - teardown_metadata_objects(client) + tear_down_metadata_objects(client) def test_get_objects_filter_by_any_and_like_with_resource_paths(client): - + """ + Test that the GET /objects endpoint filters correctly when an :any filter is + combined with a :like filter. + """ try: - setup_metadata_objects(client) + set_up_metadata_objects(client) resp = client.get( '/objects?data=true&filter=(_resource_paths,:any,(,:like,"/programs/%/projects/%"))' @@ -1084,13 +1100,16 @@ def test_get_objects_filter_by_any_and_like_with_resource_paths(client): assert resp_json["items"][0]["metadata"]["_uploader_id"] == "101" assert resp_json["items"][1]["metadata"]["_uploader_id"] == "103" finally: - teardown_metadata_objects(client) + tear_down_metadata_objects(client) def test_get_objects_filter_by_counts_all_equal_42(client): - + """ + Test that the GET /objects endpoint filters correctly when an :all filter + is combined with an :eq filter. 
+ """ try: - setup_metadata_objects(client) + set_up_metadata_objects(client) resp = client.get("/objects?data=true&filter=(counts,:all,(,:eq,42))") resp_json = resp.json() @@ -1099,13 +1118,16 @@ def test_get_objects_filter_by_counts_all_equal_42(client): assert len(resp_json["items"]) == 1 assert resp_json["items"][0]["metadata"]["_uploader_id"] == "102" finally: - teardown_metadata_objects(client) + tear_down_metadata_objects(client) def test_get_objects_filter_by_or_two_different_uploader_ids(client): - + """ + Test that the GET /objects endpoint filters correctly when two :eq filters + are combined using an :or filter. + """ try: - setup_metadata_objects(client) + set_up_metadata_objects(client) resp = client.get( '/objects?data=true&filter=(or,(_uploader_id,:eq,"101"),(_uploader_id,:eq,"102"))' @@ -1118,13 +1140,16 @@ def test_get_objects_filter_by_or_two_different_uploader_ids(client): assert resp_json["items"][0]["metadata"]["_uploader_id"] == "101" assert resp_json["items"][1]["metadata"]["_uploader_id"] == "102" finally: - teardown_metadata_objects(client) + tear_down_metadata_objects(client) def test_get_objects_filter_by_compound_boolean_filter(client): - + """ + Test the filtering functionality of the GET /objects endpoint using a + compound boolean filter. + """ try: - setup_metadata_objects(client) + set_up_metadata_objects(client) resp = client.get( '/objects?data=true&filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello"))' @@ -1138,13 +1163,16 @@ def test_get_objects_filter_by_compound_boolean_filter(client): assert resp_json["items"][1]["metadata"]["_uploader_id"] == "101" assert resp_json["items"][2]["metadata"]["_uploader_id"] == "102" finally: - teardown_metadata_objects(client) + tear_down_metadata_objects(client) def test_get_objects_with_data_param_equal_false(client): - + """ + Test that a list of guids is returned from the GET /objects endpoint when + data param is false. 
+ """ try: - setup_metadata_objects(client) + set_up_metadata_objects(client) resp = client.get("/objects?data=false") resp_json = resp.json() @@ -1153,10 +1181,13 @@ def test_get_objects_with_data_param_equal_false(client): assert resp.status_code == 200 assert resp_json == ["0", "1", "2", "3"] finally: - teardown_metadata_objects(client) + tear_down_metadata_objects(client) def test_get_objects_raises_a_400_for_invalid_filter(client): - + """ + Test that a 400 is returned from the GET /objects endpoint when an invalid + filter is provided. + """ resp = client.get('/objects?data=true&filter=(message,"morning")') assert resp.status_code == 400 From 3e574f829b8c9377c9b4e04efddac7d5db29b0d7 Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 16 Mar 2021 15:39:07 -0700 Subject: [PATCH 30/39] test(GET /objects): add :lte and :gt tests --- tests/sample_objects.json | 6 +++++- tests/test_objects.py | 40 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/sample_objects.json b/tests/sample_objects.json index 0c3412a5..4e1a4730 100644 --- a/tests/sample_objects.json +++ b/tests/sample_objects.json @@ -6,7 +6,8 @@ "/programs/a", "/programs/b" ], - "pet": "dog" + "pet": "dog", + "pet_age": 1 }, "1": { "message": "greetings", @@ -16,6 +17,7 @@ "/programs/c/projects/a" ], "pet": "ferret", + "pet_age": 5, "sport": "soccer" }, "2": { @@ -27,6 +29,7 @@ ], "counts": [42, 42, 42], "pet": "ferret", + "pet_age": 10, "sport": "soccer" }, "3": { @@ -38,6 +41,7 @@ ], "counts": [1, 3, 5], "pet": "ferret", + "pet_age": 15, "sport": "basketball" } } diff --git a/tests/test_objects.py b/tests/test_objects.py index 893c5c12..9bf15bdb 100644 --- a/tests/test_objects.py +++ b/tests/test_objects.py @@ -1081,6 +1081,46 @@ def test_get_objects_filter_by_counts_index_equals(client): tear_down_metadata_objects(client) +def test_get_objects_filter_using_lte(client): + """ + Test that the GET /objects endpoint filters correctly for a less than 
or + equal operator. + """ + try: + set_up_metadata_objects(client) + + resp = client.get("/objects?data=true&filter=(pet_age,:lte,5)") + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["pet_age"]) + + assert resp.status_code == 200 + assert len(resp_json["items"]) == 2 + assert resp_json["items"][0]["metadata"]["pet_age"] == 1 + assert resp_json["items"][1]["metadata"]["pet_age"] == 5 + finally: + tear_down_metadata_objects(client) + + +def test_get_objects_filter_using_gt(client): + """ + Test that the GET /objects endpoint filters correctly for a greater than + operator. + """ + try: + set_up_metadata_objects(client) + + resp = client.get("/objects?data=true&filter=(pet_age,:gt,5)") + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["pet_age"]) + + assert resp.status_code == 200 + assert len(resp_json["items"]) == 2 + assert resp_json["items"][0]["metadata"]["pet_age"] == 10 + assert resp_json["items"][1]["metadata"]["pet_age"] == 15 + finally: + tear_down_metadata_objects(client) + + def test_get_objects_filter_by_any_and_like_with_resource_paths(client): """ Test that the GET /objects endpoint filters correctly when an :any filter is From 5d7debae8a9e6d500bd818f8ee3ef4c966a5d292 Mon Sep 17 00:00:00 2001 From: John McCann Date: Tue, 16 Mar 2021 17:16:17 -0700 Subject: [PATCH 31/39] feat(GET /objects): use search_metadata_objects --- src/mds/objects.py | 15 ++++++++++----- src/mds/query.py | 8 ++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index c83712e5..3873454e 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -22,7 +22,7 @@ from . 
import config, logger from .models import Metadata -from .query import get_metadata, advanced_search_metadata +from .query import get_metadata, search_metadata_objects mod = APIRouter() @@ -267,7 +267,8 @@ async def get_objects( "/programs/a", "/programs/b" ], - "pet": "dog" + "pet": "dog", + "pet_age": 1 }, "1": { "message": "greetings", @@ -277,6 +278,7 @@ async def get_objects( "/programs/c/projects/a" ], "pet": "ferret", + "pet_age": 5, "sport": "soccer" }, "2": { @@ -288,6 +290,7 @@ async def get_objects( ], "counts": [42, 42, 42], "pet": "ferret", + "pet_age": 10, "sport": "soccer" }, "3": { @@ -299,6 +302,7 @@ async def get_objects( ], "counts": [1, 3, 5], "pet": "ferret", + "pet_age": 15, "sport": "basketball" } } @@ -323,6 +327,8 @@ async def get_objects( GET /objects?filter=(message,:eq,"morning") returns "2" GET /objects?filter=(counts.1,:eq,3) returns "3" + GET /objects?filter=(pet_age,:lte,5) returns "0" and "1" + GET /objects?filter=(pet_age,:gt,5) returns "2" and "3" Compound expressions are supported: @@ -335,7 +341,7 @@ async def get_objects( GET /objects?filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello")) returns "0", "1", and "2" """ - metadata_objects = await advanced_search_metadata( + metadata_objects = await search_metadata_objects( data=data, page=page, limit=limit, filter=filter ) @@ -345,8 +351,7 @@ async def get_objects( endpoint_path = "/bulk/documents" full_endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path guids = list(guid for guid, _ in metadata_objects) - # XXX /bulk/documents endpoint in indexd currently doesn't support - # filters + response = await request.app.async_client.post(full_endpoint, json=guids) response.raise_for_status() records = {r["did"]: r for r in response.json()} diff --git a/src/mds/query.py b/src/mds/query.py index 5a6e5f0b..33f8385c 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -94,16 +94,16 @@ def add_filter(query): ] -async def 
advanced_search_metadata( +async def search_metadata_objects( data: bool = True, page: int = 0, limit: int = 10, filter: str = "", ): """ - Provides for more advanced searching of metadata based on filter param. - Please see get_objects function for more documentation of how filter param - works. + Intended as a helper for the get_objects function to query the db + based on the filter param. Please see get_objects function for more + documentation of filter param formatting. Args: data (bool): Switch to returning a list of GUIDs (false), or metadata From f688cd3c728590f4cde094cfbcf1a060f3994347 Mon Sep 17 00:00:00 2001 From: johnfrancismccann Date: Wed, 17 Mar 2021 00:18:09 +0000 Subject: [PATCH 32/39] Apply automatic documentation changes --- docs/openapi.yaml | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 2264985b..0e23058e 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -417,19 +417,21 @@ paths: 0\": {\n \"message\": \"hello\",\n \"_uploader_id\"\ : \"100\",\n \"_resource_paths\": [\n \"/programs/a\"\ ,\n \"/programs/b\"\n ],\n \"pet\": \"\ - dog\"\n },\n \"1\": {\n \"message\": \"greetings\"\ - ,\n \"_uploader_id\": \"101\",\n \"_resource_paths\"\ - : [\n \"/open\",\n \"/programs/c/projects/a\"\ - \n ],\n \"pet\": \"ferret\",\n \"sport\"\ - : \"soccer\"\n },\n \"2\": {\n \"message\": \"morning\"\ - ,\n \"_uploader_id\": \"102\",\n \"_resource_paths\"\ - : [\n \"/programs/d\",\n \"/programs/e\"\n \ - \ ],\n \"counts\": [42, 42, 42],\n \"pet\"\ - : \"ferret\",\n \"sport\": \"soccer\"\n },\n \"3\"\ - : {\n \"message\": \"evening\",\n \"_uploader_id\":\ - \ \"103\",\n \"_resource_paths\": [\n \"/programs/f/projects/a\"\ - ,\n \"/admin\"\n ],\n \"counts\": [1,\ - \ 3, 5],\n \"pet\": \"ferret\",\n \"sport\": \"basketball\"\ + dog\",\n \"pet_age\": 1\n },\n \"1\": {\n \ + \ \"message\": \"greetings\",\n \"_uploader_id\": \"101\",\n\ + \ \"_resource_paths\": [\n 
\"/open\",\n \ + \ \"/programs/c/projects/a\"\n ],\n \"pet\":\ + \ \"ferret\",\n \"pet_age\": 5,\n \"sport\": \"soccer\"\ + \n },\n \"2\": {\n \"message\": \"morning\",\n \ + \ \"_uploader_id\": \"102\",\n \"_resource_paths\": [\n\ + \ \"/programs/d\",\n \"/programs/e\"\n \ + \ ],\n \"counts\": [42, 42, 42],\n \"pet\": \"\ + ferret\",\n \"pet_age\": 10,\n \"sport\": \"soccer\"\ + \n },\n \"3\": {\n \"message\": \"evening\",\n \ + \ \"_uploader_id\": \"103\",\n \"_resource_paths\": [\n\ + \ \"/programs/f/projects/a\",\n \"/admin\"\n\ + \ ],\n \"counts\": [1, 3, 5],\n \"pet\":\ + \ \"ferret\",\n \"pet_age\": 15,\n \"sport\": \"basketball\"\ \n }\n }\n\nhow do we design a filtering interface that allows the\ \ user to get all\nobjects having an authz string matching the pattern\n\"\ /programs/%/projects/%\" at any index in its \"_resource_paths\" array? (%\n\ @@ -443,7 +445,9 @@ paths: \ :lte, :like, :all, :any (see operators dict), and value is\na typed json\ \ value against which the operator is run.\n\nExamples:\n\n GET /objects?filter=(message,:eq,\"\ morning\") returns \"2\"\n GET /objects?filter=(counts.1,:eq,3) returns\ - \ \"3\"\n\nCompound expressions are supported:\n\n GET /objects?filter=(_resource_paths,:any,(,:like,\"\ + \ \"3\"\n GET /objects?filter=(pet_age,:lte,5) returns \"0\" and \"1\"\n\ + \ GET /objects?filter=(pet_age,:gt,5) returns \"2\" and \"3\"\n\nCompound\ + \ expressions are supported:\n\n GET /objects?filter=(_resource_paths,:any,(,:like,\"\ /programs/%/projects/%\")) returns \"1\" and \"3\"\n GET /objects?filter=(counts,:all,(,:eq,42))\ \ returns \"2\"\n\nBoolean expressions are also supported:\n\n GET /objects?filter=(or,(_uploader_id,:eq,\"\ 101\"),(_uploader_id,:eq,\"102\")) returns \"1\" and \"2\"\n GET /objects?filter=(or,(and,(pet,:eq,\"\ From e35d9bfddc4b148cb05464c6b94a9c21866917fa Mon Sep 17 00:00:00 2001 From: John McCann Date: Thu, 1 Apr 2021 22:23:46 -0700 Subject: [PATCH 33/39] feat(GET /objects): compile filter grammar once 
--- src/mds/objects.py | 15 +++-- src/mds/param_parser.py | 108 ++++++++++++++++++++++++++++++++++ src/mds/query.py | 126 ++-------------------------------------- 3 files changed, 122 insertions(+), 127 deletions(-) create mode 100644 src/mds/param_parser.py diff --git a/src/mds/objects.py b/src/mds/objects.py index 6b2645a7..f7ae39d9 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -233,7 +233,7 @@ async def get_objects( request: Request, data: bool = Query( True, - description="Switch to returning a list of GUIDs (false), " + description="Switch to return a list of GUIDs (false), " "or metadata objects (true).", ), page: int = Query( @@ -255,6 +255,9 @@ async def get_objects( ), ) -> JSONResponse: """ + Returns a list of objects and their corresponding Indexd records (please + see URL query documentation for more info on which objects get returned). + The filtering functionality was primarily driven by the requirement that a user be able to get all objects having an authz resource matching a user-supplied pattern at any index in the "_resource_paths" array. 
@@ -351,10 +354,12 @@ async def get_objects( if data and metadata_objects: try: endpoint_path = "/bulk/documents" - full_endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path + full_indexd_url = ( + config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path + ) guids = list(guid for guid, _ in metadata_objects) - response = await request.app.async_client.post(full_endpoint, json=guids) + response = await request.app.async_client.post(full_indexd_url, json=guids) response.raise_for_status() records = {r["did"]: r for r in response.json()} except httpx.HTTPError as err: @@ -362,13 +367,13 @@ async def get_objects( if err.response: logger.error( "indexd `POST %s` endpoint returned a %s HTTP status code", - endpoint_path, + full_indexd_url, err.response.status_code, ) else: logger.error( "Unable to get a response from indexd `POST %s` endpoint", - endpoint_path, + full_indexd_url, ) if data: diff --git a/src/mds/param_parser.py b/src/mds/param_parser.py new file mode 100644 index 00000000..33172de1 --- /dev/null +++ b/src/mds/param_parser.py @@ -0,0 +1,108 @@ +import json + +from parsimonious.grammar import Grammar +from parsimonious.nodes import NodeVisitor + +# json_value and below was taken from +# https://github.com/erikrose/parsimonious/pull/23/files +filter_param_grammar = Grammar( + """ +filter = scalar_filter / compound_filter / boolean_filter +boolean_filter = open boolean boolean_operands close +boolean = "and" / "or" +boolean_operands = boolean_operand+ +boolean_operand = separator specific_filter +specific_filter = scalar_filter / compound_filter / boolean_filter +scalar_filter = open json_key separator scalar_operator separator json_value close +compound_filter = open json_key separator compound_operator separator scalar_filter close + +json_key = ~"[A-Z 0-9_.]*"i +scalar_operator = ":eq" / ":ne" / ":gte" / ":gt" / ":lte" / ":lt" / ":like" +compound_operator = ":all" / ":any" +open = "(" +close = ")" +separator = "," + +json_value = 
double_string / true_false_null / number + +double_string = ~'"([^"])*"' +true_false_null = "true" / "false" / "null" +number = (int frac exp) / (int exp) / (int frac) / int + +int = "-"? ((digit1to9 digits) / digit) +frac = "." digits +exp = e digits +digits = digit+ +e = "e+" / "e-" / "e" / "E+" / "E-" / "E" +digit1to9 = ~"[1-9]" +digit = ~"[0-9]" +""" +) + + +class FilterVisitor(NodeVisitor): + def visit_filter(self, node, visited_children): + return visited_children[0] + + def visit_boolean_filter(self, node, visited_children): + _, boolean, boolean_operands, _ = visited_children + return {boolean: boolean_operands} + + def visit_boolean(self, node, visited_children): + return node.text + + def visit_boolean_operand(self, node, visited_children): + _, specific_filter = visited_children + return specific_filter + + def visit_specific_filter(self, node, visited_children): + return visited_children[0] + + def visit_scalar_filter(self, node, visited_children): + ( + _, + json_key, + _, + operator, + _, + json_value, + _, + ) = visited_children + + return { + "name": json_key, + "op": operator, + "val": json_value, + } + + def visit_compound_filter(self, node, visited_children): + ( + _, + json_key, + _, + operator, + _, + scalar_filter, + _, + ) = visited_children + + return { + "name": json_key, + "op": operator, + "val": scalar_filter, + } + + def visit_json_key(self, node, visited_children): + return node.text + + def visit_scalar_operator(self, node, visited_children): + return node.text + + def visit_compound_operator(self, node, visited_children): + return node.text + + def visit_json_value(self, node, visited_children): + return json.loads(node.text) + + def generic_visit(self, node, visited_children): + return visited_children or node diff --git a/src/mds/query.py b/src/mds/query.py index 33f8385c..28a88400 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -7,11 +7,10 @@ from starlette.requests import Request from starlette.status import 
HTTP_400_BAD_REQUEST, HTTP_404_NOT_FOUND -from parsimonious.grammar import Grammar -from parsimonious.nodes import NodeVisitor from parsimonious.exceptions import IncompleteParseError, ParseError, VisitationError from .models import db, Metadata +from .param_parser import filter_param_grammar, FilterVisitor mod = APIRouter() @@ -277,125 +276,6 @@ def get_all_sql_clause(target_json_key, scalar_operator_name, scalar_right_opera ":any": {"sql_clause": get_any_sql_clause}, } - def parse_filter(filter_string): - """ - Parse filter_string and return an abstract syntax tree representation. - - Args: - filter_string (str): the filter string to parse - (e.g. '(_uploader_id,:eq,"57")' ) - - Returns: - A parsimonious abstract syntax tree representation of - filter_string. - """ - if not filter_string: - return - - # json_value and below was taken from - # https://github.com/erikrose/parsimonious/pull/23/files - grammar = Grammar( - """ - filter = scalar_filter / compound_filter / boolean_filter - boolean_filter = open boolean boolean_operands close - boolean = "and" / "or" - boolean_operands = boolean_operand+ - boolean_operand = separator specific_filter - specific_filter = scalar_filter / compound_filter / boolean_filter - scalar_filter = open json_key separator scalar_operator separator json_value close - compound_filter = open json_key separator compound_operator separator scalar_filter close - - json_key = ~"[A-Z 0-9_.]*"i - scalar_operator = ":eq" / ":ne" / ":gte" / ":gt" / ":lte" / ":lt" / ":like" - compound_operator = ":all" / ":any" - open = "(" - close = ")" - separator = "," - - json_value = double_string / true_false_null / number - - double_string = ~'"([^"])*"' - true_false_null = "true" / "false" / "null" - number = (int frac exp) / (int exp) / (int frac) / int - - int = "-"? ((digit1to9 digits) / digit) - frac = "." 
digits - exp = e digits - digits = digit+ - e = "e+" / "e-" / "e" / "E+" / "E-" / "E" - digit1to9 = ~"[1-9]" - digit = ~"[0-9]" - """ - ) - return grammar.parse(filter_string) - - class FilterVisitor(NodeVisitor): - def visit_filter(self, node, visited_children): - return visited_children[0] - - def visit_boolean_filter(self, node, visited_children): - _, boolean, boolean_operands, _ = visited_children - return {boolean: boolean_operands} - - def visit_boolean(self, node, visited_children): - return node.text - - def visit_boolean_operand(self, node, visited_children): - _, specific_filter = visited_children - return specific_filter - - def visit_specific_filter(self, node, visited_children): - return visited_children[0] - - def visit_scalar_filter(self, node, visited_children): - ( - _, - json_key, - _, - operator, - _, - json_value, - _, - ) = visited_children - - return { - "name": json_key, - "op": operator, - "val": json_value, - } - - def visit_compound_filter(self, node, visited_children): - ( - _, - json_key, - _, - operator, - _, - scalar_filter, - _, - ) = visited_children - - return { - "name": json_key, - "op": operator, - "val": scalar_filter, - } - - def visit_json_key(self, node, visited_children): - return node.text - - def visit_scalar_operator(self, node, visited_children): - return node.text - - def visit_compound_operator(self, node, visited_children): - return node.text - - def visit_json_value(self, node, visited_children): - return json.loads(node.text) - - def generic_visit(self, node, visited_children): - return visited_children or node - def get_sqlalchemy_clause(filter_dict): """ Return a SQLAlchemy WHERE clause representing filter_dict. 
@@ -458,7 +338,9 @@ def get_sqlalchemy_clause(filter_dict): return add_filter(filter_dict["name"], filter_dict["op"], filter_dict["val"]) try: - filter_tree = parse_filter(filter) + filter_tree = None + if filter: + filter_tree = filter_param_grammar.parse(filter) filter_dict = {} if filter_tree: filter_tree_visitor = FilterVisitor() From d313286830fca6d168c4440d8b32a1c462e8297f Mon Sep 17 00:00:00 2001 From: johnfrancismccann Date: Fri, 2 Apr 2021 05:26:31 +0000 Subject: [PATCH 34/39] Apply automatic documentation changes --- docs/openapi.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 4dcf11c5..86cebcc0 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -410,7 +410,9 @@ paths: - Index /objects: get: - description: "The filtering functionality was primarily driven by the requirement\ + description: "Returns a list of objects and their corresponding Indexd records\ + \ (please\nsee URL query documentation for more info on which objects get\ + \ returned).\n\nThe filtering functionality was primarily driven by the requirement\ \ that a\nuser be able to get all objects having an authz resource matching\ \ a\nuser-supplied pattern at any index in the \"_resource_paths\" array.\n\ \nFor example, given the following metadata objects:\n\n {\n \"\ @@ -455,14 +457,14 @@ paths: \ \"1\", and \"2\"" operationId: get_objects_objects_get parameters: - - description: Switch to returning a list of GUIDs (false), or metadata objects + - description: Switch to return a list of GUIDs (false), or metadata objects (true). in: query name: data required: false schema: default: true - description: Switch to returning a list of GUIDs (false), or metadata objects + description: Switch to return a list of GUIDs (false), or metadata objects (true). 
title: Data type: boolean From 23cb565aac4ee2ba521fcc24b295b892ecb64338 Mon Sep 17 00:00:00 2001 From: John McCann Date: Fri, 8 Oct 2021 13:41:58 -0700 Subject: [PATCH 35/39] docs(GET /objects): move dev info from swagger --- src/mds/objects.py | 56 ++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index f7ae39d9..30bcbde0 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -250,19 +250,18 @@ async def get_objects( ), filter: str = Query( "", - description="The filter(s) that will be applied to the " - "result (more detail in the docstring).", + description="The filter(s) that will be applied to the result. " + "The format is filter=(field_name,operator,value), in which " + "the field_name is a json key without quotes, operator is one of :eq, :ne, " + ":gt, :gte, :lt, :lte, :like, :all, :any, and value is " + "a typed json value against which the operator is run " + "(examples in the endpoint description).", ), ) -> JSONResponse: """ - Returns a list of objects and their corresponding Indexd records (please - see URL query documentation for more info on which objects get returned). + Returns a list of objects and their corresponding Indexd records. - The filtering functionality was primarily driven by the requirement that a - user be able to get all objects having an authz resource matching a - user-supplied pattern at any index in the "_resource_paths" array. - - For example, given the following metadata objects: + Given the following metadata objects: { "0": { @@ -312,25 +311,9 @@ async def get_objects( } } - how do we design a filtering interface that allows the user to get all - objects having an authz string matching the pattern - "/programs/%/projects/%" at any index in its "_resource_paths" array? 
(%
-    has been used as the wildcard so far because that's what Postgres uses as
-    the wildcard for LIKE) In this case, the "1" and "3" objects should be
-    returned.
-
-    The filter syntax that was arrived at ending up following the syntax
-    specified by a [Node JS implementation](https://www.npmjs.com/package/json-api#filtering) of the [JSON:API
-    specification](https://jsonapi.org/).
-
-    The format for this syntax is filter=(field_name,operator,value), in which
-    the field_name is a json key without quotes, operator is one of :eq, :ne,
-    :gt, :gte, :lt, :lte, :like, :all, :any (see operators dict), and value is
-    a typed json value against which the operator is run.
+    Example requests with filters:
 
-    Examples:
-
-    GET /objects?filter=(message,:eq,"morning") returns "2"
+    GET /objects?filter=(message,:eq,"morning") returns "2" (i.e. the MDS object and Indexd record identified by "2")
     GET /objects?filter=(counts.1,:eq,3) returns "3"
     GET /objects?filter=(pet_age,:lte,5) returns "0" and "1"
     GET /objects?filter=(pet_age,:gt,5) returns "2" and "3"
@@ -346,6 +329,25 @@ async def get_objects(
     GET /objects?filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello"))
     returns "0", "1", and "2"
     """
+    # ADDITIONAL BACKGROUND
+    #
+    # The filtering functionality described above was primarily driven by the
+    # requirement that a user be able to get all objects having an authz
+    # resource matching a user-supplied pattern at any index in
+    # the "_resource_paths" array.
+    #
+    # So looking at the example objects from above, how do we design a
+    # filtering interface that allows the user to get all objects having an
+    # authz string matching the pattern "/programs/%/projects/%" at any index
+    # in the "_resource_paths" array? (% has been used as the wildcard so far
+    # because that's what Postgres uses as the wildcard for LIKE) In this
+    # case, the "1" and "3" objects should be returned.
+ # + # The filter syntax that was arrived at ended up following the syntax + # specified by a [Node JS implementation] + # (https://www.npmjs.com/package/json-api#filtering) of the + # [JSON:API specification](https://jsonapi.org/). + metadata_objects = await search_metadata_objects( data=data, page=page, limit=limit, filter=filter ) From 8f91573cbfa49dff2c7dd12fb9038ae1319927ff Mon Sep 17 00:00:00 2001 From: John McCann Date: Fri, 8 Oct 2021 14:08:27 -0700 Subject: [PATCH 36/39] style(GET /objects): use get on records dict --- src/mds/objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/objects.py b/src/mds/objects.py index 30bcbde0..43cf4199 100644 --- a/src/mds/objects.py +++ b/src/mds/objects.py @@ -381,7 +381,7 @@ async def get_objects( if data: response = { "items": [ - {"record": records[guid] if guid in records else {}, "metadata": o} + {"record": records.get(guid, {}), "metadata": o} for guid, o in metadata_objects ] } From 7ed158eb0f7811eebdecb4cc688fd2f7835c8976 Mon Sep 17 00:00:00 2001 From: John McCann Date: Sun, 10 Oct 2021 16:55:37 -0700 Subject: [PATCH 37/39] test(GET /objects): use fixture to set up objects --- tests/conftest.py | 17 +++ tests/test_objects.py | 236 ++++++++++++++++-------------------------- 2 files changed, 109 insertions(+), 144 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 96c803c7..ca576a59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,6 +47,23 @@ def client(): yield client +@pytest.fixture() +def set_up_and_teardown_metadata_objects(client): + """ + Set up and teardown metadata objects based on tests/sample_objects.json + file + """ + with open("tests/sample_objects.json") as json_file: + data = json.load(json_file) + for guid, mds_object in data.items(): + client.post("/metadata/" + guid, json=mds_object).raise_for_status() + yield + with open("tests/sample_objects.json") as json_file: + data = json.load(json_file) + for guid in data.keys(): + 
client.delete("/metadata/" + guid).raise_for_status() + + @pytest.fixture( params=[ "dg.TEST/87fced8d-b9c8-44b5-946e-c465c8f8f3d6", diff --git a/tests/test_objects.py b/tests/test_objects.py index 34ae8e8f..e8e0bd1b 100644 --- a/tests/test_objects.py +++ b/tests/test_objects.py @@ -1022,209 +1022,157 @@ def test_get_object_latest_when_indexd_returns_404_and_guid_not_in_mds( assert resp.status_code == 404, resp.text -def set_up_metadata_objects(client_fixture): - """ - Create metadata objects from tests/sample_objects.json file. - """ - with open("tests/sample_objects.json") as json_file: - data = json.load(json_file) - - for guid, mds_object in data.items(): - client_fixture.post("/metadata/" + guid, json=mds_object).raise_for_status() - - -def tear_down_metadata_objects(client_fixture): - """ - Delete metadata objects that were created from tests/sample_objects.json - file. - """ - with open("tests/sample_objects.json") as json_file: - data = json.load(json_file) - - for guid in data.keys(): - client_fixture.delete("/metadata/" + guid).raise_for_status() - - -def test_get_objects_filter_by_message_equals(client): +def test_get_objects_filter_by_message_equals( + client, set_up_and_teardown_metadata_objects +): """ Test that the GET /objects endpoint filters correctly when a message-equal filter is applied. 
""" - try: - set_up_metadata_objects(client) - - resp = client.get('/objects?data=true&filter=(message,:eq,"morning")') - resp_json = resp.json() + resp = client.get('/objects?data=true&filter=(message,:eq,"morning")') + resp_json = resp.json() - assert resp.status_code == 200 - assert len(resp_json["items"]) == 1 - assert resp_json["items"][0]["metadata"]["message"] == "morning" - finally: - tear_down_metadata_objects(client) + assert resp.status_code == 200 + assert len(resp_json["items"]) == 1 + assert resp_json["items"][0]["metadata"]["message"] == "morning" -def test_get_objects_filter_by_counts_index_equals(client): +def test_get_objects_filter_by_counts_index_equals( + client, set_up_and_teardown_metadata_objects +): """ Test that the GET /objects endpoint filters correctly when an array index-equal filter is applied """ - try: - set_up_metadata_objects(client) + resp = client.get("/objects?data=true&filter=(counts.1,:eq,3)") + resp_json = resp.json() - resp = client.get("/objects?data=true&filter=(counts.1,:eq,3)") - resp_json = resp.json() + assert resp.status_code == 200 + assert len(resp_json["items"]) == 1 + assert resp_json["items"][0]["metadata"]["counts"][1] == 3 - assert resp.status_code == 200 - assert len(resp_json["items"]) == 1 - assert resp_json["items"][0]["metadata"]["counts"][1] == 3 - finally: - tear_down_metadata_objects(client) - -def test_get_objects_filter_using_lte(client): +def test_get_objects_filter_using_lte(client, set_up_and_teardown_metadata_objects): """ Test that the GET /objects endpoint filters correctly for a less than or equal operator. 
""" - try: - set_up_metadata_objects(client) - - resp = client.get("/objects?data=true&filter=(pet_age,:lte,5)") - resp_json = resp.json() - resp_json["items"].sort(key=lambda o: o["metadata"]["pet_age"]) + resp = client.get("/objects?data=true&filter=(pet_age,:lte,5)") + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["pet_age"]) - assert resp.status_code == 200 - assert len(resp_json["items"]) == 2 - assert resp_json["items"][0]["metadata"]["pet_age"] == 1 - assert resp_json["items"][1]["metadata"]["pet_age"] == 5 - finally: - tear_down_metadata_objects(client) + assert resp.status_code == 200 + assert len(resp_json["items"]) == 2 + assert resp_json["items"][0]["metadata"]["pet_age"] == 1 + assert resp_json["items"][1]["metadata"]["pet_age"] == 5 -def test_get_objects_filter_using_gt(client): +def test_get_objects_filter_using_gt(client, set_up_and_teardown_metadata_objects): """ Test that the GET /objects endpoint filters correctly for a greater than operator. 
""" - try: - set_up_metadata_objects(client) - - resp = client.get("/objects?data=true&filter=(pet_age,:gt,5)") - resp_json = resp.json() - resp_json["items"].sort(key=lambda o: o["metadata"]["pet_age"]) + resp = client.get("/objects?data=true&filter=(pet_age,:gt,5)") + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["pet_age"]) - assert resp.status_code == 200 - assert len(resp_json["items"]) == 2 - assert resp_json["items"][0]["metadata"]["pet_age"] == 10 - assert resp_json["items"][1]["metadata"]["pet_age"] == 15 - finally: - tear_down_metadata_objects(client) + assert resp.status_code == 200 + assert len(resp_json["items"]) == 2 + assert resp_json["items"][0]["metadata"]["pet_age"] == 10 + assert resp_json["items"][1]["metadata"]["pet_age"] == 15 -def test_get_objects_filter_by_any_and_like_with_resource_paths(client): +def test_get_objects_filter_by_any_and_like_with_resource_paths( + client, set_up_and_teardown_metadata_objects +): """ Test that the GET /objects endpoint filters correctly when an :any filter is combined with a :like filter. 
""" - try: - set_up_metadata_objects(client) - - resp = client.get( - '/objects?data=true&filter=(_resource_paths,:any,(,:like,"/programs/%/projects/%"))' - ) - resp_json = resp.json() - resp_json["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) - - assert resp.status_code == 200 - assert len(resp_json["items"]) == 2 - assert resp_json["items"][0]["metadata"]["_uploader_id"] == "101" - assert resp_json["items"][1]["metadata"]["_uploader_id"] == "103" - finally: - tear_down_metadata_objects(client) + resp = client.get( + '/objects?data=true&filter=(_resource_paths,:any,(,:like,"/programs/%/projects/%"))' + ) + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + assert resp.status_code == 200 + assert len(resp_json["items"]) == 2 + assert resp_json["items"][0]["metadata"]["_uploader_id"] == "101" + assert resp_json["items"][1]["metadata"]["_uploader_id"] == "103" -def test_get_objects_filter_by_counts_all_equal_42(client): + +def test_get_objects_filter_by_counts_all_equal_42( + client, set_up_and_teardown_metadata_objects +): """ Test that the GET /objects endpoint filters correctly when an :all filter is combined with an :eq filter. 
""" - try: - set_up_metadata_objects(client) - - resp = client.get("/objects?data=true&filter=(counts,:all,(,:eq,42))") - resp_json = resp.json() + resp = client.get("/objects?data=true&filter=(counts,:all,(,:eq,42))") + resp_json = resp.json() - assert resp.status_code == 200 - assert len(resp_json["items"]) == 1 - assert resp_json["items"][0]["metadata"]["_uploader_id"] == "102" - finally: - tear_down_metadata_objects(client) + assert resp.status_code == 200 + assert len(resp_json["items"]) == 1 + assert resp_json["items"][0]["metadata"]["_uploader_id"] == "102" -def test_get_objects_filter_by_or_two_different_uploader_ids(client): +def test_get_objects_filter_by_or_two_different_uploader_ids( + client, set_up_and_teardown_metadata_objects +): """ Test that the GET /objects endpoint filters correctly when two :eq filters are combined using an :or filter. """ - try: - set_up_metadata_objects(client) - - resp = client.get( - '/objects?data=true&filter=(or,(_uploader_id,:eq,"101"),(_uploader_id,:eq,"102"))' - ) - resp_json = resp.json() - resp_json["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) - - assert resp.status_code == 200 - assert len(resp_json["items"]) == 2 - assert resp_json["items"][0]["metadata"]["_uploader_id"] == "101" - assert resp_json["items"][1]["metadata"]["_uploader_id"] == "102" - finally: - tear_down_metadata_objects(client) + resp = client.get( + '/objects?data=true&filter=(or,(_uploader_id,:eq,"101"),(_uploader_id,:eq,"102"))' + ) + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + assert resp.status_code == 200 + assert len(resp_json["items"]) == 2 + assert resp_json["items"][0]["metadata"]["_uploader_id"] == "101" + assert resp_json["items"][1]["metadata"]["_uploader_id"] == "102" -def test_get_objects_filter_by_compound_boolean_filter(client): + +def test_get_objects_filter_by_compound_boolean_filter( + client, set_up_and_teardown_metadata_objects +): """ Test the filtering 
functionality of the GET /objects endpoint using a compound boolean filter. """ - try: - set_up_metadata_objects(client) - - resp = client.get( - '/objects?data=true&filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello"))' - ) - resp_json = resp.json() - resp_json["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) - - assert resp.status_code == 200 - assert len(resp_json["items"]) == 3 - assert resp_json["items"][0]["metadata"]["_uploader_id"] == "100" - assert resp_json["items"][1]["metadata"]["_uploader_id"] == "101" - assert resp_json["items"][2]["metadata"]["_uploader_id"] == "102" - finally: - tear_down_metadata_objects(client) + resp = client.get( + '/objects?data=true&filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello"))' + ) + resp_json = resp.json() + resp_json["items"].sort(key=lambda o: o["metadata"]["_uploader_id"]) + + assert resp.status_code == 200 + assert len(resp_json["items"]) == 3 + assert resp_json["items"][0]["metadata"]["_uploader_id"] == "100" + assert resp_json["items"][1]["metadata"]["_uploader_id"] == "101" + assert resp_json["items"][2]["metadata"]["_uploader_id"] == "102" -def test_get_objects_with_data_param_equal_false(client): +def test_get_objects_with_data_param_equal_false( + client, set_up_and_teardown_metadata_objects +): """ Test that a list of guids is returned from the GET /objects endpoint when data param is false. 
""" - try: - set_up_metadata_objects(client) + resp = client.get("/objects?data=false") + resp_json = resp.json() + resp_json.sort() - resp = client.get("/objects?data=false") - resp_json = resp.json() - resp_json.sort() + assert resp.status_code == 200 + assert resp_json == ["0", "1", "2", "3"] - assert resp.status_code == 200 - assert resp_json == ["0", "1", "2", "3"] - finally: - tear_down_metadata_objects(client) - -def test_get_objects_raises_a_400_for_invalid_filter(client): +def test_get_objects_raises_a_400_for_invalid_filter( + client, set_up_and_teardown_metadata_objects +): """ Test that a 400 is returned from the GET /objects endpoint when an invalid filter is provided. @@ -1232,7 +1180,7 @@ def test_get_objects_raises_a_400_for_invalid_filter(client): resp = client.get('/objects?data=true&filter=(message,"morning")') assert resp.status_code == 400 - + @respx.mock def test_delete_object_when_fence_returns_204(client, valid_upload_file_patcher): """ From 863693072d804c8e7b09bed58bf2b93aee96c93c Mon Sep 17 00:00:00 2001 From: John McCann Date: Sun, 10 Oct 2021 18:46:31 -0700 Subject: [PATCH 38/39] feat(GET /objects): add detail in filter error res --- src/mds/query.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/mds/query.py b/src/mds/query.py index 28a88400..23c10cf2 100644 --- a/src/mds/query.py +++ b/src/mds/query.py @@ -9,6 +9,7 @@ from parsimonious.exceptions import IncompleteParseError, ParseError, VisitationError +from . 
import logger from .models import db, Metadata from .param_parser import filter_param_grammar, FilterVisitor @@ -345,9 +346,17 @@ def get_sqlalchemy_clause(filter_dict): if filter_tree: filter_tree_visitor = FilterVisitor() filter_dict = filter_tree_visitor.visit(filter_tree) - except (IncompleteParseError, ParseError, VisitationError): + except (IncompleteParseError, ParseError) as e: + logger.debug(e, exc_info=True) raise HTTPException( - HTTP_400_BAD_REQUEST, f"filter URL query param syntax is invalid" + HTTP_400_BAD_REQUEST, + f"Please check syntax for the provided `filter` URL query param, which could not be correctly parsed from index {e.pos} (zero-indexed) onwards.", + ) + except VisitationError as e: + logger.debug(e, exc_info=True) + raise HTTPException( + HTTP_400_BAD_REQUEST, + "Please check syntax for the provided `filter` URL query param, which could not be correctly parsed.", ) if data: From 82cb87e5cebaed08b31f634c45e2fa372391aa4a Mon Sep 17 00:00:00 2001 From: johnfrancismccann Date: Mon, 11 Oct 2021 03:28:20 +0000 Subject: [PATCH 39/39] Apply automatic documentation changes --- docs/openapi.yaml | 82 +++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 45 deletions(-) diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 344f07a7..c010dfd0 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -413,47 +413,33 @@ paths: - Index /objects: get: - description: "Returns a list of objects and their corresponding Indexd records\ - \ (please\nsee URL query documentation for more info on which objects get\ - \ returned).\n\nThe filtering functionality was primarily driven by the requirement\ - \ that a\nuser be able to get all objects having an authz resource matching\ - \ a\nuser-supplied pattern at any index in the \"_resource_paths\" array.\n\ - \nFor example, given the following metadata objects:\n\n {\n \"\ - 0\": {\n \"message\": \"hello\",\n \"_uploader_id\"\ - : \"100\",\n \"_resource_paths\": [\n \"/programs/a\"\ 
- ,\n \"/programs/b\"\n ],\n \"pet\": \"\ - dog\",\n \"pet_age\": 1\n },\n \"1\": {\n \ - \ \"message\": \"greetings\",\n \"_uploader_id\": \"101\",\n\ - \ \"_resource_paths\": [\n \"/open\",\n \ - \ \"/programs/c/projects/a\"\n ],\n \"pet\":\ - \ \"ferret\",\n \"pet_age\": 5,\n \"sport\": \"soccer\"\ - \n },\n \"2\": {\n \"message\": \"morning\",\n \ - \ \"_uploader_id\": \"102\",\n \"_resource_paths\": [\n\ - \ \"/programs/d\",\n \"/programs/e\"\n \ - \ ],\n \"counts\": [42, 42, 42],\n \"pet\": \"\ - ferret\",\n \"pet_age\": 10,\n \"sport\": \"soccer\"\ - \n },\n \"3\": {\n \"message\": \"evening\",\n \ - \ \"_uploader_id\": \"103\",\n \"_resource_paths\": [\n\ - \ \"/programs/f/projects/a\",\n \"/admin\"\n\ - \ ],\n \"counts\": [1, 3, 5],\n \"pet\":\ - \ \"ferret\",\n \"pet_age\": 15,\n \"sport\": \"basketball\"\ - \n }\n }\n\nhow do we design a filtering interface that allows the\ - \ user to get all\nobjects having an authz string matching the pattern\n\"\ - /programs/%/projects/%\" at any index in its \"_resource_paths\" array? 
(%\n\ - has been used as the wildcard so far because that's what Postgres uses as\n\ - the wildcard for LIKE) In this case, the \"1\" and \"3\" objects should be\n\ - returned.\n\nThe filter syntax that was arrived at ending up following the\ - \ syntax\nspecified by a [Node JS implementation](https://www.npmjs.com/package/json-api#filtering)\ - \ of the [JSON:API\nspecification](https://jsonapi.org/).\n\nThe format for\ - \ this syntax is filter=(field_name,operator,value), in which\nthe field_name\ - \ is a json key without quotes, operator is one of :eq, :ne,\n:gt, :gte, :lt,\ - \ :lte, :like, :all, :any (see operators dict), and value is\na typed json\ - \ value against which the operator is run.\n\nExamples:\n\n GET /objects?filter=(message,:eq,\"\ - morning\") returns \"2\"\n GET /objects?filter=(counts.1,:eq,3) returns\ - \ \"3\"\n GET /objects?filter=(pet_age,:lte,5) returns \"0\" and \"1\"\n\ - \ GET /objects?filter=(pet_age,:gt,5) returns \"2\" and \"3\"\n\nCompound\ - \ expressions are supported:\n\n GET /objects?filter=(_resource_paths,:any,(,:like,\"\ - /programs/%/projects/%\")) returns \"1\" and \"3\"\n GET /objects?filter=(counts,:all,(,:eq,42))\ + description: "Returns a list of objects and their corresponding Indexd records.\n\ + \nGiven the following metadata objects:\n\n {\n \"0\": {\n \ + \ \"message\": \"hello\",\n \"_uploader_id\": \"100\",\n\ + \ \"_resource_paths\": [\n \"/programs/a\",\n \ + \ \"/programs/b\"\n ],\n \"pet\": \"dog\"\ + ,\n \"pet_age\": 1\n },\n \"1\": {\n \"\ + message\": \"greetings\",\n \"_uploader_id\": \"101\",\n \ + \ \"_resource_paths\": [\n \"/open\",\n \ + \ \"/programs/c/projects/a\"\n ],\n \"pet\": \"ferret\"\ + ,\n \"pet_age\": 5,\n \"sport\": \"soccer\"\n \ + \ },\n \"2\": {\n \"message\": \"morning\",\n \ + \ \"_uploader_id\": \"102\",\n \"_resource_paths\": [\n \ + \ \"/programs/d\",\n \"/programs/e\"\n \ + \ ],\n \"counts\": [42, 42, 42],\n \"pet\": \"ferret\"\ + ,\n \"pet_age\": 10,\n \"sport\": 
\"soccer\"\n \ + \ },\n \"3\": {\n \"message\": \"evening\",\n \ + \ \"_uploader_id\": \"103\",\n \"_resource_paths\": [\n \ + \ \"/programs/f/projects/a\",\n \"/admin\"\n \ + \ ],\n \"counts\": [1, 3, 5],\n \"pet\": \"\ + ferret\",\n \"pet_age\": 15,\n \"sport\": \"basketball\"\ + \n }\n }\n\nExample requests with filters:\n\n GET /objects?filter=(message,:eq,\"\ + morning\") returns \"2\" (i.e. the MDS object and Indexd record identified\ + \ by \"2\")\n GET /objects?filter=(counts.1,:eq,3) returns \"3\"\n GET\ + \ /objects?filter=(pet_age,:lte,5) returns \"0\" and \"1\"\n GET /objects?filter=(pet_age,:gt,5)\ + \ returns \"2\" and \"3\"\n\nCompound expressions are supported:\n\n GET\ + \ /objects?filter=(_resource_paths,:any,(,:like,\"/programs/%/projects/%\"\ + )) returns \"1\" and \"3\"\n GET /objects?filter=(counts,:all,(,:eq,42))\ \ returns \"2\"\n\nBoolean expressions are also supported:\n\n GET /objects?filter=(or,(_uploader_id,:eq,\"\ 101\"),(_uploader_id,:eq,\"102\")) returns \"1\" and \"2\"\n GET /objects?filter=(or,(and,(pet,:eq,\"\ ferret\"),(sport,:eq,\"soccer\")),(message,:eq,\"hello\")) returns \"0\",\ @@ -495,15 +481,21 @@ paths: with page to determine page size.' title: Limit type: integer - - description: The filter(s) that will be applied to the result (more detail - in the docstring). + - description: The filter(s) that will be applied to the result. The format + is filter=(field_name,operator,value), in which the field_name is a json + key without quotes, operator is one of :eq, :ne, :gt, :gte, :lt, :lte, :like, + :all, :any, and value is a typed json value against which the operator is + run (examples in the endpoint description). in: query name: filter required: false schema: default: '' - description: The filter(s) that will be applied to the result (more detail - in the docstring). + description: The filter(s) that will be applied to the result. 
The format + is filter=(field_name,operator,value), in which the field_name is a json + key without quotes, operator is one of :eq, :ne, :gt, :gte, :lt, :lte, + :like, :all, :any, and value is a typed json value against which the operator + is run (examples in the endpoint description). title: Filter type: string responses: