uc-cdis · johnfrancismccann · Nov 19, 2020 · Dec 3, 2020 · Dec 8, 2020 · Dec 10, 2020
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
@@ -410,45 +410,97 @@ paths:
       - Index
   /objects:
     get:
-      description: XXX comments
+      description: "Returns a list of objects and their corresponding Indexd records\
+        \ (please\nsee URL query documentation for more info on which objects get\
+        \ returned).\n\nThe filtering functionality was primarily driven by the requirement\
+        \ that a\nuser be able to get all objects having an authz resource matching\
+        \ a\nuser-supplied pattern at any index in the \"_resource_paths\" array.\n\
+        \nFor example, given the following metadata objects:\n\n    {\n        \"\
+        0\": {\n            \"message\": \"hello\",\n            \"_uploader_id\"\
+        : \"100\",\n            \"_resource_paths\": [\n                \"/programs/a\"\
+        ,\n                \"/programs/b\"\n            ],\n            \"pet\": \"\
+        dog\",\n            \"pet_age\": 1\n        },\n        \"1\": {\n       \
+        \     \"message\": \"greetings\",\n            \"_uploader_id\": \"101\",\n\
+        \            \"_resource_paths\": [\n                \"/open\",\n        \
+        \        \"/programs/c/projects/a\"\n            ],\n            \"pet\":\
+        \ \"ferret\",\n            \"pet_age\": 5,\n            \"sport\": \"soccer\"\
+        \n        },\n        \"2\": {\n            \"message\": \"morning\",\n  \
+        \          \"_uploader_id\": \"102\",\n            \"_resource_paths\": [\n\
+        \                \"/programs/d\",\n                \"/programs/e\"\n     \
+        \       ],\n            \"counts\": [42, 42, 42],\n            \"pet\": \"\
+        ferret\",\n            \"pet_age\": 10,\n            \"sport\": \"soccer\"\
+        \n        },\n        \"3\": {\n            \"message\": \"evening\",\n  \
+        \          \"_uploader_id\": \"103\",\n            \"_resource_paths\": [\n\
+        \                \"/programs/f/projects/a\",\n                \"/admin\"\n\
+        \            ],\n            \"counts\": [1, 3, 5],\n            \"pet\":\
+        \ \"ferret\",\n            \"pet_age\": 15,\n            \"sport\": \"basketball\"\
+        \n        }\n    }\n\nhow do we design a filtering interface that allows the\
+        \ user to get all\nobjects having an authz string matching the pattern\n\"\
+        /programs/%/projects/%\" at any index in its \"_resource_paths\" array? (%\n\
+        has been used as the wildcard so far because that's what Postgres uses as\n\
+        the wildcard for LIKE.) In this case, the \"1\" and \"3\" objects should be\n\
+        returned.\n\nThe filter syntax that was arrived at ended up following the\
+        \ syntax\nspecified by a [Node JS implementation](https://www.npmjs.com/package/json-api#filtering)\
+        \ of the [JSON:API\nspecification](https://jsonapi.org/).\n\nThe format for\
+        \ this syntax is filter=(field_name,operator,value), in which\nthe field_name\
+        \ is a json key without quotes, operator is one of :eq, :ne,\n:gt, :gte, :lt,\
+        \ :lte, :like, :all, :any (see operators dict), and value is\na typed json\
+        \ value against which the operator is run.\n\nExamples:\n\n    GET /objects?filter=(message,:eq,\"\
+        morning\") returns \"2\"\n    GET /objects?filter=(counts.1,:eq,3) returns\
+        \ \"3\"\n    GET /objects?filter=(pet_age,:lte,5) returns \"0\" and \"1\"\n\
+        \    GET /objects?filter=(pet_age,:gt,5) returns \"2\" and \"3\"\n\nCompound\
+        \ expressions are supported:\n\n    GET /objects?filter=(_resource_paths,:any,(,:like,\"\
+        /programs/%/projects/%\")) returns \"1\" and \"3\"\n    GET /objects?filter=(counts,:all,(,:eq,42))\
+        \ returns \"2\"\n\nBoolean expressions are also supported:\n\n    GET /objects?filter=(or,(_uploader_id,:eq,\"\
+        101\"),(_uploader_id,:eq,\"102\")) returns \"1\" and \"2\"\n    GET /objects?filter=(or,(and,(pet,:eq,\"\
+        ferret\"),(sport,:eq,\"soccer\")),(message,:eq,\"hello\")) returns \"0\",\
+        \ \"1\", and \"2\""
       operationId: get_objects_objects_get
       parameters:
-      - description: Switch to returning a list of GUIDs (false), or GUIDs mapping
-          to their metadata (true).
+      - description: Switch to return a list of GUIDs (false), or metadata objects
+          (true).
         in: query
         name: data
         required: false
         schema:
-          default: false
-          description: Switch to returning a list of GUIDs (false), or GUIDs mapping
-            to their metadata (true).
+          default: true
+          description: Switch to return a list of GUIDs (false), or metadata objects
+            (true).
           title: Data
           type: boolean
-      - description: 'Maximum number of records returned. (max: 2000)'
+      - description: The offset for what objects are returned (zero-indexed). The
+          exact offset will be equal to page*limit (e.g. with page=1, limit=15, 15
+          objects beginning at index 15 will be returned).
         in: query
-        name: limit
+        name: page
         required: false
         schema:
-          default: 10
-          description: 'Maximum number of records returned. (max: 2000)'
-          title: Limit
+          default: 0
+          description: The offset for what objects are returned (zero-indexed). The
+            exact offset will be equal to page*limit (e.g. with page=1, limit=15,
+            15 objects beginning at index 15 will be returned).
+          title: Page
           type: integer
-      - description: Return results at this given offset.
+      - description: 'Maximum number of objects returned (max: 1024). Also used with
+          page to determine page size.'
         in: query
-        name: offset
+        name: limit
         required: false
         schema:
-          default: 0
-          description: Return results at this given offset.
-          title: Offset
+          default: 10
+          description: 'Maximum number of objects returned (max: 1024). Also used
+            with page to determine page size.'
+          title: Limit
           type: integer
-      - description: Filters to apply.
+      - description: The filter(s) that will be applied to the result (more detail
+          in the endpoint description).
         in: query
         name: filter
         required: false
         schema:
           default: ''
-          description: Filters to apply.
+          description: The filter(s) that will be applied to the result (more detail
+            in the endpoint description).
           title: Filter
           type: string
       responses:

diff --git a/src/mds/objects.py b/src/mds/objects.py
@@ -24,7 +24,7 @@
 
 from . import config, logger
 from .models import Metadata
-from .query import get_metadata, search_metadata_helper
+from .query import get_metadata, search_metadata_objects
 
 mod = APIRouter()
 
@@ -231,64 +231,157 @@ async def create_object_for_id(
 @mod.get("/objects")
 async def get_objects(
     request: Request,
-    #  XXX would be nice to be able to specify whether to return indexd records
-    #  (e.g GET objects?data=metadata would only return mds objects)
     data: bool = Query(
-        False,
-        description="Switch to returning a list of GUIDs (false), "
-        "or GUIDs mapping to their metadata (true).",
+        True,
+        description="Switch to return a list of GUIDs (false), "
+        "or metadata objects (true).",
+    ),
+    page: int = Query(
+        0,
+        description="The offset for what objects are returned "
+        "(zero-indexed). The exact offset will be equal to "
+        "page*limit (e.g. with page=1, limit=15, 15 objects "
+        "beginning at index 15 will be returned).",
     ),
     limit: int = Query(
-        10, description="Maximum number of records returned. (max: 2000)"
+        10,
+        description="Maximum number of objects returned (max: 1024). "
+        "Also used with page to determine page size.",
+    ),
+    filter: str = Query(
+        "",
+        description="The filter(s) that will be applied to the "
+        "result (more detail in the endpoint description).",
     ),
-    offset: int = Query(0, description="Return results at this given offset."),
-    #  XXX description
-    #  XXX how to name this python variable something other than filter but
-    #  still have client use "filter" as URL query param? (bc filter is already
-    #  built in to Python)
-    filter: str = Query("", description="Filters to apply."),
 ) -> JSONResponse:
     """
-    XXX comments
+    Returns a list of objects and their corresponding Indexd records (please
+    see URL query documentation for more info on which objects get returned).
+
+    The filtering functionality was primarily driven by the requirement that a
+    user be able to get all objects having an authz resource matching a
+    user-supplied pattern at any index in the "_resource_paths" array.
+
+    For example, given the following metadata objects:
+
+        {
+            "0": {
+                "message": "hello",
+                "_uploader_id": "100",
+                "_resource_paths": [
+                    "/programs/a",
+                    "/programs/b"
+                ],
+                "pet": "dog",
+                "pet_age": 1
+            },
+            "1": {
+                "message": "greetings",
+                "_uploader_id": "101",
+                "_resource_paths": [
+                    "/open",
+                    "/programs/c/projects/a"
+                ],
+                "pet": "ferret",
+                "pet_age": 5,
+                "sport": "soccer"
+            },
+            "2": {
+                "message": "morning",
+                "_uploader_id": "102",
+                "_resource_paths": [
+                    "/programs/d",
+                    "/programs/e"
+                ],
+                "counts": [42, 42, 42],
+                "pet": "ferret",
+                "pet_age": 10,
+                "sport": "soccer"
+            },
+            "3": {
+                "message": "evening",
+                "_uploader_id": "103",
+                "_resource_paths": [
+                    "/programs/f/projects/a",
+                    "/admin"
+                ],
+                "counts": [1, 3, 5],
+                "pet": "ferret",
+                "pet_age": 15,
+                "sport": "basketball"
+            }
+        }
+
+    how do we design a filtering interface that allows the user to get all
+    objects having an authz string matching the pattern
+    "/programs/%/projects/%" at any index in its "_resource_paths" array? (%
+    has been used as the wildcard so far because that's what Postgres uses as
+    the wildcard for LIKE.) In this case, the "1" and "3" objects should be
+    returned.
+
+    The filter syntax that was arrived at ended up following the syntax
+    specified by a [Node JS implementation](https://www.npmjs.com/package/json-api#filtering) of the [JSON:API
+    specification](https://jsonapi.org/).
+
+    The format for this syntax is filter=(field_name,operator,value), in which
+    the field_name is a json key without quotes, operator is one of :eq, :ne,
+    :gt, :gte, :lt, :lte, :like, :all, :any (see operators dict), and value is
+    a typed json value against which the operator is run.
+
+    Examples:
+
+        GET /objects?filter=(message,:eq,"morning") returns "2"
+        GET /objects?filter=(counts.1,:eq,3) returns "3"
+        GET /objects?filter=(pet_age,:lte,5) returns "0" and "1"
+        GET /objects?filter=(pet_age,:gt,5) returns "2" and "3"
+
+    Compound expressions are supported:
+
+        GET /objects?filter=(_resource_paths,:any,(,:like,"/programs/%/projects/%")) returns "1" and "3"
+        GET /objects?filter=(counts,:all,(,:eq,42)) returns "2"
+
+    Boolean expressions are also supported:
+
+        GET /objects?filter=(or,(_uploader_id,:eq,"101"),(_uploader_id,:eq,"102")) returns "1" and "2"
+        GET /objects?filter=(or,(and,(pet,:eq,"ferret"),(sport,:eq,"soccer")),(message,:eq,"hello")) returns "0", "1", and "2"
     """
 
-    metadata_objects = await search_metadata_helper(
-        data=data, limit=limit, offset=offset, filter=filter
+    metadata_objects = await search_metadata_objects(
+        data=data, page=page, limit=limit, filter=filter
     )
 
     records = {}
-    if metadata_objects:
+    if data and metadata_objects:
         try:
             endpoint_path = "/bulk/documents"
-            full_endpoint = config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path
-            guids = (
-                list(metadata_objects.keys())
-                if hasattr(metadata_objects, "keys")
-                else metadata_objects
+            full_indexd_url = (
+                config.INDEXING_SERVICE_ENDPOINT.rstrip("/") + endpoint_path
             )
-            #  XXX /bulk/documents endpoint in indexd currently doesn't support
-            #  filters
-            response = await request.app.async_client.post(full_endpoint, json=guids)
+            guids = list(guid for guid, _ in metadata_objects)
+
+            response = await request.app.async_client.post(full_indexd_url, json=guids)
             response.raise_for_status()
             records = {r["did"]: r for r in response.json()}
         except httpx.HTTPError as err:
             logger.debug(err, exc_info=True)
             if err.response:
                 logger.error(
                     "indexd `POST %s` endpoint returned a %s HTTP status code",
-                    endpoint_path,
+                    full_indexd_url,
                     err.response.status_code,
                 )
             else:
                 logger.error(
                     "Unable to get a response from indexd `POST %s` endpoint",
-                    endpoint_path,
+                    full_indexd_url,
                 )
 
-    if type(metadata_objects) is dict:
+    if data:
         response = {
-            guid: {"record": records[guid] if guid in records else {}, "metadata": o}
-            for guid, o in metadata_objects.items()
+            "items": [
+                {"record": records.get(guid, {}), "metadata": o}
+                for guid, o in metadata_objects
+            ]
         }
     else:
         response = metadata_objects