@@ -325,62 +325,137 @@ def test_decoupled_execute_cancel(self):
         self.assertIn("[execute_cancel] Request cancelled at ", log_text)
 
     def test_decoupled_bls_cancel(self):
-        model_name = "decoupled_bls_cancel"
+        model_names = ["decoupled_bls_cancel", "decoupled_bls_async_cancel"]
         input_value = 1
         max_sum_value = 10
+        ignore_cancel = False
         user_data = UserData()
+        for model_name in model_names:
+            with self._shm_leak_detector.Probe() as shm_probe:
+                with grpcclient.InferenceServerClient(
+                    f"{_tritonserver_ipaddr}:8001"
+                ) as client:
+                    client.start_stream(callback=partial(callback, user_data))
+                    input_data = np.array([input_value], dtype=np.int32)
+                    max_sum_data = np.array([max_sum_value], dtype=np.int32)
+                    ignore_cancel_data = np.array([ignore_cancel], dtype=np.bool_)
+                    inputs = [
+                        grpcclient.InferInput(
+                            "INPUT",
+                            input_data.shape,
+                            np_to_triton_dtype(input_data.dtype),
+                        ),
+                        grpcclient.InferInput(
+                            "MAX_SUM",
+                            max_sum_data.shape,
+                            np_to_triton_dtype(max_sum_data.dtype),
+                        ),
+                        grpcclient.InferInput(
+                            "IGNORE_CANCEL",
+                            ignore_cancel_data.shape,
+                            np_to_triton_dtype(ignore_cancel_data.dtype),
+                        ),
+                    ]
+                    inputs[0].set_data_from_numpy(input_data)
+                    inputs[1].set_data_from_numpy(max_sum_data)
+                    inputs[2].set_data_from_numpy(ignore_cancel_data)
+                    client.async_stream_infer(model_name, inputs)
+
+                # Check the results of the decoupled model using BLS
+                def check_result(result):
+                    # Make sure the result is not an exception
+                    self.assertIsNot(type(result), InferenceServerException)
+                    is_cancelled = result.as_numpy("IS_CANCELLED")
+                    self.assertTrue(
+                        is_cancelled[0],
+                        "error: expected the request to be cancelled",
+                    )
 
-        with self._shm_leak_detector.Probe() as shm_probe:
-            with grpcclient.InferenceServerClient(
-                f"{_tritonserver_ipaddr}:8001"
-            ) as client:
-                client.start_stream(callback=partial(callback, user_data))
-                input_data = np.array([input_value], dtype=np.int32)
-                max_sum_data = np.array([max_sum_value], dtype=np.int32)
-                inputs = [
-                    grpcclient.InferInput(
-                        "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype)
-                    ),
-                    grpcclient.InferInput(
-                        "MAX_SUM",
-                        max_sum_data.shape,
-                        np_to_triton_dtype(max_sum_data.dtype),
-                    ),
-                ]
-                inputs[0].set_data_from_numpy(input_data)
-                inputs[1].set_data_from_numpy(max_sum_data)
-                client.async_stream_infer(model_name, inputs)
+                    sum_data = result.as_numpy("SUM")
+                    self.assertIsNotNone(sum_data, "error: expected 'SUM'")
+                    self.assertTrue(
+                        np.array_equal(sum_data, max_sum_data),
+                        "error: expected output {} to match input {}".format(
+                            sum_data, max_sum_data
+                        ),
+                    )
 
-            # Check the results of the decoupled model using BLS
-            def check_result(result):
-                # Make sure the result is not an exception
-                self.assertIsNot(type(result), InferenceServerException)
+                result = user_data._completed_requests.get()
+                check_result(result)
 
-                sum_data = result.as_numpy("SUM")
-                self.assertIsNotNone(sum_data, "error: expected 'SUM'")
-                self.assertTrue(
-                    np.array_equal(sum_data, max_sum_data),
-                    "error: expected output {} to match input {}".format(
-                        sum_data, max_sum_data
-                    ),
-                )
+    def test_decoupled_bls_ignore_cancel(self):
+        model_names = ["decoupled_bls_cancel", "decoupled_bls_async_cancel"]
+        input_value = 1
+        max_sum_value = 10
+        ignore_cancel = True
+        user_data = UserData()
+        for model_name in model_names:
+            with self._shm_leak_detector.Probe() as shm_probe:
+                with grpcclient.InferenceServerClient(
+                    f"{_tritonserver_ipaddr}:8001"
+                ) as client:
+                    client.start_stream(callback=partial(callback, user_data))
+                    input_data = np.array([input_value], dtype=np.int32)
+                    max_sum_data = np.array([max_sum_value], dtype=np.int32)
+                    ignore_cancel_data = np.array([ignore_cancel], dtype=np.bool_)
+                    inputs = [
+                        grpcclient.InferInput(
+                            "INPUT",
+                            input_data.shape,
+                            np_to_triton_dtype(input_data.dtype),
+                        ),
+                        grpcclient.InferInput(
+                            "MAX_SUM",
+                            max_sum_data.shape,
+                            np_to_triton_dtype(max_sum_data.dtype),
+                        ),
+                        grpcclient.InferInput(
+                            "IGNORE_CANCEL",
+                            ignore_cancel_data.shape,
+                            np_to_triton_dtype(ignore_cancel_data.dtype),
+                        ),
+                    ]
+                    inputs[0].set_data_from_numpy(input_data)
+                    inputs[1].set_data_from_numpy(max_sum_data)
+                    inputs[2].set_data_from_numpy(ignore_cancel_data)
+                    client.async_stream_infer(model_name, inputs)
+
+                # Check the results of the decoupled model using BLS
+                def check_result(result):
+                    # Make sure the result is not an exception
+                    self.assertIsNot(type(result), InferenceServerException)
+                    is_cancelled = result.as_numpy("IS_CANCELLED")
+                    self.assertFalse(
+                        is_cancelled[0],
+                        "error: expected the request not being cancelled",
+                    )
 
-            result = user_data._completed_requests.get()
-            check_result(result)
+                    sum_data = result.as_numpy("SUM")
+                    self.assertIsNotNone(sum_data, "error: expected 'SUM'")
+                    self.assertTrue(
+                        sum_data > max_sum_data,
+                        "error: expected sum_data {} to be greater than max_sum_data {}".format(
+                            sum_data, max_sum_data
+                        ),
+                    )
+
+                result = user_data._completed_requests.get()
+                check_result(result)
 
-    def test_decoupled_bls_async_cancel(self):
-        model_name = "decoupled_bls_async_cancel"
+    def test_decoupled_bls_cancel_after_completion(self):
+        model_name = "decoupled_bls_cancel_after_complete"
         input_value = 1
         max_sum_value = 10
+        ignore_cancel = False
         user_data = UserData()
-
         with self._shm_leak_detector.Probe() as shm_probe:
             with grpcclient.InferenceServerClient(
                 f"{_tritonserver_ipaddr}:8001"
             ) as client:
                 client.start_stream(callback=partial(callback, user_data))
                 input_data = np.array([input_value], dtype=np.int32)
                 max_sum_data = np.array([max_sum_value], dtype=np.int32)
+                ignore_cancel_data = np.array([ignore_cancel], dtype=np.bool_)
                 inputs = [
                     grpcclient.InferInput(
                         "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype)
@@ -390,15 +465,25 @@ def test_decoupled_bls_async_cancel(self):
                         max_sum_data.shape,
                         np_to_triton_dtype(max_sum_data.dtype),
                     ),
+                    grpcclient.InferInput(
+                        "IGNORE_CANCEL",
+                        ignore_cancel_data.shape,
+                        np_to_triton_dtype(ignore_cancel_data.dtype),
+                    ),
                 ]
                 inputs[0].set_data_from_numpy(input_data)
                 inputs[1].set_data_from_numpy(max_sum_data)
+                inputs[2].set_data_from_numpy(ignore_cancel_data)
                 client.async_stream_infer(model_name, inputs)
 
             # Check the results of the decoupled model using BLS
             def check_result(result):
                 # Make sure the result is not an exception
                 self.assertIsNot(type(result), InferenceServerException)
+                is_cancelled = result.as_numpy("IS_CANCELLED")
+                self.assertTrue(
+                    is_cancelled[0], "error: expected the request to be cancelled"
+                )
 
                 sum_data = result.as_numpy("SUM")
                 self.assertIsNotNone(sum_data, "error: expected 'SUM'")
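Taken together, the two BLS cancel tests pin down the contract of the new IGNORE_CANCEL input: with IGNORE_CANCEL set to False the BLS-issued request is expected to be cancelled (IS_CANCELLED comes back true and SUM stops at MAX_SUM), while with it set to True cancellation is ignored (IS_CANCELLED is false and SUM runs past MAX_SUM). The sketch below is not part of the diff; it only restates that contract as a minimal standalone client, assuming a Triton server at localhost:8001 with the decoupled_bls_cancel model from this test suite loaded.

# Rough standalone sketch (not part of this PR) of the client-side contract the
# tests above exercise. Model name, tensor names, and dtypes come from the test;
# the server address and everything else here is illustrative.
from functools import partial
from queue import Queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

completed = Queue()

def callback(queue, result, error):
    # Stream callback: store either the error or the result for later inspection.
    queue.put(error if error is not None else result)

with grpcclient.InferenceServerClient("localhost:8001") as client:
    client.start_stream(callback=partial(callback, completed))
    input_data = np.array([1], dtype=np.int32)
    max_sum_data = np.array([10], dtype=np.int32)
    # False -> the BLS request should be cancelled; True -> cancellation ignored.
    ignore_cancel_data = np.array([False], dtype=np.bool_)
    inputs = []
    for name, data in (
        ("INPUT", input_data),
        ("MAX_SUM", max_sum_data),
        ("IGNORE_CANCEL", ignore_cancel_data),
    ):
        infer_input = grpcclient.InferInput(
            name, data.shape, np_to_triton_dtype(data.dtype)
        )
        infer_input.set_data_from_numpy(data)
        inputs.append(infer_input)
    client.async_stream_infer("decoupled_bls_cancel", inputs)

result = completed.get()
# With IGNORE_CANCEL=False the tests expect IS_CANCELLED to be true and SUM to
# stop at MAX_SUM; with True, IS_CANCELLED is false and SUM exceeds MAX_SUM.
print(result.as_numpy("IS_CANCELLED"), result.as_numpy("SUM"))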