feat: Add more metrics and error log for observability publisher (#598)

# Description  Add Kafka lag metrics and more error logs to make troubleshooting easier when issues arise with the observability publisher. # Modifications  * Add a scheduled daemon thread that calculates the lag of the Kafka consumer * Add more error logs on failure * Add a container port to the observability publisher deployment so it can be scraped by the Prometheus agent # Tests  # Checklist - [ ] Added PR label - [ ] Added unit, integration, and/or e2e tests - [ ] Tested locally - [ ] Updated documentation - [ ] Updated Swagger spec if the PR introduces API changes - [ ] Regenerated Golang and Python clients if the PR introduces API changes # Release Notes  ```release-note ```
caraml-dev · Jul 31, 2024 · 24d79eb · 24d79eb
1 parent 8c4930d
commit 24d79eb
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 17 deletions.
diff --git a/api/pkg/observability/deployment/deployment.go b/api/pkg/observability/deployment/deployment.go
@@ -316,7 +316,7 @@ func (c *deployer) applyDeployment(ctx context.Context, data *models.WorkerData,
 	deploymentV1 := appV1.Deployments(c.targetNamespace())
 
 	applyDeploymentFunc := func(data *models.WorkerData, secretName string, isExistingDeployment bool) (*appsv1.Deployment, error) {
-		deployment, err := c.createDeploymentSpec(ctx, data, secretName)
+		deployment, err := c.createDeploymentSpec(data, secretName)
 		if err != nil {
 			return nil, err
 		}
@@ -342,7 +342,7 @@ func (c *deployer) getLabels(data *models.WorkerData) map[string]string {
 	return labels
 }
 
-func (c *deployer) createDeploymentSpec(ctx context.Context, data *models.WorkerData, secretName string) (*appsv1.Deployment, error) {
+func (c *deployer) createDeploymentSpec(data *models.WorkerData, secretName string) (*appsv1.Deployment, error) {
 	labels := c.getLabels(data)
 
 	cfgVolName := "config-volume"
@@ -394,14 +394,21 @@ func (c *deployer) createDeploymentSpec(ctx context.Context, data *models.Worker
 									ReadOnly:  true,
 								},
 							},
+							Ports: []corev1.ContainerPort{
+								{
+									Name:          "prom-metric",
+									ContainerPort: 8000,
+									Protocol:      corev1.ProtocolTCP,
+								},
+							},
 						},
 					},
 					Volumes: []corev1.Volume{
 						{
 							Name: cfgVolName,
 							VolumeSource: corev1.VolumeSource{
 								Secret: &corev1.SecretVolumeSource{
-									SecretName: c.getSecretName(data),
+									SecretName: secretName,
 								},
 							},
 						},

diff --git a/api/pkg/observability/deployment/deployment_test.go b/api/pkg/observability/deployment/deployment_test.go
@@ -104,6 +104,13 @@ func createDeploymentSpec(data *models.WorkerData, resourceRequest corev1.Resour
 									ReadOnly:  true,
 								},
 							},
+							Ports: []corev1.ContainerPort{
+								{
+									Name:          "prom-metric",
+									ContainerPort: 8000,
+									Protocol:      corev1.ProtocolTCP,
+								},
+							},
 						},
 					},
 					Volumes: []corev1.Volume{

diff --git a/python/observation-publisher/publisher/__main__.py b/python/observation-publisher/publisher/__main__.py
@@ -2,7 +2,6 @@
 from merlin.observability.inference import InferenceSchema
 from omegaconf import OmegaConf
 from prometheus_client import start_http_server
-
 from publisher.config import PublisherConfig
 from publisher.metric import MetricWriter
 from publisher.observation_sink import new_observation_sink
@@ -17,7 +16,9 @@ def start_consumer(cfg: PublisherConfig) -> None:
 
     start_http_server(cfg.environment.prometheus_port)
     MetricWriter().setup(
-        model_id=cfg.environment.model_id, model_version=cfg.environment.model_version
+        model_id=cfg.environment.model_id,
+        model_version=cfg.environment.model_version,
+        merlin_project=cfg.environment.project,
     )
     prediction_log_consumer = new_consumer(cfg.environment.observation_source)
     inference_schema = InferenceSchema.from_dict(

diff --git a/python/observation-publisher/publisher/metric.py b/python/observation-publisher/publisher/metric.py
@@ -16,12 +16,18 @@ def __init__(self):
             self.last_processed_timestamp_gauge = Gauge(
                 "last_processed_timestamp",
                 "The timestamp of the last prediction log processed by the publisher",
-                ["model_id", "model_version"],
+                ["model_id", "model_version", "merlin_project"],
             )
             self.total_prediction_logs_processed_counter = Counter(
                 "total_prediction_logs_processed",
                 "The total number of prediction logs processed by the publisher",
-                ["model_id", "model_version"],
+                ["model_id", "model_version", "merlin_project"],
+            )
+
+            self.kafka_consumer_lag_gauge = Gauge(
+                "kafka_consumer_lag",
+                "The number of unprocess message in kafka",
+                ["model_id", "model_version", "merlin_project", "partition"],
             )
             self._initialized = True
 
@@ -31,7 +37,7 @@ def __new__(cls):
             cls._instance._initialized = False
         return cls._instance
 
-    def setup(self, model_id: str, model_version: str):
+    def setup(self, model_id: str, model_version: str, merlin_project: str):
         """
         Needs to be run before sending metrics, so that the singleton instance has the correct properties value.
         :param model_id:
@@ -40,6 +46,7 @@ def setup(self, model_id: str, model_version: str):
         """
         self.model_id = model_id
         self.model_version = model_version
+        self.merlin_project = merlin_project
 
     def update_last_processed_timestamp(self, last_processed_timestamp: Timestamp):
         """
@@ -48,7 +55,9 @@ def update_last_processed_timestamp(self, last_processed_timestamp: Timestamp):
         :return:
         """
         self.last_processed_timestamp_gauge.labels(
-            model_id=self.model_id, model_version=self.model_version
+            model_id=self.model_id,
+            model_version=self.model_version,
+            merlin_project=self.merlin_project,
         ).set(last_processed_timestamp.timestamp())
 
     def increment_total_prediction_logs_processed(self, value: int):
@@ -57,5 +66,21 @@ def increment_total_prediction_logs_processed(self, value: int):
         :return:
         """
         self.total_prediction_logs_processed_counter.labels(
-            model_id=self.model_id, model_version=self.model_version
+            model_id=self.model_id,
+            model_version=self.model_version,
+            merlin_project=self.merlin_project,
         ).inc(value)
+
+    def update_kafka_lag(self, total_lag: int, partition: int):
+        """
+        Update the kafka_consumer_lag gauge with the given value
+        :param total_lag:
+        :param partition:
+        :return:
+        """
+        self.kafka_consumer_lag_gauge.labels(
+            model_id=self.model_id,
+            model_version=self.model_version,
+            partition=partition,
+            merlin_project=self.merlin_project,
+        ).set(total_lag)
diff --git a/python/observation-publisher/publisher/prediction_log_consumer.py b/python/observation-publisher/publisher/prediction_log_consumer.py
@@ -1,4 +1,5 @@
 import abc
+import time
 from dataclasses import dataclass
 from datetime import datetime
 from threading import Thread
@@ -7,7 +8,7 @@
 import numpy as np
 import pandas as pd
 from caraml.upi.v1.prediction_log_pb2 import PredictionLog
-from confluent_kafka import Consumer, KafkaException
+from confluent_kafka import Consumer, KafkaException, TopicPartition
 from dataclasses_json import DataClassJsonMixin, dataclass_json
 from merlin.observability.inference import InferenceSchema, ObservationType
 from publisher.config import ObservationSource, ObservationSourceConfig
@@ -141,9 +142,54 @@ def __init__(
 
         self._consumer = Consumer(consumer_config)
         self._batch_size = config.batch_size
+        self._topic = config.topic
         self._consumer.subscribe([config.topic])
         self._poll_timeout = config.poll_timeout_seconds
 
+        background_job_thread = Thread(target=self._emit_metrics)
+        background_job_thread.setDaemon(True)
+        background_job_thread.start()
+
+    def _emit_metrics(self):
+        while True:
+            lags, partitions = self._calculate_lag()
+            for lag, partition in zip(lags, partitions):
+                MetricWriter().update_kafka_lag(total_lag=lag, partition=partition)
+
+            time.sleep(60)
+
+    def _calculate_lag(self) -> Tuple[List[int], List[int]]:
+        cluster_metadata = self._consumer.list_topics(topic=self._topic)
+        topic_metadata = cluster_metadata.topics.get(self._topic)
+        partition_ids = list(topic_metadata.partitions.keys())
+
+        topic_partitions = [
+            TopicPartition(topic=self._topic, partition=partition_id)
+            for partition_id in partition_ids
+        ]
+
+        committed_offsets = self._consumer.committed(topic_partitions)
+        committed_offsets_per_partitions = {}
+
+        for topic_partition in committed_offsets:
+            key = f"{topic_partition.topic}_{topic_partition.partition}"
+            committed_offsets_per_partitions[key] = topic_partition.offset
+
+        diff = []
+        partitions = []
+        for topic_partition in topic_partitions:
+            _, high = self._consumer.get_watermark_offsets(topic_partition)
+            committed_offset_key = (
+                f"{topic_partition.topic}_{topic_partition.partition}"
+            )
+            commited_offset = committed_offsets_per_partitions.get(
+                committed_offset_key, 0
+            )
+            diff.append(high - commited_offset)
+            partitions.append(topic_partition.partition)
+
+        return diff, partitions
+
     def poll_new_logs(self) -> List[PredictionLog]:
         messages = self._consumer.consume(self._batch_size, timeout=self._poll_timeout)
         errors = [msg.error() for msg in messages if msg.error() is not None]

diff --git a/python/observation-publisher/publisher/prediction_log_parser.py b/python/observation-publisher/publisher/prediction_log_parser.py
@@ -92,7 +92,7 @@ def from_struct(
 
 
 def convert_to_numpy_value(
-    col_value: Optional[int | str | float | bool], value_type: Optional[ValueType]
+    col_value: Optional[int | str | float | bool], value_type: Optional[ValueType], col_name: str
 ) -> np.int64 | np.float64 | np.bool_ | np.str_:
     if value_type is None:
         if isinstance(col_value, (int, float)):
@@ -104,16 +104,16 @@ def convert_to_numpy_value(
 
     match value_type:
         case ValueType.INT64:
-            assert isinstance(col_value, (int, float))
+            assert isinstance(col_value, (int, float)), f"type of value for column {col_name} should be int or float, current value: {col_value} and type: {type(col_value)}"
             return np.int64(col_value)
         case ValueType.FLOAT64:
-            assert isinstance(col_value, (int, float, NoneType))
+            assert isinstance(col_value, (int, float, NoneType)), f"type of value for column {col_name} should be int or float or None, current value: {col_value} and type: {type(col_value)}"
             return np.float64(col_value)
         case ValueType.BOOLEAN:
-            assert isinstance(col_value, bool)
+            assert isinstance(col_value, bool), f"type of value for column {col_name} should be boolean, current value: {col_value} and type: {type(col_value)}"
             return np.bool_(col_value)
         case ValueType.STRING:
-            assert isinstance(col_value, str)
+            assert isinstance(col_value, str),f"type of value for column {col_name} should be string, current value: {col_value} and type: {type(col_value)}"
             return np.str_(col_value)
         case _:
             raise ValueError(f"Unknown value type: {value_type}")
@@ -165,6 +165,6 @@ def list_value_as_numpy_list(
         column_values.append(v)
 
     return [
-        convert_to_numpy_value(col_value, column_types.get(col_name))
+        convert_to_numpy_value(col_value=col_value, value_type=column_types.get(col_name), col_name=col_name)
         for col_value, col_name in zip(column_values, column_names) if column_types.get(col_name) is not None
     ]