From 90bdf7dbe3bc5d6af32714825c7110e78b26e69e Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Mon, 9 Mar 2020 14:39:55 +0200 Subject: [PATCH 01/40] move datadog init to AppConfig.ready --- corehq/util/datadog/apps.py | 27 +++++++++++++++++++++++++++ settings.py | 14 +------------- 2 files changed, 28 insertions(+), 13 deletions(-) create mode 100644 corehq/util/datadog/apps.py diff --git a/corehq/util/datadog/apps.py b/corehq/util/datadog/apps.py new file mode 100644 index 000000000000..032d76e978d5 --- /dev/null +++ b/corehq/util/datadog/apps.py @@ -0,0 +1,27 @@ +from django.apps import AppConfig +from django.conf import settings + + +class DatadogConfig(AppConfig): + + name = 'corehq.util.datadog' + verbose_name = 'Datadog' + + def ready(self): + if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY: + return + + try: + from datadog import initialize + except ImportError: + pass + else: + initialize(settings.DATADOG_API_KEY, settings.DATADOG_APP_KEY) + + if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS: + try: + from ddtrace import tracer + tracer.enabled = False + except ImportError: + pass + diff --git a/settings.py b/settings.py index eaa6db1f5837..f4939ce9adcd 100755 --- a/settings.py +++ b/settings.py @@ -340,6 +340,7 @@ 'corehq.motech.openmrs', 'corehq.motech.repeaters', 'corehq.util', + 'corehq.util.datadog.apps.DatadogConfig' 'dimagi.ext', 'corehq.blobs', 'corehq.apps.case_search', @@ -2073,19 +2074,6 @@ if 'dummy' not in CACHES: CACHES['dummy'] = {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'} -try: - from datadog import initialize -except ImportError: - pass -else: - initialize(DATADOG_API_KEY, DATADOG_APP_KEY) - -if UNIT_TESTING or DEBUG or 'ddtrace.contrib.django' not in INSTALLED_APPS: - try: - from ddtrace import tracer - tracer.enabled = False - except ImportError: - pass REST_FRAMEWORK = { 'DATETIME_FORMAT': '%Y-%m-%dT%H:%M:%S.%fZ', From 7ad93fa2d6aeee3b0d337ae28eb5e0f83516e3e1 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 10 Mar 2020 12:33:08 +0200 Subject: [PATCH 02/40] pluggable metrics --- corehq/util/metrics/__init__.py | 19 ++++ corehq/util/metrics/datadog.py | 49 +++++++++++ corehq/util/metrics/metrics.py | 139 ++++++++++++++++++++++++++++++ corehq/util/metrics/prometheus.py | 46 ++++++++++ settings.py | 3 +- 5 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 corehq/util/metrics/__init__.py create mode 100644 corehq/util/metrics/datadog.py create mode 100644 corehq/util/metrics/metrics.py create mode 100644 corehq/util/metrics/prometheus.py diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py new file mode 100644 index 000000000000..7672abf5b545 --- /dev/null +++ b/corehq/util/metrics/__init__.py @@ -0,0 +1,19 @@ +from corehq.util.metrics.datadog import DatadogMetrics +from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, HqMetrics +from corehq.util.metrics.prometheus import PrometheusMetrics + +_metrics = None # singleton/global + + +def get_metrics() -> HqMetrics: + global _metrics + if not _metrics: + enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()])) + if not enabled: + _metrics = DummyMetrics() + + if len(enabled) > 1: + _metrics = DelegatedMetrics(enabled) + + _metrics = enabled[0] + return _metrics diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py new file mode 100644 index 000000000000..a0598168e9b1 --- /dev/null +++ b/corehq/util/metrics/datadog.py @@ 
-0,0 +1,49 @@ +import logging + +from django.conf import settings + +from corehq.util.metrics.metrics import HqCounter, HqGauge, HqMetrics +from datadog import api +from datadog.dogstatsd.base import DogStatsd + +datadog_logger = logging.getLogger('datadog') + +COMMON_TAGS = ['environment:{}'.format(settings.SERVER_ENVIRONMENT)] + +statsd = DogStatsd(constant_tags=COMMON_TAGS) + + +def _format_tags(tag_names, tag_values): + if not tag_names or not tag_values: + return None + + return [ + f'{name}:{value}' for name, value in zip(tag_names, tag_values) + ] + + +def _datadog_record(fn, name, value, tags=None): + try: + fn(name, value, tags=tags) + except Exception: + datadog_logger.exception('Unable to record Datadog stats') + + +class Counter(HqCounter): + def _inc(self, amount=1): + tags = _format_tags(self.tag_names, self.tag_values) + _datadog_record(statsd.increment, self.name, amount, tags) + + +class Gauge(HqGauge): + def _set(self, value): + tags = _format_tags(self.tag_names, self.tag_values) + _datadog_record(statsd.gauge, self.name, value, tags) + + +class DatadogMetrics(HqMetrics): + _counter_class = Counter + _gauge_class = Gauge + + def enabled(self) -> bool: + return api._api_key and api._application_key diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py new file mode 100644 index 000000000000..9b52266bee92 --- /dev/null +++ b/corehq/util/metrics/metrics.py @@ -0,0 +1,139 @@ +import abc +import re +from abc import abstractmethod +from typing import Iterable + +from corehq.util.soft_assert import soft_assert + +METRIC_NAME_RE = re.compile(r'^[a-zA-Z_:.][a-zA-Z0-9_:.]*$') +METRIC_TAG_NAME_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$') +RESERVED_METRIC_TAG_NAME_RE = re.compile(r'^__.*$') +RESERVED_METRIC_TAG_NAMES = ['quantile', 'le'] + + +def _enforce_prefix(name, prefix): + soft_assert(fail_if_debug=True).call( + not prefix or name.startswith(prefix), + "Did you mean to call your metric 'commcare.{}'? 
".format(name)) + + +def _validate_tag_names(tag_names): + tag_names = tuple(tag_names) + for l in tag_names: + if not METRIC_TAG_NAME_RE.match(l): + raise ValueError('Invalid metric tag name: ' + l) + if RESERVED_METRIC_TAG_NAME_RE.match(l): + raise ValueError('Reserved metric tag name: ' + l) + if l in RESERVED_METRIC_TAG_NAMES: + raise ValueError('Reserved metric tag name: ' + l) + return tag_names + + +class MetricBase: + def __init__(self, name: str, documentation: str, tag_names: Iterable=tuple(), tag_values=None): + self.name = name + if not METRIC_NAME_RE.match(name): + raise ValueError('Invalid metric name: ' + name) + _enforce_prefix(name, 'commcare') + self.documentation = documentation + self.tag_names = _validate_tag_names(tag_names) + self.tag_values = tag_values + + def tag(self, **tag_kwargs): + if sorted(tag_kwargs) != sorted(self.tag_names): + raise ValueError('Incorrect tag names') + + tag_values = tuple(str(tag_kwargs[t]) for t in self.tag_names) + return self._get_tagged_instance(tag_values) + + def _get_tagged_instance(self, tag_values): + return self.__class__(self.name, self.documentation, self.tag_names, tag_values) + + def _validate_tags(self): + if self.tag_names and not self.tag_values: + raise Exception('Metric has missing tag values.') + + +class HqCounter(MetricBase): + def inc(self, amount: float = 1): + """Increment the counter by the given amount.""" + self._validate_tags() + self._inc(amount) + + def _inc(self, amount: float = 1): + """Override this method to record the metric""" + pass + + +class HqGauge(MetricBase): + def set(self, value: float): + """Set gauge to the given value.""" + self._validate_tags() + self._set(value) + + def _set(self, value: float): + """Override this method to record the metric""" + pass + + +class HqMetrics(metaclass=abc.ABCMeta): + _counter_class = None + _gauge_class = None + + @abstractmethod + def enabled(self) -> bool: + raise NotImplementedError + + def counter(self, name: str, documentation: str, tag_names: Iterable=tuple()) -> HqCounter: + return self._counter_class(name, documentation, tag_names) + + def gauge(self, name: str, documentation: str, tag_names: Iterable=tuple()) -> HqGauge: + return self._gauge_class(name, documentation, tag_names) + + +class DummyMetrics(HqMetrics): + _counter_class = HqCounter + _gauge_class = HqGauge + + def enabled(self) -> bool: + return True + + +class DelegatedMetrics(HqMetrics): + def __init__(self, delegates): + self.delegates = delegates + + def enabled(self) -> bool: + return True + + def counter(self, name: str, documentation: str, tag_names: Iterable=tuple()): + return DelegatingCounter([ + d.counter(name, documentation, tag_names) for d in self.delegates + ]) + + def gauge(self, name: str, documentation: str, tag_names: Iterable=tuple()): + return DelegatingGauge([ + d.gauge(name, documentation, tag_names) for d in self.delegates + ]) + + +class DelegatingMetric: + def __init__(self, delegates): + self._delegates = delegates + + def tag(self, **tag_kwargs): + return self.__class__([ + d.tag(**tag_kwargs) for d in self._delegates + ]) + + +class DelegatingCounter(DelegatingMetric): + def inc(self, amount: float = 1): + for d in self._delegates: + d.inc(amount) + + +class DelegatingGauge(DelegatingMetric): + def set(self, value: float): + for d in self._delegates: + d.set(value) diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py new file mode 100644 index 000000000000..6637f7097520 --- /dev/null +++ b/corehq/util/metrics/prometheus.py @@ -0,0 
+1,46 @@ +from typing import Iterable + +import settings +from corehq.util.metrics.metrics import ( + HqCounter, + HqGauge, + HqMetrics, + MetricBase, +) +from prometheus_client import Counter as PCounter +from prometheus_client import Gauge as PGauge + + +class PrometheusMetricBase(MetricBase): + _metric_class = None + + def __init__(self, name: str, documentation: str, tag_names: Iterable=tuple(), tag_values=None, delegate=None): + name = name.replace('.', '_') + super().__init__(name, documentation, tag_names, tag_values) + self._delegate = delegate or self._metric_class(name, documentation, tag_names, labelvalues=tag_values) + + def _get_tagged_instance(self, tag_values): + delegate = self._delegate.labels(dict(zip(self.tag_names, tag_values))) + return self.__class__(self.name, self.documentation, self.tag_names, self.tag_values, delegate) + + +class Counter(PrometheusMetricBase, HqCounter): + _metric_class = PCounter + + def _inc(self, amount=1): + self._delegate.inc(amount) + + +class Gauge(PrometheusMetricBase, HqGauge): + _metric_class = PGauge + + def _set(self, value: float): + self._delegate.set(value) + + +class PrometheusMetrics(HqMetrics): + _counter_class = Counter + _gauge_class = Gauge + + def enabled(self) -> bool: + return settings.ENABLE_PROMETHEUS_METRICS diff --git a/settings.py b/settings.py index f4939ce9adcd..05431414ab78 100755 --- a/settings.py +++ b/settings.py @@ -340,7 +340,7 @@ 'corehq.motech.openmrs', 'corehq.motech.repeaters', 'corehq.util', - 'corehq.util.datadog.apps.DatadogConfig' + 'corehq.util.datadog.apps.DatadogConfig', 'dimagi.ext', 'corehq.blobs', 'corehq.apps.case_search', @@ -838,6 +838,7 @@ DATADOG_API_KEY = None DATADOG_APP_KEY = None +ENABLE_PROMETHEUS_METRICS = False SYNCLOGS_SQL_DB_ALIAS = 'default' From 0694926fc68420d9ef09b8d5a29f315abd3e8266 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 10 Mar 2020 17:41:01 +0200 Subject: [PATCH 03/40] add histogram and tests --- corehq/util/metrics/datadog.py | 40 ++++- corehq/util/metrics/metrics.py | 58 ++++++-- corehq/util/metrics/prometheus.py | 43 ++++-- corehq/util/metrics/tests/__init__.py | 0 corehq/util/metrics/tests/test_metrics.py | 171 ++++++++++++++++++++++ corehq/util/metrics/tests/utils.py | 22 +++ 6 files changed, 308 insertions(+), 26 deletions(-) create mode 100644 corehq/util/metrics/tests/__init__.py create mode 100644 corehq/util/metrics/tests/test_metrics.py create mode 100644 corehq/util/metrics/tests/utils.py diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index a0598168e9b1..748718c3f23c 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -2,7 +2,13 @@ from django.conf import settings -from corehq.util.metrics.metrics import HqCounter, HqGauge, HqMetrics +from corehq.util.datadog.utils import bucket_value +from corehq.util.metrics.metrics import ( + HqCounter, + HqGauge, + HqHistogram, + HqMetrics, +) from datadog import api from datadog.dogstatsd.base import DogStatsd @@ -30,20 +36,48 @@ def _datadog_record(fn, name, value, tags=None): class Counter(HqCounter): - def _inc(self, amount=1): + def _record(self, amount: float): tags = _format_tags(self.tag_names, self.tag_values) _datadog_record(statsd.increment, self.name, amount, tags) class Gauge(HqGauge): - def _set(self, value): + def _record(self, value): tags = _format_tags(self.tag_names, self.tag_values) _datadog_record(statsd.gauge, self.name, value, tags) +class Histogram(HqHistogram): + """This implementation of histogram uses tagging to record the 
buckets. + It does not use the Datadog Histogram metric type. + + The metric itself will be incremented by 1 on each call to `observe`. The value + passed to `observe` will be used to create the bucket tag. + + For example: + + h = Histogram( + 'commcare.request.duration', 'description', + bucket_tag='duration', buckets=[1,2,3], bucket_units='ms' + ) + h.observe(1.4) + + # resulting Datadog metric + # commcare.request.duration:1|c|#duration:lt_2ms + + More details: https://help.datadoghq.com/hc/en-us/articles/211545826 + """ + def _record(self, value: float): + tags = _format_tags(self.tag_names, self.tag_values) + bucket = bucket_value(value, self._buckets, self._bucket_unit) + tags.append(f'{self._bucket_tag}:{bucket}') + _datadog_record(statsd.increment, self.name, 1, tags) + + class DatadogMetrics(HqMetrics): _counter_class = Counter _gauge_class = Gauge + _histogram_class = Histogram def enabled(self) -> bool: return api._api_key and api._application_key diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 9b52266bee92..73f7dad36e75 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -1,9 +1,10 @@ import abc import re from abc import abstractmethod -from typing import Iterable +from typing import Iterable, List from corehq.util.soft_assert import soft_assert +from prometheus_client.utils import INF METRIC_NAME_RE = re.compile(r'^[a-zA-Z_:.][a-zA-Z0-9_:.]*$') METRIC_TAG_NAME_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$') @@ -30,14 +31,19 @@ def _validate_tag_names(tag_names): class MetricBase: - def __init__(self, name: str, documentation: str, tag_names: Iterable=tuple(), tag_values=None): + def __init__(self, name: str, documentation: str, tag_names: Iterable=tuple(), **kwargs): self.name = name if not METRIC_NAME_RE.match(name): raise ValueError('Invalid metric name: ' + name) _enforce_prefix(name, 'commcare') self.documentation = documentation self.tag_names = _validate_tag_names(tag_names) - self.tag_values = tag_values + self.tag_values = kwargs.pop('tag_values', None) + self._kwargs = kwargs + self._init_metric() + + def _init_metric(self): + pass def tag(self, **tag_kwargs): if sorted(tag_kwargs) != sorted(self.tag_names): @@ -47,38 +53,54 @@ def tag(self, **tag_kwargs): return self._get_tagged_instance(tag_values) def _get_tagged_instance(self, tag_values): - return self.__class__(self.name, self.documentation, self.tag_names, tag_values) + return self.__class__( + self.name, self.documentation, tag_names=self.tag_names, tag_values=tag_values, **self._kwargs + ) def _validate_tags(self): if self.tag_names and not self.tag_values: raise Exception('Metric has missing tag values.') + def _record(self, value: float): + pass + class HqCounter(MetricBase): def inc(self, amount: float = 1): """Increment the counter by the given amount.""" self._validate_tags() - self._inc(amount) - - def _inc(self, amount: float = 1): - """Override this method to record the metric""" - pass + self._record(amount) class HqGauge(MetricBase): def set(self, value: float): """Set gauge to the given value.""" self._validate_tags() - self._set(value) + self._record(value) - def _set(self, value: float): - """Override this method to record the metric""" - pass + +DEFAULT_BUCKETS = (.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0, INF) + + +class HqHistogram(MetricBase): + + def _init_metric(self): + self._buckets = self._kwargs.get('buckets') or DEFAULT_BUCKETS + self._bucket_unit = self._kwargs.get('bucket_unit', '') + 
self._bucket_tag = self._kwargs.get('bucket_tag') + if self._bucket_tag in self.tag_names: + self.tag_names = tuple([name for name in self.tag_names if name != self._bucket_tag]) + + def observe(self, value: float): + """Update histogram with the given value.""" + self._validate_tags() + self._record(value) class HqMetrics(metaclass=abc.ABCMeta): _counter_class = None _gauge_class = None + _histogram_class = None @abstractmethod def enabled(self) -> bool: @@ -90,6 +112,16 @@ def counter(self, name: str, documentation: str, tag_names: Iterable=tuple()) -> def gauge(self, name: str, documentation: str, tag_names: Iterable=tuple()) -> HqGauge: return self._gauge_class(name, documentation, tag_names) + def histogram(self, name: str, documentation: str, + bucket_tag: str, buckets: List[int] = DEFAULT_BUCKETS, bucket_unit: str = '', + tag_names: Iterable=tuple()) -> HqGauge: + """Create a histogram metric. Histogram implementations differ between provider. See provider + implementations for details. + """ + return self._histogram_class( + name, documentation, tag_names, bucket_tag=bucket_tag, buckets=buckets, bucket_unit=bucket_unit + ) + class DummyMetrics(HqMetrics): _counter_class = HqCounter diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py index 6637f7097520..c98146f3886d 100644 --- a/corehq/util/metrics/prometheus.py +++ b/corehq/util/metrics/prometheus.py @@ -1,46 +1,69 @@ -from typing import Iterable - import settings from corehq.util.metrics.metrics import ( HqCounter, HqGauge, + HqHistogram, HqMetrics, MetricBase, ) from prometheus_client import Counter as PCounter from prometheus_client import Gauge as PGauge +from prometheus_client import Histogram as PHistogram class PrometheusMetricBase(MetricBase): _metric_class = None - def __init__(self, name: str, documentation: str, tag_names: Iterable=tuple(), tag_values=None, delegate=None): - name = name.replace('.', '_') - super().__init__(name, documentation, tag_names, tag_values) - self._delegate = delegate or self._metric_class(name, documentation, tag_names, labelvalues=tag_values) + def _init_metric(self): + super()._init_metric() + self.name = self.name.replace('.', '_') + delegate = self._kwargs.get('delegate') + self._delegate = delegate or self._metric_class(self.name, self.documentation, self.tag_names) def _get_tagged_instance(self, tag_values): - delegate = self._delegate.labels(dict(zip(self.tag_names, tag_values))) - return self.__class__(self.name, self.documentation, self.tag_names, self.tag_values, delegate) + delegate = self._delegate.labels(**dict(zip(self.tag_names, tag_values))) + return self.__class__( + self.name, self.documentation, + tag_names=self.tag_names, tag_values=tag_values, delegate=delegate, **self._kwargs + ) class Counter(PrometheusMetricBase, HqCounter): + """https://prometheus.io/docs/concepts/metric_types/#counter""" _metric_class = PCounter - def _inc(self, amount=1): + def _record(self, amount: float): self._delegate.inc(amount) class Gauge(PrometheusMetricBase, HqGauge): + """https://prometheus.io/docs/concepts/metric_types/#gauge""" _metric_class = PGauge - def _set(self, value: float): + def _record(self, value: float): self._delegate.set(value) +class Histogram(PrometheusMetricBase, HqHistogram): + """This metric class implements the native Prometheus Histogram type + + https://prometheus.io/docs/concepts/metric_types/#histogram + """ + def _init_metric(self): + HqHistogram._init_metric(self) # skip _init_metric on PrometheusMetricBase + self.name = 
self.name.replace('.', '_') + self._delegate = self._kwargs.get('delegate') + if not self._delegate: + self._delegate = PHistogram(self.name, self.documentation, self.tag_names, buckets=self._buckets) + + def _record(self, value: float): + self._delegate.observe(value) + + class PrometheusMetrics(HqMetrics): _counter_class = Counter _gauge_class = Gauge + _histogram_class = Histogram def enabled(self) -> bool: return settings.ENABLE_PROMETHEUS_METRICS diff --git a/corehq/util/metrics/tests/__init__.py b/corehq/util/metrics/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/corehq/util/metrics/tests/test_metrics.py b/corehq/util/metrics/tests/test_metrics.py new file mode 100644 index 000000000000..caf4eb4c0be6 --- /dev/null +++ b/corehq/util/metrics/tests/test_metrics.py @@ -0,0 +1,171 @@ +from typing import Dict, Tuple + +from django.test import SimpleTestCase + +from corehq.util.metrics import DatadogMetrics, PrometheusMetrics +from corehq.util.metrics.metrics import HqCounter, HqGauge, HqHistogram +from corehq.util.metrics.tests.utils import patch_datadog +from prometheus_client.samples import Sample +from prometheus_client.utils import INF + + +class _TestMetrics(SimpleTestCase): + provider_class = None + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.provider = cls.provider_class() + + def test_counter(self): + counter = self.provider.counter('commcare.test.counter', 'Description', tag_names=['t1', 't2']) + counter.tag(t1='a', t2='b').inc() + counter.tag(t1='c', t2='b').inc(2) + counter.tag(t1='c', t2='b').inc() + self.assertCounterMetric(counter, { + (('t1', 'a'), ('t2', 'b')): 1, + (('t1', 'c'), ('t2', 'b')): 3, + }) + + def test_gauge(self): + gauge = self.provider.gauge('commcare.test.gauge', 'Description', tag_names=['t1', 't2']) + gauge.tag(t1='a', t2='b').set(4.2) + gauge.tag(t1='c', t2='b').set(2) + gauge.tag(t1='c', t2='b').set(5) + self.assertGaugeMetric(gauge, { + (('t1', 'a'), ('t2', 'b')): 4.2, + (('t1', 'c'), ('t2', 'b')): 5, + }) + + def test_histogram(self): + histogram = self.provider.histogram( + 'commcare.test.histogram', 'Description', 'duration', + buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] + ) + tagged_1 = histogram.tag(t1='a', t2='b') + tagged_1.observe(0.2) + tagged_1.observe(0.7) + tagged_1.observe(2.5) + + tagged_2 = histogram.tag(t1='c', t2='b') + tagged_2.observe(2) + tagged_2.observe(5) + self.assertHistogramMetric(histogram, { + (('t1', 'a'), ('t2', 'b')): {1: 2, 3: 1}, + (('t1', 'c'), ('t2', 'b')): {2: 1, INF: 1} + }) + + def assertCounterMetric(self, metric: HqCounter, expected: Dict[Tuple[Tuple[str, str], ...], float]): + """ + :param metric: metric class + :param expected: dict mapping tag tuples to metric values + """ + raise NotImplementedError + + def assertGaugeMetric(self, metric: HqGauge, expected: Dict[Tuple[Tuple[str, str], ...], float]): + """ + :param metric: metric class + :param expected: dict mapping tag tuples to metric values + """ + raise NotImplementedError + + def assertHistogramMetric(self, + metric: HqHistogram, + expected: Dict[Tuple[Tuple[str, str], ...], Dict[float, int]]): + """ + :param metric: metric class + :param expected: dict mapping tag tuples to a dict of bucket values + """ + raise NotImplementedError + + +class TestDatadogMetrics(_TestMetrics): + provider_class = DatadogMetrics + + def setUp(self) -> None: + super().setUp() + self.patch = patch_datadog() + self.metrics = self.patch.__enter__() + + def tearDown(self) -> None: + 
self.patch.__exit__(None, None, None) + super().tearDown() + + def assertCounterMetric(self, metric, expected): + self.assertEqual({key[0] for key in self.metrics}, {metric.name}) + actual = { + key[1]: sum(val) for key, val in self.metrics.items() + } + self.assertDictEqual(actual, expected) + + def assertGaugeMetric(self, metric, expected): + self.assertEqual({key[0] for key in self.metrics}, {metric.name}) + actual = { + key[1]: val[-1] for key, val in self.metrics.items() + } + self.assertDictEqual(actual, expected) + + def assertHistogramMetric(self, metric, expected): + self.assertEqual({key[0] for key in self.metrics}, {metric.name}) + expected_samples = {} + for tags, buckets in expected.items(): + for bucket, val in buckets.items(): + prefix = 'lt' + if bucket == INF: + bucket = metric._buckets[-1] + prefix = 'over' + bucket_tag = (metric._bucket_tag, f'{prefix}_{bucket:03d}{metric._bucket_unit}') + expected_samples[tags + (bucket_tag,)] = val + + actual = { + key[1]: sum(val) for key, val in self.metrics.items() + } + self.assertDictEqual(actual, expected_samples) + + +class TestPrometheusMetrics(_TestMetrics): + provider_class = PrometheusMetrics + + def _samples_to_dics(self, samples, filter_name=None): + """Convert a Sample tuple into a dict((name, (labels tuple)) -> value)""" + return { + tuple(sample.labels.items()): sample.value + for sample in samples + if not filter_name or sample.name == filter_name + } + + def assertGaugeMetric(self, metric, expected): + [collected] = metric._delegate.collect() + actual = self._samples_to_dics(collected.samples) + self.assertDictEqual(actual, expected) + + def assertCounterMetric(self, metric, expected): + total_name = f'{metric.name}_total' + [collected] = metric._delegate.collect() + actual = self._samples_to_dics(collected.samples, total_name) + self.assertDictEqual(actual, expected) + + def assertHistogramMetric(self, metric, expected): + # Note that Prometheus histograms are cumulative so we must sum up the successive bucket values + # https://en.wikipedia.org/wiki/Histogram#Cumulative_histogram + [collected] = metric._delegate.collect() + + sample_name = f'{metric.name}_bucket' + expected_samples = [] + for key, value in expected.items(): + cumulative_value = 0 + for bucket in metric._buckets: + val = value.get(bucket, 0) + cumulative_value += val + labels = dict(key + (('le', str(float(bucket))),)) + expected_samples.append(Sample(sample_name, labels, float(cumulative_value), None, None)) + + labels = dict(key + (('le', '+Inf'),)) + cumulative_value += value.get(INF, 0) + expected_samples.append(Sample(sample_name, labels, float(cumulative_value), None, None)) + + actual = [ + s for s in collected.samples + if s.name.endswith('bucket') + ] + self.assertListEqual(actual, expected_samples) diff --git a/corehq/util/metrics/tests/utils.py b/corehq/util/metrics/tests/utils.py new file mode 100644 index 000000000000..9c7f94b2d104 --- /dev/null +++ b/corehq/util/metrics/tests/utils.py @@ -0,0 +1,22 @@ +from collections import defaultdict +from contextlib import contextmanager + +import mock + + +@contextmanager +def patch_datadog(): + def record(fn, name, value, tags=None): + key = tuple([ + name, + tuple([tuple(t.split(':')) for t in (tags or [])]), + ]) + stats[key].append(value) + + stats = defaultdict(list) + patch = mock.patch("corehq.util.metrics.datadog._datadog_record", new=record) + patch.start() + try: + yield stats + finally: + patch.stop() From c1f8270e865523937f4390fb7ea55ba093ec185c Mon Sep 17 00:00:00 2001 From: Simon 
Kelly Date: Tue, 10 Mar 2020 17:41:11 +0200 Subject: [PATCH 04/40] bucket values less than or equal --- corehq/util/datadog/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corehq/util/datadog/utils.py b/corehq/util/datadog/utils.py index 9d495679260f..c61a4d099cb1 100644 --- a/corehq/util/datadog/utils.py +++ b/corehq/util/datadog/utils.py @@ -115,7 +115,7 @@ def bucket_value(value, buckets, unit=''): lt_template = "lt_{:0%s}{}" % number_length over_template = "over_{:0%s}{}" % number_length for bucket in buckets: - if value < bucket: + if value <= bucket: return lt_template.format(bucket, unit) return over_template.format(buckets[-1], unit) From 5034156b76b1918a6b784f6cd8883380cc0bbec3 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 10 Mar 2020 18:08:00 +0200 Subject: [PATCH 05/40] add prometheus client to requirements --- requirements-python3/dev-requirements.txt | 1 + requirements-python3/prod-requirements.txt | 1 + requirements-python3/requirements.txt | 1 + requirements-python3/test-requirements.txt | 1 + requirements/dev-requirements.txt | 1 + requirements/prod-requirements.txt | 1 + requirements/requirements.in | 1 + requirements/requirements.txt | 1 + requirements/test-requirements.txt | 1 + 9 files changed, 9 insertions(+) diff --git a/requirements-python3/dev-requirements.txt b/requirements-python3/dev-requirements.txt index d0741b0bb668..ddaa038ed973 100644 --- a/requirements-python3/dev-requirements.txt +++ b/requirements-python3/dev-requirements.txt @@ -130,6 +130,7 @@ pillow==6.2.1 pip-tools==4.4.0 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 prompt-toolkit==1.0.18 # via ipython psutil==5.1.3 psycogreen==1.0.1 diff --git a/requirements-python3/prod-requirements.txt b/requirements-python3/prod-requirements.txt index 5627994d311c..128b82f8f986 100644 --- a/requirements-python3/prod-requirements.txt +++ b/requirements-python3/prod-requirements.txt @@ -109,6 +109,7 @@ pickleshare==0.7.5 # via ipython pillow==6.2.1 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 prompt-toolkit==1.0.18 # via ipython psycogreen==1.0.1 psycopg2==2.7.7 diff --git a/requirements-python3/requirements.txt b/requirements-python3/requirements.txt index 207e181a974d..148161bb967e 100644 --- a/requirements-python3/requirements.txt +++ b/requirements-python3/requirements.txt @@ -101,6 +101,7 @@ phonenumberslite==8.10.22 pillow==6.2.1 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 psycogreen==1.0.1 psycopg2==2.7.7 py-kissmetrics==1.0.1 diff --git a/requirements-python3/test-requirements.txt b/requirements-python3/test-requirements.txt index 12bf9a824042..6c5950ac74cf 100644 --- a/requirements-python3/test-requirements.txt +++ b/requirements-python3/test-requirements.txt @@ -113,6 +113,7 @@ pillow==6.2.1 pip-tools==4.4.0 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 psycogreen==1.0.1 psycopg2==2.7.7 py-kissmetrics==1.0.1 diff --git a/requirements/dev-requirements.txt b/requirements/dev-requirements.txt index d0741b0bb668..ddaa038ed973 100644 --- a/requirements/dev-requirements.txt +++ b/requirements/dev-requirements.txt @@ -130,6 +130,7 @@ pillow==6.2.1 pip-tools==4.4.0 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 prompt-toolkit==1.0.18 # via ipython psutil==5.1.3 psycogreen==1.0.1 diff --git a/requirements/prod-requirements.txt b/requirements/prod-requirements.txt index 5627994d311c..128b82f8f986 100644 --- 
a/requirements/prod-requirements.txt +++ b/requirements/prod-requirements.txt @@ -109,6 +109,7 @@ pickleshare==0.7.5 # via ipython pillow==6.2.1 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 prompt-toolkit==1.0.18 # via ipython psycogreen==1.0.1 psycopg2==2.7.7 diff --git a/requirements/requirements.in b/requirements/requirements.in index c4754c951eea..25295c647c87 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -114,3 +114,4 @@ werkzeug==0.11.15 CommcareTranslationChecker==0.9.3.5 WeasyPrint==0.42.3 architect==0.5.6 +prometheus-client==0.7.1 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 207e181a974d..148161bb967e 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -101,6 +101,7 @@ phonenumberslite==8.10.22 pillow==6.2.1 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 psycogreen==1.0.1 psycopg2==2.7.7 py-kissmetrics==1.0.1 diff --git a/requirements/test-requirements.txt b/requirements/test-requirements.txt index 12bf9a824042..6c5950ac74cf 100644 --- a/requirements/test-requirements.txt +++ b/requirements/test-requirements.txt @@ -113,6 +113,7 @@ pillow==6.2.1 pip-tools==4.4.0 ply==3.11 # via eulxml, jsonpath-rw polib==1.1.0 +prometheus-client==0.7.1 psycogreen==1.0.1 psycopg2==2.7.7 py-kissmetrics==1.0.1 From 2be4e6a27c6cab64d0ebc696c2f6341915e2e79e Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 11 Mar 2020 14:46:18 +0200 Subject: [PATCH 06/40] make metrics lazy --- corehq/util/metrics/__init__.py | 20 +++---- corehq/util/metrics/datadog.py | 2 + corehq/util/metrics/metrics.py | 63 +++++++++++------------ corehq/util/metrics/tests/test_metrics.py | 60 ++++++++++++++++++--- 4 files changed, 93 insertions(+), 52 deletions(-) diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 7672abf5b545..6789a3130bfc 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -1,19 +1,15 @@ +from django.utils.functional import SimpleLazyObject + from corehq.util.metrics.datadog import DatadogMetrics from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, HqMetrics from corehq.util.metrics.prometheus import PrometheusMetrics -_metrics = None # singleton/global - -def get_metrics() -> HqMetrics: - global _metrics - if not _metrics: - enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()])) - if not enabled: - _metrics = DummyMetrics() +def _get_metrics(): + enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()])) + if not enabled: + return [DummyMetrics()] + return enabled - if len(enabled) > 1: - _metrics = DelegatedMetrics(enabled) - _metrics = enabled[0] - return _metrics +metrics = DelegatedMetrics(SimpleLazyObject(_get_metrics)) # singleton/global diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index 748718c3f23c..e13d5c3d9a1a 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -69,6 +69,8 @@ class Histogram(HqHistogram): """ def _record(self, value: float): tags = _format_tags(self.tag_names, self.tag_values) + if not tags: + tags = [] bucket = bucket_value(value, self._buckets, self._bucket_unit) tags.append(f'{self._bucket_tag}:{bucket}') _datadog_record(statsd.increment, self.name, 1, tags) diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 73f7dad36e75..c4038abaf6ed 100644 --- a/corehq/util/metrics/metrics.py +++ 
b/corehq/util/metrics/metrics.py @@ -3,6 +3,8 @@ from abc import abstractmethod from typing import Iterable, List +from django.utils.functional import SimpleLazyObject + from corehq.util.soft_assert import soft_assert from prometheus_client.utils import INF @@ -31,7 +33,7 @@ def _validate_tag_names(tag_names): class MetricBase: - def __init__(self, name: str, documentation: str, tag_names: Iterable=tuple(), **kwargs): + def __init__(self, name: str, documentation: str, tag_names: Iterable = tuple(), **kwargs): self.name = name if not METRIC_NAME_RE.match(name): raise ValueError('Invalid metric name: ' + name) @@ -62,7 +64,7 @@ def _validate_tags(self): raise Exception('Metric has missing tag values.') def _record(self, value: float): - pass + raise NotImplementedError class HqCounter(MetricBase): @@ -114,7 +116,7 @@ def gauge(self, name: str, documentation: str, tag_names: Iterable=tuple()) -> H def histogram(self, name: str, documentation: str, bucket_tag: str, buckets: List[int] = DEFAULT_BUCKETS, bucket_unit: str = '', - tag_names: Iterable=tuple()) -> HqGauge: + tag_names: Iterable=tuple()) -> HqHistogram: """Create a histogram metric. Histogram implementations differ between provider. See provider implementations for details. """ @@ -131,41 +133,38 @@ def enabled(self) -> bool: return True -class DelegatedMetrics(HqMetrics): +class DelegatedMetrics: def __init__(self, delegates): self.delegates = delegates - - def enabled(self) -> bool: - return True - - def counter(self, name: str, documentation: str, tag_names: Iterable=tuple()): - return DelegatingCounter([ - d.counter(name, documentation, tag_names) for d in self.delegates - ]) - - def gauge(self, name: str, documentation: str, tag_names: Iterable=tuple()): - return DelegatingGauge([ - d.gauge(name, documentation, tag_names) for d in self.delegates - ]) + self._types = { + 'counter': 'inc', + 'gauge': 'set', + 'histogram': 'observe', + } + + def __getattr__(self, item): + if item in self._types: + def _make_type(*args, **kwargs): + return SimpleLazyObject(lambda: DelegatingMetric([ + getattr(d, item)(*args, **kwargs) for d in self.delegates + ], self._types[item])) + return _make_type + raise AttributeError class DelegatingMetric: - def __init__(self, delegates): + def __init__(self, delegates, record_fn_name): self._delegates = delegates + self._record_fn_name = record_fn_name - def tag(self, **tag_kwargs): - return self.__class__([ - d.tag(**tag_kwargs) for d in self._delegates - ]) + def tag(self, *args, **kwargs): + return self.__class__([d.tag(*args, **kwargs) for d in self._delegates]) + def __getattr__(self, item): + if item == self._record_fn_name: + def record(*args, **kwargs): + for metric in self._delegates: + getattr(metric, item)(*args, **kwargs) + return record -class DelegatingCounter(DelegatingMetric): - def inc(self, amount: float = 1): - for d in self._delegates: - d.inc(amount) - - -class DelegatingGauge(DelegatingMetric): - def set(self, value: float): - for d in self._delegates: - d.set(value) + raise AttributeError diff --git a/corehq/util/metrics/tests/test_metrics.py b/corehq/util/metrics/tests/test_metrics.py index caf4eb4c0be6..c6d721921780 100644 --- a/corehq/util/metrics/tests/test_metrics.py +++ b/corehq/util/metrics/tests/test_metrics.py @@ -1,12 +1,19 @@ from typing import Dict, Tuple from django.test import SimpleTestCase +from django.utils.functional import SimpleLazyObject from corehq.util.metrics import DatadogMetrics, PrometheusMetrics -from corehq.util.metrics.metrics import HqCounter, HqGauge, 
HqHistogram +from corehq.util.metrics.metrics import ( + DelegatedMetrics, + HqCounter, + HqGauge, + HqHistogram, +) from corehq.util.metrics.tests.utils import patch_datadog from prometheus_client.samples import Sample from prometheus_client.utils import INF +from testil import eq class _TestMetrics(SimpleTestCase): @@ -85,28 +92,28 @@ class TestDatadogMetrics(_TestMetrics): def setUp(self) -> None: super().setUp() self.patch = patch_datadog() - self.metrics = self.patch.__enter__() + self.recorded_metrics = self.patch.__enter__() def tearDown(self) -> None: self.patch.__exit__(None, None, None) super().tearDown() def assertCounterMetric(self, metric, expected): - self.assertEqual({key[0] for key in self.metrics}, {metric.name}) + self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) actual = { - key[1]: sum(val) for key, val in self.metrics.items() + key[1]: sum(val) for key, val in self.recorded_metrics.items() } self.assertDictEqual(actual, expected) def assertGaugeMetric(self, metric, expected): - self.assertEqual({key[0] for key in self.metrics}, {metric.name}) + self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) actual = { - key[1]: val[-1] for key, val in self.metrics.items() + key[1]: val[-1] for key, val in self.recorded_metrics.items() } self.assertDictEqual(actual, expected) def assertHistogramMetric(self, metric, expected): - self.assertEqual({key[0] for key in self.metrics}, {metric.name}) + self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) expected_samples = {} for tags, buckets in expected.items(): for bucket, val in buckets.items(): @@ -118,7 +125,7 @@ def assertHistogramMetric(self, metric, expected): bucket = metric._buckets[-1] prefix = 'over' bucket_tag = (metric._bucket_tag, f'{prefix}_{bucket:03d}{metric._bucket_unit}') - expected_samples[tags + (bucket_tag,)] = val + expected_samples[tuple(sorted(tags + (bucket_tag,)))] = val actual = { - key[1]: sum(val) for key, val in self.metrics.items() + key[1]: sum(val) for key, val in self.recorded_metrics.items() } self.assertDictEqual(actual, expected_samples) @@ -169,3 +176,40 @@ def assertHistogramMetric(self, metric, expected): if s.name.endswith('bucket') ] self.assertListEqual(actual, expected_samples) + + +def test_delegate_lazy(): + metrics = DelegatedMetrics([DatadogMetrics(), PrometheusMetrics()]) + + def _check(metric): + assert isinstance(metric, SimpleLazyObject), '' + + test_cases = [ + metrics.counter('commcare.name.1', ''), + metrics.gauge('commcare.name.2', ''), + metrics.histogram('commcare.name.3', '', 'duration'), + ] + for metric in test_cases: + yield _check, metric + + +def test_lazy_recording(): + metrics = DelegatedMetrics([DatadogMetrics(), PrometheusMetrics()]) + + def _check(metric, method_name): + with patch_datadog() as stats: + getattr(metric, method_name)(1) + + dd_metric, prom_metric = metric._delegates + [collected] = prom_metric._delegate.collect() + + eq(len(stats), 1, stats) + eq(len(collected.samples) >= 1, True, collected.samples) + + test_cases = [ + (metrics.counter('commcare.name.1', ''), 'inc'), + (metrics.gauge('commcare.name.2', ''), 'set'), + (metrics.histogram('commcare.name.3', '', 'duration'), 'observe'), + ] + for metric, method_name in test_cases: + yield _check, metric, method_name
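The module-level metrics object introduced in this patch is wrapped in SimpleLazyObject, so the enabled backends are only resolved the first time a metric is actually used. A minimal usage sketch, assuming a CommCare HQ environment with Django settings configured and at least one backend (Datadog or Prometheus) enabled; the metric name here is hypothetical:

    # Hypothetical usage of the lazy metrics singleton; not part of the patch series.
    from corehq.util.metrics import metrics

    # Nothing is instantiated yet: DelegatedMetrics.__getattr__ hands back a factory,
    # and the factory wraps the real DelegatingMetric in a SimpleLazyObject.
    example_counter = metrics.counter(
        'commcare.example.errors', 'Hypothetical counter used for illustration'
    )

    # The underlying Datadog/Prometheus counters are only created here, on first access.
    example_counter.inc()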
From eb8afe3fdc83b34a41d2531fc47dcd00952c9f6e Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 11 Mar 2020 16:45:31 +0200 Subject: [PATCH 07/40] example histogram --- corehq/apps/api/odata/utils.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/corehq/apps/api/odata/utils.py index f09608e12274..6992808fa2e4 100644 --- a/corehq/apps/api/odata/utils.py +++ b/corehq/apps/api/odata/utils.py @@ -4,6 +4,7 @@ from corehq.apps.export.models import ExportInstance from corehq.util.datadog.gauges import datadog_counter from corehq.util.datadog.utils import bucket_value +from corehq.util.metrics import metrics FieldMetadata = namedtuple('FieldMetadata', ['name', 'odata_type']) @@ -54,6 +55,13 @@ def _get_dot_path(export_column): return metadata +odata_feed_access_histogram = metrics.histogram( + 'commcare.odata_feed.test_v3', 'Odata Feed Access', + bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', + tag_names=['domain', 'feed_id', 'feed_type', 'username', 'row_count', 'column_count', 'size'] +) + + def record_feed_access_in_datadog(request, config_id, duration, response): config = ExportInstance.get(config_id) username = request.couch_user.username @@ -64,14 +72,12 @@ def record_feed_access_in_datadog(request, config_id, duration, response): column_count = len(rows[0]) except IndexError: column_count = 0 - datadog_counter('commcare.odata_feed.test_v3', tags=[ - 'domain:{}'.format(request.domain), - 'feed_id:{}'.format(config_id), - 'feed_type:{}'.format(config.type), - 'username:{}'.format(username), - 'row_count:{}'.format(row_count), - 'column_count:{}'.format(column_count), - 'size:{}'.format(len(response.content)), - 'duration:{}'.format(duration), - 'duration_bucket:{}'.format(bucket_value(duration, (1, 5, 20, 60, 120, 300, 600), 's')), - ]) + odata_feed_access_histogram.tag( + domain=request.domain, + feed_id=config_id, + feed_type=config.type, + username=username, + row_count=row_count, + column_count=column_count, + size=len(response.content), + ).observe(duration) From 54f2d1bef3026d87557e78fa4b8b5c01ade70951 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 11 Mar 2020 17:09:47 +0200 Subject: [PATCH 08/40] convert submission metrics --- corehq/apps/receiverwrapper/views.py | 69 +++++++++++++++++----------- corehq/util/datadog/metrics.py | 2 - 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/corehq/apps/receiverwrapper/views.py b/corehq/apps/receiverwrapper/views.py index 10922e0b4e45..6f27ed881bfe 100644 --- a/corehq/apps/receiverwrapper/views.py +++ b/corehq/apps/receiverwrapper/views.py @@ -10,6 +10,7 @@ import couchforms from casexml.apps.case.xform import get_case_updates, is_device_report +from corehq.util.metrics import metrics from couchforms import openrosa_response from couchforms.const import MAGIC_PROPERTY, BadRequest from couchforms.getters import MultimediaBug @@ -50,18 +51,38 @@ convert_xform_to_json, should_use_sql_backend, ) -from corehq.util.datadog.gauges import datadog_counter, datadog_gauge -from corehq.util.datadog.metrics import ( - MULTIMEDIA_SUBMISSION_ERROR_COUNT, - XFORM_LOCKED_COUNT, -) -from corehq.util.datadog.utils import bucket_value from corehq.util.timer import TimingContext PROFILE_PROBABILITY = float(os.getenv('COMMCARE_PROFILE_SUBMISSION_PROBABILITY', 0)) PROFILE_LIMIT = os.getenv('COMMCARE_PROFILE_SUBMISSION_LIMIT') PROFILE_LIMIT = int(PROFILE_LIMIT) if PROFILE_LIMIT is not None else 1 +corrupt_multimedia_counter = metrics.counter( + 'commcare.corrupt_multimedia_submissions', 'Count of requests with corrupt multimedia', + tag_names=['domain', 'authenticated'] +) + +xform_locked_error_counter = metrics.counter( + 'commcare.xformlocked.count', 'Count of locking errors', + tag_names=['domain', 'authenticated'] +) + +submission_counter = metrics.counter( + 'commcare.xform_submissions.count', 'Count of form 
submissions', + tag_names=['domain', 'backend', 'submission_type', 'status_code'] +) + +submission_duration_histogram = metrics.histogram( + 'commcare.xform_submissions.duration.seconds', 'Count of form submissions', + bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', + tag_names=['domain', 'backend', 'submission_type', 'status_code'] +) + +submission_lag_histogram = metrics.histogram( + 'commcare.xform_submissions.lag.days', 'Count of form submissions', + bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d', + tag_names=['domain', 'backend', 'submission_type', 'status_code'] +) @profile_prod('commcare_receiverwapper_process_form.prof', probability=PROFILE_PROBABILITY, limit=PROFILE_LIMIT) def _process_form(request, domain, app_id, user_id, authenticated, @@ -70,10 +91,10 @@ def _process_form(request, domain, app_id, user_id, authenticated, if rate_limit_submission(domain): return HttpTooManyRequests() - metric_tags = [ - 'backend:sql' if should_use_sql_backend(domain) else 'backend:couch', - 'domain:{}'.format(domain), - ] + metric_tags = { + 'backend': 'sql' if should_use_sql_backend(domain) else 'couch', + 'domain': domain + } try: instance, attachments = couchforms.get_instance_and_attachment(request) @@ -85,9 +106,9 @@ def _process_form(request, domain, app_id, user_id, authenticated, except: meta = {} + corrupt_multimedia_counter.tag(domain=domain, authenticated=authenticated).inc() return _submission_error( - request, "Received a submission with POST.keys()", - MULTIMEDIA_SUBMISSION_ERROR_COUNT, metric_tags, + request, "Received a submission with POST.keys()", metric_tags, domain, app_id, user_id, authenticated, meta, ) @@ -133,8 +154,9 @@ def _process_form(request, domain, app_id, user_id, authenticated, try: result = submission_post.run() except XFormLockError as err: + xform_locked_error_counter.tag(domain=domain, authenticated=authenticated).inc() return _submission_error( - request, "XFormLockError: %s" % err, XFORM_LOCKED_COUNT, + request, "XFormLockError: %s" % err, metric_tags, domain, app_id, user_id, authenticated, status=423, notify=False, ) @@ -145,7 +167,7 @@ def _process_form(request, domain, app_id, user_id, authenticated, return response -def _submission_error(request, message, count_metric, metric_tags, +def _submission_error(request, message, metric_tags, domain, app_id, user_id, authenticated, meta=None, status=400, notify=True): """Notify exception, datadog count, record metrics, construct response @@ -157,7 +179,6 @@ def _submission_error(request, message, count_metric, metric_tags, "domain:{}".format(domain), "authenticated:{}".format(authenticated), ] - datadog_counter(count_metric, tags=details) if notify: details.extend([ "user_id:{}".format(user_id), @@ -172,24 +193,20 @@ def _submission_error(request, message, count_metric, metric_tags, def _record_metrics(tags, submission_type, response, timer=None, xform=None): - if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on: - lag = xform.received_on - xform.metadata.timeEnd - lag_days = lag.total_seconds() / 86400 - tags += [ - 'lag:%s' % bucket_value(lag_days, (1, 2, 4, 7, 14, 31, 90), 'd') - ] - tags += [ 'submission_type:{}'.format(submission_type), 'status_code:{}'.format(response.status_code) ] + if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on: + lag = xform.received_on - xform.metadata.timeEnd + lag_days = lag.total_seconds() / 86400 + submission_lag_histogram.tag(**tags).observe(lag_days) + if timer: - tags += 
[ - 'duration:%s' % bucket_value(timer.duration, (1, 5, 20, 60, 120, 300, 600), 's'), - ] + submission_duration_histogram.tag(**tags).observe(timer.duration) - datadog_counter('commcare.xform_submissions.count', tags=tags) + submission_counter.tag(**tags).inc() @location_safe diff --git a/corehq/util/datadog/metrics.py b/corehq/util/datadog/metrics.py index 545be07a5a85..d56dea5154db 100644 --- a/corehq/util/datadog/metrics.py +++ b/corehq/util/datadog/metrics.py @@ -2,6 +2,4 @@ ERROR_COUNT = 'commcare.error.count' REPEATER_ERROR_COUNT = 'commcare.repeaters.error' REPEATER_SUCCESS_COUNT = 'commcare.repeaters.success' -MULTIMEDIA_SUBMISSION_ERROR_COUNT = 'commcare.corrupt-multimedia-submission.error.count' DATE_OPENED_CASEBLOCK_ERROR_COUNT = 'commcare.date-opened-caseblock-bug.error.count' -XFORM_LOCKED_COUNT = 'commcare.xformlocked.count' From eba69c4f8937352143272b8d27eda4064bba91cd Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 11 Mar 2020 17:29:04 +0200 Subject: [PATCH 09/40] docstrings --- corehq/util/metrics/metrics.py | 2 ++ corehq/util/metrics/prometheus.py | 1 + 2 files changed, 3 insertions(+) diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index c4038abaf6ed..65ea9def39b1 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -134,6 +134,8 @@ def enabled(self) -> bool: class DelegatedMetrics: + """This class makes the metric class instantiation lazy and + also multiple metrics providers to be used.""" def __init__(self, delegates): self.delegates = delegates self._types = { diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py index c98146f3886d..30d4604d0fee 100644 --- a/corehq/util/metrics/prometheus.py +++ b/corehq/util/metrics/prometheus.py @@ -50,6 +50,7 @@ class Histogram(PrometheusMetricBase, HqHistogram): https://prometheus.io/docs/concepts/metric_types/#histogram """ def _init_metric(self): + """Overriding this so that we can pass in the buckets to the Prometheus class""" HqHistogram._init_metric(self) # skip _init_metric on PrometheusMetricBase self.name = self.name.replace('.', '_') self._delegate = self._kwargs.get('delegate') From 01fbc076c50d8a67299340e290d2c5039102ea5a Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 11 Mar 2020 17:30:30 +0200 Subject: [PATCH 10/40] stickler --- corehq/util/metrics/__init__.py | 2 +- corehq/util/metrics/metrics.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 6789a3130bfc..1e6a6053ea49 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -1,7 +1,7 @@ from django.utils.functional import SimpleLazyObject from corehq.util.metrics.datadog import DatadogMetrics -from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, HqMetrics +from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics from corehq.util.metrics.prometheus import PrometheusMetrics diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 65ea9def39b1..006971c30e9e 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -108,15 +108,15 @@ class HqMetrics(metaclass=abc.ABCMeta): def enabled(self) -> bool: raise NotImplementedError - def counter(self, name: str, documentation: str, tag_names: Iterable=tuple()) -> HqCounter: + def counter(self, name: str, documentation: str, tag_names: Iterable = tuple()) -> HqCounter: return self._counter_class(name, documentation, 
tag_names) - def gauge(self, name: str, documentation: str, tag_names: Iterable=tuple()) -> HqGauge: + def gauge(self, name: str, documentation: str, tag_names: Iterable = tuple()) -> HqGauge: return self._gauge_class(name, documentation, tag_names) def histogram(self, name: str, documentation: str, bucket_tag: str, buckets: List[int] = DEFAULT_BUCKETS, bucket_unit: str = '', - tag_names: Iterable=tuple()) -> HqHistogram: + tag_names: Iterable = tuple()) -> HqHistogram: """Create a histogram metric. Histogram implementations differ between provider. See provider implementations for details. """ From 450f211179d41bc243c02470b269c0bdfdd3203f Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 12 Mar 2020 11:17:25 +0200 Subject: [PATCH 11/40] keep tag_values as dict instead of splitting and re-combining --- corehq/util/metrics/datadog.py | 12 ++++++------ corehq/util/metrics/metrics.py | 15 ++++++++------- corehq/util/metrics/prometheus.py | 4 ++-- corehq/util/metrics/tests/test_metrics.py | 2 +- corehq/util/metrics/tests/utils.py | 8 ++------ 5 files changed, 19 insertions(+), 22 deletions(-) diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index e13d5c3d9a1a..574cf8de58bf 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -19,12 +19,12 @@ statsd = DogStatsd(constant_tags=COMMON_TAGS) -def _format_tags(tag_names, tag_values): - if not tag_names or not tag_values: +def _format_tags(tag_values: dict): + if not tag_values: return None return [ - f'{name}:{value}' for name, value in zip(tag_names, tag_values) + f'{name}:{value}' for name, value in tag_values.items() ] @@ -37,13 +37,13 @@ def _datadog_record(fn, name, value, tags=None): class Counter(HqCounter): def _record(self, amount: float): - tags = _format_tags(self.tag_names, self.tag_values) + tags = _format_tags(self.tag_values) _datadog_record(statsd.increment, self.name, amount, tags) class Gauge(HqGauge): def _record(self, value): - tags = _format_tags(self.tag_names, self.tag_values) + tags = _format_tags(self.tag_values) _datadog_record(statsd.gauge, self.name, value, tags) @@ -68,7 +68,7 @@ class Histogram(HqHistogram): More details: https://help.datadoghq.com/hc/en-us/articles/211545826 """ def _record(self, value: float): - tags = _format_tags(self.tag_names, self.tag_values) + tags = _format_tags(self.tag_values) if not tags: tags = [] bucket = bucket_value(value, self._buckets, self._bucket_unit) diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 006971c30e9e..d1bf13680b7f 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -21,7 +21,7 @@ def _enforce_prefix(name, prefix): def _validate_tag_names(tag_names): - tag_names = tuple(tag_names) + tag_names = set(tag_names) for l in tag_names: if not METRIC_TAG_NAME_RE.match(l): raise ValueError('Invalid metric tag name: ' + l) @@ -41,6 +41,10 @@ def __init__(self, name: str, documentation: str, tag_names: Iterable = tuple(), self.documentation = documentation self.tag_names = _validate_tag_names(tag_names) self.tag_values = kwargs.pop('tag_values', None) + if self.tag_values: + assert isinstance(self.tag_values, dict) + if self.tag_values.keys() != self.tag_names: + raise ValueError('Incorrect tag names') self._kwargs = kwargs self._init_metric() @@ -48,13 +52,10 @@ def _init_metric(self): pass def tag(self, **tag_kwargs): - if sorted(tag_kwargs) != sorted(self.tag_names): - raise ValueError('Incorrect tag names') + tag_kwargs = {t: str(v) for t, v in 
tag_kwargs.items()} + return self._get_tagged_instance(tag_kwargs) - tag_values = tuple(str(tag_kwargs[t]) for t in self.tag_names) - return self._get_tagged_instance(tag_values) - - def _get_tagged_instance(self, tag_values): + def _get_tagged_instance(self, tag_values: dict): return self.__class__( self.name, self.documentation, tag_names=self.tag_names, tag_values=tag_values, **self._kwargs ) diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py index 30d4604d0fee..604b02ae7aaf 100644 --- a/corehq/util/metrics/prometheus.py +++ b/corehq/util/metrics/prometheus.py @@ -20,8 +20,8 @@ def _init_metric(self): delegate = self._kwargs.get('delegate') self._delegate = delegate or self._metric_class(self.name, self.documentation, self.tag_names) - def _get_tagged_instance(self, tag_values): - delegate = self._delegate.labels(**dict(zip(self.tag_names, tag_values))) + def _get_tagged_instance(self, tag_values: dict): + delegate = self._delegate.labels(**tag_values) return self.__class__( self.name, self.documentation, tag_names=self.tag_names, tag_values=tag_values, delegate=delegate, **self._kwargs diff --git a/corehq/util/metrics/tests/test_metrics.py b/corehq/util/metrics/tests/test_metrics.py index c6d721921780..f77d7e929e9f 100644 --- a/corehq/util/metrics/tests/test_metrics.py +++ b/corehq/util/metrics/tests/test_metrics.py @@ -122,7 +122,7 @@ def assertHistogramMetric(self, metric, expected): bucket = metric._buckets[-1] prefix = 'over' bucket_tag = (metric._bucket_tag, f'{prefix}_{bucket:03d}{metric._bucket_unit}') - expected_samples[tags + (bucket_tag,)] = val + expected_samples[tuple(sorted(tags + (bucket_tag,)))] = val actual = { key[1]: sum(val) for key, val in self.recorded_metrics.items() diff --git a/corehq/util/metrics/tests/utils.py b/corehq/util/metrics/tests/utils.py index 9c7f94b2d104..bbd9119e3113 100644 --- a/corehq/util/metrics/tests/utils.py +++ b/corehq/util/metrics/tests/utils.py @@ -9,14 +9,10 @@ def patch_datadog(): def record(fn, name, value, tags=None): key = tuple([ name, - tuple([tuple(t.split(':')) for t in (tags or [])]), + tuple(sorted([tuple(t.split(':')) for t in (tags or [])])), ]) stats[key].append(value) stats = defaultdict(list) - patch = mock.patch("corehq.util.metrics.datadog._datadog_record", new=record) - patch.start() - try: + with mock.patch("corehq.util.metrics.datadog._datadog_record", new=record): yield stats - finally: - patch.stop() From 2c3edefb732f894c64c891cd960bc42252f5a30f Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 12 Mar 2020 13:55:57 +0200 Subject: [PATCH 12/40] update links --- corehq/util/metrics/datadog.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index 574cf8de58bf..e463cccfcd53 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -65,7 +65,9 @@ class Histogram(HqHistogram): # resulting Datadog metric # commcare.request.duration:1|c|#duration:lt_2ms - More details: https://help.datadoghq.com/hc/en-us/articles/211545826 + For more details see: + * https://github.com/dimagi/commcare-hq/pull/17080 + * https://github.com/dimagi/commcare-hq/pull/17030#issuecomment-315794700 """ def _record(self, value: float): tags = _format_tags(self.tag_values) From 8664161faf69621076752cfb29fb1e07441441c2 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 12 Mar 2020 13:56:59 +0200 Subject: [PATCH 13/40] remove unnecessary list Co-Authored-By: Daniel Miller --- corehq/util/metrics/metrics.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index d1bf13680b7f..6520d6f5b6cd 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -92,7 +92,7 @@ def _init_metric(self): self._bucket_unit = self._kwargs.get('bucket_unit', '') self._bucket_tag = self._kwargs.get('bucket_tag') if self._bucket_tag in self.tag_names: - self.tag_names = tuple([name for name in self.tag_names if name != self._bucket_tag]) + self.tag_names = tuple(name for name in self.tag_names if name != self._bucket_tag) def observe(self, value: float): """Update histogram with the given value.""" From 0bcedea6a39896985f37058701f8344367523904 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 12 Mar 2020 13:58:34 +0200 Subject: [PATCH 14/40] replace tuple() with () --- corehq/util/metrics/metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index d1bf13680b7f..cef24f31cb87 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -33,7 +33,7 @@ def _validate_tag_names(tag_names): class MetricBase: - def __init__(self, name: str, documentation: str, tag_names: Iterable = tuple(), **kwargs): + def __init__(self, name: str, documentation: str, tag_names: Iterable = (), **kwargs): self.name = name if not METRIC_NAME_RE.match(name): raise ValueError('Invalid metric name: ' + name) @@ -109,15 +109,15 @@ class HqMetrics(metaclass=abc.ABCMeta): def enabled(self) -> bool: raise NotImplementedError - def counter(self, name: str, documentation: str, tag_names: Iterable = tuple()) -> HqCounter: + def counter(self, name: str, documentation: str, tag_names: Iterable = ()) -> HqCounter: return self._counter_class(name, documentation, tag_names) - def gauge(self, name: str, documentation: str, tag_names: Iterable = tuple()) -> HqGauge: + def gauge(self, name: str, documentation: str, tag_names: Iterable = ()) -> HqGauge: return self._gauge_class(name, documentation, tag_names) def histogram(self, name: str, documentation: str, bucket_tag: str, buckets: List[int] = DEFAULT_BUCKETS, bucket_unit: str = '', - tag_names: Iterable = tuple()) -> HqHistogram: + tag_names: Iterable = ()) -> HqHistogram: """Create a histogram metric. Histogram implementations differ between provider. See provider implementations for details.
""" From 904e358edeebc5078507700554157cdc933276d4 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 12 Mar 2020 15:34:36 +0200 Subject: [PATCH 15/40] fix tags --- corehq/apps/receiverwrapper/views.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/corehq/apps/receiverwrapper/views.py b/corehq/apps/receiverwrapper/views.py index 6f27ed881bfe..22e53df3c76d 100644 --- a/corehq/apps/receiverwrapper/views.py +++ b/corehq/apps/receiverwrapper/views.py @@ -193,10 +193,10 @@ def _submission_error(request, message, metric_tags, def _record_metrics(tags, submission_type, response, timer=None, xform=None): - tags += [ - 'submission_type:{}'.format(submission_type), - 'status_code:{}'.format(response.status_code) - ] + tags.update({ + 'submission_type': submission_type, + 'status_code': response.status_code + }) if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on: lag = xform.received_on - xform.metadata.timeEnd From d68e104a2df754e02b3b3154382f34489e03c5de Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 12 Mar 2020 15:34:42 +0200 Subject: [PATCH 16/40] pass other args --- corehq/util/metrics/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 4fdf53446923..3d37368e4a40 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -161,7 +161,7 @@ def __init__(self, delegates, record_fn_name): self._record_fn_name = record_fn_name def tag(self, *args, **kwargs): - return self.__class__([d.tag(*args, **kwargs) for d in self._delegates]) + return self.__class__([d.tag(*args, **kwargs) for d in self._delegates], self._record_fn_name) def __getattr__(self, item): if item == self._record_fn_name: From 5c000528e4c114005cb25faa7e3fbc288722a8c7 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 12 Mar 2020 16:53:42 +0200 Subject: [PATCH 17/40] revert change to datadog bucketing boundry --- corehq/util/datadog/utils.py | 2 +- corehq/util/metrics/tests/test_metrics.py | 63 +++++++++++++---------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/corehq/util/datadog/utils.py b/corehq/util/datadog/utils.py index c61a4d099cb1..9d495679260f 100644 --- a/corehq/util/datadog/utils.py +++ b/corehq/util/datadog/utils.py @@ -115,7 +115,7 @@ def bucket_value(value, buckets, unit=''): lt_template = "lt_{:0%s}{}" % number_length over_template = "over_{:0%s}{}" % number_length for bucket in buckets: - if value <= bucket: + if value < bucket: return lt_template.format(bucket, unit) return over_template.format(buckets[-1], unit) diff --git a/corehq/util/metrics/tests/test_metrics.py b/corehq/util/metrics/tests/test_metrics.py index f77d7e929e9f..ea6b878f6e8b 100644 --- a/corehq/util/metrics/tests/test_metrics.py +++ b/corehq/util/metrics/tests/test_metrics.py @@ -44,24 +44,6 @@ def test_gauge(self): (('t1', 'c'), ('t2', 'b')): 5, }) - def test_histogram(self): - histogram = self.provider.histogram( - 'commcare.test.histogram', 'Description', 'duration', - buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] - ) - tagged_1 = histogram.tag(t1='a', t2='b') - tagged_1.observe(0.2) - tagged_1.observe(0.7) - tagged_1.observe(2.5) - - tagged_2 = histogram.tag(t1='c', t2='b') - tagged_2.observe(2) - tagged_2.observe(5) - self.assertHistogramMetric(histogram, { - (('t1', 'a'), ('t2', 'b')): {1: 2, 3: 1}, - (('t1', 'c'), ('t2', 'b')): {2: 1, INF: 1} - }) - def assertCounterMetric(self, metric: HqCounter, expected: Dict[Tuple[Tuple[str, 
str], ...], float]): """ :param metric: metric class @@ -76,15 +58,6 @@ def assertGaugeMetric(self, metric: HqGauge, expected: Dict[Tuple[Tuple[str, str """ raise NotImplementedError - def assertHistogramMetric(self, - metric: HqHistogram, - expected: Dict[Tuple[Tuple[str, str], ...], Dict[float, int]]): - """ - :param metric: metric class - :param expected: dict mapping tag tuples to a dict of bucket values - """ - raise NotImplementedError - class TestDatadogMetrics(_TestMetrics): provider_class = DatadogMetrics @@ -98,6 +71,24 @@ def tearDown(self) -> None: self.patch.__exit__(None, None, None) super().tearDown() + def test_histogram(self): + histogram = self.provider.histogram( + 'commcare.test.histogram', 'Description', 'duration', + buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] + ) + tagged_1 = histogram.tag(t1='a', t2='b') + tagged_1.observe(0.2) + tagged_1.observe(0.7) + tagged_1.observe(2.5) + + tagged_2 = histogram.tag(t1='c', t2='b') + tagged_2.observe(2) + tagged_2.observe(5) + self.assertHistogramMetric(histogram, { + (('t1', 'a'), ('t2', 'b')): {1: 2, 3: 1}, + (('t1', 'c'), ('t2', 'b')): {3: 1, INF: 1} + }) + def assertCounterMetric(self, metric, expected): self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) actual = { @@ -133,6 +124,24 @@ def assertHistogramMetric(self, metric, expected): class TestPrometheusMetrics(_TestMetrics): provider_class = PrometheusMetrics + def test_histogram(self): + histogram = self.provider.histogram( + 'commcare.test.histogram', 'Description', 'duration', + buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] + ) + tagged_1 = histogram.tag(t1='a', t2='b') + tagged_1.observe(0.2) + tagged_1.observe(0.7) + tagged_1.observe(2.5) + + tagged_2 = histogram.tag(t1='c', t2='b') + tagged_2.observe(2) + tagged_2.observe(5) + self.assertHistogramMetric(histogram, { + (('t1', 'a'), ('t2', 'b')): {1: 2, 3: 1}, + (('t1', 'c'), ('t2', 'b')): {2: 1, INF: 1} + }) + def _samples_to_dics(self, samples, filter_name=None): """Convert a Sample tuple into a dict((name, (labels tuple)) -> value)""" return { From e00b7e176bb3c296f003d49d7e39354e5f3b1390 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Mon, 16 Mar 2020 13:40:39 +0200 Subject: [PATCH 18/40] remove unnecessary list --- corehq/util/metrics/tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corehq/util/metrics/tests/utils.py b/corehq/util/metrics/tests/utils.py index bbd9119e3113..3c012c0ff9d5 100644 --- a/corehq/util/metrics/tests/utils.py +++ b/corehq/util/metrics/tests/utils.py @@ -9,7 +9,7 @@ def patch_datadog(): def record(fn, name, value, tags=None): key = tuple([ name, - tuple(sorted([tuple(t.split(':')) for t in (tags or [])])), + tuple(sorted(tuple(t.split(':')) for t in (tags or []))), ]) stats[key].append(value) From 20e8a6172ebbef0560bdfd607041b25313a2bbcf Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Mon, 16 Mar 2020 13:44:35 +0200 Subject: [PATCH 19/40] apply tags at the same time as recording the metric This change means we don't have to instantiate a new class when applying the tags which simplifies the class structure a lot. 
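For example (an illustrative sketch based on the call sites changed below; corrupt_multimedia_counter is one of the module-level metrics in corehq/apps/receiverwrapper/views.py):

    # before: build a tagged instance, then record the value
    corrupt_multimedia_counter.tag(domain=domain, authenticated=authenticated).inc()

    # after: pass the tags as keyword arguments at record time
    corrupt_multimedia_counter.inc(domain=domain, authenticated=authenticated)

No intermediate tagged instance is created, so the metric classes no longer need to carry tag state.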
--- corehq/apps/api/odata/utils.py | 9 ++- corehq/apps/receiverwrapper/views.py | 10 ++-- corehq/util/metrics/datadog.py | 12 ++-- corehq/util/metrics/metrics.py | 68 ++++++++++------------- corehq/util/metrics/prometheus.py | 51 +++++++---------- corehq/util/metrics/tests/test_metrics.py | 42 +++++++------- 6 files changed, 83 insertions(+), 109 deletions(-) diff --git a/corehq/apps/api/odata/utils.py b/corehq/apps/api/odata/utils.py index 6992808fa2e4..7628a03eb8c7 100644 --- a/corehq/apps/api/odata/utils.py +++ b/corehq/apps/api/odata/utils.py @@ -2,8 +2,6 @@ from collections import namedtuple from corehq.apps.export.models import ExportInstance -from corehq.util.datadog.gauges import datadog_counter -from corehq.util.datadog.utils import bucket_value from corehq.util.metrics import metrics FieldMetadata = namedtuple('FieldMetadata', ['name', 'odata_type']) @@ -72,12 +70,13 @@ def record_feed_access_in_datadog(request, config_id, duration, response): column_count = len(rows[0]) except IndexError: column_count = 0 - odata_feed_access_histogram.tag( + odata_feed_access_histogram.observe( + duration, domain=request.domain, feed_id=config_id, feed_type=config.type, username=username, row_count=row_count, column_count=column_count, - size=len(response.content), - ).observe(duration) + size=len(response.content) + ) diff --git a/corehq/apps/receiverwrapper/views.py b/corehq/apps/receiverwrapper/views.py index 22e53df3c76d..16d28bf5a38b 100644 --- a/corehq/apps/receiverwrapper/views.py +++ b/corehq/apps/receiverwrapper/views.py @@ -106,7 +106,7 @@ def _process_form(request, domain, app_id, user_id, authenticated, except: meta = {} - corrupt_multimedia_counter.tag(domain=domain, authenticated=authenticated).inc() + corrupt_multimedia_counter.inc(domain=domain, authenticated=authenticated) return _submission_error( request, "Received a submission with POST.keys()", metric_tags, domain, app_id, user_id, authenticated, meta, @@ -154,7 +154,7 @@ def _process_form(request, domain, app_id, user_id, authenticated, try: result = submission_post.run() except XFormLockError as err: - xform_locked_error_counter.tag(domain=domain, authenticated=authenticated).inc() + xform_locked_error_counter.inc(domain=domain, authenticated=authenticated) return _submission_error( request, "XFormLockError: %s" % err, metric_tags, domain, app_id, user_id, authenticated, status=423, @@ -201,12 +201,12 @@ def _record_metrics(tags, submission_type, response, timer=None, xform=None): if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on: lag = xform.received_on - xform.metadata.timeEnd lag_days = lag.total_seconds() / 86400 - submission_lag_histogram.tag(**tags).observe(lag_days) + submission_lag_histogram.observe(lag_days, **tags) if timer: - submission_duration_histogram.tag(**tags).observe(timer.duration) + submission_duration_histogram.observe(timer.duration, **tags) - submission_counter.tag(**tags).inc() + submission_counter.inc(**tags) @location_safe diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index e463cccfcd53..0163e5696924 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -36,14 +36,14 @@ def _datadog_record(fn, name, value, tags=None): class Counter(HqCounter): - def _record(self, amount: float): - tags = _format_tags(self.tag_values) + def _record(self, amount: float, tag_values: dict): + tags = _format_tags(tag_values) _datadog_record(statsd.increment, self.name, amount, tags) class Gauge(HqGauge): - def _record(self, value): 
- tags = _format_tags(self.tag_values) + def _record(self, value: float, tag_values: dict): + tags = _format_tags(tag_values) _datadog_record(statsd.gauge, self.name, value, tags) @@ -69,8 +69,8 @@ class Histogram(HqHistogram): * https://github.com/dimagi/commcare-hq/pull/17080 * https://github.com/dimagi/commcare-hq/pull/17030#issuecomment-315794700 """ - def _record(self, value: float): - tags = _format_tags(self.tag_values) + def _record(self, value: float, tag_values: dict): + tags = _format_tags(tag_values) if not tags: tags = [] bucket = bucket_value(value, self._buckets, self._bucket_unit) diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 3d37368e4a40..f4a699145ace 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -33,53 +33,43 @@ def _validate_tag_names(tag_names): class MetricBase: - def __init__(self, name: str, documentation: str, tag_names: Iterable = (), **kwargs): + def __init__(self, name: str, documentation: str, tag_names: Iterable = ()): self.name = name if not METRIC_NAME_RE.match(name): raise ValueError('Invalid metric name: ' + name) _enforce_prefix(name, 'commcare') self.documentation = documentation self.tag_names = _validate_tag_names(tag_names) - self.tag_values = kwargs.pop('tag_values', None) - if self.tag_values: - assert isinstance(self.tag_values, dict) - if self.tag_values.keys() != self.tag_names: - raise ValueError('Incorrect tag names') - self._kwargs = kwargs self._init_metric() def _init_metric(self): pass - def tag(self, **tag_kwargs): - tag_kwargs = {t: str(v) for t, v in tag_kwargs.items()} - return self._get_tagged_instance(tag_kwargs) - - def _get_tagged_instance(self, tag_values: dict): - return self.__class__( - self.name, self.documentation, tag_names=self.tag_names, tag_values=tag_values, **self._kwargs - ) - - def _validate_tags(self): - if self.tag_names and not self.tag_values: + def _validate_tags(self, tag_values: dict): + if self.tag_names and not tag_values: raise Exception('Metric has missing tag values.') - def _record(self, value: float): + if tag_values: + assert isinstance(tag_values, dict) + if tag_values.keys() != self.tag_names: + raise ValueError('Incorrect tag names') + + def _record(self, value: float, tags: dict): raise NotImplementedError class HqCounter(MetricBase): - def inc(self, amount: float = 1): + def inc(self, amount: float = 1, **tags): """Increment the counter by the given amount.""" - self._validate_tags() - self._record(amount) + self._validate_tags(tags) + self._record(amount, tags) class HqGauge(MetricBase): - def set(self, value: float): + def set(self, value: float, **tags): """Set gauge to the given value.""" - self._validate_tags() - self._record(value) + self._validate_tags(tags) + self._record(value, tags) DEFAULT_BUCKETS = (.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0, INF) @@ -87,17 +77,20 @@ def set(self, value: float): class HqHistogram(MetricBase): - def _init_metric(self): - self._buckets = self._kwargs.get('buckets') or DEFAULT_BUCKETS - self._bucket_unit = self._kwargs.get('bucket_unit', '') - self._bucket_tag = self._kwargs.get('bucket_tag') - if self._bucket_tag in self.tag_names: - self.tag_names = tuple(name for name in self.tag_names if name != self._bucket_tag) - - def observe(self, value: float): + def __init__(self, name: str, documentation: str, + bucket_tag: str, buckets: List[int] = DEFAULT_BUCKETS, bucket_unit: str = '', + tag_names: Iterable = ()): + self._bucket_tag = bucket_tag + 
self._buckets = buckets + self._bucket_unit = bucket_unit + if self._bucket_tag in tag_names: + tag_names = tuple(name for name in tag_names if name != bucket_tag) + super().__init__(name, documentation, tag_names) + + def observe(self, value: float, **tags): """Update histogram with the given value.""" - self._validate_tags() - self._record(value) + self._validate_tags(tags) + self._record(value, tags) class HqMetrics(metaclass=abc.ABCMeta): @@ -122,7 +115,7 @@ def histogram(self, name: str, documentation: str, implementations for details. """ return self._histogram_class( - name, documentation, tag_names, bucket_tag=bucket_tag, buckets=buckets, bucket_unit=bucket_unit + name, documentation, bucket_tag, buckets=buckets, bucket_unit=bucket_unit, tag_names=tag_names ) @@ -160,9 +153,6 @@ def __init__(self, delegates, record_fn_name): self._delegates = delegates self._record_fn_name = record_fn_name - def tag(self, *args, **kwargs): - return self.__class__([d.tag(*args, **kwargs) for d in self._delegates], self._record_fn_name) - def __getattr__(self, item): if item == self._record_fn_name: def record(*args, **kwargs): diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py index 604b02ae7aaf..cce9a0c671ed 100644 --- a/corehq/util/metrics/prometheus.py +++ b/corehq/util/metrics/prometheus.py @@ -11,54 +11,43 @@ from prometheus_client import Histogram as PHistogram -class PrometheusMetricBase(MetricBase): - _metric_class = None +class Counter(HqCounter): + """https://prometheus.io/docs/concepts/metric_types/#counter""" def _init_metric(self): - super()._init_metric() self.name = self.name.replace('.', '_') - delegate = self._kwargs.get('delegate') - self._delegate = delegate or self._metric_class(self.name, self.documentation, self.tag_names) - - def _get_tagged_instance(self, tag_values: dict): - delegate = self._delegate.labels(**tag_values) - return self.__class__( - self.name, self.documentation, - tag_names=self.tag_names, tag_values=tag_values, delegate=delegate, **self._kwargs - ) - - -class Counter(PrometheusMetricBase, HqCounter): - """https://prometheus.io/docs/concepts/metric_types/#counter""" - _metric_class = PCounter + self._delegate = PCounter(self.name, self.documentation, self.tag_names) - def _record(self, amount: float): - self._delegate.inc(amount) + def _record(self, amount: float, tags): + _get_labeled(self._delegate, tags).inc(amount) -class Gauge(PrometheusMetricBase, HqGauge): +class Gauge(HqGauge): """https://prometheus.io/docs/concepts/metric_types/#gauge""" - _metric_class = PGauge - def _record(self, value: float): - self._delegate.set(value) + def _init_metric(self): + self.name = self.name.replace('.', '_') + self._delegate = PGauge(self.name, self.documentation, self.tag_names) + + def _record(self, value: float, tags): + _get_labeled(self._delegate, tags).set(value) -class Histogram(PrometheusMetricBase, HqHistogram): +class Histogram(HqHistogram): """This metric class implements the native Prometheus Histogram type https://prometheus.io/docs/concepts/metric_types/#histogram """ def _init_metric(self): - """Overriding this so that we can pass in the buckets to the Prometheus class""" - HqHistogram._init_metric(self) # skip _init_metric on PrometheusMetricBase self.name = self.name.replace('.', '_') - self._delegate = self._kwargs.get('delegate') - if not self._delegate: - self._delegate = PHistogram(self.name, self.documentation, self.tag_names, buckets=self._buckets) + self._delegate = PHistogram(self.name, self.documentation, 
self.tag_names, buckets=self._buckets) + + def _record(self, value: float, tags: dict): + _get_labeled(self._delegate, tags).observe(value) + - def _record(self, value: float): - self._delegate.observe(value) +def _get_labeled(metric, labels): + return metric.labels(**labels) if labels else metric class PrometheusMetrics(HqMetrics): diff --git a/corehq/util/metrics/tests/test_metrics.py b/corehq/util/metrics/tests/test_metrics.py index ea6b878f6e8b..bb9c8232a00b 100644 --- a/corehq/util/metrics/tests/test_metrics.py +++ b/corehq/util/metrics/tests/test_metrics.py @@ -26,9 +26,9 @@ def setUpClass(cls): def test_counter(self): counter = self.provider.counter('commcare.test.counter', 'Description', tag_names=['t1', 't2']) - counter.tag(t1='a', t2='b').inc() - counter.tag(t1='c', t2='b').inc(2) - counter.tag(t1='c', t2='b').inc() + counter.inc(t1='a', t2='b') + counter.inc(2, t1='c', t2='b') + counter.inc(t1='c', t2='b') self.assertCounterMetric(counter, { (('t1', 'a'), ('t2', 'b')): 1, (('t1', 'c'), ('t2', 'b')): 3, @@ -36,9 +36,9 @@ def test_counter(self): def test_gauge(self): gauge = self.provider.gauge('commcare.test.gauge', 'Description', tag_names=['t1', 't2']) - gauge.tag(t1='a', t2='b').set(4.2) - gauge.tag(t1='c', t2='b').set(2) - gauge.tag(t1='c', t2='b').set(5) + gauge.set(4.2, t1='a', t2='b') + gauge.set(2, t1='c', t2='b') + gauge.set(5, t1='c', t2='b') self.assertGaugeMetric(gauge, { (('t1', 'a'), ('t2', 'b')): 4.2, (('t1', 'c'), ('t2', 'b')): 5, @@ -76,14 +76,12 @@ def test_histogram(self): 'commcare.test.histogram', 'Description', 'duration', buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] ) - tagged_1 = histogram.tag(t1='a', t2='b') - tagged_1.observe(0.2) - tagged_1.observe(0.7) - tagged_1.observe(2.5) - - tagged_2 = histogram.tag(t1='c', t2='b') - tagged_2.observe(2) - tagged_2.observe(5) + histogram.observe(0.2, t1='a', t2='b') + histogram.observe(0.7, t1='a', t2='b') + histogram.observe(2.5, t1='a', t2='b') + + histogram.observe(2, t1='c', t2='b') + histogram.observe(5, t1='c', t2='b') self.assertHistogramMetric(histogram, { (('t1', 'a'), ('t2', 'b')): {1: 2, 3: 1}, (('t1', 'c'), ('t2', 'b')): {3: 1, INF: 1} @@ -129,14 +127,12 @@ def test_histogram(self): 'commcare.test.histogram', 'Description', 'duration', buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] ) - tagged_1 = histogram.tag(t1='a', t2='b') - tagged_1.observe(0.2) - tagged_1.observe(0.7) - tagged_1.observe(2.5) - - tagged_2 = histogram.tag(t1='c', t2='b') - tagged_2.observe(2) - tagged_2.observe(5) + histogram.observe(0.2, t1='a', t2='b') + histogram.observe(0.7, t1='a', t2='b') + histogram.observe(2.5, t1='a', t2='b') + + histogram.observe(2, t1='c', t2='b') + histogram.observe(5, t1='c', t2='b') self.assertHistogramMetric(histogram, { (('t1', 'a'), ('t2', 'b')): {1: 2, 3: 1}, (('t1', 'c'), ('t2', 'b')): {2: 1, INF: 1} @@ -145,7 +141,7 @@ def test_histogram(self): def _samples_to_dics(self, samples, filter_name=None): """Convert a Sample tuple into a dict((name, (labels tuple)) -> value)""" return { - tuple(sample.labels.items()): sample.value + tuple(sorted(sample.labels.items())): sample.value for sample in samples if not filter_name or sample.name == filter_name } From 0ab0006ac2660650e5ca743eb669b5dc841a11e6 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Mon, 16 Mar 2020 15:04:49 +0200 Subject: [PATCH 20/40] dummy metric --- corehq/util/metrics/metrics.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/corehq/util/metrics/metrics.py 
b/corehq/util/metrics/metrics.py index f4a699145ace..5abb6811f308 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -119,9 +119,20 @@ def histogram(self, name: str, documentation: str, ) +class DummyMetric: + def __init__(self, *args, **kwargs): + pass + + def __getattr__(self, item): + if item in ('inc', 'set', 'observe'): + return lambda *args, **kwargs: None + raise AttributeError + + class DummyMetrics(HqMetrics): - _counter_class = HqCounter - _gauge_class = HqGauge + _counter_class = DummyMetric + _gauge_class = DummyMetric + _histogram_class = DummyMetric def enabled(self) -> bool: return True From 02b40eeee7fec129c219d3819744fb8097e0d213 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Mon, 16 Mar 2020 18:32:43 +0200 Subject: [PATCH 21/40] functional interface --- corehq/apps/api/odata/utils.py | 30 ++--- corehq/apps/receiverwrapper/views.py | 50 +++---- corehq/util/metrics/__init__.py | 38 ++++-- corehq/util/metrics/datadog.py | 106 +++++++-------- corehq/util/metrics/metrics.py | 151 +++++---------------- corehq/util/metrics/prometheus.py | 78 ++++------- corehq/util/metrics/tests/test_metrics.py | 155 ++++++++-------------- 7 files changed, 230 insertions(+), 378 deletions(-) diff --git a/corehq/apps/api/odata/utils.py b/corehq/apps/api/odata/utils.py index 7628a03eb8c7..cee2b6447997 100644 --- a/corehq/apps/api/odata/utils.py +++ b/corehq/apps/api/odata/utils.py @@ -2,7 +2,7 @@ from collections import namedtuple from corehq.apps.export.models import ExportInstance -from corehq.util.metrics import metrics +from corehq.util.metrics import metrics_histogram FieldMetadata = namedtuple('FieldMetadata', ['name', 'odata_type']) @@ -53,13 +53,6 @@ def _get_dot_path(export_column): return metadata -odata_feed_access_histogram = metrics.histogram( - 'commcare.odata_feed.test_v3', 'Odata Feed Access', - bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', - tag_names=['domain', 'feed_id', 'feed_type', 'username', 'row_count', 'column_count', 'size'] -) - - def record_feed_access_in_datadog(request, config_id, duration, response): config = ExportInstance.get(config_id) username = request.couch_user.username @@ -70,13 +63,16 @@ def record_feed_access_in_datadog(request, config_id, duration, response): column_count = len(rows[0]) except IndexError: column_count = 0 - odata_feed_access_histogram.observe( - duration, - domain=request.domain, - feed_id=config_id, - feed_type=config.type, - username=username, - row_count=row_count, - column_count=column_count, - size=len(response.content) + metrics_histogram( + 'commcare.odata_feed.test_v3', duration, + bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', + tags={ + 'domain': request.domain, + 'feed_id': config_id, + 'feed_type': config.type, + 'username': username, + 'row_count': row_count, + 'column_count': column_count, + 'size': len(response.content) + } ) diff --git a/corehq/apps/receiverwrapper/views.py b/corehq/apps/receiverwrapper/views.py index 16d28bf5a38b..00d00de4b8cb 100644 --- a/corehq/apps/receiverwrapper/views.py +++ b/corehq/apps/receiverwrapper/views.py @@ -10,7 +10,7 @@ import couchforms from casexml.apps.case.xform import get_case_updates, is_device_report -from corehq.util.metrics import metrics +from corehq.util.metrics import metrics, metrics_counter, metrics_histogram from couchforms import openrosa_response from couchforms.const import MAGIC_PROPERTY, BadRequest from couchforms.getters import MultimediaBug @@ -57,32 +57,6 
@@ PROFILE_LIMIT = os.getenv('COMMCARE_PROFILE_SUBMISSION_LIMIT') PROFILE_LIMIT = int(PROFILE_LIMIT) if PROFILE_LIMIT is not None else 1 -corrupt_multimedia_counter = metrics.counter( - 'commcare.corrupt_multimedia_submissions', 'Count of requests with corrupt multimedia', - tag_names=['domain', 'authenticated'] -) - -xform_locked_error_counter = metrics.counter( - 'commcare.xformlocked.count', 'Count of locking errors', - tag_names=['domain', 'authenticated'] -) - -submission_counter = metrics.counter( - 'commcare.xform_submissions.count', 'Count of form submissions', - tag_names=['domain', 'backend', 'submission_type', 'status_code'] -) - -submission_duration_histogram = metrics.histogram( - 'commcare.xform_submissions.duration.seconds', 'Count of form submissions', - bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', - tag_names=['domain', 'backend', 'submission_type', 'status_code'] -) - -submission_lag_histogram = metrics.histogram( - 'commcare.xform_submissions.lag.days', 'Count of form submissions', - bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d', - tag_names=['domain', 'backend', 'submission_type', 'status_code'] -) @profile_prod('commcare_receiverwapper_process_form.prof', probability=PROFILE_PROBABILITY, limit=PROFILE_LIMIT) def _process_form(request, domain, app_id, user_id, authenticated, @@ -106,7 +80,9 @@ def _process_form(request, domain, app_id, user_id, authenticated, except: meta = {} - corrupt_multimedia_counter.inc(domain=domain, authenticated=authenticated) + metrics_counter('commcare.corrupt_multimedia_submissions', tags={ + 'domain': domain, 'authenticated': authenticated + }) return _submission_error( request, "Received a submission with POST.keys()", metric_tags, domain, app_id, user_id, authenticated, meta, @@ -154,7 +130,9 @@ def _process_form(request, domain, app_id, user_id, authenticated, try: result = submission_post.run() except XFormLockError as err: - xform_locked_error_counter.inc(domain=domain, authenticated=authenticated) + metrics_counter('commcare.xformlocked.count', tags={ + 'domain': domain, 'authenticated': authenticated + }) return _submission_error( request, "XFormLockError: %s" % err, metric_tags, domain, app_id, user_id, authenticated, status=423, @@ -201,12 +179,20 @@ def _record_metrics(tags, submission_type, response, timer=None, xform=None): if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on: lag = xform.received_on - xform.metadata.timeEnd lag_days = lag.total_seconds() / 86400 - submission_lag_histogram.observe(lag_days, **tags) + metrics_histogram( + 'commcare.xform_submissions.lag.days', lag_days, + bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d', + tags=tags + ) if timer: - submission_duration_histogram.observe(timer.duration, **tags) + metrics_histogram( + 'commcare.xform_submissions.duration.seconds', timer.duration, + bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s', + tags=tags + ) - submission_counter.inc(**tags) + metrics_counter('commcare.xform_submissions.count', tags=tags) @location_safe diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 1e6a6053ea49..937a8f483efb 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -1,15 +1,37 @@ -from django.utils.functional import SimpleLazyObject +from typing import List, Iterable from corehq.util.metrics.datadog import DatadogMetrics -from corehq.util.metrics.metrics import DummyMetrics, 
DelegatedMetrics +from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, DEFAULT_BUCKETS from corehq.util.metrics.prometheus import PrometheusMetrics +_metrics = None -def _get_metrics(): - enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()])) - if not enabled: - return [DummyMetrics()] - return enabled +def _get_metrics_provider(): + global _metrics + if not _metrics: + enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()])) + if not enabled: + _metrics = DummyMetrics() + elif len(enabled) > 1: + _metrics = DelegatedMetrics(enabled) + else: + _metrics = enabled[0] + return _metrics -metrics = DelegatedMetrics(SimpleLazyObject(_get_metrics)) # singleton/global + +def metrics_counter(name: str, value: float = 1, tags: dict = None, documentation: str = ''): + provider = _get_metrics_provider() + provider.counter(name, value, tags, documentation) + + +def metrics_gauge(name: str, value: float, tags: dict = None, documentation: str = ''): + provider = _get_metrics_provider() + provider.gauge(name, value, tags, documentation) + + +def metrics_histogram(name: str, value: float, + bucket_tag: str, buckets: Iterable[int] = DEFAULT_BUCKETS, bucket_unit: str = '', + tags: dict = None, documentation: str = ''): + provider = _get_metrics_provider() + provider.histogram(name, value, bucket_tag, buckets, bucket_unit, tags, documentation) diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index 0163e5696924..a2808f72195f 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -1,14 +1,10 @@ import logging +from typing import List from django.conf import settings from corehq.util.datadog.utils import bucket_value -from corehq.util.metrics.metrics import ( - HqCounter, - HqGauge, - HqHistogram, - HqMetrics, -) +from corehq.util.metrics.metrics import HqMetrics from datadog import api from datadog.dogstatsd.base import DogStatsd @@ -19,6 +15,52 @@ statsd = DogStatsd(constant_tags=COMMON_TAGS) +class DatadogMetrics(HqMetrics): + def validate(self): + if not (api._api_key and api._application_key): + raise Exception("Datadog not configured. Set DATADOG_API_KEY and DATADOG_APP_KEY in settings.") + + def _counter(self, name: str, value: float, tags: dict = None, documentation: str = ''): + dd_tags = _format_tags(tags) + _datadog_record(statsd.increment, name, value, dd_tags) + + def _gauge(self, name: str, value: float, tags: dict = None, documentation: str = ''): + dd_tags = _format_tags(tags) + _datadog_record(statsd.gauge, name, value, dd_tags) + + def _histogram(self, name: str, value: float, + bucket_tag: str, buckets: List[int], bucket_unit: str = '', + tags: dict = None, documentation: str = ''): + """ + This implementation of histogram uses tagging to record the buckets. + It does not use the Datadog Histogram metric type. + + The metric itself will be incremented by 1 on each call to `observe`. The value + passed to `observe` will be used to create the bucket tag. 
+ + For example: + + h = Histogram( + 'commcare.request.duration', 'description', + bucket_tag='duration', buckets=[1,2,3], bucket_units='ms' + ) + h.observe(1.4) + + # resulting Datadog metric + # commcare.request.duration:1|c|#duration:lt_2ms + + For more details see: + * https://github.com/dimagi/commcare-hq/pull/17080 + * https://github.com/dimagi/commcare-hq/pull/17030#issuecomment-315794700 + """ + tags = _format_tags(tags) + if not tags: + tags = [] + bucket = bucket_value(value, buckets, bucket_unit) + tags.append(f'{bucket_tag}:{bucket}') + _datadog_record(statsd.increment, name, 1, tags) + + def _format_tags(tag_values: dict): if not tag_values: return None @@ -33,55 +75,3 @@ def _datadog_record(fn, name, value, tags=None): fn(name, value, tags=tags) except Exception: datadog_logger.exception('Unable to record Datadog stats') - - -class Counter(HqCounter): - def _record(self, amount: float, tag_values: dict): - tags = _format_tags(tag_values) - _datadog_record(statsd.increment, self.name, amount, tags) - - -class Gauge(HqGauge): - def _record(self, value: float, tag_values: dict): - tags = _format_tags(tag_values) - _datadog_record(statsd.gauge, self.name, value, tags) - - -class Histogram(HqHistogram): - """This implementation of histogram uses tagging to record the buckets. - It does not use the Datadog Histogram metric type. - - The metric itself will be incremented by 1 on each call to `observe`. The value - passed to `observe` will be used to create the bucket tag. - - For example: - - h = Histogram( - 'commcare.request.duration', 'description', - bucket_tag='duration', buckets=[1,2,3], bucket_units='ms' - ) - h.observe(1.4) - - # resulting Datadog metric - # commcare.request.duration:1|c|#duration:lt_2ms - - For more details see: - * https://github.com/dimagi/commcare-hq/pull/17080 - * https://github.com/dimagi/commcare-hq/pull/17030#issuecomment-315794700 - """ - def _record(self, value: float, tag_values: dict): - tags = _format_tags(tag_values) - if not tags: - tags = [] - bucket = bucket_value(value, self._buckets, self._bucket_unit) - tags.append(f'{self._bucket_tag}:{bucket}') - _datadog_record(statsd.increment, self.name, 1, tags) - - -class DatadogMetrics(HqMetrics): - _counter_class = Counter - _gauge_class = Gauge - _histogram_class = Histogram - - def enabled(self) -> bool: - return api._api_key and api._application_key diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 5abb6811f308..e407ce935e45 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -32,143 +32,64 @@ def _validate_tag_names(tag_names): return tag_names -class MetricBase: - def __init__(self, name: str, documentation: str, tag_names: Iterable = ()): - self.name = name - if not METRIC_NAME_RE.match(name): - raise ValueError('Invalid metric name: ' + name) - _enforce_prefix(name, 'commcare') - self.documentation = documentation - self.tag_names = _validate_tag_names(tag_names) - self._init_metric() - - def _init_metric(self): - pass - - def _validate_tags(self, tag_values: dict): - if self.tag_names and not tag_values: - raise Exception('Metric has missing tag values.') - - if tag_values: - assert isinstance(tag_values, dict) - if tag_values.keys() != self.tag_names: - raise ValueError('Incorrect tag names') - - def _record(self, value: float, tags: dict): - raise NotImplementedError - - -class HqCounter(MetricBase): - def inc(self, amount: float = 1, **tags): - """Increment the counter by the given amount.""" - self._validate_tags(tags) 
- self._record(amount, tags) - - -class HqGauge(MetricBase): - def set(self, value: float, **tags): - """Set gauge to the given value.""" - self._validate_tags(tags) - self._record(value, tags) - - DEFAULT_BUCKETS = (.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0, INF) -class HqHistogram(MetricBase): - - def __init__(self, name: str, documentation: str, - bucket_tag: str, buckets: List[int] = DEFAULT_BUCKETS, bucket_unit: str = '', - tag_names: Iterable = ()): - self._bucket_tag = bucket_tag - self._buckets = buckets - self._bucket_unit = bucket_unit - if self._bucket_tag in tag_names: - tag_names = tuple(name for name in tag_names if name != bucket_tag) - super().__init__(name, documentation, tag_names) - - def observe(self, value: float, **tags): - """Update histogram with the given value.""" - self._validate_tags(tags) - self._record(value, tags) - - class HqMetrics(metaclass=abc.ABCMeta): - _counter_class = None - _gauge_class = None - _histogram_class = None - - @abstractmethod - def enabled(self) -> bool: - raise NotImplementedError + def validate(self): + pass - def counter(self, name: str, documentation: str, tag_names: Iterable = ()) -> HqCounter: - return self._counter_class(name, documentation, tag_names) + def counter(self, name: str, value: float = 1, tags: dict = None, documentation: str = ''): + _enforce_prefix(name, 'commcare') + _validate_tag_names(tags) + self._counter(name, value, tags, documentation) - def gauge(self, name: str, documentation: str, tag_names: Iterable = ()) -> HqGauge: - return self._gauge_class(name, documentation, tag_names) + def gauge(self, name: str, value: float, tags: dict = None, documentation: str = ''): + _enforce_prefix(name, 'commcare') + _validate_tag_names(tags) + self._gauge(name, value, tags, documentation) - def histogram(self, name: str, documentation: str, + def histogram(self, name: str, value: float, bucket_tag: str, buckets: List[int] = DEFAULT_BUCKETS, bucket_unit: str = '', - tag_names: Iterable = ()) -> HqHistogram: + tags: dict = None, documentation: str = ''): """Create a histogram metric. Histogram implementations differ between provider. See provider implementations for details. 
""" - return self._histogram_class( - name, documentation, bucket_tag, buckets=buckets, bucket_unit=bucket_unit, tag_names=tag_names - ) + _enforce_prefix(name, 'commcare') + _validate_tag_names(tags) + self._histogram(name, value, bucket_tag, buckets, bucket_unit, tags, documentation) + @abstractmethod + def _counter(self, name, value, tags, documentation): + raise NotImplementedError -class DummyMetric: - def __init__(self, *args, **kwargs): + @abstractmethod + def _gauge(self, name, value, tags, documentation): pass - def __getattr__(self, item): - if item in ('inc', 'set', 'observe'): - return lambda *args, **kwargs: None - raise AttributeError - + @abstractmethod + def _histogram(self, name, value, bucket_tag, buckets, bucket_unit, tags, documentation): + pass -class DummyMetrics(HqMetrics): - _counter_class = DummyMetric - _gauge_class = DummyMetric - _histogram_class = DummyMetric - def enabled(self) -> bool: - return True +class DummyMetrics: + def __getattr__(self, item): + if item in ('counter', 'gauge', 'histogram'): + def _check(name, documentation, tags, *args, **kwargs): + _enforce_prefix(name, 'commcare') + _validate_tag_names(tags) + return _check + raise AttributeError class DelegatedMetrics: - """This class makes the metric class instantiation lazy and - also multiple metrics providers to be used.""" def __init__(self, delegates): self.delegates = delegates - self._types = { - 'counter': 'inc', - 'gauge': 'set', - 'histogram': 'observe', - } - - def __getattr__(self, item): - if item in self._types: - def _make_type(*args, **kwargs): - return SimpleLazyObject(lambda: DelegatingMetric([ - getattr(d, item)(*args, **kwargs) for d in self.delegates - ], self._types[item])) - return _make_type - raise AttributeError - - -class DelegatingMetric: - def __init__(self, delegates, record_fn_name): - self._delegates = delegates - self._record_fn_name = record_fn_name def __getattr__(self, item): - if item == self._record_fn_name: - def record(*args, **kwargs): - for metric in self._delegates: - getattr(metric, item)(*args, **kwargs) - return record - + if item in ('counter', 'gauge', 'histogram'): + def _record_metric(*args, **kwargs): + for delegate in self.delegates: + getattr(delegate, item)(*args, **kwargs) + return _record_metric raise AttributeError diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py index cce9a0c671ed..ecb757dcc0e3 100644 --- a/corehq/util/metrics/prometheus.py +++ b/corehq/util/metrics/prometheus.py @@ -1,59 +1,33 @@ -import settings -from corehq.util.metrics.metrics import ( - HqCounter, - HqGauge, - HqHistogram, - HqMetrics, - MetricBase, -) +from typing import List + from prometheus_client import Counter as PCounter from prometheus_client import Gauge as PGauge from prometheus_client import Histogram as PHistogram - -class Counter(HqCounter): - """https://prometheus.io/docs/concepts/metric_types/#counter""" - - def _init_metric(self): - self.name = self.name.replace('.', '_') - self._delegate = PCounter(self.name, self.documentation, self.tag_names) - - def _record(self, amount: float, tags): - _get_labeled(self._delegate, tags).inc(amount) - - -class Gauge(HqGauge): - """https://prometheus.io/docs/concepts/metric_types/#gauge""" - - def _init_metric(self): - self.name = self.name.replace('.', '_') - self._delegate = PGauge(self.name, self.documentation, self.tag_names) - - def _record(self, value: float, tags): - _get_labeled(self._delegate, tags).set(value) - - -class Histogram(HqHistogram): - """This metric class 
implements the native Prometheus Histogram type - - https://prometheus.io/docs/concepts/metric_types/#histogram - """ - def _init_metric(self): - self.name = self.name.replace('.', '_') - self._delegate = PHistogram(self.name, self.documentation, self.tag_names, buckets=self._buckets) - - def _record(self, value: float, tags: dict): - _get_labeled(self._delegate, tags).observe(value) - - -def _get_labeled(metric, labels): - return metric.labels(**labels) if labels else metric +from corehq.util.metrics.metrics import HqMetrics class PrometheusMetrics(HqMetrics): - _counter_class = Counter - _gauge_class = Gauge - _histogram_class = Histogram - - def enabled(self) -> bool: - return settings.ENABLE_PROMETHEUS_METRICS + def __init__(self): + self._metrics = {} + + def _counter(self, name: str, value: float = 1, tags: dict = None, documentation: str = ''): + self._get_metric(PCounter, name, tags, documentation).inc(value) + + def _gauge(self, name: str, value: float, tags: dict = None, documentation: str = ''): + self._get_metric(PGauge, name, tags, documentation).set(value) + + def _histogram(self, name: str, value: float, bucket_tag: str, buckets: List[int], bucket_unit: str = '', + tags: dict = None, documentation: str = ''): + self._get_metric(PHistogram, name, tags, documentation, buckets=buckets).observe(value) + + def _get_metric(self, metric_type, name, tags, documentation, **kwargs): + name = name.replace('.', '_') + metric = self._metrics.get(name) + if not metric: + tags = tags or {} + metric = metric_type(name, documentation, labelnames=tags.keys(), **kwargs) + self._metrics[name] = metric + else: + assert metric.__class__ == metric_type + return metric.labels(**tags) if tags else metric diff --git a/corehq/util/metrics/tests/test_metrics.py b/corehq/util/metrics/tests/test_metrics.py index bb9c8232a00b..920fb52bcacb 100644 --- a/corehq/util/metrics/tests/test_metrics.py +++ b/corehq/util/metrics/tests/test_metrics.py @@ -6,9 +6,6 @@ from corehq.util.metrics import DatadogMetrics, PrometheusMetrics from corehq.util.metrics.metrics import ( DelegatedMetrics, - HqCounter, - HqGauge, - HqHistogram, ) from corehq.util.metrics.tests.utils import patch_datadog from prometheus_client.samples import Sample @@ -25,33 +22,31 @@ def setUpClass(cls): cls.provider = cls.provider_class() def test_counter(self): - counter = self.provider.counter('commcare.test.counter', 'Description', tag_names=['t1', 't2']) - counter.inc(t1='a', t2='b') - counter.inc(2, t1='c', t2='b') - counter.inc(t1='c', t2='b') - self.assertCounterMetric(counter, { + self.provider.counter('commcare.test.counter', tags={'t1': 'a', 't2': 'b'}) + self.provider.counter('commcare.test.counter', 2, tags={'t1': 'c', 't2': 'b'}) + self.provider.counter('commcare.test.counter', tags={'t1': 'c', 't2': 'b'}) + self.assertCounterMetric('commcare.test.counter', { (('t1', 'a'), ('t2', 'b')): 1, (('t1', 'c'), ('t2', 'b')): 3, }) def test_gauge(self): - gauge = self.provider.gauge('commcare.test.gauge', 'Description', tag_names=['t1', 't2']) - gauge.set(4.2, t1='a', t2='b') - gauge.set(2, t1='c', t2='b') - gauge.set(5, t1='c', t2='b') - self.assertGaugeMetric(gauge, { + self.provider.gauge('commcare.test.gauge', 4.2, tags={'t1': 'a', 't2': 'b'}) + self.provider.gauge('commcare.test.gauge', 2, tags={'t1': 'c', 't2': 'b'}) + self.provider.gauge('commcare.test.gauge', 5, tags={'t1': 'c', 't2': 'b'}) + self.assertGaugeMetric('commcare.test.gauge', { (('t1', 'a'), ('t2', 'b')): 4.2, (('t1', 'c'), ('t2', 'b')): 5, }) - def 
assertCounterMetric(self, metric: HqCounter, expected: Dict[Tuple[Tuple[str, str], ...], float]): + def assertCounterMetric(self, metric: str, expected: Dict[Tuple[Tuple[str, str], ...], float]): """ :param metric: metric class :param expected: dict mapping tag tuples to metric values """ raise NotImplementedError - def assertGaugeMetric(self, metric: HqGauge, expected: Dict[Tuple[Tuple[str, str], ...], float]): + def assertGaugeMetric(self, metric: str, expected: Dict[Tuple[Tuple[str, str], ...], float]): """ :param metric: metric class :param expected: dict mapping tag tuples to metric values @@ -72,46 +67,46 @@ def tearDown(self) -> None: super().tearDown() def test_histogram(self): - histogram = self.provider.histogram( - 'commcare.test.histogram', 'Description', 'duration', - buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] - ) - histogram.observe(0.2, t1='a', t2='b') - histogram.observe(0.7, t1='a', t2='b') - histogram.observe(2.5, t1='a', t2='b') - - histogram.observe(2, t1='c', t2='b') - histogram.observe(5, t1='c', t2='b') - self.assertHistogramMetric(histogram, { + for value in (0.2, 0.7, 2.5): + self.provider.histogram( + 'commcare.test.histogram', value, 'duration', + buckets=[1, 2, 3], bucket_unit='ms', tags={'t1': 'a', 't2': 'b'} + ) + for value in (2, 5): + self.provider.histogram( + 'commcare.test.histogram', value, 'duration', + buckets=[1, 2, 3], bucket_unit='ms', tags={'t1': 'c', 't2': 'b'} + ) + self.assertHistogramMetric('commcare.test.histogram', { (('t1', 'a'), ('t2', 'b')): {1: 2, 3: 1}, (('t1', 'c'), ('t2', 'b')): {3: 1, INF: 1} - }) + }, 'duration', [1, 2, 3], 'ms') - def assertCounterMetric(self, metric, expected): - self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) + def assertCounterMetric(self, metric_name, expected): + self.assertEqual({key[0] for key in self.recorded_metrics}, {metric_name}) actual = { key[1]: sum(val) for key, val in self.recorded_metrics.items() } self.assertDictEqual(actual, expected) - def assertGaugeMetric(self, metric, expected): - self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) + def assertGaugeMetric(self, metric_name, expected): + self.assertEqual({key[0] for key in self.recorded_metrics}, {metric_name}) actual = { key[1]: val[-1] for key, val in self.recorded_metrics.items() } self.assertDictEqual(actual, expected) - def assertHistogramMetric(self, metric, expected): - self.assertEqual({key[0] for key in self.recorded_metrics}, {metric.name}) + def assertHistogramMetric(self, metric_name, expected, bucket_tag, buckets, bucket_unit): + self.assertEqual({key[0] for key in self.recorded_metrics}, {metric_name}) expected_samples = {} - for tags, buckets in expected.items(): - for bucket, val in buckets.items(): + for tags, expected_buckets in expected.items(): + for bucket, val in expected_buckets.items(): prefix = 'lt' if bucket == INF: - bucket = metric._buckets[-1] + bucket = buckets[-1] prefix = 'over' - bucket_tag = (metric._bucket_tag, f'{prefix}_{bucket:03d}{metric._bucket_unit}') - expected_samples[tuple(sorted(tags + (bucket_tag,)))] = val + dd_bucket_tag = (bucket_tag, f'{prefix}_{bucket:03d}{bucket_unit}') + expected_samples[tuple(sorted(tags + (dd_bucket_tag,)))] = val actual = { key[1]: sum(val) for key, val in self.recorded_metrics.items() @@ -123,20 +118,20 @@ class TestPrometheusMetrics(_TestMetrics): provider_class = PrometheusMetrics def test_histogram(self): - histogram = self.provider.histogram( - 'commcare.test.histogram', 'Description', 'duration', - 
buckets=[1, 2, 3], bucket_unit='ms', tag_names=['t1', 't2'] - ) - histogram.observe(0.2, t1='a', t2='b') - histogram.observe(0.7, t1='a', t2='b') - histogram.observe(2.5, t1='a', t2='b') - - histogram.observe(2, t1='c', t2='b') - histogram.observe(5, t1='c', t2='b') - self.assertHistogramMetric(histogram, { + for value in (0.2, 0.7, 2.5): + self.provider.histogram( + 'commcare_test_histogram', value, 'duration', + buckets=[1, 2, 3], bucket_unit='ms', tags={'t1': 'a', 't2': 'b'} + ) + for value in (2, 5): + self.provider.histogram( + 'commcare_test_histogram', value, 'duration', + buckets=[1, 2, 3], bucket_unit='ms', tags={'t1': 'c', 't2': 'b'} + ) + self.assertHistogramMetric('commcare_test_histogram', { (('t1', 'a'), ('t2', 'b')): {1: 2, 3: 1}, (('t1', 'c'), ('t2', 'b')): {2: 1, INF: 1} - }) + }, [1, 2, 3]) def _samples_to_dics(self, samples, filter_name=None): """Convert a Sample tuple into a dict((name, (labels tuple)) -> value)""" @@ -146,27 +141,32 @@ def _samples_to_dics(self, samples, filter_name=None): if not filter_name or sample.name == filter_name } - def assertGaugeMetric(self, metric, expected): - [collected] = metric._delegate.collect() + def assertGaugeMetric(self, metric_name, expected): + metric_name = metric_name.replace('.', '_') + metric = self.provider._metrics[metric_name] + [collected] = metric.collect() actual = self._samples_to_dics(collected.samples) self.assertDictEqual(actual, expected) - def assertCounterMetric(self, metric, expected): - total_name = f'{metric.name}_total' - [collected] = metric._delegate.collect() + def assertCounterMetric(self, metric_name, expected): + metric_name = metric_name.replace('.', '_') + metric = self.provider._metrics[metric_name] + total_name = f'{metric_name}_total' + [collected] = metric.collect() actual = self._samples_to_dics(collected.samples, total_name) self.assertDictEqual(actual, expected) - def assertHistogramMetric(self, metric, expected): + def assertHistogramMetric(self, metric_name, expected, buckets): # Note that Prometheus histograms are cumulative so we must sum up the successive bucket values # https://en.wikipedia.org/wiki/Histogram#Cumulative_histogram - [collected] = metric._delegate.collect() + metric = self.provider._metrics[metric_name] + [collected] = metric.collect() - sample_name = f'{metric.name}_bucket' + sample_name = f'{metric_name}_bucket' expected_samples = [] for key, value in expected.items(): cumulative_value = 0 - for bucket in metric._buckets: + for bucket in buckets: val = value.get(bucket, 0) cumulative_value += val labels = dict(key + (('le', str(float(bucket))),)) @@ -181,40 +181,3 @@ def assertHistogramMetric(self, metric, expected): if s.name.endswith('bucket') ] self.assertListEqual(actual, expected_samples) - - -def test_delegate_lazy(): - metrics = DelegatedMetrics([DatadogMetrics(), PrometheusMetrics()]) - - def _check(metric): - assert isinstance(metric, SimpleLazyObject), '' - - test_cases = [ - metrics.counter('commcare.name.1', ''), - metrics.gauge('commcare.name.2', ''), - metrics.histogram('commcare.name.3', '', 'duration'), - ] - for metric in test_cases: - yield _check, metric - - -def test_lazy_recording(): - metrics = DelegatedMetrics([DatadogMetrics(), PrometheusMetrics()]) - - def _check(metric, method_name): - with patch_datadog() as stats: - getattr(metric, method_name)(1) - - dd_metric, prom_metric = metric._delegates - [collected] = prom_metric._delegate.collect() - - eq(len(stats), 1, stats) - eq(len(collected.samples) >= 1, True, collected.samples) - - test_cases = 
[ - (metrics.counter('commcare.name.1', ''), 'inc'), - (metrics.gauge('commcare.name.2', ''), 'set'), - (metrics.histogram('commcare.name.3', '', 'duration'), 'observe'), - ] - for metric, method_name in test_cases: - yield _check, metric, method_name From 31cfba91fa4092d307bed740fed1e7c9c35506ed Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 17 Mar 2020 10:48:06 +0200 Subject: [PATCH 22/40] re-do configuration via settings --- corehq/util/metrics/__init__.py | 17 ++++++++++++----- settings.py | 6 +++++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 937a8f483efb..6ac72792915c 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -1,8 +1,10 @@ from typing import List, Iterable +import settings from corehq.util.metrics.datadog import DatadogMetrics from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, DEFAULT_BUCKETS from corehq.util.metrics.prometheus import PrometheusMetrics +from dimagi.utils.modules import to_function _metrics = None @@ -10,13 +12,18 @@ def _get_metrics_provider(): global _metrics if not _metrics: - enabled = list(filter(lambda m: m.enabled(), [DatadogMetrics(), PrometheusMetrics()])) - if not enabled: + providers = [] + for provider_path in settings.METRICS_PROVIDERS: + provider = to_function(provider_path) + provider.validate() + providers.append(provider) + + if not providers: _metrics = DummyMetrics() - elif len(enabled) > 1: - _metrics = DelegatedMetrics(enabled) + elif len(providers) > 1: + _metrics = DelegatedMetrics(providers) else: - _metrics = enabled[0] + _metrics = providers[0] return _metrics diff --git a/settings.py b/settings.py index 05431414ab78..20ec3f4005ea 100755 --- a/settings.py +++ b/settings.py @@ -836,9 +836,13 @@ SUBSCRIPTION_USERNAME = None SUBSCRIPTION_PASSWORD = None +# List of metrics providers to use. 
Available providers: +# * 'corehq.util.metrics.datadog.DatadogMetrics' +# * 'corehq.util.metrics.prometheus.PrometheusMetrics' +METRICS_PROVIDERS = [] + DATADOG_API_KEY = None DATADOG_APP_KEY = None -ENABLE_PROMETHEUS_METRICS = False SYNCLOGS_SQL_DB_ALIAS = 'default' From 892e57bea62eac820a21fc5f1082dec4013ec8d9 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 17 Mar 2020 10:51:24 +0200 Subject: [PATCH 23/40] move initialization into provider --- corehq/util/datadog/apps.py | 27 --------------------------- corehq/util/metrics/__init__.py | 2 +- corehq/util/metrics/datadog.py | 18 ++++++++++++++++-- corehq/util/metrics/metrics.py | 2 +- settings.py | 1 - 5 files changed, 18 insertions(+), 32 deletions(-) delete mode 100644 corehq/util/datadog/apps.py diff --git a/corehq/util/datadog/apps.py b/corehq/util/datadog/apps.py deleted file mode 100644 index 032d76e978d5..000000000000 --- a/corehq/util/datadog/apps.py +++ /dev/null @@ -1,27 +0,0 @@ -from django.apps import AppConfig -from django.conf import settings - - -class DatadogConfig(AppConfig): - - name = 'corehq.util.datadog' - verbose_name = 'Datadog' - - def ready(self): - if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY: - return - - try: - from datadog import initialize - except ImportError: - pass - else: - initialize(settings.DATADOG_API_KEY, settings.DATADOG_APP_KEY) - - if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS: - try: - from ddtrace import tracer - tracer.enabled = False - except ImportError: - pass - diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 6ac72792915c..4d34b1e48822 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -15,7 +15,7 @@ def _get_metrics_provider(): providers = [] for provider_path in settings.METRICS_PROVIDERS: provider = to_function(provider_path) - provider.validate() + provider.initialize() providers.append(provider) if not providers: diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index a2808f72195f..09efc8dc8524 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -16,10 +16,24 @@ class DatadogMetrics(HqMetrics): - def validate(self): - if not (api._api_key and api._application_key): + def initialize(self): + if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY: raise Exception("Datadog not configured. 
Set DATADOG_API_KEY and DATADOG_APP_KEY in settings.") + try: + from datadog import initialize + except ImportError: + pass + else: + initialize(settings.DATADOG_API_KEY, settings.DATADOG_APP_KEY) + + if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS: + try: + from ddtrace import tracer + tracer.enabled = False + except ImportError: + pass + def _counter(self, name: str, value: float, tags: dict = None, documentation: str = ''): dd_tags = _format_tags(tags) _datadog_record(statsd.increment, name, value, dd_tags) diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index e407ce935e45..8b28203692cf 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -36,7 +36,7 @@ def _validate_tag_names(tag_names): class HqMetrics(metaclass=abc.ABCMeta): - def validate(self): + def initialize(self): pass def counter(self, name: str, value: float = 1, tags: dict = None, documentation: str = ''): diff --git a/settings.py b/settings.py index 20ec3f4005ea..8541637ad31c 100755 --- a/settings.py +++ b/settings.py @@ -340,7 +340,6 @@ 'corehq.motech.openmrs', 'corehq.motech.repeaters', 'corehq.util', - 'corehq.util.datadog.apps.DatadogConfig', 'dimagi.ext', 'corehq.blobs', 'corehq.apps.case_search', From dbdbd3fed490e54967e944125082f97ed5465ea9 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 17 Mar 2020 11:43:55 +0200 Subject: [PATCH 24/40] replace datadog_gauge --- corehq/apps/case_importer/tasks.py | 9 ++------ corehq/apps/hqwebapp/tasks.py | 5 +++-- corehq/apps/sms/tasks.py | 5 +++-- corehq/motech/repeaters/tasks.py | 4 ++-- corehq/util/datadog/gauges.py | 34 ------------------------------ corehq/util/metrics/metrics.py | 25 ++++++++++++++++++++++ 6 files changed, 35 insertions(+), 47 deletions(-) diff --git a/corehq/apps/case_importer/tasks.py b/corehq/apps/case_importer/tasks.py index 14d63356b5c6..e0e4202a0ada 100644 --- a/corehq/apps/case_importer/tasks.py +++ b/corehq/apps/case_importer/tasks.py @@ -1,21 +1,16 @@ -from celery import states -from celery.exceptions import Ignore from celery.schedules import crontab from celery.task import task -from soil.progress import update_task_state - from corehq.apps.hqadmin.tasks import ( AbnormalUsageAlert, send_abnormal_usage_alert, ) -from corehq.util.datadog.gauges import datadog_gauge_task - from .do_import import do_import from .exceptions import ImporterError from .tracking.analytics import get_case_upload_files_total_bytes from .tracking.case_upload_tracker import CaseUpload from .util import get_importer_error_message, exit_celery_with_error_message +from ...util.metrics.metrics import metrics_gauge_task @task(serializer='pickle', queue='case_import_queue') @@ -64,7 +59,7 @@ def _alert_on_result(result, domain): send_abnormal_usage_alert.delay(alert) -total_bytes = datadog_gauge_task( +total_bytes = metrics_gauge_task( 'commcare.case_importer.files.total_bytes', get_case_upload_files_total_bytes, run_every=crontab(minute=0) diff --git a/corehq/apps/hqwebapp/tasks.py b/corehq/apps/hqwebapp/tasks.py index f08cb646ae0d..a030c54fe211 100644 --- a/corehq/apps/hqwebapp/tasks.py +++ b/corehq/apps/hqwebapp/tasks.py @@ -5,9 +5,10 @@ from celery.task import task, periodic_task from corehq.util.bounced_email_manager import BouncedEmailManager +from corehq.util.metrics.metrics import metrics_gauge_task from dimagi.utils.logging import notify_exception -from corehq.util.datadog.gauges import datadog_gauge_task, datadog_track_errors +from 
corehq.util.datadog.gauges import datadog_track_errors from corehq.util.log import send_HTML_email @@ -129,5 +130,5 @@ def get_maintenance_alert_active(): return 1 if MaintenanceAlert.get_latest_alert() else 0 -datadog_gauge_task('commcare.maintenance_alerts.active', get_maintenance_alert_active, +metrics_gauge_task('commcare.maintenance_alerts.active', get_maintenance_alert_active, run_every=crontab(minute=1)) diff --git a/corehq/apps/sms/tasks.py b/corehq/apps/sms/tasks.py index 968551e66eeb..aa5c58446751 100644 --- a/corehq/apps/sms/tasks.py +++ b/corehq/apps/sms/tasks.py @@ -7,6 +7,7 @@ from celery.schedules import crontab +from corehq.util.metrics.metrics import metrics_gauge_task from dimagi.utils.couch import ( CriticalSection, get_redis_client, @@ -50,7 +51,7 @@ from corehq.apps.users.models import CommCareUser, CouchUser from corehq.messaging.util import use_phone_entries from corehq.util.celery_utils import no_result_task -from corehq.util.datadog.gauges import datadog_counter, datadog_gauge_task +from corehq.util.datadog.gauges import datadog_counter from corehq.util.timezones.conversions import ServerTime MAX_TRIAL_SMS = 50 @@ -588,4 +589,4 @@ def queued_sms(): return QueuedSMS.objects.count() -datadog_gauge_task('commcare.sms.queued', queued_sms, run_every=crontab()) +metrics_gauge_task('commcare.sms.queued', queued_sms, run_every=crontab()) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 5382fbdca840..25cdd705fb60 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -6,6 +6,7 @@ from celery.task import periodic_task, task from celery.utils.log import get_task_logger +from corehq.util.metrics.metrics import metrics_gauge_task from dimagi.utils.couch import get_redis_lock from dimagi.utils.couch.undo import DELETED_SUFFIX @@ -25,7 +26,6 @@ from corehq.util.datadog.gauges import ( datadog_bucket_timer, datadog_counter, - datadog_gauge_task, ) from corehq.util.datadog.utils import make_buckets_from_timedeltas from corehq.util.soft_assert import soft_assert @@ -137,7 +137,7 @@ def process_repeat_record(repeat_record): logging.exception('Failed to process repeat record: {}'.format(repeat_record._id)) -repeaters_overdue = datadog_gauge_task( +repeaters_overdue = metrics_gauge_task( 'commcare.repeaters.overdue', get_overdue_repeat_record_count, run_every=crontab() # every minute diff --git a/corehq/util/datadog/gauges.py b/corehq/util/datadog/gauges.py index bb4788491192..744fa9ebc057 100644 --- a/corehq/util/datadog/gauges.py +++ b/corehq/util/datadog/gauges.py @@ -9,23 +9,6 @@ from corehq.util.timer import TimingContext -def datadog_gauge_task(name, fn, run_every, enforce_prefix='commcare'): - """ - helper for easily registering datadog gauges to run periodically - - To update a datadog gauge on a schedule based on the result of a function - just add to your app's tasks.py: - - my_calculation = datadog_gauge_task('my.datadog.metric', my_calculation_function, - run_every=crontab(minute=0)) - - """ - _enforce_prefix(name, enforce_prefix) - - datadog_gauge = _DatadogGauge(name, fn, run_every) - return datadog_gauge.periodic_task() - - def datadog_histogram(name, value, enforce_prefix='commcare', tags=None): """ Usage: Used to track the statistical distribution of a set of values over a statsd flush period. 
@@ -100,23 +83,6 @@ def new_stop(name=None): return timer -class _DatadogGauge(object): - - def __init__(self, name, fn, run_every): - self.name = name - self.fn = fn - self.run_every = run_every - - def periodic_task(self): - @periodic_task(serializer='pickle', queue='background_queue', run_every=self.run_every, - acks_late=True, ignore_result=True) - @wraps(self.fn) - def inner(*args, **kwargs): - statsd.gauge(self.name, self.fn(*args, **kwargs)) - - return inner - - def _enforce_prefix(name, prefix): soft_assert(fail_if_debug=True).call( not prefix or name.split('.')[0] == prefix, diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 8b28203692cf..00ed7a633b98 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -1,8 +1,10 @@ import abc import re from abc import abstractmethod +from functools import wraps from typing import Iterable, List +from celery.task import periodic_task from django.utils.functional import SimpleLazyObject from corehq.util.soft_assert import soft_assert @@ -93,3 +95,26 @@ def _record_metric(*args, **kwargs): getattr(delegate, item)(*args, **kwargs) return _record_metric raise AttributeError + + +def metrics_gauge_task(name, fn, run_every): + """ + helper for easily registering gauges to run periodically + + To update a gauge on a schedule based on the result of a function + just add to your app's tasks.py: + + my_calculation = metrics_gauge_task('commcare.my.metric', my_calculation_function, + run_every=crontab(minute=0)) + + """ + _enforce_prefix(name, 'commcare') + + @periodic_task(serializer='pickle', queue='background_queue', run_every=run_every, + acks_late=True, ignore_result=True) + @wraps(fn) + def inner(*args, **kwargs): + from corehq.util.metrics import metrics_gauge + metrics_gauge(name, fn(*args, **kwargs)) + + return inner From 5897eb9574ad8baf4615eb8ff5a2062b2f7cf14f Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 17 Mar 2020 14:43:11 +0200 Subject: [PATCH 25/40] instantiate provider --- corehq/util/metrics/__init__.py | 2 +- corehq/util/metrics/urls.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 corehq/util/metrics/urls.py diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 4d34b1e48822..9ee5c6dcda2d 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -14,7 +14,7 @@ def _get_metrics_provider(): if not _metrics: providers = [] for provider_path in settings.METRICS_PROVIDERS: - provider = to_function(provider_path) + provider = to_function(provider_path)() provider.initialize() providers.append(provider) diff --git a/corehq/util/metrics/urls.py b/corehq/util/metrics/urls.py new file mode 100644 index 000000000000..e69de29bb2d1 From 28a8a5bc448bbc958e9defa9b27d82ee66143702 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 17 Mar 2020 14:43:40 +0200 Subject: [PATCH 26/40] hook up metrics view --- corehq/util/metrics/urls.py | 7 +++++++ corehq/util/metrics/views.py | 19 +++++++++++++++++++ deployment/gunicorn/gunicorn_conf.py | 13 +++++++++++++ urls.py | 3 ++- 4 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 corehq/util/metrics/views.py diff --git a/corehq/util/metrics/urls.py b/corehq/util/metrics/urls.py index e69de29bb2d1..b6df3985ac4f 100644 --- a/corehq/util/metrics/urls.py +++ b/corehq/util/metrics/urls.py @@ -0,0 +1,7 @@ +from django.conf.urls import url + +from corehq.util.metrics.views import prometheus_metrics + +urlpatterns = [ + url(r"^metrics$", 
prometheus_metrics, name="prometheus-django-metrics") +] diff --git a/corehq/util/metrics/views.py b/corehq/util/metrics/views.py new file mode 100644 index 000000000000..d55d55934b7b --- /dev/null +++ b/corehq/util/metrics/views.py @@ -0,0 +1,19 @@ +import os + +import prometheus_client +from django.http import HttpResponse +from prometheus_client import multiprocess + + +def prometheus_metrics(request): + """Exports /metrics as a Django view. + """ + if "prometheus_multiproc_dir" in os.environ: + registry = prometheus_client.CollectorRegistry() + multiprocess.MultiProcessCollector(registry) + else: + registry = prometheus_client.REGISTRY + metrics_page = prometheus_client.generate_latest(registry) + return HttpResponse( + metrics_page, content_type=prometheus_client.CONTENT_TYPE_LATEST + ) diff --git a/deployment/gunicorn/gunicorn_conf.py b/deployment/gunicorn/gunicorn_conf.py index 668c9e68a008..9462855841ad 100644 --- a/deployment/gunicorn/gunicorn_conf.py +++ b/deployment/gunicorn/gunicorn_conf.py @@ -1,3 +1,6 @@ +import glob +import os + preload_app = True worker_class = 'gevent' keepalive = 60 @@ -18,3 +21,13 @@ def post_fork(server, worker): # see: https://github.com/benoitc/gunicorn/issues/527#issuecomment-19601046 from django.urls import resolve resolve('/') + +def on_starting(server): + path = os.environ.get('prometheus_multiproc_dir') + for f in glob.glob(os.path.join(path, '*.db')): + os.remove(f) + + +def child_exit(server, worker): + from prometheus_client import multiprocess + multiprocess.mark_process_dead(worker.pid) diff --git a/urls.py b/urls.py index 6d2b35fc2004..14bb326d340e 100644 --- a/urls.py +++ b/urls.py @@ -141,7 +141,8 @@ url(r'^unsubscribe_report/(?P[\w-]+)/' r'(?P[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})/(?P[\w-]+)/', ReportNotificationUnsubscribeView.as_view(), name=ReportNotificationUnsubscribeView.urlname), - url(r'^phone/list_apps', list_apps, name="list_accessible_apps") + url(r'^phone/list_apps', list_apps, name="list_accessible_apps"), + url(r'', include('corehq.util.metrics.urls')), ] + LOCAL_APP_URLS if settings.ENABLE_PRELOGIN_SITE: From abd03555f983678b7ad47aa9f1dbba3c78588732 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 17 Mar 2020 15:36:13 +0200 Subject: [PATCH 27/40] todo --- corehq/util/metrics/metrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 00ed7a633b98..093c9ffbd63e 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -115,6 +115,7 @@ def metrics_gauge_task(name, fn, run_every): @wraps(fn) def inner(*args, **kwargs): from corehq.util.metrics import metrics_gauge + # TODO: make this use prometheus push gateway metrics_gauge(name, fn(*args, **kwargs)) return inner From 75fa9793259f75075bcc2f83fe58253bd4479737 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Tue, 17 Mar 2020 16:22:16 +0200 Subject: [PATCH 28/40] lint --- corehq/apps/receiverwrapper/views.py | 8 +++----- corehq/util/metrics/__init__.py | 11 +++++------ corehq/util/metrics/datadog.py | 1 - corehq/util/metrics/metrics.py | 4 +--- corehq/util/metrics/tests/test_metrics.py | 5 ----- deployment/gunicorn/gunicorn_conf.py | 2 ++ 6 files changed, 11 insertions(+), 20 deletions(-) diff --git a/corehq/apps/receiverwrapper/views.py b/corehq/apps/receiverwrapper/views.py index 00d00de4b8cb..6fe383b8465b 100644 --- a/corehq/apps/receiverwrapper/views.py +++ b/corehq/apps/receiverwrapper/views.py @@ -1,16 +1,11 @@ -import logging import os from django.http import 
HttpResponseBadRequest, HttpResponseForbidden from django.views.decorators.csrf import csrf_exempt from django.views.decorators.http import require_POST -from couchdbkit import ResourceNotFound -from tastypie.http import HttpTooManyRequests - import couchforms from casexml.apps.case.xform import get_case_updates, is_device_report -from corehq.util.metrics import metrics, metrics_counter, metrics_histogram from couchforms import openrosa_response from couchforms.const import MAGIC_PROPERTY, BadRequest from couchforms.getters import MultimediaBug @@ -51,7 +46,10 @@ convert_xform_to_json, should_use_sql_backend, ) +from corehq.util.metrics import metrics_counter, metrics_histogram from corehq.util.timer import TimingContext +from couchdbkit import ResourceNotFound +from tastypie.http import HttpTooManyRequests PROFILE_PROBABILITY = float(os.getenv('COMMCARE_PROFILE_SUBMISSION_PROBABILITY', 0)) PROFILE_LIMIT = os.getenv('COMMCARE_PROFILE_SUBMISSION_LIMIT') diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 9ee5c6dcda2d..9cdb66dd182f 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -1,9 +1,7 @@ -from typing import List, Iterable +from typing import Iterable import settings -from corehq.util.metrics.datadog import DatadogMetrics from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, DEFAULT_BUCKETS -from corehq.util.metrics.prometheus import PrometheusMetrics from dimagi.utils.modules import to_function _metrics = None @@ -37,8 +35,9 @@ def metrics_gauge(name: str, value: float, tags: dict = None, documentation: str provider.gauge(name, value, tags, documentation) -def metrics_histogram(name: str, value: float, - bucket_tag: str, buckets: Iterable[int] = DEFAULT_BUCKETS, bucket_unit: str = '', - tags: dict = None, documentation: str = ''): +def metrics_histogram( + name: str, value: float, + bucket_tag: str, buckets: Iterable[int] = DEFAULT_BUCKETS, bucket_unit: str = '', + tags: dict = None, documentation: str = ''): provider = _get_metrics_provider() provider.histogram(name, value, bucket_tag, buckets, bucket_unit, tags, documentation) diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index 09efc8dc8524..f5bca56f24f1 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -5,7 +5,6 @@ from corehq.util.datadog.utils import bucket_value from corehq.util.metrics.metrics import HqMetrics -from datadog import api from datadog.dogstatsd.base import DogStatsd datadog_logger = logging.getLogger('datadog') diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 093c9ffbd63e..1ddb92167c30 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -2,11 +2,9 @@ import re from abc import abstractmethod from functools import wraps -from typing import Iterable, List +from typing import List from celery.task import periodic_task -from django.utils.functional import SimpleLazyObject - from corehq.util.soft_assert import soft_assert from prometheus_client.utils import INF diff --git a/corehq/util/metrics/tests/test_metrics.py b/corehq/util/metrics/tests/test_metrics.py index 920fb52bcacb..535521bdc9b3 100644 --- a/corehq/util/metrics/tests/test_metrics.py +++ b/corehq/util/metrics/tests/test_metrics.py @@ -1,16 +1,11 @@ from typing import Dict, Tuple from django.test import SimpleTestCase -from django.utils.functional import SimpleLazyObject from corehq.util.metrics import DatadogMetrics, PrometheusMetrics -from 
corehq.util.metrics.metrics import ( - DelegatedMetrics, -) from corehq.util.metrics.tests.utils import patch_datadog from prometheus_client.samples import Sample from prometheus_client.utils import INF -from testil import eq class _TestMetrics(SimpleTestCase): diff --git a/deployment/gunicorn/gunicorn_conf.py b/deployment/gunicorn/gunicorn_conf.py index 9462855841ad..631b4d6a2fe5 100644 --- a/deployment/gunicorn/gunicorn_conf.py +++ b/deployment/gunicorn/gunicorn_conf.py @@ -22,7 +22,9 @@ def post_fork(server, worker): from django.urls import resolve resolve('/') + def on_starting(server): + """Wipe hte metrics from previous processes""" path = os.environ.get('prometheus_multiproc_dir') for f in glob.glob(os.path.join(path, '*.db')): os.remove(f) From 017ca5a57c5e6150df16d05abd0350577dc1a2a5 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 18 Mar 2020 10:10:53 +0200 Subject: [PATCH 29/40] PR feedback --- corehq/util/metrics/__init__.py | 6 ++++++ corehq/util/metrics/datadog.py | 6 +++++- corehq/util/metrics/metrics.py | 6 +++--- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 9cdb66dd182f..2bc9b73c2159 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -4,6 +4,12 @@ from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, DEFAULT_BUCKETS from dimagi.utils.modules import to_function +__all__ = [ + 'metrics_counter', + 'metrics_gauge', + 'metrics_histogram', +] + _metrics = None diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index f5bca56f24f1..ee30af19f650 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -17,7 +17,11 @@ class DatadogMetrics(HqMetrics): def initialize(self): if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY: - raise Exception("Datadog not configured. Set DATADOG_API_KEY and DATADOG_APP_KEY in settings.") + raise Exception( + "Datadog not configured." + "Set DATADOG_API_KEY and DATADOG_APP_KEY in settings or update METRICS_PROVIDERS" + "to remove the Datadog provider." 
+ ) try: from datadog import initialize diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 1ddb92167c30..87bc96e3cf3e 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -65,11 +65,11 @@ def _counter(self, name, value, tags, documentation): @abstractmethod def _gauge(self, name, value, tags, documentation): - pass + raise NotImplementedError @abstractmethod def _histogram(self, name, value, bucket_tag, buckets, bucket_unit, tags, documentation): - pass + raise NotImplementedError class DummyMetrics: @@ -92,7 +92,7 @@ def _record_metric(*args, **kwargs): for delegate in self.delegates: getattr(delegate, item)(*args, **kwargs) return _record_metric - raise AttributeError + raise AttributeError(item) def metrics_gauge_task(name, fn, run_every): From 770f4a037b7aed45a43f2013ae45215334a34e9a Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 18 Mar 2020 10:12:03 +0200 Subject: [PATCH 30/40] log output from DebugMetrics --- corehq/util/metrics/__init__.py | 4 ++-- corehq/util/metrics/metrics.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 2bc9b73c2159..9d45ead614d5 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -1,7 +1,7 @@ from typing import Iterable import settings -from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, DEFAULT_BUCKETS +from corehq.util.metrics.metrics import DebugMetrics, DelegatedMetrics, DEFAULT_BUCKETS from dimagi.utils.modules import to_function __all__ = [ @@ -23,7 +23,7 @@ def _get_metrics_provider(): providers.append(provider) if not providers: - _metrics = DummyMetrics() + _metrics = DebugMetrics() elif len(providers) > 1: _metrics = DelegatedMetrics(providers) else: diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 87bc96e3cf3e..70597ce99bef 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -1,4 +1,5 @@ import abc +import logging import re from abc import abstractmethod from functools import wraps @@ -14,6 +15,9 @@ RESERVED_METRIC_TAG_NAMES = ['quantile', 'le'] +logger = logging.getLogger('commcare.metrics') + + def _enforce_prefix(name, prefix): soft_assert(fail_if_debug=True).call( not prefix or name.startswith(prefix), @@ -72,14 +76,16 @@ def _histogram(self, name, value, bucket_tag, buckets, bucket_unit, tags, docume raise NotImplementedError -class DummyMetrics: +class DebugMetrics: def __getattr__(self, item): if item in ('counter', 'gauge', 'histogram'): - def _check(name, documentation, tags, *args, **kwargs): + def _check(name, value, *args, **kwargs): + tags = kwargs.get('tags', {}) _enforce_prefix(name, 'commcare') _validate_tag_names(tags) + logger.debug("[%s] %s %s %s", item, name, tags, value) return _check - raise AttributeError + raise AttributeError(item) class DelegatedMetrics: From 8bb9e03a5e476d7072ac34327fe624dcd5acdc5e Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 18 Mar 2020 10:32:28 +0200 Subject: [PATCH 31/40] move metrics view to hq/admin --- deployment/gunicorn/gunicorn_conf.py | 2 +- urls.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deployment/gunicorn/gunicorn_conf.py b/deployment/gunicorn/gunicorn_conf.py index 631b4d6a2fe5..aa8399de7ad6 100644 --- a/deployment/gunicorn/gunicorn_conf.py +++ b/deployment/gunicorn/gunicorn_conf.py @@ -24,7 +24,7 @@ def post_fork(server, worker): def on_starting(server): - """Wipe 
hte metrics from previous processes""" + """Wipe the metrics from previous processes""" path = os.environ.get('prometheus_multiproc_dir') for f in glob.glob(os.path.join(path, '*.db')): os.remove(f) diff --git a/urls.py b/urls.py index 14bb326d340e..d0c98c0a1564 100644 --- a/urls.py +++ b/urls.py @@ -104,6 +104,7 @@ url(r'^hq/sms/', include(sms_admin_interface_urls)), url(r'^hq/multimedia/', include('corehq.apps.hqmedia.urls')), url(r'^hq/admin/', include('corehq.apps.hqadmin.urls')), + url(r'^hq/admin/', include('corehq.util.metrics.urls')), url(r'^hq/flags/', include('corehq.apps.toggle_ui.urls')), url(r'^hq/notifications/', include('corehq.apps.notifications.urls')), url(r'^unicel/', include('corehq.messaging.smsbackends.unicel.urls')), @@ -142,7 +143,6 @@ r'(?P[\w.%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})/(?P[\w-]+)/', ReportNotificationUnsubscribeView.as_view(), name=ReportNotificationUnsubscribeView.urlname), url(r'^phone/list_apps', list_apps, name="list_accessible_apps"), - url(r'', include('corehq.util.metrics.urls')), ] + LOCAL_APP_URLS if settings.ENABLE_PRELOGIN_SITE: From 9a49d2681e15ee96f9baf18a8763199588b7e3aa Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 18 Mar 2020 10:36:00 +0200 Subject: [PATCH 32/40] move metrics_gauge_task to package init --- corehq/apps/case_importer/tasks.py | 2 +- corehq/apps/hqwebapp/tasks.py | 2 +- corehq/apps/sms/tasks.py | 2 +- corehq/motech/repeaters/tasks.py | 2 +- corehq/util/metrics/__init__.py | 29 ++++++++++++++++++++++++++++- corehq/util/metrics/metrics.py | 26 -------------------------- 6 files changed, 32 insertions(+), 31 deletions(-) diff --git a/corehq/apps/case_importer/tasks.py b/corehq/apps/case_importer/tasks.py index e0e4202a0ada..087ba77a8e30 100644 --- a/corehq/apps/case_importer/tasks.py +++ b/corehq/apps/case_importer/tasks.py @@ -10,7 +10,7 @@ from .tracking.analytics import get_case_upload_files_total_bytes from .tracking.case_upload_tracker import CaseUpload from .util import get_importer_error_message, exit_celery_with_error_message -from ...util.metrics.metrics import metrics_gauge_task +from ...util.metrics import metrics_gauge_task @task(serializer='pickle', queue='case_import_queue') diff --git a/corehq/apps/hqwebapp/tasks.py b/corehq/apps/hqwebapp/tasks.py index a030c54fe211..35fdef85381f 100644 --- a/corehq/apps/hqwebapp/tasks.py +++ b/corehq/apps/hqwebapp/tasks.py @@ -5,7 +5,7 @@ from celery.task import task, periodic_task from corehq.util.bounced_email_manager import BouncedEmailManager -from corehq.util.metrics.metrics import metrics_gauge_task +from corehq.util.metrics import metrics_gauge_task from dimagi.utils.logging import notify_exception from corehq.util.datadog.gauges import datadog_track_errors diff --git a/corehq/apps/sms/tasks.py b/corehq/apps/sms/tasks.py index aa5c58446751..254579db4273 100644 --- a/corehq/apps/sms/tasks.py +++ b/corehq/apps/sms/tasks.py @@ -7,7 +7,7 @@ from celery.schedules import crontab -from corehq.util.metrics.metrics import metrics_gauge_task +from corehq.util.metrics import metrics_gauge_task from dimagi.utils.couch import ( CriticalSection, get_redis_client, diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 25cdd705fb60..1a6c1fc12431 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -6,7 +6,7 @@ from celery.task import periodic_task, task from celery.utils.log import get_task_logger -from corehq.util.metrics.metrics import metrics_gauge_task +from corehq.util.metrics import metrics_gauge_task from 
dimagi.utils.couch import get_redis_lock from dimagi.utils.couch.undo import DELETED_SUFFIX diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 9d45ead614d5..17eacee76f88 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -1,7 +1,10 @@ +from functools import wraps from typing import Iterable +from celery.task import periodic_task + import settings -from corehq.util.metrics.metrics import DebugMetrics, DelegatedMetrics, DEFAULT_BUCKETS +from corehq.util.metrics.metrics import DebugMetrics, DelegatedMetrics, DEFAULT_BUCKETS, _enforce_prefix from dimagi.utils.modules import to_function __all__ = [ @@ -47,3 +50,27 @@ def metrics_histogram( tags: dict = None, documentation: str = ''): provider = _get_metrics_provider() provider.histogram(name, value, bucket_tag, buckets, bucket_unit, tags, documentation) + + +def metrics_gauge_task(name, fn, run_every): + """ + helper for easily registering gauges to run periodically + + To update a gauge on a schedule based on the result of a function + just add to your app's tasks.py: + + my_calculation = metrics_gauge_task('commcare.my.metric', my_calculation_function, + run_every=crontab(minute=0)) + + """ + _enforce_prefix(name, 'commcare') + + @periodic_task(serializer='pickle', queue='background_queue', run_every=run_every, + acks_late=True, ignore_result=True) + @wraps(fn) + def inner(*args, **kwargs): + from corehq.util.metrics import metrics_gauge + # TODO: make this use prometheus push gateway + metrics_gauge(name, fn(*args, **kwargs)) + + return inner diff --git a/corehq/util/metrics/metrics.py b/corehq/util/metrics/metrics.py index 70597ce99bef..d181927f8fb4 100644 --- a/corehq/util/metrics/metrics.py +++ b/corehq/util/metrics/metrics.py @@ -2,10 +2,8 @@ import logging import re from abc import abstractmethod -from functools import wraps from typing import List -from celery.task import periodic_task from corehq.util.soft_assert import soft_assert from prometheus_client.utils import INF @@ -99,27 +97,3 @@ def _record_metric(*args, **kwargs): getattr(delegate, item)(*args, **kwargs) return _record_metric raise AttributeError(item) - - -def metrics_gauge_task(name, fn, run_every): - """ - helper for easily registering gauges to run periodically - - To update a gauge on a schedule based on the result of a function - just add to your app's tasks.py: - - my_calculation = metrics_gauge_task('commcare.my.metric', my_calculation_function, - run_every=crontab(minute=0)) - - """ - _enforce_prefix(name, 'commcare') - - @periodic_task(serializer='pickle', queue='background_queue', run_every=run_every, - acks_late=True, ignore_result=True) - @wraps(fn) - def inner(*args, **kwargs): - from corehq.util.metrics import metrics_gauge - # TODO: make this use prometheus push gateway - metrics_gauge(name, fn(*args, **kwargs)) - - return inner From 74ce07fd5eb064c64c5b2cf0e1732d759139870d Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 18 Mar 2020 10:39:19 +0200 Subject: [PATCH 33/40] fix import --- corehq/util/metrics/tests/test_metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/corehq/util/metrics/tests/test_metrics.py b/corehq/util/metrics/tests/test_metrics.py index 535521bdc9b3..50b5ad514b51 100644 --- a/corehq/util/metrics/tests/test_metrics.py +++ b/corehq/util/metrics/tests/test_metrics.py @@ -2,7 +2,8 @@ from django.test import SimpleTestCase -from corehq.util.metrics import DatadogMetrics, PrometheusMetrics +from corehq.util.metrics.datadog import 
DatadogMetrics +from corehq.util.metrics.prometheus import PrometheusMetrics from corehq.util.metrics.tests.utils import patch_datadog from prometheus_client.samples import Sample from prometheus_client.utils import INF From f1874e43522c7c8f871eb62cf128e17146425aa2 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 18 Mar 2020 11:47:07 +0200 Subject: [PATCH 34/40] add script for running metrics endpoint --- deployment/prometheum_server.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 deployment/prometheum_server.py diff --git a/deployment/prometheum_server.py b/deployment/prometheum_server.py new file mode 100644 index 000000000000..1de11e7f2b04 --- /dev/null +++ b/deployment/prometheum_server.py @@ -0,0 +1,26 @@ +""" +Simple WSGI server that exposes Prometheus metrics. + +Environment variable `prometheus_multiproc_dir` must be set and match +the value used by Django. +""" +import os +import threading +from wsgiref.simple_server import make_server + +from prometheus_client import CollectorRegistry, make_wsgi_app, multiprocess +from prometheus_client.exposition import _SilentHandler + +multiproc_dir = os.environ.get("prometheus_multiproc_dir") +if not multiproc_dir: + raise Exception("Environment variable 'prometheus_multiproc_dir' is not set") + +print(f"Exposing metrics from '{multiproc_dir}'") + +registry = CollectorRegistry() +multiprocess.MultiProcessCollector(registry) + +app = make_wsgi_app(registry) +httpd = make_server('', 9011, app, handler_class=_SilentHandler) +t = threading.Thread(target=httpd.serve_forever) +t.start() From d4220f06d756de09565fc54775a3cc1921d28777 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 18 Mar 2020 12:13:34 +0200 Subject: [PATCH 35/40] update docs --- corehq/util/metrics/__init__.py | 1 + corehq/util/metrics/datadog.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 17eacee76f88..410d53c2dad3 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -11,6 +11,7 @@ 'metrics_counter', 'metrics_gauge', 'metrics_histogram', + 'metrics_gauge_task', ] _metrics = None diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index ee30af19f650..6b766d0fa350 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -52,16 +52,16 @@ def _histogram(self, name: str, value: float, This implementation of histogram uses tagging to record the buckets. It does not use the Datadog Histogram metric type. - The metric itself will be incremented by 1 on each call to `observe`. The value - passed to `observe` will be used to create the bucket tag. + The metric itself will be incremented by 1 on each call. The value + passed to `metrics_histogram` will be used to create the bucket tag. 
For example: - h = Histogram( - 'commcare.request.duration', 'description', - bucket_tag='duration', buckets=[1,2,3], bucket_units='ms' + h = metrics_histogram( + 'commcare.request.duration', 1.4, + bucket_tag='duration', buckets=[1,2,3], bucket_units='ms', + tags=tags ) - h.observe(1.4) # resulting Datadog metric # commcare.request.duration:1|c|#duration:lt_2ms From 0457c000c5944cf12d3ca6ab55491c3ac57167c8 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 18 Mar 2020 13:58:03 +0200 Subject: [PATCH 36/40] docs --- corehq/util/metrics/__init__.py | 108 +++++++++++++++++++++++++++++- corehq/util/metrics/datadog.py | 19 +++++- corehq/util/metrics/prometheus.py | 34 +++++++++- docs/index.rst | 1 + docs/metrics.rst | 1 + 5 files changed, 156 insertions(+), 7 deletions(-) create mode 100644 docs/metrics.rst diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 410d53c2dad3..44855c9ba4c0 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -1,3 +1,102 @@ +""" +Metrics collection +****************** + +.. contents:: + :local: + +This package exposes functions and utilities to record metrics in CommCare. These metrics +are exported / exposed to the configured metrics providers. Supported providers are: + + * Datadog + * Prometheus + +Providers are enabled using the `METRICS_PROVIDER` setting. Multiple providers can be +enabled concurrently: + +:: + + METRICS_PROVIDERS = [ + 'corehq.util.metrics.prometheus.PrometheusMetrics', + 'corehq.util.metrics.datadog.DatadogMetrics', + ] + +If no metrics providers are configured CommCare will log all metrics to the `commcare.metrics` logger +at the DEBUG level. + +Metric tagging +============== +Metrics may be tagged by passing a dictionary of tag names and values. Tags should be used +add dimensions to a metric e.g. request type, response status. + +Tags should not originate from unbounded sources or sources with high dimensionality such as +timestamps, user IDs, request IDs etc. Ideally a tag should not have more than 10 possible values. + +Read more about tagging: + +* https://prometheus.io/docs/practices/naming/#labels +* https://docs.datadoghq.com/tagging/ + +Metric Types +============ + +Counter metric +'''''''''''''' + +A counter is a cumulative metric that represents a single monotonically increasing counter +whose value can only increase or be reset to zero on restart. For example, you can use a +counter to represent the number of requests served, tasks completed, or errors. + +Do not use a counter to expose a value that can decrease. For example, do not use a counter +for the number of currently running processes; instead use a gauge. + +:: + + metrics_counter('commcare.case_import.count', 1, tags={'domain': domain}) + + +Gauge metric +'''''''''''' + +A gauge is a metric that represents a single numerical value that can arbitrarily go up and down. + +Gauges are typically used for measured values like temperatures or current memory usage, +but also "counts" that can go up and down, like the number of concurrent requests. + +:: + + metrics_gauge('commcare.case_import.queue_length', queue_length) + +For regular reporting of a gauge metric there is the `metrics_gauge_task` function: + +.. autofunction:: corehq.util.metrics.metrics_gauge_task + +Histogram metric +'''''''''''''''' + +A histogram samples observations (usually things like request durations or response sizes) +and counts them in configurable buckets. 
+ +:: + + metrics_histogram( + 'commcare.case_import.duration', timer_duration, + bucket_tag='size', buckets=[10, 50, 200, 1000], bucket_unit='s', + tags={'domain': domain} + ) + +Histograms are recorded differently in the different providers. + +.. automethod:: corehq.util.metrics.datadog.DatadogMetrics._histogram + +.. automethod:: corehq.util.metrics.prometheus.PrometheusMetrics._histogram + + +Other Notes +=========== + +* All metrics must use the prefix 'commcare.' +""" from functools import wraps from typing import Iterable @@ -55,13 +154,16 @@ def metrics_histogram( def metrics_gauge_task(name, fn, run_every): """ - helper for easily registering gauges to run periodically + Helper for easily registering gauges to run periodically To update a gauge on a schedule based on the result of a function just add to your app's tasks.py: - my_calculation = metrics_gauge_task('commcare.my.metric', my_calculation_function, - run_every=crontab(minute=0)) + :: + + my_calculation = metrics_gauge_task( + 'commcare.my.metric', my_calculation_function, run_every=crontab(minute=0) + ) """ _enforce_prefix(name, 'commcare') diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index 6b766d0fa350..29baa01272ae 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -15,6 +15,13 @@ class DatadogMetrics(HqMetrics): + """Datadog Metrics Provider + + Settings: + * DATADOG_API_KEY + * DATADOG_APP_KEY + """ + def initialize(self): if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY: raise Exception( @@ -38,10 +45,13 @@ def initialize(self): pass def _counter(self, name: str, value: float, tags: dict = None, documentation: str = ''): + """Although this is submitted as a COUNT the Datadog app represents these as a RATE. 
+ See https://docs.datadoghq.com/developers/metrics/types/?tab=rate#definition""" dd_tags = _format_tags(tags) _datadog_record(statsd.increment, name, value, dd_tags) def _gauge(self, name: str, value: float, tags: dict = None, documentation: str = ''): + """See https://docs.datadoghq.com/developers/metrics/types/?tab=gauge#definition""" dd_tags = _format_tags(tags) _datadog_record(statsd.gauge, name, value, dd_tags) @@ -57,16 +67,19 @@ def _histogram(self, name: str, value: float, For example: + :: + h = metrics_histogram( 'commcare.request.duration', 1.4, bucket_tag='duration', buckets=[1,2,3], bucket_units='ms', tags=tags ) - # resulting Datadog metric - # commcare.request.duration:1|c|#duration:lt_2ms + # resulting metrics + # commcare.request.duration:1|c|#duration:lt_2ms + + For more explanation about why this implementation was chosen see: - For more details see: * https://github.com/dimagi/commcare-hq/pull/17080 * https://github.com/dimagi/commcare-hq/pull/17030#issuecomment-315794700 """ diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py index ecb757dcc0e3..f3766870345a 100644 --- a/corehq/util/metrics/prometheus.py +++ b/corehq/util/metrics/prometheus.py @@ -8,17 +8,49 @@ class PrometheusMetrics(HqMetrics): - def __init__(self): + """Prometheus Metrics Provider""" + + def initialize(self): self._metrics = {} def _counter(self, name: str, value: float = 1, tags: dict = None, documentation: str = ''): + """See https://prometheus.io/docs/concepts/metric_types/#counter""" self._get_metric(PCounter, name, tags, documentation).inc(value) def _gauge(self, name: str, value: float, tags: dict = None, documentation: str = ''): + """See https://prometheus.io/docs/concepts/metric_types/#histogram""" self._get_metric(PGauge, name, tags, documentation).set(value) def _histogram(self, name: str, value: float, bucket_tag: str, buckets: List[int], bucket_unit: str = '', tags: dict = None, documentation: str = ''): + """ + A cumulative histogram with a base metric name of exposes multiple time series + during a scrape: + + * cumulative counters for the observation buckets, exposed as + `_bucket{le=""}` + * the total sum of all observed values, exposed as `_sum` + * the count of events that have been observed, exposed as `_count` + (identical to `_bucket{le="+Inf"}` above) + + For example + :: + + h = metrics_histogram( + 'commcare.request_duration', 1.4, + bucket_tag='duration', buckets=[1,2,3], bucket_units='ms', + tags=tags + ) + + # resulting metrics + # commcare_request_duration_bucket{...tags..., le="1.0"} 0.0 + # commcare_request_duration_bucket{...tags..., le="2.0"} 1.0 + # commcare_request_duration_bucket{...tags..., le="3.0"} 1.0 + # commcare_request_duration_bucket{...tags..., le="+Inf"} 1.0 + # commcare_request_duration_sum{...tags...} 1.4 + # commcare_request_duration_count{...tags...} 1.0 + + See https://prometheus.io/docs/concepts/metric_types/#histogram""" self._get_metric(PHistogram, name, tags, documentation, buckets=buckets).observe(value) def _get_metric(self, metric_type, name, tags, documentation, **kwargs): diff --git a/docs/index.rst b/docs/index.rst index 697abee9a1b6..47845d2f3d93 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -48,6 +48,7 @@ Welcome to CommCareHQ's documentation! openmrs js-guide/README databases + metrics Tips for documenting -------------------- diff --git a/docs/metrics.rst b/docs/metrics.rst new file mode 100644 index 000000000000..fdfaf57dc3b1 --- /dev/null +++ b/docs/metrics.rst @@ -0,0 +1 @@ +.. 
automodule:: corehq.util.metrics From 3d7991264236ede1ee3f2bf850ae7486449ebce6 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Wed, 18 Mar 2020 16:12:21 +0200 Subject: [PATCH 37/40] do setup in __init__ --- corehq/util/metrics/__init__.py | 1 - corehq/util/metrics/datadog.py | 19 +++++++++++-------- corehq/util/metrics/prometheus.py | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 44855c9ba4c0..80315e3462c2 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -122,7 +122,6 @@ def _get_metrics_provider(): providers = [] for provider_path in settings.METRICS_PROVIDERS: provider = to_function(provider_path)() - provider.initialize() providers.append(provider) if not providers: diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index 29baa01272ae..143aedebe41c 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -22,7 +22,17 @@ class DatadogMetrics(HqMetrics): * DATADOG_APP_KEY """ - def initialize(self): + def __init__(self): + if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS: + try: + from ddtrace import tracer + tracer.enabled = False + except ImportError: + pass + + if settings.UNIT_TESTING: + return + if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY: raise Exception( "Datadog not configured." @@ -37,13 +47,6 @@ def initialize(self): else: initialize(settings.DATADOG_API_KEY, settings.DATADOG_APP_KEY) - if settings.UNIT_TESTING or settings.DEBUG or 'ddtrace.contrib.django' not in settings.INSTALLED_APPS: - try: - from ddtrace import tracer - tracer.enabled = False - except ImportError: - pass - def _counter(self, name: str, value: float, tags: dict = None, documentation: str = ''): """Although this is submitted as a COUNT the Datadog app represents these as a RATE. See https://docs.datadoghq.com/developers/metrics/types/?tab=rate#definition""" diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py index f3766870345a..3ddca3537153 100644 --- a/corehq/util/metrics/prometheus.py +++ b/corehq/util/metrics/prometheus.py @@ -10,7 +10,7 @@ class PrometheusMetrics(HqMetrics): """Prometheus Metrics Provider""" - def initialize(self): + def __init__(self): self._metrics = {} def _counter(self, name: str, value: float = 1, tags: dict = None, documentation: str = ''): From d4c3dc1b8f4f6ff443df985a1ea66c84accd9722 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 19 Mar 2020 10:52:08 +0200 Subject: [PATCH 38/40] simplify prometheus server --- deployment/{prometheum_server.py => prometheus_server.py} | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) rename deployment/{prometheum_server.py => prometheus_server.py} (90%) diff --git a/deployment/prometheum_server.py b/deployment/prometheus_server.py similarity index 90% rename from deployment/prometheum_server.py rename to deployment/prometheus_server.py index 1de11e7f2b04..7e0a7da74e8a 100644 --- a/deployment/prometheum_server.py +++ b/deployment/prometheus_server.py @@ -5,7 +5,6 @@ the value used by Django. 
""" import os -import threading from wsgiref.simple_server import make_server from prometheus_client import CollectorRegistry, make_wsgi_app, multiprocess @@ -22,5 +21,4 @@ app = make_wsgi_app(registry) httpd = make_server('', 9011, app, handler_class=_SilentHandler) -t = threading.Thread(target=httpd.serve_forever) -t.start() +httpd.serve_forever() From 1017c8512e5ad6f2f1f6728ab9f86397d2dfcdee Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 19 Mar 2020 11:28:47 +0200 Subject: [PATCH 39/40] doc updates --- corehq/util/metrics/__init__.py | 2 +- corehq/util/metrics/prometheus.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/corehq/util/metrics/__init__.py b/corehq/util/metrics/__init__.py index 80315e3462c2..5421865d9b53 100644 --- a/corehq/util/metrics/__init__.py +++ b/corehq/util/metrics/__init__.py @@ -27,7 +27,7 @@ Metric tagging ============== Metrics may be tagged by passing a dictionary of tag names and values. Tags should be used -add dimensions to a metric e.g. request type, response status. +to add dimensions to a metric e.g. request type, response status. Tags should not originate from unbounded sources or sources with high dimensionality such as timestamps, user IDs, request IDs etc. Ideally a tag should not have more than 10 possible values. diff --git a/corehq/util/metrics/prometheus.py b/corehq/util/metrics/prometheus.py index 3ddca3537153..00d66b24711a 100644 --- a/corehq/util/metrics/prometheus.py +++ b/corehq/util/metrics/prometheus.py @@ -28,10 +28,10 @@ def _histogram(self, name: str, value: float, bucket_tag: str, buckets: List[int during a scrape: * cumulative counters for the observation buckets, exposed as - `_bucket{le=""}` - * the total sum of all observed values, exposed as `_sum` - * the count of events that have been observed, exposed as `_count` - (identical to `_bucket{le="+Inf"}` above) + `_bucket{le=""}` + * the total sum of all observed values, exposed as `_sum` + * the count of events that have been observed, exposed as `_count` + (identical to `_bucket{le="+Inf"}` above) For example :: From d18075dd8a9a72005d12a35bf4f3386fc43a8629 Mon Sep 17 00:00:00 2001 From: Simon Kelly Date: Thu, 19 Mar 2020 12:27:08 +0200 Subject: [PATCH 40/40] Apply suggestions from code review Co-Authored-By: Norman Hooper --- corehq/util/metrics/datadog.py | 4 ++-- corehq/util/metrics/tests/utils.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/corehq/util/metrics/datadog.py b/corehq/util/metrics/datadog.py index 143aedebe41c..461c94b4048d 100644 --- a/corehq/util/metrics/datadog.py +++ b/corehq/util/metrics/datadog.py @@ -35,8 +35,8 @@ def __init__(self): if not settings.DATADOG_API_KEY or not settings.DATADOG_APP_KEY: raise Exception( - "Datadog not configured." - "Set DATADOG_API_KEY and DATADOG_APP_KEY in settings or update METRICS_PROVIDERS" + "Datadog not configured. " + "Set DATADOG_API_KEY and DATADOG_APP_KEY in settings or update METRICS_PROVIDERS " "to remove the Datadog provider." 
) diff --git a/corehq/util/metrics/tests/utils.py b/corehq/util/metrics/tests/utils.py index 3c012c0ff9d5..6c4c3edcc9ee 100644 --- a/corehq/util/metrics/tests/utils.py +++ b/corehq/util/metrics/tests/utils.py @@ -7,10 +7,9 @@ @contextmanager def patch_datadog(): def record(fn, name, value, tags=None): - key = tuple([ - name, - tuple(sorted(tuple(t.split(':')) for t in (tags or []))), - ]) + def get_tag_pairs(tags: list): + return tuple(sorted(tuple(t.split(':', 1)) for t in tags)) + key = (name, get_tag_pairs(tags or [])) stats[key].append(value) stats = defaultdict(list)
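The patches above leave callers with the module-level helpers exported from `corehq.util.metrics`, backed by whichever providers are listed in `METRICS_PROVIDERS` (or by the logging `DebugMetrics` when none are configured). The sketch below is assembled from the signatures and docstrings introduced in this series; the metric names, tag values, the sample value `12.3` and the `queued_sms_count` helper are illustrative placeholders, not code taken from the repository::

    from celery.schedules import crontab

    from corehq.util.metrics import (
        metrics_counter,
        metrics_gauge,
        metrics_gauge_task,
        metrics_histogram,
    )


    def queued_sms_count():
        # Placeholder; a real caller would query the queue length here.
        return 0


    # Counter: cumulative event count, tagged by domain
    # (metric names must start with the enforced 'commcare.' prefix).
    metrics_counter('commcare.case_import.count', 1,
                    tags={'domain': 'example-domain'})

    # Gauge: a point-in-time value that can go up or down.
    metrics_gauge('commcare.case_import.queue_length', queued_sms_count())

    # Histogram: DatadogMetrics emits a counter with a bucket tag derived
    # from the value; PrometheusMetrics records a native histogram.
    metrics_histogram(
        'commcare.case_import.duration', 12.3,
        bucket_tag='duration', buckets=[10, 50, 200, 1000], bucket_unit='s',
        tags={'domain': 'example-domain'},
    )

    # Periodic gauge: registers a celery periodic_task on background_queue
    # that records the wrapped function's return value on each run.
    queued_sms = metrics_gauge_task(
        'commcare.sms.queued', queued_sms_count, run_every=crontab()
    )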