Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

monitoring functional #26876

Merged
merged 43 commits into from
Mar 19, 2020
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
90bdf7d
move datadog init to AppConfig.ready
snopoke Mar 9, 2020
7ad93fa
pluggable metrics
snopoke Mar 10, 2020
0694926
add histogram and tests
snopoke Mar 10, 2020
c1f8270
bucket values less than or equal
snopoke Mar 10, 2020
5034156
add prometheus client to requirements
snopoke Mar 10, 2020
172ef37
Merge branch 'master' into sk/monitoring
snopoke Mar 11, 2020
2be4e6a
make metrics lazy
snopoke Mar 11, 2020
eb8afe3
example histogram
snopoke Mar 11, 2020
54e91c0
Merge branch 'master' into sk/monitoring
snopoke Mar 11, 2020
54f2d1b
convert submission metrics
snopoke Mar 11, 2020
eba69c4
docstrings
snopoke Mar 11, 2020
01fbc07
stickler
snopoke Mar 11, 2020
450f211
keep tag_values as dict instead of splitting and re-combining
snopoke Mar 12, 2020
2c3edef
update links
snopoke Mar 12, 2020
8664161
remove unnecessary list
snopoke Mar 12, 2020
0bcedea
replace tuple() with ()
snopoke Mar 12, 2020
9ba3d4a
Merge branch 'sk/monitoring' of github.com:dimagi/commcare-hq into sk…
snopoke Mar 12, 2020
904e358
fix tags
snopoke Mar 12, 2020
d68e104
pass other args
snopoke Mar 12, 2020
5c00052
revert change to datadog bucketing boundary
snopoke Mar 12, 2020
e00b7e1
remove unnecessary list
snopoke Mar 16, 2020
20e8a61
apply tags at the same time as recording the metric
snopoke Mar 16, 2020
0ab0006
dummy metric
snopoke Mar 16, 2020
02b40ee
functional interface
snopoke Mar 16, 2020
31cfba9
re-do configuration via settings
snopoke Mar 17, 2020
892e57b
move initialization into provider
snopoke Mar 17, 2020
dbdbd3f
replace datadog_gauge
snopoke Mar 17, 2020
5897eb9
instantiate provider
snopoke Mar 17, 2020
28a8a5b
hook up metrics view
snopoke Mar 17, 2020
abd0355
todo
snopoke Mar 17, 2020
75fa979
lint
snopoke Mar 17, 2020
017ca5a
PR feedback
snopoke Mar 18, 2020
770f4a0
log output from DebugMetrics
snopoke Mar 18, 2020
8bb9e03
move metrics view to hq/admin
snopoke Mar 18, 2020
9a49d26
move metrics_gauge_task to package init
snopoke Mar 18, 2020
74ce07f
fix import
snopoke Mar 18, 2020
f1874e4
add script for running metrics endpoint
snopoke Mar 18, 2020
d4220f0
update docs
snopoke Mar 18, 2020
0457c00
docs
snopoke Mar 18, 2020
3d79912
do setup in __init__
snopoke Mar 18, 2020
d4c3dc1
simplify prometheus server
snopoke Mar 19, 2020
1017c85
doc updates
snopoke Mar 19, 2020
d18075d
Apply suggestions from code review
snopoke Mar 19, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions corehq/apps/api/odata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
from collections import namedtuple

from corehq.apps.export.models import ExportInstance
from corehq.util.datadog.gauges import datadog_counter
from corehq.util.datadog.utils import bucket_value
from corehq.util.metrics import metrics_histogram

FieldMetadata = namedtuple('FieldMetadata', ['name', 'odata_type'])

Expand Down Expand Up @@ -64,14 +63,16 @@ def record_feed_access_in_datadog(request, config_id, duration, response):
column_count = len(rows[0])
except IndexError:
column_count = 0
datadog_counter('commcare.odata_feed.test_v3', tags=[
'domain:{}'.format(request.domain),
'feed_id:{}'.format(config_id),
'feed_type:{}'.format(config.type),
'username:{}'.format(username),
'row_count:{}'.format(row_count),
'column_count:{}'.format(column_count),
'size:{}'.format(len(response.content)),
'duration:{}'.format(duration),
'duration_bucket:{}'.format(bucket_value(duration, (1, 5, 20, 60, 120, 300, 600), 's')),
])
metrics_histogram(
'commcare.odata_feed.test_v3', duration,
bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s',
tags={
'domain': request.domain,
'feed_id': config_id,
'feed_type': config.type,
'username': username,
'row_count': row_count,
'column_count': column_count,
'size': len(response.content)
}
)
9 changes: 2 additions & 7 deletions corehq/apps/case_importer/tasks.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
from celery import states
from celery.exceptions import Ignore
from celery.schedules import crontab
from celery.task import task

from soil.progress import update_task_state

from corehq.apps.hqadmin.tasks import (
AbnormalUsageAlert,
send_abnormal_usage_alert,
)
from corehq.util.datadog.gauges import datadog_gauge_task

from .do_import import do_import
from .exceptions import ImporterError
from .tracking.analytics import get_case_upload_files_total_bytes
from .tracking.case_upload_tracker import CaseUpload
from .util import get_importer_error_message, exit_celery_with_error_message
from ...util.metrics.metrics import metrics_gauge_task


@task(serializer='pickle', queue='case_import_queue')
Expand Down Expand Up @@ -64,7 +59,7 @@ def _alert_on_result(result, domain):
send_abnormal_usage_alert.delay(alert)


total_bytes = datadog_gauge_task(
total_bytes = metrics_gauge_task(
'commcare.case_importer.files.total_bytes',
get_case_upload_files_total_bytes,
run_every=crontab(minute=0)
Expand Down
5 changes: 3 additions & 2 deletions corehq/apps/hqwebapp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from celery.task import task, periodic_task

from corehq.util.bounced_email_manager import BouncedEmailManager
from corehq.util.metrics.metrics import metrics_gauge_task
from dimagi.utils.logging import notify_exception

from corehq.util.datadog.gauges import datadog_gauge_task, datadog_track_errors
from corehq.util.datadog.gauges import datadog_track_errors
from corehq.util.log import send_HTML_email


Expand Down Expand Up @@ -129,5 +130,5 @@ def get_maintenance_alert_active():
return 1 if MaintenanceAlert.get_latest_alert() else 0


datadog_gauge_task('commcare.maintenance_alerts.active', get_maintenance_alert_active,
metrics_gauge_task('commcare.maintenance_alerts.active', get_maintenance_alert_active,
run_every=crontab(minute=1))
63 changes: 32 additions & 31 deletions corehq/apps/receiverwrapper/views.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
import logging
import os

from django.http import HttpResponseBadRequest, HttpResponseForbidden
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_POST

from couchdbkit import ResourceNotFound
from tastypie.http import HttpTooManyRequests

import couchforms
from casexml.apps.case.xform import get_case_updates, is_device_report
from couchforms import openrosa_response
Expand Down Expand Up @@ -50,13 +46,10 @@
convert_xform_to_json,
should_use_sql_backend,
)
from corehq.util.datadog.gauges import datadog_counter, datadog_gauge
from corehq.util.datadog.metrics import (
MULTIMEDIA_SUBMISSION_ERROR_COUNT,
XFORM_LOCKED_COUNT,
)
from corehq.util.datadog.utils import bucket_value
from corehq.util.metrics import metrics_counter, metrics_histogram
from corehq.util.timer import TimingContext
from couchdbkit import ResourceNotFound
from tastypie.http import HttpTooManyRequests

PROFILE_PROBABILITY = float(os.getenv('COMMCARE_PROFILE_SUBMISSION_PROBABILITY', 0))
PROFILE_LIMIT = os.getenv('COMMCARE_PROFILE_SUBMISSION_LIMIT')
Expand All @@ -70,10 +63,10 @@ def _process_form(request, domain, app_id, user_id, authenticated,
if rate_limit_submission(domain):
return HttpTooManyRequests()

metric_tags = [
'backend:sql' if should_use_sql_backend(domain) else 'backend:couch',
'domain:{}'.format(domain),
]
metric_tags = {
'backend': 'sql' if should_use_sql_backend(domain) else 'couch',
'domain': domain
}

try:
instance, attachments = couchforms.get_instance_and_attachment(request)
Expand All @@ -85,9 +78,11 @@ def _process_form(request, domain, app_id, user_id, authenticated,
except:
meta = {}

metrics_counter('commcare.corrupt_multimedia_submissions', tags={
'domain': domain, 'authenticated': authenticated
})
return _submission_error(
request, "Received a submission with POST.keys()",
MULTIMEDIA_SUBMISSION_ERROR_COUNT, metric_tags,
request, "Received a submission with POST.keys()", metric_tags,
domain, app_id, user_id, authenticated, meta,
)

Expand Down Expand Up @@ -133,8 +128,11 @@ def _process_form(request, domain, app_id, user_id, authenticated,
try:
result = submission_post.run()
except XFormLockError as err:
metrics_counter('commcare.xformlocked.count', tags={
'domain': domain, 'authenticated': authenticated
})
return _submission_error(
request, "XFormLockError: %s" % err, XFORM_LOCKED_COUNT,
request, "XFormLockError: %s" % err,
metric_tags, domain, app_id, user_id, authenticated, status=423,
notify=False,
)
Expand All @@ -145,7 +143,7 @@ def _process_form(request, domain, app_id, user_id, authenticated,
return response


def _submission_error(request, message, count_metric, metric_tags,
def _submission_error(request, message, metric_tags,
domain, app_id, user_id, authenticated, meta=None, status=400,
notify=True):
"""Notify exception, datadog count, record metrics, construct response
Expand All @@ -157,7 +155,6 @@ def _submission_error(request, message, count_metric, metric_tags,
"domain:{}".format(domain),
"authenticated:{}".format(authenticated),
]
datadog_counter(count_metric, tags=details)
if notify:
details.extend([
"user_id:{}".format(user_id),
Expand All @@ -172,24 +169,28 @@ def _submission_error(request, message, count_metric, metric_tags,


def _record_metrics(tags, submission_type, response, timer=None, xform=None):
    """Record submission metrics for a processed form.

    Always increments ``commcare.xform_submissions.count``. Additionally
    records a lag histogram when the form carries both ``metadata.timeEnd``
    and ``received_on``, and a duration histogram when ``timer`` is given.

    :param tags: dict of tag name -> value. NOTE: updated in place with
        ``submission_type`` and ``status_code``, so the caller's dict changes.
    :param submission_type: value stored under the ``submission_type`` tag.
    :param response: response object; only ``status_code`` is read.
    :param timer: optional object exposing ``duration``, passed as the
        histogram value with bucket unit ``'s'``.
    :param xform: optional form exposing ``metadata.timeEnd`` and
        ``received_on``.
    """
    # Tags are a dict; the earlier list-style ``tags += [...]`` statements
    # cannot be mixed with it (dict += list raises TypeError), so bucketing is
    # delegated to ``metrics_histogram`` through ``bucket_tag`` instead.
    tags.update({
        'submission_type': submission_type,
        'status_code': response.status_code
    })

    if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on:
        lag = xform.received_on - xform.metadata.timeEnd
        lag_days = lag.total_seconds() / 86400  # seconds per day
        metrics_histogram(
            'commcare.xform_submissions.lag.days', lag_days,
            bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d',
            tags=tags
        )

    if timer:
        metrics_histogram(
            'commcare.xform_submissions.duration.seconds', timer.duration,
            bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s',
            tags=tags
        )

    # Count the submission exactly once, through the pluggable metrics API.
    metrics_counter('commcare.xform_submissions.count', tags=tags)


@location_safe
Expand Down
5 changes: 3 additions & 2 deletions corehq/apps/sms/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from celery.schedules import crontab

from corehq.util.metrics.metrics import metrics_gauge_task
from dimagi.utils.couch import (
CriticalSection,
get_redis_client,
Expand Down Expand Up @@ -50,7 +51,7 @@
from corehq.apps.users.models import CommCareUser, CouchUser
from corehq.messaging.util import use_phone_entries
from corehq.util.celery_utils import no_result_task
from corehq.util.datadog.gauges import datadog_counter, datadog_gauge_task
from corehq.util.datadog.gauges import datadog_counter
from corehq.util.timezones.conversions import ServerTime

MAX_TRIAL_SMS = 50
Expand Down Expand Up @@ -588,4 +589,4 @@ def queued_sms():
return QueuedSMS.objects.count()


datadog_gauge_task('commcare.sms.queued', queued_sms, run_every=crontab())
metrics_gauge_task('commcare.sms.queued', queued_sms, run_every=crontab())
4 changes: 2 additions & 2 deletions corehq/motech/repeaters/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from celery.task import periodic_task, task
from celery.utils.log import get_task_logger

from corehq.util.metrics.metrics import metrics_gauge_task
from dimagi.utils.couch import get_redis_lock
from dimagi.utils.couch.undo import DELETED_SUFFIX

Expand All @@ -25,7 +26,6 @@
from corehq.util.datadog.gauges import (
datadog_bucket_timer,
datadog_counter,
datadog_gauge_task,
)
from corehq.util.datadog.utils import make_buckets_from_timedeltas
from corehq.util.soft_assert import soft_assert
Expand Down Expand Up @@ -137,7 +137,7 @@ def process_repeat_record(repeat_record):
logging.exception('Failed to process repeat record: {}'.format(repeat_record._id))


repeaters_overdue = datadog_gauge_task(
repeaters_overdue = metrics_gauge_task(
'commcare.repeaters.overdue',
get_overdue_repeat_record_count,
run_every=crontab() # every minute
Expand Down
34 changes: 0 additions & 34 deletions corehq/util/datadog/gauges.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,6 @@
from corehq.util.timer import TimingContext


def datadog_gauge_task(name, fn, run_every, enforce_prefix='commcare'):
    """
    Register a periodic task that reports the result of ``fn`` as a datadog gauge.

    Typical use, in an app's tasks.py:

        my_calculation = datadog_gauge_task('my.datadog.metric', my_calculation_function,
                                            run_every=crontab(minute=0))

    The metric name is checked against ``enforce_prefix`` before the task is
    created; the periodic task itself is returned.
    """
    _enforce_prefix(name, enforce_prefix)
    return _DatadogGauge(name, fn, run_every).periodic_task()


def datadog_histogram(name, value, enforce_prefix='commcare', tags=None):
"""
Usage: Used to track the statistical distribution of a set of values over a statsd flush period.
Expand Down Expand Up @@ -100,23 +83,6 @@ def new_stop(name=None):
return timer


class _DatadogGauge(object):
    """Bundle a gauge name, the function producing its value, and a schedule."""

    def __init__(self, name, fn, run_every):
        self.name, self.fn, self.run_every = name, fn, run_every

    def periodic_task(self):
        """Return a periodic task that sends ``fn``'s result to ``statsd.gauge``."""
        register = periodic_task(
            serializer='pickle', queue='background_queue', run_every=self.run_every,
            acks_late=True, ignore_result=True,
        )

        # ``wraps`` copies fn's metadata onto the wrapper before it is registered.
        @wraps(self.fn)
        def report_gauge(*args, **kwargs):
            value = self.fn(*args, **kwargs)
            statsd.gauge(self.name, value)

        return register(report_gauge)


def _enforce_prefix(name, prefix):
soft_assert(fail_if_debug=True).call(
not prefix or name.split('.')[0] == prefix,
Expand Down
2 changes: 0 additions & 2 deletions corehq/util/datadog/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@
ERROR_COUNT = 'commcare.error.count'
REPEATER_ERROR_COUNT = 'commcare.repeaters.error'
REPEATER_SUCCESS_COUNT = 'commcare.repeaters.success'
MULTIMEDIA_SUBMISSION_ERROR_COUNT = 'commcare.corrupt-multimedia-submission.error.count'
DATE_OPENED_CASEBLOCK_ERROR_COUNT = 'commcare.date-opened-caseblock-bug.error.count'
XFORM_LOCKED_COUNT = 'commcare.xformlocked.count'
49 changes: 49 additions & 0 deletions corehq/util/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from typing import Iterable
snopoke marked this conversation as resolved.
Show resolved Hide resolved

import settings
from corehq.util.metrics.metrics import DummyMetrics, DelegatedMetrics, DEFAULT_BUCKETS
from dimagi.utils.modules import to_function

__all__ = [
'metrics_counter',
'metrics_gauge',
'metrics_histogram',
]

_metrics = None


def _get_metrics_provider():
    """Return the shared metrics provider, building it on first use.

    Every dotted path in ``settings.METRICS_PROVIDERS`` is resolved with
    ``to_function``, instantiated without arguments and ``initialize()``-d.
    The outcome is cached in the module-level ``_metrics``:

    * no providers configured -> a ``DummyMetrics`` instance
    * exactly one provider    -> that provider itself
    * more than one provider  -> ``DelegatedMetrics`` wrapping all of them
    """
    global _metrics
    # Compare with None instead of relying on truthiness: a provider object
    # that happens to evaluate falsy must not be rebuilt and re-initialized
    # on every call.
    if _metrics is None:
        providers = []
        for provider_path in settings.METRICS_PROVIDERS:
            provider = to_function(provider_path)()
            provider.initialize()
            providers.append(provider)

        if not providers:
            _metrics = DummyMetrics()
        elif len(providers) > 1:
            _metrics = DelegatedMetrics(providers)
        else:
            _metrics = providers[0]
    return _metrics


def metrics_counter(name: str, value: float = 1, tags: dict = None, documentation: str = ''):
    """Forward a counter update to the configured metrics provider.

    All arguments are passed positionally, unchanged, to the provider's
    ``counter`` method. Nothing is returned.
    """
    _get_metrics_provider().counter(name, value, tags, documentation)


def metrics_gauge(name: str, value: float, tags: dict = None, documentation: str = ''):
    """Forward a gauge value to the configured metrics provider.

    All arguments are passed positionally, unchanged, to the provider's
    ``gauge`` method. Nothing is returned.
    """
    _get_metrics_provider().gauge(name, value, tags, documentation)


def metrics_histogram(
        name: str, value: float,
        bucket_tag: str, buckets: Iterable[int] = DEFAULT_BUCKETS, bucket_unit: str = '',
        tags: dict = None, documentation: str = ''):
    """Forward a histogram observation to the configured metrics provider.

    All arguments are passed positionally, unchanged, to the provider's
    ``histogram`` method; ``buckets`` falls back to ``DEFAULT_BUCKETS``.
    Nothing is returned.
    """
    _get_metrics_provider().histogram(
        name, value, bucket_tag, buckets, bucket_unit, tags, documentation
    )
Loading