Skip to content

Commit

Permalink
Merge pull request #26876 from dimagi/sk/monitoring-functional
Browse files Browse the repository at this point in the history
monitoring functional
  • Loading branch information
snopoke authored Mar 19, 2020
2 parents 4980bc0 + d18075d commit b5b3fba
Show file tree
Hide file tree
Showing 32 changed files with 787 additions and 107 deletions.
27 changes: 14 additions & 13 deletions corehq/apps/api/odata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
from collections import namedtuple

from corehq.apps.export.models import ExportInstance
from corehq.util.datadog.gauges import datadog_counter
from corehq.util.datadog.utils import bucket_value
from corehq.util.metrics import metrics_histogram

FieldMetadata = namedtuple('FieldMetadata', ['name', 'odata_type'])

Expand Down Expand Up @@ -64,14 +63,16 @@ def record_feed_access_in_datadog(request, config_id, duration, response):
column_count = len(rows[0])
except IndexError:
column_count = 0
datadog_counter('commcare.odata_feed.test_v3', tags=[
'domain:{}'.format(request.domain),
'feed_id:{}'.format(config_id),
'feed_type:{}'.format(config.type),
'username:{}'.format(username),
'row_count:{}'.format(row_count),
'column_count:{}'.format(column_count),
'size:{}'.format(len(response.content)),
'duration:{}'.format(duration),
'duration_bucket:{}'.format(bucket_value(duration, (1, 5, 20, 60, 120, 300, 600), 's')),
])
metrics_histogram(
'commcare.odata_feed.test_v3', duration,
bucket_tag='duration_bucket', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s',
tags={
'domain': request.domain,
'feed_id': config_id,
'feed_type': config.type,
'username': username,
'row_count': row_count,
'column_count': column_count,
'size': len(response.content)
}
)
9 changes: 2 additions & 7 deletions corehq/apps/case_importer/tasks.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
from celery import states
from celery.exceptions import Ignore
from celery.schedules import crontab
from celery.task import task

from soil.progress import update_task_state

from corehq.apps.hqadmin.tasks import (
AbnormalUsageAlert,
send_abnormal_usage_alert,
)
from corehq.util.datadog.gauges import datadog_gauge_task

from .do_import import do_import
from .exceptions import ImporterError
from .tracking.analytics import get_case_upload_files_total_bytes
from .tracking.case_upload_tracker import CaseUpload
from .util import get_importer_error_message, exit_celery_with_error_message
from ...util.metrics import metrics_gauge_task


@task(serializer='pickle', queue='case_import_queue')
Expand Down Expand Up @@ -64,7 +59,7 @@ def _alert_on_result(result, domain):
send_abnormal_usage_alert.delay(alert)


total_bytes = datadog_gauge_task(
total_bytes = metrics_gauge_task(
'commcare.case_importer.files.total_bytes',
get_case_upload_files_total_bytes,
run_every=crontab(minute=0)
Expand Down
5 changes: 3 additions & 2 deletions corehq/apps/hqwebapp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from celery.task import task, periodic_task

from corehq.util.bounced_email_manager import BouncedEmailManager
from corehq.util.metrics import metrics_gauge_task
from dimagi.utils.logging import notify_exception

from corehq.util.datadog.gauges import datadog_gauge_task, datadog_track_errors
from corehq.util.datadog.gauges import datadog_track_errors
from corehq.util.log import send_HTML_email


Expand Down Expand Up @@ -129,5 +130,5 @@ def get_maintenance_alert_active():
return 1 if MaintenanceAlert.get_latest_alert() else 0


datadog_gauge_task('commcare.maintenance_alerts.active', get_maintenance_alert_active,
metrics_gauge_task('commcare.maintenance_alerts.active', get_maintenance_alert_active,
run_every=crontab(minute=1))
63 changes: 32 additions & 31 deletions corehq/apps/receiverwrapper/views.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
import logging
import os

from django.http import HttpResponseBadRequest, HttpResponseForbidden
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_POST

from couchdbkit import ResourceNotFound
from tastypie.http import HttpTooManyRequests

import couchforms
from casexml.apps.case.xform import get_case_updates, is_device_report
from couchforms import openrosa_response
Expand Down Expand Up @@ -50,13 +46,10 @@
convert_xform_to_json,
should_use_sql_backend,
)
from corehq.util.datadog.gauges import datadog_counter, datadog_gauge
from corehq.util.datadog.metrics import (
MULTIMEDIA_SUBMISSION_ERROR_COUNT,
XFORM_LOCKED_COUNT,
)
from corehq.util.datadog.utils import bucket_value
from corehq.util.metrics import metrics_counter, metrics_histogram
from corehq.util.timer import TimingContext
from couchdbkit import ResourceNotFound
from tastypie.http import HttpTooManyRequests

PROFILE_PROBABILITY = float(os.getenv('COMMCARE_PROFILE_SUBMISSION_PROBABILITY', 0))
PROFILE_LIMIT = os.getenv('COMMCARE_PROFILE_SUBMISSION_LIMIT')
Expand All @@ -70,10 +63,10 @@ def _process_form(request, domain, app_id, user_id, authenticated,
if rate_limit_submission(domain):
return HttpTooManyRequests()

metric_tags = [
'backend:sql' if should_use_sql_backend(domain) else 'backend:couch',
'domain:{}'.format(domain),
]
metric_tags = {
'backend': 'sql' if should_use_sql_backend(domain) else 'couch',
'domain': domain
}

try:
instance, attachments = couchforms.get_instance_and_attachment(request)
Expand All @@ -85,9 +78,11 @@ def _process_form(request, domain, app_id, user_id, authenticated,
except:
meta = {}

metrics_counter('commcare.corrupt_multimedia_submissions', tags={
'domain': domain, 'authenticated': authenticated
})
return _submission_error(
request, "Received a submission with POST.keys()",
MULTIMEDIA_SUBMISSION_ERROR_COUNT, metric_tags,
request, "Received a submission with POST.keys()", metric_tags,
domain, app_id, user_id, authenticated, meta,
)

Expand Down Expand Up @@ -133,8 +128,11 @@ def _process_form(request, domain, app_id, user_id, authenticated,
try:
result = submission_post.run()
except XFormLockError as err:
metrics_counter('commcare.xformlocked.count', tags={
'domain': domain, 'authenticated': authenticated
})
return _submission_error(
request, "XFormLockError: %s" % err, XFORM_LOCKED_COUNT,
request, "XFormLockError: %s" % err,
metric_tags, domain, app_id, user_id, authenticated, status=423,
notify=False,
)
Expand All @@ -145,7 +143,7 @@ def _process_form(request, domain, app_id, user_id, authenticated,
return response


def _submission_error(request, message, count_metric, metric_tags,
def _submission_error(request, message, metric_tags,
domain, app_id, user_id, authenticated, meta=None, status=400,
notify=True):
"""Notify exception, record metrics, construct response
Expand All @@ -157,7 +155,6 @@ def _submission_error(request, message, count_metric, metric_tags,
"domain:{}".format(domain),
"authenticated:{}".format(authenticated),
]
datadog_counter(count_metric, tags=details)
if notify:
details.extend([
"user_id:{}".format(user_id),
Expand All @@ -172,24 +169,28 @@ def _submission_error(request, message, count_metric, metric_tags,


def _record_metrics(tags, submission_type, response, timer=None, xform=None):
tags.update({
'submission_type': submission_type,
'status_code': response.status_code
})

if xform and xform.metadata and xform.metadata.timeEnd and xform.received_on:
lag = xform.received_on - xform.metadata.timeEnd
lag_days = lag.total_seconds() / 86400
tags += [
'lag:%s' % bucket_value(lag_days, (1, 2, 4, 7, 14, 31, 90), 'd')
]

tags += [
'submission_type:{}'.format(submission_type),
'status_code:{}'.format(response.status_code)
]
metrics_histogram(
'commcare.xform_submissions.lag.days', lag_days,
bucket_tag='lag', buckets=(1, 2, 4, 7, 14, 31, 90), bucket_unit='d',
tags=tags
)

if timer:
tags += [
'duration:%s' % bucket_value(timer.duration, (1, 5, 20, 60, 120, 300, 600), 's'),
]
metrics_histogram(
'commcare.xform_submissions.duration.seconds', timer.duration,
bucket_tag='duration', buckets=(1, 5, 20, 60, 120, 300, 600), bucket_unit='s',
tags=tags
)

datadog_counter('commcare.xform_submissions.count', tags=tags)
metrics_counter('commcare.xform_submissions.count', tags=tags)


@location_safe
Expand Down
5 changes: 3 additions & 2 deletions corehq/apps/sms/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from celery.schedules import crontab

from corehq.util.metrics import metrics_gauge_task
from dimagi.utils.couch import (
CriticalSection,
get_redis_client,
Expand Down Expand Up @@ -50,7 +51,7 @@
from corehq.apps.users.models import CommCareUser, CouchUser
from corehq.messaging.util import use_phone_entries
from corehq.util.celery_utils import no_result_task
from corehq.util.datadog.gauges import datadog_counter, datadog_gauge_task
from corehq.util.datadog.gauges import datadog_counter
from corehq.util.timezones.conversions import ServerTime

MAX_TRIAL_SMS = 50
Expand Down Expand Up @@ -588,4 +589,4 @@ def queued_sms():
return QueuedSMS.objects.count()


datadog_gauge_task('commcare.sms.queued', queued_sms, run_every=crontab())
metrics_gauge_task('commcare.sms.queued', queued_sms, run_every=crontab())
4 changes: 2 additions & 2 deletions corehq/motech/repeaters/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from celery.task import periodic_task, task
from celery.utils.log import get_task_logger

from corehq.util.metrics import metrics_gauge_task
from dimagi.utils.couch import get_redis_lock
from dimagi.utils.couch.undo import DELETED_SUFFIX

Expand All @@ -25,7 +26,6 @@
from corehq.util.datadog.gauges import (
datadog_bucket_timer,
datadog_counter,
datadog_gauge_task,
)
from corehq.util.datadog.utils import make_buckets_from_timedeltas
from corehq.util.soft_assert import soft_assert
Expand Down Expand Up @@ -137,7 +137,7 @@ def process_repeat_record(repeat_record):
logging.exception('Failed to process repeat record: {}'.format(repeat_record._id))


repeaters_overdue = datadog_gauge_task(
repeaters_overdue = metrics_gauge_task(
'commcare.repeaters.overdue',
get_overdue_repeat_record_count,
run_every=crontab() # every minute
Expand Down
34 changes: 0 additions & 34 deletions corehq/util/datadog/gauges.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,6 @@
from corehq.util.timer import TimingContext


def datadog_gauge_task(name, fn, run_every, enforce_prefix='commcare'):
"""
helper for easily registering datadog gauges to run periodically
To update a datadog gauge on a schedule based on the result of a function
just add to your app's tasks.py:
my_calculation = datadog_gauge_task('my.datadog.metric', my_calculation_function,
run_every=crontab(minute=0))
"""
_enforce_prefix(name, enforce_prefix)

datadog_gauge = _DatadogGauge(name, fn, run_every)
return datadog_gauge.periodic_task()


def datadog_histogram(name, value, enforce_prefix='commcare', tags=None):
"""
Usage: Used to track the statistical distribution of a set of values over a statsd flush period.
Expand Down Expand Up @@ -100,23 +83,6 @@ def new_stop(name=None):
return timer


class _DatadogGauge(object):

def __init__(self, name, fn, run_every):
self.name = name
self.fn = fn
self.run_every = run_every

def periodic_task(self):
@periodic_task(serializer='pickle', queue='background_queue', run_every=self.run_every,
acks_late=True, ignore_result=True)
@wraps(self.fn)
def inner(*args, **kwargs):
statsd.gauge(self.name, self.fn(*args, **kwargs))

return inner


def _enforce_prefix(name, prefix):
soft_assert(fail_if_debug=True).call(
not prefix or name.split('.')[0] == prefix,
Expand Down
2 changes: 0 additions & 2 deletions corehq/util/datadog/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@
ERROR_COUNT = 'commcare.error.count'
REPEATER_ERROR_COUNT = 'commcare.repeaters.error'
REPEATER_SUCCESS_COUNT = 'commcare.repeaters.success'
MULTIMEDIA_SUBMISSION_ERROR_COUNT = 'commcare.corrupt-multimedia-submission.error.count'
DATE_OPENED_CASEBLOCK_ERROR_COUNT = 'commcare.date-opened-caseblock-bug.error.count'
XFORM_LOCKED_COUNT = 'commcare.xformlocked.count'
Loading

0 comments on commit b5b3fba

Please sign in to comment.