Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCR metrics using Datadog for Rebuild and Rebuild-in-place functionality #35496

Merged
merged 18 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion corehq/apps/userreports/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from corehq.apps.change_feed import topics


TEMP_REPORT_PREFIX = '__tmp' # reports made by the report bulider use this
TEMP_REPORT_PREFIX = '__tmp' # reports made by the report builder use this

REPORT_BUILDER_EVENTS_KEY = 'REPORT_BUILDER_EVENTS_KEY'

Expand Down
8 changes: 8 additions & 0 deletions corehq/apps/userreports/sql/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ def get_table(self):
self.config, get_metadata(self.engine_id), override_table_name=self.override_table_name
)

@memoized
def get_existing_table_from_db(self):
    """Reflect this adapter's table directly from the database.

    Returns the reflected ``sqlalchemy.Table`` when it exists, or ``None``
    when the table has not been created yet.
    """
    table_name = self.get_table().name
    try:
        return sqlalchemy.Table(table_name, sqlalchemy.MetaData(), autoload_with=self.engine)
    except sqlalchemy.exc.NoSuchTableError:
        # Table was never built (or was dropped); callers check for None.
        return None

@property
def table_exists(self):
    """True when this adapter's table is present in the database."""
    return self.engine.has_table(self.get_table().name)
Expand Down
77 changes: 73 additions & 4 deletions corehq/apps/userreports/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
)
from corehq.apps.userreports.models import (
AsyncIndicator,
DataSourceActionLog,
id_is_static,
)
from corehq.apps.userreports.rebuild import DataSourceResumeHelper
Expand Down Expand Up @@ -118,8 +119,15 @@ def rebuild_indicators(
config.save()

skip_log = bool(limit > 0) # don't store log for temporary report builder UCRs
adapter.rebuild_table(initiated_by=initiated_by, source=source, skip_log=skip_log, diffs=diffs)
_iteratively_build_table(config, limit=limit)
rows_count_before_rebuild = _get_rows_count_from_existing_table(adapter)
try:
adapter.rebuild_table(initiated_by=initiated_by, source=source, skip_log=skip_log, diffs=diffs)
_iteratively_build_table(config, limit=limit)
except Exception:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Big nit: Is there a specific exception we're trying to catch here, or just generally making sure nothing goes wrong? If it's the former then catching the specific exception here instead of a generic Exception would make it clearer what could go wrong.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question.

Here the intention for the metric is to catch if a rebuild fails for any reason. The idea is to establish a relationship between a rebuild failure and the number of rows a datasource has. It has been observed that rebuilds fail for some datasources with a huge number of records.

On other note, I like the point of having clarity for what went wrong. Since we are not sure of the reason, I am wondering if the reason is being captured today. If not, we should probably log them into sentry. Will look into this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're not entirely sure what might go wrong, and if there are a few things that could go wrong, then totally fine to leave as is. Could be a good followup if we discover specific errors that we are expecting.

_report_ucr_rebuild_metrics(config, source, 'rebuild_datasource', adapter,
rows_count_before_rebuild, error=True)
raise
_report_ucr_rebuild_metrics(config, source, 'rebuild_datasource', adapter, rows_count_before_rebuild)


@serial_task(
Expand All @@ -138,8 +146,69 @@ def rebuild_indicators_in_place(indicator_config_id, initiated_by=None, source=N
config.meta.build.rebuilt_asynchronously = False
config.save()

adapter.build_table(initiated_by=initiated_by, source=source)
_iteratively_build_table(config, in_place=True)
rows_count_before_rebuild = _get_rows_count_from_existing_table(adapter)
try:
adapter.build_table(initiated_by=initiated_by, source=source)
_iteratively_build_table(config, in_place=True)
except Exception:
_report_ucr_rebuild_metrics(config, source, 'rebuild_datasource_in_place', adapter,
rows_count_before_rebuild, error=True)
raise
_report_ucr_rebuild_metrics(config, source, 'rebuild_datasource_in_place', adapter,
rows_count_before_rebuild)


def _get_rows_count_from_existing_table(adapter):
table = adapter.get_existing_table_from_db()
if table is not None:
return adapter.session_helper.Session.query(table).count()


def _report_ucr_rebuild_metrics(config, source, action, adapter, rows_count_before_rebuild, error=False):
    """Emit Datadog metrics for a UCR rebuild / rebuild-in-place.

    Only reports when the rebuild was initiated from the data source edit
    UI. Reporting is best-effort: any exception raised while gathering or
    sending metrics is swallowed so it can never fail the rebuild task.
    """
    ui_triggered_sources = ('edit_data_source_rebuild', 'edit_data_source_build_in_place')
    if source not in ui_triggered_sources:
        return
    try:
        _report_metric_number_of_days_since_first_build(config, action)
        if not error:
            _report_metric_increase_in_rows_count(config, action, adapter, rows_count_before_rebuild)
        else:
            _report_metric_rebuild_error(config, action)
    except Exception:
        # Metrics must never break the surrounding rebuild.
        pass


def _report_metric_number_of_days_since_first_build(config, action):
    """Report how many days have passed since this data source was first built.

    Uses the earliest BUILD/REBUILD entry in the action log; does nothing
    if no such entry exists.
    """
    build_actions = [DataSourceActionLog.BUILD, DataSourceActionLog.REBUILD]
    log_entries = DataSourceActionLog.objects.filter(
        domain=config.domain,
        indicator_config_id=config.get_id,
        action__in=build_actions,
    )
    try:
        first_build = log_entries.earliest('date_created')
    except DataSourceActionLog.DoesNotExist:
        return
    days_since_first_build = (datetime.utcnow() - first_build.date_created).days
    metrics_gauge(
        f'commcare.ucr.{action}.days_since_first_build',
        days_since_first_build,
        tags={'domain': config.domain},
    )


def _report_metric_rebuild_error(config, action):
    """On rebuild failure, report how many rows the rebuild was expected to process."""
    # Local import — presumably to avoid a circular import between tasks
    # and views; confirm before hoisting to module level.
    from corehq.apps.userreports.views import _number_of_records_to_be_iterated_for_rebuild

    expected_rows = _number_of_records_to_be_iterated_for_rebuild(config)
    metrics_gauge(
        f'commcare.ucr.{action}.failed.expected_rows_to_process',
        expected_rows,
        tags={'domain': config.domain},
    )


def _report_metric_increase_in_rows_count(config, action, adapter, rows_count_before_rebuild):
    """Report a counter when a rebuild left the table with more rows than before.

    Skipped when the pre-rebuild count is unknown (table did not exist) or
    when the data source is asynchronous, since the row count can only be
    obtained for synchronous rebuilds.
    """
    if rows_count_before_rebuild is None:
        return
    if config.asynchronous:
        return
    rows_count_after_rebuild = adapter.get_query_object().count()
    if rows_count_after_rebuild > rows_count_before_rebuild:
        metrics_counter(f'commcare.ucr.{action}.increase_in_rows', tags={'domain': config.domain})


@task(serializer='pickle', queue=UCR_CELERY_QUEUE, ignore_result=True, acks_late=True)
Expand Down
12 changes: 11 additions & 1 deletion corehq/apps/userreports/ui/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from corehq.apps.userreports.ui import help_text
from corehq.apps.userreports.ui.fields import JsonField, ReportDataSourceField
from corehq.apps.userreports.util import get_table_name
from corehq.util.metrics import metrics_counter


class DocumentFormBase(forms.Form):
Expand Down Expand Up @@ -271,7 +272,16 @@ def clean(self):
def save(self, commit=False):
    """Persist the edited data source, resetting its build status and
    reporting edit metrics after the save."""
    self.instance.meta.build.finished = False
    self.instance.meta.build.initiated = None
    saved_instance = super(ConfigurableDataSourceEditForm, self).save(commit)
    self._report_edit_datasource_metrics()
    return saved_instance

def _report_edit_datasource_metrics(self):
    """Emit Datadog counters for notable data source edits: changed filters
    and an increased number of configured columns."""
    changed_fields = self.changed_data
    if 'configured_filter' in changed_fields:
        metrics_counter('commcare.ucr.datasource.change_in_filters', tags={'domain': self.domain})
    if 'configured_indicators' in changed_fields:
        previous_indicators = self.initial.get('configured_indicators', [])
        # Only an increase matters — per product requirements, a rebuild
        # should be driven by the need for a new column.
        if len(self.instance.configured_indicators) > len(previous_indicators):
            metrics_counter('commcare.ucr.datasource.increase_in_columns', tags={'domain': self.domain})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we definitely not care about rebuilds where the user decreases the number of columns? (Maybe an AE question.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. I will check with AE on this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey Norman, I re-read the intention of this metric (from the document made by AE) and you are right in that we do not care about the decrease. It looks like the rebuild should mainly be driven by the need for an extra column.

Just quoting,
A rebuild datasource action should only be triggered if there is a new column



class ConfigurableDataSourceFromAppForm(forms.Form):
Expand Down
45 changes: 32 additions & 13 deletions corehq/apps/userreports/util.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,33 @@
import collections
import hashlib
import logging
import re
from dataclasses import dataclass
from typing import Any, Optional

from couchdbkit import ResourceNotFound
from django_prbac.utils import has_privilege
from dataclasses import dataclass

from dimagi.utils.couch.undo import is_deleted, remove_deleted_doc_type_suffix

from corehq import privileges, toggles
from corehq.apps.app_manager.dbaccessors import get_apps_in_domain
from corehq.apps.hqwebapp.templatetags.hq_shared_tags import toggle_enabled
from corehq.apps.linked_domain.util import is_linked_report
from corehq.apps.userreports.adapter import IndicatorAdapterLoadTracker
from corehq.apps.userreports.const import REPORT_BUILDER_EVENTS_KEY, TEMP_REPORT_PREFIX
from corehq.apps.userreports.exceptions import BadSpecError, ReportConfigurationNotFoundError, \
DataSourceConfigurationNotFoundError
from corehq.apps.userreports.const import (
REPORT_BUILDER_EVENTS_KEY,
TEMP_REPORT_PREFIX,
)
from corehq.apps.userreports.exceptions import (
BadSpecError,
DataSourceConfigurationNotFoundError,
ReportConfigurationNotFoundError,
)
from corehq.toggles import ENABLE_UCR_MIRRORS
from corehq.util import reverse
from corehq.util.couch import DocumentNotFound
from corehq.util.metrics.load_counters import ucr_load_counter
from dimagi.utils.couch.undo import is_deleted, remove_deleted_doc_type_suffix
import logging

UCR_TABLE_PREFIX = 'ucr_'
LEGACY_UCR_TABLE_PREFIX = 'config_report_'
Expand Down Expand Up @@ -161,6 +168,11 @@ def allowed_report_builder_reports(request):
return 0


def get_configurable_and_static_reports_for_data_source(domain, data_source_id):
    """Return all reports in the domain that are backed by the given data source."""
    all_reports = get_configurable_and_static_reports(domain)
    return [r for r in all_reports if r.config_id == data_source_id]


def get_configurable_and_static_reports(domain):
    """Return every report for the domain: user-configured reports plus
    code-defined static report configurations."""
    from corehq.apps.userreports.models import StaticReportConfiguration
    static_reports = StaticReportConfiguration.by_domain(domain)
    return get_existing_reports(domain) + static_reports
Expand Down Expand Up @@ -192,8 +204,12 @@ def number_of_ucr_reports(domain):


def get_indicator_adapter(config, raise_errors=False, load_source="unknown"):
from corehq.apps.userreports.sql.adapter import IndicatorSqlAdapter, ErrorRaisingIndicatorSqlAdapter, \
MultiDBSqlAdapter, ErrorRaisingMultiDBAdapter
from corehq.apps.userreports.sql.adapter import (
ErrorRaisingIndicatorSqlAdapter,
ErrorRaisingMultiDBAdapter,
IndicatorSqlAdapter,
MultiDBSqlAdapter,
)
requires_mirroring = config.mirrored_engine_ids
if requires_mirroring and ENABLE_UCR_MIRRORS.enabled(config.domain):
adapter_cls = ErrorRaisingMultiDBAdapter if raise_errors else MultiDBSqlAdapter
Expand Down Expand Up @@ -265,8 +281,11 @@ def get_async_indicator_modify_lock_key(doc_id):


def get_static_report_mapping(from_domain, to_domain):
from corehq.apps.userreports.models import StaticReportConfiguration, STATIC_PREFIX, \
CUSTOM_REPORT_PREFIX
from corehq.apps.userreports.models import (
CUSTOM_REPORT_PREFIX,
STATIC_PREFIX,
StaticReportConfiguration,
)

report_map = {}

Expand Down Expand Up @@ -316,9 +335,9 @@ def get_report_config_or_not_found(domain, config_id):

def get_ucr_datasource_config_by_id(indicator_config_id, allow_deleted=False):
from corehq.apps.userreports.models import (
id_is_static,
StaticDataSourceConfiguration,
DataSourceConfiguration,
StaticDataSourceConfiguration,
id_is_static,
)
if id_is_static(indicator_config_id):
return StaticDataSourceConfiguration.by_id(indicator_config_id)
Expand All @@ -344,8 +363,8 @@ def _wrap_data_source_by_doc_type(doc, allow_deleted=False):

def wrap_report_config_by_type(config, allow_deleted=False):
from corehq.apps.userreports.models import (
ReportConfiguration,
RegistryReportConfiguration,
ReportConfiguration,
)
if is_deleted(config) and not allow_deleted:
raise ReportConfigurationNotFoundError()
Expand Down
35 changes: 34 additions & 1 deletion corehq/apps/userreports/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@
from corehq.apps.userreports.util import (
add_event,
allowed_report_builder_reports,
get_configurable_and_static_reports_for_data_source,
get_indicator_adapter,
get_referring_apps,
has_report_builder_access,
Expand All @@ -179,6 +180,7 @@
from corehq.tabs.tabclasses import ProjectReportsTab
from corehq.util import reverse
from corehq.util.couch import get_document_or_404
from corehq.util.metrics import metrics_counter, metrics_gauge
from corehq.util.quickcache import quickcache
from corehq.util.soft_assert import soft_assert

Expand Down Expand Up @@ -1344,12 +1346,42 @@ def rebuild_data_source(request, domain, config_id):
)
)

rebuild_indicators.delay(config_id, request.user.username, domain=domain)
rebuild_indicators.delay(config_id, request.user.username, domain=domain, source='edit_data_source_rebuild')
_report_ucr_rebuild_metrics(domain, config, 'rebuild_datasource')
return HttpResponseRedirect(reverse(
EditDataSourceView.urlname, args=[domain, config._id]
))


def _report_ucr_rebuild_metrics(domain, config, action):
    """Report Datadog metrics when a user triggers a (re)build from the UI:
    a per-rebuild counter, the data source's column count, and the number
    of reports backed by the data source.

    The gauges are tagged by domain only (not by data source) to keep
    Datadog cardinality low; the intent is an averaged trend, filterable
    by domain.
    """
    metrics_counter(
        f'commcare.ucr.{action}.count',
        tags={
            'domain': domain,
            'datasource_id': config.get_id,
        },
    )
    metrics_gauge(
        f'commcare.ucr.{action}.columns.count',
        len(config.get_columns()),
        tags={'domain': domain},
    )
    _report_metric_report_counts_by_datasource(domain, config.get_id, action)


def _report_metric_report_counts_by_datasource(domain, data_source_id, action):
    """Report how many reports are backed by the data source being rebuilt.

    Best-effort: any failure looking up the reports is ignored so metric
    reporting never breaks the rebuild request.
    """
    try:
        backing_reports = get_configurable_and_static_reports_for_data_source(domain, data_source_id)
    except Exception:
        return
    metrics_gauge(
        f'commcare.ucr.{action}.reports_per_datasource.count',
        len(backing_reports),
        tags={'domain': domain},
    )
Comment on lines +1378 to +1382
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, I think you'll need to include the data source in the tag, otherwise the value for the domain will just keep alternating. Datadog will allow you to sum for all data sources in the domaain, or average across data sources. (Going with config ID this time just to be different :) )

Suggested change
metrics_gauge(
f'commcare.ucr.{action}.reports_per_datasource.count',
len(reports),
tags={'domain': domain}
)
metrics_gauge(
f'commcare.ucr.{action}.reports_per_datasource.count',
len(reports),
tags={
'domain': domain,
'data_source': config.get_id,
}
)



def _number_of_records_to_be_iterated_for_rebuild(datasource_configuration):
if datasource_configuration.referenced_doc_type == 'CommCareCase':
es_query = CaseSearchES().domain(datasource_configuration.domain)
Expand Down Expand Up @@ -1442,6 +1474,7 @@ def build_data_source_in_place(request, domain, config_id):
source='edit_data_source_build_in_place',
domain=config.domain,
)
_report_ucr_rebuild_metrics(domain, config, 'rebuild_datasource_in_place')
return HttpResponseRedirect(reverse(
EditDataSourceView.urlname, args=[domain, config._id]
))
Expand Down
Loading