-
-
Notifications
You must be signed in to change notification settings - Fork 222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UCR metrics using Datadog for Rebuild and Rebuild-in-place functionality #35496
Changes from all commits
e04c02b
3f435ae
582775c
484777b
6b6331a
928e915
66b3dc0
a2ea739
4cc390c
52b27d8
42cd8b8
e37b241
a5c51e5
011eb70
f4bc32e
4864944
1db397a
50accc3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,6 +39,7 @@ | |
) | ||
from corehq.apps.userreports.models import ( | ||
AsyncIndicator, | ||
DataSourceActionLog, | ||
id_is_static, | ||
) | ||
from corehq.apps.userreports.rebuild import DataSourceResumeHelper | ||
|
@@ -118,8 +119,15 @@ def rebuild_indicators( | |
config.save() | ||
|
||
skip_log = bool(limit > 0) # don't store log for temporary report builder UCRs | ||
adapter.rebuild_table(initiated_by=initiated_by, source=source, skip_log=skip_log, diffs=diffs) | ||
_iteratively_build_table(config, limit=limit) | ||
rows_count_before_rebuild = _get_rows_count_from_existing_table(adapter) | ||
try: | ||
adapter.rebuild_table(initiated_by=initiated_by, source=source, skip_log=skip_log, diffs=diffs) | ||
_iteratively_build_table(config, limit=limit) | ||
except Exception: | ||
_report_ucr_rebuild_metrics(config, source, 'rebuild_datasource', adapter, | ||
rows_count_before_rebuild, error=True) | ||
raise | ||
_report_ucr_rebuild_metrics(config, source, 'rebuild_datasource', adapter, rows_count_before_rebuild) | ||
|
||
|
||
@serial_task( | ||
|
@@ -138,8 +146,69 @@ def rebuild_indicators_in_place(indicator_config_id, initiated_by=None, source=N | |
config.meta.build.rebuilt_asynchronously = False | ||
config.save() | ||
|
||
adapter.build_table(initiated_by=initiated_by, source=source) | ||
_iteratively_build_table(config, in_place=True) | ||
rows_count_before_rebuild = _get_rows_count_from_existing_table(adapter) | ||
try: | ||
adapter.build_table(initiated_by=initiated_by, source=source) | ||
_iteratively_build_table(config, in_place=True) | ||
except Exception: | ||
_report_ucr_rebuild_metrics(config, source, 'rebuild_datasource_in_place', adapter, | ||
rows_count_before_rebuild, error=True) | ||
raise | ||
_report_ucr_rebuild_metrics(config, source, 'rebuild_datasource_in_place', adapter, | ||
rows_count_before_rebuild) | ||
|
||
|
||
def _get_rows_count_from_existing_table(adapter): | ||
table = adapter.get_existing_table_from_db() | ||
if table is not None: | ||
return adapter.session_helper.Session.query(table).count() | ||
|
||
|
||
def _report_ucr_rebuild_metrics(config, source, action, adapter, rows_count_before_rebuild, error=False): | ||
if source not in ('edit_data_source_rebuild', 'edit_data_source_build_in_place'): | ||
return | ||
try: | ||
_report_metric_number_of_days_since_first_build(config, action) | ||
if error: | ||
_report_metric_rebuild_error(config, action) | ||
else: | ||
_report_metric_increase_in_rows_count(config, action, adapter, rows_count_before_rebuild) | ||
except Exception: | ||
pass | ||
|
||
|
||
def _report_metric_number_of_days_since_first_build(config, action): | ||
try: | ||
earliest_entry = DataSourceActionLog.objects.filter( | ||
domain=config.domain, | ||
indicator_config_id=config.get_id, | ||
action__in=[DataSourceActionLog.BUILD, DataSourceActionLog.REBUILD] | ||
).earliest('date_created') | ||
except DataSourceActionLog.DoesNotExist: | ||
pass | ||
else: | ||
no_of_days = (datetime.utcnow() - earliest_entry.date_created).days | ||
metrics_gauge(f'commcare.ucr.{action}.days_since_first_build', no_of_days, tags={'domain': config.domain}) | ||
|
||
|
||
def _report_metric_rebuild_error(config, action): | ||
from corehq.apps.userreports.views import _number_of_records_to_be_iterated_for_rebuild | ||
zandre-eng marked this conversation as resolved.
Show resolved
Hide resolved
|
||
expected_rows_to_process = _number_of_records_to_be_iterated_for_rebuild(config) | ||
metrics_gauge( | ||
f'commcare.ucr.{action}.failed.expected_rows_to_process', | ||
expected_rows_to_process, | ||
tags={'domain': config.domain} | ||
) | ||
|
||
|
||
def _report_metric_increase_in_rows_count(config, action, adapter, rows_count_before_rebuild): | ||
if rows_count_before_rebuild is None: | ||
return | ||
# Row count can only be obtained for synchronous rebuilds. | ||
if not config.asynchronous: | ||
rows_count_after_rebuild = adapter.get_query_object().count() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: minor but this would "expected" count after rebuild, though that count eventually could be higher or lower than this number here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My understanding is that it would be the actual count as we are fetching the rows count from the database. Also testing confirms this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we calling this after the rebuild has finished? Is this the count from the table or the count from the query the table will perform? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Do you mean the For this code, this particular method is called by
It is the count from the table. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
So if its run before the rebuild, the table would have 0 rows or old rows? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Good question. You are right. So there are two scenarios here: One good example is to to Preview Data where this is called.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, so won't the metric be reported wrong in that case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Apologies. I did not state clearly above. 'For this code, this particular method is called by _report_ucr_rebuild_metrics which is used here at the end after rebuild code is finished.' The metric code (as started above in the earlier comment) is only called inside the rebuild task at the end when the rebuild operation finishes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. okay |
||
if rows_count_after_rebuild > rows_count_before_rebuild: | ||
metrics_counter(f'commcare.ucr.{action}.increase_in_rows', tags={'domain': config.domain}) | ||
|
||
|
||
@task(serializer='pickle', queue=UCR_CELERY_QUEUE, ignore_result=True, acks_late=True) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
from corehq.apps.userreports.ui import help_text | ||
from corehq.apps.userreports.ui.fields import JsonField, ReportDataSourceField | ||
from corehq.apps.userreports.util import get_table_name | ||
from corehq.util.metrics import metrics_counter | ||
|
||
|
||
class DocumentFormBase(forms.Form): | ||
|
@@ -271,7 +272,16 @@ def clean(self): | |
def save(self, commit=False): | ||
self.instance.meta.build.finished = False | ||
self.instance.meta.build.initiated = None | ||
return super(ConfigurableDataSourceEditForm, self).save(commit) | ||
instance = super(ConfigurableDataSourceEditForm, self).save(commit) | ||
self._report_edit_datasource_metrics() | ||
return instance | ||
|
||
def _report_edit_datasource_metrics(self): | ||
if 'configured_filter' in self.changed_data: | ||
metrics_counter('commcare.ucr.datasource.change_in_filters', tags={'domain': self.domain}) | ||
if 'configured_indicators' in self.changed_data: | ||
if len(self.instance.configured_indicators) > len(self.initial.get('configured_indicators', [])): | ||
metrics_counter('commcare.ucr.datasource.increase_in_columns', tags={'domain': self.domain}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we definitely not care about rebuilds where the user decreases the number of columns? (Maybe an AE question.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question. I will check with AE on this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey Norman, I re-read the intention of this metric (frrom the document made by AE) and you are right in that we do not care about the decrease. It looks the rebuild should majorly be driven when there is need for an extra column. Just quoting, |
||
|
||
|
||
class ConfigurableDataSourceFromAppForm(forms.Form): | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -161,6 +161,7 @@ | |||||||||||||||||||||||||||
from corehq.apps.userreports.util import ( | ||||||||||||||||||||||||||||
add_event, | ||||||||||||||||||||||||||||
allowed_report_builder_reports, | ||||||||||||||||||||||||||||
get_configurable_and_static_reports_for_data_source, | ||||||||||||||||||||||||||||
get_indicator_adapter, | ||||||||||||||||||||||||||||
get_referring_apps, | ||||||||||||||||||||||||||||
has_report_builder_access, | ||||||||||||||||||||||||||||
|
@@ -179,6 +180,7 @@ | |||||||||||||||||||||||||||
from corehq.tabs.tabclasses import ProjectReportsTab | ||||||||||||||||||||||||||||
from corehq.util import reverse | ||||||||||||||||||||||||||||
from corehq.util.couch import get_document_or_404 | ||||||||||||||||||||||||||||
from corehq.util.metrics import metrics_counter, metrics_gauge | ||||||||||||||||||||||||||||
from corehq.util.quickcache import quickcache | ||||||||||||||||||||||||||||
from corehq.util.soft_assert import soft_assert | ||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
|
@@ -1344,12 +1346,42 @@ def rebuild_data_source(request, domain, config_id): | |||||||||||||||||||||||||||
) | ||||||||||||||||||||||||||||
) | ||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
rebuild_indicators.delay(config_id, request.user.username, domain=domain) | ||||||||||||||||||||||||||||
rebuild_indicators.delay(config_id, request.user.username, domain=domain, source='edit_data_source_rebuild') | ||||||||||||||||||||||||||||
_report_ucr_rebuild_metrics(domain, config, 'rebuild_datasource') | ||||||||||||||||||||||||||||
return HttpResponseRedirect(reverse( | ||||||||||||||||||||||||||||
EditDataSourceView.urlname, args=[domain, config._id] | ||||||||||||||||||||||||||||
)) | ||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
def _report_ucr_rebuild_metrics(domain, config, action): | ||||||||||||||||||||||||||||
metrics_counter( | ||||||||||||||||||||||||||||
f'commcare.ucr.{action}.count', | ||||||||||||||||||||||||||||
tags={ | ||||||||||||||||||||||||||||
'domain': domain, | ||||||||||||||||||||||||||||
'datasource_id': config.get_id, | ||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||
) | ||||||||||||||||||||||||||||
metrics_gauge( | ||||||||||||||||||||||||||||
f'commcare.ucr.{action}.columns.count', | ||||||||||||||||||||||||||||
len(config.get_columns()), | ||||||||||||||||||||||||||||
tags={'domain': domain} | ||||||||||||||||||||||||||||
) | ||||||||||||||||||||||||||||
Comment on lines
+1364
to
+1368
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think unless you tag the data source in some way, the value for the domain will change with every data source you send a metric for. I've used the data source's display name in this suggestion, but there is nothing that enforces uniqueness, so there is still the risk that the value will flip for different data sources with the samme display name. But on the other hand, maybe it's more usable than a config ID. 🤷
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That is right and a valid concern. The original requirement was for including
Let me know if this is something that sounds reasonable. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. That does sound reasonable. Just taking the average is an excellent approach. |
||||||||||||||||||||||||||||
_report_metric_report_counts_by_datasource(domain, config.get_id, action) | ||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
def _report_metric_report_counts_by_datasource(domain, data_source_id, action): | ||||||||||||||||||||||||||||
try: | ||||||||||||||||||||||||||||
reports = get_configurable_and_static_reports_for_data_source(domain, data_source_id) | ||||||||||||||||||||||||||||
except Exception: | ||||||||||||||||||||||||||||
pass | ||||||||||||||||||||||||||||
else: | ||||||||||||||||||||||||||||
metrics_gauge( | ||||||||||||||||||||||||||||
f'commcare.ucr.{action}.reports_per_datasource.count', | ||||||||||||||||||||||||||||
len(reports), | ||||||||||||||||||||||||||||
tags={'domain': domain} | ||||||||||||||||||||||||||||
) | ||||||||||||||||||||||||||||
Comment on lines
+1378
to
+1382
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, I think you'll need to include the data source in the tag, otherwise the value for the domain will just keep alternating. Datadog will allow you to sum for all data sources in the domaain, or average across data sources. (Going with config ID this time just to be different :) )
Suggested change
|
||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
def _number_of_records_to_be_iterated_for_rebuild(datasource_configuration): | ||||||||||||||||||||||||||||
if datasource_configuration.referenced_doc_type == 'CommCareCase': | ||||||||||||||||||||||||||||
es_query = CaseSearchES().domain(datasource_configuration.domain) | ||||||||||||||||||||||||||||
|
@@ -1442,6 +1474,7 @@ def build_data_source_in_place(request, domain, config_id): | |||||||||||||||||||||||||||
source='edit_data_source_build_in_place', | ||||||||||||||||||||||||||||
domain=config.domain, | ||||||||||||||||||||||||||||
) | ||||||||||||||||||||||||||||
_report_ucr_rebuild_metrics(domain, config, 'rebuild_datasource_in_place') | ||||||||||||||||||||||||||||
return HttpResponseRedirect(reverse( | ||||||||||||||||||||||||||||
EditDataSourceView.urlname, args=[domain, config._id] | ||||||||||||||||||||||||||||
)) | ||||||||||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Big nit: Is there a specific exception we're trying to catch here, or just generally making sure nothing goes wrong? If it's the former then catching the specific exception here instead of a generic
Exception
would make it clearer what could go wrong.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good question.
Here the intention for the metric is to catch if a rebuild fails for any reason. The idea is establish a relationship between a rebuild failure and the number of rows a datasource have. It has been observed a rebuild fails for some datasource with huge no. of records.
On other note, I like the point of having clarity for what went wrong. Since we are not sure of the reason, I am wondering if the reason is being captured today. If not, we should probably log them into sentry. Will look into this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we're not entirely sure what might go wrong, and if there are a few things that could go wrong, then totally fine to leave as is. Could be a good followup if we discover specific errors that we are expecting.