Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 0 additions & 21 deletions docs/infrastructure/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,6 @@ For less urgent issues or general support, you can file a bug with [cloudOps](ht

## Monitoring & metrics

- New Relic
- Overview:
[prototype](https://rpm.newrelic.com/accounts/677903/applications/7385291) |
[stage](https://rpm.newrelic.com/accounts/677903/applications/14179733) |
[prod](https://rpm.newrelic.com/accounts/677903/applications/14179757)
- Error analytics:
[prototype](https://rpm.newrelic.com/accounts/677903/applications/7385291/filterable_errors) |
[stage](https://rpm.newrelic.com/accounts/677903/applications/14179733/filterable_errors) |
[prod](https://rpm.newrelic.com/accounts/677903/applications/14179757/filterable_errors)
- Web transactions:
[prototype](https://rpm.newrelic.com/accounts/677903/applications/7385291/transactions?type=app) |
[stage](https://rpm.newrelic.com/accounts/677903/applications/14179733/transactions?type=app) |
[prod](https://rpm.newrelic.com/accounts/677903/applications/14179757/transactions?type=app)
- Non-web transactions (background tasks):
[prototype](https://rpm.newrelic.com/accounts/677903/applications/7385291/transactions?type=other&show_browser=false) |
[stage](https://rpm.newrelic.com/accounts/677903/applications/14179733/transactions?type=other&show_browser=false) |
[prod](https://rpm.newrelic.com/accounts/677903/applications/14179757/transactions?type=other&show_browser=false)
- Postgres/Redis client request stats:
[prototype](https://rpm.newrelic.com/accounts/677903/applications/7385291/datastores) |
[stage](https://rpm.newrelic.com/accounts/677903/applications/14179733/datastores) |
[prod](https://rpm.newrelic.com/accounts/677903/applications/14179757/datastores)
- Google Cloud Console
- [prod](https://console.cloud.google.com/kubernetes/list?project=moz-fx-treeherder-prod-c739)
- [all other deployments](https://console.cloud.google.com/kubernetes/list?project=moz-fx-treeherde-nonprod-34ec)
Expand Down
1 change: 0 additions & 1 deletion treeherder/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
LOGGING_LEVEL = env("LOGGING_LEVEL", default="INFO")

NEW_RELIC_INSIGHTS_API_KEY = env("NEW_RELIC_INSIGHTS_API_KEY", default=None)
NEW_RELIC_INSIGHTS_API_URL = "https://insights-api.newrelic.com/v1/accounts/677903/query"

# Make this unique, and don't share it with anybody.
SECRET_KEY = env(
Expand Down
3 changes: 0 additions & 3 deletions treeherder/etl/classification_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import re

import environ
import newrelic.agent

from treeherder.model.models import (
BugJobMap,
Expand Down Expand Up @@ -106,15 +105,13 @@ def get_push(self, task_route):
raise

try:
newrelic.agent.add_custom_attribute("project", project)

repository = Repository.objects.get(name=project)
except Repository.DoesNotExist:
logger.info("Job with unsupported project: %s", project)
raise

try:
newrelic.agent.add_custom_attribute("revision", revision)

revision_field = "revision__startswith" if len(revision) < 40 else "revision"
filter_kwargs = {"repository": repository, revision_field: revision}
Expand Down
7 changes: 0 additions & 7 deletions treeherder/etl/job_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import uuid

import jsonschema
import newrelic.agent
import slugid
from django.conf import settings

Expand Down Expand Up @@ -67,7 +66,6 @@ def process_job(self, pulse_job, root_url):
try:
with settings.STATSD_CLIENT.timer("process_job_transform"):
project = pulse_job["origin"]["project"]
newrelic.agent.add_custom_attribute("project", project)

repository = Repository.objects.get(name=project)
if repository.active_status != "active":
Expand Down Expand Up @@ -103,11 +101,6 @@ def validate_revision(self, repository, pulse_job):
revision_field = "revision__startswith" if len(revision) < 40 else "revision"
filter_kwargs = {"repository": repository, revision_field: revision}

if revision_field == "revision__startswith":
newrelic.agent.record_custom_event(
"short_revision_job_loader",
{"error": "Revision <40 chars", "revision": revision, "job": pulse_job},
)

if not Push.objects.filter(**filter_kwargs).exists():
(real_task_id, _) = task_and_retry_ids(pulse_job["taskId"])
Expand Down
2 changes: 0 additions & 2 deletions treeherder/etl/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from datetime import datetime
from hashlib import sha1

import newrelic.agent
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db.utils import IntegrityError
Expand Down Expand Up @@ -500,7 +499,6 @@ def store_job_data(repository, original_data):
# make more fields visible in new relic for the job
# where we encountered the error
datum.update(datum.get("job", {}))
newrelic.agent.notice_error(attributes=datum)

# skip any jobs that hit errors in these stages.
continue
Expand Down
6 changes: 0 additions & 6 deletions treeherder/etl/push_loader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging

import environ
import newrelic.agent
from django.core.exceptions import ObjectDoesNotExist

from treeherder.etl.common import to_timestamp
Expand All @@ -20,15 +19,12 @@ class PushLoader:
def process(self, message_body, exchange, root_url):
transformer = self.get_transformer_class(exchange)(message_body)
try:
newrelic.agent.add_custom_attribute("url", transformer.repo_url)
newrelic.agent.add_custom_attribute("branch", transformer.branch)
repos = Repository.objects
if transformer.branch:
repos = repos.filter(branch__regex=f"(^|,){transformer.branch}($|,)")
else:
repos = repos.filter(branch=None)
repo = repos.get(url=transformer.repo_url, active_status="active")
newrelic.agent.add_custom_attribute("repository", repo.name)
except ObjectDoesNotExist:
repo_info = transformer.get_info()
repo_info.update(
Expand All @@ -37,7 +33,6 @@ def process(self, message_body, exchange, root_url):
"branch": transformer.branch,
}
)
newrelic.agent.record_custom_event("skip_unknown_repository", repo_info)
logger.warning(
"Skipping unsupported repo: %s %s", transformer.repo_url, transformer.branch
)
Expand Down Expand Up @@ -257,7 +252,6 @@ def transform(self, repository):
return self.fetch_push(url, repository)

def fetch_push(self, url, repository, sha=None):
newrelic.agent.add_custom_attribute("sha", sha)

logger.debug("fetching for %s %s", repository, url)
# there will only ever be one, with this url
Expand Down
2 changes: 0 additions & 2 deletions treeherder/etl/pushlog.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
import traceback

import newrelic.agent
import requests
from django.core.cache import cache

Expand Down Expand Up @@ -123,7 +122,6 @@ def run(self, source_url, repository_name, changeset=None, last_push_id=None):
try:
store_push(repository, self.transform_push(push))
except Exception:
newrelic.agent.notice_error()
errors.append(
{
"project": repository,
Expand Down
9 changes: 2 additions & 7 deletions treeherder/etl/tasks/pulse_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import asyncio

import newrelic.agent
from django.conf import settings

from treeherder.etl.classification_loader import ClassificationLoader
Expand All @@ -25,8 +24,7 @@ def store_pulse_tasks(
Fetches tasks from Taskcluster
"""
loop = asyncio.get_event_loop()
newrelic.agent.add_custom_attribute("exchange", exchange)
newrelic.agent.add_custom_attribute("routing_key", routing_key)

# handle_message expects messages in this format
with settings.STATSD_CLIENT.timer("pulse_handle_message"):
runs = loop.run_until_complete(
Expand All @@ -50,8 +48,7 @@ def store_pulse_pushes(
"""
Fetches the pushes pending from pulse exchanges and loads them.
"""
newrelic.agent.add_custom_attribute("exchange", exchange)
newrelic.agent.add_custom_attribute("routing_key", routing_key)


PushLoader().process(body, exchange, root_url)

Expand All @@ -66,7 +63,5 @@ def store_pulse_tasks_classification(
By default, it should listen to the Community cluster as classifications
are only running there for the moment
"""
newrelic.agent.add_custom_attribute("exchange", exchange)
newrelic.agent.add_custom_attribute("routing_key", routing_key)

ClassificationLoader().process(pulse_job, root_url)
3 changes: 1 addition & 2 deletions treeherder/etl/tasks/pushlog_tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import newrelic.agent
from celery import shared_task

from treeherder.etl.pushlog import HgPushlogProcess
Expand All @@ -19,6 +18,6 @@ def fetch_hg_push_log(repo_name, repo_url):
"""
Run a HgPushlog etl process
"""
newrelic.agent.add_custom_attribute("repo_name", repo_name)

process = HgPushlogProcess()
process.run(repo_url + "/json-pushes/?full=1&version=2", repo_name)
6 changes: 0 additions & 6 deletions treeherder/log_parser/artifactbuildercollection.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import logging

import newrelic.agent

from treeherder.utils.http import make_request

from .artifactbuilders import LogViewerArtifactBuilder, PerformanceDataArtifactBuilder
Expand Down Expand Up @@ -86,10 +84,6 @@ def parse(self):
download_size_in_bytes = int(response.headers.get("Content-Length", -1))

# Temporary annotation of log size to help set thresholds in bug 1295997.
newrelic.agent.add_custom_attribute("unstructured_log_size", download_size_in_bytes)
newrelic.agent.add_custom_attribute(
"unstructured_log_encoding", response.headers.get("Content-Encoding", "None")
)

if download_size_in_bytes > MAX_DOWNLOAD_SIZE_IN_BYTES:
raise LogSizeError(f"Download size of {download_size_in_bytes} bytes exceeds limit")
Expand Down
15 changes: 2 additions & 13 deletions treeherder/log_parser/failureline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from collections import defaultdict
from itertools import islice

import newrelic.agent
from django.conf import settings
from django.db import transaction
from django.db.utils import DataError, IntegrityError, OperationalError
Expand Down Expand Up @@ -119,17 +118,8 @@ def create_group_result(job_log, line):
# Log to New Relic if it's not in a form we like. We can file
# bugs upstream to remedy them.
if "\\" in group_path or len(group_path) > 255:
newrelic.agent.record_custom_event(
"malformed_test_group",
{
"message": "Group paths must be relative, with no backslashes and <255 chars",
"group": line["group"],
"group_path": group_path,
"length": len(group_path),
"repository": job_log.job.repository,
"job_guid": job_log.job.guid,
},
)
pass

else:
group, _ = Group.objects.get_or_create(name=group_path[:255])
duration = line.get("duration", 0)
Expand Down Expand Up @@ -157,7 +147,6 @@ def create(job_log, log_list):
for line in log_list:
action = line["action"]
if action not in FailureLine.ACTION_LIST:
newrelic.agent.record_custom_event("unsupported_failure_line_action", line)
# Unfortunately, these errors flood the logs, but we want to report any
# others that we didn't expect. We know about the following action we choose
# to ignore.
Expand Down
8 changes: 1 addition & 7 deletions treeherder/log_parser/tasks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging

import newrelic.agent
import simplejson as json
from celery.exceptions import SoftTimeLimitExceeded
from requests.exceptions import HTTPError
Expand All @@ -20,7 +19,6 @@

@retryable_task(name="log-parser", max_retries=10)
def parse_logs(job_id, job_log_ids, priority):
newrelic.agent.add_custom_attribute("job_id", str(job_id))

job = Job.objects.get(id=job_id)
job_logs = JobLog.objects.filter(id__in=job_log_ids, job=job)
Expand All @@ -41,7 +39,6 @@ def parse_logs(job_id, job_log_ids, priority):
first_exception = None
completed_names = set()
for job_log in job_logs:
newrelic.agent.add_custom_attribute(f"job_log_{job_log.name}_url", job_log.url)
logger.info("parser_task for %s", job_log.id)

# Only parse logs which haven't yet been processed or else failed on the last attempt.
Expand All @@ -60,16 +57,13 @@ def parse_logs(job_id, job_log_ids, priority):
parser(job_log)
except Exception as e:
if isinstance(e, SoftTimeLimitExceeded):
# stop parsing further logs but raise so NewRelic and
# stop parsing further logs but raise so
# Papertrail will still show output
raise

if first_exception is None:
first_exception = e

# track the exception on NewRelic but don't stop parsing future
# log lines.
newrelic.agent.notice_error()
else:
completed_names.add(job_log.name)

Expand Down
10 changes: 0 additions & 10 deletions treeherder/middleware.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import re

import newrelic.agent
from django.utils.deprecation import MiddlewareMixin
from whitenoise.middleware import WhiteNoiseMiddleware

Expand Down Expand Up @@ -69,12 +68,3 @@ def immutable_file_test(self, path, url):
# bootstrap.min.abda843684d0.js
return super().immutable_file_test(path, url)


class NewRelicMiddleware(MiddlewareMixin):
    """Annotate New Relic web transactions with extra request details."""

    def process_request(self, request):
        """Record the request's User-Agent header as a custom attribute.

        The New Relic Python agent only forwards the User Agent to APM (for
        exceptions and slow transactions), so it is attached here as a custom
        attribute to make it available in Insights as well. Requests that
        carry no User-Agent header are left unannotated.
        """
        try:
            user_agent = request.META["HTTP_USER_AGENT"]
        except KeyError:
            # No User-Agent header on this request; nothing to record.
            return

        newrelic.agent.add_custom_attribute("user_agent", user_agent)
3 changes: 0 additions & 3 deletions treeherder/model/error_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import logging
import re

import newrelic.agent
from django.core.cache import caches

from treeherder.model.models import Bugscache, TextLogError
Expand Down Expand Up @@ -174,15 +173,13 @@ def get_error_summary(job, queryset=None):
try:
cache.set(cache_key, error_summary, BUG_SUGGESTION_CACHE_TIMEOUT)
except Exception as e:
newrelic.agent.record_custom_event("error caching error_summary for job", job.id)
logger.error("error caching error_summary for job %s: %s", job.id, e, exc_info=True)

try:
lcache.update_cache(date, line_cache[date])
# TODO: consider reducing this, each date is ~5%, so it will be faster
lcache.update_db_cache(date, line_cache[date])
except Exception as e:
newrelic.agent.record_custom_event("error caching error_lines for job", job.id)
logger.error("error caching error_lines for job %s: %s", job.id, e, exc_info=True)

return error_summary
Expand Down
13 changes: 1 addition & 12 deletions treeherder/model/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import warnings
from hashlib import sha1

import newrelic.agent
from django.conf import settings
from django.contrib.auth.models import User
from django.contrib.postgres.indexes import GinIndex
Expand All @@ -21,7 +20,6 @@

from treeherder.webapp.api.utils import REPO_GROUPS, to_timestamp

warnings.filterwarnings("ignore", category=DeprecationWarning, module="newrelic")

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -294,7 +292,6 @@ def search(cls, search_term):
open_recent = [x for x in all_data if x["resolution"] == ""]
all_others = [x for x in all_data if x["resolution"] != ""]
except ProgrammingError as e:
newrelic.agent.notice_error()
logger.error(
f"Failed to execute FULLTEXT search on Bugscache, error={e}, SQL={recent_qs.query.__str__()}"
)
Expand Down Expand Up @@ -1291,19 +1288,11 @@ def verify_classification(self, classification):
self.metadata.best_is_verified = True
self.metadata.save(update_fields=["best_classification", "best_is_verified"])

# Send event to NewRelic when verifying an autoclassified failure.

match = self.matches.filter(classified_failure=classification).first()
if not match:
return

newrelic.agent.record_custom_event(
"user_verified_classification",
{
"matcher": match.matcher_name,
"job_id": self.id,
},
)

def get_failure_line(self):
"""Get a related FailureLine instance if one exists."""
try:
Expand Down
Loading