Skip to content

Commit deb268d

Browse files
committed
[enriched/githubql] Fix enrich_reference_analysis study
This code fixes the `enrich_reference_analysis` study that only processed the first 10 references instead of all of them. By default, the ElasticSearch/OpenSearch `aggregations` query only returns the first 10 documents. Using `composite aggregations` we can paginate the result to get all the references. Signed-off-by: Quan Zhou <[email protected]>
1 parent e44cac0 commit deb268d

File tree

2 files changed

+90
-58
lines changed

2 files changed

+90
-58
lines changed

grimoire_elk/enriched/githubql.py

Lines changed: 79 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,84 @@ def _get_merged_prs(es_input):
593593

594594
return merged_prs_list
595595

596+
def _get_cross_references(es_input, index):
597+
# Get all CrossReferencedEvent items and their referenced issues and pull requests
598+
es_query = {
599+
"size": 0,
600+
"track_total_hits": True,
601+
"query": {
602+
"bool": {
603+
"must": {
604+
"term": {
605+
"event_type": "CrossReferencedEvent"
606+
}
607+
}
608+
}
609+
},
610+
"aggs": {
611+
"composite_issue_url": {
612+
"composite": {
613+
"sources": [{
614+
"issue_url": {
615+
"terms": {
616+
"field": "issue_url"
617+
}
618+
}
619+
}],
620+
"size": 1000
621+
},
622+
"aggs": {
623+
"references_urls": {
624+
"terms": {
625+
"field": "reference_source_url",
626+
"size": 10000
627+
}
628+
}
629+
}
630+
}
631+
}
632+
}
633+
634+
buckets = []
635+
while True:
636+
cross_references = es_input.search(index=index, body=es_query)
637+
buckets += cross_references['aggregations']['composite_issue_url']['buckets']
638+
after_key = cross_references['aggregations']['composite_issue_url'].get('after_key', None)
639+
if not after_key:
640+
break
641+
es_query['aggs']['composite_issue_url']['composite']['after'] = after_key
642+
643+
reference_dict = {}
644+
for item in buckets:
645+
issue_url = item['key']['issue_url']
646+
references = [ref['key'] for ref in item['references_urls']['buckets']]
647+
648+
# Update reference dictionary
649+
if issue_url not in reference_dict.keys():
650+
reference_dict[issue_url] = references
651+
else:
652+
prev_references = reference_dict[issue_url]
653+
prev_references.append(references)
654+
reference_dict[issue_url] = list(set(prev_references))
655+
656+
# Adding list entries from reversed references
657+
for issue_url in reference_dict.keys():
658+
reference_list = reference_dict[issue_url]
659+
if not reference_list:
660+
continue
661+
for ref in reference_list:
662+
try:
663+
ref_entry_list = reference_dict[ref]
664+
except KeyError:
665+
continue
666+
if ref_entry_list:
667+
ref_entry_list.append(issue_url)
668+
else:
669+
ref_entry_list = [issue_url]
670+
reference_dict[ref] = list(set(ref_entry_list))
671+
672+
return reference_dict
673+
596674
data_source = enrich_backend.__class__.__name__.split("Enrich")[0].lower()
597675
log_prefix = "[{}] Cross reference analysis".format(data_source)
598676
logger.info("{} starting study {}".format(log_prefix, anonymize_url(self.elastic.index_url)))
@@ -605,64 +683,7 @@ def _get_merged_prs(es_input):
605683
logger.info("{} Retrieving the merged PRs from MergeEvents".format(log_prefix))
606684
merged_prs = _get_merged_prs(es_in)
607685

608-
# Get all CrossReferencedEvent items and their referenced issues and pull requests
609-
es_query = {
610-
"size": 0,
611-
"query": {
612-
"bool": {
613-
"must": {
614-
"term": {
615-
"event_type": "CrossReferencedEvent"
616-
}
617-
}
618-
}
619-
},
620-
"aggs": {
621-
"issue_url": {
622-
"terms": {
623-
"field": "issue_url",
624-
"size": 30000
625-
},
626-
"aggs": {
627-
"uniq_gender": {
628-
"terms": {"field": "reference_source_url"}
629-
}
630-
}
631-
}
632-
}
633-
}
634-
635-
cross_references = es_in.search(index=in_index, body=es_query)
636-
buckets = cross_references['aggregations']['issue_url']['buckets']
637-
638-
reference_dict = {}
639-
for item in buckets:
640-
issue_url = item['key']
641-
references = [ref['key'] for ref in item['uniq_gender']['buckets']]
642-
643-
# Update reference dictionary
644-
if issue_url not in reference_dict.keys():
645-
reference_dict[issue_url] = references
646-
else:
647-
prev_references = reference_dict[issue_url]
648-
prev_references.append(references)
649-
reference_dict[issue_url] = list(set(prev_references))
650-
651-
# Adding list entries from reversed references
652-
for issue_url in reference_dict.keys():
653-
reference_list = reference_dict[issue_url]
654-
if not reference_list:
655-
continue
656-
for ref in reference_list:
657-
try:
658-
ref_entry_list = reference_dict[ref]
659-
except KeyError:
660-
continue
661-
if ref_entry_list:
662-
ref_entry_list.append(issue_url)
663-
else:
664-
ref_entry_list = [issue_url]
665-
reference_dict[ref] = list(set(ref_entry_list))
686+
reference_dict = _get_cross_references(es_in, in_index)
666687

667688
# Updated affected issues and pull requests
668689
painless_code = """
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
---
2+
title: All references processed for the reference analysis study
3+
category: fixed
4+
author: Quan Zhou <[email protected]>
5+
issue: null
6+
notes: >
7+
The `enrich_reference_analysis` study analyzes the cross-references
8+
between "issues" and "pull requests". When we use an aggregation query,
9+
it returns only the first 10 items (ElasticSearch/OpenSearch by default).
10+
By using 'composite aggregations', we can paginate the results and
11+
thus obtain all the references.

0 commit comments

Comments
 (0)