-
Notifications
You must be signed in to change notification settings - Fork 140
Description
GitHub recently announced improvements to their Search API, including nested queries and boolean operators. Prow uses the Search API in multiple places, so we should explore whether these improvements offer opportunities to improve Prow.
- https://github.blog/changelog/2025-04-09-evolving-github-issues-and-projects/#%f0%9f%95%b5%ef%b8%8f%e2%99%80%ef%b8%8f-finding-what-you-need-with-advanced-search
- https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/filtering-and-searching-issues-and-pull-requests#building-advanced-filters-for-issues
- https://github.blog/developer-skills/application-development/github-issues-search-now-supports-nested-queries-and-boolean-operators-heres-how-we-rebuilt-it/
Places that use Search API in some form (maybe there are more...):
Lines 3455 to 3499 in 59a573d
// FindIssuesWithOrg uses the GitHub search API to find issues which match a particular query. // // Input query the same way you would into the website. // Order returned results with sort (usually "updated"). // Control whether oldest/newest is first with asc. // This method is supposed to be used in contexts where "github-app-id" is set. // // See https://help.github.com/articles/searching-issues-and-pull-requests/ for details. func (c *client) FindIssuesWithOrg(org, query, sort string, asc bool) ([]Issue, error) { loggerName := "FindIssuesWithOrg" if org == "" { loggerName = "FindIssues" } durationLogger := c.log(loggerName, query) defer durationLogger() values := url.Values{ "per_page": []string{"100"}, "q": []string{query}, } var issues []Issue if sort != "" { values["sort"] = []string{sort} if asc { values["order"] = []string{"asc"} } } err := c.readPaginatedResultsWithValues( "/search/issues", values, acceptNone, org, func() interface{} { // newObj return &IssuesSearchResult{} }, func(obj interface{}) { issues = append(issues, obj.(*IssuesSearchResult).Issues...) }, ) if err != nil { return nil, err } return issues, err } Lines 101 to 151 in 59a573d
// Query gets all open PRs based on tide configuration. func (gi *GitHubProvider) Query() (map[string]CodeReviewCommon, error) { lock := sync.Mutex{} wg := sync.WaitGroup{} prs := make(map[string]CodeReviewCommon) var errs []error for i, query := range gi.cfg().Tide.Queries { // Use org-sharded queries only when GitHub apps auth is in use var queries map[string]string if gi.usesGitHubAppsAuth { queries = query.OrgQueries() } else { queries = map[string]string{"": query.Query()} } for org, q := range queries { org, q, i := org, q, i wg.Add(1) go func() { defer wg.Done() results, err := gi.search(gi.ghc.QueryWithGitHubAppsSupport, gi.logger, q, time.Time{}, time.Now(), org) resultString := "success" if err != nil { resultString = "error" } tideMetrics.queryResults.WithLabelValues(strconv.Itoa(i), org, resultString).Inc() lock.Lock() defer lock.Unlock() if err != nil && len(results) == 0 { gi.logger.WithField("query", q).WithError(err).Warn("Failed to execute query.") errs = append(errs, fmt.Errorf("query %d, err: %w", i, err)) return } if err != nil { gi.logger.WithError(err).WithField("query", q).Warning("found partial results") } for _, pr := range results { crc := CodeReviewCommonFromPullRequest(&pr) prs[prKey(crc)] = *crc } }() } } wg.Wait() return prs, utilerrors.NewAggregate(errs) } Lines 165 to 212 in 59a573d
func (gi *GitHubProvider) search(query querier, log *logrus.Entry, q string, start, end time.Time, org string) ([]PullRequest, error) { start = floor(start) end = floor(end) log = log.WithFields(logrus.Fields{ "query": q, "start": start.String(), "end": end.String(), }) requestStart := time.Now() var cursor *githubql.String vars := map[string]interface{}{ "query": githubql.String(datedQuery(q, start, end)), "searchCursor": cursor, } var totalCost, remaining int var ret []PullRequest var sq searchQuery ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() for { log.Debug("Sending query") if err := query(ctx, &sq, vars, org); err != nil { if cursor != nil { err = fmt.Errorf("cursor: %q, err: %w", *cursor, err) } return ret, err } totalCost += int(sq.RateLimit.Cost) remaining = int(sq.RateLimit.Remaining) for _, n := range sq.Search.Nodes { ret = append(ret, n.PullRequest) } if !sq.Search.PageInfo.HasNextPage { break } cursor = &sq.Search.PageInfo.EndCursor vars["searchCursor"] = cursor log = log.WithField("searchCursor", *cursor) } log.WithFields(logrus.Fields{ "duration": time.Since(requestStart).String(), "pr_found_count": len(ret), "cost": totalCost, "remaining": remaining, }).Debug("Finished query") return ret, nil } prow/pkg/tide/blockers/blockers.go
Lines 75 to 127 in 59a573d
// FindAll finds issues with label in the specified orgs/repos that should block tide. func FindAll(ghc githubClient, log *logrus.Entry, label string, orgRepoTokensByOrg map[string]string, splitQueryByOrg bool) (Blockers, error) { queries := map[string]sets.Set[string]{} for org, query := range orgRepoTokensByOrg { if splitQueryByOrg { queries[org] = sets.New[string](blockerQuery(label, query)...) } else { if queries[""] == nil { queries[""] = sets.Set[string]{} } queries[""].Insert(blockerQuery(label, query)...) } } var issues []Issue var errs []error var lock sync.Mutex var wg sync.WaitGroup ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() for org, query := range queries { org, query := org, strings.Join(sets.List(query), " ") wg.Add(1) go func() { defer wg.Done() result, err := search( ctx, ghc, org, log, query, ) lock.Lock() defer lock.Unlock() if err != nil { errs = append(errs, err) return } issues = append(issues, result...) }() } wg.Wait() if err := utilerrors.NewAggregate(errs); err != nil { return Blockers{}, fmt.Errorf("error searching for blocker issues: %w", err) } return fromIssues(issues, log), nil } prow/cmd/external-plugins/needs-rebase/plugin/plugin.go
Lines 342 to 394 in 59a573d
// constructQueries constructs the v4 queries for the periodic scan. // It returns a map[org][]query. func constructQueries(log *logrus.Entry, now time.Time, orgs, repos []string, usesGitHubAppsAuth bool) map[string][]string { result := map[string][]string{} // GitHub hard caps queries at 1k results, so always do one query per org and one for // all repos. Ref: https://github.community/t/graphql-github-api-how-to-get-more-than-1000-pull-requests/13838/11 for _, org := range orgs { // https://img.17qq.com/images/crqhcuueqhx.jpeg if org == "kubernetes" { result[org] = append(result[org], searchQueryPrefix+` org:"kubernetes" -repo:"kubernetes/kubernetes"`) // Sharding by creation time > 2 months ago gives us around 50% of PRs per query (585 for the newer ones, 538 for the older ones when testing) twoMonthsAgoISO8601 := now.Add(-2 * 30 * 24 * time.Hour).Format("2006-01-02") result[org] = append(result[org], searchQueryPrefix+` repo:"kubernetes/kubernetes" created:>=`+twoMonthsAgoISO8601) result[org] = append(result[org], searchQueryPrefix+` repo:"kubernetes/kubernetes" created:<`+twoMonthsAgoISO8601) } else { result[org] = append(result[org], searchQueryPrefix+` org:"`+org+`"`) } } reposQueries := map[string]*bytes.Buffer{} for _, repo := range repos { slashSplit := strings.Split(repo, "/") if n := len(slashSplit); n != 2 { log.WithField("repo", repo).Warn("Found repo that was not in org/repo format, ignoring...") continue } org := slashSplit[0] if _, hasOrgQuery := result[org]; hasOrgQuery { log.WithField("repo", repo).Warn("Plugin was enabled for repo even though it is already enabled for the org, ignoring...") continue } var b *bytes.Buffer if usesGitHubAppsAuth { if reposQueries[org] == nil { reposQueries[org] = bytes.NewBufferString(searchQueryPrefix) } b = reposQueries[org] } else { if reposQueries[""] == nil { reposQueries[""] = bytes.NewBufferString(searchQueryPrefix) } b = reposQueries[""] } fmt.Fprintf(b, " repo:\"%s\"", repo) } for org, repoQuery := range 
reposQueries { result[org] = append(result[org], repoQuery.String()) } return result }
Tide merge criteria configuration language is essentially a GH search query through YAML, and sometimes the queries are awkward to use when a simple boolean could suffice. We cannot change the existing configuration language but we could create a new one that uses new Search API constructs. Another option is to find optimization opportunities internally where instead of doing multiple queries we could internally merge them and hit the API fewer times.
/kind feature
/area tide