Skip to content

Explore GitHub's Improved Search API #482

@petr-muller

Description

@petr-muller

GitHub recently announced improvements to their Search API, including nested queries and boolean operators. Prow uses the Search API in multiple places, so we should explore whether these improvements offer opportunities to improve Prow.

Places that use the Search API in some form (there may be more):

  • prow/pkg/github/client.go

    Lines 3455 to 3499 in 59a573d

    // FindIssuesWithOrg runs a query against the GitHub issue search endpoint
    // ("/search/issues") and returns the issues collected from all result pages.
    //
    // The query string uses the same syntax as the search box on the website.
    // A non-empty sort (usually "updated") is forwarded as the "sort" parameter;
    // asc additionally requests ascending order, and is only honored when sort
    // is set. When asc is false no "order" parameter is sent at all.
    // Results are requested 100 per page.
    // An empty org makes the call log under the name "FindIssues" instead of
    // "FindIssuesWithOrg".
    // This method is supposed to be used in contexts where "github-app-id" is set.
    //
    // See https://help.github.com/articles/searching-issues-and-pull-requests/ for details.
    func (c *client) FindIssuesWithOrg(org, query, sort string, asc bool) ([]Issue, error) {
    	name := "FindIssues"
    	if org != "" {
    		name = "FindIssuesWithOrg"
    	}
    	// c.log is invoked right away; the function it returns runs on exit.
    	defer c.log(name, query)()

    	params := url.Values{}
    	params.Set("per_page", "100")
    	params.Set("q", query)
    	if sort != "" {
    		params.Set("sort", sort)
    		if asc {
    			params.Set("order", "asc")
    		}
    	}

    	var found []Issue
    	// newPage hands the paginator a fresh object to decode each page into.
    	newPage := func() interface{} {
    		return &IssuesSearchResult{}
    	}
    	// collect appends the issues of one decoded page to the overall result.
    	collect := func(page interface{}) {
    		found = append(found, page.(*IssuesSearchResult).Issues...)
    	}

    	if err := c.readPaginatedResultsWithValues("/search/issues", params, acceptNone, org, newPage, collect); err != nil {
    		return nil, err
    	}
    	return found, nil
    }
  • prow/pkg/tide/github.go

    Lines 101 to 151 in 59a573d

    // Query gets all open PRs based on tide configuration.
    //
    // Every configured tide query is executed concurrently, one goroutine per
    // (query, org) pair. With GitHub apps auth the query is sharded per org via
    // OrgQueries; otherwise a single query is issued under the empty org key.
    // A search that fails without returning anything contributes an error to the
    // aggregate that is returned; a search that fails after returning some PRs
    // is logged as a warning and its partial results are still merged in.
    func (gi *GitHubProvider) Query() (map[string]CodeReviewCommon, error) {
    	var (
    		mu      sync.Mutex
    		pending sync.WaitGroup
    		errs    []error
    	)
    	prs := map[string]CodeReviewCommon{}

    	for i, query := range gi.cfg().Tide.Queries {
    		// Use org-sharded queries only when GitHub apps auth is in use
    		var shards map[string]string
    		if !gi.usesGitHubAppsAuth {
    			shards = map[string]string{"": query.Query()}
    		} else {
    			shards = query.OrgQueries()
    		}

    		for org, q := range shards {
    			pending.Add(1)
    			// Loop values are handed over as arguments so each goroutine
    			// works on its own copies.
    			go func(idx int, org, q string) {
    				defer pending.Done()

    				found, err := gi.search(gi.ghc.QueryWithGitHubAppsSupport, gi.logger, q, time.Time{}, time.Now(), org)
    				outcome := "success"
    				if err != nil {
    					outcome = "error"
    				}
    				tideMetrics.queryResults.WithLabelValues(strconv.Itoa(idx), org, outcome).Inc()

    				// Everything below touches state shared between goroutines.
    				mu.Lock()
    				defer mu.Unlock()

    				switch {
    				case err != nil && len(found) == 0:
    					gi.logger.WithField("query", q).WithError(err).Warn("Failed to execute query.")
    					errs = append(errs, fmt.Errorf("query %d, err: %w", idx, err))
    					return
    				case err != nil:
    					gi.logger.WithError(err).WithField("query", q).Warning("found partial results")
    				}

    				for _, pr := range found {
    					crc := CodeReviewCommonFromPullRequest(&pr)
    					prs[prKey(crc)] = *crc
    				}
    			}(i, org, q)
    		}
    	}

    	pending.Wait()
    	return prs, utilerrors.NewAggregate(errs)
    }
  • prow/pkg/tide/github.go

    Lines 165 to 212 in 59a573d

    // search executes q, restricted to the [start, end] window via datedQuery,
    // and follows the result cursor until no further page is reported.
    //
    // Both timestamps are passed through floor before use. All pages share one
    // context with a five minute timeout. When a page request fails, the pull
    // requests collected so far are returned together with the error; if a
    // cursor was already in use, the error is wrapped to include it.
    func (gi *GitHubProvider) search(query querier, log *logrus.Entry, q string, start, end time.Time, org string) ([]PullRequest, error) {
    	start, end = floor(start), floor(end)
    	log = log.WithFields(logrus.Fields{
    		"query": q,
    		"start": start.String(),
    		"end":   end.String(),
    	})
    	began := time.Now()

    	// The cursor starts out nil for the first page.
    	var cursor *githubql.String
    	vars := map[string]interface{}{
    		"query":        githubql.String(datedQuery(q, start, end)),
    		"searchCursor": cursor,
    	}

    	var (
    		costSum, left int
    		found         []PullRequest
    		sq            searchQuery
    	)

    	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    	defer cancel()

    	for morePages := true; morePages; {
    		log.Debug("Sending query")
    		err := query(ctx, &sq, vars, org)
    		if err != nil {
    			if cursor == nil {
    				return found, err
    			}
    			return found, fmt.Errorf("cursor: %q, err: %w", *cursor, err)
    		}

    		costSum += int(sq.RateLimit.Cost)
    		left = int(sq.RateLimit.Remaining)
    		for _, node := range sq.Search.Nodes {
    			found = append(found, node.PullRequest)
    		}

    		morePages = sq.Search.PageInfo.HasNextPage
    		if morePages {
    			// Advance to the next page and record the cursor on the logger.
    			cursor = &sq.Search.PageInfo.EndCursor
    			vars["searchCursor"] = cursor
    			log = log.WithField("searchCursor", *cursor)
    		}
    	}

    	log.WithFields(logrus.Fields{
    		"duration":       time.Since(began).String(),
    		"pr_found_count": len(found),
    		"cost":           costSum,
    		"remaining":      left,
    	}).Debug("Finished query")
    	return found, nil
    }
  • // FindAll finds issues with label in the specified orgs/repos that should block tide.
    //
    // For every entry of orgRepoTokensByOrg the search tokens produced by
    // blockerQuery are collected into a set. With splitQueryByOrg each org keeps
    // its own set and gets its own search; otherwise all tokens are merged into
    // a single set stored under the empty org key. The searches run concurrently
    // and share one context with a five minute timeout. If any search fails, the
    // aggregated error is returned and no blockers are reported.
    func FindAll(ghc githubClient, log *logrus.Entry, label string, orgRepoTokensByOrg map[string]string, splitQueryByOrg bool) (Blockers, error) {
    	tokensByShard := map[string]sets.Set[string]{}
    	for org, tokens := range orgRepoTokensByOrg {
    		shard := ""
    		if splitQueryByOrg {
    			shard = org
    		}
    		if tokensByShard[shard] == nil {
    			tokensByShard[shard] = sets.Set[string]{}
    		}
    		tokensByShard[shard].Insert(blockerQuery(label, tokens)...)
    	}

    	var (
    		mu       sync.Mutex
    		pending  sync.WaitGroup
    		found    []Issue
    		failures []error
    	)
    	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    	defer cancel()

    	for org, tokens := range tokensByShard {
    		pending.Add(1)
    		// The joined query string is built before the goroutine starts and
    		// handed over as an argument together with the org.
    		go func(org, query string) {
    			defer pending.Done()

    			result, err := search(ctx, ghc, org, log, query)

    			mu.Lock()
    			defer mu.Unlock()
    			if err == nil {
    				found = append(found, result...)
    				return
    			}
    			failures = append(failures, err)
    		}(org, strings.Join(sets.List(tokens), " "))
    	}
    	pending.Wait()

    	aggregated := utilerrors.NewAggregate(failures)
    	if aggregated != nil {
    		return Blockers{}, fmt.Errorf("error searching for blocker issues: %w", aggregated)
    	}
    	return fromIssues(found, log), nil
    }
  • // constructQueries constructs the v4 queries for the periodic scan.
    // It returns a map[org][]query.
    //
    // Each entry of orgs gets an org-wide query, except "kubernetes", which is
    // split into three queries. Entries of repos must be in org/repo format and
    // are skipped (with a warning) when malformed or when their org already has
    // a query. The remaining repos are combined into one query per org when
    // usesGitHubAppsAuth is set, and into a single query under the empty key
    // otherwise.
    func constructQueries(log *logrus.Entry, now time.Time, orgs, repos []string, usesGitHubAppsAuth bool) map[string][]string {
    	queries := map[string][]string{}

    	// GitHub hard caps queries at 1k results, so always do one query per org and one for
    	// all repos. Ref: https://github.community/t/graphql-github-api-how-to-get-more-than-1000-pull-requests/13838/11
    	for _, org := range orgs {
    		if org != "kubernetes" {
    			queries[org] = append(queries[org], searchQueryPrefix+` org:"`+org+`"`)
    			continue
    		}
    		// Sharding by creation time > 2 months ago gives us around 50% of PRs per query
    		// (585 for the newer ones, 538 for the older ones when testing)
    		cutoff := now.Add(-2 * 30 * 24 * time.Hour).Format("2006-01-02")
    		queries[org] = append(queries[org],
    			searchQueryPrefix+` org:"kubernetes" -repo:"kubernetes/kubernetes"`,
    			searchQueryPrefix+` repo:"kubernetes/kubernetes" created:>=`+cutoff,
    			searchQueryPrefix+` repo:"kubernetes/kubernetes" created:<`+cutoff,
    		)
    	}

    	repoQueryByKey := map[string]*bytes.Buffer{}
    	for _, repo := range repos {
    		parts := strings.Split(repo, "/")
    		if len(parts) != 2 {
    			log.WithField("repo", repo).Warn("Found repo that was not in org/repo format, ignoring...")
    			continue
    		}
    		org := parts[0]
    		if _, covered := queries[org]; covered {
    			log.WithField("repo", repo).Warn("Plugin was enabled for repo even though it is already enabled for the org, ignoring...")
    			continue
    		}

    		// Repo queries are keyed by org only when GitHub apps auth is in use.
    		key := ""
    		if usesGitHubAppsAuth {
    			key = org
    		}
    		buf := repoQueryByKey[key]
    		if buf == nil {
    			buf = bytes.NewBufferString(searchQueryPrefix)
    			repoQueryByKey[key] = buf
    		}
    		fmt.Fprintf(buf, " repo:\"%s\"", repo)
    	}

    	for key, buf := range repoQueryByKey {
    		queries[key] = append(queries[key], buf.String())
    	}
    	return queries
    }

Tide's merge criteria configuration language is essentially a GitHub search query expressed in YAML, and the queries are sometimes awkward to write where a simple boolean operator would suffice. We cannot change the existing configuration language, but we could create a new one that uses the new Search API constructs. Another option is to look for internal optimization opportunities: instead of issuing multiple queries, we could merge them internally and hit the API fewer times.

/kind feature
/area tide

Metadata

Metadata

Assignees

No one assigned

    Labels

    area/tide: Issues or PRs related to prow's tide component. kind/feature: Categorizes issue or PR as related to a new feature.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions