Skip to content

Commit 794e9a4

Browse files
KAFKA-14995: Automate asf.yaml collaborators refresh (#17124)
Add a Python script that analyzes our Git history to find top contributors. This can be used by committers to update the list of contributors in .asf.yaml without a lot of tedious effort. Co-authored-by: stevenbooke <[email protected]> Co-authored-by: Joao Pedro Fonseca <[email protected]> Reviewers: David Arthur <[email protected]>
1 parent 6fd973b commit 794e9a4

File tree

3 files changed

+233
-0
lines changed

3 files changed

+233
-0
lines changed

committer-tools/README.md

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Refresh Collaborators Script
2+
3+
The Refresh Collaborators script automates the process of fetching contributor
4+
data from GitHub repositories, filtering top contributors who are not part of
5+
the existing committers, and updating a local configuration file (.asf.yaml) to
6+
include these new contributors.
7+
8+
## Table of Contents
9+
10+
- [Requirements](#requirements)
11+
- [Installation](#installation)
12+
- [Usage](#usage)
13+
14+
## Requirements
15+
16+
- Python 3.x and pip
17+
- A valid GitHub token with repository read access
18+
19+
## Installation
20+
21+
### 1. Check Python installation
22+
23+
Check if Python and pip are installed in your system.
24+
25+
```bash
26+
python3 --version
27+
pip3 --version
28+
```
29+
30+
### 2. Set up a virtual environment (optional)
31+
32+
```bash
33+
python3 -m venv venv
34+
35+
# For Linux/macOS
36+
source venv/bin/activate
37+
38+
# On Windows:
39+
# .\venv\Scripts\activate
40+
```
41+
42+
### 3. Install the required dependencies
43+
44+
```bash
45+
pip3 install -r requirements.txt
46+
```
47+
48+
## Usage
49+
50+
### 1. Set up the environment variable for GitHub Token
51+
52+
You need to set up a valid GitHub token to access the repository. After you
53+
generate it (or authenticate via GitHub CLI), this can be done by setting the
54+
GITHUB_TOKEN environment variable.
55+
56+
```bash
57+
# For Linux/macOS
58+
export GITHUB_TOKEN="your_github_token"
59+
# Or if you use GitHub CLI
60+
export GITHUB_TOKEN="$(gh auth token)"
61+
62+
# On Windows:
63+
# $env:GITHUB_TOKEN="your_github_token"   (PowerShell)
64+
```
65+
66+
### 2. Run the script
67+
68+
```bash
69+
python3 refresh_collaborators.py
70+
```
+143
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
"""
19+
This script automates the process of fetching contributor data from GitHub
20+
repositories, filtering top contributors who are not part of the existing
21+
committers, and updating a local configuration file (.asf.yaml) to include these
22+
new contributors.
23+
"""
24+
25+
import io
26+
import logging
27+
import os
28+
from datetime import datetime, timedelta
29+
from typing import Dict, List, Tuple
30+
31+
from bs4 import BeautifulSoup
32+
from github import Github
33+
from github.Commit import Commit
34+
from github.ContentFile import ContentFile
35+
from github.PaginatedList import PaginatedList
36+
from github.Repository import Repository
37+
from ruamel.yaml import YAML
38+
39+
# Emit timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# Token used to authenticate against the GitHub API. Defaults to "" when the
# variable is unset so the value always matches its ``str`` annotation
# (os.getenv would otherwise return None); get_github_client() rejects an
# empty token.
GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN", "")
# Repository that hosts committers.html, the source of the committer list.
REPO_KAFKA_SITE: str = "apache/kafka-site"
# Repository whose commit history is analyzed for top contributors.
REPO_KAFKA: str = "apache/kafka"
# Path of the file to rewrite. It is relative, so it is resolved against the
# current working directory of the process running the script.
ASF_YAML_PATH: str = "../.asf.yaml"
# Maximum number of non-committer contributors written to .asf.yaml.
TOP_N_CONTRIBUTORS: int = 10
49+
50+
51+
def get_github_client() -> Github:
    """
    Initialize GitHub client with token.

    The token is read from the module-level GITHUB_TOKEN constant.

    Returns:
        A ``Github`` client configured with the token.

    Raises:
        ValueError: If GITHUB_TOKEN is empty or unset.
    """
    if not GITHUB_TOKEN:
        logging.error("GITHUB_TOKEN is not set in the environment")
        raise ValueError("GITHUB_TOKEN is not set in the environment")

    # Imported locally because this is the only place the Auth helpers are used.
    from github import Auth

    # Passing the token positionally (login_or_token) is deprecated in
    # PyGithub 2.x; an explicit Auth object is the supported form.
    client = Github(auth=Auth.Token(GITHUB_TOKEN))
    # Log success only once the client object actually exists.
    logging.info("Successfully initialized GitHub client")
    return client
61+
62+
63+
def get_committers_list(repo: Repository) -> List[str]:
    """
    Fetch the committers from the given repository.

    Downloads ``committers.html`` from ``repo`` and collects the text of every
    ``<div class="github_login">`` element it contains.

    Args:
        repo: Repository that contains ``committers.html``.

    Returns:
        The GitHub logins found in the page, in document order, with
        surrounding whitespace removed and empty entries dropped.
    """
    logging.info(f"Fetching committers from the repository {REPO_KAFKA_SITE}")
    committers_file: ContentFile = repo.get_contents("committers.html")
    content: bytes = committers_file.decoded_content
    soup: BeautifulSoup = BeautifulSoup(content, "html.parser")

    # Normalize the extracted text: whitespace inside the markup would
    # otherwise prevent a login from matching a contributor login, and an
    # empty element can never identify anyone.
    stripped_logins = (
        element.text.strip()
        for element in soup.find_all("div", class_="github_login")
    )
    committers: List[str] = [login for login in stripped_logins if login]
    logging.info(f"Found {len(committers)} committers")
    return committers
75+
76+
77+
def get_top_contributors(repo: Repository, committers: List[str]) -> List[str]:
    """
    Get top contributors for the given repository excluding committers.

    Contributors are ranked by the number of commits they authored during the
    last 365 days. Every contributor returned by ``repo.get_contributors()``
    that is not a committer is a candidate, so contributors without recent
    commits can still appear (with a count of zero) when fewer than
    TOP_N_CONTRIBUTORS candidates were active.

    Args:
        repo: Repository whose contributors and commits are inspected.
        committers: GitHub logins to exclude from the result.

    Returns:
        At most TOP_N_CONTRIBUTORS logins, most active first. Ties keep the
        order in which ``repo.get_contributors()`` yielded them.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from collections import Counter

    logging.info(f"Fetching contributors from the repository {REPO_KAFKA}")
    one_year_ago: datetime = datetime.now() - timedelta(days=365)

    # Tally recent commits per author in a single pass instead of rescanning
    # the whole commit list for every contributor. Commits without an
    # associated GitHub user (author is None) cannot be attributed and are
    # skipped.
    last_year_commits: PaginatedList[Commit] = repo.get_commits(since=one_year_ago)
    commits_by_login = Counter(
        commit.author.login
        for commit in last_year_commits
        if commit.author is not None
    )

    # Set membership keeps the committer check O(1) per contributor.
    committer_logins = set(committers)
    contributors: Dict[str, int] = {
        contributor.login: commits_by_login[contributor.login]
        for contributor in repo.get_contributors()
        if contributor.login not in committer_logins
    }

    # sorted() is stable, so equal counts keep their insertion order.
    sorted_contributors: List[Tuple[str, int]] = sorted(
        contributors.items(), key=lambda item: item[1], reverse=True
    )

    top_contributors = [login for login, _ in sorted_contributors][:TOP_N_CONTRIBUTORS]
    logging.info(
        f"Found {len(top_contributors)} top contributors who are not committers"
    )
    return top_contributors
103+
104+
105+
def update_local_yaml_content(yaml_file_path: str, collaborators: List[str]) -> None:
    """
    Update the local .asf.yaml file with refreshed GitHub whitelist and
    collaborators.

    Overwrites ``jenkins.github_whitelist`` and ``github.collaborators`` with
    ``collaborators`` and writes the document back to the same path.

    Args:
        yaml_file_path: Path of the YAML file to rewrite in place.
        collaborators: GitHub logins to store under both keys.

    Raises:
        KeyError: If the file has no ``jenkins`` or ``github`` section.
        OSError: If the file cannot be read or written.
    """
    logging.info(
        f"Updating {yaml_file_path} with {len(collaborators)} new collaborators"
    )

    yaml: YAML = YAML()
    with open(yaml_file_path, "r", encoding="utf-8") as file:
        yaml_content: dict = yaml.load(file)

    yaml_content["jenkins"]["github_whitelist"] = collaborators
    # A distinct list object is stored here; presumably this keeps the dumper
    # from emitting a YAML anchor/alias pair for the two keys - TODO confirm.
    yaml_content["github"]["collaborators"] = collaborators.copy()

    # Serialize in memory first: opening the target in "w" mode truncates it,
    # so a failure during dumping must not happen after that point.
    buffer = io.StringIO()
    yaml.dump(yaml_content, buffer)

    with open(yaml_file_path, "w", encoding="utf-8") as file:
        file.write(buffer.getvalue())

    logging.info(f"Local file {yaml_file_path} updated successfully")
125+
126+
127+
def main() -> None:
    """
    Refresh the collaborators stored in the local .asf.yaml file.

    Reads the committer logins from REPO_KAFKA_SITE, determines the top
    non-committer contributors of REPO_KAFKA, and writes them to
    ASF_YAML_PATH.
    """
    client: Github = get_github_client()

    # Step 1: committer logins come from the website repository.
    site_repo: Repository = client.get_repo(REPO_KAFKA_SITE)
    current_committers: List[str] = get_committers_list(site_repo)

    # Step 2: rank contributors of the code repository and persist the result.
    code_repo: Repository = client.get_repo(REPO_KAFKA)
    update_local_yaml_content(
        ASF_YAML_PATH,
        get_top_contributors(code_repo, current_committers),
    )
137+
138+
139+
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # logging.exception emits the same "Error: ..." line at ERROR level and
        # appends the traceback, which the plain error() call discarded.
        logging.exception(f"Error: {e}")
        # Exit non-zero so shells and CI jobs can detect the failure instead
        # of seeing a successful run.
        raise SystemExit(1) from e

committer-tools/requirements.txt

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
beautifulsoup4==4.12.3
19+
PyGithub==2.4.0
20+
ruamel.yaml==0.18.6

0 commit comments

Comments
 (0)