
Commit 4370628

Add source code files to repo
1 parent 64764c6 commit 4370628

7 files changed: +473, -0 lines changed

.github/workflows/cml.yaml

+63
@@ -0,0 +1,63 @@
name: CML Report
on: pull_request
jobs:
  run:
    runs-on: [ubuntu-latest]
    steps:
      - uses: iterative/setup-cml@v2
      - uses: iterative/setup-dvc@v1
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      # Needed for https://github.com/iterative/example-repos-dev/issues/225
      - name: Installs JSON5
        run: npm install -g json5
      - name: Generate metrics report
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cml ci
          if [ $GITHUB_REF = refs/heads/main ]; then
            PREVIOUS_REF=HEAD~1
          else
            PREVIOUS_REF=main
            git fetch origin main:main
          fi

          dvc pull eval
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets ROC | json5 > vega.json
          vl2svg vega.json roc.svg

          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Precision-Recall | json5 > vega.json
          vl2svg vega.json prc.svg

          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Confusion-Matrix | json5 > vega.json
          vl2svg vega.json confusion.svg

          cp eval/plots/images/importance.png importance_workspace.png

          git checkout $PREVIOUS_REF -- dvc.lock
          cp eval/plots/images/importance.png importance_previous.png

          dvc_report=$(dvc exp diff $PREVIOUS_REF --md)

          cat <<EOF > report.md
          # CML Report
          ## Plots
          ![ROC](./roc.svg)
          ![Precision-Recall](./prc.svg)
          ![Confusion Matrix](./confusion.svg)
          #### Feature Importance: ${PREVIOUS_REF}
          ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png)
          #### Feature Importance: workspace
          ![Feature Importance: workspace](./importance_workspace.png)

          ## Metrics and Params
          ### ${PREVIOUS_REF} → workspace
          ${dvc_report}
          EOF

          cml comment create --publish --pr=false report.md

params.yaml

+13
@@ -0,0 +1,13 @@
prepare:
  split: 0.20
  seed: 20170428

featurize:
  max_features: 100
  ngrams: 1

train:
  seed: 20170428
  n_est: 50
  min_split: 0.01
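Each pipeline stage reads only its own section of this file. A minimal sketch of how a stage consumes it (assuming PyYAML, as src/featurization.py below does):

import yaml

# Load just the `featurize` section; other stages read their own keys.
with open("params.yaml") as fd:
    params = yaml.safe_load(fd)["featurize"]

print(params["max_features"])  # 100
print(params["ngrams"])        # 1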

src/evaluate.py

+112
@@ -0,0 +1,112 @@
import json
import math
import os
import pickle
import sys

import pandas as pd
from sklearn import metrics
from sklearn import tree
from dvclive import Live
from matplotlib import pyplot as plt


def evaluate(model, matrix, split, live, save_path):
    """
    Dump all evaluation metrics and plots for given datasets.

    Args:
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        split (str): Dataset name.
        live (dvclive.Live): Dvclive instance.
        save_path (str): Path to save the metrics.
    """
    labels = matrix[:, 1].toarray().astype(int)
    x = matrix[:, 2:]

    predictions_by_class = model.predict_proba(x)
    predictions = predictions_by_class[:, 1]

    # Use dvclive to log a few simple metrics...
    avg_prec = metrics.average_precision_score(labels, predictions)
    roc_auc = metrics.roc_auc_score(labels, predictions)
    if not live.summary:
        live.summary = {"avg_prec": {}, "roc_auc": {}}
    live.summary["avg_prec"][split] = avg_prec
    live.summary["roc_auc"][split] = roc_auc

    # ... and plots...
    # ... like an roc plot...
    live.log_sklearn_plot("roc", labels, predictions, name=f"roc/{split}")
    # ... and precision recall plot...
    # ... which passes `drop_intermediate=True` to the sklearn method...
    live.log_sklearn_plot(
        "precision_recall",
        labels,
        predictions,
        name=f"prc/{split}",
        drop_intermediate=True,
    )
    # ... and confusion matrix plot
    live.log_sklearn_plot(
        "confusion_matrix",
        labels.squeeze(),
        predictions_by_class.argmax(-1),
        name=f"cm/{split}",
    )


def save_importance_plot(live, model, feature_names):
    """
    Save feature importance plot.

    Args:
        live (dvclive.Live): DVCLive instance.
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        feature_names (list): List of feature names.
    """
    fig, axes = plt.subplots(dpi=100)
    fig.subplots_adjust(bottom=0.2, top=0.95)
    axes.set_ylabel("Mean decrease in impurity")

    importances = model.feature_importances_
    forest_importances = pd.Series(importances, index=feature_names).nlargest(n=30)
    forest_importances.plot.bar(ax=axes)

    live.log_image("importance.png", fig)


def main():
    EVAL_PATH = "eval"

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython evaluate.py model features\n")
        sys.exit(1)

    model_file = sys.argv[1]
    train_file = os.path.join(sys.argv[2], "train.pkl")
    test_file = os.path.join(sys.argv[2], "test.pkl")

    # Load model and data.
    with open(model_file, "rb") as fd:
        model = pickle.load(fd)

    with open(train_file, "rb") as fd:
        train, feature_names = pickle.load(fd)

    with open(test_file, "rb") as fd:
        test, _ = pickle.load(fd)

    # Evaluate train and test datasets.
    with Live(EVAL_PATH) as live:
        evaluate(model, train, "train", live, save_path=EVAL_PATH)
        evaluate(model, test, "test", live, save_path=EVAL_PATH)

        # Dump feature importance plot.
        save_importance_plot(live, model, feature_names)


if __name__ == "__main__":
    main()
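evaluate() above relies on the matrix layout produced by save_matrix() in src/featurization.py: column 0 holds ids, column 1 labels, and columns 2 onward the features. A minimal sketch with made-up numbers, just to illustrate the slicing:

import numpy as np
import scipy.sparse as sparse

# Hypothetical 3-row matrix: column 0 = id, column 1 = label, columns 2+ = features.
matrix = sparse.csr_matrix(np.array([
    [101, 0, 0.1, 0.0],
    [102, 1, 0.0, 0.7],
    [103, 1, 0.3, 0.2],
]))

labels = matrix[:, 1].toarray().astype(int)  # [[0], [1], [1]]
x = matrix[:, 2:]                            # feature columns only, shape (3, 2)
print(labels.squeeze(), x.shape)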

src/featurization.py

+136
@@ -0,0 +1,136 @@
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scipy.sparse as sparse
import yaml
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_df(data):
    """Read the input data file and return a data frame."""
    df = pd.read_csv(
        data,
        encoding="utf-8",
        header=None,
        delimiter="\t",
        names=["id", "label", "text"],
    )
    sys.stderr.write(f"The input data frame {data} size is {df.shape}\n")
    return df


def save_matrix(df, matrix, names, output):
    """
    Save the matrix to a pickle file.

    Args:
        df (pandas.DataFrame): Input data frame.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        names (list): List of feature names.
        output (str): Output file name.
    """
    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    msg = "The output matrix {} size is {} and data type is {}\n"
    sys.stderr.write(msg.format(output, result.shape, result.dtype))

    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)


def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Generate train feature matrix.

    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_train = get_df(train_input)
    train_words = np.array(df_train.text.str.lower().values)

    bag_of_words.fit(train_words)

    train_words_binary_matrix = bag_of_words.transform(train_words)
    feature_names = bag_of_words.get_feature_names_out()

    tfidf.fit(train_words_binary_matrix)
    train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)

    save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)


def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Generate test feature matrix.

    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_test = get_df(test_input)
    test_words = np.array(df_test.text.str.lower().values)

    test_words_binary_matrix = bag_of_words.transform(test_words)
    test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
    feature_names = bag_of_words.get_feature_names_out()

    save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)


def main():
    params = yaml.safe_load(open("params.yaml"))["featurize"]

    np.set_printoptions(suppress=True)

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)

    in_path = sys.argv[1]
    out_path = sys.argv[2]

    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")

    max_features = params["max_features"]
    ngrams = params["ngrams"]

    os.makedirs(out_path, exist_ok=True)

    bag_of_words = CountVectorizer(
        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
    )
    tfidf = TfidfTransformer(smooth_idf=False)

    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )

    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )


if __name__ == "__main__":
    main()
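The featurization is the standard scikit-learn bag-of-words plus TF-IDF combination. A small self-contained sketch on hypothetical toy sentences, with the same vectorizer settings as above, shows the shapes involved:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = np.array(["the cat sat", "the dog barked", "a dog and a cat"])

bag_of_words = CountVectorizer(
    stop_words="english", max_features=100, ngram_range=(1, 1)
)
counts = bag_of_words.fit_transform(docs)  # sparse term counts
tfidf = TfidfTransformer(smooth_idf=False).fit(counts)
features = tfidf.transform(counts)         # sparse TF-IDF matrix

print(bag_of_words.get_feature_names_out())  # e.g. ['barked' 'cat' 'dog' 'sat']
print(features.shape)                        # (3, 4) here: 3 docs, 4 surviving terms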
