Commit
feat(notebook): Update notebook to align with IBM Gallery (#17)
* fix(chi2): update chi2 to keep single character

* chore(all): reformat everything

* feat(notebook): align notebook with the studio version, with minor doc updates

* test(end2end): update end2end test

* chore(version): update version to 1.1.0

* chore(license): update license information
haodeqi authored and navneetrao committed Nov 15, 2019
1 parent ea17fda commit 6a4c729
Showing 16 changed files with 1,326 additions and 785 deletions.
2 changes: 1 addition & 1 deletion _version.py
@@ -1,4 +1,4 @@
# Semantic versioning
# MAJOR.MINOR.PATCH

__version__ = '1.0.2'
__version__ = '1.1.0'
2 changes: 1 addition & 1 deletion assistant_dialog_skill_analysis/__init__.py
@@ -1 +1 @@
__version__ = '1.0.2'
__version__ = "1.1.0"

Large diffs are not rendered by default.

210 changes: 128 additions & 82 deletions assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py
@@ -13,9 +13,11 @@ def _label_percentage(data_frame):
:return: label_percentage_dict: dictionary maps label : % of labels
"""
total_examples = len(data_frame)
label_frequency_dict = dict(Counter(data_frame['intent']).most_common())
percentage_list = np.array(list(label_frequency_dict.values()))/total_examples
label_percentage_dict = dict(zip(list(label_frequency_dict.keys()), percentage_list))
label_frequency_dict = dict(Counter(data_frame["intent"]).most_common())
percentage_list = np.array(list(label_frequency_dict.values())) / total_examples
label_percentage_dict = dict(
zip(list(label_frequency_dict.keys()), percentage_list)
)
return label_percentage_dict

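For reference, the reformatted _label_percentage helper turns intent counts into a share-of-data mapping. A minimal standalone sketch of the same idea, using a toy data frame invented purely for illustration:

from collections import Counter

import numpy as np
import pandas as pd

# Toy workspace data, invented for illustration only.
df = pd.DataFrame({"intent": ["greet", "greet", "bye"], "utterance": ["hi", "hello", "ciao"]})
frequency = dict(Counter(df["intent"]).most_common())        # {"greet": 2, "bye": 1}
percentages = np.array(list(frequency.values())) / len(df)   # [0.667, 0.333]
label_percentage = dict(zip(frequency.keys(), percentages))
print(label_percentage)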

@@ -26,15 +28,17 @@ def _train_test_coloring(val):
:return:
"""
if val > 25:
color = 'red'
color = "red"
elif val > 10:
color = 'DarkBlue'
color = "DarkBlue"
else:
color = 'green'
return 'color: %s' % color
color = "green"
return "color: %s" % color

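_train_test_coloring returns a CSS string per cell and is applied later in this file through pandas' Styler.applymap. A small sketch of how that cell-level coloring works, with a toy frame and a hypothetical coloring function:

from IPython.display import display
import pandas as pd

def color_distribution_gap(val):
    # Same pattern as _train_test_coloring: map a numeric cell to a CSS color string.
    if val > 25:
        return "color: red"
    elif val > 10:
        return "color: DarkBlue"
    return "color: green"

toy = pd.DataFrame({"Intent": ["a", "b", "c"], "Absolute Difference %": [30.0, 12.5, 2.0]})
display(toy.style.applymap(color_distribution_gap, subset=["Absolute Difference %"]))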

def _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict):
def _train_test_label_difference(
workspace_label_percentage_dict, test_label_percentage_dict
):
"""
analyze the difference between training set and test set
:param workspace_label_percentage_dict:
@@ -66,9 +70,11 @@ def _train_test_label_difference(workspace_label_percentage_dict, test_label_per
current_difference = np.abs(test_percentage - workspace_percentage)

if key in test_label_percentage_dict:
difference_dict[key] = [workspace_percentage*100,
test_percentage*100,
current_difference*100]
difference_dict[key] = [
workspace_percentage * 100,
test_percentage * 100,
current_difference * 100,
]

js_distance = distance.jensenshannon(distribution1, distribution2, 2.0)

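The distribution comparison ends by measuring how far the train and test intent distributions diverge with scipy's Jensen-Shannon distance. A minimal sketch of that call on two aligned probability vectors (the numbers here are made up):

import numpy as np
from scipy.spatial import distance

# Hypothetical intent distributions over the same three intents, in the same order.
train_dist = np.array([0.5, 0.3, 0.2])
test_dist = np.array([0.4, 0.4, 0.2])

# With base 2.0 the Jensen-Shannon distance is bounded in [0, 1];
# the analyzer later rounds it and reports it as a percentage.
js = distance.jensenshannon(train_dist, test_dist, 2.0)
print(round(js, 4))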
@@ -86,8 +92,8 @@ def _train_test_vocab_difference(train_set_pd, test_set_pd):
"""
train_vocab = set()
test_vocab = set()
train_set_tokens = train_set_pd['utterance'].apply(word_tokenize)
test_set_tokens = test_set_pd['utterance'].apply(word_tokenize)
train_set_tokens = train_set_pd["utterance"].apply(word_tokenize)
test_set_tokens = test_set_pd["utterance"].apply(word_tokenize)

for tokens in train_set_tokens.tolist():
train_vocab.update(tokens)
@@ -107,24 +113,26 @@ def _train_test_utterance_length_difference(train_set_pd, test_set_pd):
train_test_legnth_comparison: pandas dataframe [Intent, Absolute Difference]
"""
train_pd_temp = train_set_pd.copy()
train_pd_temp['tokens'] = train_set_pd['utterance'].apply(word_tokenize)
train_pd_temp['Train'] = train_pd_temp['tokens'].apply(len)
train_avg_len_by_label = train_pd_temp[['intent', 'Train']].groupby('intent').mean()
train_pd_temp["tokens"] = train_set_pd["utterance"].apply(word_tokenize)
train_pd_temp["Train"] = train_pd_temp["tokens"].apply(len)
train_avg_len_by_label = train_pd_temp[["intent", "Train"]].groupby("intent").mean()

test_pd_temp = test_set_pd.copy()
test_pd_temp['tokens'] = test_set_pd['utterance'].apply(word_tokenize)
test_pd_temp['Test'] = test_pd_temp['tokens'].apply(len)
test_avg_len_by_label = test_pd_temp[['intent', 'Test']].groupby('intent').mean()

train_test_length_comparison = pd.merge(train_avg_len_by_label,
test_avg_len_by_label, on='intent')
train_test_length_comparison['Absolute Difference'] = \
np.abs(train_test_length_comparison['Train'] - train_test_length_comparison['Test'])
test_pd_temp["tokens"] = test_set_pd["utterance"].apply(word_tokenize)
test_pd_temp["Test"] = test_pd_temp["tokens"].apply(len)
test_avg_len_by_label = test_pd_temp[["intent", "Test"]].groupby("intent").mean()

train_test_length_comparison = pd.merge(
train_avg_len_by_label, test_avg_len_by_label, on="intent"
)
train_test_length_comparison["Absolute Difference"] = np.abs(
train_test_length_comparison["Train"] - train_test_length_comparison["Test"]
)
train_test_length_comparison = train_test_length_comparison.sort_values(
by=["Absolute Difference"], ascending=False)
by=["Absolute Difference"], ascending=False
)
train_test_length_comparison = train_test_length_comparison.reset_index()
train_test_length_comparison.rename(columns={'intent':'Intent'
}, inplace=True)
train_test_length_comparison.rename(columns={"intent": "Intent"}, inplace=True)
return train_test_length_comparison

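The utterance-length check tokenizes every example, averages token counts per intent with a groupby, then merges the train and test averages on the intent index. A compact sketch of that pattern with invented data (it assumes nltk and its punkt tokenizer models are available):

import numpy as np
import pandas as pd
from nltk import word_tokenize  # requires the nltk 'punkt' data to be downloaded

train = pd.DataFrame({"intent": ["greet", "greet"], "utterance": ["hi there", "hello"]})
test = pd.DataFrame({"intent": ["greet"], "utterance": ["good morning to you"]})

def average_length(df, column_name):
    # Average number of tokens per utterance, grouped by intent.
    temp = df.copy()
    temp[column_name] = temp["utterance"].apply(word_tokenize).apply(len)
    return temp[["intent", column_name]].groupby("intent").mean()

comparison = pd.merge(average_length(train, "Train"), average_length(test, "Test"), on="intent")
comparison["Absolute Difference"] = np.abs(comparison["Train"] - comparison["Test"])
print(comparison.reset_index())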

@@ -137,8 +145,8 @@ def _get_metrics(results):
recall_dict: maps the {intent: recall}
f1_dict: maps the {intent:f1}
"""
groundtruth = results['correct_intent'].values.tolist()
top_intent = results['top_intent'].values.tolist()
groundtruth = results["correct_intent"].values.tolist()
top_intent = results["top_intent"].values.tolist()
gt_cnt_dict = dict()
pred_cnt_dict = dict()
true_positive_dict = dict()
@@ -152,13 +160,22 @@ def _get_metrics(results):
f1_dict = dict()
for lb in true_positive_dict:

recall_dict[lb] = true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0

precision_dict[lb] = true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict \
else 0

f1_dict[lb] = 0.0 if recall_dict[lb] == 0 and precision_dict[lb] == 0 \
else 2.0 * recall_dict[lb] * precision_dict[lb] / (recall_dict[lb] + precision_dict[lb])
recall_dict[lb] = (
true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0
)

precision_dict[lb] = (
true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict else 0
)

f1_dict[lb] = (
0.0
if recall_dict[lb] == 0 and precision_dict[lb] == 0
else 2.0
* recall_dict[lb]
* precision_dict[lb]
/ (recall_dict[lb] + precision_dict[lb])
)
return precision_dict, recall_dict, f1_dict

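_get_metrics derives per-intent precision, recall, and F1 directly from counts of ground-truth labels, predicted labels, and exact matches. A small worked sketch of that arithmetic, with invented label lists:

from collections import Counter

groundtruth = ["greet", "greet", "bye", "bye"]
top_intent = ["greet", "bye", "bye", "bye"]

gt_cnt = Counter(groundtruth)
pred_cnt = Counter(top_intent)
tp_cnt = Counter(g for g, p in zip(groundtruth, top_intent) if g == p)

for label in gt_cnt:
    recall = tp_cnt[label] / gt_cnt[label] if gt_cnt[label] else 0.0
    precision = tp_cnt[label] / pred_cnt[label] if pred_cnt[label] else 0.0
    f1 = 0.0 if precision + recall == 0 else 2.0 * precision * recall / (precision + recall)
    # greet -> precision 1.0, recall 0.5, f1 0.67; bye -> precision 0.67, recall 1.0, f1 0.8
    print(label, round(precision, 2), round(recall, 2), round(f1, 2))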

@@ -172,12 +189,14 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
workspace_label_percentage_dict = _label_percentage(train_set_pd)
test_label_percentage_dict = _label_percentage(test_set_pd)

missing_label, difference_dict, js = \
_train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict)
missing_label, difference_dict, js = _train_test_label_difference(
workspace_label_percentage_dict, test_label_percentage_dict
)
train_vocab, test_vocab = _train_test_vocab_difference(train_set_pd, test_set_pd)

train_test_length_comparison_pd = \
_train_test_utterance_length_difference(train_set_pd, test_set_pd)
train_test_length_comparison_pd = _train_test_utterance_length_difference(
train_set_pd, test_set_pd
)

display(Markdown("## Test Data Evaluation"))

@@ -186,35 +205,43 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
label = list(difference_dict.keys())
diff = np.round(list(difference_dict.values()), 2)
precision_dict, recall_dict, f1_dict = _get_metrics(results)
precision = np.round([precision_dict[l]*100.0 if l in precision_dict else 0.0
for l in label], 2)
precision = np.round(
[precision_dict[l] * 100.0 if l in precision_dict else 0.0 for l in label],
2,
)

recall = np.round([recall_dict[l]*100.0 if l in recall_dict else 0.0 for l in label], 2)
recall = np.round(
[recall_dict[l] * 100.0 if l in recall_dict else 0.0 for l in label], 2
)

f1 = np.round([f1_dict[l]*100.0 if l in f1_dict else 0.0 for l in label], 2)
f1 = np.round([f1_dict[l] * 100.0 if l in f1_dict else 0.0 for l in label], 2)

train_count_dict = dict(Counter(train_set_pd['intent']))
test_count_dict = dict(Counter(test_set_pd['intent']))
train_count_dict = dict(Counter(train_set_pd["intent"]))
test_count_dict = dict(Counter(test_set_pd["intent"]))
tr_cnt = [train_count_dict[l] if l in train_count_dict else 0.0 for l in label]
te_cnt = [test_count_dict[l] if l in test_count_dict else 0.0 for l in label]

difference_pd = pd.DataFrame({"Intent": label,
"% of Train": diff[:, 0],
"% of Test": diff[:, 1],
"Absolute Difference %": diff[:, 2],
"Train Examples": tr_cnt,
"Test Examples": te_cnt,
"Test Precision %": precision,
"Test Recall %": recall,
"Test F1 %": f1})

if not difference_pd[difference_pd["Absolute Difference %"] > .001].empty:
table_for_display = difference_pd[difference_pd["Absolute Difference %"]
> .001].sort_values(by=["Absolute Difference %"],
ascending=False)
table_for_display = \
table_for_display.style.applymap(_train_test_coloring,
subset=pd.IndexSlice[:, ["Absolute Difference %"]])
difference_pd = pd.DataFrame(
{
"Intent": label,
"% of Train": diff[:, 0],
"% of Test": diff[:, 1],
"Absolute Difference %": diff[:, 2],
"Train Examples": tr_cnt,
"Test Examples": te_cnt,
"Test Precision %": precision,
"Test Recall %": recall,
"Test F1 %": f1,
}
)

if not difference_pd[difference_pd["Absolute Difference %"] > 0.001].empty:
table_for_display = difference_pd[
difference_pd["Absolute Difference %"] > 0.001
].sort_values(by=["Absolute Difference %"], ascending=False)
table_for_display = table_for_display.style.applymap(
_train_test_coloring, subset=pd.IndexSlice[:, ["Absolute Difference %"]]
)
display(table_for_display)
display(Markdown("\n"))
display(Markdown("Distribution Mismatch Color Code"))
@@ -223,42 +250,61 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
display(Markdown("<font color = 'green'> Green - Good </font>"))

if js >= 0:
js = np.round(js, 2)*100
display(Markdown("### Data Distribution Divergence Test vs Train \
<font color='blue'>{}%</font>" .format(js)))
js = np.round(js, 2) * 100
display(
Markdown(
"### Data Distribution Divergence Test vs Train \
<font color='blue'>{}%</font>".format(
js
)
)
)
display(Markdown("**Note** Metric used is Jensen Shannon Distance"))

if missing_label:
display(Markdown("### Missing Intents in Test Data"))
missing_label_pd = pd.DataFrame(missing_label,
columns=["Missing Intents in Test Set "])
missing_label_pd.index = np.arange(1, len(missing_label_pd)+1)
missing_label_pd = pd.DataFrame(
missing_label, columns=["Missing Intents in Test Set "]
)
missing_label_pd.index = np.arange(1, len(missing_label_pd) + 1)
display(missing_label_pd)

display(Markdown("### Test Data Example Length"))
condition1 = (train_test_length_comparison_pd["Absolute Difference"] /
train_test_length_comparison_pd["Train"] > .3)
condition2 = (train_test_length_comparison_pd["Absolute Difference"] > 3)
condition1 = (
train_test_length_comparison_pd["Absolute Difference"]
/ train_test_length_comparison_pd["Train"]
> 0.3
)
condition2 = train_test_length_comparison_pd["Absolute Difference"] > 3

length_comparison_pd = train_test_length_comparison_pd[condition1 & condition2]

if not length_comparison_pd.empty:
display(Markdown(
"Divergence found in average length of user examples in test vs training data"))
length_comparison_pd.index = np.arange(1, len(length_comparison_pd)+1)
display(
Markdown(
"Divergence found in average length of user examples in test vs training data"
)
)
length_comparison_pd.index = np.arange(1, len(length_comparison_pd) + 1)
display(length_comparison_pd.round(2))
else:
display(Markdown("Average length of user examples is comparable"))

if train_vocab and test_vocab:
display(Markdown("### Vocabulary Size Test vs Train"))
oov_vocab_percentage = (len(test_vocab) - len(train_vocab.intersection(test_vocab))) \
/ len(test_vocab)*100

vocab_df = pd.DataFrame(data={
'Train Vocabulary Size': [len(train_vocab)],
'Test Vocabulary Size': [len(test_vocab)],
'% Test Set Vocabulary not found in Train': [oov_vocab_percentage]})
oov_vocab_percentage = (
(len(test_vocab) - len(train_vocab.intersection(test_vocab)))
/ len(test_vocab)
* 100
)

vocab_df = pd.DataFrame(
data={
"Train Vocabulary Size": [len(train_vocab)],
"Test Vocabulary Size": [len(test_vocab)],
"% Test Set Vocabulary not found in Train": [oov_vocab_percentage],
}
)
vocab_df.index = np.arange(1, len(vocab_df) + 1)
display(vocab_df.round(2))

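The vocabulary comparison boils down to set arithmetic: the share of the test vocabulary never seen in training. A minimal sketch with invented token sets:

train_vocab = {"refund", "order", "cancel", "status"}
test_vocab = {"refund", "order", "track", "status"}

# Percentage of test-set vocabulary that does not appear in the training vocabulary.
oov_vocab_percentage = (len(test_vocab) - len(train_vocab & test_vocab)) / len(test_vocab) * 100
print(oov_vocab_percentage)  # 25.0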
@@ -5,7 +5,7 @@
from IPython.display import display, Markdown, HTML


def ambiguous_examples_analysis(workspace_pd, threshold=.7):
def ambiguous_examples_analysis(workspace_pd, threshold=0.7):
"""
Analyze the test workspace and find out similar utterances that belongs to different intent
:param workspace_pd: pandas dataframe in format of [utterance,label]
@@ -15,31 +15,49 @@ def ambiguous_examples_analysis(workspace_pd, threshold=.7):
"""
# first create the feature matrix
vectorizer = CountVectorizer(ngram_range=(1, 2))
workspace_bow = vectorizer.fit_transform(workspace_pd['utterance']).todense()
workspace_bow = vectorizer.fit_transform(workspace_pd["utterance"]).todense()
cos_sim_score_matrix = _calculate_cosine_similarity(workspace_bow)

# remove the lower triangle of the matrix and apply threshold
similar_utterance_index = np.argwhere((cos_sim_score_matrix - np.tril(cos_sim_score_matrix))
> threshold)
similar_utterance_pd = pd.DataFrame(columns=['Intent1', 'Utterance1', 'Intent2', 'Utterance2',
'similarity score'])
similar_utterance_index = np.argwhere(
(cos_sim_score_matrix - np.tril(cos_sim_score_matrix)) > threshold
)
similar_utterance_pd = pd.DataFrame(
columns=["Intent1", "Utterance1", "Intent2", "Utterance2", "similarity score"]
)

for index in similar_utterance_index:
if workspace_pd['intent'].iloc[index[0]] != workspace_pd['intent'].iloc[index[1]]:
intent1 = workspace_pd['intent'].iloc[index[0]]
utterance1 = workspace_pd['utterance'].iloc[index[0]]
intent2 = workspace_pd['intent'].iloc[index[1]]
utterance2 = workspace_pd['utterance'].iloc[index[1]]
if (
workspace_pd["intent"].iloc[index[0]]
!= workspace_pd["intent"].iloc[index[1]]
):
intent1 = workspace_pd["intent"].iloc[index[0]]
utterance1 = workspace_pd["utterance"].iloc[index[0]]
intent2 = workspace_pd["intent"].iloc[index[1]]
utterance2 = workspace_pd["utterance"].iloc[index[1]]
score = cos_sim_score_matrix[index[0], index[1]]
temp_pd = pd.DataFrame(
{'Intent1': [intent1], 'Utterance1': [utterance1], 'Intent2': [intent2],
'Utterance2': [utterance2], 'similarity score': [score]})
similar_utterance_pd = similar_utterance_pd.append(temp_pd, ignore_index=True)
{
"Intent1": [intent1],
"Utterance1": [utterance1],
"Intent2": [intent2],
"Utterance2": [utterance2],
"similarity score": [score],
}
)
similar_utterance_pd = similar_utterance_pd.append(
temp_pd, ignore_index=True
)

if not similar_utterance_pd.empty:
with pd.option_context('max_colwidth', 250):
display(HTML(similar_utterance_pd.sort_values(by=['similarity score'],
ascending=False).to_html(index=False)))
with pd.option_context("max_colwidth", 250):
display(
HTML(
similar_utterance_pd.sort_values(
by=["similarity score"], ascending=False
).to_html(index=False)
)
)
else:
display(Markdown("### There are no similar utterances within different Intent"))

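The ambiguity check builds a unigram/bigram bag-of-words matrix, scores all utterance pairs by cosine similarity, and keeps only upper-triangle entries above the threshold so each pair is reported once. A self-contained sketch of that pipeline; the private _calculate_cosine_similarity helper is stood in for here by sklearn's cosine_similarity, and the utterances are invented:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

workspace = pd.DataFrame(
    {
        "intent": ["billing", "shipping", "billing"],
        "utterance": ["where is my invoice", "where is my package", "send my invoice"],
    }
)

vectorizer = CountVectorizer(ngram_range=(1, 2))
bow = vectorizer.fit_transform(workspace["utterance"])
scores = cosine_similarity(bow)

# Subtracting the lower triangle zeroes the diagonal and duplicate entries,
# so each utterance pair is considered exactly once.
pairs = np.argwhere((scores - np.tril(scores)) > 0.7)
for i, j in pairs:
    if workspace["intent"].iloc[i] != workspace["intent"].iloc[j]:
        print(
            workspace["utterance"].iloc[i],
            "<->",
            workspace["utterance"].iloc[j],
            round(scores[i, j], 2),
        )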