diff --git a/_version.py b/_version.py index 8b77a9a..9297fba 100644 --- a/_version.py +++ b/_version.py @@ -1,4 +1,4 @@ # Semantic versioning # MAJOR.MINOR.PATCH -__version__ = '1.0.2' +__version__ = '1.1.0' diff --git a/assistant_dialog_skill_analysis/__init__.py b/assistant_dialog_skill_analysis/__init__.py index a6221b3..6849410 100644 --- a/assistant_dialog_skill_analysis/__init__.py +++ b/assistant_dialog_skill_analysis/__init__.py @@ -1 +1 @@ -__version__ = '1.0.2' +__version__ = "1.1.0" diff --git a/assistant_dialog_skill_analysis/confidence_analysis/confidence_analyzer.py b/assistant_dialog_skill_analysis/confidence_analysis/confidence_analyzer.py index 17df373..5b31442 100644 --- a/assistant_dialog_skill_analysis/confidence_analysis/confidence_analyzer.py +++ b/assistant_dialog_skill_analysis/confidence_analysis/confidence_analyzer.py @@ -17,51 +17,92 @@ def abnormal_conf(full_results, correct_thresh, incorrect_thresh): :return: """ test_pd = pd.DataFrame(full_results) - test_pd = test_pd.loc[~(test_pd['correct_intent'] == 'SYSTEM_OUT_OF_DOMAIN')] - correct = test_pd.loc[test_pd['correct_intent'] == test_pd['top_intent']] + test_pd = test_pd.loc[~(test_pd["correct_intent"] == "SYSTEM_OUT_OF_DOMAIN")] + correct = test_pd.loc[test_pd["correct_intent"] == test_pd["top_intent"]] - correct_low_conf = correct.loc[correct['top_confidence'] < correct_thresh] + correct_low_conf = correct.loc[correct["top_confidence"] < correct_thresh] correct_low_conf = correct_low_conf[ - ['correct_intent', 'utterance', 'top_confidence', 'top_intent']] + ["correct_intent", "utterance", "top_confidence", "top_intent"] + ] - incorrect = test_pd.loc[~(test_pd['correct_intent'] == test_pd['top_intent'])] - incorrect_high_conf = incorrect.loc[incorrect['top_confidence'] > incorrect_thresh] + incorrect = test_pd.loc[~(test_pd["correct_intent"] == test_pd["top_intent"])] + incorrect_high_conf = incorrect.loc[incorrect["top_confidence"] > incorrect_thresh] top1 = list() top2 = list() top3 = list() for i in range(len(incorrect_high_conf)): - possible_range = len(incorrect_high_conf.iloc[i, :]['top_predicts']) + possible_range = len(incorrect_high_conf.iloc[i, :]["top_predicts"]) for j in range(3): if j == 0: if possible_range >= 1: - top1.append(incorrect_high_conf.iloc[i, :]['top_predicts'][j]['intent'] + ' ' + - '(' + str(np.round(incorrect_high_conf.iloc[i, :]['top_predicts'][j] - ['confidence'], 3)) + ')') + top1.append( + incorrect_high_conf.iloc[i, :]["top_predicts"][j]["intent"] + + " " + + "(" + + str( + np.round( + incorrect_high_conf.iloc[i, :]["top_predicts"][j][ + "confidence" + ], + 3, + ) + ) + + ")" + ) else: - top1.append('NA') + top1.append("NA") if j == 1: if possible_range >= 2: - top2.append(incorrect_high_conf.iloc[i, :]['top_predicts'][j]['intent'] + ' ' + - '(' + str(np.round(incorrect_high_conf.iloc[i, :]['top_predicts'][j] - ['confidence'], 3)) + ')') + top2.append( + incorrect_high_conf.iloc[i, :]["top_predicts"][j]["intent"] + + " " + + "(" + + str( + np.round( + incorrect_high_conf.iloc[i, :]["top_predicts"][j][ + "confidence" + ], + 3, + ) + ) + + ")" + ) else: - top2.append('NA') + top2.append("NA") if j == 2: if possible_range >= 3: - top3.append(incorrect_high_conf.iloc[i, :]['top_predicts'][j]['intent'] + ' ' + - '(' + str(np.round(incorrect_high_conf.iloc[i, :]['top_predicts'][j] - ['confidence'], 3)) + ')') + top3.append( + incorrect_high_conf.iloc[i, :]["top_predicts"][j]["intent"] + + " " + + "(" + + str( + np.round( + incorrect_high_conf.iloc[i, :]["top_predicts"][j][ + 
"confidence" + ], + 3, + ) + ) + + ")" + ) else: - top3.append('NA') + top3.append("NA") - incorrect_high_conf['top1_prediction'] = top1 - incorrect_high_conf['top2_prediction'] = top2 - incorrect_high_conf['top3_prediction'] = top3 + incorrect_high_conf["top1_prediction"] = top1 + incorrect_high_conf["top2_prediction"] = top2 + incorrect_high_conf["top3_prediction"] = top3 incorrect_high_conf = incorrect_high_conf[ - ['correct_intent', 'utterance', 'top1_prediction', 'top2_prediction', 'top3_prediction']] + [ + "correct_intent", + "utterance", + "top1_prediction", + "top2_prediction", + "top3_prediction", + ] + ] return correct_low_conf, incorrect_high_conf @@ -79,13 +120,13 @@ def analysis(results, intent_list=None): analysis_df = analysis_pipeline(results) return analysis_df - if len(intent_list) == 1 and intent_list[0] == 'ALL_INTENTS': - intent_list = list(results['correct_intent'].unique()) + if len(intent_list) == 1 and intent_list[0] == "ALL_INTENTS": + intent_list = list(results["correct_intent"].unique()) if OFFTOPIC_LABEL in intent_list: intent_list.remove(OFFTOPIC_LABEL) analysis_df_list = list() for intent_name in intent_list: - display(Markdown('### Threshold Analysis for Intent: {}'.format(intent_name))) + display(Markdown("### Threshold Analysis for Intent: {}".format(intent_name))) analysis_df = analysis_pipeline(results, intent_name) if all(analysis_df): analysis_df.index = np.arange(1, len(analysis_df) + 1) @@ -94,37 +135,58 @@ def analysis(results, intent_list=None): return analysis_df_list + def _display_analysis_metrics(display_far): """display the explanation for analysis metrics""" display(Markdown("### Threshold Metrics")) - display(Markdown( - "We calculate metrics for responses where the top intent has a confidence above the \ - threshold specified on the x-axis. ")) - - display(Markdown( - "We consider examples which are within the scope of the chatbot's problem formulation as \ + display( + Markdown( + "We calculate metrics for responses where the top intent has a confidence above the \ + threshold specified on the x-axis. 
" + ) + ) + + display( + Markdown( + "We consider examples which are within the scope of the chatbot's problem formulation as \ on topic or in domain and those examples which are outside the scope of the problem to be \ - out of domain or irrelevant")) + out of domain or irrelevant" + ) + ) display(Markdown("#### 1) Thresholded On Topic Accuracy (TOA)")) - display(Markdown( - "x-axis: Confidence threshold used || " + - "y-axis: Intent Detection Accuracy for On Topic utterances")) + display( + Markdown( + "x-axis: Confidence threshold used || " + + "y-axis: Intent Detection Accuracy for On Topic utterances" + ) + ) display(Markdown("#### 2) Bot Coverage %")) - display(Markdown( - "x-axis: Confidence threshold used || " + - "y-axis: Fraction of All utterances above the threshold")) + display( + Markdown( + "x-axis: Confidence threshold used || " + + "y-axis: Fraction of All utterances above the threshold" + ) + ) if display_far: - display(Markdown("#### 3) False Acceptance Rate for Out of Domain Examples (FAR)")) - display(Markdown( - "x-axis: Confidence threshold used || " + - "y-axis: Fraction of Out of Domain utterances falsely considered on topic")) - - display(Markdown( - "#### Note: Default acceptance threshold for Watson Assistant is set at 0.2.\ - Utterances with top intent confidence < 0.2 will be considered irrelevant")) + display( + Markdown("#### 3) False Acceptance Rate for Out of Domain Examples (FAR)") + ) + display( + Markdown( + "x-axis: Confidence threshold used || " + + "y-axis: Fraction of Out of Domain utterances falsely considered on topic" + ) + ) + + display( + Markdown( + "#### Note: Default acceptance threshold for Watson Assistant is set at 0.2.\ + Utterances with top intent confidence < 0.2 will be considered irrelevant" + ) + ) def generate_unique_thresholds(sorted_results_tuples): @@ -135,8 +197,12 @@ def generate_unique_thresholds(sorted_results_tuples): """ sort_uniq_confs = list(sorted(set([info[2] for info in sorted_results_tuples]))) thresholds = [0] - thresholds.extend([(sort_uniq_confs[idx] + sort_uniq_confs[idx + 1]) / 2 - for idx in range(len(sort_uniq_confs) - 1)]) + thresholds.extend( + [ + (sort_uniq_confs[idx] + sort_uniq_confs[idx + 1]) / 2 + for idx in range(len(sort_uniq_confs) - 1) + ] + ) return thresholds, sort_uniq_confs @@ -202,7 +268,7 @@ def _get_bot_coverage_list(sorted_infos, thresholds): cur_bot_coverage -= 1 current_step += 1 bot_coverage_count_list.append(cur_bot_coverage) - bot_coverage_list.append(cur_bot_coverage/tol) + bot_coverage_list.append(cur_bot_coverage / tol) return bot_coverage_list, bot_coverage_count_list @@ -226,7 +292,7 @@ def _get_far_list(sorted_infos, thresholds): current_step += 1 else: break - far_list.append(cur_fa_count/tol) + far_list.append(cur_fa_count / tol) far_count.append(cur_fa_count) return far_list, far_count @@ -240,27 +306,38 @@ def _convert_data_format(results, intent_name=None): :return: result_list: list of tuples of (ground_truth, prediction, confidence) sorted by conf """ if intent_name: - results = results[(results['correct_intent'] == intent_name) | - (results['top_intent'] == intent_name)].copy() - - results['correct_intent'] = np.where((results['correct_intent'] != - results['top_intent']) & - (results['top_intent'] == intent_name), - OFFTOPIC_LABEL, - results['correct_intent']) - - results_list = [(gt, pred, conf) for gt, pred, conf in - zip(results['correct_intent'], - results['top_intent'], - results['top_confidence'])] + results = results[ + (results["correct_intent"] == intent_name) + 
| (results["top_intent"] == intent_name) + ].copy() + + results["correct_intent"] = np.where( + (results["correct_intent"] != results["top_intent"]) + & (results["top_intent"] == intent_name), + OFFTOPIC_LABEL, + results["correct_intent"], + ) + + results_list = [ + (gt, pred, conf) + for gt, pred, conf in zip( + results["correct_intent"], + results["top_intent"], + results["top_confidence"], + ) + ] results_list = sorted(results_list, key=lambda x: x[2]) else: - results_list = [(truth, prediction, confidence) for truth, prediction, confidence - in zip(results['correct_intent'], - results['top_intent'], - results['top_confidence'])] + results_list = [ + (truth, prediction, confidence) + for truth, prediction, confidence in zip( + results["correct_intent"], + results["top_intent"], + results["top_confidence"], + ) + ] results_list = sorted(results_list, key=lambda x: x[2]) return results_list @@ -273,11 +350,13 @@ def extract_by_topic(sorted_results): :return: ontopic_infos, list """ - offtopic_infos = [prediction for prediction in sorted_results - if prediction[0] == OFFTOPIC_LABEL] + offtopic_infos = [ + prediction for prediction in sorted_results if prediction[0] == OFFTOPIC_LABEL + ] - ontopic_infos = [prediction for prediction in sorted_results - if prediction[0] != OFFTOPIC_LABEL] + ontopic_infos = [ + prediction for prediction in sorted_results if prediction[0] != OFFTOPIC_LABEL + ] return ontopic_infos, offtopic_infos @@ -295,23 +374,32 @@ def analysis_pipeline(results, intent_name=None): # if ontopic counts or sorted results are less than 3, the graph will show almost no variation # if all confidence of the predicted result are the same, there will be no variation - if len(ontopic_infos) < 3 or len(sorted_results) < 3 \ - or all(ele[2] == sorted_results[0][2] for ele in sorted_results): - display(Markdown('**Inadequate Data Points**: No analysis will be conducted')) + if ( + len(ontopic_infos) < 3 + or len(sorted_results) < 3 + or all(ele[2] == sorted_results[0][2] for ele in sorted_results) + ): + display(Markdown("**Inadequate Data Points**: No analysis will be conducted")) analysis_df = pd.DataFrame() return analysis_df - analysis_df, toa_list, bot_coverage_list, far_list, thresholds = \ - extract_table_analysis(sorted_results, - ontopic_infos, - offtopic_infos) + ( + analysis_df, + toa_list, + bot_coverage_list, + far_list, + thresholds, + ) = extract_table_analysis(sorted_results, ontopic_infos, offtopic_infos) if not intent_name and not analysis_df.empty: - line_graph_data = pd.DataFrame(data={'Thresholded On Topic Accuracy': toa_list, - 'Bot Coverage %': bot_coverage_list, - 'False Acceptance Rate (FAR) for Out of Domain Examples': - far_list}, - index=thresholds) + line_graph_data = pd.DataFrame( + data={ + "Thresholded On Topic Accuracy": toa_list, + "Bot Coverage %": bot_coverage_list, + "False Acceptance Rate (FAR) for Out of Domain Examples": far_list, + }, + index=thresholds, + ) create_threshold_graph(line_graph_data) @@ -332,25 +420,32 @@ def extract_table_analysis(sorted_results, ontopic_infos, offtopic_infos): thresholds, sort_uniq_confs = generate_unique_thresholds(sorted_results) toa_list, toa_count = _get_ontopic_accuracy_list(sorted_results, thresholds) - bot_coverage_list, bot_coverage_count = _get_bot_coverage_list(sorted_results, thresholds) + bot_coverage_list, bot_coverage_count = _get_bot_coverage_list( + sorted_results, thresholds + ) if len(offtopic_infos) >= OFFTOPIC_CNT_THRESHOLD_FOR_DISPLAY: far_list, _ = _get_far_list(sorted_results, thresholds) 
else: - display(Markdown( - 'Out of Domain examples fewer than **%d** thus \ - no False Acceptance Rate (FAR) calculated' - % OFFTOPIC_CNT_THRESHOLD_FOR_DISPLAY)) - far_list = [-1]*len(thresholds) - - analysis_df = create_display_table(toa_list, - bot_coverage_list, - bot_coverage_count, - sorted_results, - thresholds, - offtopic_infos, - far_list) + display( + Markdown( + "Out of Domain examples fewer than **%d** thus \ + no False Acceptance Rate (FAR) calculated" + % OFFTOPIC_CNT_THRESHOLD_FOR_DISPLAY + ) + ) + far_list = [-1] * len(thresholds) + + analysis_df = create_display_table( + toa_list, + bot_coverage_list, + bot_coverage_count, + sorted_results, + thresholds, + offtopic_infos, + far_list, + ) return analysis_df, toa_list, bot_coverage_list, far_list, thresholds @@ -361,21 +456,24 @@ def create_threshold_graph(data): :param data: :return: None """ - sns.set(rc={'figure.figsize': (20.7, 10.27)}) + sns.set(rc={"figure.figsize": (20.7, 10.27)}) plt.ylim(0, 1.1) - plt.axvline(.2, 0, 1) + plt.axvline(0.2, 0, 1) plot = sns.lineplot(data=data, palette="tab10", linewidth=3.5) - plt.setp(plot.legend().get_texts(), fontsize='22') - plot.set_xlabel('Threshold T', fontsize=18) - plot.set_ylabel('Metrics mentioned above', fontsize=18) - -def create_display_table(toa_list, - bot_coverage_list, - bot_coverage_count, - sorted_results, - thresholds, - offtopic_infos, - far_list): + plt.setp(plot.legend().get_texts(), fontsize="22") + plot.set_xlabel("Threshold T", fontsize=18) + plot.set_ylabel("Metrics mentioned above", fontsize=18) + + +def create_display_table( + toa_list, + bot_coverage_list, + bot_coverage_count, + sorted_results, + thresholds, + offtopic_infos, + far_list, +): """ create table for display purpose :param toa_list: @@ -388,20 +486,28 @@ def create_display_table(toa_list, :return: analysis_df, pandas dataframe containing metrics at intervals of 10% """ # produce the threhold quantiles for extraction of relevant information - display_thresholds = [t/100 for t in range(0, 100, 10)] + display_thresholds = [t / 100 for t in range(0, 100, 10)] display_indexes = [_find_threshold(t, thresholds) for t in display_thresholds] analysis_data = dict() - analysis_data['Threshold (T)'] = display_thresholds - analysis_data['Ontopic Accuracy (TOA)'] = [toa_list[idx]*100 for idx in display_indexes] - analysis_data['Bot Coverage %'] = [bot_coverage_list[idx]*100 for idx in display_indexes] - analysis_data['Bot Coverage Counts'] = [str(np.round(bot_coverage_count[idx], decimals=0)) - + ' / ' + str(len(sorted_results)) - for idx in display_indexes] + analysis_data["Threshold (T)"] = display_thresholds + analysis_data["Ontopic Accuracy (TOA)"] = [ + toa_list[idx] * 100 for idx in display_indexes + ] + analysis_data["Bot Coverage %"] = [ + bot_coverage_list[idx] * 100 for idx in display_indexes + ] + analysis_data["Bot Coverage Counts"] = [ + str(np.round(bot_coverage_count[idx], decimals=0)) + + " / " + + str(len(sorted_results)) + for idx in display_indexes + ] if len(offtopic_infos) >= OFFTOPIC_CNT_THRESHOLD_FOR_DISPLAY: - analysis_data['False Acceptance Rate (FAR)'] = [far_list[idx]*100 for - idx in display_indexes] + analysis_data["False Acceptance Rate (FAR)"] = [ + far_list[idx] * 100 for idx in display_indexes + ] analysis_df = pd.DataFrame(data=analysis_data) return analysis_df diff --git a/assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py b/assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py index cb49d2a..a7a7abe 100644 --- 
a/assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py +++ b/assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py @@ -13,9 +13,11 @@ def _label_percentage(data_frame): :return: label_percentage_dict: dictionary maps label : % of labels """ total_examples = len(data_frame) - label_frequency_dict = dict(Counter(data_frame['intent']).most_common()) - percentage_list = np.array(list(label_frequency_dict.values()))/total_examples - label_percentage_dict = dict(zip(list(label_frequency_dict.keys()), percentage_list)) + label_frequency_dict = dict(Counter(data_frame["intent"]).most_common()) + percentage_list = np.array(list(label_frequency_dict.values())) / total_examples + label_percentage_dict = dict( + zip(list(label_frequency_dict.keys()), percentage_list) + ) return label_percentage_dict @@ -26,15 +28,17 @@ def _train_test_coloring(val): :return: """ if val > 25: - color = 'red' + color = "red" elif val > 10: - color = 'DarkBlue' + color = "DarkBlue" else: - color = 'green' - return 'color: %s' % color + color = "green" + return "color: %s" % color -def _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict): +def _train_test_label_difference( + workspace_label_percentage_dict, test_label_percentage_dict +): """ analyze the difference between training set and test set :param workspace_label_percentage_dict: @@ -66,9 +70,11 @@ def _train_test_label_difference(workspace_label_percentage_dict, test_label_per current_difference = np.abs(test_percentage - workspace_percentage) if key in test_label_percentage_dict: - difference_dict[key] = [workspace_percentage*100, - test_percentage*100, - current_difference*100] + difference_dict[key] = [ + workspace_percentage * 100, + test_percentage * 100, + current_difference * 100, + ] js_distance = distance.jensenshannon(distribution1, distribution2, 2.0) @@ -86,8 +92,8 @@ def _train_test_vocab_difference(train_set_pd, test_set_pd): """ train_vocab = set() test_vocab = set() - train_set_tokens = train_set_pd['utterance'].apply(word_tokenize) - test_set_tokens = test_set_pd['utterance'].apply(word_tokenize) + train_set_tokens = train_set_pd["utterance"].apply(word_tokenize) + test_set_tokens = test_set_pd["utterance"].apply(word_tokenize) for tokens in train_set_tokens.tolist(): train_vocab.update(tokens) @@ -107,24 +113,26 @@ def _train_test_utterance_length_difference(train_set_pd, test_set_pd): train_test_legnth_comparison: pandas dataframe [Intent, Absolute Difference] """ train_pd_temp = train_set_pd.copy() - train_pd_temp['tokens'] = train_set_pd['utterance'].apply(word_tokenize) - train_pd_temp['Train'] = train_pd_temp['tokens'].apply(len) - train_avg_len_by_label = train_pd_temp[['intent', 'Train']].groupby('intent').mean() + train_pd_temp["tokens"] = train_set_pd["utterance"].apply(word_tokenize) + train_pd_temp["Train"] = train_pd_temp["tokens"].apply(len) + train_avg_len_by_label = train_pd_temp[["intent", "Train"]].groupby("intent").mean() test_pd_temp = test_set_pd.copy() - test_pd_temp['tokens'] = test_set_pd['utterance'].apply(word_tokenize) - test_pd_temp['Test'] = test_pd_temp['tokens'].apply(len) - test_avg_len_by_label = test_pd_temp[['intent', 'Test']].groupby('intent').mean() - - train_test_length_comparison = pd.merge(train_avg_len_by_label, - test_avg_len_by_label, on='intent') - train_test_length_comparison['Absolute Difference'] = \ - np.abs(train_test_length_comparison['Train'] - train_test_length_comparison['Test']) + test_pd_temp["tokens"] = 
test_set_pd["utterance"].apply(word_tokenize) + test_pd_temp["Test"] = test_pd_temp["tokens"].apply(len) + test_avg_len_by_label = test_pd_temp[["intent", "Test"]].groupby("intent").mean() + + train_test_length_comparison = pd.merge( + train_avg_len_by_label, test_avg_len_by_label, on="intent" + ) + train_test_length_comparison["Absolute Difference"] = np.abs( + train_test_length_comparison["Train"] - train_test_length_comparison["Test"] + ) train_test_length_comparison = train_test_length_comparison.sort_values( - by=["Absolute Difference"], ascending=False) + by=["Absolute Difference"], ascending=False + ) train_test_length_comparison = train_test_length_comparison.reset_index() - train_test_length_comparison.rename(columns={'intent':'Intent' - }, inplace=True) + train_test_length_comparison.rename(columns={"intent": "Intent"}, inplace=True) return train_test_length_comparison @@ -137,8 +145,8 @@ def _get_metrics(results): recall_dict: maps the {intent: recall} f1_dict: maps the {intent:f1} """ - groundtruth = results['correct_intent'].values.tolist() - top_intent = results['top_intent'].values.tolist() + groundtruth = results["correct_intent"].values.tolist() + top_intent = results["top_intent"].values.tolist() gt_cnt_dict = dict() pred_cnt_dict = dict() true_positive_dict = dict() @@ -152,13 +160,22 @@ def _get_metrics(results): f1_dict = dict() for lb in true_positive_dict: - recall_dict[lb] = true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0 - - precision_dict[lb] = true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict \ - else 0 - - f1_dict[lb] = 0.0 if recall_dict[lb] == 0 and precision_dict[lb] == 0 \ - else 2.0 * recall_dict[lb] * precision_dict[lb] / (recall_dict[lb] + precision_dict[lb]) + recall_dict[lb] = ( + true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0 + ) + + precision_dict[lb] = ( + true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict else 0 + ) + + f1_dict[lb] = ( + 0.0 + if recall_dict[lb] == 0 and precision_dict[lb] == 0 + else 2.0 + * recall_dict[lb] + * precision_dict[lb] + / (recall_dict[lb] + precision_dict[lb]) + ) return precision_dict, recall_dict, f1_dict @@ -172,12 +189,14 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results): workspace_label_percentage_dict = _label_percentage(train_set_pd) test_label_percentage_dict = _label_percentage(test_set_pd) - missing_label, difference_dict, js = \ - _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict) + missing_label, difference_dict, js = _train_test_label_difference( + workspace_label_percentage_dict, test_label_percentage_dict + ) train_vocab, test_vocab = _train_test_vocab_difference(train_set_pd, test_set_pd) - train_test_length_comparison_pd = \ - _train_test_utterance_length_difference(train_set_pd, test_set_pd) + train_test_length_comparison_pd = _train_test_utterance_length_difference( + train_set_pd, test_set_pd + ) display(Markdown("## Test Data Evaluation")) @@ -186,35 +205,43 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results): label = list(difference_dict.keys()) diff = np.round(list(difference_dict.values()), 2) precision_dict, recall_dict, f1_dict = _get_metrics(results) - precision = np.round([precision_dict[l]*100.0 if l in precision_dict else 0.0 - for l in label], 2) + precision = np.round( + [precision_dict[l] * 100.0 if l in precision_dict else 0.0 for l in label], + 2, + ) - recall = np.round([recall_dict[l]*100.0 if l in recall_dict else 0.0 for l in label], 2) + 
recall = np.round( + [recall_dict[l] * 100.0 if l in recall_dict else 0.0 for l in label], 2 + ) - f1 = np.round([f1_dict[l]*100.0 if l in f1_dict else 0.0 for l in label], 2) + f1 = np.round([f1_dict[l] * 100.0 if l in f1_dict else 0.0 for l in label], 2) - train_count_dict = dict(Counter(train_set_pd['intent'])) - test_count_dict = dict(Counter(test_set_pd['intent'])) + train_count_dict = dict(Counter(train_set_pd["intent"])) + test_count_dict = dict(Counter(test_set_pd["intent"])) tr_cnt = [train_count_dict[l] if l in train_count_dict else 0.0 for l in label] te_cnt = [test_count_dict[l] if l in test_count_dict else 0.0 for l in label] - difference_pd = pd.DataFrame({"Intent": label, - "% of Train": diff[:, 0], - "% of Test": diff[:, 1], - "Absolute Difference %": diff[:, 2], - "Train Examples": tr_cnt, - "Test Examples": te_cnt, - "Test Precision %": precision, - "Test Recall %": recall, - "Test F1 %": f1}) - - if not difference_pd[difference_pd["Absolute Difference %"] > .001].empty: - table_for_display = difference_pd[difference_pd["Absolute Difference %"] - > .001].sort_values(by=["Absolute Difference %"], - ascending=False) - table_for_display = \ - table_for_display.style.applymap(_train_test_coloring, - subset=pd.IndexSlice[:, ["Absolute Difference %"]]) + difference_pd = pd.DataFrame( + { + "Intent": label, + "% of Train": diff[:, 0], + "% of Test": diff[:, 1], + "Absolute Difference %": diff[:, 2], + "Train Examples": tr_cnt, + "Test Examples": te_cnt, + "Test Precision %": precision, + "Test Recall %": recall, + "Test F1 %": f1, + } + ) + + if not difference_pd[difference_pd["Absolute Difference %"] > 0.001].empty: + table_for_display = difference_pd[ + difference_pd["Absolute Difference %"] > 0.001 + ].sort_values(by=["Absolute Difference %"], ascending=False) + table_for_display = table_for_display.style.applymap( + _train_test_coloring, subset=pd.IndexSlice[:, ["Absolute Difference %"]] + ) display(table_for_display) display(Markdown("\n")) display(Markdown("Distribution Mismatch Color Code")) @@ -223,42 +250,61 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results): display(Markdown(" Green - Good ")) if js >= 0: - js = np.round(js, 2)*100 - display(Markdown("### Data Distribution Divergence Test vs Train \ - {}%" .format(js))) + js = np.round(js, 2) * 100 + display( + Markdown( + "### Data Distribution Divergence Test vs Train \ + {}%".format( + js + ) + ) + ) display(Markdown("**Note** Metric used is Jensen Shannon Distance")) if missing_label: display(Markdown("### Missing Intents in Test Data")) - missing_label_pd = pd.DataFrame(missing_label, - columns=["Missing Intents in Test Set "]) - missing_label_pd.index = np.arange(1, len(missing_label_pd)+1) + missing_label_pd = pd.DataFrame( + missing_label, columns=["Missing Intents in Test Set "] + ) + missing_label_pd.index = np.arange(1, len(missing_label_pd) + 1) display(missing_label_pd) display(Markdown("### Test Data Example Length")) - condition1 = (train_test_length_comparison_pd["Absolute Difference"] / - train_test_length_comparison_pd["Train"] > .3) - condition2 = (train_test_length_comparison_pd["Absolute Difference"] > 3) + condition1 = ( + train_test_length_comparison_pd["Absolute Difference"] + / train_test_length_comparison_pd["Train"] + > 0.3 + ) + condition2 = train_test_length_comparison_pd["Absolute Difference"] > 3 length_comparison_pd = train_test_length_comparison_pd[condition1 & condition2] if not length_comparison_pd.empty: - display(Markdown( - "Divergence found in average length of 
user examples in test vs training data")) - length_comparison_pd.index = np.arange(1, len(length_comparison_pd)+1) + display( + Markdown( + "Divergence found in average length of user examples in test vs training data" + ) + ) + length_comparison_pd.index = np.arange(1, len(length_comparison_pd) + 1) display(length_comparison_pd.round(2)) else: display(Markdown("Average length of user examples is comparable")) if train_vocab and test_vocab: display(Markdown("### Vocabulary Size Test vs Train")) - oov_vocab_percentage = (len(test_vocab) - len(train_vocab.intersection(test_vocab))) \ - / len(test_vocab)*100 - - vocab_df = pd.DataFrame(data={ - 'Train Vocabulary Size': [len(train_vocab)], - 'Test Vocabulary Size': [len(test_vocab)], - '% Test Set Vocabulary not found in Train': [oov_vocab_percentage]}) + oov_vocab_percentage = ( + (len(test_vocab) - len(train_vocab.intersection(test_vocab))) + / len(test_vocab) + * 100 + ) + + vocab_df = pd.DataFrame( + data={ + "Train Vocabulary Size": [len(train_vocab)], + "Test Vocabulary Size": [len(test_vocab)], + "% Test Set Vocabulary not found in Train": [oov_vocab_percentage], + } + ) vocab_df.index = np.arange(1, len(vocab_df) + 1) display(vocab_df.round(2)) diff --git a/assistant_dialog_skill_analysis/data_analysis/similarity_analyzer.py b/assistant_dialog_skill_analysis/data_analysis/similarity_analyzer.py index 271c424..84b7bfb 100644 --- a/assistant_dialog_skill_analysis/data_analysis/similarity_analyzer.py +++ b/assistant_dialog_skill_analysis/data_analysis/similarity_analyzer.py @@ -5,7 +5,7 @@ from IPython.display import display, Markdown, HTML -def ambiguous_examples_analysis(workspace_pd, threshold=.7): +def ambiguous_examples_analysis(workspace_pd, threshold=0.7): """ Analyze the test workspace and find out similar utterances that belongs to different intent :param workspace_pd: pandas dataframe in format of [utterance,label] @@ -15,31 +15,49 @@ def ambiguous_examples_analysis(workspace_pd, threshold=.7): """ # first create the feature matrix vectorizer = CountVectorizer(ngram_range=(1, 2)) - workspace_bow = vectorizer.fit_transform(workspace_pd['utterance']).todense() + workspace_bow = vectorizer.fit_transform(workspace_pd["utterance"]).todense() cos_sim_score_matrix = _calculate_cosine_similarity(workspace_bow) # remove the lower triangle of the matrix and apply threshold - similar_utterance_index = np.argwhere((cos_sim_score_matrix - np.tril(cos_sim_score_matrix)) - > threshold) - similar_utterance_pd = pd.DataFrame(columns=['Intent1', 'Utterance1', 'Intent2', 'Utterance2', - 'similarity score']) + similar_utterance_index = np.argwhere( + (cos_sim_score_matrix - np.tril(cos_sim_score_matrix)) > threshold + ) + similar_utterance_pd = pd.DataFrame( + columns=["Intent1", "Utterance1", "Intent2", "Utterance2", "similarity score"] + ) for index in similar_utterance_index: - if workspace_pd['intent'].iloc[index[0]] != workspace_pd['intent'].iloc[index[1]]: - intent1 = workspace_pd['intent'].iloc[index[0]] - utterance1 = workspace_pd['utterance'].iloc[index[0]] - intent2 = workspace_pd['intent'].iloc[index[1]] - utterance2 = workspace_pd['utterance'].iloc[index[1]] + if ( + workspace_pd["intent"].iloc[index[0]] + != workspace_pd["intent"].iloc[index[1]] + ): + intent1 = workspace_pd["intent"].iloc[index[0]] + utterance1 = workspace_pd["utterance"].iloc[index[0]] + intent2 = workspace_pd["intent"].iloc[index[1]] + utterance2 = workspace_pd["utterance"].iloc[index[1]] score = cos_sim_score_matrix[index[0], index[1]] temp_pd = pd.DataFrame( - 
{'Intent1': [intent1], 'Utterance1': [utterance1], 'Intent2': [intent2], - 'Utterance2': [utterance2], 'similarity score': [score]}) - similar_utterance_pd = similar_utterance_pd.append(temp_pd, ignore_index=True) + { + "Intent1": [intent1], + "Utterance1": [utterance1], + "Intent2": [intent2], + "Utterance2": [utterance2], + "similarity score": [score], + } + ) + similar_utterance_pd = similar_utterance_pd.append( + temp_pd, ignore_index=True + ) if not similar_utterance_pd.empty: - with pd.option_context('max_colwidth', 250): - display(HTML(similar_utterance_pd.sort_values(by=['similarity score'], - ascending=False).to_html(index=False))) + with pd.option_context("max_colwidth", 250): + display( + HTML( + similar_utterance_pd.sort_values( + by=["similarity score"], ascending=False + ).to_html(index=False) + ) + ) else: display(Markdown("### There are no similar utterances within different Intent")) diff --git a/assistant_dialog_skill_analysis/data_analysis/summary_generator.py b/assistant_dialog_skill_analysis/data_analysis/summary_generator.py index 354b80a..ce2e3da 100644 --- a/assistant_dialog_skill_analysis/data_analysis/summary_generator.py +++ b/assistant_dialog_skill_analysis/data_analysis/summary_generator.py @@ -15,8 +15,8 @@ def generate_summary_statistics(data, entities_list=None): :return: """ - total_examples = len(data['utterance']) - label_frequency = Counter(data['intent']).most_common() + total_examples = len(data["utterance"]) + label_frequency = Counter(data["intent"]).most_common() number_of_labels = len(label_frequency) average_example_per_intent = np.average(list(dict(label_frequency).values())) standard_deviation_of_intent = np.std(list(dict(label_frequency).values())) @@ -25,19 +25,25 @@ def generate_summary_statistics(data, entities_list=None): characteristics.append(["Total User Examples", total_examples]) characteristics.append(["Unique Intents", number_of_labels]) characteristics.append( - ["Average User Examples per Intent", int(np.around(average_example_per_intent))]) + ["Average User Examples per Intent", int(np.around(average_example_per_intent))] + ) characteristics.append( - ["Standard Deviation from Average", int(np.around(standard_deviation_of_intent))]) + [ + "Standard Deviation from Average", + int(np.around(standard_deviation_of_intent)), + ] + ) if entities_list: characteristics.append(["Total Number of Entities", len(entities_list)]) else: characteristics.append(["Total Number of Entities", 0]) - df = pd.DataFrame(data=characteristics, columns=['Data Characteristic', 'Value']) - df.index = np.arange(1, len(df)+1) + df = pd.DataFrame(data=characteristics, columns=["Data Characteristic", "Value"]) + df.index = np.arange(1, len(df) + 1) display(Markdown("### Summary Statistics")) display(df) + def show_user_examples_per_intent(data): """ Take the workspace dictionary and display summary statistics regarding the workspace @@ -45,13 +51,14 @@ def show_user_examples_per_intent(data): :return: """ - label_frequency = Counter(data['intent']).most_common() + label_frequency = Counter(data["intent"]).most_common() frequencies = list(reversed(label_frequency)) - df = pd.DataFrame(data=frequencies, columns=['Intent', 'Number of User Examples']) + df = pd.DataFrame(data=frequencies, columns=["Intent", "Number of User Examples"]) df.index = np.arange(1, len(df) + 1) display(Markdown("### Sorted Distribution of User Examples per Intent")) display(df) + def scatter_plot_intent_dist(workspace_pd): """ takes the workspace_pd and generate a scatter distribution of 
the intents @@ -59,18 +66,22 @@ def scatter_plot_intent_dist(workspace_pd): :return: """ - label_frequency = Counter(workspace_pd['intent']).most_common() + label_frequency = Counter(workspace_pd["intent"]).most_common() frequencies = list(reversed(label_frequency)) counter_list = list(range(1, len(frequencies) + 1)) - df = pd.DataFrame(data=frequencies, columns=['Intent', 'Number of User Examples']) - df['Intent'] = counter_list - - sns.set(rc={'figure.figsize': (15, 10)}) - display(Markdown('##

Sorted Distribution of User Examples \ - per Intent

')) - - plt.ylabel('Number of User Examples', fontdict=LABEL_FONT) - plt.xlabel('Intent', fontdict=LABEL_FONT) + df = pd.DataFrame(data=frequencies, columns=["Intent", "Number of User Examples"]) + df["Intent"] = counter_list + + sns.set(rc={"figure.figsize": (15, 10)}) + display( + Markdown( + '##

Sorted Distribution of User Examples \ + per Intent

' + ) + ) + + plt.ylabel("Number of User Examples", fontdict=LABEL_FONT) + plt.xlabel("Intent", fontdict=LABEL_FONT) ax = sns.scatterplot(x="Intent", y="Number of User Examples", data=df, s=100) @@ -81,24 +92,46 @@ def class_imbalance_analysis(workspace_pd): :return: """ - label_frequency = Counter(workspace_pd['intent']).most_common() + label_frequency = Counter(workspace_pd["intent"]).most_common() frequencies = list(reversed(label_frequency)) min_class, min_class_len = frequencies[0] max_class, max_class_len = frequencies[-1] - if max_class_len >= 2*min_class_len: - display(Markdown("### Class Imbalance Detected \ - ")) - display(Markdown("- Data could be potentially biased towards intents with more user \ - examples")) - display(Markdown("- E.g. Intent < {} > has < {} > user examples while intent < {} > has \ - just < {} > user examples ".format(max_class, max_class_len, min_class, min_class_len))) + if max_class_len >= 2 * min_class_len: + display( + Markdown( + "### Class Imbalance Detected \ + " + ) + ) + display( + Markdown( + "- Data could be potentially biased towards intents with more user \ + examples" + ) + ) + display( + Markdown( + "- E.g. Intent < {} > has < {} > user examples while intent < {} > has \ + just < {} > user examples ".format( + max_class, max_class_len, min_class, min_class_len + ) + ) + ) flag = True else: - display(Markdown("### No Significant Class \ - Imbalance Detected ")) - display(Markdown("- Lower chances of inherent bias in classification towards intents with \ - more user examples")) + display( + Markdown( + "### No Significant Class \ + Imbalance Detected " + ) + ) + display( + Markdown( + "- Lower chances of inherent bias in classification towards intents with \ + more user examples" + ) + ) flag = False return flag diff --git a/assistant_dialog_skill_analysis/experimentation/data_manipulator.py b/assistant_dialog_skill_analysis/experimentation/data_manipulator.py index be94150..edf22ac 100644 --- a/assistant_dialog_skill_analysis/experimentation/data_manipulator.py +++ b/assistant_dialog_skill_analysis/experimentation/data_manipulator.py @@ -2,6 +2,7 @@ import random import numpy as np + def under_sampling(workspace, workspace_pd, quantile=None): """ Under sample data @@ -10,31 +11,40 @@ def under_sampling(workspace, workspace_pd, quantile=None): :param quantile: threshold to sample from :return train_workspace_data: list of intent json """ - label_frequency_dict = dict(Counter(workspace_pd['intent']).most_common()) + label_frequency_dict = dict(Counter(workspace_pd["intent"]).most_common()) train_workspace_data = list() if not quantile: - quantile = .75 - sampling_threshold = int(np.quantile(a=list(label_frequency_dict.values()), q=[quantile])[0]) + quantile = 0.75 + sampling_threshold = int( + np.quantile(a=list(label_frequency_dict.values()), q=[quantile])[0] + ) - for i in range(len(workspace['intents'])): + for i in range(len(workspace["intents"])): - if not workspace['intents'][i]['examples']: + if not workspace["intents"][i]["examples"]: continue - if label_frequency_dict[workspace['intents'][i]['intent']] > sampling_threshold: - intent = workspace['intents'][i] - sampling_index = list(np.arange(len(workspace['intents'][i]['examples']))) + if label_frequency_dict[workspace["intents"][i]["intent"]] > sampling_threshold: + intent = workspace["intents"][i] + sampling_index = list(np.arange(len(workspace["intents"][i]["examples"]))) random.shuffle(sampling_index) - train_examples = [intent['examples'][index] for index in - 
sampling_index[:sampling_threshold]] - train_workspace_data.append({'intent': workspace['intents'][i]['intent']}) - train_workspace_data[-1].update({'description': 'string'}) - train_workspace_data[-1].update({'examples': train_examples}) + train_examples = [ + intent["examples"][index] + for index in sampling_index[:sampling_threshold] + ] + train_workspace_data.append({"intent": workspace["intents"][i]["intent"]}) + train_workspace_data[-1].update({"description": "string"}) + train_workspace_data[-1].update({"examples": train_examples}) else: - train_workspace_data.append({'intent': workspace['intents'][i]['intent']}) - train_workspace_data[-1].update({'description': 'string'}) - train_workspace_data[-1].update({'examples': [example for example in - workspace['intents'][i]['examples']]}) + train_workspace_data.append({"intent": workspace["intents"][i]["intent"]}) + train_workspace_data[-1].update({"description": "string"}) + train_workspace_data[-1].update( + { + "examples": [ + example for example in workspace["intents"][i]["examples"] + ] + } + ) return train_workspace_data diff --git a/assistant_dialog_skill_analysis/highlighting/highlighter.py b/assistant_dialog_skill_analysis/highlighting/highlighter.py index 0208016..073c62f 100644 --- a/assistant_dialog_skill_analysis/highlighting/highlighter.py +++ b/assistant_dialog_skill_analysis/highlighting/highlighter.py @@ -15,12 +15,14 @@ NGRAM_RANGE = [1] -def get_highlights_in_batch_multi_thread(conversation, - workspace_id, - full_results, - output_folder, - confidence_threshold, - show_worst_k): +def get_highlights_in_batch_multi_thread( + conversation, + workspace_id, + full_results, + output_folder, + confidence_threshold, + show_worst_k, +): """ Given the prediction result, rank prediction results from worst to best & analyze the top k worst results. 
@@ -34,14 +36,23 @@ def get_highlights_in_batch_multi_thread(conversation, :return: """ wrong_examples_sorted = _filter_results(full_results, confidence_threshold) - display(Markdown("### Identified {} problematic utterances " - .format(len(wrong_examples_sorted)))) + display( + Markdown( + "### Identified {} problematic utterances ".format( + len(wrong_examples_sorted) + ) + ) + ) display(Markdown(" ")) - wrong_examples_sorted = wrong_examples_sorted[: show_worst_k] + wrong_examples_sorted = wrong_examples_sorted[:show_worst_k] - adversarial_results, adversarial_span_dict = _adversarial_examples_multi_thread_inference( - wrong_examples_sorted, conversation, workspace_id) + ( + adversarial_results, + adversarial_span_dict, + ) = _adversarial_examples_multi_thread_inference( + wrong_examples_sorted, conversation, workspace_id + ) if not adversarial_results.empty: @@ -51,11 +62,13 @@ def get_highlights_in_batch_multi_thread(conversation, label = skills_util.OFFTOPIC_LABEL else: label = original_example[2] - label_idx = label + '\t' + str(original_example[0]) + label_idx = label + "\t" + str(original_example[0]) adversarial_result_subset = adversarial_results[ - adversarial_results['correct_intent'] == label_idx] + adversarial_results["correct_intent"] == label_idx + ] highlight = _highlight_scoring( - original_example, adversarial_result_subset, adversarial_span_dict) + original_example, adversarial_result_subset, adversarial_span_dict + ) _plot_highlight(highlight, original_example, output_folder) @@ -70,47 +83,69 @@ def _filter_results(full_results, confidence_threshold): highlighting_candidates = list() for idx in range(len(full_results)): item = full_results.iloc[idx] - results_intent_list = [predict['intent'] for predict in item['top_predicts']] - result_dict = dict(item['top_predicts']) - if item['correct_intent'] in results_intent_list: - reference_position = results_intent_list.index(item['correct_intent']) + results_intent_list = [predict["intent"] for predict in item["top_predicts"]] + result_dict = dict(item["top_predicts"]) + if item["correct_intent"] in results_intent_list: + reference_position = results_intent_list.index(item["correct_intent"]) else: reference_position = len(results_intent_list) rank_score = 0 # for off-topic examples, rank score = off-topic confidence score - confidence threshold - if item['correct_intent'] == skills_util.OFFTOPIC_LABEL: - if item['top_confidence'] > confidence_threshold: - rank_score = item['top_confidence'] - confidence_threshold + if item["correct_intent"] == skills_util.OFFTOPIC_LABEL: + if item["top_confidence"] > confidence_threshold: + rank_score = item["top_confidence"] - confidence_threshold highlighting_candidates.append( - (idx, item['utterance'], None, item['top_intent'], - item['top_confidence'], rank_score, reference_position)) + ( + idx, + item["utterance"], + None, + item["top_intent"], + item["top_confidence"], + rank_score, + reference_position, + ) + ) else: - if(item['top_intent'] != item['correct_intent']) or ( - item['top_confidence'] <= confidence_threshold): - if item['top_intent'] != item['correct_intent']: - # for incorrectly predicted examples, if the correct intent is not in top 10 - # rank score = confidence of the predicted intent - if item['correct_intent'] not in result_dict: - rank_score = item['top_confidence'] + if (item["top_intent"] != item["correct_intent"]) or ( + item["top_confidence"] <= confidence_threshold + ): + if item["top_intent"] != item["correct_intent"]: + # for incorrectly predicted examples, 
if the correct intent is not in top 10 + # rank score = confidence of the predicted intent + if item["correct_intent"] not in result_dict: + rank_score = item["top_confidence"] else: # for incorrectly predicted examples, if the correct intent is in top 10, # rank score = confidence of predicted intent - confidence of correct intent - rank_score = item['top_confidence'] - result_dict[item['correct_intent']] - elif item['top_confidence'] <= confidence_threshold: + rank_score = ( + item["top_confidence"] - result_dict[item["correct_intent"]] + ) + elif item["top_confidence"] <= confidence_threshold: # for correctly predicted examples, if the predicted confidence is less than # confidence threshold, rank score = confidence threshold - predicted confidence - rank_score = confidence_threshold - item['top_confidence'] + rank_score = confidence_threshold - item["top_confidence"] highlighting_candidates.append( - (idx, item['utterance'], item['correct_intent'], - item['top_intent'], item['top_confidence'], - rank_score, reference_position)) - - highlighting_candidates_sorted = sorted(highlighting_candidates, - key=lambda x: x[5], reverse=True) - highlighting_candidates_sorted = [candidate for candidate in highlighting_candidates_sorted - if len(nltk.word_tokenize(candidate[1])) < MAX_TOKEN_LENGTH] + ( + idx, + item["utterance"], + item["correct_intent"], + item["top_intent"], + item["top_confidence"], + rank_score, + reference_position, + ) + ) + + highlighting_candidates_sorted = sorted( + highlighting_candidates, key=lambda x: x[5], reverse=True + ) + highlighting_candidates_sorted = [ + candidate + for candidate in highlighting_candidates_sorted + if len(nltk.word_tokenize(candidate[1])) < MAX_TOKEN_LENGTH + ] return highlighting_candidates_sorted @@ -127,45 +162,65 @@ def _plot_highlight(highlight, original_example, output_folder): else: label = original_example[2] fig, ax = plt.subplots(figsize=(2, 5)) - ax = sns.heatmap([[i] for i in highlight.tolist()], - yticklabels=nltk.word_tokenize(original_example[1]), - xticklabels=['Sensitivity to intent: ' + '"' + label + '"'], - cbar_kws={"orientation": "vertical"}, - linewidths=0, square=False, - cmap="Blues") + ax = sns.heatmap( + [[i] for i in highlight.tolist()], + yticklabels=nltk.word_tokenize(original_example[1]), + xticklabels=["Sensitivity to intent: " + '"' + label + '"'], + cbar_kws={"orientation": "vertical"}, + linewidths=0, + square=False, + cmap="Blues", + ) if output_folder: - conf_str = '%.3f' % (original_example[4]) + conf_str = "%.3f" % (original_example[4]) if original_example[2]: - filename = str(original_example[0]) + '_groundtruth_' + \ - original_example[2] + '_prediction_' + \ - original_example[3] + '_confidence_' + \ - conf_str + '.png' + filename = ( + str(original_example[0]) + + "_groundtruth_" + + original_example[2] + + "_prediction_" + + original_example[3] + + "_confidence_" + + conf_str + + ".png" + ) else: - filename = str(original_example[0]) + '_groundtruth_offtopic_prediction_' + \ - original_example[3] + '_confidence_' + \ - conf_str + '.png' + filename = ( + str(original_example[0]) + + "_groundtruth_offtopic_prediction_" + + original_example[3] + + "_confidence_" + + conf_str + + ".png" + ) save_path = os.path.join(output_folder, filename) plt.savefig(os.path.join(save_path), bbox_inches="tight") table = list() - table.append(['Test Set Index', original_example[0]]) - table.append(['Utterance', original_example[1]]) + table.append(["Test Set Index", original_example[0]]) + table.append(["Utterance", 
original_example[1]]) table.append( - ['Actual Intent', original_example[2] if ( - original_example[2]) else skills_util.OFFTOPIC_LABEL]) - table.append(['Predicted Intent', original_example[3]]) - table.append(['Confidence', original_example[4]]) - with pd.option_context('max_colwidth', 250): - df = pd.DataFrame(data=table, columns=['Characteristic', 'Value']) + [ + "Actual Intent", + original_example[2] + if (original_example[2]) + else skills_util.OFFTOPIC_LABEL, + ] + ) + table.append(["Predicted Intent", original_example[3]]) + table.append(["Confidence", original_example[4]]) + with pd.option_context("max_colwidth", 250): + df = pd.DataFrame(data=table, columns=["Characteristic", "Value"]) df.index = np.arange(1, len(df) + 1) display(df) plt.show() def _adversarial_examples_multi_thread_inference( - wrong_examples_sorted, conversation, workspace_id): + wrong_examples_sorted, conversation, workspace_id +): """ Perform multi threaded inference on all the adversarial examples :param wrong_examples_sorted: @@ -180,23 +235,32 @@ def _adversarial_examples_multi_thread_inference( for original_example in wrong_examples_sorted: adversarial_examples, adversarial_span = _generate_adversarial_examples( - original_example[1], original_example[0]) + original_example[1], original_example[0] + ) if not original_example[2]: label = skills_util.OFFTOPIC_LABEL else: label = original_example[2] - adversarial_label = label + '\t' + str(original_example[0]) + adversarial_label = label + "\t" + str(original_example[0]) all_adversarial_examples.extend(adversarial_examples) - all_adversarial_label_idx.extend([adversarial_label] * len(adversarial_examples)) + all_adversarial_label_idx.extend( + [adversarial_label] * len(adversarial_examples) + ) adversarial_span_dict.update(adversarial_span) adversarial_test_data_frame = pd.DataFrame( - {'utterance': all_adversarial_examples, 'intent': all_adversarial_label_idx}) - adversarial_results = inferencer.inference(conversation, workspace_id, - adversarial_test_data_frame, max_retries=10, - max_thread=5, verbose=False) + {"utterance": all_adversarial_examples, "intent": all_adversarial_label_idx} + ) + adversarial_results = inferencer.inference( + conversation, + workspace_id, + adversarial_test_data_frame, + max_retries=10, + max_thread=5, + verbose=False, + ) display(Markdown(" ")) return adversarial_results, adversarial_span_dict @@ -215,37 +279,38 @@ def _generate_adversarial_examples(utt, original_idx): tokens = utt.split() for idx in range(len(tokens)): for ngram in NGRAM_RANGE: - new_sent = ' '.join(tokens[:idx] + tokens[idx + ngram:]) + new_sent = " ".join(tokens[:idx] + tokens[idx + ngram :]) adversarial_examples.append(new_sent) adversarial_span[new_sent + "_" + str(original_idx)] = (idx, idx + ngram) return adversarial_examples, adversarial_span def _highlight_scoring( - original_example, subset_adversarial_result, adversarial_span_dict): + original_example, subset_adversarial_result, adversarial_span_dict +): """ Calculate the highlighting score using classification results of adversarial examples :param original_example: :param subset_adversarial_result: :param adversarial_span_dict: """ - original_utterance = ' '.join(nltk.word_tokenize(original_example[1])) + original_utterance = " ".join(nltk.word_tokenize(original_example[1])) original_idx = original_example[0] original_intent = original_example[3] original_confidence = original_example[4] original_position = original_example[6] - tokens = original_utterance.split(' ') - highlight = 
np.zeros(len(tokens), dtype='float32') + tokens = original_utterance.split(" ") + highlight = np.zeros(len(tokens), dtype="float32") for idx in range(len(subset_adversarial_result)): adversarial_example = subset_adversarial_result.iloc[idx] - if not adversarial_example['top_predicts']: + if not adversarial_example["top_predicts"]: continue predict_dict = dict() predict_intent_list = list() - for prediction in adversarial_example['top_predicts']: - predict_dict[prediction['intent']] = prediction['confidence'] - predict_intent_list.append(prediction['intent']) + for prediction in adversarial_example["top_predicts"]: + predict_dict[prediction["intent"]] = prediction["confidence"] + predict_intent_list.append(prediction["intent"]) if original_intent in predict_dict: adversarial_position = list(predict_dict.keys()).index(original_intent) @@ -254,27 +319,32 @@ def _highlight_scoring( adversarial_position = len(list(predict_dict.keys())) adversarial_confidence = 0 - start, end = adversarial_span_dict[adversarial_example['utterance'] + - '_' + str(original_idx)] + start, end = adversarial_span_dict[ + adversarial_example["utterance"] + "_" + str(original_idx) + ] - highlight = _scoring_function(highlight, - original_position, - adversarial_position, - original_confidence, - adversarial_confidence, - start, - end) + highlight = _scoring_function( + highlight, + original_position, + adversarial_position, + original_confidence, + adversarial_confidence, + start, + end, + ) return highlight -def _scoring_function(highlight, - original_position, - adversarial_position, - original_confidence, - adversarial_confidence, - start_idx, - end_idx): +def _scoring_function( + highlight, + original_position, + adversarial_position, + original_confidence, + adversarial_confidence, + start_idx, + end_idx, +): """ scoring function for highlighting of the interval start_idx:end_idx :param highlight: np.array of shape (n_tokens) @@ -289,7 +359,8 @@ def _scoring_function(highlight, # position difference accounts for the change in the position of the target intent among # the top 10 intents return by the message api position_difference = (1 / float(original_position + 1.0)) - ( - 1 / float(adversarial_position + 1.0)) + 1 / float(adversarial_position + 1.0) + ) # confidence difference accounts for the change in the confidence confidence_difference = original_confidence - adversarial_confidence @@ -299,9 +370,11 @@ def _scoring_function(highlight, # highlight score for the interval of start_idx:end_idx is a weighted average of # the position difference and confidence difference - weighted_difference = weight * ( - (0.2 * confidence_difference) + - (0.8 * position_difference)) / ngram_size + weighted_difference = ( + weight + * ((0.2 * confidence_difference) + (0.8 * position_difference)) + / ngram_size + ) highlight[start_idx:end_idx] += weighted_difference diff --git a/assistant_dialog_skill_analysis/inferencing/inferencer.py b/assistant_dialog_skill_analysis/inferencing/inferencer.py index 0ef389f..8e6daf4 100644 --- a/assistant_dialog_skill_analysis/inferencing/inferencer.py +++ b/assistant_dialog_skill_analysis/inferencing/inferencer.py @@ -5,7 +5,10 @@ from ..utils import skills_util from ..inferencing.multi_thread_inference import InferenceThread -def inference(conversation, workspace_id, test_data, max_retries=10, max_thread=5, verbose=False): + +def inference( + conversation, workspace_id, test_data, max_retries=10, max_thread=5, verbose=False +): """ query the message api to generate results on the test data 
:parameter: conversation: the conversation object produced by AssistantV1 api @@ -20,13 +23,16 @@ def inference(conversation, workspace_id, test_data, max_retries=10, max_thread= if max_thread == 1: reach_max_retry = False responses = [] - for test_example, ground_truth in zip(test_data['utterance'], test_data['intent']): + for test_example, ground_truth in zip( + test_data["utterance"], test_data["intent"] + ): attempt = 1 while attempt <= max_retries: try: prediction_json = skills_util.retrieve_classifier_response( - conversation, workspace_id, test_example, True) - time.sleep(.3) + conversation, workspace_id, test_example, True + ) + time.sleep(0.3) success_flag = True except Exception: @@ -39,34 +45,41 @@ def inference(conversation, workspace_id, test_data, max_retries=10, max_thread= reach_max_retry = True if reach_max_retry: - raise Exception('Maximum attempt of {} has reached'.format(max_retries)) - - if not prediction_json['intents']: - responses.append({'top_intent' : skills_util.OFFTOPIC_LABEL, - 'top_confidence' : 0.0, - 'correct_intent' : ground_truth, - 'utterance' : test_example, - 'top_predicts' : [], - 'entities' : []}) + raise Exception("Maximum attempt of {} has reached".format(max_retries)) + + if not prediction_json["intents"]: + responses.append( + { + "top_intent": skills_util.OFFTOPIC_LABEL, + "top_confidence": 0.0, + "correct_intent": ground_truth, + "utterance": test_example, + "top_predicts": [], + "entities": [], + } + ) else: - responses.append({'top_intent' : prediction_json['intents'][0]['intent'], - 'top_confidence' : prediction_json['intents'][0]['confidence'], - 'correct_intent' : ground_truth, - 'utterance' : test_example, - 'top_predicts' : prediction_json['intents'], - 'entities' : prediction_json['entities']}) + responses.append( + { + "top_intent": prediction_json["intents"][0]["intent"], + "top_confidence": prediction_json["intents"][0]["confidence"], + "correct_intent": ground_truth, + "utterance": test_example, + "top_predicts": prediction_json["intents"], + "entities": prediction_json["entities"], + } + ) result_df = pd.DataFrame(data=responses) else: - result_df = thread_inference(conversation, - workspace_id, - test_data, - max_retries, - max_thread, - verbose) + result_df = thread_inference( + conversation, workspace_id, test_data, max_retries, max_thread, verbose + ) return result_df -def thread_inference(conversation, workspace_id, test_data, - max_retries=10, max_thread=5, verbose=False): + +def thread_inference( + conversation, workspace_id, test_data, max_retries=10, max_thread=5, verbose=False +): """ Perform multi thread inference for faster inference time :param conversation: @@ -78,7 +91,7 @@ def thread_inference(conversation, workspace_id, test_data, :return result_df: results dataframe """ if max_thread > 5: - print('only maximum of 5 threads are allowed') + print("only maximum of 5 threads are allowed") thread_list = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"] thread_list = thread_list[:max_thread] @@ -90,7 +103,7 @@ def thread_inference(conversation, workspace_id, test_data, start_time = time.time() for i in range(len(test_data)): - data_point = [test_data['utterance'].iloc[i], test_data['intent'].iloc[i]] + data_point = [test_data["utterance"].iloc[i], test_data["intent"].iloc[i]] query_queue.put(data_point) # Create new threads @@ -103,7 +116,8 @@ def thread_inference(conversation, workspace_id, test_data, workspace_id, result, max_retries, - verbose) + verbose, + ) thread.start() threads.append(thread) thread_id 
+= 1 @@ -118,6 +132,7 @@ def thread_inference(conversation, workspace_id, test_data, result_df = pd.DataFrame(data=result) return result_df + def get_intents_confidences(conversation, workspace_id, text_input): """ Retrieve a list of confidence for analysis purpose @@ -127,10 +142,12 @@ def get_intents_confidences(conversation, workspace_id, text_input): :return intent_conf: intent confidences """ response_info = skills_util.retrieve_classifier_response( - conversation, workspace_id, text_input, True)['intents'] - intent_conf = [(r['intent'], r['confidence']) for r in response_info] + conversation, workspace_id, text_input, True + )["intents"] + intent_conf = [(r["intent"], r["confidence"]) for r in response_info] return intent_conf + def calculate_mistakes(results): """ retrieve the data frame of miss-classified examples @@ -139,12 +156,13 @@ def calculate_mistakes(results): """ wrongs = list() for idx, row in results.iterrows(): - if row['correct_intent'] != row['top_intent']: + if row["correct_intent"] != row["top_intent"]: wrongs.append(row) wrongs_df = pd.DataFrame(data=wrongs) - wrongs_df.index.name = 'Test Example Index' + wrongs_df.index.name = "Test Example Index" return wrongs_df + def calculate_accuracy(results): """ calculate the accuracy on the test set @@ -152,7 +170,7 @@ def calculate_accuracy(results): :return accuracy: get accuracy on test set """ correct = 0 - for i in range(0, len(results['correct_intent'])): - correct += 1 if results['top_intent'][i] == results['correct_intent'][i] else 0 - accuracy = np.around((correct / len(results['correct_intent']))*100, 2) + for i in range(0, len(results["correct_intent"])): + correct += 1 if results["top_intent"][i] == results["correct_intent"][i] else 0 + accuracy = np.around((correct / len(results["correct_intent"])) * 100, 2) return accuracy diff --git a/assistant_dialog_skill_analysis/inferencing/multi_thread_inference.py b/assistant_dialog_skill_analysis/inferencing/multi_thread_inference.py index 457af8f..652c4e3 100644 --- a/assistant_dialog_skill_analysis/inferencing/multi_thread_inference.py +++ b/assistant_dialog_skill_analysis/inferencing/multi_thread_inference.py @@ -4,12 +4,23 @@ from ..utils import skills_util + class InferenceThread(threading.Thread): """ InferenceThread class is used for multi-thread inferencing for faster inference speed """ - def __init__(self, thread_id, name, que, conversation, - workspace_id, result, max_retries=10, verbose=False): + + def __init__( + self, + thread_id, + name, + que, + conversation, + workspace_id, + result, + max_retries=10, + verbose=False, + ): """ Initialize inferencer :param thread_id: @@ -61,39 +72,48 @@ def thread_inference(self): self.conversation, self.workspace_id, query_question, - alternate_intents=True) - time.sleep(.2) - if response['intents']: - top_predicts = response['intents'] - top_intent = response['intents'][0]['intent'] - top_confidence = response['intents'][0]['confidence'] + alternate_intents=True, + ) + time.sleep(0.2) + if response["intents"]: + top_predicts = response["intents"] + top_intent = response["intents"][0]["intent"] + top_confidence = response["intents"][0]["confidence"] else: top_predicts = [] top_intent = skills_util.OFFTOPIC_LABEL top_confidence = 0 - if response['entities']: - entities = response['entities'] + if response["entities"]: + entities = response["entities"] else: entities = [] - new_dict = {'utterance': query_question, - 'correct_intent': query_data[1], - 'top_intent': top_intent, - 'top_confidence': top_confidence, - 
'top_predicts': top_predicts, - 'entities':entities} + new_dict = { + "utterance": query_question, + "correct_intent": query_data[1], + "top_intent": top_intent, + "top_confidence": top_confidence, + "top_predicts": top_predicts, + "entities": entities, + } self.result.append(new_dict) success_flag = True except Exception: if self.verbose: - print("{} process {} fail attempt {}" - .format(self.name, query_question, i)) - time.sleep(.2) + print( + "{} process {} fail attempt {}".format( + self.name, query_question, i + ) + ) + time.sleep(0.2) if attempt >= self.max_retries: - print('Maximum attempt of {} has reached for query {}' - .format(self.max_retries, query_question)) + print( + "Maximum attempt of {} has reached for query {}".format( + self.max_retries, query_question + ) + ) _thread.interrupt_main() self.exit() else: diff --git a/assistant_dialog_skill_analysis/term_analysis/chi2_analyzer.py b/assistant_dialog_skill_analysis/term_analysis/chi2_analyzer.py index 06d0fa1..adb22f6 100644 --- a/assistant_dialog_skill_analysis/term_analysis/chi2_analyzer.py +++ b/assistant_dialog_skill_analysis/term_analysis/chi2_analyzer.py @@ -15,12 +15,13 @@ def strip_punctuations(utterance: str): :param utterance: :return: """ - normalization_pattern = '\'s' - utterance = re.sub(normalization_pattern, ' is', utterance) - puncuation_pattern = '|'.join(skills_util.PUNCTUATION) - utterance = re.sub(puncuation_pattern, ' ', utterance) + normalization_pattern = "'s" + utterance = re.sub(normalization_pattern, " is", utterance) + puncuation_pattern = "|".join(skills_util.PUNCTUATION) + utterance = re.sub(puncuation_pattern, " ", utterance) return utterance + def _preprocess_chi2(workspace_pd): """ Preprocess dataframe for chi2 analysis @@ -31,20 +32,28 @@ def _preprocess_chi2(workspace_pd): """ stopword_list = skills_util.STOP_WORDS - workspace_pd['utterance_punc_stripped'] = \ - workspace_pd['utterance'].apply(strip_punctuations) + workspace_pd["utterance_punc_stripped"] = workspace_pd["utterance"].apply( + strip_punctuations + ) count_vectorizer = CountVectorizer( min_df=1, - encoding='utf-8', + encoding="utf-8", ngram_range=(1, 2), stop_words=stopword_list, - tokenizer=word_tokenize) - features = count_vectorizer.fit_transform(workspace_pd['utterance_punc_stripped']).toarray() - labels = workspace_pd['intent'] + tokenizer=word_tokenize, + token_pattern="(?u)\b\w+\b", + ) + features = count_vectorizer.fit_transform( + workspace_pd["utterance_punc_stripped"] + ).toarray() + labels = workspace_pd["intent"] return labels, count_vectorizer, features -def _compute_chi2_top_feature(features, labels, vectorizer, cls, significance_level=.05): + +def _compute_chi2_top_feature( + features, labels, vectorizer, cls, significance_level=0.05 +): """ Perform chi2 analysis, punctuation filtering and deduplication :param features: count vectorizer features @@ -81,7 +90,8 @@ def _compute_chi2_top_feature(features, labels, vectorizer, cls, significance_le return deduplicated_unigram, deduplicated_bigram -def get_chi2_analysis(workspace_pd, significance_level=.05): + +def get_chi2_analysis(workspace_pd, significance_level=0.05): """ find correlated unigram and bigram of each intent with Chi2 analysis :param workspace_pd: dataframe, workspace data @@ -91,7 +101,7 @@ def get_chi2_analysis(workspace_pd, significance_level=.05): """ labels, vectorizer, features = _preprocess_chi2(workspace_pd) - label_frequency_dict = dict(Counter(workspace_pd['intent']).most_common()) + label_frequency_dict = 
dict(Counter(workspace_pd["intent"]).most_common()) N = 5 # keys are the set of unigrams/bigrams and value will be the intent @@ -106,22 +116,19 @@ def get_chi2_analysis(workspace_pd, significance_level=.05): for cls in label_frequency_dict.keys(): unigrams, bigrams = _compute_chi2_top_feature( - features, - labels, - vectorizer, - cls, - significance_level) + features, labels, vectorizer, cls, significance_level + ) classes.append(cls) if unigrams: - chi_unigrams.append(', '.join(unigrams[-N:])) + chi_unigrams.append(", ".join(unigrams[-N:])) else: - chi_unigrams.append('None') + chi_unigrams.append("None") if bigrams: - chi_bigrams.append(', '.join(bigrams[-N:])) + chi_bigrams.append(", ".join(bigrams[-N:])) else: - chi_bigrams.append('None') + chi_bigrams.append("None") if unigrams: if frozenset(unigrams[-N:]) in unigram_intent_dict: @@ -137,17 +144,24 @@ def get_chi2_analysis(workspace_pd, significance_level=.05): bigram_intent_dict[frozenset(bigrams[-N:])] = list() bigram_intent_dict[frozenset(bigrams[-N:])].append(cls) - chi_df = pd.DataFrame(data={'Intent': classes}) - chi_df['Correlated Unigrams'] = chi_unigrams - chi_df['Correlated Bigrams'] = chi_bigrams + chi_df = pd.DataFrame(data={"Intent": classes}) + chi_df["Correlated Unigrams"] = chi_unigrams + chi_df["Correlated Bigrams"] = chi_bigrams display(Markdown(("## Chi-squared Analysis"))) - with pd.option_context('display.max_rows', None, 'display.max_columns', - None, 'display.max_colwidth', 100): + with pd.option_context( + "display.max_rows", + None, + "display.max_columns", + None, + "display.max_colwidth", + 100, + ): chi_df.index = np.arange(1, len(chi_df) + 1) display(chi_df) return unigram_intent_dict, bigram_intent_dict + def get_confusing_key_terms(keyterm_intent_map): """ Greedy search for overlapping intents @@ -173,14 +187,23 @@ def get_confusing_key_terms(keyterm_intent_map): overlap = correlated_unigrams.intersection(other_correlated_unigrams) if overlap: for keyword in overlap: - ambiguous_intents.append("<" + current_label[0] + ", " + - keyterm_intent_map[other_correlated_unigrams][0] + ">") + ambiguous_intents.append( + "<" + + current_label[0] + + ", " + + keyterm_intent_map[other_correlated_unigrams][0] + + ">" + ) ambiguous_keywords.append(keyword) - df = pd.DataFrame(data={'Intent Pairs': ambiguous_intents, 'Terms': ambiguous_keywords}) + df = pd.DataFrame( + data={"Intent Pairs": ambiguous_intents, "Terms": ambiguous_keywords} + ) if not ambiguous_intents: - display(Markdown("There is no ambiguity based on top 5 key terms in chi2 analysis")) + display( + Markdown("There is no ambiguity based on top 5 key terms in chi2 analysis") + ) else: display_size = 10 if not df.empty: @@ -190,6 +213,7 @@ def get_confusing_key_terms(keyterm_intent_map): return df + def chi2_overlap_check(ambiguous_unigram_df, ambiguous_bigram_df, intent1, intent2): """ looks for intent overlap for specific intent or intent pairs @@ -202,10 +226,14 @@ def chi2_overlap_check(ambiguous_unigram_df, ambiguous_bigram_df, intent1, inten part1 = None part2 = None if not ambiguous_unigram_df.empty: - part1 = ambiguous_unigram_df[ambiguous_unigram_df['Intent Pairs'].str.contains(intent)] + part1 = ambiguous_unigram_df[ + ambiguous_unigram_df["Intent Pairs"].str.contains(intent) + ] if not ambiguous_bigram_df.empty: - part2 = ambiguous_bigram_df[ambiguous_bigram_df['Intent Pairs'].str.contains(intent)] + part2 = ambiguous_bigram_df[ + ambiguous_bigram_df["Intent Pairs"].str.contains(intent) + ] if part1 is not None and part2 is not None: 
display(HTML(pd.concat([part1, part2]).to_html(index=False))) diff --git a/assistant_dialog_skill_analysis/term_analysis/entity_analyzer.py b/assistant_dialog_skill_analysis/term_analysis/entity_analyzer.py index 4ddcc07..8d65e22 100644 --- a/assistant_dialog_skill_analysis/term_analysis/entity_analyzer.py +++ b/assistant_dialog_skill_analysis/term_analysis/entity_analyzer.py @@ -4,6 +4,7 @@ N = 5 + def _derive_entity_label_matrix(train_full_results, entities): """ Derive entity feature matrix for chi2 anaylsis using entity annotations from message api @@ -20,12 +21,12 @@ def _derive_entity_label_matrix(train_full_results, entities): entity_average_confidence_dict = dict() for i in range(len(train_full_results)): current_result = train_full_results.iloc[i] - if current_result['entities']: + if current_result["entities"]: # create empty feature vector - current_feature = [0]*len(entities) - for entity_reference in current_result['entities']: - e_ref = entity_reference['entity'] - e_conf = entity_reference['confidence'] + current_feature = [0] * len(entities) + for entity_reference in current_result["entities"]: + e_ref = entity_reference["entity"] + e_conf = entity_reference["confidence"] entity_idx = entities.index(e_ref) current_feature[entity_idx] += 1 @@ -33,16 +34,19 @@ def _derive_entity_label_matrix(train_full_results, entities): entity_count_dict[e_ref] = entity_count_dict.get(e_ref, 0) + 1 entity_feature_matrix.append(current_feature) - labels.append(current_result['correct_intent']) + labels.append(current_result["correct_intent"]) entity_feature_matrix = np.array(entity_feature_matrix) labels = np.array(labels) for key in entity_conf_dict: - entity_average_confidence_dict[key] = entity_conf_dict[key]/entity_count_dict[key] + entity_average_confidence_dict[key] = ( + entity_conf_dict[key] / entity_count_dict[key] + ) return entity_feature_matrix, labels, entity_average_confidence_dict -def entity_label_correlation_analysis(train_full_results, entities_list, p_value=.05): + +def entity_label_correlation_analysis(train_full_results, entities_list, p_value=0.05): """ Apply chi2 analysis on entities of the training set :param train_full_results: pandas data frame output by inference @@ -50,9 +54,11 @@ def entity_label_correlation_analysis(train_full_results, entities_list, p_value :param p_value: threshold for chi2 analysis :return entity_label_df: pandas df with col 1 being intents and col 2 entities """ - entity_feature_matrix, labels, entity_average_confidence_dict = _derive_entity_label_matrix( - train_full_results, - entities_list) + ( + entity_feature_matrix, + labels, + entity_average_confidence_dict, + ) = _derive_entity_label_matrix(train_full_results, entities_list) entities_list = np.array(entities_list) unique_labels = list(set(labels)) final_labels = list() @@ -67,8 +73,10 @@ def entity_label_correlation_analysis(train_full_results, entities_list, p_value continue final_labels.append(label) - final_entities.append(', '.join(ordered_entities[-N:])) + final_entities.append(", ".join(ordered_entities[-N:])) - entity_label_df = pd.DataFrame({'Intent': final_labels, 'Correlated Entities': final_entities}) + entity_label_df = pd.DataFrame( + {"Intent": final_labels, "Correlated Entities": final_entities} + ) return entity_label_df diff --git a/assistant_dialog_skill_analysis/term_analysis/keyword_analyzer.py b/assistant_dialog_skill_analysis/term_analysis/keyword_analyzer.py index a102f46..6692ec8 100644 --- a/assistant_dialog_skill_analysis/term_analysis/keyword_analyzer.py 
+++ b/assistant_dialog_skill_analysis/term_analysis/keyword_analyzer.py @@ -7,92 +7,115 @@ import nltk from ..utils import skills_util -def _preprocess_for_heat_map(workspace_df, label_for_display=30, - max_token_display=30, class_list=None): - ''' + +def _preprocess_for_heat_map( + workspace_df, label_for_display=30, max_token_display=30, class_list=None +): + """ Preprocess dataframe for heat map visualization :param workspace_df: :param label_for_display: :param max_token_display: :param class_list: - ''' - label_frequency_dict = dict(Counter(workspace_df['intent']).most_common()) + """ + label_frequency_dict = dict(Counter(workspace_df["intent"]).most_common()) if class_list: - workspace_subsampled = workspace_df[workspace_df['intent'].isin(class_list)] - counts = _get_counts_per_label(workspace_subsampled, unigrams_col_name="unigrams") + workspace_subsampled = workspace_df[workspace_df["intent"].isin(class_list)] + counts = _get_counts_per_label( + workspace_subsampled, unigrams_col_name="unigrams" + ) else: if len(label_frequency_dict) > label_for_display: top_30_labels = list(label_frequency_dict.keys())[:label_for_display] - workspace_subsampled = workspace_df[workspace_df['intent'].isin(top_30_labels)] - counts = _get_counts_per_label(workspace_subsampled, unigrams_col_name="unigrams") + workspace_subsampled = workspace_df[ + workspace_df["intent"].isin(top_30_labels) + ] + counts = _get_counts_per_label( + workspace_subsampled, unigrams_col_name="unigrams" + ) else: counts = _get_counts_per_label(workspace_df, unigrams_col_name="unigrams") - max_n = np.int(np.ceil(max_token_display / len(counts.index.get_level_values(0).unique()))) - top_counts = _get_top_n(counts['n_w'], top_n=max_n) + max_n = np.int( + np.ceil(max_token_display / len(counts.index.get_level_values(0).unique())) + ) + top_counts = _get_top_n(counts["n_w"], top_n=max_n) return counts, top_counts + def _get_counts_per_label(training_data, unigrams_col_name="unigrams"): - ''' + """ Create a new dataframe to store unigram counts for each label :param training_data: pandas df :param unigrams_col_name: name of unigrams column name :return counts: dataframe that contains the counts for all unigrams per label - ''' - training_data[unigrams_col_name] = training_data['utterance'].apply(nltk.word_tokenize) + """ + training_data[unigrams_col_name] = training_data["utterance"].apply( + nltk.word_tokenize + ) rows = list() stopword_list = skills_util.STOP_WORDS - for row in training_data[['intent', unigrams_col_name]].iterrows(): + for row in training_data[["intent", unigrams_col_name]].iterrows(): r = row[1] for word in r.unigrams: rows.append((r.intent, word)) - words = pd.DataFrame(rows, columns=['intent', 'word']) + words = pd.DataFrame(rows, columns=["intent", "word"]) # delete all empty words and chars words = words[words.word.str.len() > 1] # delete stopwords words = words.loc[~words["word"].isin(stopword_list)] # get counts per word - counts = words.groupby('intent')\ - .word.value_counts()\ - .to_frame()\ - .rename(columns={'word':'n_w'}) + counts = ( + words.groupby("intent") + .word.value_counts() + .to_frame() + .rename(columns={"word": "n_w"}) + ) return counts + def _get_top_n(series, top_n=5, index_level=0): - ''' + """ Get most frequent words per label :param series: product of a call to get_counts_per_label :param top_n: integer signifying the number of most frequent tokens per class :param index_level: index to group by :return df: dataframe that contains the top_n unigrams per label - ''' - return series\ - 
.groupby(level=index_level)\ - .nlargest(top_n)\ - .reset_index(level=index_level, drop=True) + """ + return ( + series.groupby(level=index_level) + .nlargest(top_n) + .reset_index(level=index_level, drop=True) + ) + -def seaborn_heatmap(workspace_df, label_for_display=30, max_token_display=30, class_list=None): - ''' +def seaborn_heatmap( + workspace_df, label_for_display=30, max_token_display=30, class_list=None +): + """ Create heat map of word frequencies per intent :param workspace_df: :param label_for_display: :param max_token_display: :param class_list: - ''' + """ counts, top_counts = _preprocess_for_heat_map( - workspace_df, - label_for_display, - max_token_display, - class_list) + workspace_df, label_for_display, max_token_display, class_list + ) reset_groupby = counts.reset_index() - most_frequent_words = top_counts.reset_index()['word'].unique() + most_frequent_words = top_counts.reset_index()["word"].unique() table_format = reset_groupby.pivot(index="word", columns="intent", values="n_w") - table_format = table_format[ - table_format.index.isin(most_frequent_words)].fillna(0).astype("int32") - display(Markdown('##
Token Frequency per Intent
')) + table_format = ( + table_format[table_format.index.isin(most_frequent_words)] + .fillna(0) + .astype("int32") + ) + display( + Markdown('##
Token Frequency per Intent
') + ) fig, ax = plt.subplots(figsize=(20, 20)) - sns.heatmap(table_format, annot=True, fmt='d', linewidths=.1, cmap="PuBu", ax=ax) - plt.ylabel('Token', fontdict=skills_util.LABEL_FONT) - plt.xlabel('Intent', fontdict=skills_util.LABEL_FONT) + sns.heatmap(table_format, annot=True, fmt="d", linewidths=0.1, cmap="PuBu", ax=ax) + plt.ylabel("Token", fontdict=skills_util.LABEL_FONT) + plt.xlabel("Intent", fontdict=skills_util.LABEL_FONT) diff --git a/assistant_dialog_skill_analysis/utils/skills_util.py b/assistant_dialog_skill_analysis/utils/skills_util.py index a4a2ddb..c993241 100644 --- a/assistant_dialog_skill_analysis/utils/skills_util.py +++ b/assistant_dialog_skill_analysis/utils/skills_util.py @@ -10,35 +10,68 @@ from nbconvert.preprocessors import ExecutePreprocessor import nltk import ibm_watson -from ibm_cloud_sdk_core.authenticators import \ - IAMAuthenticator, BasicAuthenticator, NoAuthAuthenticator +from ibm_cloud_sdk_core.authenticators import ( + IAMAuthenticator, + BasicAuthenticator, + NoAuthAuthenticator, +) -DEFAULT_API_VERSION = '2019-02-28' -DEFAULT_PROD_URL = 'https://gateway.watsonplatform.net/assistant/api' -DEFAULT_USERNAME = 'apikey' -STAGE_IAM_URL = 'https://iam.stage1.bluemix.net/identity/token' +DEFAULT_API_VERSION = "2019-02-28" +DEFAULT_PROD_URL = "https://gateway.watsonplatform.net/assistant/api" +DEFAULT_USERNAME = "apikey" +STAGE_IAM_URL = "https://iam.stage1.bluemix.net/identity/token" -OFFTOPIC_LABEL = 'SYSTEM_OUT_OF_DOMAIN' +OFFTOPIC_LABEL = "SYSTEM_OUT_OF_DOMAIN" -LABEL_FONT = {'family': 'normal', - 'weight': 'bold', - 'size': 17} +LABEL_FONT = {"family": "normal", "weight": "bold", "size": 17} -TITLE_FONT = {'family': 'normal', - 'weight': 'bold', - 'size': 25} +TITLE_FONT = {"family": "normal", "weight": "bold", "size": 25} -PUNCTUATION = [";", ":", ",", "\.", "\"", "\'", - "\?", "\(", "\)", "!", "?", "!", - ";", ":", "。", "、", "《", "》", - ",", "¿", "¡", "؟", "،"] +PUNCTUATION = [ + ";", + ":", + ",", + "\.", + '"', + "'", + "\?", + "\(", + "\)", + "!", + "?", + "!", + ";", + ":", + "。", + "、", + "《", + "》", + ",", + "¿", + "¡", + "؟", + "،", +] -STOP_WORDS = ['an', 'a', 'in', 'on', 'be', 'or', 'of', - 'and', 'can', 'is', 'to', 'the', 'i'] +STOP_WORDS = [ + "an", + "a", + "in", + "on", + "be", + "or", + "of", + "and", + "can", + "is", + "to", + "the", + "i", +] -def stratified_sampling(workspace, sampling_percentage=.8): - ''' +def stratified_sampling(workspace, sampling_percentage=0.8): + """ Create a stratified sample of the workspace json & return a intent json acceptable in Assistant API @@ -46,64 +79,74 @@ def stratified_sampling(workspace, sampling_percentage=.8): :param sampling_percentage: percentage of original to sample :return train_workspace_data: list of intents for train :return test_workspace_data: list of utterance,intent pairs for test - ''' + """ train_workspace_data = list() test_workspace_data = list() - for i in range(len(workspace['intents'])): - intent = workspace['intents'][i] - sampling_index = list(np.arange(len(intent['examples']))) + for i in range(len(workspace["intents"])): + intent = workspace["intents"][i] + sampling_index = list(np.arange(len(intent["examples"]))) random.shuffle(sampling_index) # training set train_test_split_cutoff = int(sampling_percentage * len(sampling_index)) train_examples = [ - intent['examples'][index] for index in sampling_index[:train_test_split_cutoff] - ] - train_workspace_data.append({'intent': workspace['intents'][i]['intent']}) + intent["examples"][index] + for index in 
sampling_index[:train_test_split_cutoff] + ] + train_workspace_data.append({"intent": workspace["intents"][i]["intent"]}) train_workspace_data[i].update({"description": "string"}) train_workspace_data[i].update({"examples": train_examples}) # test set test_examples = [ - intent['examples'][index] for index in sampling_index[train_test_split_cutoff:] + intent["examples"][index] + for index in sampling_index[train_test_split_cutoff:] + ] + test_workspace_data.extend( + [ + utterances["text"] + "\t" + workspace["intents"][i]["intent"] + for utterances in test_examples ] - test_workspace_data.extend([ - utterances['text'] + '\t' + - workspace['intents'][i]['intent'] for utterances in test_examples - ]) + ) return train_workspace_data, test_workspace_data + def create_workspace(conversation, intent_json=None): - ''' + """ Create a workspace for testing purpose :param conversation: conversation object created by Watson Assistant api :param intent_json: nested json of utternance and intent pairs :return response: the workspace id and other metadata related to the new workspace - ''' + """ response = conversation.create_workspace( - name='test_workspace', - description='', - language='en', + name="test_workspace", + description="", + language="en", intents=intent_json, entities=[], counterexamples=[], - metadata={}).get_result() + metadata={}, + ).get_result() return response + def input_credentials(): - ''' + """ Prompt user to enter apikey and workspace id - ''' + """ apikey = getpass.getpass("Please enter apikey: ") workspace_id = getpass.getpass("Please enter workspace-id: ") return apikey, workspace_id -def retrieve_workspace(iam_apikey=None, - workspace_id=None, - url=DEFAULT_PROD_URL, - api_version=DEFAULT_API_VERSION, - username=DEFAULT_USERNAME, - password=None, - export_flag=True): + +def retrieve_workspace( + iam_apikey=None, + workspace_id=None, + url=DEFAULT_PROD_URL, + api_version=DEFAULT_API_VERSION, + username=DEFAULT_USERNAME, + password=None, + export_flag=True, +): """ Retrieve workspace from Assistant instance :param iam_apikey: @@ -123,8 +166,9 @@ def retrieve_workspace(iam_apikey=None, else: authenticator = NoAuthAuthenticator() - conversation = ibm_watson.AssistantV1(authenticator=authenticator, - version=api_version) + conversation = ibm_watson.AssistantV1( + authenticator=authenticator, version=api_version + ) conversation.set_service_url(url) if export_flag: @@ -134,29 +178,30 @@ def retrieve_workspace(iam_apikey=None, def extract_workspace_data(workspace): - ''' + """ Extract relevant data and vocabulary :param workspace: :return relevant_data: :return vocabulary: - ''' - relevant_data = {'utterance': list(), 'intent': list()} + """ + relevant_data = {"utterance": list(), "intent": list()} vocabulary = set() - for i in range(len(workspace['intents'])): - current_intent = workspace['intents'][i]['intent'] - for j in range(len(workspace['intents'][i]['examples'])): - current_example = workspace['intents'][i]['examples'][j]['text'] - relevant_data['utterance'].append(current_example) - relevant_data['intent'].append(current_intent) + for i in range(len(workspace["intents"])): + current_intent = workspace["intents"][i]["intent"] + for j in range(len(workspace["intents"][i]["examples"])): + current_example = workspace["intents"][i]["examples"][j]["text"] + relevant_data["utterance"].append(current_example) + relevant_data["intent"].append(current_intent) vocabulary.update(nltk.word_tokenize(current_example)) return relevant_data, vocabulary -def 
process_test_set(test_set_filename, delim='\t'): - ''' + +def process_test_set(test_set_filename, delim="\t"): + """ Process test set given the path to the test file :param test_set_filename: link to the test file in tsv format :return test_df: test set stored in pandas dataframe - ''' + """ user_inputs = list() intents = list() with open(test_set_filename, "r", encoding="utf-8") as ts: @@ -170,9 +215,10 @@ def process_test_set(test_set_filename, delim='\t'): intents.append(OFFTOPIC_LABEL) else: continue - test_df = pd.DataFrame(data={'utterance':user_inputs, 'intent':intents}) + test_df = pd.DataFrame(data={"utterance": user_inputs, "intent": intents}) return test_df + def export_workspace(conversation, experiment_workspace_id, export_path): """ Export the workspace to target path @@ -181,23 +227,26 @@ def export_workspace(conversation, experiment_workspace_id, export_path): :param export_path: the path where the exported workspace will be saved """ response = conversation.get_workspace( - workspace_id=experiment_workspace_id, export=True).get_result() - with open(export_path, 'w+', encoding='utf-8') as outfile: + workspace_id=experiment_workspace_id, export=True + ).get_result() + with open(export_path, "w+", encoding="utf-8") as outfile: json.dump(response, outfile) + def load_stopword_list(path): - ''' + """ :param path: path to stopwords list :return stopword_list: - ''' + """ stopword_list = list() - with open(path, 'r', encoding='utf-8') as filehandle: + with open(path, "r", encoding="utf-8") as filehandle: for line in filehandle: stopword_list.append(line.strip()) return stopword_list + def run_notebook(notebook_path, iam_apikey, wksp_id, test_file, output_path): - ''' + """ Run notebook for end to end test :param notebook_path: :param uname: @@ -205,74 +254,86 @@ def run_notebook(notebook_path, iam_apikey, wksp_id, test_file, output_path): :param wksp_id: :param test_file: :param output_path: - ''' + """ notebook_name, _ = os.path.splitext(os.path.basename(notebook_path)) with open(notebook_path) as f: nb = nbformat.read(f, as_version=4) nb, old_cred_text = _replace_nb_input(nb, iam_apikey, wksp_id, test_file) - #nb = _remove_experimentation(nb) + # nb = _remove_experimentation(nb) - proc = ExecutePreprocessor(timeout=60 * 60, kernel_name='python3') + proc = ExecutePreprocessor(timeout=60 * 60, kernel_name="python3") proc.allow_errors = True - proc.preprocess(nb, {'metadata': {'path': os.getcwd()}}) + proc.preprocess(nb, {"metadata": {"path": os.getcwd()}}) errors = [] for cell in nb.cells: - if 'outputs' in cell: - for output in cell['outputs']: - if output.output_type == 'error': + if "outputs" in cell: + for output in cell["outputs"]: + if output.output_type == "error": errors.append(output) - if 'source' in cell and 'iam_apikey = ' in cell['source']: - cell['source'] = old_cred_text + if "source" in cell and "iam_apikey = " in cell["source"]: + cell["source"] = old_cred_text - with open(output_path + '.ipynb', mode='wt') as f: + with open(output_path + ".ipynb", mode="wt") as f: nbformat.write(nb, f) return nb, errors + def _replace_nb_input(nb, apikey, wksp_id, test_file): - ''' + """ Replace notebook interactive input for tests :param nb: :param uname: :param pwd: :param wksp_id: :param test_file: - ''' - apikey_patt = 'iam_apikey = ' - wksp_id_patt = 'workspace_id = ' - test_file_name_patt = 'test_set_path = ' - old_cred_text = '' + """ + apikey_patt = "iam_apikey = " + wksp_id_patt = "workspace_id = " + test_file_name_patt = "test_set_path = " + old_cred_text = "" for cell 
in nb.cells: - if 'source' in cell and apikey_patt in cell['source']: - old_cred_text = cell['source'] - text = re.sub('(.*)\niam_apikey, (.*)', - (r'\1\n#iam_apikey, \2'), - cell['source']) # comment out input_credentials - - text = re.sub('(.*)#' + apikey_patt + '\'###\'(.*)', - r'\1' + apikey_patt + '\'' + apikey + '\'' + r'\2', - text) # replace pwd - text = re.sub('(.*)#' + wksp_id_patt + '\'###\'(.*)', - r'\1' + wksp_id_patt + '\'' + wksp_id + '\'' + r'\2', - text) # replace wksp_id - cell['source'] = text - elif 'source' in cell and test_file_name_patt in cell['source']: - text = re.sub('(.*)\n' + test_file_name_patt + '\'test_set.tsv\'(.*)', - r'\1\n' + test_file_name_patt + '\'' + test_file + '\'' + r'\2', - cell['source']) # replace test file - cell['source'] = text + if "source" in cell and apikey_patt in cell["source"]: + old_cred_text = cell["source"] + text = re.sub( + "(.*)\niam_apikey, (.*)", (r"\1\n#iam_apikey, \2"), cell["source"] + ) # comment out input_credentials + + text = re.sub( + "(.*)#" + apikey_patt + "'###'(.*)", + r"\1" + apikey_patt + "'" + apikey + "'" + r"\2", + text, + ) # replace pwd + text = re.sub( + "(.*)#" + wksp_id_patt + "'###'(.*)", + r"\1" + wksp_id_patt + "'" + wksp_id + "'" + r"\2", + text, + ) # replace wksp_id + cell["source"] = text + elif "source" in cell and test_file_name_patt in cell["source"]: + text = re.sub( + "(.*)\n" + test_file_name_patt + "'test_set.tsv'(.*)", + r"\1\n" + test_file_name_patt + "'" + test_file + "'" + r"\2", + cell["source"], + ) # replace test file + cell["source"] = text return nb, old_cred_text + def _remove_experimentation(nb): - ''' + """ Remove the experimentation session from end-to-end test :param nb: - ''' - exp_patt = 'Part 3: Experimentation' + """ + exp_patt = "Part 3: Experimentation" new_nb_cells = [] for cell in nb.cells: - if cell.cell_type == 'markdown' and 'source' in cell and exp_patt in cell['source']: + if ( + cell.cell_type == "markdown" + and "source" in cell + and exp_patt in cell["source"] + ): break else: new_nb_cells.append(cell) @@ -280,7 +341,9 @@ def _remove_experimentation(nb): return nb -def retrieve_classifier_response(conversation, workspace_id, text_input, alternate_intents=False): +def retrieve_classifier_response( + conversation, workspace_id, text_input, alternate_intents=False +): """ retrieve classifier response :param conversation: instance @@ -290,11 +353,8 @@ def retrieve_classifier_response(conversation, workspace_id, text_input, alterna :return response: """ response = conversation.message( - input={ - 'message_type': 'text', - 'text': text_input - }, + input={"message_type": "text", "text": text_input}, workspace_id=workspace_id, alternate_intents=alternate_intents, ).get_result() - return response \ No newline at end of file + return response diff --git a/skill_analysis.ipynb b/skill_analysis.ipynb index 392476d..bf0b26f 100644 --- a/skill_analysis.ipynb +++ b/skill_analysis.ipynb @@ -1,50 +1,37 @@ { "cells": [ { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from IPython.display import Markdown, display, HTML\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" + "# Dialog skill analysis for Watson Assistant\n", + "\n", + "## Introduction\n", + "Dialog Skill Analysis for Watson Assistant (WA) is intended for use by chatbot designers, developers and data scientists who would like to experiment with and improve their existing dialog skill design. 
\n", + "\n", + "This notebook assumes familiarity with the Watson Assistant product as well as concepts involved in dialog skill design such as intent, entities, and utterances. \n", + "\n", + "### Environment\n", + "- Python version 3.6 or above is required. \n", + "- Install dependencies with `pip install -r requirements.txt` and refer to `requirements.txt`\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Dialog Skill Analysis for Watson Assistant\n", - "\n", - "## Introduction\n", - "Dialog Skill Analysis for Watson Assistant (WA) is intended for use by chatbot designers, developers & data scientists who would like to experiment with and improve on their existing dialog skill design. \n", - "\n", - "We assume familiarity with the Watson Assistant product as well as concepts involved in dialog skill design like intent, entities, utterances etc. \n", - "\n", - "This notebook has been organized into 3 parts based on complexity and expected input from the user.\n", - "\n", - "**Part 1**: Training Data Analysis\n", - "- Analyzes the Watson Assistant dialog skill json already created by the user\n", - "- Requires the user provide access credentials to an existing WA dialog skill like api_key, workspace_id \n", - "\n", - "**Part 2**: Model Analysis\n", - "- Evaluates the dialog skill against a test set provided by the user\n", - "- Requires the user provide a test set for model analysis\n", - "\n", - "**Part 3**: Advanced Analysis\n", - "- Analysis related to confidence threshold, term importance etc.\n", - "- Requires the user provide a test set for model analysis (same data as part 2)\n", - "\n", - "### Usage\n", - "1. Assumes familiarity using a Python Jupyter notebook\n", - "2. Assumes a Python 3.6 or greater environment\n", - "3. Install dependencies with `pip install -r requirements.txt` \n", - "4. Start jupyter server with `jupyter notebook`\n", - "5. Select `skill_analysis.ipynb` to start session\n", - "\n", - "### Alphabetic Contributor List\n", - "Watson Assistant Algorithms: Haode Qi, Ladislav Kunc, Ming Tan, Navneet Rao, Panos Karagiannis, Yang Yu" + "Install all the required packages and filter out any warnings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Markdown, display, HTML\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" ] }, { @@ -85,17 +72,36 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Part 1 : Training Data Analysis" + "## Table of contents\n", + "\n", + "1. [Part 1: Prepare the training data](#part1)
\n", + "2. [Part 2: Prepare the test data](#part2)
\n", + "3. [Part 3: Perform advanced analysis](#part3)
\n", + "4. [Part 4: Summary](#part4)
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Part 1: Prepare the training data\n", + "1.1 [Set up access to the training data](#part1)
\n", + "1.2 [Process Dialog Skill Training Data](#part1.2)
\n", + "1.3 [Analyze data distribution](#part1.3)
\n", + "1.4 [Perform a correlation analysis](#part1.4)
\n", + "1.5 [Visualize terms using a heat map](#part1.5)
\n", + "1.6 [Ambiguity in the training data](#part1.6)
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup: Access Training Data\n", + "\n", + "## 1.1 Set up access to the training data\n", "\n", - "Please provide access credentials for an existing dialog skill that you would like to analyze. \n", - "Have your API Key & Workspace ID values handy" + "Provide access credentials for an existing dialog skill that you would like to analyze. For this you need your API Key and Workspace ID values." ] }, { @@ -174,9 +180,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 1.1 Process Dialog Skill Training Data\n", + "\n", + "## 1.2 Process the dialog skill training data\n", "\n", - "We generate summary statistics related to the given skill & workspace" + "Generate summary statistics related to the given skill and workspace." ] }, { @@ -193,17 +200,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "\n", + "## 1.3 Analyze the data distribution\n", "\n", - "## 1.2 Data Distribution Analysis" + "- [Analyze class imbalance](#imbalance)\n", + "- [List the distribution of user examples by intent](#distribution)\n", + "- [Actions for class imbalance](#actionimbalance)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Class Imbalance Analysis\n", + "### Analyze class imbalance\n", "\n", - "We analyze whether the dataset contains class imbalance by checking whether the largest intent contains less than double the number of user examples contained in the smallest intent. Presense of imbalance does not necessarily indicate an issue, please review the actions section below" + "Analyze whether the data set contains class imbalance by checking whether the largest intent contains less than double the number of user examples contained in the smallest intent. If there is an imbalance it does not necessarily indicate an issue; but you should review the [actions](#actionimbalance) section below." ] }, { @@ -220,8 +231,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Distribution of User Examples by Intent\n", - "We display the distribution of intents vs the number of examples per intent (sorted by the number of examples per intent) below. Ideally we should not have large variations in terms of number of user examples for various intents. " + "### List the distribution of user examples by intent\n", + "Display the distribution of intents versus the number of examples per intent (sorted by the number of examples per intent) below. Ideally you should not have large variations in terms of number of user examples for various intents. " ] }, { @@ -248,47 +259,57 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Actions for Class Imbalance\n", + "### Actions for class imbalance\n", + "\n", + "Class imbalance will not always lead to lower accuracy, which means that all intents (classes) do not need to have the same number of examples.\n", "\n", - "Class imbalance will not always lead to lower accuracy. All intents (classes) thus need not have the same number of examples.\n", + "Given a hypothetical chatbot related to banking:
\n", "\n", - "1. For intents like `updateBankAccount` and `addNewAccountHolder` where the semantics difference between them is more subtle, the number of examples per intent needs to be somewhat balanced else the classifier might favor the intent with the higher number of examples.\n", - "2. For intents like `greetings` that are semantically distinct from other intents like `updateBankAccount`, it may be okay for it to have fewer examples per intent and still be easy for the intent detector to classify.\n", + "- For intents like `updateBankAccount` and `addNewAccountHolder` where the semantics difference between them is subtler, the number of examples per intent needs to be somewhat balanced otherwise the classifier might favor the intent with the higher number of examples.\n", + "- For intents like `greetings` that are semantically distinct from other intents like `updateBankAccount`, it may be acceptable for it to have fewer examples per intent and still be easy for the intent detector to classify.\n", "\n", - "If during testing it seems like intent classification accuracy is lower than expected, we advise you to re-examine this distribution analysis. \n", "\n", - "With regard to sorted distribution of examples per intent, if the sorted number of user examples varies a lot across different intents, it can be a potential source of bias for intent detection. Large imbalances in general should be avoided. This can potentially lead to lower accuracy. If your graph displays this characteristic, this might be a source of error.\n", "\n", - "For further guidance on adding more examples to help balance out your distribution, please refer to \n", - "https://cloud.ibm.com/docs/services/assistant?topic=assistant-intent-recommendations#intent-recommendations-get-example-recommendations" + "If the intent classification accuracy is lower than expected during testing, you should re-examine the distribution analysis. \n", + "\n", + "With regard to sorted distribution of examples per intent, if the sorted number of user examples varies a lot across different intents, it can be a potential source of bias for intent detection. Large imbalances in general should be avoided. This can potentially lead to lower accuracy. If your graph displays this characteristic, this could be a source of error.\n", + "\n", + "For further guidance on adding more examples to help balance out your distribution, refer to \n", + "Intent Example Recommendation." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 1.3 Term Analysis - Correlation Analysis" + "\n", + "## 1.4 Perform correlation analysis\n", + "\n", + "- [Retrieve the most correlated unigrams and bigrams for each intent](#retrieve)\n", + "- [Actions for anomalous correlations](#anomalous)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Retrieve the most correlated unigrams and bigrams for each intent\n", + "### Retrieve the most correlated unigrams and bigrams for each intent\n", "\n", - "We perform a chi square significance test using count features to determine the terms that are most correlated with each intent in the dataset. \n", + "Perform a chi square significance test using count features to determine the terms that are most correlated with each intent in the data set. \n", "\n", - "A `unigram` is a single word, while a `bigram` is two consecutive words from within the training data. E.g. 
If you have a sentence like `Thank you for your service`, each of the words in the sentence are considered unigrams while terms like `Thank you`, `your service` are considered bigrams.\n", + "A `unigram` is a single word, while a `bigram` is two consecutive words from within the training data. For example, if you have a sentence like `Thank you for your service`, each of the words in the sentence are considered unigrams while terms like `Thank you`, `your service` are considered bigrams.\n", "\n", - "If you see terms like `hi`, `hello` correlated with a `greeting` intent that would be reasonable. But if you see terms like `table`, `chair` correlated with the `greeting` intent that would be anomalous. A scan of the most correlated unigrams & bigrams for each intent can help you spot potential anomalies within your training data.\n", + "Terms such as `hi`, `hello` correlated with a `greeting` intent are reasonable. But terms such as `table`, `chair` correlated with the `greeting` intent are anomalous. A scan of the most correlated unigrams & bigrams for each intent can help you spot potential anomalies within your training data.\n", "\n", - "**Note**: We ignore the following common words from consideration `an, a, in, on, be, or, of, a, and, can, is, to, the, i`" + "**Note**: We ignore the following common words (\\\"stop words\\\") from consideration `an, a, in, on, be, or, of, a, and, can, is, to, the, i`" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "importlib.reload(chi2_analyzer)\n", @@ -299,12 +320,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Actions for Anomalous Correlations\n", + "### Actions for anomalous correlations\n", "\n", - "If you identify unusual / anomalous correlated terms like: numbers, names etc., which should not be correlated with an intent please read the following:\n", + "If you identify unusual or anomalous correlated terms such as: numbers, names and so on, which should not be correlated with an intent, consider the following:\n", " \n", "- **Case 1** : If you see names appearing amongst correlated unigrams or bigrams, add more variation of names so no specific names will be correlated \n", - "- **Case 2** : If you see specific numbers like 1234 amongst correlated unigrams or bigrams and are not helpful to the use case, remove or mask these numbers from the examples\n", + "- **Case 2** : If you see specific numbers like 1234 amongst correlated unigrams or bigrams and these are not helpful to the use case, remove or mask these numbers from the examples\n", "- **Case 3** : If you see terms which should never be correlated to that specific intent, consider adding or removing terms/examples so that domain specific terms are correlated with the correct intent" ] }, @@ -312,11 +333,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 1.4 Term Analysis - Heat Map\n", + "\n", + "## 1.5 Visualize terms using a heat map\n", + "\n", + "- [Display term analysis for a custom intent list](#customintent)\n", + "- [Actions for anomalous terms in the heat map](#heatmap)\n", "\n", - "A heatmap of terms is a method using which we can visualize which terms or words are frequently occuring within each intent. Rows are the terms and columns are the intents. \n", + "A heat map of terms is a method to visualize terms or words that frequently occur within each intent. Rows are the terms, and columns are the intents. 
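As a rough illustration of how such a heat map is built, the sketch below mirrors the pivot performed in `keyword_analyzer.py` earlier in this change set (token counts grouped by intent, pivoted so rows are tokens and columns are intents, then plotted). The toy utterances are invented, whitespace splitting stands in for NLTK tokenization, and stop-word filtering is omitted; it is a minimal sketch, not the notebook's actual code.

```python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Toy workspace data; the real notebook reads "utterance"/"intent" from the skill JSON.
workspace_df = pd.DataFrame(
    {
        "utterance": ["hello there", "hi how are you", "send money now", "transfer money please"],
        "intent": ["greeting", "greeting", "transfer", "transfer"],
    }
)

# One (intent, token) row per token occurrence; str.split() stands in for nltk.word_tokenize.
rows = [
    (intent, token)
    for intent, utterance in zip(workspace_df["intent"], workspace_df["utterance"])
    for token in utterance.split()
    if len(token) > 1
]
words = pd.DataFrame(rows, columns=["intent", "word"])

# Count tokens per intent, then pivot so rows are tokens and columns are intents.
counts = words.groupby("intent").word.value_counts().rename("n_w").reset_index()
table = counts.pivot(index="word", columns="intent", values="n_w").fillna(0).astype("int32")

# Heat map: darker cells mark tokens that dominate a given intent.
sns.heatmap(table, annot=True, fmt="d", cmap="PuBu")
plt.show()
```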
\n", "\n", - "By default we show only the top 30 intents with the highest number of user examples in the analysis. This number can be changed if needed." + "The code below displays the top 30 intents with the highest number of user examples in the analysis. This number can be changed if needed." ] }, { @@ -338,9 +363,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Term Analysis for Custom Intent List\n", + "### Display term analysis for a custom intent list\n", "\n", - "If you wish to see term analysis for specific intents, feel free to add those intents to the intent list. This shall generate a custom term heatmap. By default we show the top 30 tokens, but this can be changed if needed" + "If you wish to see term analysis for specific intents, feel free to add those intents to the intent list. This generates a custom term heatmap. The code below displays the top 20 terms, but this can be changed if needed." ] }, { @@ -364,26 +389,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Actions for Anomalous Terms in Heat Map\n", + "### Actions for anomalous terms in the heat map\n", "\n", - "If you notice any terms or words which should not be frequently present within an intent, consider modifying examples in that intent" + "If you notice any terms or words which should not be frequently present within an intent, consider modifying examples in that intent." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 1.5 Ambiguity in Training Data\n", - "### Uncover possibly ambiguous terms based on feature correlation\n", - "Based on the chi-square analysis above, we generate intent pairs whose correlated unigrams and bigrams overlap.\n", - "This allows us to get a glimpse of which unigrams or bigrams might cause potential confusion in intent detection." + "\n", + "## 1.6 Ambiguity in the training data\n", + "\n", + "- [Uncover ambiguous utterances across intents](#uncover)\n", + "- [Actions for ambiguity in the training data](#ambiguityaction)\n", + "\n", + "Run the code blocks below to uncover possibly ambiguous terms based on feature correlation.\n", + "\n", + "Based on the chi-square analysis above, generate intent pairs which have overlapping correlated unigrams and bigrams.\n", + "This allows you to get a glimpse of which unigrams or bigrams might cause potential confusion with intent detection:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### A. Top Intent Pairs whose correlated unigrams overlap" + "#### A. Top intent pairs with overlapping correlated unigrams" ] }, { @@ -400,7 +431,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### B. Top Intent Pairs whose correlated bigrams overlap" + "#### B. Top intent pairs with overlapping correlated bigrams" ] }, { @@ -417,7 +448,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### C. Overlap Checker for Specific Intents" + "#### C. Overlap checker for specific intents" ] }, { @@ -437,8 +468,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Uncover ambiguous utterances across intents\n", - "The following analysis shows user examples that are similar but fall under different Intents. " + "### Uncover ambiguous utterances across intents\n", + "The following analysis shows user examples that are similar but fall under different intents. 
" ] }, { @@ -455,49 +486,55 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Actions for Ambiguity in Training Data\n", + "### Actions for ambiguity in the training data\n", "\n", - "**Ambiguous Intent Pairs** \n", - "If you see terms which are correlated with more than 1 intent, please review if this seems anomalous based on the use case for that intent. If it seems reasonable, it may not be an issue. \n", + "**Ambiguous intent pairs** \n", + "If you see terms which are correlated with more than 1 intent, review if this seems anomalous based on the use case for that intent. If it seems reasonable, it is probably not an issue. \n", "\n", - "**Ambiguous Utterances across intents** \n", - "1. **Duplicates Utterances**: For duplicate or almost identical utterances, remove those that seem unnecesssary\n", - "2. **Similar Utterances**: For similar utterances please review the use case for those intents and make sure that they are not accidental additions caused by human error in creating the training data \n", + "**Ambiguous utterances across intents** \n", + "- **Duplicate utterances**: For duplicate or almost identical utterances, remove those that seem unnecessary.\n", + "- **Similar utterances**: For similar utterances, review the use case for those intents and make sure that they are not accidental additions caused by human error when the training data was created. \n", "\n", - "Reference for more information on entities: https://cloud.ibm.com/docs/services/assistant/services/assistant?topic=assistant-entities\n", + "For more information about entity, refer to the Entity Documentation.\n", "\n", - "For more in-depth analysis related to possible conflicts in your training data across intents, try the conflict detection feature in Watson Assistant https://cloud.ibm.com/docs/services/assistant?topic=assistant-intents#intents-resolve-conflicts" + "For more in-depth analysis related to possible conflicts in your training data across intents, try the conflict detection feature in Watson Assistant. Refer to
Conflict Resolution Documentation." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Part 2: Model Analysis\n", + "\n", + "# Part 2: Prepare the test data\n", + "\n", + "Analyze your existing Watson Assistant Dialog Skill with the help of a test set.\n", "\n", - "Analyze your existing Watson Assistant Dialog Skill with the help of a test set." + "2.1. [Obtain test data from Cloud Object Storage](#cos)
\n", + "2.2. [Evaluate the test data](#evaluate)
\n", + "2.3. [Analyze the test data](#testanalysis)
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup: Upload Test Data\n", - "Please upload a test set in csv/tsv format. Each line in the file should have only `User_InputIntent` \n", + "## 2.1 Obtain test data from Cloud Object Storage\n", "\n", - "An example would be\n", + "Upload a test set in tsv format. Each line in the file should have only `User_InputIntent` \n", + "\n", + "For example:\n", "```\n", "hello how are youGreeting \n", "I would like to talk to a humanAgentHandoff \n", - "```\n", - "\n", - "Modify the separator used if you want to use data in csv format rather than tsv" + "```" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "importlib.reload(skills_util)\n", @@ -509,17 +546,18 @@ "test_df = skills_util.process_test_set(test_set_path, separator)\n", "\n", "display(Markdown(\"### Random Test Sample\"))\n", - "display(HTML(test_df.sample(n=10).to_html(index=False)))" + "display(HTML(test_df.sample(n=10).to_html(index=False)))\n", + "display(HTML(test_df.sample(n=10).to_html(index=False))) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Evaluate Test Data\n", - "These steps can take time if you have a large test set \n", + "## 2.2 Evaluate the test data\n", + "These steps can take time if you have a large test set. \n", "\n", - "**Note**: You will be charged for calls made from this notebook based on your WA plan " + "**Note**: You will be charged for calls made from this notebook based on your Watson Assistant plan. " ] }, { @@ -542,14 +580,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2.1 Model Analysis" + "\n", + "## 2.3 Analyze the test data\n", + "\n", + "- [Display an overview of the test data](#overview)\n", + "- [Compare the test data and the training data](#compare)\n", + "- [Determine the overall accuracy on the test set](#accuracy)\n", + "- [Analyze the errors](#errors)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Test Data Overview" + "### Display an overview of the test data" ] }, { @@ -567,17 +611,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Compare Test Data & Training Data\n", + "### Compare the test data and the training data\n", "\n", - "Ideally the Test and Training Data distributions should be similar. The following metrics can help identify gaps between Test Set and Training Set:\n", + "Ideally the test and training data distributions should be similar. The following metrics can help identify gaps between the test set and the training set:\n", "\n", - "**1.** The distribution of User Examples per Intent for the Test Data should be comparable to the Training Data \n", - "**2.** Average length of User Examples for Test and Training Data should be comparable \n", - "**3.** The vocabulary and phrasing of utterances in the Test Data should be comparable to the Training Data\n", + "**1.** The distribution of user examples per intent for the test data should be comparable to the training data \n", + "**2.** The average length of user examples for test and training data should be comparable to the training data
\n", + "**3.** The vocabulary and phrasing of utterances in the test data should be comparable to the training data\n", "\n", - "If your test data comprises of examples labelled from your logs, and the training data comprises of examples created by human subject matter experts, there may be discrepancies between what the virtual assistant designers thought the end users would type and the way they actually type in production. Thus if you find discrepancies in this section, you might want to consider changing your design to more closely resemble the way end users use your system.\n", + "If your test data comprises of examples labelled from your logs, and the training data comprises of examples created by human subject matter experts, there may be discrepancies between what the virtual assistant designers thought the end users would type and the way they actually type in production. Thus, if you find discrepancies in this section, consider changing your design to resemble the way in which end users use your system more closely.\n", "\n", - "**Note**: You will be charged for calls made from this notebook based on your WA plan" + "**Note**: You will be charged for calls made from this notebook based on your WA plan." ] }, { @@ -594,7 +638,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Determine Overall Accuracy on Test Set" + "### Determine the overall accuracy on the test set" ] }, { @@ -613,11 +657,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Error Analysis\n", + "### Analyze the errors\n", "\n", - "This section gives the user an overview of the errors made by the intent classifier on the test set \n", + "This section gives you an overview of the errors made by the intent classifier on the test set. \n", "\n", - "**Note** `System Out of Domain` labels are assigned to user examples which get classified with confidence scores less than 0.2 as Watson Assistant would consider them to be irrelevant" + "**Note**: `System Out of Domain` labels are assigned to user examples which get classified with confidence scores less than 0.2 as Watson Assistant considers them to be irrelevant." ] }, { @@ -640,16 +684,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Part 3: Advanced Analysis" + "\n", + "# Part 3: Perform advanced analysis\n", + "\n", + "3.1 [Perform analysis using confidence thresholds](#part3.1)
\n", + "3.2 [Analysis interpretation @ confidence level T](#levelT)
\n", + "3.3 [Highlighting term importance](#part3.2)
\n", + "3.4 [Analyzing abnormal confidence levels](#part3.3)
\n", + "3.5 [Perform an analysis using correlated entities per intent](#part3.4)
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 3.1 Analysis using Confidence Thresholds\n", + "\n", + "## 3.1 Perform analysis using confidence thresholds\n", "\n", - "In this phase of the analysis, we illustrate how a confidence threshold which is used to determine what is considered irrelevant or out of domain can be used for analysis " + "This analysis illustrates how a confidence threshold is used to determine which data considered irrelevant or out of domain can be used for analysis. " ] }, { @@ -666,12 +718,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Analysis Interpretation @ Confidence Level T \n", + "## 3.2 Analysis interpretation @ confidence level T \n", "\n", - "If a certain confidence threshold T is selected then \n", - "1. The on topic accuracy for test examples which cross the threshold is ***TOA***\n", - "2. Percentage of total test examples which returned confidences higher than the threshold measured as ***Bot Coverage %***\n", - "3. If out of domain examples exist, we falsely accept out of domain examples as on topic examples at a rate measured by ***FAR***" + "If a certain confidence threshold T is selected, then: \n", + "- The on-topic accuracy for test examples which cross the threshold is ***TOA***\n", + "- The percentage of total test examples which returns confidences higher than the threshold is measured as ***Bot Coverage %***\n", + "- If out of domain examples exist, falsely accept out of domain examples as on topic examples at a rate measured by ***FAR*** (False Acceptance Rate)" ] }, { @@ -688,35 +740,36 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Threshold Selection\n", + "### Select the threshold value\n", "\n", - "By selecting a higher threshold we can potentially bias our systems towards being more accurate in terms of determining whether an utterance is on topic or out of domain. The default confidence threshold for Watson Assistance is 0.2 \n", + "By selecting a higher threshold, you can potentially bias your systems so that they are more accurate in terms of determining whether an utterance is on topic or out of domain. The default confidence threshold for Watson Assistance is 0.2. \n", "\n", - "**Effect on Accuracy**: When we select a higher threshold T, this can result in higher accuracy (TOA) on those thresholded examples since we are looking at utterances that the intent detector is more confident on.\n", + "**Effect on accuracy**: When you select a higher threshold T, this can result in higher accuracy (TOA) because only examples with confidences greater than the threshold T are included.\n", "\n", - "**Effect on Bot Coverage %**: But when we select a higher threshold T, this can also result in less examples being responded to by the virtual assistant.\n", + "**Effect on bot coverage %**: However, when you select a higher threshold T, this can also result in the virtual assistant responding to less examples.\n", "\n", - "**Deflection to Human Agent**: In the scenarios where the virtual assistant is setup to hand off to a human agent when its less confident, having a higher threshold T can: \n", + "**Deflection to human agent**: In the scenarios where the virtual assistant is setup to hand off to a human agent when it is less confident, having a higher threshold T can: \n", "\n", - "1. Improve end user experience when interacting with a virtual assistant, as it is continuing interaction only when its highly confident\n", - "2. 
But this can result in higher costs to the customer as this can result in more deflections to the human agents \n", - "3. There is thus a trade-off and a threshold needs to be decided on a per customer basis" + "- Improve end user experience when interacting with a virtual assistant, as it continues interaction only when its highly confident\n", + "- Result in higher costs to the customer as this can result in more deflections to the human agents \n", + "\n", + "Thus, there is a trade-off and you need to decide on a threshold value on a per customer basis." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Threshold Selection on Individual Intents\n", + "### Examine the threshold selection on individual intents\n", "This section allows the examination of thresholds on specific intents.\n", "\n", - "- Use INTENT_LIST = [] to get analysis which averages across all intents\n", - "- Use INTENT_LIST = ['intent1', 'intent2'] to examine specific intents and threshold analysis on these intents\n", - "- Use INTENT_LIST = ['ALL_INTENTS'] to examine all intents and threshold analysis for each\n", - "- Use INTENT_LIST = [MOST_FREQUENT_INTENT] to get analysis on the intent with the most test examples (DEFAULT)\n", + "- Use `INTENT_LIST = []` to get analysis which averages across all intents\n", + "- Use `INTENT_LIST = ['intent1', 'intent2']` to examine specific intents and threshold analysis on these intents\n", + "- Use `INTENT_LIST = ['ALL_INTENTS']` to examine all intents and threshold analysis for each\n", + "- Use `INTENT_LIST = [MOST_FREQUENT_INTENT]` to get analysis on the intent with the most test examples (DEFAULT)\n", "\n", "**False Acceptance Rate (FAR) for specific intents** \n", - "When we calculate FAR across all intents (as in previous section) we calculate fraction of out of domain examples falsely considered on topic. When we calculate FAR for specific intents we calculate the fraction of examples which were falsely predicted to be that specific intent." + "When we calculate FAR across all intents (as in previous section) we calculate fraction of out of domain examples falsely considered on topic. When we calculate FAR for specific intents, we calculate the fraction of examples which were falsely predicted to be that specific intent." ] }, { @@ -743,11 +796,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3.2 Term Importance Highlighting\n", + "\n", + "## 3.3 Highlight term importance\n", "\n", - "This intent can be ground-truth or an incorrect predicted intent. It provides term level insights on which terms the classifier thought were important in relation to that specific intent.\n", + "This intent can be ground-truth or an incorrectly predicted intent. It provides term level insights about which terms the classifier thought were important in relation to that specific intent.\n", "\n", - "Even if the system predicts an intent correctly, the terms which the intent classifier though were important may not be as expected by human insight. Human insight might suggest that the intent classifier is focusing on the wrong terms. \n", + "Even if the system predicts an intent correctly, the terms which the intent classifier thought were important may not be as expected by human insight. Human insight might suggest that the intent classifier is focusing on the wrong terms. \n", "\n", "The score of each term in the following highlighted images can be viewed as importance factor of that term for that specific intent. 
The larger the score, the more important the term." ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ - "We can get the highlighted images for either wrongly-predicted utterances or utterances where the classifier returned a low confidence. \n", + "You can get the highlighted images for either wrongly-predicted utterances or utterances where the classifier returned a low confidence. \n", "\n", - "**Note**: You will be charged for calls made from this notebook based on your WA plan" + "**Note**: You will be charged for calls made from this notebook based on your Watson Assistant plan." ] },
{ @@ -795,9 +849,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the section below we analyze your test results and produce highlighting for the top 25 problematic utterances which were either mistakes or had confidences below the threshold that was set. \n", + "In the section below you analyze your test results and produce highlighting for the top 25 problematic utterances which were either misclassified or received confidences below the threshold that was set. \n", "\n", - "**Note**: You will be charged for calls made from this notebook based on your WA plan" + "**Note**: You will be charged for calls made from this notebook based on your Watson Assistant plan." ] },
{ @@ -834,10 +888,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3.3 Abnormal Confidence Analysis\n", - "Every test utterance is classified as a specific intent with a specific confidence by the WA intent classifier. It is expected that model would be confident when correctly predicting examples and not highly confident when incorrectly predicting examples. \n", + "\n", + "## 3.4 Analyze abnormal confidence levels\n", + "Every test utterance is classified as a specific intent with a specific confidence by the Watson Assistant intent classifier. It is expected that the model would be confident when it correctly predicts examples and not highly confident when it incorrectly predicts examples. \n", "\n", - "But often this is not true. This may suggest there are anomalies in the design. Examples that are predicted correctly with low confidence and the examples that are predicted incorrectly with high confidence are thus cases which need to be reviewed." + "But this is not always true. This can be because there are anomalies in the design. Examples that are predicted correctly with low confidence and the examples that are predicted incorrectly with high confidence are cases which need to be reviewed."
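The two review groups described above can be sketched in a few lines of pandas. This is only an illustrative sketch, not the notebook's own implementation; it assumes a results DataFrame with one row per test utterance and columns named `correct_intent`, `top_intent`, and `top_confidence`, and the two thresholds are arbitrary example values.

```python
import pandas as pd


def flag_abnormal_confidence(results: pd.DataFrame,
                             correct_thresh: float = 0.5,
                             incorrect_thresh: float = 0.8):
    """Split test results into the two groups that warrant manual review."""
    is_correct = results["correct_intent"] == results["top_intent"]

    # Correct predictions that the classifier was unexpectedly unsure about.
    correct_low_conf = results[is_correct & (results["top_confidence"] < correct_thresh)]

    # Incorrect predictions that the classifier was nevertheless sure about.
    incorrect_high_conf = results[~is_correct & (results["top_confidence"] > incorrect_thresh)]

    return correct_low_conf, incorrect_high_conf
```

Both frames can then be rendered for review, for example with `display(HTML(incorrect_high_conf.to_html(index=False)))`, mirroring how other tables are shown in the notebook.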
] },
{ @@ -880,11 +935,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Actions for abnormal confidence examples\n", + "### Actions to take when you have examples of abnormal confidence\n", "\n", - "If there are examples which are getting classified incorrectly with high confidence for specific intents, it may indicate an issue in the design of those specific intents as the user examples provided for that intent may be overlapping with the design of other intents.\n", + "If there are examples which are incorrectly classified with high confidence for specific intents, it may indicate an issue in the design of those specific intents because the user examples provided for that intent may be overlapping with the design of other intents.\n", "\n", - "If intent A seems to always get misclassified as intent B with high confidence or gets correctly predicted with low confidence, please consider using intent conflict detection https://cloud.ibm.com/docs/services/assistant?topic=assistant-intents#intents-resolve-conflicts\n", + "If intent A seems to always get misclassified as intent B with high confidence or gets correctly predicted with low confidence, consider using intent conflict detection. For more information, refer to the [Conflict Resolution Documentation](https://cloud.ibm.com/docs/services/assistant?topic=assistant-intents#intents-resolve-conflicts).\n", "\n", "Also consider whether those two intents need to be two separate intents or whether they need to be merged. If they can't be merged, then consider adding more user examples which distinguish intent A specifically from intent B." ] },
{ @@ -893,11 +948,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3.4 Analysis using Correlated Entities per Intent\n", + "\n", + "## 3.5 Perform an analysis using correlated entities per intent\n", "\n", - "We perform a chi square significance test for entities as we did for unigrams and bigrams in the previous section. For each utterance in the training data, this analysis will call the mesage api for entity detection on each utterance and find the most correlated entities for each intent\n", + "Perform a chi square significance test for entities, as was done for unigrams and bigrams in the previous section. This analysis calls the message API for entity detection on each utterance in the training data and finds the most correlated entities for each intent.\n", "\n", - "**Note**: You will be charged for calls made from this notebook based on your WA plan " + "**Note**: You will be charged for calls made from this notebook based on your Watson Assistant plan. " ] },
{ @@ -925,6 +981,16 @@ " display(Markdown(\"### Target workspace has no entities.\"))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Part 4: Summary\n", + "Congratulations! You have successfully completed the dialog skill analysis.
\n", + "This notebook is designed to improve our dialog skill analysis in an iterative fashion. Use it to tackle one aspect of your dialog skill at a time and start over for another aspect later for continuous improvement." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -935,9 +1001,9 @@ "\n", "**True Negatives (TN):** True Negative measures the number of correctly predicted negative values meaning that the predicted class is the same as the actual class which is not the target intent.\n", "\n", - "**False Positives (FP):** False Positive measures the number of incorrectedly predicted positive values meaning that the predicted class is the target intent but the actual class is not the target intent. \n", + "**False Positives (FP):** False Positive measures the number of incorrectly predicted positive values meaning that the predicted class is the target intent but the actual class is not the target intent. \n", "\n", - "**False Negatives (FN):** False Negatives measures the number of incorrectedly predicted negative values meaning that the predicted class is not the target intent but the actual class is the target intent. \n", + "**False Negatives (FN):** False Negatives measures the number of incorrectly predicted negative values meaning that the predicted class is not the target intent but the actual class is the target intent. \n", "\n", "**Accuracy:** Accuracy measures the ratio of corrected predicted user examples out of all user examples. \n", "Accuracy = (TP + TN) / (TP + TN + FP + FN) \n", @@ -951,15 +1017,43 @@ "**F1 Score:** F1 Score is the harmonic average of Precision and Recall. \n", "F1 = 2 \\* (Precision \\* Recall)/ (Precision + Recall)\n", "\n", - "For more information related to Watson Assistant: https://cloud.ibm.com/docs/services/assistant" + "For more information related to Watson Assistant, refer to the Watson Assistant Documentation." ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "### Authors\n", + "\n", + "**Haode Qi** is a data scientist at IBM Watson who delivers new machine learning algorithms into IBM Watson's market leading conversational AI service. He works with clients to help improve their conversational AI agents and helps them tackle complex challenges at scale with tools like Dialog Skill Analysis. His work primarily focuses on natural language technology with interests in defending adversarial attacks in text, PII redaction and Auto-AI for text. He is also a believer in open-source and has been contributing to open-source projects like the IBM Auto-AI framework - Lale.\n", + "\n", + "**Navneet Rao** is an engineering lead at IBM Watson who believes in building unique AI-powered experiences which augment human capabilities. He currently works on AI innovation & research for IBM's award-winning conversational computing platform, the IBM Watson Assistant. His primary areas of interest include machine learning problems related to conversational AI, natural language understanding, semantic search & transfer learning.\n", + "\n", + "**Ming Tan**, PhD, is a research scientist at IBM Watson who works on prototyping and productizing various algorithmic features for the IBM Watson Assistant. His research interests include a broad spectrum of problems related to conversational AI such as low-resource intent classification, out-of-domain detection, multi-user chat channels, passage-level semantic matching and entity detection. 
His work has been published at various top tier NLP conferences.\n", + "\n", + "**Yang Yu**, PhD, is a research scientist at IBM Watson focusing on problems related to language understanding, question answering, deep learning and representation learning for various NLP tasks. He has been awarded by IBM for his contributions to several internal machine learning competitions which have included researchers from across the globe. Novel machine learning solutions designed by him have helped solve critical question answering and human-computer dialog problems for various IBM Watson products." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "Copyright © IBM Corp. 2019. This notebook and its source code are released under the terms of the Apache License, Version 2.0." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "Love this notebook? \n", + "Don't have an account yet?
\n", + "Share it with your colleagues and help them discover the power of Watson Studio!\n", + "Sign Up
\n", + "
" + ] } ], "metadata": { diff --git a/tests/end2end/end2end_test.py b/tests/end2end/end2end_test.py index 743fbd8..cfbf6c1 100644 --- a/tests/end2end/end2end_test.py +++ b/tests/end2end/end2end_test.py @@ -5,6 +5,7 @@ class TestNotebook(unittest.TestCase): def setUp(self): + unittest.TestCase.setUp(self) CONFIG_FILE = './wa_config.txt' with open(CONFIG_FILE) as fi: self.apikey = fi.readline().strip() @@ -15,5 +16,8 @@ def test_notebook(self): nb, errors = skills_util.run_notebook('skill_analysis.ipynb', self.apikey, self.wksp_id, test_file, 'notebook_output') self.assertEqual(errors, []) + def tearDown(self): + unittest.TestCase.tearDown(self) + if __name__ == '__main__': unittest.main() \ No newline at end of file