diff --git a/_version.py b/_version.py index 8b77a9a..9297fba 100644 --- a/_version.py +++ b/_version.py @@ -1,4 +1,4 @@ # Semantic versioning # MAJOR.MINOR.PATCH -__version__ = '1.0.2' +__version__ = '1.1.0' diff --git a/assistant_dialog_skill_analysis/__init__.py b/assistant_dialog_skill_analysis/__init__.py index a6221b3..6849410 100644 --- a/assistant_dialog_skill_analysis/__init__.py +++ b/assistant_dialog_skill_analysis/__init__.py @@ -1 +1 @@ -__version__ = '1.0.2' +__version__ = "1.1.0" diff --git a/assistant_dialog_skill_analysis/confidence_analysis/confidence_analyzer.py b/assistant_dialog_skill_analysis/confidence_analysis/confidence_analyzer.py index 17df373..5b31442 100644 --- a/assistant_dialog_skill_analysis/confidence_analysis/confidence_analyzer.py +++ b/assistant_dialog_skill_analysis/confidence_analysis/confidence_analyzer.py @@ -17,51 +17,92 @@ def abnormal_conf(full_results, correct_thresh, incorrect_thresh): :return: """ test_pd = pd.DataFrame(full_results) - test_pd = test_pd.loc[~(test_pd['correct_intent'] == 'SYSTEM_OUT_OF_DOMAIN')] - correct = test_pd.loc[test_pd['correct_intent'] == test_pd['top_intent']] + test_pd = test_pd.loc[~(test_pd["correct_intent"] == "SYSTEM_OUT_OF_DOMAIN")] + correct = test_pd.loc[test_pd["correct_intent"] == test_pd["top_intent"]] - correct_low_conf = correct.loc[correct['top_confidence'] < correct_thresh] + correct_low_conf = correct.loc[correct["top_confidence"] < correct_thresh] correct_low_conf = correct_low_conf[ - ['correct_intent', 'utterance', 'top_confidence', 'top_intent']] + ["correct_intent", "utterance", "top_confidence", "top_intent"] + ] - incorrect = test_pd.loc[~(test_pd['correct_intent'] == test_pd['top_intent'])] - incorrect_high_conf = incorrect.loc[incorrect['top_confidence'] > incorrect_thresh] + incorrect = test_pd.loc[~(test_pd["correct_intent"] == test_pd["top_intent"])] + incorrect_high_conf = incorrect.loc[incorrect["top_confidence"] > incorrect_thresh] top1 = list() top2 = list() top3 = list() for i in range(len(incorrect_high_conf)): - possible_range = len(incorrect_high_conf.iloc[i, :]['top_predicts']) + possible_range = len(incorrect_high_conf.iloc[i, :]["top_predicts"]) for j in range(3): if j == 0: if possible_range >= 1: - top1.append(incorrect_high_conf.iloc[i, :]['top_predicts'][j]['intent'] + ' ' + - '(' + str(np.round(incorrect_high_conf.iloc[i, :]['top_predicts'][j] - ['confidence'], 3)) + ')') + top1.append( + incorrect_high_conf.iloc[i, :]["top_predicts"][j]["intent"] + + " " + + "(" + + str( + np.round( + incorrect_high_conf.iloc[i, :]["top_predicts"][j][ + "confidence" + ], + 3, + ) + ) + + ")" + ) else: - top1.append('NA') + top1.append("NA") if j == 1: if possible_range >= 2: - top2.append(incorrect_high_conf.iloc[i, :]['top_predicts'][j]['intent'] + ' ' + - '(' + str(np.round(incorrect_high_conf.iloc[i, :]['top_predicts'][j] - ['confidence'], 3)) + ')') + top2.append( + incorrect_high_conf.iloc[i, :]["top_predicts"][j]["intent"] + + " " + + "(" + + str( + np.round( + incorrect_high_conf.iloc[i, :]["top_predicts"][j][ + "confidence" + ], + 3, + ) + ) + + ")" + ) else: - top2.append('NA') + top2.append("NA") if j == 2: if possible_range >= 3: - top3.append(incorrect_high_conf.iloc[i, :]['top_predicts'][j]['intent'] + ' ' + - '(' + str(np.round(incorrect_high_conf.iloc[i, :]['top_predicts'][j] - ['confidence'], 3)) + ')') + top3.append( + incorrect_high_conf.iloc[i, :]["top_predicts"][j]["intent"] + + " " + + "(" + + str( + np.round( + incorrect_high_conf.iloc[i, :]["top_predicts"][j][ + 
"confidence" + ], + 3, + ) + ) + + ")" + ) else: - top3.append('NA') + top3.append("NA") - incorrect_high_conf['top1_prediction'] = top1 - incorrect_high_conf['top2_prediction'] = top2 - incorrect_high_conf['top3_prediction'] = top3 + incorrect_high_conf["top1_prediction"] = top1 + incorrect_high_conf["top2_prediction"] = top2 + incorrect_high_conf["top3_prediction"] = top3 incorrect_high_conf = incorrect_high_conf[ - ['correct_intent', 'utterance', 'top1_prediction', 'top2_prediction', 'top3_prediction']] + [ + "correct_intent", + "utterance", + "top1_prediction", + "top2_prediction", + "top3_prediction", + ] + ] return correct_low_conf, incorrect_high_conf @@ -79,13 +120,13 @@ def analysis(results, intent_list=None): analysis_df = analysis_pipeline(results) return analysis_df - if len(intent_list) == 1 and intent_list[0] == 'ALL_INTENTS': - intent_list = list(results['correct_intent'].unique()) + if len(intent_list) == 1 and intent_list[0] == "ALL_INTENTS": + intent_list = list(results["correct_intent"].unique()) if OFFTOPIC_LABEL in intent_list: intent_list.remove(OFFTOPIC_LABEL) analysis_df_list = list() for intent_name in intent_list: - display(Markdown('### Threshold Analysis for Intent: {}'.format(intent_name))) + display(Markdown("### Threshold Analysis for Intent: {}".format(intent_name))) analysis_df = analysis_pipeline(results, intent_name) if all(analysis_df): analysis_df.index = np.arange(1, len(analysis_df) + 1) @@ -94,37 +135,58 @@ def analysis(results, intent_list=None): return analysis_df_list + def _display_analysis_metrics(display_far): """display the explanation for analysis metrics""" display(Markdown("### Threshold Metrics")) - display(Markdown( - "We calculate metrics for responses where the top intent has a confidence above the \ - threshold specified on the x-axis. ")) - - display(Markdown( - "We consider examples which are within the scope of the chatbot's problem formulation as \ + display( + Markdown( + "We calculate metrics for responses where the top intent has a confidence above the \ + threshold specified on the x-axis. 
" + ) + ) + + display( + Markdown( + "We consider examples which are within the scope of the chatbot's problem formulation as \ on topic or in domain and those examples which are outside the scope of the problem to be \ - out of domain or irrelevant")) + out of domain or irrelevant" + ) + ) display(Markdown("#### 1) Thresholded On Topic Accuracy (TOA)")) - display(Markdown( - "x-axis: Confidence threshold used || " + - "y-axis: Intent Detection Accuracy for On Topic utterances")) + display( + Markdown( + "x-axis: Confidence threshold used || " + + "y-axis: Intent Detection Accuracy for On Topic utterances" + ) + ) display(Markdown("#### 2) Bot Coverage %")) - display(Markdown( - "x-axis: Confidence threshold used || " + - "y-axis: Fraction of All utterances above the threshold")) + display( + Markdown( + "x-axis: Confidence threshold used || " + + "y-axis: Fraction of All utterances above the threshold" + ) + ) if display_far: - display(Markdown("#### 3) False Acceptance Rate for Out of Domain Examples (FAR)")) - display(Markdown( - "x-axis: Confidence threshold used || " + - "y-axis: Fraction of Out of Domain utterances falsely considered on topic")) - - display(Markdown( - "#### Note: Default acceptance threshold for Watson Assistant is set at 0.2.\ - Utterances with top intent confidence < 0.2 will be considered irrelevant")) + display( + Markdown("#### 3) False Acceptance Rate for Out of Domain Examples (FAR)") + ) + display( + Markdown( + "x-axis: Confidence threshold used || " + + "y-axis: Fraction of Out of Domain utterances falsely considered on topic" + ) + ) + + display( + Markdown( + "#### Note: Default acceptance threshold for Watson Assistant is set at 0.2.\ + Utterances with top intent confidence < 0.2 will be considered irrelevant" + ) + ) def generate_unique_thresholds(sorted_results_tuples): @@ -135,8 +197,12 @@ def generate_unique_thresholds(sorted_results_tuples): """ sort_uniq_confs = list(sorted(set([info[2] for info in sorted_results_tuples]))) thresholds = [0] - thresholds.extend([(sort_uniq_confs[idx] + sort_uniq_confs[idx + 1]) / 2 - for idx in range(len(sort_uniq_confs) - 1)]) + thresholds.extend( + [ + (sort_uniq_confs[idx] + sort_uniq_confs[idx + 1]) / 2 + for idx in range(len(sort_uniq_confs) - 1) + ] + ) return thresholds, sort_uniq_confs @@ -202,7 +268,7 @@ def _get_bot_coverage_list(sorted_infos, thresholds): cur_bot_coverage -= 1 current_step += 1 bot_coverage_count_list.append(cur_bot_coverage) - bot_coverage_list.append(cur_bot_coverage/tol) + bot_coverage_list.append(cur_bot_coverage / tol) return bot_coverage_list, bot_coverage_count_list @@ -226,7 +292,7 @@ def _get_far_list(sorted_infos, thresholds): current_step += 1 else: break - far_list.append(cur_fa_count/tol) + far_list.append(cur_fa_count / tol) far_count.append(cur_fa_count) return far_list, far_count @@ -240,27 +306,38 @@ def _convert_data_format(results, intent_name=None): :return: result_list: list of tuples of (ground_truth, prediction, confidence) sorted by conf """ if intent_name: - results = results[(results['correct_intent'] == intent_name) | - (results['top_intent'] == intent_name)].copy() - - results['correct_intent'] = np.where((results['correct_intent'] != - results['top_intent']) & - (results['top_intent'] == intent_name), - OFFTOPIC_LABEL, - results['correct_intent']) - - results_list = [(gt, pred, conf) for gt, pred, conf in - zip(results['correct_intent'], - results['top_intent'], - results['top_confidence'])] + results = results[ + (results["correct_intent"] == intent_name) + 
| (results["top_intent"] == intent_name) + ].copy() + + results["correct_intent"] = np.where( + (results["correct_intent"] != results["top_intent"]) + & (results["top_intent"] == intent_name), + OFFTOPIC_LABEL, + results["correct_intent"], + ) + + results_list = [ + (gt, pred, conf) + for gt, pred, conf in zip( + results["correct_intent"], + results["top_intent"], + results["top_confidence"], + ) + ] results_list = sorted(results_list, key=lambda x: x[2]) else: - results_list = [(truth, prediction, confidence) for truth, prediction, confidence - in zip(results['correct_intent'], - results['top_intent'], - results['top_confidence'])] + results_list = [ + (truth, prediction, confidence) + for truth, prediction, confidence in zip( + results["correct_intent"], + results["top_intent"], + results["top_confidence"], + ) + ] results_list = sorted(results_list, key=lambda x: x[2]) return results_list @@ -273,11 +350,13 @@ def extract_by_topic(sorted_results): :return: ontopic_infos, list """ - offtopic_infos = [prediction for prediction in sorted_results - if prediction[0] == OFFTOPIC_LABEL] + offtopic_infos = [ + prediction for prediction in sorted_results if prediction[0] == OFFTOPIC_LABEL + ] - ontopic_infos = [prediction for prediction in sorted_results - if prediction[0] != OFFTOPIC_LABEL] + ontopic_infos = [ + prediction for prediction in sorted_results if prediction[0] != OFFTOPIC_LABEL + ] return ontopic_infos, offtopic_infos @@ -295,23 +374,32 @@ def analysis_pipeline(results, intent_name=None): # if ontopic counts or sorted results are less than 3, the graph will show almost no variation # if all confidence of the predicted result are the same, there will be no variation - if len(ontopic_infos) < 3 or len(sorted_results) < 3 \ - or all(ele[2] == sorted_results[0][2] for ele in sorted_results): - display(Markdown('**Inadequate Data Points**: No analysis will be conducted')) + if ( + len(ontopic_infos) < 3 + or len(sorted_results) < 3 + or all(ele[2] == sorted_results[0][2] for ele in sorted_results) + ): + display(Markdown("**Inadequate Data Points**: No analysis will be conducted")) analysis_df = pd.DataFrame() return analysis_df - analysis_df, toa_list, bot_coverage_list, far_list, thresholds = \ - extract_table_analysis(sorted_results, - ontopic_infos, - offtopic_infos) + ( + analysis_df, + toa_list, + bot_coverage_list, + far_list, + thresholds, + ) = extract_table_analysis(sorted_results, ontopic_infos, offtopic_infos) if not intent_name and not analysis_df.empty: - line_graph_data = pd.DataFrame(data={'Thresholded On Topic Accuracy': toa_list, - 'Bot Coverage %': bot_coverage_list, - 'False Acceptance Rate (FAR) for Out of Domain Examples': - far_list}, - index=thresholds) + line_graph_data = pd.DataFrame( + data={ + "Thresholded On Topic Accuracy": toa_list, + "Bot Coverage %": bot_coverage_list, + "False Acceptance Rate (FAR) for Out of Domain Examples": far_list, + }, + index=thresholds, + ) create_threshold_graph(line_graph_data) @@ -332,25 +420,32 @@ def extract_table_analysis(sorted_results, ontopic_infos, offtopic_infos): thresholds, sort_uniq_confs = generate_unique_thresholds(sorted_results) toa_list, toa_count = _get_ontopic_accuracy_list(sorted_results, thresholds) - bot_coverage_list, bot_coverage_count = _get_bot_coverage_list(sorted_results, thresholds) + bot_coverage_list, bot_coverage_count = _get_bot_coverage_list( + sorted_results, thresholds + ) if len(offtopic_infos) >= OFFTOPIC_CNT_THRESHOLD_FOR_DISPLAY: far_list, _ = _get_far_list(sorted_results, thresholds) 
else: - display(Markdown( - 'Out of Domain examples fewer than **%d** thus \ - no False Acceptance Rate (FAR) calculated' - % OFFTOPIC_CNT_THRESHOLD_FOR_DISPLAY)) - far_list = [-1]*len(thresholds) - - analysis_df = create_display_table(toa_list, - bot_coverage_list, - bot_coverage_count, - sorted_results, - thresholds, - offtopic_infos, - far_list) + display( + Markdown( + "Out of Domain examples fewer than **%d** thus \ + no False Acceptance Rate (FAR) calculated" + % OFFTOPIC_CNT_THRESHOLD_FOR_DISPLAY + ) + ) + far_list = [-1] * len(thresholds) + + analysis_df = create_display_table( + toa_list, + bot_coverage_list, + bot_coverage_count, + sorted_results, + thresholds, + offtopic_infos, + far_list, + ) return analysis_df, toa_list, bot_coverage_list, far_list, thresholds @@ -361,21 +456,24 @@ def create_threshold_graph(data): :param data: :return: None """ - sns.set(rc={'figure.figsize': (20.7, 10.27)}) + sns.set(rc={"figure.figsize": (20.7, 10.27)}) plt.ylim(0, 1.1) - plt.axvline(.2, 0, 1) + plt.axvline(0.2, 0, 1) plot = sns.lineplot(data=data, palette="tab10", linewidth=3.5) - plt.setp(plot.legend().get_texts(), fontsize='22') - plot.set_xlabel('Threshold T', fontsize=18) - plot.set_ylabel('Metrics mentioned above', fontsize=18) - -def create_display_table(toa_list, - bot_coverage_list, - bot_coverage_count, - sorted_results, - thresholds, - offtopic_infos, - far_list): + plt.setp(plot.legend().get_texts(), fontsize="22") + plot.set_xlabel("Threshold T", fontsize=18) + plot.set_ylabel("Metrics mentioned above", fontsize=18) + + +def create_display_table( + toa_list, + bot_coverage_list, + bot_coverage_count, + sorted_results, + thresholds, + offtopic_infos, + far_list, +): """ create table for display purpose :param toa_list: @@ -388,20 +486,28 @@ def create_display_table(toa_list, :return: analysis_df, pandas dataframe containing metrics at intervals of 10% """ # produce the threshold quantiles for extraction of relevant information - display_thresholds = [t/100 for t in range(0, 100, 10)] + display_thresholds = [t / 100 for t in range(0, 100, 10)] display_indexes = [_find_threshold(t, thresholds) for t in display_thresholds] analysis_data = dict() - analysis_data['Threshold (T)'] = display_thresholds - analysis_data['Ontopic Accuracy (TOA)'] = [toa_list[idx]*100 for idx in display_indexes] - analysis_data['Bot Coverage %'] = [bot_coverage_list[idx]*100 for idx in display_indexes] - analysis_data['Bot Coverage Counts'] = [str(np.round(bot_coverage_count[idx], decimals=0)) - + ' / ' + str(len(sorted_results)) - for idx in display_indexes] + analysis_data["Threshold (T)"] = display_thresholds + analysis_data["Ontopic Accuracy (TOA)"] = [ + toa_list[idx] * 100 for idx in display_indexes + ] + analysis_data["Bot Coverage %"] = [ + bot_coverage_list[idx] * 100 for idx in display_indexes + ] + analysis_data["Bot Coverage Counts"] = [ + str(np.round(bot_coverage_count[idx], decimals=0)) + + " / " + + str(len(sorted_results)) + for idx in display_indexes + ] if len(offtopic_infos) >= OFFTOPIC_CNT_THRESHOLD_FOR_DISPLAY: - analysis_data['False Acceptance Rate (FAR)'] = [far_list[idx]*100 for - idx in display_indexes] + analysis_data["False Acceptance Rate (FAR)"] = [ + far_list[idx] * 100 for idx in display_indexes + ] analysis_df = pd.DataFrame(data=analysis_data) return analysis_df diff --git a/assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py b/assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py index cb49d2a..a7a7abe 100644 ---
a/assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py +++ b/assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py @@ -13,9 +13,11 @@ def _label_percentage(data_frame): :return: label_percentage_dict: dictionary maps label : % of labels """ total_examples = len(data_frame) - label_frequency_dict = dict(Counter(data_frame['intent']).most_common()) - percentage_list = np.array(list(label_frequency_dict.values()))/total_examples - label_percentage_dict = dict(zip(list(label_frequency_dict.keys()), percentage_list)) + label_frequency_dict = dict(Counter(data_frame["intent"]).most_common()) + percentage_list = np.array(list(label_frequency_dict.values())) / total_examples + label_percentage_dict = dict( + zip(list(label_frequency_dict.keys()), percentage_list) + ) return label_percentage_dict @@ -26,15 +28,17 @@ def _train_test_coloring(val): :return: """ if val > 25: - color = 'red' + color = "red" elif val > 10: - color = 'DarkBlue' + color = "DarkBlue" else: - color = 'green' - return 'color: %s' % color + color = "green" + return "color: %s" % color -def _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict): +def _train_test_label_difference( + workspace_label_percentage_dict, test_label_percentage_dict +): """ analyze the difference between training set and test set :param workspace_label_percentage_dict: :param test_label_percentage_dict: @@ -66,9 +70,11 @@ def _train_test_label_difference(workspace_label_percentage_dict, test_label_per current_difference = np.abs(test_percentage - workspace_percentage) if key in test_label_percentage_dict: - difference_dict[key] = [workspace_percentage*100, - test_percentage*100, - current_difference*100] + difference_dict[key] = [ + workspace_percentage * 100, + test_percentage * 100, + current_difference * 100, + ] js_distance = distance.jensenshannon(distribution1, distribution2, 2.0) @@ -86,8 +92,8 @@ def _train_test_vocab_difference(train_set_pd, test_set_pd): """ train_vocab = set() test_vocab = set() - train_set_tokens = train_set_pd['utterance'].apply(word_tokenize) - test_set_tokens = test_set_pd['utterance'].apply(word_tokenize) + train_set_tokens = train_set_pd["utterance"].apply(word_tokenize) + test_set_tokens = test_set_pd["utterance"].apply(word_tokenize) for tokens in train_set_tokens.tolist(): train_vocab.update(tokens) @@ -107,24 +113,26 @@ def _train_test_utterance_length_difference(train_set_pd, test_set_pd): train_test_length_comparison: pandas dataframe [Intent, Absolute Difference] """ train_pd_temp = train_set_pd.copy() - train_pd_temp['tokens'] = train_set_pd['utterance'].apply(word_tokenize) - train_pd_temp['Train'] = train_pd_temp['tokens'].apply(len) - train_avg_len_by_label = train_pd_temp[['intent', 'Train']].groupby('intent').mean() + train_pd_temp["tokens"] = train_set_pd["utterance"].apply(word_tokenize) + train_pd_temp["Train"] = train_pd_temp["tokens"].apply(len) + train_avg_len_by_label = train_pd_temp[["intent", "Train"]].groupby("intent").mean() test_pd_temp = test_set_pd.copy() - test_pd_temp['tokens'] = test_set_pd['utterance'].apply(word_tokenize) - test_pd_temp['Test'] = test_pd_temp['tokens'].apply(len) - test_avg_len_by_label = test_pd_temp[['intent', 'Test']].groupby('intent').mean() - - train_test_length_comparison = pd.merge(train_avg_len_by_label, - test_avg_len_by_label, on='intent') - train_test_length_comparison['Absolute Difference'] = \ - np.abs(train_test_length_comparison['Train'] - train_test_length_comparison['Test']) + test_pd_temp["tokens"] =
test_set_pd["utterance"].apply(word_tokenize) + test_pd_temp["Test"] = test_pd_temp["tokens"].apply(len) + test_avg_len_by_label = test_pd_temp[["intent", "Test"]].groupby("intent").mean() + + train_test_length_comparison = pd.merge( + train_avg_len_by_label, test_avg_len_by_label, on="intent" + ) + train_test_length_comparison["Absolute Difference"] = np.abs( + train_test_length_comparison["Train"] - train_test_length_comparison["Test"] + ) train_test_length_comparison = train_test_length_comparison.sort_values( - by=["Absolute Difference"], ascending=False) + by=["Absolute Difference"], ascending=False + ) train_test_length_comparison = train_test_length_comparison.reset_index() - train_test_length_comparison.rename(columns={'intent':'Intent' - }, inplace=True) + train_test_length_comparison.rename(columns={"intent": "Intent"}, inplace=True) return train_test_length_comparison @@ -137,8 +145,8 @@ def _get_metrics(results): recall_dict: maps the {intent: recall} f1_dict: maps the {intent:f1} """ - groundtruth = results['correct_intent'].values.tolist() - top_intent = results['top_intent'].values.tolist() + groundtruth = results["correct_intent"].values.tolist() + top_intent = results["top_intent"].values.tolist() gt_cnt_dict = dict() pred_cnt_dict = dict() true_positive_dict = dict() @@ -152,13 +160,22 @@ def _get_metrics(results): f1_dict = dict() for lb in true_positive_dict: - recall_dict[lb] = true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0 - - precision_dict[lb] = true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict \ - else 0 - - f1_dict[lb] = 0.0 if recall_dict[lb] == 0 and precision_dict[lb] == 0 \ - else 2.0 * recall_dict[lb] * precision_dict[lb] / (recall_dict[lb] + precision_dict[lb]) + recall_dict[lb] = ( + true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0 + ) + + precision_dict[lb] = ( + true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict else 0 + ) + + f1_dict[lb] = ( + 0.0 + if recall_dict[lb] == 0 and precision_dict[lb] == 0 + else 2.0 + * recall_dict[lb] + * precision_dict[lb] + / (recall_dict[lb] + precision_dict[lb]) + ) return precision_dict, recall_dict, f1_dict @@ -172,12 +189,14 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results): workspace_label_percentage_dict = _label_percentage(train_set_pd) test_label_percentage_dict = _label_percentage(test_set_pd) - missing_label, difference_dict, js = \ - _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict) + missing_label, difference_dict, js = _train_test_label_difference( + workspace_label_percentage_dict, test_label_percentage_dict + ) train_vocab, test_vocab = _train_test_vocab_difference(train_set_pd, test_set_pd) - train_test_length_comparison_pd = \ - _train_test_utterance_length_difference(train_set_pd, test_set_pd) + train_test_length_comparison_pd = _train_test_utterance_length_difference( + train_set_pd, test_set_pd + ) display(Markdown("## Test Data Evaluation")) @@ -186,35 +205,43 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results): label = list(difference_dict.keys()) diff = np.round(list(difference_dict.values()), 2) precision_dict, recall_dict, f1_dict = _get_metrics(results) - precision = np.round([precision_dict[l]*100.0 if l in precision_dict else 0.0 - for l in label], 2) + precision = np.round( + [precision_dict[l] * 100.0 if l in precision_dict else 0.0 for l in label], + 2, + ) - recall = np.round([recall_dict[l]*100.0 if l in recall_dict else 0.0 for l in label], 2) + 
recall = np.round( + [recall_dict[l] * 100.0 if l in recall_dict else 0.0 for l in label], 2 + ) - f1 = np.round([f1_dict[l]*100.0 if l in f1_dict else 0.0 for l in label], 2) + f1 = np.round([f1_dict[l] * 100.0 if l in f1_dict else 0.0 for l in label], 2) - train_count_dict = dict(Counter(train_set_pd['intent'])) - test_count_dict = dict(Counter(test_set_pd['intent'])) + train_count_dict = dict(Counter(train_set_pd["intent"])) + test_count_dict = dict(Counter(test_set_pd["intent"])) tr_cnt = [train_count_dict[l] if l in train_count_dict else 0.0 for l in label] te_cnt = [test_count_dict[l] if l in test_count_dict else 0.0 for l in label] - difference_pd = pd.DataFrame({"Intent": label, - "% of Train": diff[:, 0], - "% of Test": diff[:, 1], - "Absolute Difference %": diff[:, 2], - "Train Examples": tr_cnt, - "Test Examples": te_cnt, - "Test Precision %": precision, - "Test Recall %": recall, - "Test F1 %": f1}) - - if not difference_pd[difference_pd["Absolute Difference %"] > .001].empty: - table_for_display = difference_pd[difference_pd["Absolute Difference %"] - > .001].sort_values(by=["Absolute Difference %"], - ascending=False) - table_for_display = \ - table_for_display.style.applymap(_train_test_coloring, - subset=pd.IndexSlice[:, ["Absolute Difference %"]]) + difference_pd = pd.DataFrame( + { + "Intent": label, + "% of Train": diff[:, 0], + "% of Test": diff[:, 1], + "Absolute Difference %": diff[:, 2], + "Train Examples": tr_cnt, + "Test Examples": te_cnt, + "Test Precision %": precision, + "Test Recall %": recall, + "Test F1 %": f1, + } + ) + + if not difference_pd[difference_pd["Absolute Difference %"] > 0.001].empty: + table_for_display = difference_pd[ + difference_pd["Absolute Difference %"] > 0.001 + ].sort_values(by=["Absolute Difference %"], ascending=False) + table_for_display = table_for_display.style.applymap( + _train_test_coloring, subset=pd.IndexSlice[:, ["Absolute Difference %"]] + ) display(table_for_display) display(Markdown("\n")) display(Markdown("Distribution Mismatch Color Code")) @@ -223,42 +250,61 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results): display(Markdown(" Green - Good ")) if js >= 0: - js = np.round(js, 2)*100 - display(Markdown("### Data Distribution Divergence Test vs Train \ - {}%" .format(js))) + js = np.round(js, 2) * 100 + display( + Markdown( + "### Data Distribution Divergence Test vs Train \ + {}%".format( + js + ) + ) + ) display(Markdown("**Note** Metric used is Jensen Shannon Distance")) if missing_label: display(Markdown("### Missing Intents in Test Data")) - missing_label_pd = pd.DataFrame(missing_label, - columns=["Missing Intents in Test Set "]) - missing_label_pd.index = np.arange(1, len(missing_label_pd)+1) + missing_label_pd = pd.DataFrame( + missing_label, columns=["Missing Intents in Test Set "] + ) + missing_label_pd.index = np.arange(1, len(missing_label_pd) + 1) display(missing_label_pd) display(Markdown("### Test Data Example Length")) - condition1 = (train_test_length_comparison_pd["Absolute Difference"] / - train_test_length_comparison_pd["Train"] > .3) - condition2 = (train_test_length_comparison_pd["Absolute Difference"] > 3) + condition1 = ( + train_test_length_comparison_pd["Absolute Difference"] + / train_test_length_comparison_pd["Train"] + > 0.3 + ) + condition2 = train_test_length_comparison_pd["Absolute Difference"] > 3 length_comparison_pd = train_test_length_comparison_pd[condition1 & condition2] if not length_comparison_pd.empty: - display(Markdown( - "Divergence found in average length of 
user examples in test vs training data")) - length_comparison_pd.index = np.arange(1, len(length_comparison_pd)+1) + display( + Markdown( + "Divergence found in average length of user examples in test vs training data" + ) + ) + length_comparison_pd.index = np.arange(1, len(length_comparison_pd) + 1) display(length_comparison_pd.round(2)) else: display(Markdown("Average length of user examples is comparable")) if train_vocab and test_vocab: display(Markdown("### Vocabulary Size Test vs Train")) - oov_vocab_percentage = (len(test_vocab) - len(train_vocab.intersection(test_vocab))) \ - / len(test_vocab)*100 - - vocab_df = pd.DataFrame(data={ - 'Train Vocabulary Size': [len(train_vocab)], - 'Test Vocabulary Size': [len(test_vocab)], - '% Test Set Vocabulary not found in Train': [oov_vocab_percentage]}) + oov_vocab_percentage = ( + (len(test_vocab) - len(train_vocab.intersection(test_vocab))) + / len(test_vocab) + * 100 + ) + + vocab_df = pd.DataFrame( + data={ + "Train Vocabulary Size": [len(train_vocab)], + "Test Vocabulary Size": [len(test_vocab)], + "% Test Set Vocabulary not found in Train": [oov_vocab_percentage], + } + ) vocab_df.index = np.arange(1, len(vocab_df) + 1) display(vocab_df.round(2)) diff --git a/assistant_dialog_skill_analysis/data_analysis/similarity_analyzer.py b/assistant_dialog_skill_analysis/data_analysis/similarity_analyzer.py index 271c424..84b7bfb 100644 --- a/assistant_dialog_skill_analysis/data_analysis/similarity_analyzer.py +++ b/assistant_dialog_skill_analysis/data_analysis/similarity_analyzer.py @@ -5,7 +5,7 @@ from IPython.display import display, Markdown, HTML -def ambiguous_examples_analysis(workspace_pd, threshold=.7): +def ambiguous_examples_analysis(workspace_pd, threshold=0.7): """ Analyze the test workspace and find out similar utterances that belongs to different intent :param workspace_pd: pandas dataframe in format of [utterance,label] @@ -15,31 +15,49 @@ def ambiguous_examples_analysis(workspace_pd, threshold=.7): """ # first create the feature matrix vectorizer = CountVectorizer(ngram_range=(1, 2)) - workspace_bow = vectorizer.fit_transform(workspace_pd['utterance']).todense() + workspace_bow = vectorizer.fit_transform(workspace_pd["utterance"]).todense() cos_sim_score_matrix = _calculate_cosine_similarity(workspace_bow) # remove the lower triangle of the matrix and apply threshold - similar_utterance_index = np.argwhere((cos_sim_score_matrix - np.tril(cos_sim_score_matrix)) - > threshold) - similar_utterance_pd = pd.DataFrame(columns=['Intent1', 'Utterance1', 'Intent2', 'Utterance2', - 'similarity score']) + similar_utterance_index = np.argwhere( + (cos_sim_score_matrix - np.tril(cos_sim_score_matrix)) > threshold + ) + similar_utterance_pd = pd.DataFrame( + columns=["Intent1", "Utterance1", "Intent2", "Utterance2", "similarity score"] + ) for index in similar_utterance_index: - if workspace_pd['intent'].iloc[index[0]] != workspace_pd['intent'].iloc[index[1]]: - intent1 = workspace_pd['intent'].iloc[index[0]] - utterance1 = workspace_pd['utterance'].iloc[index[0]] - intent2 = workspace_pd['intent'].iloc[index[1]] - utterance2 = workspace_pd['utterance'].iloc[index[1]] + if ( + workspace_pd["intent"].iloc[index[0]] + != workspace_pd["intent"].iloc[index[1]] + ): + intent1 = workspace_pd["intent"].iloc[index[0]] + utterance1 = workspace_pd["utterance"].iloc[index[0]] + intent2 = workspace_pd["intent"].iloc[index[1]] + utterance2 = workspace_pd["utterance"].iloc[index[1]] score = cos_sim_score_matrix[index[0], index[1]] temp_pd = pd.DataFrame( - 
{'Intent1': [intent1], 'Utterance1': [utterance1], 'Intent2': [intent2], - 'Utterance2': [utterance2], 'similarity score': [score]}) - similar_utterance_pd = similar_utterance_pd.append(temp_pd, ignore_index=True) + { + "Intent1": [intent1], + "Utterance1": [utterance1], + "Intent2": [intent2], + "Utterance2": [utterance2], + "similarity score": [score], + } + ) + similar_utterance_pd = similar_utterance_pd.append( + temp_pd, ignore_index=True + ) if not similar_utterance_pd.empty: - with pd.option_context('max_colwidth', 250): - display(HTML(similar_utterance_pd.sort_values(by=['similarity score'], - ascending=False).to_html(index=False))) + with pd.option_context("max_colwidth", 250): + display( + HTML( + similar_utterance_pd.sort_values( + by=["similarity score"], ascending=False + ).to_html(index=False) + ) + ) else: display(Markdown("### There are no similar utterances within different Intent")) diff --git a/assistant_dialog_skill_analysis/data_analysis/summary_generator.py b/assistant_dialog_skill_analysis/data_analysis/summary_generator.py index 354b80a..ce2e3da 100644 --- a/assistant_dialog_skill_analysis/data_analysis/summary_generator.py +++ b/assistant_dialog_skill_analysis/data_analysis/summary_generator.py @@ -15,8 +15,8 @@ def generate_summary_statistics(data, entities_list=None): :return: """ - total_examples = len(data['utterance']) - label_frequency = Counter(data['intent']).most_common() + total_examples = len(data["utterance"]) + label_frequency = Counter(data["intent"]).most_common() number_of_labels = len(label_frequency) average_example_per_intent = np.average(list(dict(label_frequency).values())) standard_deviation_of_intent = np.std(list(dict(label_frequency).values())) @@ -25,19 +25,25 @@ def generate_summary_statistics(data, entities_list=None): characteristics.append(["Total User Examples", total_examples]) characteristics.append(["Unique Intents", number_of_labels]) characteristics.append( - ["Average User Examples per Intent", int(np.around(average_example_per_intent))]) + ["Average User Examples per Intent", int(np.around(average_example_per_intent))] + ) characteristics.append( - ["Standard Deviation from Average", int(np.around(standard_deviation_of_intent))]) + [ + "Standard Deviation from Average", + int(np.around(standard_deviation_of_intent)), + ] + ) if entities_list: characteristics.append(["Total Number of Entities", len(entities_list)]) else: characteristics.append(["Total Number of Entities", 0]) - df = pd.DataFrame(data=characteristics, columns=['Data Characteristic', 'Value']) - df.index = np.arange(1, len(df)+1) + df = pd.DataFrame(data=characteristics, columns=["Data Characteristic", "Value"]) + df.index = np.arange(1, len(df) + 1) display(Markdown("### Summary Statistics")) display(df) + def show_user_examples_per_intent(data): """ Take the workspace dictionary and display summary statistics regarding the workspace @@ -45,13 +51,14 @@ def show_user_examples_per_intent(data): :return: """ - label_frequency = Counter(data['intent']).most_common() + label_frequency = Counter(data["intent"]).most_common() frequencies = list(reversed(label_frequency)) - df = pd.DataFrame(data=frequencies, columns=['Intent', 'Number of User Examples']) + df = pd.DataFrame(data=frequencies, columns=["Intent", "Number of User Examples"]) df.index = np.arange(1, len(df) + 1) display(Markdown("### Sorted Distribution of User Examples per Intent")) display(df) + def scatter_plot_intent_dist(workspace_pd): """ takes the workspace_pd and generate a scatter distribution of 
the intents @@ -59,18 +66,22 @@ def scatter_plot_intent_dist(workspace_pd): :return: """ - label_frequency = Counter(workspace_pd['intent']).most_common() + label_frequency = Counter(workspace_pd["intent"]).most_common() frequencies = list(reversed(label_frequency)) counter_list = list(range(1, len(frequencies) + 1)) - df = pd.DataFrame(data=frequencies, columns=['Intent', 'Number of User Examples']) - df['Intent'] = counter_list - - sns.set(rc={'figure.figsize': (15, 10)}) - display(Markdown('##
Sorted Distribution of User Examples \ - per Intent
')) - - plt.ylabel('Number of User Examples', fontdict=LABEL_FONT) - plt.xlabel('Intent', fontdict=LABEL_FONT) + df = pd.DataFrame(data=frequencies, columns=["Intent", "Number of User Examples"]) + df["Intent"] = counter_list + + sns.set(rc={"figure.figsize": (15, 10)}) + display( + Markdown( + '##Sorted Distribution of User Examples \ + per Intent
' + ) + ) + + plt.ylabel("Number of User Examples", fontdict=LABEL_FONT) + plt.xlabel("Intent", fontdict=LABEL_FONT) ax = sns.scatterplot(x="Intent", y="Number of User Examples", data=df, s=100) @@ -81,24 +92,46 @@ def class_imbalance_analysis(workspace_pd): :return: """ - label_frequency = Counter(workspace_pd['intent']).most_common() + label_frequency = Counter(workspace_pd["intent"]).most_common() frequencies = list(reversed(label_frequency)) min_class, min_class_len = frequencies[0] max_class, max_class_len = frequencies[-1] - if max_class_len >= 2*min_class_len: - display(Markdown("### Class Imbalance Detected \ - ")) - display(Markdown("- Data could be potentially biased towards intents with more user \ - examples")) - display(Markdown("- E.g. Intent < {} > has < {} > user examples while intent < {} > has \ - just < {} > user examples ".format(max_class, max_class_len, min_class, min_class_len))) + if max_class_len >= 2 * min_class_len: + display( + Markdown( + "### Class Imbalance Detected \ + " + ) + ) + display( + Markdown( + "- Data could be potentially biased towards intents with more user \ + examples" + ) + ) + display( + Markdown( + "- E.g. Intent < {} > has < {} > user examples while intent < {} > has \ + just < {} > user examples ".format( + max_class, max_class_len, min_class, min_class_len + ) + ) + ) flag = True else: - display(Markdown("### No Significant Class \ - Imbalance Detected ")) - display(Markdown("- Lower chances of inherent bias in classification towards intents with \ - more user examples")) + display( + Markdown( + "### No Significant Class \ + Imbalance Detected " + ) + ) + display( + Markdown( + "- Lower chances of inherent bias in classification towards intents with \ + more user examples" + ) + ) flag = False return flag diff --git a/assistant_dialog_skill_analysis/experimentation/data_manipulator.py b/assistant_dialog_skill_analysis/experimentation/data_manipulator.py index be94150..edf22ac 100644 --- a/assistant_dialog_skill_analysis/experimentation/data_manipulator.py +++ b/assistant_dialog_skill_analysis/experimentation/data_manipulator.py @@ -2,6 +2,7 @@ import random import numpy as np + def under_sampling(workspace, workspace_pd, quantile=None): """ Under sample data @@ -10,31 +11,40 @@ def under_sampling(workspace, workspace_pd, quantile=None): :param quantile: threshold to sample from :return train_workspace_data: list of intent json """ - label_frequency_dict = dict(Counter(workspace_pd['intent']).most_common()) + label_frequency_dict = dict(Counter(workspace_pd["intent"]).most_common()) train_workspace_data = list() if not quantile: - quantile = .75 - sampling_threshold = int(np.quantile(a=list(label_frequency_dict.values()), q=[quantile])[0]) + quantile = 0.75 + sampling_threshold = int( + np.quantile(a=list(label_frequency_dict.values()), q=[quantile])[0] + ) - for i in range(len(workspace['intents'])): + for i in range(len(workspace["intents"])): - if not workspace['intents'][i]['examples']: + if not workspace["intents"][i]["examples"]: continue - if label_frequency_dict[workspace['intents'][i]['intent']] > sampling_threshold: - intent = workspace['intents'][i] - sampling_index = list(np.arange(len(workspace['intents'][i]['examples']))) + if label_frequency_dict[workspace["intents"][i]["intent"]] > sampling_threshold: + intent = workspace["intents"][i] + sampling_index = list(np.arange(len(workspace["intents"][i]["examples"]))) random.shuffle(sampling_index) - train_examples = [intent['examples'][index] for index in - 
sampling_index[:sampling_threshold]] - train_workspace_data.append({'intent': workspace['intents'][i]['intent']}) - train_workspace_data[-1].update({'description': 'string'}) - train_workspace_data[-1].update({'examples': train_examples}) + train_examples = [ + intent["examples"][index] + for index in sampling_index[:sampling_threshold] + ] + train_workspace_data.append({"intent": workspace["intents"][i]["intent"]}) + train_workspace_data[-1].update({"description": "string"}) + train_workspace_data[-1].update({"examples": train_examples}) else: - train_workspace_data.append({'intent': workspace['intents'][i]['intent']}) - train_workspace_data[-1].update({'description': 'string'}) - train_workspace_data[-1].update({'examples': [example for example in - workspace['intents'][i]['examples']]}) + train_workspace_data.append({"intent": workspace["intents"][i]["intent"]}) + train_workspace_data[-1].update({"description": "string"}) + train_workspace_data[-1].update( + { + "examples": [ + example for example in workspace["intents"][i]["examples"] + ] + } + ) return train_workspace_data diff --git a/assistant_dialog_skill_analysis/highlighting/highlighter.py b/assistant_dialog_skill_analysis/highlighting/highlighter.py index 0208016..073c62f 100644 --- a/assistant_dialog_skill_analysis/highlighting/highlighter.py +++ b/assistant_dialog_skill_analysis/highlighting/highlighter.py @@ -15,12 +15,14 @@ NGRAM_RANGE = [1] -def get_highlights_in_batch_multi_thread(conversation, - workspace_id, - full_results, - output_folder, - confidence_threshold, - show_worst_k): +def get_highlights_in_batch_multi_thread( + conversation, + workspace_id, + full_results, + output_folder, + confidence_threshold, + show_worst_k, +): """ Given the prediction result, rank prediction results from worst to best & analyze the top k worst results. 
@@ -34,14 +36,23 @@ def get_highlights_in_batch_multi_thread(conversation, :return: """ wrong_examples_sorted = _filter_results(full_results, confidence_threshold) - display(Markdown("### Identified {} problematic utterances " - .format(len(wrong_examples_sorted)))) + display( + Markdown( + "### Identified {} problematic utterances ".format( + len(wrong_examples_sorted) + ) + ) + ) display(Markdown(" ")) - wrong_examples_sorted = wrong_examples_sorted[: show_worst_k] + wrong_examples_sorted = wrong_examples_sorted[:show_worst_k] - adversarial_results, adversarial_span_dict = _adversarial_examples_multi_thread_inference( - wrong_examples_sorted, conversation, workspace_id) + ( + adversarial_results, + adversarial_span_dict, + ) = _adversarial_examples_multi_thread_inference( + wrong_examples_sorted, conversation, workspace_id + ) if not adversarial_results.empty: @@ -51,11 +62,13 @@ def get_highlights_in_batch_multi_thread(conversation, label = skills_util.OFFTOPIC_LABEL else: label = original_example[2] - label_idx = label + '\t' + str(original_example[0]) + label_idx = label + "\t" + str(original_example[0]) adversarial_result_subset = adversarial_results[ - adversarial_results['correct_intent'] == label_idx] + adversarial_results["correct_intent"] == label_idx + ] highlight = _highlight_scoring( - original_example, adversarial_result_subset, adversarial_span_dict) + original_example, adversarial_result_subset, adversarial_span_dict + ) _plot_highlight(highlight, original_example, output_folder) @@ -70,47 +83,69 @@ def _filter_results(full_results, confidence_threshold): highlighting_candidates = list() for idx in range(len(full_results)): item = full_results.iloc[idx] - results_intent_list = [predict['intent'] for predict in item['top_predicts']] - result_dict = dict(item['top_predicts']) - if item['correct_intent'] in results_intent_list: - reference_position = results_intent_list.index(item['correct_intent']) + results_intent_list = [predict["intent"] for predict in item["top_predicts"]] + result_dict = dict(item["top_predicts"]) + if item["correct_intent"] in results_intent_list: + reference_position = results_intent_list.index(item["correct_intent"]) else: reference_position = len(results_intent_list) rank_score = 0 # for off-topic examples, rank score = off-topic confidence score - confidence threshold - if item['correct_intent'] == skills_util.OFFTOPIC_LABEL: - if item['top_confidence'] > confidence_threshold: - rank_score = item['top_confidence'] - confidence_threshold + if item["correct_intent"] == skills_util.OFFTOPIC_LABEL: + if item["top_confidence"] > confidence_threshold: + rank_score = item["top_confidence"] - confidence_threshold highlighting_candidates.append( - (idx, item['utterance'], None, item['top_intent'], - item['top_confidence'], rank_score, reference_position)) + ( + idx, + item["utterance"], + None, + item["top_intent"], + item["top_confidence"], + rank_score, + reference_position, + ) + ) else: - if(item['top_intent'] != item['correct_intent']) or ( - item['top_confidence'] <= confidence_threshold): - if item['top_intent'] != item['correct_intent']: - # for incorrectly predicted examples, if the correct intent is not in top 10 - # rank score = confidence of the predicted intent - if item['correct_intent'] not in result_dict: - rank_score = item['top_confidence'] + if (item["top_intent"] != item["correct_intent"]) or ( + item["top_confidence"] <= confidence_threshold + ): + if item["top_intent"] != item["correct_intent"]: + # for incorrectly predicted examples, 
if the correct intent is not in top 10 + # rank score = confidence of the predicted intent + if item["correct_intent"] not in result_dict: + rank_score = item["top_confidence"] else: # for incorrectly predicted examples, if the correct intent is in top 10, # rank score = confidence of predicted intent - confidence of correct intent - rank_score = item['top_confidence'] - result_dict[item['correct_intent']] - elif item['top_confidence'] <= confidence_threshold: + rank_score = ( + item["top_confidence"] - result_dict[item["correct_intent"]] + ) + elif item["top_confidence"] <= confidence_threshold: # for correctly predicted examples, if the predicted confidence is less than # confidence threshold, rank score = confidence threshold - predicted confidence - rank_score = confidence_threshold - item['top_confidence'] + rank_score = confidence_threshold - item["top_confidence"] highlighting_candidates.append( - (idx, item['utterance'], item['correct_intent'], - item['top_intent'], item['top_confidence'], - rank_score, reference_position)) - - highlighting_candidates_sorted = sorted(highlighting_candidates, - key=lambda x: x[5], reverse=True) - highlighting_candidates_sorted = [candidate for candidate in highlighting_candidates_sorted - if len(nltk.word_tokenize(candidate[1])) < MAX_TOKEN_LENGTH] + ( + idx, + item["utterance"], + item["correct_intent"], + item["top_intent"], + item["top_confidence"], + rank_score, + reference_position, + ) + ) + + highlighting_candidates_sorted = sorted( + highlighting_candidates, key=lambda x: x[5], reverse=True + ) + highlighting_candidates_sorted = [ + candidate + for candidate in highlighting_candidates_sorted + if len(nltk.word_tokenize(candidate[1])) < MAX_TOKEN_LENGTH + ] return highlighting_candidates_sorted @@ -127,45 +162,65 @@ def _plot_highlight(highlight, original_example, output_folder): else: label = original_example[2] fig, ax = plt.subplots(figsize=(2, 5)) - ax = sns.heatmap([[i] for i in highlight.tolist()], - yticklabels=nltk.word_tokenize(original_example[1]), - xticklabels=['Sensitivity to intent: ' + '"' + label + '"'], - cbar_kws={"orientation": "vertical"}, - linewidths=0, square=False, - cmap="Blues") + ax = sns.heatmap( + [[i] for i in highlight.tolist()], + yticklabels=nltk.word_tokenize(original_example[1]), + xticklabels=["Sensitivity to intent: " + '"' + label + '"'], + cbar_kws={"orientation": "vertical"}, + linewidths=0, + square=False, + cmap="Blues", + ) if output_folder: - conf_str = '%.3f' % (original_example[4]) + conf_str = "%.3f" % (original_example[4]) if original_example[2]: - filename = str(original_example[0]) + '_groundtruth_' + \ - original_example[2] + '_prediction_' + \ - original_example[3] + '_confidence_' + \ - conf_str + '.png' + filename = ( + str(original_example[0]) + + "_groundtruth_" + + original_example[2] + + "_prediction_" + + original_example[3] + + "_confidence_" + + conf_str + + ".png" + ) else: - filename = str(original_example[0]) + '_groundtruth_offtopic_prediction_' + \ - original_example[3] + '_confidence_' + \ - conf_str + '.png' + filename = ( + str(original_example[0]) + + "_groundtruth_offtopic_prediction_" + + original_example[3] + + "_confidence_" + + conf_str + + ".png" + ) save_path = os.path.join(output_folder, filename) plt.savefig(os.path.join(save_path), bbox_inches="tight") table = list() - table.append(['Test Set Index', original_example[0]]) - table.append(['Utterance', original_example[1]]) + table.append(["Test Set Index", original_example[0]]) + table.append(["Utterance", 
original_example[1]]) table.append( - ['Actual Intent', original_example[2] if ( - original_example[2]) else skills_util.OFFTOPIC_LABEL]) - table.append(['Predicted Intent', original_example[3]]) - table.append(['Confidence', original_example[4]]) - with pd.option_context('max_colwidth', 250): - df = pd.DataFrame(data=table, columns=['Characteristic', 'Value']) + [ + "Actual Intent", + original_example[2] + if (original_example[2]) + else skills_util.OFFTOPIC_LABEL, + ] + ) + table.append(["Predicted Intent", original_example[3]]) + table.append(["Confidence", original_example[4]]) + with pd.option_context("max_colwidth", 250): + df = pd.DataFrame(data=table, columns=["Characteristic", "Value"]) df.index = np.arange(1, len(df) + 1) display(df) plt.show() def _adversarial_examples_multi_thread_inference( - wrong_examples_sorted, conversation, workspace_id): + wrong_examples_sorted, conversation, workspace_id +): """ Perform multi threaded inference on all the adversarial examples :param wrong_examples_sorted: @@ -180,23 +235,32 @@ def _adversarial_examples_multi_thread_inference( for original_example in wrong_examples_sorted: adversarial_examples, adversarial_span = _generate_adversarial_examples( - original_example[1], original_example[0]) + original_example[1], original_example[0] + ) if not original_example[2]: label = skills_util.OFFTOPIC_LABEL else: label = original_example[2] - adversarial_label = label + '\t' + str(original_example[0]) + adversarial_label = label + "\t" + str(original_example[0]) all_adversarial_examples.extend(adversarial_examples) - all_adversarial_label_idx.extend([adversarial_label] * len(adversarial_examples)) + all_adversarial_label_idx.extend( + [adversarial_label] * len(adversarial_examples) + ) adversarial_span_dict.update(adversarial_span) adversarial_test_data_frame = pd.DataFrame( - {'utterance': all_adversarial_examples, 'intent': all_adversarial_label_idx}) - adversarial_results = inferencer.inference(conversation, workspace_id, - adversarial_test_data_frame, max_retries=10, - max_thread=5, verbose=False) + {"utterance": all_adversarial_examples, "intent": all_adversarial_label_idx} + ) + adversarial_results = inferencer.inference( + conversation, + workspace_id, + adversarial_test_data_frame, + max_retries=10, + max_thread=5, + verbose=False, + ) display(Markdown(" ")) return adversarial_results, adversarial_span_dict @@ -215,37 +279,38 @@ def _generate_adversarial_examples(utt, original_idx): tokens = utt.split() for idx in range(len(tokens)): for ngram in NGRAM_RANGE: - new_sent = ' '.join(tokens[:idx] + tokens[idx + ngram:]) + new_sent = " ".join(tokens[:idx] + tokens[idx + ngram :]) adversarial_examples.append(new_sent) adversarial_span[new_sent + "_" + str(original_idx)] = (idx, idx + ngram) return adversarial_examples, adversarial_span def _highlight_scoring( - original_example, subset_adversarial_result, adversarial_span_dict): + original_example, subset_adversarial_result, adversarial_span_dict +): """ Calculate the highlighting score using classification results of adversarial examples :param original_example: :param subset_adversarial_result: :param adversarial_span_dict: """ - original_utterance = ' '.join(nltk.word_tokenize(original_example[1])) + original_utterance = " ".join(nltk.word_tokenize(original_example[1])) original_idx = original_example[0] original_intent = original_example[3] original_confidence = original_example[4] original_position = original_example[6] - tokens = original_utterance.split(' ') - highlight = 
np.zeros(len(tokens), dtype='float32') + tokens = original_utterance.split(" ") + highlight = np.zeros(len(tokens), dtype="float32") for idx in range(len(subset_adversarial_result)): adversarial_example = subset_adversarial_result.iloc[idx] - if not adversarial_example['top_predicts']: + if not adversarial_example["top_predicts"]: continue predict_dict = dict() predict_intent_list = list() - for prediction in adversarial_example['top_predicts']: - predict_dict[prediction['intent']] = prediction['confidence'] - predict_intent_list.append(prediction['intent']) + for prediction in adversarial_example["top_predicts"]: + predict_dict[prediction["intent"]] = prediction["confidence"] + predict_intent_list.append(prediction["intent"]) if original_intent in predict_dict: adversarial_position = list(predict_dict.keys()).index(original_intent) @@ -254,27 +319,32 @@ def _highlight_scoring( adversarial_position = len(list(predict_dict.keys())) adversarial_confidence = 0 - start, end = adversarial_span_dict[adversarial_example['utterance'] + - '_' + str(original_idx)] + start, end = adversarial_span_dict[ + adversarial_example["utterance"] + "_" + str(original_idx) + ] - highlight = _scoring_function(highlight, - original_position, - adversarial_position, - original_confidence, - adversarial_confidence, - start, - end) + highlight = _scoring_function( + highlight, + original_position, + adversarial_position, + original_confidence, + adversarial_confidence, + start, + end, + ) return highlight -def _scoring_function(highlight, - original_position, - adversarial_position, - original_confidence, - adversarial_confidence, - start_idx, - end_idx): +def _scoring_function( + highlight, + original_position, + adversarial_position, + original_confidence, + adversarial_confidence, + start_idx, + end_idx, +): """ scoring function for highlighting of the interval start_idx:end_idx :param highlight: np.array of shape (n_tokens) @@ -289,7 +359,8 @@ def _scoring_function(highlight, # position difference accounts for the change in the position of the target intent among # the top 10 intents return by the message api position_difference = (1 / float(original_position + 1.0)) - ( - 1 / float(adversarial_position + 1.0)) + 1 / float(adversarial_position + 1.0) + ) # confidence difference accounts for the change in the confidence confidence_difference = original_confidence - adversarial_confidence @@ -299,9 +370,11 @@ def _scoring_function(highlight, # highlight score for the interval of start_idx:end_idx is a weighted average of # the position difference and confidence difference - weighted_difference = weight * ( - (0.2 * confidence_difference) + - (0.8 * position_difference)) / ngram_size + weighted_difference = ( + weight + * ((0.2 * confidence_difference) + (0.8 * position_difference)) + / ngram_size + ) highlight[start_idx:end_idx] += weighted_difference diff --git a/assistant_dialog_skill_analysis/inferencing/inferencer.py b/assistant_dialog_skill_analysis/inferencing/inferencer.py index 0ef389f..8e6daf4 100644 --- a/assistant_dialog_skill_analysis/inferencing/inferencer.py +++ b/assistant_dialog_skill_analysis/inferencing/inferencer.py @@ -5,7 +5,10 @@ from ..utils import skills_util from ..inferencing.multi_thread_inference import InferenceThread -def inference(conversation, workspace_id, test_data, max_retries=10, max_thread=5, verbose=False): + +def inference( + conversation, workspace_id, test_data, max_retries=10, max_thread=5, verbose=False +): """ query the message api to generate results on the test data 
:parameter: conversation: the conversation object produced by AssistantV1 api @@ -20,13 +23,16 @@ def inference(conversation, workspace_id, test_data, max_retries=10, max_thread= if max_thread == 1: reach_max_retry = False responses = [] - for test_example, ground_truth in zip(test_data['utterance'], test_data['intent']): + for test_example, ground_truth in zip( + test_data["utterance"], test_data["intent"] + ): attempt = 1 while attempt <= max_retries: try: prediction_json = skills_util.retrieve_classifier_response( - conversation, workspace_id, test_example, True) - time.sleep(.3) + conversation, workspace_id, test_example, True + ) + time.sleep(0.3) success_flag = True except Exception: @@ -39,34 +45,41 @@ def inference(conversation, workspace_id, test_data, max_retries=10, max_thread= reach_max_retry = True if reach_max_retry: - raise Exception('Maximum attempt of {} has reached'.format(max_retries)) - - if not prediction_json['intents']: - responses.append({'top_intent' : skills_util.OFFTOPIC_LABEL, - 'top_confidence' : 0.0, - 'correct_intent' : ground_truth, - 'utterance' : test_example, - 'top_predicts' : [], - 'entities' : []}) + raise Exception("Maximum attempt of {} has reached".format(max_retries)) + + if not prediction_json["intents"]: + responses.append( + { + "top_intent": skills_util.OFFTOPIC_LABEL, + "top_confidence": 0.0, + "correct_intent": ground_truth, + "utterance": test_example, + "top_predicts": [], + "entities": [], + } + ) else: - responses.append({'top_intent' : prediction_json['intents'][0]['intent'], - 'top_confidence' : prediction_json['intents'][0]['confidence'], - 'correct_intent' : ground_truth, - 'utterance' : test_example, - 'top_predicts' : prediction_json['intents'], - 'entities' : prediction_json['entities']}) + responses.append( + { + "top_intent": prediction_json["intents"][0]["intent"], + "top_confidence": prediction_json["intents"][0]["confidence"], + "correct_intent": ground_truth, + "utterance": test_example, + "top_predicts": prediction_json["intents"], + "entities": prediction_json["entities"], + } + ) result_df = pd.DataFrame(data=responses) else: - result_df = thread_inference(conversation, - workspace_id, - test_data, - max_retries, - max_thread, - verbose) + result_df = thread_inference( + conversation, workspace_id, test_data, max_retries, max_thread, verbose + ) return result_df -def thread_inference(conversation, workspace_id, test_data, - max_retries=10, max_thread=5, verbose=False): + +def thread_inference( + conversation, workspace_id, test_data, max_retries=10, max_thread=5, verbose=False +): """ Perform multi thread inference for faster inference time :param conversation: @@ -78,7 +91,7 @@ def thread_inference(conversation, workspace_id, test_data, :return result_df: results dataframe """ if max_thread > 5: - print('only maximum of 5 threads are allowed') + print("only maximum of 5 threads are allowed") thread_list = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"] thread_list = thread_list[:max_thread] @@ -90,7 +103,7 @@ def thread_inference(conversation, workspace_id, test_data, start_time = time.time() for i in range(len(test_data)): - data_point = [test_data['utterance'].iloc[i], test_data['intent'].iloc[i]] + data_point = [test_data["utterance"].iloc[i], test_data["intent"].iloc[i]] query_queue.put(data_point) # Create new threads @@ -103,7 +116,8 @@ def thread_inference(conversation, workspace_id, test_data, workspace_id, result, max_retries, - verbose) + verbose, + ) thread.start() threads.append(thread) thread_id 
+= 1 @@ -118,6 +132,7 @@ def thread_inference(conversation, workspace_id, test_data, result_df = pd.DataFrame(data=result) return result_df + def get_intents_confidences(conversation, workspace_id, text_input): """ Retrieve a list of confidence for analysis purpose @@ -127,10 +142,12 @@ def get_intents_confidences(conversation, workspace_id, text_input): :return intent_conf: intent confidences """ response_info = skills_util.retrieve_classifier_response( - conversation, workspace_id, text_input, True)['intents'] - intent_conf = [(r['intent'], r['confidence']) for r in response_info] + conversation, workspace_id, text_input, True + )["intents"] + intent_conf = [(r["intent"], r["confidence"]) for r in response_info] return intent_conf + def calculate_mistakes(results): """ retrieve the data frame of miss-classified examples @@ -139,12 +156,13 @@ def calculate_mistakes(results): """ wrongs = list() for idx, row in results.iterrows(): - if row['correct_intent'] != row['top_intent']: + if row["correct_intent"] != row["top_intent"]: wrongs.append(row) wrongs_df = pd.DataFrame(data=wrongs) - wrongs_df.index.name = 'Test Example Index' + wrongs_df.index.name = "Test Example Index" return wrongs_df + def calculate_accuracy(results): """ calculate the accuracy on the test set @@ -152,7 +170,7 @@ def calculate_accuracy(results): :return accuracy: get accuracy on test set """ correct = 0 - for i in range(0, len(results['correct_intent'])): - correct += 1 if results['top_intent'][i] == results['correct_intent'][i] else 0 - accuracy = np.around((correct / len(results['correct_intent']))*100, 2) + for i in range(0, len(results["correct_intent"])): + correct += 1 if results["top_intent"][i] == results["correct_intent"][i] else 0 + accuracy = np.around((correct / len(results["correct_intent"])) * 100, 2) return accuracy diff --git a/assistant_dialog_skill_analysis/inferencing/multi_thread_inference.py b/assistant_dialog_skill_analysis/inferencing/multi_thread_inference.py index 457af8f..652c4e3 100644 --- a/assistant_dialog_skill_analysis/inferencing/multi_thread_inference.py +++ b/assistant_dialog_skill_analysis/inferencing/multi_thread_inference.py @@ -4,12 +4,23 @@ from ..utils import skills_util + class InferenceThread(threading.Thread): """ InferenceThread class is used for multi-thread inferencing for faster inference speed """ - def __init__(self, thread_id, name, que, conversation, - workspace_id, result, max_retries=10, verbose=False): + + def __init__( + self, + thread_id, + name, + que, + conversation, + workspace_id, + result, + max_retries=10, + verbose=False, + ): """ Initialize inferencer :param thread_id: @@ -61,39 +72,48 @@ def thread_inference(self): self.conversation, self.workspace_id, query_question, - alternate_intents=True) - time.sleep(.2) - if response['intents']: - top_predicts = response['intents'] - top_intent = response['intents'][0]['intent'] - top_confidence = response['intents'][0]['confidence'] + alternate_intents=True, + ) + time.sleep(0.2) + if response["intents"]: + top_predicts = response["intents"] + top_intent = response["intents"][0]["intent"] + top_confidence = response["intents"][0]["confidence"] else: top_predicts = [] top_intent = skills_util.OFFTOPIC_LABEL top_confidence = 0 - if response['entities']: - entities = response['entities'] + if response["entities"]: + entities = response["entities"] else: entities = [] - new_dict = {'utterance': query_question, - 'correct_intent': query_data[1], - 'top_intent': top_intent, - 'top_confidence': top_confidence, - 
'top_predicts': top_predicts, - 'entities':entities} + new_dict = { + "utterance": query_question, + "correct_intent": query_data[1], + "top_intent": top_intent, + "top_confidence": top_confidence, + "top_predicts": top_predicts, + "entities": entities, + } self.result.append(new_dict) success_flag = True except Exception: if self.verbose: - print("{} process {} fail attempt {}" - .format(self.name, query_question, i)) - time.sleep(.2) + print( + "{} process {} fail attempt {}".format( + self.name, query_question, i + ) + ) + time.sleep(0.2) if attempt >= self.max_retries: - print('Maximum attempt of {} has reached for query {}' - .format(self.max_retries, query_question)) + print( + "Maximum attempt of {} has reached for query {}".format( + self.max_retries, query_question + ) + ) _thread.interrupt_main() self.exit() else: diff --git a/assistant_dialog_skill_analysis/term_analysis/chi2_analyzer.py b/assistant_dialog_skill_analysis/term_analysis/chi2_analyzer.py index 06d0fa1..adb22f6 100644 --- a/assistant_dialog_skill_analysis/term_analysis/chi2_analyzer.py +++ b/assistant_dialog_skill_analysis/term_analysis/chi2_analyzer.py @@ -15,12 +15,13 @@ def strip_punctuations(utterance: str): :param utterance: :return: """ - normalization_pattern = '\'s' - utterance = re.sub(normalization_pattern, ' is', utterance) - puncuation_pattern = '|'.join(skills_util.PUNCTUATION) - utterance = re.sub(puncuation_pattern, ' ', utterance) + normalization_pattern = "'s" + utterance = re.sub(normalization_pattern, " is", utterance) + puncuation_pattern = "|".join(skills_util.PUNCTUATION) + utterance = re.sub(puncuation_pattern, " ", utterance) return utterance + def _preprocess_chi2(workspace_pd): """ Preprocess dataframe for chi2 analysis @@ -31,20 +32,28 @@ def _preprocess_chi2(workspace_pd): """ stopword_list = skills_util.STOP_WORDS - workspace_pd['utterance_punc_stripped'] = \ - workspace_pd['utterance'].apply(strip_punctuations) + workspace_pd["utterance_punc_stripped"] = workspace_pd["utterance"].apply( + strip_punctuations + ) count_vectorizer = CountVectorizer( min_df=1, - encoding='utf-8', + encoding="utf-8", ngram_range=(1, 2), stop_words=stopword_list, - tokenizer=word_tokenize) - features = count_vectorizer.fit_transform(workspace_pd['utterance_punc_stripped']).toarray() - labels = workspace_pd['intent'] + tokenizer=word_tokenize, + token_pattern="(?u)\b\w+\b", + ) + features = count_vectorizer.fit_transform( + workspace_pd["utterance_punc_stripped"] + ).toarray() + labels = workspace_pd["intent"] return labels, count_vectorizer, features -def _compute_chi2_top_feature(features, labels, vectorizer, cls, significance_level=.05): + +def _compute_chi2_top_feature( + features, labels, vectorizer, cls, significance_level=0.05 +): """ Perform chi2 analysis, punctuation filtering and deduplication :param features: count vectorizer features @@ -81,7 +90,8 @@ def _compute_chi2_top_feature(features, labels, vectorizer, cls, significance_le return deduplicated_unigram, deduplicated_bigram -def get_chi2_analysis(workspace_pd, significance_level=.05): + +def get_chi2_analysis(workspace_pd, significance_level=0.05): """ find correlated unigram and bigram of each intent with Chi2 analysis :param workspace_pd: dataframe, workspace data @@ -91,7 +101,7 @@ def get_chi2_analysis(workspace_pd, significance_level=.05): """ labels, vectorizer, features = _preprocess_chi2(workspace_pd) - label_frequency_dict = dict(Counter(workspace_pd['intent']).most_common()) + label_frequency_dict = 
dict(Counter(workspace_pd["intent"]).most_common()) N = 5 # keys are the set of unigrams/bigrams and value will be the intent @@ -106,22 +116,19 @@ def get_chi2_analysis(workspace_pd, significance_level=.05): for cls in label_frequency_dict.keys(): unigrams, bigrams = _compute_chi2_top_feature( - features, - labels, - vectorizer, - cls, - significance_level) + features, labels, vectorizer, cls, significance_level + ) classes.append(cls) if unigrams: - chi_unigrams.append(', '.join(unigrams[-N:])) + chi_unigrams.append(", ".join(unigrams[-N:])) else: - chi_unigrams.append('None') + chi_unigrams.append("None") if bigrams: - chi_bigrams.append(', '.join(bigrams[-N:])) + chi_bigrams.append(", ".join(bigrams[-N:])) else: - chi_bigrams.append('None') + chi_bigrams.append("None") if unigrams: if frozenset(unigrams[-N:]) in unigram_intent_dict: @@ -137,17 +144,24 @@ def get_chi2_analysis(workspace_pd, significance_level=.05): bigram_intent_dict[frozenset(bigrams[-N:])] = list() bigram_intent_dict[frozenset(bigrams[-N:])].append(cls) - chi_df = pd.DataFrame(data={'Intent': classes}) - chi_df['Correlated Unigrams'] = chi_unigrams - chi_df['Correlated Bigrams'] = chi_bigrams + chi_df = pd.DataFrame(data={"Intent": classes}) + chi_df["Correlated Unigrams"] = chi_unigrams + chi_df["Correlated Bigrams"] = chi_bigrams display(Markdown(("## Chi-squared Analysis"))) - with pd.option_context('display.max_rows', None, 'display.max_columns', - None, 'display.max_colwidth', 100): + with pd.option_context( + "display.max_rows", + None, + "display.max_columns", + None, + "display.max_colwidth", + 100, + ): chi_df.index = np.arange(1, len(chi_df) + 1) display(chi_df) return unigram_intent_dict, bigram_intent_dict + def get_confusing_key_terms(keyterm_intent_map): """ Greedy search for overlapping intents @@ -173,14 +187,23 @@ def get_confusing_key_terms(keyterm_intent_map): overlap = correlated_unigrams.intersection(other_correlated_unigrams) if overlap: for keyword in overlap: - ambiguous_intents.append("<" + current_label[0] + ", " + - keyterm_intent_map[other_correlated_unigrams][0] + ">") + ambiguous_intents.append( + "<" + + current_label[0] + + ", " + + keyterm_intent_map[other_correlated_unigrams][0] + + ">" + ) ambiguous_keywords.append(keyword) - df = pd.DataFrame(data={'Intent Pairs': ambiguous_intents, 'Terms': ambiguous_keywords}) + df = pd.DataFrame( + data={"Intent Pairs": ambiguous_intents, "Terms": ambiguous_keywords} + ) if not ambiguous_intents: - display(Markdown("There is no ambiguity based on top 5 key terms in chi2 analysis")) + display( + Markdown("There is no ambiguity based on top 5 key terms in chi2 analysis") + ) else: display_size = 10 if not df.empty: @@ -190,6 +213,7 @@ def get_confusing_key_terms(keyterm_intent_map): return df + def chi2_overlap_check(ambiguous_unigram_df, ambiguous_bigram_df, intent1, intent2): """ looks for intent overlap for specific intent or intent pairs @@ -202,10 +226,14 @@ def chi2_overlap_check(ambiguous_unigram_df, ambiguous_bigram_df, intent1, inten part1 = None part2 = None if not ambiguous_unigram_df.empty: - part1 = ambiguous_unigram_df[ambiguous_unigram_df['Intent Pairs'].str.contains(intent)] + part1 = ambiguous_unigram_df[ + ambiguous_unigram_df["Intent Pairs"].str.contains(intent) + ] if not ambiguous_bigram_df.empty: - part2 = ambiguous_bigram_df[ambiguous_bigram_df['Intent Pairs'].str.contains(intent)] + part2 = ambiguous_bigram_df[ + ambiguous_bigram_df["Intent Pairs"].str.contains(intent) + ] if part1 is not None and part2 is not None: 
display(HTML(pd.concat([part1, part2]).to_html(index=False))) diff --git a/assistant_dialog_skill_analysis/term_analysis/entity_analyzer.py b/assistant_dialog_skill_analysis/term_analysis/entity_analyzer.py index 4ddcc07..8d65e22 100644 --- a/assistant_dialog_skill_analysis/term_analysis/entity_analyzer.py +++ b/assistant_dialog_skill_analysis/term_analysis/entity_analyzer.py @@ -4,6 +4,7 @@ N = 5 + def _derive_entity_label_matrix(train_full_results, entities): """ Derive entity feature matrix for chi2 anaylsis using entity annotations from message api @@ -20,12 +21,12 @@ def _derive_entity_label_matrix(train_full_results, entities): entity_average_confidence_dict = dict() for i in range(len(train_full_results)): current_result = train_full_results.iloc[i] - if current_result['entities']: + if current_result["entities"]: # create empty feature vector - current_feature = [0]*len(entities) - for entity_reference in current_result['entities']: - e_ref = entity_reference['entity'] - e_conf = entity_reference['confidence'] + current_feature = [0] * len(entities) + for entity_reference in current_result["entities"]: + e_ref = entity_reference["entity"] + e_conf = entity_reference["confidence"] entity_idx = entities.index(e_ref) current_feature[entity_idx] += 1 @@ -33,16 +34,19 @@ def _derive_entity_label_matrix(train_full_results, entities): entity_count_dict[e_ref] = entity_count_dict.get(e_ref, 0) + 1 entity_feature_matrix.append(current_feature) - labels.append(current_result['correct_intent']) + labels.append(current_result["correct_intent"]) entity_feature_matrix = np.array(entity_feature_matrix) labels = np.array(labels) for key in entity_conf_dict: - entity_average_confidence_dict[key] = entity_conf_dict[key]/entity_count_dict[key] + entity_average_confidence_dict[key] = ( + entity_conf_dict[key] / entity_count_dict[key] + ) return entity_feature_matrix, labels, entity_average_confidence_dict -def entity_label_correlation_analysis(train_full_results, entities_list, p_value=.05): + +def entity_label_correlation_analysis(train_full_results, entities_list, p_value=0.05): """ Apply chi2 analysis on entities of the training set :param train_full_results: pandas data frame output by inference @@ -50,9 +54,11 @@ def entity_label_correlation_analysis(train_full_results, entities_list, p_value :param p_value: threshold for chi2 analysis :return entity_label_df: pandas df with col 1 being intents and col 2 entities """ - entity_feature_matrix, labels, entity_average_confidence_dict = _derive_entity_label_matrix( - train_full_results, - entities_list) + ( + entity_feature_matrix, + labels, + entity_average_confidence_dict, + ) = _derive_entity_label_matrix(train_full_results, entities_list) entities_list = np.array(entities_list) unique_labels = list(set(labels)) final_labels = list() @@ -67,8 +73,10 @@ def entity_label_correlation_analysis(train_full_results, entities_list, p_value continue final_labels.append(label) - final_entities.append(', '.join(ordered_entities[-N:])) + final_entities.append(", ".join(ordered_entities[-N:])) - entity_label_df = pd.DataFrame({'Intent': final_labels, 'Correlated Entities': final_entities}) + entity_label_df = pd.DataFrame( + {"Intent": final_labels, "Correlated Entities": final_entities} + ) return entity_label_df diff --git a/assistant_dialog_skill_analysis/term_analysis/keyword_analyzer.py b/assistant_dialog_skill_analysis/term_analysis/keyword_analyzer.py index a102f46..6692ec8 100644 --- a/assistant_dialog_skill_analysis/term_analysis/keyword_analyzer.py 
+++ b/assistant_dialog_skill_analysis/term_analysis/keyword_analyzer.py @@ -7,92 +7,115 @@ import nltk from ..utils import skills_util -def _preprocess_for_heat_map(workspace_df, label_for_display=30, - max_token_display=30, class_list=None): - ''' + +def _preprocess_for_heat_map( + workspace_df, label_for_display=30, max_token_display=30, class_list=None +): + """ Preprocess dataframe for heat map visualization :param workspace_df: :param label_for_display: :param max_token_display: :param class_list: - ''' - label_frequency_dict = dict(Counter(workspace_df['intent']).most_common()) + """ + label_frequency_dict = dict(Counter(workspace_df["intent"]).most_common()) if class_list: - workspace_subsampled = workspace_df[workspace_df['intent'].isin(class_list)] - counts = _get_counts_per_label(workspace_subsampled, unigrams_col_name="unigrams") + workspace_subsampled = workspace_df[workspace_df["intent"].isin(class_list)] + counts = _get_counts_per_label( + workspace_subsampled, unigrams_col_name="unigrams" + ) else: if len(label_frequency_dict) > label_for_display: top_30_labels = list(label_frequency_dict.keys())[:label_for_display] - workspace_subsampled = workspace_df[workspace_df['intent'].isin(top_30_labels)] - counts = _get_counts_per_label(workspace_subsampled, unigrams_col_name="unigrams") + workspace_subsampled = workspace_df[ + workspace_df["intent"].isin(top_30_labels) + ] + counts = _get_counts_per_label( + workspace_subsampled, unigrams_col_name="unigrams" + ) else: counts = _get_counts_per_label(workspace_df, unigrams_col_name="unigrams") - max_n = np.int(np.ceil(max_token_display / len(counts.index.get_level_values(0).unique()))) - top_counts = _get_top_n(counts['n_w'], top_n=max_n) + max_n = np.int( + np.ceil(max_token_display / len(counts.index.get_level_values(0).unique())) + ) + top_counts = _get_top_n(counts["n_w"], top_n=max_n) return counts, top_counts + def _get_counts_per_label(training_data, unigrams_col_name="unigrams"): - ''' + """ Create a new dataframe to store unigram counts for each label :param training_data: pandas df :param unigrams_col_name: name of unigrams column name :return counts: dataframe that contains the counts for all unigrams per label - ''' - training_data[unigrams_col_name] = training_data['utterance'].apply(nltk.word_tokenize) + """ + training_data[unigrams_col_name] = training_data["utterance"].apply( + nltk.word_tokenize + ) rows = list() stopword_list = skills_util.STOP_WORDS - for row in training_data[['intent', unigrams_col_name]].iterrows(): + for row in training_data[["intent", unigrams_col_name]].iterrows(): r = row[1] for word in r.unigrams: rows.append((r.intent, word)) - words = pd.DataFrame(rows, columns=['intent', 'word']) + words = pd.DataFrame(rows, columns=["intent", "word"]) # delete all empty words and chars words = words[words.word.str.len() > 1] # delete stopwords words = words.loc[~words["word"].isin(stopword_list)] # get counts per word - counts = words.groupby('intent')\ - .word.value_counts()\ - .to_frame()\ - .rename(columns={'word':'n_w'}) + counts = ( + words.groupby("intent") + .word.value_counts() + .to_frame() + .rename(columns={"word": "n_w"}) + ) return counts + def _get_top_n(series, top_n=5, index_level=0): - ''' + """ Get most frequent words per label :param series: product of a call to get_counts_per_label :param top_n: integer signifying the number of most frequent tokens per class :param index_level: index to group by :return df: dataframe that contains the top_n unigrams per label - ''' - return series\ - 
.groupby(level=index_level)\ - .nlargest(top_n)\ - .reset_index(level=index_level, drop=True) + """ + return ( + series.groupby(level=index_level) + .nlargest(top_n) + .reset_index(level=index_level, drop=True) + ) + -def seaborn_heatmap(workspace_df, label_for_display=30, max_token_display=30, class_list=None): - ''' +def seaborn_heatmap( + workspace_df, label_for_display=30, max_token_display=30, class_list=None +): + """ Create heat map of word frequencies per intent :param workspace_df: :param label_for_display: :param max_token_display: :param class_list: - ''' + """ counts, top_counts = _preprocess_for_heat_map( - workspace_df, - label_for_display, - max_token_display, - class_list) + workspace_df, label_for_display, max_token_display, class_list + ) reset_groupby = counts.reset_index() - most_frequent_words = top_counts.reset_index()['word'].unique() + most_frequent_words = top_counts.reset_index()["word"].unique() table_format = reset_groupby.pivot(index="word", columns="intent", values="n_w") - table_format = table_format[ - table_format.index.isin(most_frequent_words)].fillna(0).astype("int32") - display(Markdown('##Token Frequency per Intent
'))
+    table_format = (
+        table_format[table_format.index.isin(most_frequent_words)]
+        .fillna(0)
+        .astype("int32")
+    )
+    display(
+        Markdown('##Token Frequency per Intent
') + ) fig, ax = plt.subplots(figsize=(20, 20)) - sns.heatmap(table_format, annot=True, fmt='d', linewidths=.1, cmap="PuBu", ax=ax) - plt.ylabel('Token', fontdict=skills_util.LABEL_FONT) - plt.xlabel('Intent', fontdict=skills_util.LABEL_FONT) + sns.heatmap(table_format, annot=True, fmt="d", linewidths=0.1, cmap="PuBu", ax=ax) + plt.ylabel("Token", fontdict=skills_util.LABEL_FONT) + plt.xlabel("Intent", fontdict=skills_util.LABEL_FONT) diff --git a/assistant_dialog_skill_analysis/utils/skills_util.py b/assistant_dialog_skill_analysis/utils/skills_util.py index a4a2ddb..c993241 100644 --- a/assistant_dialog_skill_analysis/utils/skills_util.py +++ b/assistant_dialog_skill_analysis/utils/skills_util.py @@ -10,35 +10,68 @@ from nbconvert.preprocessors import ExecutePreprocessor import nltk import ibm_watson -from ibm_cloud_sdk_core.authenticators import \ - IAMAuthenticator, BasicAuthenticator, NoAuthAuthenticator +from ibm_cloud_sdk_core.authenticators import ( + IAMAuthenticator, + BasicAuthenticator, + NoAuthAuthenticator, +) -DEFAULT_API_VERSION = '2019-02-28' -DEFAULT_PROD_URL = 'https://gateway.watsonplatform.net/assistant/api' -DEFAULT_USERNAME = 'apikey' -STAGE_IAM_URL = 'https://iam.stage1.bluemix.net/identity/token' +DEFAULT_API_VERSION = "2019-02-28" +DEFAULT_PROD_URL = "https://gateway.watsonplatform.net/assistant/api" +DEFAULT_USERNAME = "apikey" +STAGE_IAM_URL = "https://iam.stage1.bluemix.net/identity/token" -OFFTOPIC_LABEL = 'SYSTEM_OUT_OF_DOMAIN' +OFFTOPIC_LABEL = "SYSTEM_OUT_OF_DOMAIN" -LABEL_FONT = {'family': 'normal', - 'weight': 'bold', - 'size': 17} +LABEL_FONT = {"family": "normal", "weight": "bold", "size": 17} -TITLE_FONT = {'family': 'normal', - 'weight': 'bold', - 'size': 25} +TITLE_FONT = {"family": "normal", "weight": "bold", "size": 25} -PUNCTUATION = [";", ":", ",", "\.", "\"", "\'", - "\?", "\(", "\)", "!", "?", "!", - ";", ":", "。", "、", "《", "》", - ",", "¿", "¡", "؟", "،"] +PUNCTUATION = [ + ";", + ":", + ",", + "\.", + '"', + "'", + "\?", + "\(", + "\)", + "!", + "?", + "!", + ";", + ":", + "。", + "、", + "《", + "》", + ",", + "¿", + "¡", + "؟", + "،", +] -STOP_WORDS = ['an', 'a', 'in', 'on', 'be', 'or', 'of', - 'and', 'can', 'is', 'to', 'the', 'i'] +STOP_WORDS = [ + "an", + "a", + "in", + "on", + "be", + "or", + "of", + "and", + "can", + "is", + "to", + "the", + "i", +] -def stratified_sampling(workspace, sampling_percentage=.8): - ''' +def stratified_sampling(workspace, sampling_percentage=0.8): + """ Create a stratified sample of the workspace json & return a intent json acceptable in Assistant API @@ -46,64 +79,74 @@ def stratified_sampling(workspace, sampling_percentage=.8): :param sampling_percentage: percentage of original to sample :return train_workspace_data: list of intents for train :return test_workspace_data: list of utterance,intent pairs for test - ''' + """ train_workspace_data = list() test_workspace_data = list() - for i in range(len(workspace['intents'])): - intent = workspace['intents'][i] - sampling_index = list(np.arange(len(intent['examples']))) + for i in range(len(workspace["intents"])): + intent = workspace["intents"][i] + sampling_index = list(np.arange(len(intent["examples"]))) random.shuffle(sampling_index) # training set train_test_split_cutoff = int(sampling_percentage * len(sampling_index)) train_examples = [ - intent['examples'][index] for index in sampling_index[:train_test_split_cutoff] - ] - train_workspace_data.append({'intent': workspace['intents'][i]['intent']}) + intent["examples"][index] + for index in 
sampling_index[:train_test_split_cutoff] + ] + train_workspace_data.append({"intent": workspace["intents"][i]["intent"]}) train_workspace_data[i].update({"description": "string"}) train_workspace_data[i].update({"examples": train_examples}) # test set test_examples = [ - intent['examples'][index] for index in sampling_index[train_test_split_cutoff:] + intent["examples"][index] + for index in sampling_index[train_test_split_cutoff:] + ] + test_workspace_data.extend( + [ + utterances["text"] + "\t" + workspace["intents"][i]["intent"] + for utterances in test_examples ] - test_workspace_data.extend([ - utterances['text'] + '\t' + - workspace['intents'][i]['intent'] for utterances in test_examples - ]) + ) return train_workspace_data, test_workspace_data + def create_workspace(conversation, intent_json=None): - ''' + """ Create a workspace for testing purpose :param conversation: conversation object created by Watson Assistant api :param intent_json: nested json of utternance and intent pairs :return response: the workspace id and other metadata related to the new workspace - ''' + """ response = conversation.create_workspace( - name='test_workspace', - description='', - language='en', + name="test_workspace", + description="", + language="en", intents=intent_json, entities=[], counterexamples=[], - metadata={}).get_result() + metadata={}, + ).get_result() return response + def input_credentials(): - ''' + """ Prompt user to enter apikey and workspace id - ''' + """ apikey = getpass.getpass("Please enter apikey: ") workspace_id = getpass.getpass("Please enter workspace-id: ") return apikey, workspace_id -def retrieve_workspace(iam_apikey=None, - workspace_id=None, - url=DEFAULT_PROD_URL, - api_version=DEFAULT_API_VERSION, - username=DEFAULT_USERNAME, - password=None, - export_flag=True): + +def retrieve_workspace( + iam_apikey=None, + workspace_id=None, + url=DEFAULT_PROD_URL, + api_version=DEFAULT_API_VERSION, + username=DEFAULT_USERNAME, + password=None, + export_flag=True, +): """ Retrieve workspace from Assistant instance :param iam_apikey: @@ -123,8 +166,9 @@ def retrieve_workspace(iam_apikey=None, else: authenticator = NoAuthAuthenticator() - conversation = ibm_watson.AssistantV1(authenticator=authenticator, - version=api_version) + conversation = ibm_watson.AssistantV1( + authenticator=authenticator, version=api_version + ) conversation.set_service_url(url) if export_flag: @@ -134,29 +178,30 @@ def retrieve_workspace(iam_apikey=None, def extract_workspace_data(workspace): - ''' + """ Extract relevant data and vocabulary :param workspace: :return relevant_data: :return vocabulary: - ''' - relevant_data = {'utterance': list(), 'intent': list()} + """ + relevant_data = {"utterance": list(), "intent": list()} vocabulary = set() - for i in range(len(workspace['intents'])): - current_intent = workspace['intents'][i]['intent'] - for j in range(len(workspace['intents'][i]['examples'])): - current_example = workspace['intents'][i]['examples'][j]['text'] - relevant_data['utterance'].append(current_example) - relevant_data['intent'].append(current_intent) + for i in range(len(workspace["intents"])): + current_intent = workspace["intents"][i]["intent"] + for j in range(len(workspace["intents"][i]["examples"])): + current_example = workspace["intents"][i]["examples"][j]["text"] + relevant_data["utterance"].append(current_example) + relevant_data["intent"].append(current_intent) vocabulary.update(nltk.word_tokenize(current_example)) return relevant_data, vocabulary -def 
process_test_set(test_set_filename, delim='\t'): - ''' + +def process_test_set(test_set_filename, delim="\t"): + """ Process test set given the path to the test file :param test_set_filename: link to the test file in tsv format :return test_df: test set stored in pandas dataframe - ''' + """ user_inputs = list() intents = list() with open(test_set_filename, "r", encoding="utf-8") as ts: @@ -170,9 +215,10 @@ def process_test_set(test_set_filename, delim='\t'): intents.append(OFFTOPIC_LABEL) else: continue - test_df = pd.DataFrame(data={'utterance':user_inputs, 'intent':intents}) + test_df = pd.DataFrame(data={"utterance": user_inputs, "intent": intents}) return test_df + def export_workspace(conversation, experiment_workspace_id, export_path): """ Export the workspace to target path @@ -181,23 +227,26 @@ def export_workspace(conversation, experiment_workspace_id, export_path): :param export_path: the path where the exported workspace will be saved """ response = conversation.get_workspace( - workspace_id=experiment_workspace_id, export=True).get_result() - with open(export_path, 'w+', encoding='utf-8') as outfile: + workspace_id=experiment_workspace_id, export=True + ).get_result() + with open(export_path, "w+", encoding="utf-8") as outfile: json.dump(response, outfile) + def load_stopword_list(path): - ''' + """ :param path: path to stopwords list :return stopword_list: - ''' + """ stopword_list = list() - with open(path, 'r', encoding='utf-8') as filehandle: + with open(path, "r", encoding="utf-8") as filehandle: for line in filehandle: stopword_list.append(line.strip()) return stopword_list + def run_notebook(notebook_path, iam_apikey, wksp_id, test_file, output_path): - ''' + """ Run notebook for end to end test :param notebook_path: :param uname: @@ -205,74 +254,86 @@ def run_notebook(notebook_path, iam_apikey, wksp_id, test_file, output_path): :param wksp_id: :param test_file: :param output_path: - ''' + """ notebook_name, _ = os.path.splitext(os.path.basename(notebook_path)) with open(notebook_path) as f: nb = nbformat.read(f, as_version=4) nb, old_cred_text = _replace_nb_input(nb, iam_apikey, wksp_id, test_file) - #nb = _remove_experimentation(nb) + # nb = _remove_experimentation(nb) - proc = ExecutePreprocessor(timeout=60 * 60, kernel_name='python3') + proc = ExecutePreprocessor(timeout=60 * 60, kernel_name="python3") proc.allow_errors = True - proc.preprocess(nb, {'metadata': {'path': os.getcwd()}}) + proc.preprocess(nb, {"metadata": {"path": os.getcwd()}}) errors = [] for cell in nb.cells: - if 'outputs' in cell: - for output in cell['outputs']: - if output.output_type == 'error': + if "outputs" in cell: + for output in cell["outputs"]: + if output.output_type == "error": errors.append(output) - if 'source' in cell and 'iam_apikey = ' in cell['source']: - cell['source'] = old_cred_text + if "source" in cell and "iam_apikey = " in cell["source"]: + cell["source"] = old_cred_text - with open(output_path + '.ipynb', mode='wt') as f: + with open(output_path + ".ipynb", mode="wt") as f: nbformat.write(nb, f) return nb, errors + def _replace_nb_input(nb, apikey, wksp_id, test_file): - ''' + """ Replace notebook interactive input for tests :param nb: :param uname: :param pwd: :param wksp_id: :param test_file: - ''' - apikey_patt = 'iam_apikey = ' - wksp_id_patt = 'workspace_id = ' - test_file_name_patt = 'test_set_path = ' - old_cred_text = '' + """ + apikey_patt = "iam_apikey = " + wksp_id_patt = "workspace_id = " + test_file_name_patt = "test_set_path = " + old_cred_text = "" for cell 
in nb.cells: - if 'source' in cell and apikey_patt in cell['source']: - old_cred_text = cell['source'] - text = re.sub('(.*)\niam_apikey, (.*)', - (r'\1\n#iam_apikey, \2'), - cell['source']) # comment out input_credentials - - text = re.sub('(.*)#' + apikey_patt + '\'###\'(.*)', - r'\1' + apikey_patt + '\'' + apikey + '\'' + r'\2', - text) # replace pwd - text = re.sub('(.*)#' + wksp_id_patt + '\'###\'(.*)', - r'\1' + wksp_id_patt + '\'' + wksp_id + '\'' + r'\2', - text) # replace wksp_id - cell['source'] = text - elif 'source' in cell and test_file_name_patt in cell['source']: - text = re.sub('(.*)\n' + test_file_name_patt + '\'test_set.tsv\'(.*)', - r'\1\n' + test_file_name_patt + '\'' + test_file + '\'' + r'\2', - cell['source']) # replace test file - cell['source'] = text + if "source" in cell and apikey_patt in cell["source"]: + old_cred_text = cell["source"] + text = re.sub( + "(.*)\niam_apikey, (.*)", (r"\1\n#iam_apikey, \2"), cell["source"] + ) # comment out input_credentials + + text = re.sub( + "(.*)#" + apikey_patt + "'###'(.*)", + r"\1" + apikey_patt + "'" + apikey + "'" + r"\2", + text, + ) # replace pwd + text = re.sub( + "(.*)#" + wksp_id_patt + "'###'(.*)", + r"\1" + wksp_id_patt + "'" + wksp_id + "'" + r"\2", + text, + ) # replace wksp_id + cell["source"] = text + elif "source" in cell and test_file_name_patt in cell["source"]: + text = re.sub( + "(.*)\n" + test_file_name_patt + "'test_set.tsv'(.*)", + r"\1\n" + test_file_name_patt + "'" + test_file + "'" + r"\2", + cell["source"], + ) # replace test file + cell["source"] = text return nb, old_cred_text + def _remove_experimentation(nb): - ''' + """ Remove the experimentation session from end-to-end test :param nb: - ''' - exp_patt = 'Part 3: Experimentation' + """ + exp_patt = "Part 3: Experimentation" new_nb_cells = [] for cell in nb.cells: - if cell.cell_type == 'markdown' and 'source' in cell and exp_patt in cell['source']: + if ( + cell.cell_type == "markdown" + and "source" in cell + and exp_patt in cell["source"] + ): break else: new_nb_cells.append(cell) @@ -280,7 +341,9 @@ def _remove_experimentation(nb): return nb -def retrieve_classifier_response(conversation, workspace_id, text_input, alternate_intents=False): +def retrieve_classifier_response( + conversation, workspace_id, text_input, alternate_intents=False +): """ retrieve classifier response :param conversation: instance @@ -290,11 +353,8 @@ def retrieve_classifier_response(conversation, workspace_id, text_input, alterna :return response: """ response = conversation.message( - input={ - 'message_type': 'text', - 'text': text_input - }, + input={"message_type": "text", "text": text_input}, workspace_id=workspace_id, alternate_intents=alternate_intents, ).get_result() - return response \ No newline at end of file + return response diff --git a/skill_analysis.ipynb b/skill_analysis.ipynb index 392476d..bf0b26f 100644 --- a/skill_analysis.ipynb +++ b/skill_analysis.ipynb @@ -1,50 +1,37 @@ { "cells": [ { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from IPython.display import Markdown, display, HTML\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" + "# Dialog skill analysis for Watson Assistant\n", + "\n", + "## Introduction\n", + "Dialog Skill Analysis for Watson Assistant (WA) is intended for use by chatbot designers, developers and data scientists who would like to experiment with and improve their existing dialog skill design. 
\n", + "\n", + "This notebook assumes familiarity with the Watson Assistant product as well as concepts involved in dialog skill design such as intent, entities, and utterances. \n", + "\n", + "### Environment\n", + "- Python version 3.6 or above is required. \n", + "- Install dependencies with `pip install -r requirements.txt` and refer to `requirements.txt`\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Dialog Skill Analysis for Watson Assistant\n", - "\n", - "## Introduction\n", - "Dialog Skill Analysis for Watson Assistant (WA) is intended for use by chatbot designers, developers & data scientists who would like to experiment with and improve on their existing dialog skill design. \n", - "\n", - "We assume familiarity with the Watson Assistant product as well as concepts involved in dialog skill design like intent, entities, utterances etc. \n", - "\n", - "This notebook has been organized into 3 parts based on complexity and expected input from the user.\n", - "\n", - "**Part 1**: Training Data Analysis\n", - "- Analyzes the Watson Assistant dialog skill json already created by the user\n", - "- Requires the user provide access credentials to an existing WA dialog skill like api_key, workspace_id \n", - "\n", - "**Part 2**: Model Analysis\n", - "- Evaluates the dialog skill against a test set provided by the user\n", - "- Requires the user provide a test set for model analysis\n", - "\n", - "**Part 3**: Advanced Analysis\n", - "- Analysis related to confidence threshold, term importance etc.\n", - "- Requires the user provide a test set for model analysis (same data as part 2)\n", - "\n", - "### Usage\n", - "1. Assumes familiarity using a Python Jupyter notebook\n", - "2. Assumes a Python 3.6 or greater environment\n", - "3. Install dependencies with `pip install -r requirements.txt` \n", - "4. Start jupyter server with `jupyter notebook`\n", - "5. Select `skill_analysis.ipynb` to start session\n", - "\n", - "### Alphabetic Contributor List\n", - "Watson Assistant Algorithms: Haode Qi, Ladislav Kunc, Ming Tan, Navneet Rao, Panos Karagiannis, Yang Yu" + "Install all the required packages and filter out any warnings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Markdown, display, HTML\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" ] }, { @@ -85,17 +72,36 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Part 1 : Training Data Analysis" + "## Table of contents\n", + "\n", + "1. [Part 1: Prepare the training data](#part1)