From ab7b76a7494d81678f411a601bcac423c552eb93 Mon Sep 17 00:00:00 2001 From: Han Zhang Date: Wed, 3 Feb 2021 16:01:49 +0800 Subject: [PATCH] Updates can be optionally sorted by word frequency or score --- .idea/.gitignore | 8 +++++++ .idea/SmoothNLP.iml | 8 +++++++ .idea/inspectionProfiles/Project_Default.xml | 21 +++++++++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 ++++++ .idea/misc.xml | 4 ++++ .idea/modules.xml | 8 +++++++ .idea/vcs.xml | 6 ++++++ smoothnlp/algorithm/phrase/ngram_utils.py | 3 ++- .../algorithm/phrase/phrase_extraction.py | 11 +++++++--- 9 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/SmoothNLP.iml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/SmoothNLP.iml b/.idea/SmoothNLP.iml new file mode 100644 index 0000000..f409635 --- /dev/null +++ b/.idea/SmoothNLP.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..abf150f --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,21 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..4e1828e --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..32b9dc1 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/smoothnlp/algorithm/phrase/ngram_utils.py b/smoothnlp/algorithm/phrase/ngram_utils.py index 0c33b32..0cb74a3 100644 --- a/smoothnlp/algorithm/phrase/ngram_utils.py +++ b/smoothnlp/algorithm/phrase/ngram_utils.py @@ -206,7 +206,8 @@ def get_scores(corpus, left_right_entropy[word][0], #left_entropy left_right_entropy[word][1], #right_entropy min(left_right_entropy[word][0],left_right_entropy[word][1]), #branch entropy BE=min{left_entropy,right_entropy} - word_liberalization(left_right_entropy[word][0],left_right_entropy[word][1])+mi[word][1] #our score + word_liberalization(left_right_entropy[word][0],left_right_entropy[word][1])+mi[word][1], #our score + ngram_freq[word] # word frequency ) for word in joint_phrase} diff --git a/smoothnlp/algorithm/phrase/phrase_extraction.py b/smoothnlp/algorithm/phrase/phrase_extraction.py index 6f4acf8..83fd8a2 100644 --- a/smoothnlp/algorithm/phrase/phrase_extraction.py +++ b/smoothnlp/algorithm/phrase/phrase_extraction.py @@ -39,14 +39,16 @@ def extract_phrase(corpus, chunk_size: int = 1000000, min_n:int = 2, max_n:int=4, - min_freq:int = 5): + min_freq:int = 5, + order_by: str = 'score'): ''' - 取前k个new words或前k%的new words + 按score或者freq取前k个new words或前k%的new words :param corpus: :param top_k: :param chunk_size: :param max_n: :param min_freq: + :param order_by: :return: ''' if isinstance(corpus,str): @@ -57,7 +59,10 @@ def extract_phrase(corpus, else: corpus_splits = chunk_generator_adapter(corpus, chunk_size) word_info_scores = get_scores(corpus_splits,min_n,max_n,chunk_size,min_freq) - new_words = [item[0] for item in sorted(word_info_scores.items(),key=lambda item:item[1][-1],reverse = True)] + if order_by == 'score': + new_words = [item[0] for item in sorted(word_info_scores.items(), key=lambda item: item[1][-2], reverse=True)] + elif order_by == 'freq': + new_words = [item[0] for item in sorted(word_info_scores.items(), key=lambda item: item[1][-1], reverse=True)] if top_k > 1: #输出前k个词 return new_words[:top_k] elif top_k < 1: #输出前k%的词