From ab7b76a7494d81678f411a601bcac423c552eb93 Mon Sep 17 00:00:00 2001
From: Han Zhang <han@HandeMacBook-Pro.local>
Date: Wed, 3 Feb 2021 16:01:49 +0800
Subject: [PATCH] Allow extracted phrases to be sorted by score or word frequency

---
 .idea/.gitignore                              |  8 +++++++
 .idea/SmoothNLP.iml                           |  8 +++++++
 .idea/inspectionProfiles/Project_Default.xml  | 21 +++++++++++++++++++
 .../inspectionProfiles/profiles_settings.xml  |  6 ++++++
 .idea/misc.xml                                |  4 ++++
 .idea/modules.xml                             |  8 +++++++
 .idea/vcs.xml                                 |  6 ++++++
 smoothnlp/algorithm/phrase/ngram_utils.py     |  3 ++-
 .../algorithm/phrase/phrase_extraction.py     | 11 +++++++---
 9 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/SmoothNLP.iml
 create mode 100644 .idea/inspectionProfiles/Project_Default.xml
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/vcs.xml
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/SmoothNLP.iml b/.idea/SmoothNLP.iml
new file mode 100644
index 0000000..f409635
--- /dev/null
+++ b/.idea/SmoothNLP.iml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="Python 3.7 (pytorch)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..abf150f
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,21 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="7">
+            <item index="0" class="java.lang.String" itemvalue="tensorflow" />
+            <item index="1" class="java.lang.String" itemvalue="pydot" />
+            <item index="2" class="java.lang.String" itemvalue="scikit-learn" />
+            <item index="3" class="java.lang.String" itemvalue="h5py" />
+            <item index="4" class="java.lang.String" itemvalue="keras" />
+            <item index="5" class="java.lang.String" itemvalue="bunch" />
+            <item index="6" class="java.lang.String" itemvalue="numpy" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..4e1828e
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (pytorch)" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..32b9dc1
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/SmoothNLP.iml" filepath="$PROJECT_DIR$/.idea/SmoothNLP.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/smoothnlp/algorithm/phrase/ngram_utils.py b/smoothnlp/algorithm/phrase/ngram_utils.py
index 0c33b32..0cb74a3 100644
--- a/smoothnlp/algorithm/phrase/ngram_utils.py
+++ b/smoothnlp/algorithm/phrase/ngram_utils.py
@@ -206,7 +206,8 @@ def get_scores(corpus,
                  left_right_entropy[word][0],   #left_entropy
                  left_right_entropy[word][1],   #right_entropy
                  min(left_right_entropy[word][0],left_right_entropy[word][1]),    #branch entropy  BE=min{left_entropy,right_entropy}
-                 word_liberalization(left_right_entropy[word][0],left_right_entropy[word][1])+mi[word][1]   #our score
+                 word_liberalization(left_right_entropy[word][0],left_right_entropy[word][1])+mi[word][1],   #our score
+                 ngram_freq[word]  # word frequency
                      )
               for word in joint_phrase}
 
diff --git a/smoothnlp/algorithm/phrase/phrase_extraction.py b/smoothnlp/algorithm/phrase/phrase_extraction.py
index 6f4acf8..83fd8a2 100644
--- a/smoothnlp/algorithm/phrase/phrase_extraction.py
+++ b/smoothnlp/algorithm/phrase/phrase_extraction.py
@@ -39,14 +39,16 @@ def extract_phrase(corpus,
                    chunk_size: int = 1000000,
                    min_n:int = 2,
                    max_n:int=4,
-                   min_freq:int = 5):
+                   min_freq:int = 5,
+                   order_by: str = 'score'):
     '''
-    取前k个new words或前k%的new words
+    按score或者freq取前k个new words或前k%的new words
     :param corpus:
     :param top_k:
     :param chunk_size:
     :param max_n:
     :param min_freq:
+    :param order_by:
     :return:
     '''
     if isinstance(corpus,str):
@@ -57,7 +59,10 @@ def extract_phrase(corpus,
     else:
         corpus_splits = chunk_generator_adapter(corpus, chunk_size)
     word_info_scores = get_scores(corpus_splits,min_n,max_n,chunk_size,min_freq)
-    new_words = [item[0] for item in sorted(word_info_scores.items(),key=lambda item:item[1][-1],reverse = True)]
+    if order_by == 'score':
+        new_words = [item[0] for item in sorted(word_info_scores.items(), key=lambda item: item[1][-2], reverse=True)]
+    elif order_by == 'freq':
+        new_words = [item[0] for item in sorted(word_info_scores.items(), key=lambda item: item[1][-1], reverse=True)]
     if top_k > 1:              #输出前k个词
         return new_words[:top_k]
     elif top_k < 1:            #输出前k%的词