Commit

merge of text-summarize
moyongzhuo committed Mar 17, 2020
1 parent bd516ce commit 7fd835f
Showing 13 changed files with 219 additions and 36 deletions.
29 changes: 28 additions & 1 deletion README.md
@@ -14,7 +14,34 @@
pip install nlg-yongzhuo
```

# Train & Usage (invocation), see the /test/ directory for details
# API (joint invocation, combining several algorithms)
```python
from nlg_yongzhuo import *

doc = """PageRank算法简介。" \
"是上世纪90年代末提出的一种计算网页权重的算法! " \
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
"业界急需一种相对比较准确的网页重要性计算方法。 " \
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
"和投票目标的等级来决定新的等级。简单的说, " \
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 """

# fs can be any one or several of: text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
res_score = text_summarize(doc, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf])
for rs in res_score:
print(rs)

```
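The fused result is a list of (score, sentence) pairs, sorted by the summed normalized scores. Below is a minimal sketch of the optional multi-process mode; `multi_process` and `num` are parameters of `text_summarize` as defined in `text_summary_merge.py` further down, and on machines with few or slow cores the serial default may well be faster.

```python
# Sketch: fuse a subset of the algorithms, one process per function in fs.
res_score = text_summarize(doc, num=3, multi_process=True,
                           fs=[text_rank, lead3, lsi])
for rs in res_score:
    print(rs)  # (fused_score, sentence), best first
```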
# Usage (invocation), see the /test/ directory for details
```python

# feature_base
3 changes: 2 additions & 1 deletion nlg_yongzhuo/__init__.py
@@ -35,5 +35,6 @@
lsi = LSISum()
nmf = NMFSum()

# centroid_base
# merge of text_summary
from nlg_yongzhuo.text_summarization.extractive_sum.text_summary_merge import text_summarize

@@ -14,7 +14,6 @@
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
import jieba.analyse as analyse
from collections import Counter
import re


# idf values pre-trained by jieba
@@ -27,7 +27,7 @@ def __init__(self):
self.stop_words = stop_words.values()
self.num = 0

def summarize(self, text, num=8):
def summarize(self, text, num=6):
"""
Select the central sentences by word significance
:param text: str
@@ -53,7 +53,7 @@
self.word_count = dict(Counter(self.words))
self.word_count_rank = sorted(self.word_count.items(), key=lambda f: f[1], reverse=True)
# minimum number of sentences
num_min = min(num, int(len(self.word_count)*0.6))
num_min = min(num, len(self.sentences))
# rank words by frequency
self.word_rank = [wcr[0] for wcr in self.word_count_rank][0:num_min]
res_sentence = []
@@ -100,7 +100,7 @@ def summarize(self, text, num=8):
"方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"

ws = WordSignificanceSum()
res = ws.summarize(doc, num=6)
res = ws.summarize(doc, num=100)
for r in res:
print(r)
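
With the cap changed to `min(num, len(self.sentences))`, an oversized `num` now degrades gracefully to "return every sentence", which is why the test above can pass `num=100`. A tiny sketch of the bound, with hypothetical counts:

```python
# num is bounded by the sentence count, not by 60% of the distinct words.
num, n_sentences = 100, 12
num_min = min(num, n_sentences)
assert num_min == 12  # the summary is simply all 12 sentences
```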

@@ -5,14 +5,13 @@
# @function: textrank via textrank4zh, sklearn or gensim


from nlg_yongzhuo.text_summarization.extractive_sum.graph_base.textrank.textrank_gensim import TextrankGensimSum
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.summarization.summarizer import summarize
from nlg_yongzhuo.text_summarization.extractive_sum.graph_base.textrank.textrank_gensim import TextrankGensimSum
from textrank4zh import TextRank4Sentence
import networkx as nx
import jieba
import re


# textrank of textrank4zh
@@ -21,16 +20,7 @@
# textrank of gensim
trgs = TextrankGensimSum()

# textrank of sklearn
def cut_sentence(sentence):
"""
Split text into sentences
:param sentence:str
:return:list
"""
re_sen = re.compile('[.。?!?!\n\r]')
sentences = re_sen.split(sentence)
return sentences

def tdidf_sim(sentences):
"""
tf-idf similarity
@@ -51,6 +41,7 @@ def tdidf_sim(sentences):
matrix = model.fit_transform(sentences)
matrix_norm = TfidfTransformer().fit_transform(matrix)
return matrix_norm

def textrank_tfidf(sentences, topk=6):
"""
use tf-idf as the similarity measure; networkx.pagerank picks the central sentences as the summary
@@ -78,7 +69,7 @@ class TextRankSum:
def __init__(self):
self.algorithm = 'textrank'

def summarize(self, text, num=6, model_type="textrank_textrank4zh"):
def summarize(self, text, num=6, model_type="textrank_gensim"):
if model_type=="textrank_textrank4zh":
tr4s.analyze(text=text, lower=True, source='all_filters')
key_tr4s = tr4s.get_key_sentences(num=num)
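The body of `textrank_tfidf` is collapsed above. As a rough sketch of the technique its docstring names (tf-idf similarity fed to `networkx.pagerank`), and assuming `tdidf_sim` returns the L2-normalized tf-idf matrix from above, the flow might look like this; it is an illustration, not the repository's exact implementation:

```python
import networkx as nx
import numpy as np

def textrank_tfidf_sketch(sentences, topk=6):
    # Rows of the tf-idf matrix are L2-normalized, so the product with its
    # transpose gives pairwise cosine similarities between sentences.
    matrix_norm = tdidf_sim(sentences)
    sim = (matrix_norm * matrix_norm.T).toarray()
    np.fill_diagonal(sim, 0.0)  # no self-votes
    # PageRank over the sentence-similarity graph; top sentences win.
    graph = nx.from_numpy_array(sim)
    scores = nx.pagerank(graph)
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [(score, sentences[i]) for i, score in ranked[:topk]]
```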
@@ -32,7 +32,7 @@
logger = logging.getLogger(__name__)


from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
# from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
@@ -64,7 +64,7 @@ def summarize(self, text, num=6):
sentences_score[sentences[count]] = most_important_docs[tuple_cor]
count += 1
# minimum number of sentences
num_min = min(num, int(len(sentences) * 0.6))
num_min = min(num, len(sentences))
score_sen = [(rc[1], rc[0]) for rc in sorted(sentences_score.items(),
key=lambda d: d[1], reverse=True)][0:num_min]
return score_sen
@@ -12,18 +12,18 @@ class Lead3Sum:
def __init__(self):
self.algorithm = 'lead_3'

def summarize(self, doc, type='mix', num=3):
def summarize(self, text, type='mix', num=3):
"""
lead-3
:param text: str
:param type: str, you can choose 'begin', 'end' or 'mix'
:return: list
"""
sentences = cut_sentence(doc)
sentences = cut_sentence(text)
if len(sentences) < num:
return sentences
# minimum number of sentences
num_min = min(num, int(len(sentences) * 0.6))
num_min = min(num, len(sentences))
if type=='begin':
summers = sentences[0:num]
elif type=='end':
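A short usage sketch of the three selection modes; the 'end' and 'mix' branches are collapsed above, so their behavior is assumed from the usual lead-3 convention (take sentences from the head, the tail, or both):

```python
# Sketch: doc is a Chinese document string, as in the README example.
lead3_sum = Lead3Sum()
head = lead3_sum.summarize(doc, type='begin', num=3)  # first 3 sentences
tail = lead3_sum.summarize(doc, type='end', num=3)    # last 3 sentences
both = lead3_sum.summarize(doc, type='mix', num=3)    # head and tail mixed
```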
128 changes: 128 additions & 0 deletions nlg_yongzhuo/text_summarization/extractive_sum/text_summary_merge.py
@@ -0,0 +1,128 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2020/3/17 9:32
# @author : Mo
# @function: merged text summarization, run multi-process or serial


from nlg_yongzhuo import text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
from multiprocessing import Manager, Process
import multiprocessing
import platform
if platform.system()=='Windows':
multiprocessing.freeze_support()
multiprocessing.set_start_method("spawn", force=True)
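# Windows has no fork(): "spawn" starts a fresh interpreter that re-imports
# this module in every child process, and freeze_support() keeps frozen
# executables from re-launching the parent program.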


# shared dict for collecting results across worker processes
def worker(i, text, num, fs, return_dict):
"""
worker function
:param i: int
:param text: str
:param num: int
:param fs: list
:param return_dict: dict, a shared multiprocessing.Manager().dict()
:return: None
"""
return_dict[i] = fs[i].summarize(text=text, num=num)


def summary_multi_preprocess(doc, num=None, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]):
"""
run one process per function in fs
:param doc: str
:return: list
"""
manager = Manager()
return_dict = manager.dict()
jobs = []
for i in range(len(fs)):
p = Process(target=worker, args=(i, doc, num, fs, return_dict))
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
return list(return_dict.values())


def summary_serial(doc, num=None, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]):
"""
run all the algorithms serially, one after another
:param doc: str
:return: list
"""
res = []
for fs_ in fs:
res_fs = fs_.summarize(text=doc, num=num)
res.append(res_fs)
return res


def summary_post_preprocess(reses):
"""
post-process: normalize each algorithm's scores, then sum them per sentence
:param reses: list<list>
:return: list
"""
res_dict = {}
for res in reses:
r_dict = {}
sum_score = sum([r[0] for r in res])
for score, sent in res:
r_dict[sent] = score/sum_score
if sent in res_dict:
res_dict[sent] = res_dict[sent] + r_dict[sent]
else:
res_dict[sent] = r_dict[sent]
score_sen = [(rc[1], rc[0]) for rc in sorted(res_dict.items(),
key=lambda d: d[1], reverse=True)]
return score_sen


def text_summarize(doc, num=None, multi_process=False,
fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]):
"""
extractive text summarization, fusing the results of several methods
:param doc: str or list
:param num: int, number of sentences to keep
:param multi_process: bool, run the methods in parallel processes
:return: list of (score, sentence) tuples, best first
"""
if type(doc)==list:
doc = "。".join(doc)
elif not doc or (type(doc) != str):
raise RuntimeError(" type of doc must be 'list' or 'str' ")
if not num:
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
num = len(cut_sentence(doc))
# whether to use multiprocessing; note: with few or weak CPU cores, multi-process is not necessarily faster than serial
if multi_process:
res = summary_multi_preprocess(doc, num, fs)
else:
res = summary_serial(doc, num, fs)
# post-process: fuse the per-algorithm scores
res_score = summary_post_preprocess(res)
return res_score


if __name__ == '__main__':
doc = """PageRank算法简介。" \
"是上世纪90年代末提出的一种计算网页权重的算法! " \
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
"业界急需一种相对比较准确的网页重要性计算方法。 " \
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
"和投票目标的等级来决定新的等级。简单的说, " \
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 """

# fs can be any one or several of: text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
res_score = text_summarize(doc, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf])
for rs in res_score:
print(rs)
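
To make the fusion in `summary_post_preprocess` concrete, here is a hypothetical worked example with made-up scores from two algorithms over the same two sentences:

```python
# Each algorithm's scores are normalized to sum to 1, then summed per
# sentence across algorithms; sentences come back best-first.
reses = [
    [(0.6, "句子A"), (0.2, "句子B")],  # algorithm 1: A -> 0.75, B -> 0.25
    [(0.3, "句子A"), (0.3, "句子B")],  # algorithm 2: A -> 0.50, B -> 0.50
]
print(summary_post_preprocess(reses))  # [(1.25, '句子A'), (0.75, '句子B')]
```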

@@ -84,7 +84,7 @@ def summarize(self, text, num=8, topic_min=6, judge_topic=None):
res_row_i_argmax = np.argmax(res_row_i)
res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
num_min = min(num, int(len_sentences_cut * 0.6))
num_min = min(num, len(self.sentences))
return score_sen[0:num_min]


@@ -79,7 +79,7 @@ def summarize(self, text, num=8, topic_min=5, judge_topic='all'):
res_row_i_argmax = np.argmax(res_row_i)
res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
num_min = min(num, int(len_sentences_cut * 0.6))
num_min = min(num, len(self.sentences))
return score_sen[0:num_min]


@@ -102,7 +102,7 @@ def summarize(self, text, num=8, topic_min=3, judge_topic="all"):
res_row_i_argmax = np.argmax(res_row_i)
res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
num_min = min(num, int(len_sentences_cut * 0.6))
num_min = min(num, len(self.sentences))
return score_sen[0:num_min]


16 changes: 8 additions & 8 deletions requirements.txt
@@ -1,8 +1,8 @@
gensim>=3.7.1
jieba>=0.39
numpy>=1.16.2
pandas>=0.23.4
scikit-learn>=0.19.1
tqdm>=4.31.1
passlib>=1.7.1
textrank4zh>=0.3
gensim==3.7.1
jieba==0.39
numpy==1.16.2
pandas==0.23.4
scikit-learn==0.19.1
tqdm==4.31.1
passlib==1.7.1
textrank4zh==0.3
37 changes: 37 additions & 0 deletions test/tet_text_summarize.py
@@ -0,0 +1,37 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2020/3/17 20:59
# @author : Mo
# @function:



from nlg_yongzhuo import *


doc = """PageRank算法简介。" \
"是上世纪90年代末提出的一种计算网页权重的算法! " \
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
"业界急需一种相对比较准确的网页重要性计算方法。 " \
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
"和投票目标的等级来决定新的等级。简单的说, " \
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 """

# fs can be any one or several of: text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
res_score = text_summarize(doc, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf])
for rs in res_score:
print(rs)






