Commit

merge of text-summarize
moyongzhuo committed Mar 17, 2020
1 parent bd516ce commit 7fd835f
Showing 13 changed files with 219 additions and 36 deletions.
29 changes: 28 additions & 1 deletion README.md
@@ -14,7 +14,34 @@
pip install nlg-yongzhuo
```

# Train & Usage (invocation), see the /test/ directory for details
# API (joint invocation, combining several algorithms)
```python
from nlg_yongzhuo import *

doc = """PageRank算法简介。" \
"是上世纪90年代末提出的一种计算网页权重的算法! " \
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
"业界急需一种相对比较准确的网页重要性计算方法。 " \
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
"和投票目标的等级来决定新的等级。简单的说, " \
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 """

# fs can be any one or several of: text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
res_score = text_summarize(doc, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf])
for rs in res_score:
print(rs)

```
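The fused result is a list of (score, sentence) pairs, sorted by the summed normalized scores. Below is a minimal sketch of the optional multi-process mode; `multi_process` and `num` are parameters of `text_summarize` as defined in `text_summary_merge.py` further down, and on machines with few or slow cores the serial default may well be faster.

```python
# Sketch: fuse a subset of the algorithms, one process per function in fs.
res_score = text_summarize(doc, num=3, multi_process=True,
                           fs=[text_rank, lead3, lsi])
for rs in res_score:
    print(rs)  # (fused_score, sentence), best first
```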
# Usage (invocation), see the /test/ directory for details
```python

# feature_base
3 changes: 2 additions & 1 deletion nlg_yongzhuo/__init__.py
@@ -35,5 +35,6 @@
lsi = LSISum()
nmf = NMFSum()

# centroid_base
# merge of text_summary
from nlg_yongzhuo.text_summarization.extractive_sum.text_summary_merge import text_summarize

@@ -14,7 +14,6 @@
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
import jieba.analyse as analyse
from collections import Counter
import re


# idf values pre-trained by jieba
@@ -27,7 +27,7 @@ def __init__(self):
self.stop_words = stop_words.values()
self.num = 0

def summarize(self, text, num=8):
def summarize(self, text, num=6):
"""
Select the central sentences by word significance
:param text: str
@@ -53,7 +53,7 @@
self.word_count = dict(Counter(self.words))
self.word_count_rank = sorted(self.word_count.items(), key=lambda f: f[1], reverse=True)
# minimum number of sentences
num_min = min(num, int(len(self.word_count)*0.6))
num_min = min(num, len(self.sentences))
# rank words by frequency
self.word_rank = [wcr[0] for wcr in self.word_count_rank][0:num_min]
res_sentence = []
@@ -100,7 +100,7 @@ def summarize(self, text, num=8):
"方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"

ws = WordSignificanceSum()
res = ws.summarize(doc, num=6)
res = ws.summarize(doc, num=100)
for r in res:
print(r)
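
With the cap changed to `min(num, len(self.sentences))`, an oversized `num` now degrades gracefully to "return every sentence", which is why the test above can pass `num=100`. A tiny sketch of the bound, with hypothetical counts:

```python
# num is bounded by the sentence count, not by 60% of the distinct words.
num, n_sentences = 100, 12
num_min = min(num, n_sentences)
assert num_min == 12  # the summary is simply all 12 sentences
```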

@@ -5,14 +5,13 @@
# @function: textrank via textrank4zh, sklearn or gensim


from nlg_yongzhuo.text_summarization.extractive_sum.graph_base.textrank.textrank_gensim import TextrankGensimSum
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.summarization.summarizer import summarize
from nlg_yongzhuo.text_summarization.extractive_sum.graph_base.textrank.textrank_gensim import TextrankGensimSum
from textrank4zh import TextRank4Sentence
import networkx as nx
import jieba
import re


# textrank of textrank4zh
@@ -21,16 +20,7 @@
# textrank of gensim
trgs = TextrankGensimSum()

# textrank of sklearn
def cut_sentence(sentence):
"""
Split text into sentences
:param sentence:str
:return:list
"""
re_sen = re.compile('[.。?!?!\n\r]')
sentences = re_sen.split(sentence)
return sentences

def tdidf_sim(sentences):
"""
tf-idf similarity
@@ -51,6 +41,7 @@ def tdidf_sim(sentences):
matrix = model.fit_transform(sentences)
matrix_norm = TfidfTransformer().fit_transform(matrix)
return matrix_norm

def textrank_tfidf(sentences, topk=6):
"""
use tf-idf as the similarity measure; networkx.pagerank picks the central sentences as the summary
@@ -78,7 +69,7 @@ class TextRankSum:
def __init__(self):
self.algorithm = 'textrank'

def summarize(self, text, num=6, model_type="textrank_textrank4zh"):
def summarize(self, text, num=6, model_type="textrank_gensim"):
if model_type=="textrank_textrank4zh":
tr4s.analyze(text=text, lower=True, source='all_filters')
key_tr4s = tr4s.get_key_sentences(num=num)
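The body of `textrank_tfidf` is collapsed above. As a rough sketch of the technique its docstring names (tf-idf similarity fed to `networkx.pagerank`), and assuming `tdidf_sim` returns the L2-normalized tf-idf matrix from above, the flow might look like this; it is an illustration, not the repository's exact implementation:

```python
import networkx as nx
import numpy as np

def textrank_tfidf_sketch(sentences, topk=6):
    # Rows of the tf-idf matrix are L2-normalized, so the product with its
    # transpose gives pairwise cosine similarities between sentences.
    matrix_norm = tdidf_sim(sentences)
    sim = (matrix_norm * matrix_norm.T).toarray()
    np.fill_diagonal(sim, 0.0)  # no self-votes
    # PageRank over the sentence-similarity graph; top sentences win.
    graph = nx.from_numpy_array(sim)
    scores = nx.pagerank(graph)
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [(score, sentences[i]) for i, score in ranked[:topk]]
```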
@@ -32,7 +32,7 @@
logger = logging.getLogger(__name__)


from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
# from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
@@ -64,7 +64,7 @@ def summarize(self, text, num=6):
sentences_score[sentences[count]] = most_important_docs[tuple_cor]
count += 1
# minimum number of sentences
num_min = min(num, int(len(sentences) * 0.6))
num_min = min(num, len(sentences))
score_sen = [(rc[1], rc[0]) for rc in sorted(sentences_score.items(),
key=lambda d: d[1], reverse=True)][0:num_min]
return score_sen
@@ -12,18 +12,18 @@ class Lead3Sum:
def __init__(self):
self.algorithm = 'lead_3'

def summarize(self, doc, type='mix', num=3):
def summarize(self, text, type='mix', num=3):
"""
lead-3
:param text: str
:param type: str, you can choose 'begin', 'end' or 'mix'
:return: list
"""
sentences = cut_sentence(doc)
sentences = cut_sentence(text)
if len(sentences) < num:
return sentences
# minimum number of sentences
num_min = min(num, int(len(sentences) * 0.6))
num_min = min(num, len(sentences))
if type=='begin':
summers = sentences[0:num]
elif type=='end':
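A short usage sketch of the three selection modes; the 'end' and 'mix' branches are collapsed above, so their behavior is assumed from the usual lead-3 convention (take sentences from the head, the tail, or both):

```python
# Sketch: doc is a Chinese document string, as in the README example.
lead3_sum = Lead3Sum()
head = lead3_sum.summarize(doc, type='begin', num=3)  # first 3 sentences
tail = lead3_sum.summarize(doc, type='end', num=3)    # last 3 sentences
both = lead3_sum.summarize(doc, type='mix', num=3)    # head and tail mixed
```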
128 changes: 128 additions & 0 deletions nlg_yongzhuo/text_summarization/extractive_sum/text_summary_merge.py
@@ -0,0 +1,128 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2020/3/17 9:32
# @author : Mo
# @function: merged text summarization, run multi-process or serial


from nlg_yongzhuo import text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
from multiprocessing import Manager, Process
import multiprocessing
import platform
if platform.system()=='Windows':
multiprocessing.freeze_support()
multiprocessing.set_start_method("spawn", force=True)
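# Windows has no fork(): "spawn" starts a fresh interpreter that re-imports
# this module in every child process, and freeze_support() keeps frozen
# executables from re-launching the parent program.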


# shared dict for collecting results across worker processes
def worker(i, text, num, fs, return_dict):
"""
worker function
:param i: int
:param text: str
:param num: int
:param fs: list
:param return_dict: dict, a shared multiprocessing.Manager().dict()
:return: None
"""
return_dict[i] = fs[i].summarize(text=text, num=num)


def summary_multi_preprocess(doc, num=None, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]):
"""
run one process per function in fs
:param doc: str
:return: list
"""
manager = Manager()
return_dict = manager.dict()
jobs = []
for i in range(len(fs)):
p = Process(target=worker, args=(i, doc, num, fs, return_dict))
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
return list(return_dict.values())


def summary_serial(doc, num=None, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]):
"""
run all the algorithms serially, one after another
:param doc: str
:return: list
"""
res = []
for fs_ in fs:
res_fs = fs_.summarize(text=doc, num=num)
res.append(res_fs)
return res


def summary_post_preprocess(reses):
"""
post-process: normalize each algorithm's scores, then sum them per sentence
:param reses: list<list>
:return: list
"""
res_dict = {}
for res in reses:
r_dict = {}
sum_score = sum([r[0] for r in res])
for score, sent in res:
r_dict[sent] = score/sum_score
if sent in res_dict:
res_dict[sent] = res_dict[sent] + r_dict[sent]
else:
res_dict[sent] = r_dict[sent]
score_sen = [(rc[1], rc[0]) for rc in sorted(res_dict.items(),
key=lambda d: d[1], reverse=True)]
return score_sen


def text_summarize(doc, num=None, multi_process=False,
fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]):
"""
extractive text summarization, fusing the results of several methods
:param doc: str or list
:param num: int, number of sentences to keep
:param multi_process: bool, run the methods in parallel processes
:return: list of (score, sentence) tuples, best first
"""
if type(doc)==list:
doc = "。".join(doc)
elif not doc or (type(doc) != str):
raise RuntimeError(" type of doc must be 'list' or 'str' ")
if not num:
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
num = len(cut_sentence(doc))
# whether to use multiprocessing; note: with few or weak CPU cores, multi-process is not necessarily faster than serial
if multi_process:
res = summary_multi_preprocess(doc, num, fs)
else:
res = summary_serial(doc, num, fs)
# post-process: fuse the per-algorithm scores
res_score = summary_post_preprocess(res)
return res_score


if __name__ == '__main__':
doc = """PageRank算法简介。" \
"是上世纪90年代末提出的一种计算网页权重的算法! " \
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
"业界急需一种相对比较准确的网页重要性计算方法。 " \
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
"和投票目标的等级来决定新的等级。简单的说, " \
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 """

# fs can be any one or several of: text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
res_score = text_summarize(doc, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf])
for rs in res_score:
print(rs)
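
To make the fusion in `summary_post_preprocess` concrete, here is a hypothetical worked example with made-up scores from two algorithms over the same two sentences:

```python
# Each algorithm's scores are normalized to sum to 1, then summed per
# sentence across algorithms; sentences come back best-first.
reses = [
    [(0.6, "句子A"), (0.2, "句子B")],  # algorithm 1: A -> 0.75, B -> 0.25
    [(0.3, "句子A"), (0.3, "句子B")],  # algorithm 2: A -> 0.50, B -> 0.50
]
print(summary_post_preprocess(reses))  # [(1.25, '句子A'), (0.75, '句子B')]
```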

@@ -84,7 +84,7 @@ def summarize(self, text, num=8, topic_min=6, judge_topic=None):
res_row_i_argmax = np.argmax(res_row_i)
res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
num_min = min(num, int(len_sentences_cut * 0.6))
num_min = min(num, len(self.sentences))
return score_sen[0:num_min]


@@ -79,7 +79,7 @@ def summarize(self, text, num=8, topic_min=5, judge_topic='all'):
res_row_i_argmax = np.argmax(res_row_i)
res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
num_min = min(num, int(len_sentences_cut * 0.6))
num_min = min(num, len(self.sentences))
return score_sen[0:num_min]


@@ -102,7 +102,7 @@ def summarize(self, text, num=8, topic_min=3, judge_topic="all"):
res_row_i_argmax = np.argmax(res_row_i)
res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
num_min = min(num, int(len_sentences_cut * 0.6))
num_min = min(num, len(self.sentences))
return score_sen[0:num_min]


16 changes: 8 additions & 8 deletions requirements.txt
@@ -1,8 +1,8 @@
gensim>=3.7.1
jieba>=0.39
numpy>=1.16.2
pandas>=0.23.4
scikit-learn>=0.19.1
tqdm>=4.31.1
passlib>=1.7.1
textrank4zh>=0.3
gensim==3.7.1
jieba==0.39
numpy==1.16.2
pandas==0.23.4
scikit-learn==0.19.1
tqdm==4.31.1
passlib==1.7.1
textrank4zh==0.3
37 changes: 37 additions & 0 deletions test/tet_text_summarize.py
@@ -0,0 +1,37 @@
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2020/3/17 20:59
# @author : Mo
# @function:



from nlg_yongzhuo import *


doc = """PageRank算法简介。" \
"是上世纪90年代末提出的一种计算网页权重的算法! " \
"当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
"业界急需一种相对比较准确的网页重要性计算方法。 " \
"是人们能够从海量互联网世界中找出自己需要的信息。 " \
"百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
"Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
"Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
"和投票目标的等级来决定新的等级。简单的说, " \
"一个高等级的页面可以使其他低等级页面的等级提升。 " \
"具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
"即数量假设:一个网页被越多的其他页面链接,就越重)。 " \
"质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
"总的来说就是一句话,从全局角度考虑,获取重要的信。 """

# fs can be any one or several of: text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
res_score = text_summarize(doc, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf])
for rs in res_score:
print(rs)






