-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
bd516ce
commit 7fd835f
Showing
13 changed files
with
219 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
128 changes: 128 additions & 0 deletions
128
nlg_yongzhuo/text_summarization/extractive_sum/text_summary_merge.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# !/usr/bin/python | ||
# -*- coding: utf-8 -*- | ||
# @time : 2020/3/17 9:32 | ||
# @author : Mo | ||
# @function: merged extractive text summarization, run with multi-processing or serially | ||
|
||
|
||
from nlg_yongzhuo import text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf | ||
from multiprocessing import Manager, Process | ||
import multiprocessing | ||
import platform | ||
# On Windows only: call freeze_support() and force the "spawn" start method
# as soon as this module is imported.
if platform.system() == 'Windows':
    multiprocessing.freeze_support()
    multiprocessing.set_start_method("spawn", force=True)
|
||
|
||
# Results are handed back through a shared mapping.
def worker(i, text, num, fs, return_dict):
    """
    Run the i-th summarizer and store its result under key ``i``.

    :param i: int, index into ``fs`` and key written into ``return_dict``
    :param text: str, passed to ``summarize`` as ``text``
    :param num: passed to ``summarize`` as ``num``
    :param fs: list, objects exposing ``summarize(text=..., num=...)``
    :param return_dict: mapping that receives the result (item assignment only)
    :return: None
    """
    summarizer = fs[i]
    return_dict[i] = summarizer.summarize(text=text, num=num)
|
||
|
||
def summary_multi_preprocess(doc, num=None, fs=None):
    """
    Run every summarizer in ``fs`` on ``doc``, one child process per summarizer.

    :param doc: str, text handed to each summarizer
    :param num: forwarded unchanged to each ``summarize`` call
    :param fs: list or None, objects exposing ``summarize(text=..., num=...)``;
               ``None`` selects the eight summarizers imported by this module
    :return: list, the results that were reported back, ordered like ``fs``
    """
    if fs is None:
        # Built per call instead of using a shared mutable default argument.
        fs = [text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]
    # The context manager shuts the manager's server process down on exit.
    with Manager() as manager:
        return_dict = manager.dict()
        jobs = [Process(target=worker, args=(i, doc, num, fs, return_dict))
                for i in range(len(fs))]
        for job in jobs:
            job.start()
        for job in jobs:
            job.join()
        # Copy into a plain dict while the manager is still alive.
        finished = return_dict.copy()
    # Keys are indices into fs; sorting makes the output order independent of
    # which process happened to finish first. A worker that stored nothing
    # is simply absent, as before.
    return [finished[i] for i in sorted(finished)]
|
||
|
||
def summary_serial(doc, num=None, fs=None):
    """
    Run every summarizer in ``fs`` on ``doc`` one after another.

    :param doc: str, text handed to each summarizer
    :param num: forwarded unchanged to each ``summarize`` call
    :param fs: list or None, objects exposing ``summarize(text=..., num=...)``;
               ``None`` selects the eight summarizers imported by this module
    :return: list, one ``summarize`` result per entry of ``fs``, in order
    """
    if fs is None:
        # Built per call instead of using a shared mutable default argument.
        fs = [text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]
    return [summarizer.summarize(text=doc, num=num) for summarizer in fs]
|
||
|
||
def summary_post_preprocess(reses):
    """
    Merge several scored sentence lists into one ranking.

    Each inner list is normalized so that its scores sum to 1, then the
    normalized scores of identical sentences are added up across lists.

    :param reses: list<list>, each inner item is a ``(score, sentence)`` pair
    :return: list of ``(merged_score, sentence)`` sorted by score, highest first
    """
    merged = {}
    for res in reses:
        total = sum(score for score, _ in res)
        for score, sent in res:
            # A list whose scores sum to zero cannot be normalized; its
            # sentences contribute 0 instead of raising ZeroDivisionError.
            share = score / total if total else 0.0
            merged[sent] = merged.get(sent, 0.0) + share
    ranked = sorted(merged.items(), key=lambda item: item[1], reverse=True)
    return [(score, sent) for sent, score in ranked]
|
||
|
||
def text_summarize(doc, num=None, multi_process=False, | ||
fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]): | ||
""" | ||
抽取式文本摘要, 汇总, 使用几个方法 | ||
:param doc: | ||
:param num: | ||
:param multi_process: | ||
:return: | ||
""" | ||
if type(doc)==list: | ||
doc = "。".join(doc) | ||
elif not doc or (type(doc) != str): | ||
raise RuntimeError(" type of doc must be 'list' or 'str' ") | ||
if not num: | ||
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence | ||
num = len(cut_sentence(doc)) | ||
# 是否使用多进程, 注意: 当cpu数量不足或性能较差时, 多进程不一定比串行快 | ||
if multi_process: | ||
res = summary_multi_preprocess(doc, num, fs) | ||
else: | ||
res = summary_serial(doc, num, fs) | ||
# 后处理 | ||
res_score = summary_post_preprocess(res) | ||
return res_score | ||
|
||
|
||
if __name__ == '__main__':
    # Demo document. Adjacent string literals are concatenated by the parser;
    # the previous triple-quoted form with `" \` continuations left stray
    # quote characters inside the text.
    doc = ("PageRank算法简介。"
           "是上世纪90年代末提出的一种计算网页权重的算法! "
           "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 "
           "业界急需一种相对比较准确的网页重要性计算方法。 "
           "是人们能够从海量互联网世界中找出自己需要的信息。 "
           "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 "
           "Google把从A页面到B页面的链接解释为A页面给B页面投票。 "
           "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 "
           "和投票目标的等级来决定新的等级。简单的说, "
           "一个高等级的页面可以使其他低等级页面的等级提升。 "
           "具体说来就是,PageRank有两个基本思想,也可以说是假设。 "
           "即数量假设:一个网页被越多的其他页面链接,就越重)。 "
           "质量假设:一个网页越是被高质量的网页链接,就越重要。 "
           "总的来说就是一句话,从全局角度考虑,获取重要的信。 ")

    # fs may be any one or several of:
    # text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
    res_score = text_summarize(doc, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf])
    for rs in res_score:
        print(rs)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
gensim>=3.7.1 | ||
jieba>=0.39 | ||
numpy>=1.16.2 | ||
pandas>=0.23.4 | ||
scikit-learn>=0.19.1 | ||
tqdm>=4.31.1 | ||
passlib>=1.7.1 | ||
textrank4zh>=0.3 | ||
gensim==3.7.1 | ||
jieba==0.39 | ||
numpy==1.16.2 | ||
pandas==0.23.4 | ||
scikit-learn==0.19.1 | ||
tqdm==4.31.1 | ||
passlib==1.7.1 | ||
textrank4zh==0.3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# !/usr/bin/python | ||
# -*- coding: utf-8 -*- | ||
# @time : 2020/3/17 20:59 | ||
# @author : Mo | ||
# @function: | ||
|
||
|
||
|
||
from nlg_yongzhuo import * | ||
|
||
|
||
# Demo document. Adjacent string literals are concatenated by the parser;
# the previous triple-quoted form with `" \` continuations left stray
# quote characters inside the text.
doc = ("PageRank算法简介。"
       "是上世纪90年代末提出的一种计算网页权重的算法! "
       "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 "
       "业界急需一种相对比较准确的网页重要性计算方法。 "
       "是人们能够从海量互联网世界中找出自己需要的信息。 "
       "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 "
       "Google把从A页面到B页面的链接解释为A页面给B页面投票。 "
       "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 "
       "和投票目标的等级来决定新的等级。简单的说, "
       "一个高等级的页面可以使其他低等级页面的等级提升。 "
       "具体说来就是,PageRank有两个基本思想,也可以说是假设。 "
       "即数量假设:一个网页被越多的其他页面链接,就越重)。 "
       "质量假设:一个网页越是被高质量的网页链接,就越重要。 "
       "总的来说就是一句话,从全局角度考虑,获取重要的信。 ")

# fs may be any one or several of:
# text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf
res_score = text_summarize(doc, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf])
for rs in res_score:
    print(rs)
|
||
|
||
|
||
|
||
|
||
|
||
|