12 changes: 12 additions & 0 deletions Text Summarizer/README.md
@@ -0,0 +1,12 @@

# Text Summarizer Using the Natural Language Toolkit (NLTK)

# About the Summarizer
A beginner-level summarizer for the Slingshot OSS Challenge, built using NLTK, with PyAutoGUI for inputs and outputs.

# Demo
https://user-images.githubusercontent.com/68835688/128850788-b3cc4b16-ef56-4754-b8a4-750e8d14e54f.mp4


If the demo video above does not play, watch it at https://youtu.be/Uv0K9-1PzkM
Real-time testing is available at https://text-summarizer.aryamansri.repl.co/
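
# Quickstart
A minimal local-usage sketch (the file name `article.txt` is hypothetical; assumes the NLTK `punkt` and `stopwords` corpora have already been downloaded):

```python
from nltk_summarization import nltk_summarizer

with open('article.txt') as f:
    print(nltk_summarizer(f.read()))
```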
93 changes: 93 additions & 0 deletions Text Summarizer/main.py
@@ -0,0 +1,93 @@
from flask import Flask, render_template, request

from spacy_summarization import text_summarizer
from gensim.summarization import summarize
from nltk_summarization import nltk_summarizer
import time
import spacy

nlp = spacy.load('en_core_web_sm')
app = Flask(__name__)

# Web Scraping Pkg
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Sumy Pkg
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# Sumy
def sumy_summary(docx):
    # LexRank over the plain text; keep the top 3 sentences
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summarizer = LexRankSummarizer()
    summary = lex_summarizer(parser.document, 3)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result



# Fetch Text From Url
def get_text(url):
    page = urlopen(url)
    # Name the parser explicitly; a bare BeautifulSoup(page) emits a warning
    soup = BeautifulSoup(page, 'html.parser')
    fetched_text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return fetched_text

@app.route('/')
def index():
    return render_template('index.html')


@app.route('/analyze', methods=['GET', 'POST'])
def analyze():
    if request.method == 'POST':
        rawtext = request.form['rawtext']
        final_summary = text_summarizer(rawtext)
        return render_template('index.html', ctext=rawtext, final_summary=final_summary)
    # Fall back to the input form on GET instead of returning None
    return render_template('index.html')

@app.route('/analyze_url', methods=['GET', 'POST'])
def analyze_url():
    if request.method == 'POST':
        raw_url = request.form['raw_url']
        rawtext = get_text(raw_url)
        final_summary = text_summarizer(rawtext)
        return render_template('index.html', ctext=rawtext, final_summary=final_summary)
    return render_template('index.html')



@app.route('/compare_summary')
def compare_summary():
    return render_template('compare_summary.html')

@app.route('/comparer', methods=['GET', 'POST'])
def comparer():
    start = time.time()
    if request.method == 'POST':
        rawtext = request.form['rawtext']
        final_summary_spacy = text_summarizer(rawtext)
        # Gensim Summarizer
        final_summary_gensim = summarize(rawtext)
        # NLTK
        final_summary_nltk = nltk_summarizer(rawtext)
        # Sumy
        final_summary_sumy = sumy_summary(rawtext)

        end = time.time()
        final_time = end - start
        return render_template('compare_summary.html', ctext=rawtext,
                               final_summary_spacy=final_summary_spacy,
                               final_summary_gensim=final_summary_gensim,
                               final_summary_nltk=final_summary_nltk,
                               final_summary_sumy=final_summary_sumy,
                               final_time=final_time)
    return render_template('compare_summary.html')



@app.route('/about')
def about():
    return render_template('index.html')

if __name__ == '__main__':
    app.run(host="0.0.0.0", debug=True)
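
For a quick manual check of the routes above, a smoke-test sketch (assumes the app is running locally on Flask's default port 5000 and that the `requests` package is installed):

```python
import requests

# POST a sample document to the /analyze route defined in main.py
resp = requests.post(
    "http://localhost:5000/analyze",
    data={"rawtext": "Sentence one. Sentence two. Sentence three."},
)
print(resp.status_code)  # 200 when the form is accepted
```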
37 changes: 37 additions & 0 deletions Text Summarizer/nltk_summarization.py
@@ -0,0 +1,37 @@
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq

def nltk_summarizer(raw_text):
    stop_words = set(stopwords.words("english"))

    # Build a frequency table over non-stopword tokens
    word_frequencies = {}
    for word in word_tokenize(raw_text):
        if word not in stop_words:
            if word not in word_frequencies:
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    # Normalize each count against the most frequent word
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Score sentences under 30 words by summing their words' normalized frequencies
    sentence_list = sent_tokenize(raw_text)
    sentence_scores = {}
    for sent in sentence_list:
        for word in word_tokenize(sent.lower()):
            if word in word_frequencies:
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    # Take the 7 highest-scoring sentences as the summary
    summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary
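
Note that `stopwords.words("english")` and the tokenizers above need NLTK's data files; a one-time setup sketch (these are the standard NLTK download names):

```python
import nltk

# One-time downloads used by nltk_summarizer
nltk.download('punkt')      # tokenizer models for sentence/word tokenization
nltk.download('stopwords')  # English stopword list
```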
Binary file added Text Summarizer/nltk_summarization.pyc
Binary file not shown.
18 changes: 18 additions & 0 deletions Text Summarizer/pyproject.toml
@@ -0,0 +1,18 @@
[tool.poetry]
name = "repl_python3_SlategreyAwkwardFirewall"
version = "0.1.0"
description = ""
authors = ["Your Name <[email protected]>"]

[tool.poetry.dependencies]
python = "^3.8"
Flask = "^2.0.1"
gensim = "^3.8.3"
sumy = "^0.8.1"
bs4 = "^0.0.1"
spacy = "^3.1.1"
# nltk is imported by nltk_summarization.py; the version is an assumed-compatible range
nltk = "^3.6"

[tool.poetry.dev-dependencies]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
49 changes: 49 additions & 0 deletions Text Summarizer/spacy_summarization.py
@@ -0,0 +1,49 @@
# NLP Pkgs
import spacy
nlp = spacy.load('en_core_web_sm')
# Pkgs for Normalizing Text
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# Import Heapq for Finding the Top N Sentences
from heapq import nlargest



def text_summarizer(raw_docx):
    raw_text = raw_docx
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)

    # Build a frequency table over non-stopword tokens; keys are lowercased so
    # the sentence-scoring lookup below (word.text.lower()) matches every entry
    word_frequencies = {}
    for word in docx:
        if word.text.lower() not in stopwords:
            if word.text.lower() not in word_frequencies:
                word_frequencies[word.text.lower()] = 1
            else:
                word_frequencies[word.text.lower()] += 1

    # Normalize each count against the most frequent word
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Sentence tokens
    sentence_list = [sentence for sentence in docx.sents]

    # Score sentences under 30 words by summing their words' normalized frequencies
    sentence_scores = {}
    for sent in sentence_list:
        for word in sent:
            if word.text.lower() in word_frequencies:
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

    # Take the 7 highest-scoring sentences as the summary
    summarized_sentences = nlargest(7, sentence_scores, key=sentence_scores.get)
    final_sentences = [w.text for w in summarized_sentences]
    summary = ' '.join(final_sentences)
    return summary
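
`spacy.load('en_core_web_sm')` at the top of this module raises an error if the model is not installed; a one-time setup sketch using spaCy's bundled download helper:

```python
import spacy

# One-time download of the small English pipeline used above
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
```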

Binary file added Text Summarizer/spacy_summarization.pyc
Binary file not shown.
54 changes: 54 additions & 0 deletions Text Summarizer/spacy_summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# NLP Pkgs
import spacy
nlp = spacy.load('en_core_web_sm')
# Pkgs for Normalizing Text
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# Import Heapq for Finding the Top N Sentences
from heapq import nlargest



def text_summarizer(raw_docx):
    raw_text = raw_docx
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)

    # Build a frequency table over non-stopword tokens; keys are lowercased so
    # the sentence-scoring lookup below (word.text.lower()) matches every entry
    word_frequencies = {}
    for word in docx:
        if word.text.lower() not in stopwords:
            if word.text.lower() not in word_frequencies:
                word_frequencies[word.text.lower()] = 1
            else:
                word_frequencies[word.text.lower()] += 1

    # Normalize each count against the most frequent word
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Sentence tokens
    sentence_list = [sentence for sentence in docx.sents]

    # Score sentences under 30 words by summing their words' normalized frequencies
    sentence_scores = {}
    for sent in sentence_list:
        for word in sent:
            if word.text.lower() in word_frequencies:
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

    # Take the 7 highest-scoring sentences as the summary
    summarized_sentences = nlargest(7, sentence_scores, key=sentence_scores.get)
    final_sentences = [w.text for w in summarized_sentences]
    summary = ' '.join(final_sentences)

    # This standalone variant reports a before/after comparison with lengths
    print("Original Document\n")
    print(raw_docx)
    print("Total Length:", len(raw_docx))
    print('\n\nSummarized Document\n')
    print(summary)
    print("Total Length:", len(summary))
    return summary

50 changes: 50 additions & 0 deletions Text Summarizer/static/css/custom.css
@@ -0,0 +1,50 @@

nav ul a,
nav .brand-logo {
color: #444;
}

p {
line-height: 2rem;
}

.sidenav-trigger {
color: #26a69a;
}

.parallax-container {
min-height: 380px;
line-height: 0;
height: auto;
color: rgba(255,255,255,.9);
}
.parallax-container .section {
width: 100%;
}

@media only screen and (max-width: 992px) {
.parallax-container .section {
position: absolute;
top: 40%;
}
#index-banner .section {
top: 10%;
}
}

@media only screen and (max-width: 600px) {
#index-banner .section {
top: 0;
}
}

.icon-block {
padding: 0 15px;
}
.icon-block .material-icons {
font-size: inherit;
}

footer.page-footer {
margin: 0;
}