12 changes: 12 additions & 0 deletions Text Summarizer/README.md
@@ -0,0 +1,12 @@

# Text Summarizer Using the Natural Language Toolkit (NLTK)

# About the Summarizer
A beginner-level summarizer for the Slingshot OSS Challenge, built using NLTK, with PyAutoGUI for inputs and outputs.

# Demo
https://user-images.githubusercontent.com/68835688/128850788-b3cc4b16-ef56-4754-b8a4-750e8d14e54f.mp4


If the demo video above does not play, watch it at https://youtu.be/Uv0K9-1PzkM
Real-time testing is available at https://text-summarizer.aryamansri.repl.co/
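
# Quickstart
A minimal local-usage sketch (the file name `article.txt` is hypothetical; assumes the NLTK `punkt` and `stopwords` corpora have already been downloaded):

```python
from nltk_summarization import nltk_summarizer

with open('article.txt') as f:
    print(nltk_summarizer(f.read()))
```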
93 changes: 93 additions & 0 deletions Text Summarizer/main.py
@@ -0,0 +1,93 @@
from flask import Flask, render_template, request

from spacy_summarization import text_summarizer
from gensim.summarization import summarize
from nltk_summarization import nltk_summarizer
import time
import spacy

nlp = spacy.load('en_core_web_sm')
app = Flask(__name__)

# Web Scraping Pkg
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Sumy Pkg
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# Sumy
def sumy_summary(docx):
    # LexRank over the plain text; keep the top 3 sentences
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summarizer = LexRankSummarizer()
    summary = lex_summarizer(parser.document, 3)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result



# Fetch Text From Url
def get_text(url):
    page = urlopen(url)
    # Name the parser explicitly; a bare BeautifulSoup(page) emits a warning
    soup = BeautifulSoup(page, 'html.parser')
    fetched_text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return fetched_text

@app.route('/')
def index():
    return render_template('index.html')


@app.route('/analyze', methods=['GET', 'POST'])
def analyze():
    if request.method == 'POST':
        rawtext = request.form['rawtext']
        final_summary = text_summarizer(rawtext)
        return render_template('index.html', ctext=rawtext, final_summary=final_summary)
    # Fall back to the input form on GET instead of returning None
    return render_template('index.html')

@app.route('/analyze_url', methods=['GET', 'POST'])
def analyze_url():
    if request.method == 'POST':
        raw_url = request.form['raw_url']
        rawtext = get_text(raw_url)
        final_summary = text_summarizer(rawtext)
        return render_template('index.html', ctext=rawtext, final_summary=final_summary)
    return render_template('index.html')



@app.route('/compare_summary')
def compare_summary():
    return render_template('compare_summary.html')

@app.route('/comparer', methods=['GET', 'POST'])
def comparer():
    start = time.time()
    if request.method == 'POST':
        rawtext = request.form['rawtext']
        final_summary_spacy = text_summarizer(rawtext)
        # Gensim Summarizer
        final_summary_gensim = summarize(rawtext)
        # NLTK
        final_summary_nltk = nltk_summarizer(rawtext)
        # Sumy
        final_summary_sumy = sumy_summary(rawtext)

        end = time.time()
        final_time = end - start
        return render_template('compare_summary.html', ctext=rawtext,
                               final_summary_spacy=final_summary_spacy,
                               final_summary_gensim=final_summary_gensim,
                               final_summary_nltk=final_summary_nltk,
                               final_summary_sumy=final_summary_sumy,
                               final_time=final_time)
    return render_template('compare_summary.html')



@app.route('/about')
def about():
    return render_template('index.html')

if __name__ == '__main__':
    app.run(host="0.0.0.0", debug=True)
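
For a quick manual check of the routes above, a smoke-test sketch (assumes the app is running locally on Flask's default port 5000 and that the `requests` package is installed):

```python
import requests

# POST a sample document to the /analyze route defined in main.py
resp = requests.post(
    "http://localhost:5000/analyze",
    data={"rawtext": "Sentence one. Sentence two. Sentence three."},
)
print(resp.status_code)  # 200 when the form is accepted
```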
37 changes: 37 additions & 0 deletions Text Summarizer/nltk_summarization.py
@@ -0,0 +1,37 @@
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq

def nltk_summarizer(raw_text):
    stop_words = set(stopwords.words("english"))

    # Build a frequency table over non-stopword tokens
    word_frequencies = {}
    for word in word_tokenize(raw_text):
        if word not in stop_words:
            if word not in word_frequencies:
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    # Normalize each count against the most frequent word
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Score sentences under 30 words by summing their words' normalized frequencies
    sentence_list = sent_tokenize(raw_text)
    sentence_scores = {}
    for sent in sentence_list:
        for word in word_tokenize(sent.lower()):
            if word in word_frequencies:
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    # Take the 7 highest-scoring sentences as the summary
    summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary
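
Note that `stopwords.words("english")` and the tokenizers above need NLTK's data files; a one-time setup sketch (these are the standard NLTK download names):

```python
import nltk

# One-time downloads used by nltk_summarizer
nltk.download('punkt')      # tokenizer models for sentence/word tokenization
nltk.download('stopwords')  # English stopword list
```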
Binary file added Text Summarizer/nltk_summarization.pyc
Binary file not shown.
18 changes: 18 additions & 0 deletions Text Summarizer/pyproject.toml
@@ -0,0 +1,18 @@
[tool.poetry]
name = "repl_python3_SlategreyAwkwardFirewall"
version = "0.1.0"
description = ""
authors = ["Your Name <[email protected]>"]

[tool.poetry.dependencies]
python = "^3.8"
Flask = "^2.0.1"
gensim = "^3.8.3"
sumy = "^0.8.1"
bs4 = "^0.0.1"
spacy = "^3.1.1"
# nltk is imported by nltk_summarization.py; the version is an assumed-compatible range
nltk = "^3.6"

[tool.poetry.dev-dependencies]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
49 changes: 49 additions & 0 deletions Text Summarizer/spacy_summarization.py
@@ -0,0 +1,49 @@
# NLP Pkgs
import spacy
nlp = spacy.load('en_core_web_sm')
# Pkgs for Normalizing Text
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# Import Heapq for Finding the Top N Sentences
from heapq import nlargest



def text_summarizer(raw_docx):
    raw_text = raw_docx
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)

    # Build a frequency table over non-stopword tokens; keys are lowercased so
    # the sentence-scoring lookup below (word.text.lower()) matches every entry
    word_frequencies = {}
    for word in docx:
        if word.text.lower() not in stopwords:
            if word.text.lower() not in word_frequencies:
                word_frequencies[word.text.lower()] = 1
            else:
                word_frequencies[word.text.lower()] += 1

    # Normalize each count against the most frequent word
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Sentence tokens
    sentence_list = [sentence for sentence in docx.sents]

    # Score sentences under 30 words by summing their words' normalized frequencies
    sentence_scores = {}
    for sent in sentence_list:
        for word in sent:
            if word.text.lower() in word_frequencies:
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

    # Take the 7 highest-scoring sentences as the summary
    summarized_sentences = nlargest(7, sentence_scores, key=sentence_scores.get)
    final_sentences = [w.text for w in summarized_sentences]
    summary = ' '.join(final_sentences)
    return summary
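
`spacy.load('en_core_web_sm')` at the top of this module raises an error if the model is not installed; a one-time setup sketch using spaCy's bundled download helper:

```python
import spacy

# One-time download of the small English pipeline used above
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
```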

Binary file added Text Summarizer/spacy_summarization.pyc
Binary file not shown.
54 changes: 54 additions & 0 deletions Text Summarizer/spacy_summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# NLP Pkgs
import spacy
nlp = spacy.load('en_core_web_sm')
# Pkgs for Normalizing Text
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# Import Heapq for Finding the Top N Sentences
from heapq import nlargest



def text_summarizer(raw_docx):
    raw_text = raw_docx
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)

    # Build a frequency table over non-stopword tokens; keys are lowercased so
    # the sentence-scoring lookup below (word.text.lower()) matches every entry
    word_frequencies = {}
    for word in docx:
        if word.text.lower() not in stopwords:
            if word.text.lower() not in word_frequencies:
                word_frequencies[word.text.lower()] = 1
            else:
                word_frequencies[word.text.lower()] += 1

    # Normalize each count against the most frequent word
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Sentence tokens
    sentence_list = [sentence for sentence in docx.sents]

    # Score sentences under 30 words by summing their words' normalized frequencies
    sentence_scores = {}
    for sent in sentence_list:
        for word in sent:
            if word.text.lower() in word_frequencies:
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

    # Take the 7 highest-scoring sentences as the summary
    summarized_sentences = nlargest(7, sentence_scores, key=sentence_scores.get)
    final_sentences = [w.text for w in summarized_sentences]
    summary = ' '.join(final_sentences)

    # This standalone variant reports a before/after comparison with lengths
    print("Original Document\n")
    print(raw_docx)
    print("Total Length:", len(raw_docx))
    print('\n\nSummarized Document\n')
    print(summary)
    print("Total Length:", len(summary))
    return summary

50 changes: 50 additions & 0 deletions Text Summarizer/static/css/custom.css
@@ -0,0 +1,50 @@

nav ul a,
nav .brand-logo {
color: #444;
}

p {
line-height: 2rem;
}

.sidenav-trigger {
color: #26a69a;
}

.parallax-container {
min-height: 380px;
line-height: 0;
height: auto;
color: rgba(255,255,255,.9);
}
.parallax-container .section {
width: 100%;
}

@media only screen and (max-width: 992px) {
.parallax-container .section {
position: absolute;
top: 40%;
}
#index-banner .section {
top: 10%;
}
}

@media only screen and (max-width: 600px) {
#index-banner .section {
top: 0;
}
}

.icon-block {
padding: 0 15px;
}
.icon-block .material-icons {
font-size: inherit;
}

footer.page-footer {
margin: 0;
}