2 changes: 2 additions & 0 deletions app/main/check_packs/pack_config.py
@@ -43,6 +43,8 @@
["spelling_check"],
["max_abstract_size_check"],
["theme_in_report_check"],
["compare_goal_and_content_check"],
["compare_tasks_and_content_check"],
]

DEFAULT_TYPE = 'pres'
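Note: the two new entries match the `id` attributes declared on the check classes added below (`compare_goal_and_content_check` and `compare_tasks_and_content_check`); presumably this is how the pack config resolves each entry to its criterion class.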
4 changes: 3 additions & 1 deletion app/main/checks/report_checks/__init__.py
@@ -22,5 +22,7 @@
from .sections_check import LRReportSectionCheck
from .style_check import ReportStyleCheck
from .spelling_check import SpellingCheck
from .compare_goal_and_content import CompareGoalAndContentCheck
from .compare_tasks_and_content import CompareTasksAndContentCheck
from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
from .template_name import ReportTemplateNameCheck
87 changes: 87 additions & 0 deletions app/main/checks/report_checks/compare_goal_and_content.py
@@ -0,0 +1,87 @@
from ..base_check import BaseReportCriterion, answer

import app.nlp.text_similarity as ts


class CompareGoalAndContentCheck(BaseReportCriterion):
label = "Проверка соответствия цели и содержания"
description = "Степень раскрытия цели в содержании"
id = 'compare_goal_and_content_check'

def __init__(self, file_info):
super().__init__(file_info)
self.headers = []
self.goal = ""
self.chapters = {}
self.weights = {}
self.to_pass = 0
self.to_ignore = []

def late_init(self):
self.headers = self.file.make_chapters(self.file_type['report_type'])
self.weights = {
"ВВЕДЕНИЕ": 1,
"1": 2,
"2": 2,
"3": 5,
"4": 2,
"5": 1,
"ЗАКЛЮЧЕНИЕ": 1
}
self.to_pass = 0.1
self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]

    def check(self):
        self.late_init()
        if self.file.page_counter() < 4:
            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
        result = ""
        intro_text = ""
        for header in self.headers:
            if header["text"].lower().find("введение") >= 0:
                for child in header["child"]:
                    intro_text += child["text"]
        goal_index = intro_text.find("Цель")
        if goal_index >= 0:  # "Цель" may open the introduction, so index 0 counts
            goal_start = goal_index + len("Цель") + 1
            goal_end = intro_text.find(".", goal_start)
            if goal_end < 0:
                goal_end = len(intro_text)
            self.goal = intro_text[goal_start:goal_end]
        else:
            return answer(False, "Во введении не найдена цель работы")
        for header in self.headers:
            if any(ignore_phrase in header["text"] for ignore_phrase in self.to_ignore):
                continue
            text = ""
            for child in header["child"]:
                text += child['text']
            self.chapters[header["text"]] = text
        self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
        nlp_processor = ts.NLPProcessor()
        calculate_result = nlp_processor.calculate_cosine_similarity(self.goal, self.chapters)
        max_result = max(calculate_result.values()) or 1  # avoid division by zero
        for k, v in calculate_result.items():
            for chapter, weight in self.weights.items():
                if k.startswith(chapter):
                    calculate_result[k] = v * weight
                    break
            calculate_result[k] = calculate_result[k] / max_result
        avg = round(sum(calculate_result.values()) / len(calculate_result), 3)
        if avg < self.to_pass:
            return answer(False,
                          f"Цель недостаточно раскрыта в содержании (нужно {self.to_pass:.0%}, набрано {avg:.1%})")
        result += f"<br><b>Тема раскрыта на {avg:.1%}</b><br>"
        sorted_chapters = dict(sorted(calculate_result.items(), key=lambda item: item[1], reverse=True))
        total = sum(sorted_chapters.values())
        result += "<br><b>7 разделов, наиболее раскрывающих тему:</b><br>"
        for i, key in enumerate(sorted_chapters.keys()):
            if i >= 7:
                break
            result += f"<br>\"{key}\", {self.__output(sorted_chapters[key], total)}% текста раскрывают тему<br>"
        result += "<br><b>7 разделов, наименее раскрывающих тему:</b><br>"
        for i, key in enumerate(sorted_chapters.keys()):
            if i < len(sorted_chapters) - 7:
                continue
            result += f"<br>\"{key}\", {self.__output(sorted_chapters[key], total)}% текста раскрывают тему<br>"
        return answer(True, result)

    def __output(self, value, summ):
        # share of the total weighted score, as a percentage rounded to 3 places
        return round(value / summ * 100, 3)
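A minimal standalone sketch of the scoring arithmetic in check() above, with invented chapter names and similarity values; it mirrors the weighting by chapter-title prefix and the division by the pre-weighting maximum:

raw = {"ВВЕДЕНИЕ": 0.20, "1 Обзор": 0.40, "3 Реализация": 0.30, "ЗАКЛЮЧЕНИЕ": 0.10}
weights = {"ВВЕДЕНИЕ": 1, "1": 2, "2": 2, "3": 5, "4": 2, "5": 1, "ЗАКЛЮЧЕНИЕ": 1}

max_raw = max(raw.values())  # maximum is taken BEFORE weighting
scores = {}
for chapter, sim in raw.items():
    # the first weight whose key is a prefix of the chapter title applies
    weight = next((w for key, w in weights.items() if chapter.startswith(key)), 1)
    scores[chapter] = sim * weight / max_raw

avg = round(sum(scores.values()) / len(scores), 3)
print(scores)  # {'ВВЕДЕНИЕ': 0.5, '1 Обзор': 2.0, '3 Реализация': 3.75, 'ЗАКЛЮЧЕНИЕ': 0.25}
print(avg)     # 1.625, compared against to_pass

Note that because the maximum is taken before the weights are applied, weighted scores can exceed 1, so avg is not a true percentage even though the messages render it as one.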
109 changes: 109 additions & 0 deletions app/main/checks/report_checks/compare_tasks_and_content.py
@@ -0,0 +1,109 @@
from ..base_check import BaseReportCriterion, answer

import app.nlp.text_similarity as ts


class CompareTasksAndContentCheck(BaseReportCriterion):
label = "Проверка соответствия задач и содержания"
description = "Степень раскрытия задач в содержании"
id = 'compare_tasks_and_content_check'

def __init__(self, file_info):
super().__init__(file_info)
self.headers = []
self.tasks = []
self.chapters = {}
self.weights = {}
self.all_to_pass = 0
self.specific_to_pass = 0
self.to_ignore = []
self.minimum_tasks = 0

def late_init(self):
self.headers = self.file.make_chapters(self.file_type['report_type'])
self.weights = {
"ВВЕДЕНИЕ": 1,
"1": 2,
"2": 2,
"3": 5,
"4": 2,
"5": 1,
"ЗАКЛЮЧЕНИЕ": 1
}
self.all_to_pass = 0.15
self.specific_to_pass = 0.05
self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]
self.minimum_tasks = 3

    def check(self):
        self.late_init()
        if self.file.page_counter() < 4:
            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
        result = ""
        possible_tasks = []
        for header in self.headers:
            if header["text"].lower().find("введение") >= 0:
                for i, child in enumerate(header["child"]):
                    if child["text"].lower().find("задачи") >= 0:
                        possible_tasks.append(i)
                    if child["text"].lower().find("объект") >= 0 and child["text"].lower().find("исследования") >= 0:
                        if not possible_tasks:
                            return answer(False, "Во введении не найдены задачи работы")
                        tasks = header["child"][max(possible_tasks) + 1:i]
                        while len(tasks) < self.minimum_tasks:
                            try:
                                # too few lines between "задачи" and "объект исследования":
                                # retry from the previous line mentioning "задачи"
                                possible_tasks.remove(max(possible_tasks))
                                tasks = header["child"][max(possible_tasks) + 1:i]
                            except ValueError:
                                return answer(False, f"Во введении найдено меньше {self.minimum_tasks} задач")
                        self.tasks = [task["text"] for task in tasks]
                        break
            if any(ignore_phrase in header["text"] for ignore_phrase in self.to_ignore):
                continue
            text = ""
            for child in header["child"]:
                text += child['text']
            self.chapters[header["text"]] = text
        if not self.tasks:
            return answer(False, "Во введении не найдены задачи работы")
        self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
        nlp_processor = ts.NLPProcessor()
        all_tasks_result = nlp_processor.calculate_cosine_similarity(" ".join(self.tasks), self.chapters)
        max_result = max(all_tasks_result.values()) or 1  # avoid division by zero
        for k, v in all_tasks_result.items():
            for chapter, weight in self.weights.items():
                if k.startswith(chapter):
                    all_tasks_result[k] = v * weight
                    break
            all_tasks_result[k] = round(all_tasks_result[k] / max_result, 3)
        avg = round(sum(all_tasks_result.values()) / len(all_tasks_result), 3)
        if avg < self.all_to_pass:
            return answer(False, f"Задачи недостаточно раскрыты в содержании (нужно {self.all_to_pass:.0%}, набрано {avg:.1%})")
        result += f"<br><b>Задачи раскрыты на {avg:.1%}</b><br>"
        for task in self.tasks:
            cur_task = nlp_processor.calculate_cosine_similarity(task, self.chapters)
            max_result = max(cur_task.values()) or 1
            for k, v in cur_task.items():
                for chapter, weight in self.weights.items():
                    if k.startswith(chapter):
                        cur_task[k] = v * weight
                        break
                cur_task[k] = cur_task[k] / max_result
            sorted_chapters = dict(sorted(cur_task.items(), key=lambda item: item[1], reverse=True))
            specific_avg = round(sum(sorted_chapters.values()) / len(sorted_chapters), 3)
            if specific_avg < self.specific_to_pass:
                return answer(False, f"<br>Задача \"{task}\" недостаточно раскрыта<br>")
            result += f"<br><b>Задача \"{task}\" раскрыта на {specific_avg:.1%}</b><br><br>Задачу \"{task}\" наиболее раскрывают разделы: <br>"
            total = sum(sorted_chapters.values())
            for i, key in enumerate(sorted_chapters.keys()):
                if i >= 3:
                    break
                result += f"<br>\"{key}\", {round(self.__output(sorted_chapters[key], total), 3)}% текста раскрывают задачу<br>"
        all_tasks_result = dict(sorted(all_tasks_result.items(), key=lambda item: item[1], reverse=True))
        result += "<br><b>Разделы, наименее раскрывающие задачи:</b><br>"
        for i, key in enumerate(all_tasks_result.keys()):
            if i < len(all_tasks_result) - 5:
                continue
            result += f"<br>{key}: {round(all_tasks_result[key] * 100, 3)}%<br>"
        return answer(True, result)

    def __output(self, value, summ):
        return (value / summ) * 100
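A toy illustration of the task-extraction heuristic in check() above (all texts invented): the tasks are taken to be the introduction children between the last line mentioning «задачи» and the line mentioning «объект исследования»:

children = [
    {"text": "Цель работы - разработать систему."},
    {"text": "Для достижения цели поставлены задачи:"},
    {"text": "1. Изучить предметную область."},
    {"text": "2. Разработать прототип."},
    {"text": "3. Провести эксперименты."},
    {"text": "Объект исследования - отчеты студентов."},
]

possible_tasks = [i for i, c in enumerate(children) if "задачи" in c["text"].lower()]
end = next(i for i, c in enumerate(children)
           if "объект" in c["text"].lower() and "исследования" in c["text"].lower())
tasks = [c["text"] for c in children[max(possible_tasks) + 1:end]]
print(tasks)  # the three numbered task lines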
97 changes: 97 additions & 0 deletions app/nlp/text_similarity.py
@@ -0,0 +1,97 @@
from collections import defaultdict

import nltk
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.util import ngrams
import string


class NLPProcessor:
    def __init__(self, language='russian'):
        nltk.download('punkt')
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words(language))
        self.stemmer = SnowballStemmer(language)

    def preprocessing(self, text):
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word.lower() not in self.stop_words]
        return [self.stemmer.stem(token) for token in tokens]

    def get_ngrams(self, tokens, n=2):
        result = []
        for i in range(n):
            n_grams = ngrams(tokens, i + 1)
            result.extend([' '.join(grams) for grams in n_grams])
        return result
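
    # Example (invented tokens): get_ngrams(['быстрый', 'алгоритм'], n=2)
    # returns ['быстрый', 'алгоритм', 'быстрый алгоритм'] - every 1-gram,
    # then every 2-gram.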

    def get_bag_of_n_gramms(self, corpus):
        new_corpus = []
        for item in corpus:
            for n_gramm in item:
                new_corpus.append(n_gramm)
        index_word = {}
        i = 0
        for word in new_corpus:
            if word in index_word:
                continue
            index_word[word] = i
            i += 1
        return index_word

    def get_vector_by_BOW(self, bag_of_ngramms, doc, docs):
        def tf(word, doc):
            return doc.count(word) / len(doc)

        def idf(word, docs):
            word_in_docs = 0
            for item in docs:
                if word in item:
                    word_in_docs += 1
            return np.log10(len(docs) / (word_in_docs + 1))

        def tf_idf(word, doc, docs):
            return tf(word, doc) * idf(word, docs)
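
        # Worked example (invented numbers): with len(docs) == 3 and a term
        # that occurs in 2 of them, idf = log10(3 / (2 + 1)) = 0, so terms
        # present in (almost) every document contribute nothing to the vector.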

        count_dict = defaultdict(int)
        vec = np.zeros(len(bag_of_ngramms))
        for word in doc:
            count_dict[word] += tf_idf(word, doc, docs)

        for key, item in count_dict.items():
            vec[bag_of_ngramms[key]] = item
        return vec

    def cosine_similarity(self, vector1, vector2):
        norm1 = np.linalg.norm(vector1)
        norm2 = np.linalg.norm(vector2)
        if norm1 == 0.0 or norm2 == 0.0:
            return 0
        cosine_sim = np.dot(vector1, vector2) / (norm1 * norm2)
        return round(cosine_sim, 3)

    def calculate_cosine_similarity(self, goal, texts: dict):
        if not goal or not texts:  # nothing to compare -> empty result
            return {}
        corpus = []
        text1_n_grams = self.get_ngrams(self.preprocessing(goal))
        text2_n_grams = {}
        for chapter in texts.keys():
            text2_n_grams[chapter] = self.get_ngrams(self.preprocessing(texts[chapter]))
        corpus.append(text1_n_grams)
        corpus.extend(text2_n_grams.values())
        bag_of_n_grams = self.get_bag_of_n_gramms(corpus)
        goal_vector = self.get_vector_by_BOW(bag_of_n_grams, text1_n_grams, corpus)
        text_vectors = {}
        for chapter, text in text2_n_grams.items():
            text_vectors[chapter] = self.get_vector_by_BOW(bag_of_n_grams, text, corpus)
        result = {}
        for chapter, text_vector in text_vectors.items():
            result[chapter] = self.cosine_similarity(goal_vector, text_vector)
        return result
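
A minimal usage sketch of the new module (toy strings, invented chapter names); calculate_cosine_similarity returns a chapter-to-similarity mapping:

import app.nlp.text_similarity as ts

nlp = ts.NLPProcessor()  # downloads punkt/stopwords on first run
goal = "разработать систему проверки отчетов"
chapters = {
    "1 Обзор": "обзор существующих систем проверки отчетов ...",
    "2 Реализация": "разработанная система сравнивает тексты по косинусному сходству ...",
}
print(nlp.calculate_cosine_similarity(goal, chapters))
# e.g. {'1 Обзор': 0.41, '2 Реализация': 0.37}; exact values depend on the texts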