-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
95 lines (75 loc) · 3.1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import fitz
from PyPDF2 import PdfReader
from ultralytics import YOLO
import os
import shutil
def find_pages_with_keyword(pdf_path, keyword):
    """
    Finds pages whose extracted text contains the specified keyword.

    The match is a case-insensitive substring test against the text
    returned by ``page.get_text()`` for each page of the document.

    Args:
        pdf_path (str): The path to the PDF document.
        keyword (str): The keyword to search for.

    Returns:
        list: 1-based page numbers, in document order, of the pages whose
            text contains the keyword.
    """
    # Lower-case the keyword once instead of once per page.
    needle = keyword.lower()
    with fitz.open(pdf_path) as pdf_document:
        return [
            page_number + 1
            for page_number in range(pdf_document.page_count)
            if needle in pdf_document[page_number].get_text().lower()
        ]
def extract_images_from_pdf(pdf_path, output_folder):
    """
    Extracts images from a PDF document and saves them to the specified folder.

    Every image listed by ``page.get_images(full=True)`` is written to
    ``output_folder`` as ``page_<page>_img_<index>.jpg``, where both numbers
    are 1-based. The folder is created if it does not exist. Any exception
    raised along the way is caught and reported on stdout instead of being
    propagated to the caller.

    Args:
        pdf_path (str): The path to the PDF document.
        output_folder (str): The folder where extracted images will be saved.
    """
    try:
        os.makedirs(output_folder, exist_ok=True)
        # Open through a context manager so the document is closed even when
        # extraction fails part-way through.
        with fitz.open(pdf_path) as pdf_document:
            for page_number in range(pdf_document.page_count):
                page = pdf_document[page_number]
                images = page.get_images(full=True)
                for img_index, img_info in enumerate(images, start=1):
                    # The first entry of each image record is the reference
                    # that extract_image() expects.
                    xref = img_info[0]
                    base_image = pdf_document.extract_image(xref)
                    # NOTE(review): the bytes are written exactly as returned,
                    # but the file is always named ".jpg". PyMuPDF reports the
                    # real format under base_image["ext"]; the fixed suffix is
                    # kept so existing file names do not change - confirm
                    # whether that is intended.
                    image_filename = f"page_{page_number + 1}_img_{img_index}.jpg"
                    full_path = os.path.join(output_folder, image_filename)
                    with open(full_path, "wb") as image_file:
                        image_file.write(base_image["image"])
        print("Image extraction completed.")
    except Exception as e:
        # Best-effort by design: report the failure and return normally.
        print(f"Error during image extraction: {str(e)}")
def detect_images(image_folder, keyword, class_id):
    """
    Keeps only the images in which a YOLOv8 model detects the given class.

    Loads the ``yolov8m.pt`` weights and runs prediction on every file in
    ``image_folder`` (with ``save=True`` and ``show=True``). A file is kept
    when ``class_id`` appears among the detected classes; otherwise it is
    deleted from disk. One status line is printed per image.

    Args:
        image_folder (str): The folder containing images.
        keyword (str): Human-readable name of the object, used only in the
            printed message.
        class_id (int): The class ID of the object to detect.
    """
    model = YOLO("yolov8m.pt")
    for image_name in os.listdir(image_folder):
        image_path = os.path.join(image_folder, image_name)
        if not os.path.isfile(image_path):
            # os.remove() cannot delete directories, so only regular files
            # are considered.
            continue
        results = model.predict(source=image_path, save=True, show=True)
        if not results:
            # Nothing was inferred for this entry; never delete on no data.
            continue
        # Decide once per image so the file cannot be removed twice when
        # several results are returned for the same path.
        if any(class_id in result.boxes.cls for result in results):
            print(f"Image of {keyword} found.")
        else:
            os.remove(image_path)
            print("Image removed.")
# Example usage: uncomment the calls below to run each step on the sample PDF.
output_folder = "./extracted_images"  # destination for the extracted images
pdf_path = "./dog.pdf"  # sample input document

# Step 1: extract the images from the PDF into the output folder.
# extract_images_from_pdf(pdf_path, output_folder)

# Step 2: print the pages whose text contains the keyword 'dog'.
# print(find_pages_with_keyword(pdf_path, "dog"))

# Step 3: detect dogs (class ID 16) and remove the images without one.
# detect_images(output_folder, "dog", 16)