-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathannotation.py
171 lines (141 loc) · 7.27 KB
/
annotation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import streamlit as st
import fitz # PyMuPDF
import os
def extract_annotations_with_content(pdf_file):
    """
    Extract every annotation from a PDF file together with a content string.

    The content string depends on the numeric annotation type code
    (``annot.type[0]``):
        - 8, 9, 11 (highlight, underline, strikeout): the page words whose
          bounding boxes intersect the annotation rectangle, joined by spaces.
        - 0 (text annotation): the annotation's "content" info field.
        - 10 (squiggly): the fixed label "Squiggly Underline".
        - 12: "Signature " followed by the annotation's "content" info field.
        - 1 to 7, except 3: the fixed label "Shape".
        - any other code (including 3): the annotation's "content" info field.

    Parameters:
        pdf_file (str): The path to the input PDF file.

    Returns:
        list: A list of dictionaries, one per annotation in page order, with keys:
            - "page": The page number (1-indexed) where the annotation is found.
            - "type": The numeric annotation type code.
            - "content": The content string described above.
            - "rect": The annotation rectangle as returned by ``annot.rect``
              (left, top, right, bottom).
    """
    # NOTE(review): the numeric type codes below are hard-coded; presumably they
    # match the installed PyMuPDF version's annotation constants - confirm.
    text_markup_types = (8, 9, 11)  # highlight, underline, strikeout

    annotations = []  # Collected annotation dictionaries

    pdf_document = fitz.open(pdf_file)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)

            # Each word entry carries its bounding box in positions 0-3 and
            # its text in position 4.
            words = page.get_text("words")

            for annot in page.annots():
                annotation_type = annot.type[0]  # Integer type code
                rect = annot.rect

                if annotation_type in text_markup_types:
                    # Markup annotations carry no text of their own, so use
                    # the page words lying under the annotation rectangle.
                    content = " ".join(
                        word[4]
                        for word in words
                        if fitz.Rect(word[:4]).intersects(rect)
                    )
                elif annotation_type == 0:  # Text annotation
                    content = annot.info.get("content", "")
                elif annotation_type == 10:  # Squiggly annotation
                    content = "Squiggly Underline"
                elif annotation_type == 12:
                    # Labelled "Signature" here; the sidebar in this file
                    # lists code 12 as a Stamp annotation.
                    content = "Signature " + annot.info.get("content", "")
                elif 1 <= annotation_type <= 7 and annotation_type != 3:
                    # Logical `and` (not bitwise `&`): codes 1-7 except 3,
                    # which falls through to the generic branch below.
                    content = "Shape"
                else:
                    content = annot.info.get("content", "")

                annotations.append(
                    {
                        "page": page_number + 1,  # 1-indexed for display
                        "type": annotation_type,
                        "content": content,
                        "rect": rect,
                    }
                )
    finally:
        # Release the document even if a page or annotation fails to parse.
        pdf_document.close()

    return annotations
def _annotations_as_text(annotations_list):
    """Render annotations one per line with dict braces and quote characters removed."""
    # Single pass instead of four chained replace() calls. Quote characters
    # inside the annotation content are stripped too, as before.
    strip_table = str.maketrans("", "", "{}'\"")
    joined = "\n".join(str(annotation) for annotation in annotations_list)
    return joined.translate(strip_table)


def display_annotations_via_streamlit():
    """
    Render the Streamlit page for uploading a PDF and viewing its annotations.

    The page shows a title, an explanatory note, a PDF uploader and a sidebar
    legend. Once a file is uploaded it is written to a temporary file, passed
    to ``extract_annotations_with_content``, and the temporary file is removed.
    Each extracted annotation is written to the page, followed by a button
    that shows all annotations as plain text and a download button offering
    the same text as ``<pdf name>_output.txt``. If the PDF has no annotations
    a warning is shown instead.

    Returns:
        None
    """
    import tempfile  # Local import: only this function needs it.

    st.title("PDF Annotation Extractor")
    st.info(
        "This app allows you to extract annotations from PDF files. "
        "Underline, strikethrough, highlight and text annotations will have "
        "their content displayed."
    )
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

    st.sidebar.title("Annotation Output Format Meaning")
    for line in (
        "page : page number",
        "type : numerical value",
        "content : data annotation is applied to",
        # The extractor stores annot.rect, i.e. corner coordinates rather
        # than a width and height.
        "rect : rectangle coordinates (left, top, right, bottom)",
    ):
        st.sidebar.write(line)

    st.sidebar.title("Annotation Type Meanings")
    # NOTE(review): presumably this numbering matches the installed PyMuPDF
    # version's annotation type constants - confirm.
    for line in (
        "0: Text Annotation",
        "1: Link Annotation",
        "2: FreeText Annotation",
        "3: Line Annotation",
        "4: Square Annotation",
        "5: Circle Annotation",
        "6: Polygon Annotation",
        "7: PolyLine Annotation",
        "8: Highlight Annotation",
        "9: Underline Annotation",
        "10: Squiggly Annotation",
        "11: Strikeout Annotation",
        "12: Stamp Annotation",
        "13: Caret Annotation",
        "14: Ink Annotation",
        "15: Popup Annotation",
        "16: File Attachment Annotation",
        "17: Sound Annotation",
        "18: Movie Annotation",
        "19: Widget Annotation",
        "20: Screen Annotation",
        "21: PrinterMark Annotation",
        "22: TrapNet Annotation",
        "23: Watermark Annotation",
        "24: 3D Annotation",
        "25: Redact Annotation",
        "26: Projection Annotation",
    ):
        st.sidebar.write(line)

    if not uploaded_file:
        return

    # Use a unique temporary file rather than a fixed "temp.pdf" in the
    # working directory, so concurrent sessions cannot overwrite each other.
    # delete=False lets the file be closed before it is reopened by path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        temp_file.write(uploaded_file.getbuffer())
        temp_path = temp_file.name
    try:
        annotations_list = extract_annotations_with_content(temp_path)
    finally:
        # Remove the temporary file even if extraction fails.
        os.remove(temp_path)

    if not annotations_list:
        st.warning("No annotations found in the uploaded PDF.")
        return

    st.header("Extracted Annotations:")
    for annotation in annotations_list:
        st.write(annotation)

    # Built once and shared by the on-screen view and the download.
    annotations_text = _annotations_as_text(annotations_list)

    # Show all output as plain text so it can be copied.
    if st.button("Copy Annotations as Text"):
        st.text(annotations_text)

    # Offer the same text as a download named after the uploaded PDF.
    pdf_filename = os.path.splitext(uploaded_file.name)[0]
    st.download_button(
        label=f"Download {pdf_filename}_output.txt",
        data=annotations_text,
        file_name=f"{pdf_filename}_output.txt",
        mime="text/plain",
    )
if __name__ == "__main__":
    # Script entry point: build the Streamlit annotation-extractor page.
    display_annotations_via_streamlit()