1
+ # USAGE
2
+ # python text_detection_video.py --east frozen_east_text_detection.pb
3
+
4
+ # import the necessary packages
5
+ from imutils .video import VideoStream
6
+ from imutils .video import FPS
7
+ from imutils .object_detection import non_max_suppression
8
+ import numpy as np
9
+ import argparse
10
+ import imutils
11
+ import time
12
+ import cv2
13
+
14
def decode_predictions(scores, geometry, min_confidence=None):
    """Turn the raw EAST output volumes into boxes and confidences.

    Args:
        scores: 4-D array read as ``scores[0, 0, row, col]``; the value is
            compared against ``min_confidence`` for every cell.
        geometry: 4-D array read as ``geometry[0, k, row, col]``. Channels
            0 and 2 are summed into the box height, channels 1 and 3 into
            the box width, and channel 4 is the angle fed to
            ``np.cos`` / ``np.sin``.
        min_confidence: cells scoring below this value are skipped. When
            left as ``None`` the module-level ``args["min_confidence"]``
            is used, so existing two-argument calls behave as before.

    Returns:
        A tuple ``(rects, confidences)``: ``rects`` is a list of
        ``(startX, startY, endX, endY)`` integer tuples and ``confidences``
        holds the matching score for each rectangle, in row-major order.
    """
    if min_confidence is None:
        # fall back to the threshold parsed from the command line
        min_confidence = args["min_confidence"]

    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    # loop over the number of rows
    for y in range(numRows):
        # extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        # loop over the number of columns
        for x in range(numCols):
            # if our score does not have sufficient probability,
            # ignore it
            if scoresData[x] < min_confidence:
                continue

            # compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # use the geometry volume to derive the width and height
            # of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            # add the bounding box coordinates and probability score
            # to our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences)
70
+
71
# build the command-line interface; only the EAST model path is mandatory
ap = argparse.ArgumentParser()
ap.add_argument(
    "-east", "--east",
    type=str,
    required=True,
    help="path to input EAST text detector",
)
ap.add_argument(
    "-v", "--video",
    type=str,
    help="path to optinal input video file",
)
ap.add_argument(
    "-c", "--min-confidence",
    type=float,
    default=0.5,
    help="minimum probability required to inspect a region",
)
ap.add_argument(
    "-w", "--width",
    type=int,
    default=320,
    help="resized image width (should be multiple of 32)",
)
ap.add_argument(
    "-e", "--height",
    type=int,
    default=320,
    help="resized image height (should be multiple of 32)",
)

# expose the parsed options as a plain dict keyed by option name
args = vars(ap.parse_args())
84
+
85
# original frame size and the original-to-network scale ratios are not
# known until the first frame arrives, so they start out unset
W = None
H = None
rW = None
rH = None

# size the frames are resized to before being fed to the network
newW = args["width"]
newH = args["height"]

# the two EAST output layers we read: the first gives the output
# probabilities, the second is used to derive the bounding box
# coordinates of text
layerNames = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]

# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet(args["east"])

if args.get("video", False):
    # a video path was supplied, so read frames from that file
    vs = cv2.VideoCapture(args["video"])
else:
    # no video path: fall back to the web cam (source 0) and pause for a
    # second before reading (presumably to let the camera warm up)
    print("[INFO] starting video stream...")
    vs = VideoStream(src=0).start()
    time.sleep(1.0)

# start the FPS throughput estimator
fps = FPS().start()
114
+
115
# Main processing loop: read frames, run the EAST detector, draw the
# surviving boxes and show the result until the stream ends or `q` is
# pressed. The loop is wrapped in try/finally so the capture source and
# the display windows are released even if a frame raises or the user
# interrupts the script.

# the source type never changes while running, so look it up once;
# VideoCapture.read() returns a tuple whose second item is the frame,
# whereas VideoStream.read() returns the frame itself
usingVideoFile = args.get("video", False)

try:
    # loop over frames from the video stream
    while True:
        # grab the current frame, then handle if we are using a
        # VideoStream or VideoCapture object
        frame = vs.read()
        frame = frame[1] if usingVideoFile else frame

        # check to see if we have reached the end of the stream
        if frame is None:
            break

        # resize the frame, maintaining the aspect ratio
        frame = imutils.resize(frame, width=1000)
        orig = frame.copy()

        # if our frame dimensions are None, we still need to compute the
        # ratio of old frame dimensions to new frame dimensions
        if W is None or H is None:
            (H, W) = frame.shape[:2]
            rW = W / float(newW)
            rH = H / float(newH)

        # resize the frame, this time ignoring aspect ratio
        frame = cv2.resize(frame, (newW, newH))

        # construct a blob from the frame and then perform a forward pass
        # of the model to obtain the two output layer sets
        blob = cv2.dnn.blobFromImage(frame, 1.0, (newW, newH),
            (123.68, 116.78, 103.94), swapRB=True, crop=False)
        net.setInput(blob)
        (scores, geometry) = net.forward(layerNames)

        # decode the predictions, then apply non-maxima suppression to
        # suppress weak, overlapping bounding boxes
        (rects, confidences) = decode_predictions(scores, geometry)
        boxes = non_max_suppression(np.array(rects), probs=confidences)

        # loop over the bounding boxes
        for (startX, startY, endX, endY) in boxes:
            # scale the bounding box coordinates based on the respective
            # ratios so they land on the full-size copy of the frame
            startX = int(startX * rW)
            startY = int(startY * rH)
            endX = int(endX * rW)
            endY = int(endY * rH)

            # draw the bounding box on the frame
            cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

        # update the FPS counter
        fps.update()

        # show the output frame
        cv2.imshow("Text Detection", orig)
        key = cv2.waitKey(1) & 0xFF

        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break

    # stop the timer and display FPS information
    fps.stop()
    print("[INFO] elasped time: {:.2f}".format(fps.elapsed()))
    print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

finally:
    # if we are using a webcam, release the pointer
    if not usingVideoFile:
        vs.stop()

    # otherwise, release the file pointer
    else:
        vs.release()

    # close all windows
    cv2.destroyAllWindows()
0 commit comments