diff --git a/Dockerfile b/Dockerfile index 1848e1f1b..8f25cf380 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,10 +21,10 @@ RUN apt-get update && apt-get install -y \ RUN git clone https://github.com/ifzhang/ByteTrack \ && cd ByteTrack \ - && git checkout 3434c5e8bc6a5ae8ad530528ba8d9a431967f237 \ + && git checkout 8d52fbdf9cd03757d8dd02c0631e526d164bd726 \ && mkdir -p YOLOX_outputs/yolox_x_mix_det/track_vis \ && sed -i 's/torch>=1.7/torch==1.9.1+cu111/g' requirements.txt \ - && sed -i 's/torchvision==0.10.0/torchvision==0.10.1+cu111/g' requirements.txt \ + && sed -i 's/torchvision>=0.10.0/torchvision==0.10.1+cu111/g' requirements.txt \ && sed -i "s/'cuda'/0/g" tools/demo_track.py \ && pip3 install pip --upgrade \ && pip3 install -r requirements.txt -f https://download.pytorch.org/whl/torch_stable.html \ diff --git a/darknet/darknet.py b/darknet/darknet.py new file mode 100644 index 000000000..ebb0eede2 --- /dev/null +++ b/darknet/darknet.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 + +""" +Python 3 wrapper for identifying objects in images + +Running the script requires opencv-python to be installed (`pip install opencv-python`) +Directly viewing or returning bounding-boxed images requires scikit-image to be installed (`pip install scikit-image`) +Use pip3 instead of pip on some systems to be sure to install modules for python3 +""" + +from ctypes import * +import math +import random +import os + + +class BOX(Structure): + _fields_ = [("x", c_float), + ("y", c_float), + ("w", c_float), + ("h", c_float)] + + +class DETECTION(Structure): + _fields_ = [("bbox", BOX), + ("classes", c_int), + ("best_class_idx", c_int), + ("prob", POINTER(c_float)), + ("mask", POINTER(c_float)), + ("objectness", c_float), + ("sort_class", c_int), + ("uc", POINTER(c_float)), + ("points", c_int), + ("embeddings", POINTER(c_float)), + ("embedding_size", c_int), + ("sim", c_float), + ("track_id", c_int)] + +class DETNUMPAIR(Structure): + _fields_ = [("num", c_int), + ("dets", 
POINTER(DETECTION))] + + +class IMAGE(Structure): + _fields_ = [("w", c_int), + ("h", c_int), + ("c", c_int), + ("data", POINTER(c_float))] + + +class METADATA(Structure): + _fields_ = [("classes", c_int), + ("names", POINTER(c_char_p))] + + +def network_width(net): + return lib.network_width(net) + + +def network_height(net): + return lib.network_height(net) + + +def bbox2points(bbox): + """ + From bounding box yolo format + to corner points cv2 rectangle + """ + x, y, w, h = bbox + xmin = int(round(x - (w / 2))) + xmax = int(round(x + (w / 2))) + ymin = int(round(y - (h / 2))) + ymax = int(round(y + (h / 2))) + return xmin, ymin, xmax, ymax + + +def class_colors(names): + """ + Create a dict with one random BGR color for each + class name + """ + return {name: ( + random.randint(0, 255), + random.randint(0, 255), + random.randint(0, 255)) for name in names} + + +def load_network(config_file, data_file, weights, batch_size=1): + """ + load model description and weights from config files + args: + config_file (str): path to .cfg model file + data_file (str): path to .data model file + weights (str): path to weights + returns: + network: trained model + class_names + class_colors + """ + network = load_net_custom( + config_file.encode("ascii"), + weights.encode("ascii"), 0, batch_size) + metadata = load_meta(data_file.encode("ascii")) + class_names = [metadata.names[i].decode("ascii") for i in range(metadata.classes)] + colors = class_colors(class_names) + return network, class_names, colors + + +def print_detections(detections, coordinates=False): + print("\nObjects:") + for label, confidence, bbox in detections: + x, y, w, h = bbox + if coordinates: + print("{}: {}% (left_x: {:.0f} top_y: {:.0f} width: {:.0f} height: {:.0f})".format(label, confidence, x, y, w, h)) + else: + print("{}: {}%".format(label, confidence)) + + +def draw_boxes(detections, image, colors): + import cv2 + for label, confidence, bbox in detections: + left, top, right, bottom = 
bbox2points(bbox) + cv2.rectangle(image, (left, top), (right, bottom), colors[label], 1) + cv2.putText(image, "{} [{:.2f}]".format(label, float(confidence)), + (left, top - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, + colors[label], 2) + return image + + +def decode_detection(detections): + decoded = [] + for label, confidence, bbox in detections: + confidence = str(round(confidence * 100, 2)) + decoded.append((str(label), confidence, bbox)) + return decoded + +# https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/ +# Malisiewicz et al. +def non_max_suppression_fast(detections, overlap_thresh): + boxes = [] + for detection in detections: + _, _, _, (x, y, w, h) = detection + x1 = x - w / 2 + y1 = y - h / 2 + x2 = x + w / 2 + y2 = y + h / 2 + boxes.append(np.array([x1, y1, x2, y2])) + boxes_array = np.array(boxes) + + # initialize the list of picked indexes + pick = [] + # grab the coordinates of the bounding boxes + x1 = boxes_array[:, 0] + y1 = boxes_array[:, 1] + x2 = boxes_array[:, 2] + y2 = boxes_array[:, 3] + # compute the area of the bounding boxes and sort the bounding + # boxes by the bottom-right y-coordinate of the bounding box + area = (x2 - x1 + 1) * (y2 - y1 + 1) + idxs = np.argsort(y2) + # keep looping while some indexes still remain in the indexes + # list + while len(idxs) > 0: + # grab the last index in the indexes list and add the + # index value to the list of picked indexes + last = len(idxs) - 1 + i = idxs[last] + pick.append(i) + # find the largest (x, y) coordinates for the start of + # the bounding box and the smallest (x, y) coordinates + # for the end of the bounding box + xx1 = np.maximum(x1[i], x1[idxs[:last]]) + yy1 = np.maximum(y1[i], y1[idxs[:last]]) + xx2 = np.minimum(x2[i], x2[idxs[:last]]) + yy2 = np.minimum(y2[i], y2[idxs[:last]]) + # compute the width and height of the bounding box + w = np.maximum(0, xx2 - xx1 + 1) + h = np.maximum(0, yy2 - yy1 + 1) + # compute the ratio of overlap + overlap = (w * h) / 
area[idxs[:last]] + # delete all indexes from the index list that have + idxs = np.delete(idxs, np.concatenate(([last], + np.where(overlap > overlap_thresh)[0]))) + # return only the bounding boxes that were picked using the + # integer data type + return [detections[i] for i in pick] + +def remove_negatives(detections, class_names, num): + """ + Remove all classes with 0% confidence within the detection + """ + predictions = [] + for j in range(num): + for idx, name in enumerate(class_names): + if detections[j].prob[idx] > 0: + bbox = detections[j].bbox + bbox = (bbox.x, bbox.y, bbox.w, bbox.h) + predictions.append((name, detections[j].prob[idx], (bbox))) + return predictions + + +def remove_negatives_faster(detections, class_names, num): + """ + Faster version of remove_negatives (very useful when using yolo9000) + """ + predictions = [] + for j in range(num): + if detections[j].best_class_idx == -1: + continue + name = class_names[detections[j].best_class_idx] + bbox = detections[j].bbox + bbox = (bbox.x, bbox.y, bbox.w, bbox.h) + predictions.append((name, detections[j].prob[detections[j].best_class_idx], bbox)) + return predictions + + +def detect_image(network, class_names, image, thresh=.5, hier_thresh=.5, nms=.45): + """ + Returns a list with highest confidence class and their bbox + """ + pnum = pointer(c_int(0)) + predict_image(network, image) + detections = get_network_boxes(network, image.w, image.h, + thresh, hier_thresh, None, 0, pnum, 0) + num = pnum[0] + if nms: + do_nms_sort(detections, num, len(class_names), nms) + predictions = remove_negatives(detections, class_names, num) + predictions = decode_detection(predictions) + free_detections(detections, num) + return sorted(predictions, key=lambda x: x[1]) + + +if os.name == "posix": + cwd = os.path.dirname(__file__) + lib = CDLL(cwd + "/libdarknet.so", RTLD_GLOBAL) +elif os.name == "nt": + cwd = os.path.dirname(__file__) + os.environ['PATH'] = cwd + ';' + os.environ['PATH'] + lib = CDLL("darknet.dll", 
RTLD_GLOBAL) +else: + print("Unsupported OS") + exit + +lib.network_width.argtypes = [c_void_p] +lib.network_width.restype = c_int +lib.network_height.argtypes = [c_void_p] +lib.network_height.restype = c_int + +copy_image_from_bytes = lib.copy_image_from_bytes +copy_image_from_bytes.argtypes = [IMAGE,c_char_p] + +predict = lib.network_predict_ptr +predict.argtypes = [c_void_p, POINTER(c_float)] +predict.restype = POINTER(c_float) + +set_gpu = lib.cuda_set_device +init_cpu = lib.init_cpu + +make_image = lib.make_image +make_image.argtypes = [c_int, c_int, c_int] +make_image.restype = IMAGE + +get_network_boxes = lib.get_network_boxes +get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int), c_int] +get_network_boxes.restype = POINTER(DETECTION) + +make_network_boxes = lib.make_network_boxes +make_network_boxes.argtypes = [c_void_p] +make_network_boxes.restype = POINTER(DETECTION) + +free_detections = lib.free_detections +free_detections.argtypes = [POINTER(DETECTION), c_int] + +free_batch_detections = lib.free_batch_detections +free_batch_detections.argtypes = [POINTER(DETNUMPAIR), c_int] + +free_ptrs = lib.free_ptrs +free_ptrs.argtypes = [POINTER(c_void_p), c_int] + +network_predict = lib.network_predict_ptr +network_predict.argtypes = [c_void_p, POINTER(c_float)] + +reset_rnn = lib.reset_rnn +reset_rnn.argtypes = [c_void_p] + +load_net = lib.load_network +load_net.argtypes = [c_char_p, c_char_p, c_int] +load_net.restype = c_void_p + +load_net_custom = lib.load_network_custom +load_net_custom.argtypes = [c_char_p, c_char_p, c_int, c_int] +load_net_custom.restype = c_void_p + +free_network_ptr = lib.free_network_ptr +free_network_ptr.argtypes = [c_void_p] +free_network_ptr.restype = c_void_p + +do_nms_obj = lib.do_nms_obj +do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float] + +do_nms_sort = lib.do_nms_sort +do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float] + +free_image = 
lib.free_image +free_image.argtypes = [IMAGE] + +letterbox_image = lib.letterbox_image +letterbox_image.argtypes = [IMAGE, c_int, c_int] +letterbox_image.restype = IMAGE + +load_meta = lib.get_metadata +lib.get_metadata.argtypes = [c_char_p] +lib.get_metadata.restype = METADATA + +load_image = lib.load_image_color +load_image.argtypes = [c_char_p, c_int, c_int] +load_image.restype = IMAGE + +rgbgr_image = lib.rgbgr_image +rgbgr_image.argtypes = [IMAGE] + +predict_image = lib.network_predict_image +predict_image.argtypes = [c_void_p, IMAGE] +predict_image.restype = POINTER(c_float) + +predict_image_letterbox = lib.network_predict_image_letterbox +predict_image_letterbox.argtypes = [c_void_p, IMAGE] +predict_image_letterbox.restype = POINTER(c_float) + +network_predict_batch = lib.network_predict_batch +network_predict_batch.argtypes = [c_void_p, IMAGE, c_int, c_int, c_int, + c_float, c_float, POINTER(c_int), c_int, c_int] +network_predict_batch.restype = POINTER(DETNUMPAIR) diff --git a/darknet/libdarknet.so b/darknet/libdarknet.so new file mode 100755 index 000000000..a1078b9ee Binary files /dev/null and b/darknet/libdarknet.so differ diff --git a/exps/example/mot/yolov4.py b/exps/example/mot/yolov4.py new file mode 100644 index 000000000..187f20d05 --- /dev/null +++ b/exps/example/mot/yolov4.py @@ -0,0 +1,147 @@ +# encoding: utf-8 +import os +import random +import torch +import torch.nn as nn +import torch.distributed as dist + +from yolox.exp import Exp as MyExp +from yolox.data import get_yolox_datadir +import darknet.darknet as darknet +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.num_classes = 1 + self.depth = 1.33 + self.width = 1.25 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.train_ann = "train.json" + self.val_ann = "test.json" # change to train.json when running on training set + self.input_size = (608, 608) + self.test_size = (608, 608) + self.random_size = (20, 36) + self.max_epoch = 
80 + self.print_interval = 20 + self.eval_interval = 5 + self.test_conf = 0.001 + self.nmsthre = 0.7 + self.no_aug_epochs = 10 + self.basic_lr_per_img = 0.001 / 64.0 + self.warmup_epochs = 1 + + def get_model(self, config_file, data_file, weights_file): + network, class_names, class_colors = darknet.load_network( + config_file, + data_file, + weights_file, + batch_size=1 + ) + return network, class_names, class_colors + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + MOTDataset, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = MOTDataset( + data_dir=os.path.join(get_yolox_datadir(), "mix_mot20_ch"), + json_file=self.train_ann, + name='', + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=600, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=1200, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import MOTDataset, ValTransform + + valdataset = MOTDataset( + 
data_dir=os.path.join(get_yolox_datadir(), "MOT20"), + json_file=self.val_ann, + img_size=self.test_size, + name='test', # change to train when running on training set + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import COCOEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = COCOEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + testdev=testdev, + ) + return evaluator diff --git a/exps/example/mot/yolov4_tiny_signate.py b/exps/example/mot/yolov4_tiny_signate.py new file mode 100644 index 000000000..4ad46d37e --- /dev/null +++ b/exps/example/mot/yolov4_tiny_signate.py @@ -0,0 +1,147 @@ +# encoding: utf-8 +import os +import random +import torch +import torch.nn as nn +import torch.distributed as dist + +from yolox.exp import Exp as MyExp +from yolox.data import get_yolox_datadir +import darknet.darknet as darknet +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.num_classes = 1 + self.depth = 1.33 + self.width = 1.25 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.train_ann = "train.json" + self.val_ann = "test.json" # change to train.json when running on training set + self.input_size = (416, 416) + self.test_size = (416, 
416) + self.random_size = (20, 36) + self.max_epoch = 80 + self.print_interval = 20 + self.eval_interval = 5 + self.test_conf = 0.001 + self.nmsthre = 0.7 + self.no_aug_epochs = 10 + self.basic_lr_per_img = 0.001 / 64.0 + self.warmup_epochs = 1 + + def get_model(self, config_file, data_file, weights_file): + network, class_names, class_colors = darknet.load_network( + config_file, + data_file, + weights_file, + batch_size=1 + ) + return network, class_names, class_colors + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + MOTDataset, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = MOTDataset( + data_dir=os.path.join(get_yolox_datadir(), "mix_mot20_ch"), + json_file=self.train_ann, + name='', + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=600, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=1200, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import 
MOTDataset, ValTransform + + valdataset = MOTDataset( + data_dir=os.path.join(get_yolox_datadir(), "MOT20"), + json_file=self.val_ann, + img_size=self.test_size, + name='test', # change to train when running on training set + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import COCOEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = COCOEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + testdev=testdev, + ) + return evaluator diff --git a/tools/demo_track.py b/tools/demo_track.py index 4f4e7dc3a..5c2d9bf89 100644 --- a/tools/demo_track.py +++ b/tools/demo_track.py @@ -13,7 +13,7 @@ from yolox.utils.visualize import plot_tracking from yolox.tracker.byte_tracker import BYTETracker from yolox.tracking_utils.timer import Timer - +from tools.predictor import Predictor, get_predictor IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"] @@ -26,6 +26,8 @@ def make_parser(): parser.add_argument("-expn", "--experiment-name", type=str, default=None) parser.add_argument("-n", "--name", type=str, default=None, help="model name") + parser.add_argument("--model_type", type=str, default="yolox", choices=["yolox", "darknet"]) + parser.add_argument( #"--path", 
default="./datasets/mot/train/MOT17-05-FRCNN/img1", help="path to images or video" "--path", default="./videos/palace.mp4", help="path to images or video" @@ -77,16 +79,37 @@ def make_parser(): action="store_true", help="Using TensorRT model for testing.", ) + parser.add_argument( + "--config_file", + help="darknet config file(.cfg)", + default=None + ) + parser.add_argument( + "--data_file", + help="darknet data file(.data)", + default=None + ) + parser.add_argument( + "--weights_file", + help="darknet weights file(.weights)", + default=None + ) # tracking args parser.add_argument("--track_thresh", type=float, default=0.5, help="tracking confidence threshold") parser.add_argument("--track_buffer", type=int, default=30, help="the frames for keep lost tracks") parser.add_argument("--match_thresh", type=float, default=0.8, help="matching threshold for tracking") parser.add_argument( "--aspect_ratio_thresh", type=float, default=1.6, - help="threshold for filtering out boxes of which aspect ratio are above the given value." + help="threshold for filtering out boxes of which aspect ratio are above the given value. -1 means not to use filtering." 
) parser.add_argument('--min_box_area', type=float, default=10, help='filter out tiny boxes') parser.add_argument("--mot20", dest="mot20", default=False, action="store_true", help="test mot20.") + parser.add_argument( + "--tracking_classes", + nargs="+", + type=int, + default=[0] + ) return parser @@ -114,66 +137,6 @@ def write_results(filename, results): logger.info('save results to {}'.format(filename)) -class Predictor(object): - def __init__( - self, - model, - exp, - trt_file=None, - decoder=None, - device=torch.device("cpu"), - fp16=False - ): - self.model = model - self.decoder = decoder - self.num_classes = exp.num_classes - self.confthre = exp.test_conf - self.nmsthre = exp.nmsthre - self.test_size = exp.test_size - self.device = device - self.fp16 = fp16 - if trt_file is not None: - from torch2trt import TRTModule - - model_trt = TRTModule() - model_trt.load_state_dict(torch.load(trt_file)) - - x = torch.ones((1, 3, exp.test_size[0], exp.test_size[1]), device=device) - self.model(x) - self.model = model_trt - self.rgb_means = (0.485, 0.456, 0.406) - self.std = (0.229, 0.224, 0.225) - - def inference(self, img, timer): - img_info = {"id": 0} - if isinstance(img, str): - img_info["file_name"] = osp.basename(img) - img = cv2.imread(img) - else: - img_info["file_name"] = None - - height, width = img.shape[:2] - img_info["height"] = height - img_info["width"] = width - img_info["raw_img"] = img - - img, ratio = preproc(img, self.test_size, self.rgb_means, self.std) - img_info["ratio"] = ratio - img = torch.from_numpy(img).unsqueeze(0).float().to(self.device) - if self.fp16: - img = img.half() # to FP16 - - with torch.no_grad(): - timer.tic() - outputs = self.model(img) - if self.decoder is not None: - outputs = self.decoder(outputs, dtype=outputs.type()) - outputs = postprocess( - outputs, self.num_classes, self.confthre, self.nmsthre - ) - #logger.info("Infer time: {:.4f}s".format(time.time() - t0)) - return outputs, img_info - def image_demo(predictor, 
vis_folder, current_time, args): if osp.isdir(args.path): @@ -196,7 +159,7 @@ def image_demo(predictor, vis_folder, current_time, args): tlwh = t.tlwh tid = t.track_id vertical = tlwh[2] / tlwh[3] > args.aspect_ratio_thresh - if tlwh[2] * tlwh[3] > args.min_box_area and not vertical: + if tlwh[2] * tlwh[3] > args.min_box_area and (not vertical or args.aspect_ratio_thresh < 0): online_tlwhs.append(tlwh) online_ids.append(tid) online_scores.append(t.score) @@ -233,15 +196,16 @@ def image_demo(predictor, vis_folder, current_time, args): logger.info(f"save results to {res_file}") -def imageflow_demo(predictor, vis_folder, current_time, args): - cap = cv2.VideoCapture(args.path if args.demo == "video" else args.camid) +def imageflow_demo(predictor, vis_folder, current_time, args, save_folder=None): + cap = cv2.VideoCapture(args.path if "video" in args.demo else args.camid) width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float fps = cap.get(cv2.CAP_PROP_FPS) timestamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time) - save_folder = osp.join(vis_folder, timestamp) + if save_folder is None: + save_folder = osp.join(vis_folder, timestamp) os.makedirs(save_folder, exist_ok=True) - if args.demo == "video": + if "video" in args.demo: save_path = osp.join(save_folder, args.path.split("/")[-1]) else: save_path = osp.join(save_folder, "camera.mp4") @@ -249,7 +213,10 @@ def imageflow_demo(predictor, vis_folder, current_time, args): vid_writer = cv2.VideoWriter( save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) ) - tracker = BYTETracker(args, frame_rate=30) + image_keep_ratio = True + if args.model_type == "darknet": + image_keep_ratio = False + trackers = [BYTETracker(args, target_class=i, frame_rate=args.fps, image_keep_ratio=image_keep_ratio) for i in args.tracking_classes] timer = Timer() frame_id = 0 results = [] @@ -259,26 +226,35 @@ def imageflow_demo(predictor, vis_folder, current_time, args):
ret_val, frame = cap.read() if ret_val: outputs, img_info = predictor.inference(frame, timer) - if outputs[0] is not None: - online_targets = tracker.update(outputs[0], [img_info['height'], img_info['width']], exp.test_size) - online_tlwhs = [] - online_ids = [] - online_scores = [] - for t in online_targets: - tlwh = t.tlwh - tid = t.track_id - vertical = tlwh[2] / tlwh[3] > args.aspect_ratio_thresh - if tlwh[2] * tlwh[3] > args.min_box_area and not vertical: - online_tlwhs.append(tlwh) - online_ids.append(tid) - online_scores.append(t.score) - results.append( - f"{frame_id},{tid},{tlwh[0]:.2f},{tlwh[1]:.2f},{tlwh[2]:.2f},{tlwh[3]:.2f},{t.score:.2f},-1,-1,-1\n" - ) - timer.toc() - online_im = plot_tracking( - img_info['raw_img'], online_tlwhs, online_ids, frame_id=frame_id + 1, fps=1. / timer.average_time - ) + if outputs[0] is not None and len(outputs[0]) > 0: + tracking_results = [] + for tracker in trackers: + target_class = tracker.target_class + online_targets = tracker.update(outputs[0], [img_info['height'], img_info['width']], exp.test_size) + online_tlwhs = [] + online_ids = [] + online_scores = [] + for t in online_targets: + tlwh = t.tlwh + tid = t.track_id + # args.aspect_ratio_thresh < 0 means not to use aspect ratio filtering + vertical = tlwh[2] / tlwh[3] > args.aspect_ratio_thresh + ok_aspect_ratio = ((not vertical) or args.aspect_ratio_thresh < 0) + ok_box_area = tlwh[2] * tlwh[3] > args.min_box_area + if ok_aspect_ratio and ok_box_area: + online_tlwhs.append(tlwh) + online_ids.append(tid) + online_scores.append(t.score) + results.append( + f"{frame_id},{tid}, {target_class}, {tlwh[0]:.2f},{tlwh[1]:.2f},{tlwh[2]:.2f},{tlwh[3]:.2f},{t.score:.2f},-1,-1,-1\n" + ) + tracking_results.append((online_tlwhs, online_ids, online_scores)) + timer.toc() + online_im = img_info['raw_img'].copy() + for (online_tlwhs, online_ids, online_scores) in tracking_results: + online_im = plot_tracking( + online_im, online_tlwhs, online_ids, frame_id=frame_id + 1,
fps=1. / timer.average_time + ) else: timer.toc() online_im = img_info['raw_img'] @@ -292,7 +268,7 @@ def imageflow_demo(predictor, vis_folder, current_time, args): frame_id += 1 if args.save_result: - res_file = osp.join(vis_folder, f"{timestamp}.txt") + res_file = osp.join(save_folder, args.path.split("/")[-1].replace(".mp4", ".txt")) with open(res_file, 'w') as f: f.writelines(results) logger.info(f"save results to {res_file}") @@ -309,10 +285,6 @@ def main(exp, args): vis_folder = osp.join(output_dir, "track_vis") os.makedirs(vis_folder, exist_ok=True) - if args.trt: - args.device = "gpu" - args.device = torch.device("cuda" if args.device == "gpu" else "cpu") - logger.info("Args: {}".format(args)) if args.conf is not None: @@ -322,51 +294,26 @@ def main(exp, args): if args.tsize is not None: exp.test_size = (args.tsize, args.tsize) - model = exp.get_model().to(args.device) - logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size))) - model.eval() - - if not args.trt: - if args.ckpt is None: - ckpt_file = osp.join(output_dir, "best_ckpt.pth.tar") - else: - ckpt_file = args.ckpt - logger.info("loading checkpoint") - ckpt = torch.load(ckpt_file, map_location="cpu") - # load the model state dict - model.load_state_dict(ckpt["model"]) - logger.info("loaded checkpoint done.") - - if args.fuse: - logger.info("\tFusing model...") - model = fuse_model(model) - - if args.fp16: - model = model.half() # to FP16 - - if args.trt: - assert not args.fuse, "TensorRT model is not support model fusing!" - trt_file = osp.join(output_dir, "model_trt.pth") - assert osp.exists( - trt_file - ), "TensorRT model is not found!\n Run python3 tools/trt.py first!" 
-        model.head.decode_in_inference = False
-        decoder = model.head.decode_outputs
-        logger.info("Using TensorRT to inference")
-    else:
-        trt_file = None
-        decoder = None
-
-    predictor = Predictor(model, exp, trt_file, decoder, args.device, args.fp16)
+    predictor = get_predictor(exp, args)
     current_time = time.localtime()
     if args.demo == "image":
         image_demo(predictor, vis_folder, current_time, args)
     elif args.demo == "video" or args.demo == "webcam":
         imageflow_demo(predictor, vis_folder, current_time, args)
+    elif args.demo == "video_multiple":
+        import glob
+        videos = glob.glob(osp.join(args.path, "*.mp4"))
+        logger.info(f"Found {len(videos)} videos")
+        timestamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
+        save_base_folder = osp.join(vis_folder, timestamp)
+        for video in videos:
+            print(video)
+            args.path = video
+            save_folder = osp.join(save_base_folder, osp.basename(video).replace('.mp4', ''))
+            imageflow_demo(predictor, vis_folder, current_time, args, save_folder)
 
 
 if __name__ == "__main__":
     args = make_parser().parse_args()
     exp = get_exp(args.exp_file, args.name)
 
-    main(exp, args)
diff --git a/tools/make_submit.py b/tools/make_submit.py
new file mode 100644
index 000000000..70cebc429
--- /dev/null
+++ b/tools/make_submit.py
@@ -0,0 +1,94 @@
+import json
+import argparse
+import glob
+import os
+
+
+VIDEO_NUM = 74
+FRAME_WIDTH = 1936
+FRAME_HEIGHT = 1216
+TEST_VIDEO_FRAME_NUM = 150 #5fps * 30sec
+
+class Bbox:
+    def __init__(self, line):
+        elems = line.split(',')
+        elems = [int(float(x.strip())) for x in elems]
+        self.frame_id = elems[0]
+        self.object_id = elems[1]
+        self.category = elems[2]
+        # x1, y1, w, h
+        self.x1 = max(0, elems[3])
+        self.y1 = max(0, elems[4])
+        self.x2 = min(FRAME_WIDTH, elems[3] + elems[5])
+        self.y2 = min(FRAME_HEIGHT, elems[4] + elems[6])
+
+    def to_bboxdic(self):
+        return {
+            "id": self.object_id,
+            "box2d": [self.x1, self.y1, self.x2, self.y2]
+        }
+    @staticmethod
+    def convert_line_to_bboxes(lines):
+        return [Bbox(line) for line in lines]
+
+def count_tracked_frames(bboxes):
+    count_dic = {}
+    for bbox in bboxes:
+        key = (bbox.object_id, bbox.category)
+        if key not in count_dic:
+            count_dic[key] = 0
+        count_dic[key] += 1
+    return count_dic
+
+def remove_few_frame_bboxes(count_dic, bboxes):
+    new_bboxes = []
+    for bbox in bboxes:
+        key = (bbox.object_id, bbox.category)
+        if count_dic[key] < 3:
+            print(f"Removed object_id={bbox.object_id}, category={bbox.category}")
+        else:
+            new_bboxes.append(bbox)
+    return new_bboxes
+
+def bboxes_to_dic(bboxes):
+    # readme.txt on SIGNATE says:
+    # If you do not want to make any prediction in some frames, just write "{}" in the corresponding frames.
+
+    result_list = [{} for i in range(TEST_VIDEO_FRAME_NUM)]
+    for bbox in bboxes:
+        assert(bbox.frame_id < TEST_VIDEO_FRAME_NUM)
+        category_name = ["Car", "Pedestrian"][bbox.category]
+        if category_name not in result_list[bbox.frame_id]:
+            result_list[bbox.frame_id][category_name] = []
+        result_list[bbox.frame_id][category_name].append(bbox.to_bboxdic())
+    return result_list
+
+def process(txt_file_path):
+    bboxes = Bbox.convert_line_to_bboxes(open(txt_file_path).readlines())
+    # 1. count number of tracked frames for each tracked id
+    count_dic = count_tracked_frames(bboxes)
+    # 2. remove bboxes whose tracked frame num is less than 3
+    bboxes = remove_few_frame_bboxes(count_dic, bboxes)
+    # 3. convert to list of dictionary
+    result_list = bboxes_to_dic(bboxes)
+    return result_list
+
+def main(args):
+    txt_file_paths = glob.glob(os.path.join(args.result_dir, "*/test_*.txt"))
+    txt_file_paths = sorted(txt_file_paths)
+    assert(len(txt_file_paths) == VIDEO_NUM)
+
+    result_dic = {}
+    for txt_file_path in txt_file_paths:
+        video_name = os.path.basename(txt_file_path).replace(".txt", ".mp4")
+        result_dic[video_name] = process(txt_file_path)
+
+    open(args.output_path, "w").write(json.dumps(result_dic))
+    print("Done!")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--result_dir", type=str)
+    parser.add_argument("--output_path", type=str, default="./submit.json")
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file
diff --git a/tools/predictor.py b/tools/predictor.py
new file mode 100644
index 000000000..340a12118
--- /dev/null
+++ b/tools/predictor.py
@@ -0,0 +1,182 @@
+import os.path as osp
+import os
+from loguru import logger
+from yolox.data.data_augment import preproc
+from yolox.utils import fuse_model, get_model_info, postprocess
+
+import torch
+import cv2
+import numpy as np
+
+class Predictor(object):
+    def __init__(
+        self,
+        model,
+        exp,
+        trt_file=None,
+        decoder=None,
+        device=torch.device("cpu"),
+        fp16=False
+    ):
+        self.model = model
+        self.decoder = decoder
+        self.num_classes = exp.num_classes
+        self.confthre = exp.test_conf
+        self.nmsthre = exp.nmsthre
+        self.test_size = exp.test_size
+        self.device = device
+        self.fp16 = fp16
+        if trt_file is not None:
+            from torch2trt import TRTModule
+
+            model_trt = TRTModule()
+            model_trt.load_state_dict(torch.load(trt_file))
+
+            x = torch.ones((1, 3, exp.test_size[0], exp.test_size[1]), device=device)
+            self.model(x)
+            self.model = model_trt
+        self.rgb_means = (0.485, 0.456, 0.406)
+        self.std = (0.229, 0.224, 0.225)
+
+    def inference(self, img, timer):
+        img_info = {"id": 0}
+        if isinstance(img, str):
+            img_info["file_name"] = osp.basename(img)
+            img = cv2.imread(img)
+        else:
+            img_info["file_name"] = None
+
+        height, width = img.shape[:2]
+        img_info["height"] = height
+        img_info["width"] = width
+        img_info["raw_img"] = img
+
+        img, ratio = preproc(img, self.test_size, self.rgb_means, self.std)
+        img_info["ratio"] = (ratio, ratio)
+        img = torch.from_numpy(img).unsqueeze(0).float().to(self.device)
+        if self.fp16:
+            img = img.half()  # to FP16
+
+        with torch.no_grad():
+            timer.tic()
+            outputs = self.model(img)
+            if self.decoder is not None:
+                outputs = self.decoder(outputs, dtype=outputs.type())
+            outputs = postprocess(
+                outputs, self.num_classes, self.confthre, self.nmsthre
+            )
+            #logger.info("Infer time: {:.4f}s".format(time.time() - t0))
+        return outputs, img_info
+
+import darknet.darknet as darknet
+
+class YOLOv4Predictor(Predictor):
+    def __init__(
+        self,
+        network,
+        class_names,
+        class_colors,
+        exp
+    ):
+        self.confthre = exp.test_conf
+        self.nmsthre = exp.nmsthre
+        self.network = network
+        self.class_names = class_names
+        self.class_colors = class_colors
+        self.class_name_dic = {x: i for i, x in enumerate(class_names)}
+
+
+    def inference(self, img, timer):
+        net_width = darknet.network_width(self.network)
+        net_height = darknet.network_height(self.network)
+
+        # img_info
+        img_info = {"id": 0}
+        img_info["file_name"] = None
+        height, width = img.shape[:2]
+        img_info["height"] = height
+        img_info["width"] = width
+        img_info["raw_img"] = img
+        img_info["ratio"] = (net_width/width, net_height/height)
+        # preprocessing
+        darknet_image = darknet.make_image(net_width, net_height, 3)
+
+        image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        image_resized = cv2.resize(image_rgb, (net_width, net_height),
+                                   interpolation=cv2.INTER_LINEAR)
+        darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())
+        # inference
+        timer.tic()
+        detections = darknet.detect_image(self.network, self.class_names, darknet_image, thresh=self.confthre, nms=self.nmsthre)
+        darknet.free_image(darknet_image)
+
+        outputs = []
+        for label, confidence, bbox in detections:
+            class_id = self.class_name_dic[label]
+            left, top, right, bottom = darknet.bbox2points(bbox)
+            confidence = float(confidence)/100.0
+            outputs.append([left, top, right, bottom, confidence, class_id])
+        # wrap as batched list
+        outputs = [outputs]
+        outputs = np.array(outputs)
+        return outputs, img_info
+
+
+def get_yolox_predictor(exp, args, output_dir):
+    if isinstance(args.device, str):
+        args.device = torch.device("cuda" if args.device == "gpu" else "cpu")
+
+    model = exp.get_model().to(args.device)
+    logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
+    model.eval()
+
+    if not args.trt:
+        if args.ckpt is None:
+            ckpt_file = osp.join(output_dir, "best_ckpt.pth.tar")
+        else:
+            ckpt_file = args.ckpt
+        logger.info("loading checkpoint")
+        ckpt = torch.load(ckpt_file, map_location="cpu")
+        # load the model state dict
+        model.load_state_dict(ckpt["model"])
+        logger.info("loaded checkpoint done.")
+
+    if args.fuse:
+        logger.info("\tFusing model...")
+        model = fuse_model(model)
+
+    if args.fp16:
+        model = model.half()  # to FP16
+
+    if args.trt:
+        assert not args.fuse, "TensorRT model is not support model fusing!"
+        trt_file = osp.join(output_dir, "model_trt.pth")
+        assert osp.exists(
+            trt_file
+        ), "TensorRT model is not found!\n Run python3 tools/trt.py first!"
+        model.head.decode_in_inference = False
+        decoder = model.head.decode_outputs
+        logger.info("Using TensorRT to inference")
+    else:
+        trt_file = None
+        decoder = None
+
+    return Predictor(model, exp, trt_file=trt_file, decoder=decoder, device=args.device, fp16=args.fp16)
+
+def get_darknet_predictor(exp, args):
+    network, class_names, class_colors = exp.get_model(
+        args.config_file,
+        args.data_file,
+        args.weights_file
+    )
+    return YOLOv4Predictor(network, class_names, class_colors, exp)
+
+def get_predictor(exp, args) -> Predictor:
+    output_dir = osp.join(exp.output_dir, args.experiment_name)
+    os.makedirs(output_dir, exist_ok=True)
+    if args.model_type == 'yolox':
+        return get_yolox_predictor(exp, args, output_dir)
+    elif args.model_type == 'darknet':
+        return get_darknet_predictor(exp, args)
+    else:
+        raise Exception("unreachable")
diff --git a/tools/test_predictor.py b/tools/test_predictor.py
new file mode 100644
index 000000000..5d9d1f7f3
--- /dev/null
+++ b/tools/test_predictor.py
@@ -0,0 +1,130 @@
+from tools.predictor import get_predictor
+from yolox.exp import get_exp
+from yolox.tracking_utils.timer import Timer
+import cv2
+import torch
+import argparse
+import numpy as np
+
+def make_parser():
+    parser = argparse.ArgumentParser("Test Predictor!")
+    # parser.add_argument(
+    #     "demo", default="image", help="demo type, eg. image, video and webcam"
+    # )
+    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
+
+    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+
+    # parser.add_argument(
+    #     "--path", default="./videos/palace.mp4", help="path to images or video"
+    # )
+    parser.add_argument("--model_type", type=str, default="yolox", choices=["yolox", "darknet"])
+    parser.add_argument(
+        "--img_path", default="./images/image_0090.jpg"
+    )
+    parser.add_argument(
+        "--output_path", default="./bboxes.jpg"
+    )
+    parser.add_argument(
+        "--device",
+        default="gpu",
+        type=str,
+        help="device to run our model, can either be cpu or gpu",
+    )
+    parser.add_argument(
+        "-f",
+        "--exp_file",
+        default=None,
+        type=str,
+        help="pls input your expriment description file",
+    )
+    parser.add_argument(
+        "--trt",
+        dest="trt",
+        default=False,
+        action="store_true",
+        help="Using TensorRT model for testing.",
+    )
+    parser.add_argument(
+        "--fp16",
+        dest="fp16",
+        default=False,
+        action="store_true",
+        help="Adopting mix precision evaluating.",
+    )
+    parser.add_argument(
+        "--fuse",
+        dest="fuse",
+        default=False,
+        action="store_true",
+        help="Fuse conv and bn for testing.",
+    )
+    parser.add_argument(
+        "--conf_thresh",
+        default=0.75,
+        type=float,
+        help="threshold for drawing detection box"
+    )
+    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
+    parser.add_argument(
+        "--config_file",
+        help="darknet config file(.cfg)",
+        default=None
+    )
+    parser.add_argument(
+        "--data_file",
+        help="darknet data file(.data)",
+        default=None
+    )
+    parser.add_argument(
+        "--weights_file",
+        help="darknet weights file(.weights)",
+        default=None
+    )
+
+    return parser
+
+def draw_bboxes(img, outputs, img_info):
+    #assume batch size is 1
+    assert(len(outputs) == 1)
+    outputs = outputs[0]
+    if isinstance(outputs, torch.Tensor):
+        # yoloX
+        # (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
+        outputs = outputs.cpu().numpy()
+        scores = outputs[:, 4] * outputs[:, 5]
+        bboxes = outputs[:, :4]
+    else:
+        # darknet
+        # (x1, y1, x2, y2, bbox_conf, class_pred)
+        scores = outputs[:, 4]
+        bboxes = outputs[:, :4]
+
+    print(outputs.shape)
+    ratio = img_info["ratio"]
+    for ((x1, y1, x2, y2), score) in zip(bboxes, scores):
+        x1, x2 = [x/ratio[0] for x in [x1, x2]]
+        y1, y2 = [y/ratio[1] for y in [y1, y2]]
+        x1, y1, x2, y2 = [int(x) for x in [x1, y1, x2, y2]]
+        if score > args.conf_thresh:
+            print(x1, y1, x2, y2, score)
+            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
+    return img
+
+
+def main(exp, args):
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
+    timer = Timer()
+    img = cv2.imread(args.img_path)
+    predictor = get_predictor(exp, args)
+    outputs, img_info = predictor.inference(img, timer)
+    img = draw_bboxes(img, outputs, img_info)
+    cv2.imwrite(args.output_path, img)
+
+if __name__ == "__main__":
+    args = make_parser().parse_args()
+    exp = get_exp(args.exp_file, args.name)
+
+    main(exp, args)
diff --git a/yolox/tracker/byte_tracker.py b/yolox/tracker/byte_tracker.py
index 2d004599b..ad54f3bdd 100644
--- a/yolox/tracker/byte_tracker.py
+++ b/yolox/tracker/byte_tracker.py
@@ -143,7 +143,7 @@ def __repr__(self):
 
 
 class BYTETracker(object):
-    def __init__(self, args, frame_rate=30):
+    def __init__(self, args, target_class, frame_rate=30, image_keep_ratio=True):
         self.tracked_stracks = []  # type: list[STrack]
         self.lost_stracks = []  # type: list[STrack]
         self.removed_stracks = []  # type: list[STrack]
@@ -155,6 +155,8 @@ def __init__(self, args, frame_rate=30):
         self.buffer_size = int(frame_rate / 30.0 * args.track_buffer)
         self.max_time_lost = self.buffer_size
         self.kalman_filter = KalmanFilter()
+        self.image_keep_ratio = image_keep_ratio
+        self.target_class = target_class
 
     def update(self, output_results, img_info, img_size):
         self.frame_id += 1
@@ -163,16 +165,33 @@ def update(self, output_results, img_info, img_size):
         lost_stracks = []
         removed_stracks = []
 
-        if output_results.shape[1] == 5:
+        if isinstance(output_results, torch.Tensor):
+            output_results = output_results.cpu().numpy()
+        if output_results.shape[1] == 6:
+            # (x1, y1, x2, y2, bbox_conf, class_pred)
+            # extract results where class_pred is target_class
+            output_results = output_results[output_results[:, 5] == self.target_class]
             scores = output_results[:, 4]
             bboxes = output_results[:, :4]
-        else:
-            output_results = output_results.cpu().numpy()
+        elif output_results.shape[1] == 7:
+            # (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
+            # extract results where class_pred is target_class
+            output_results = output_results[output_results[:, 6] == self.target_class]
             scores = output_results[:, 4] * output_results[:, 5]
             bboxes = output_results[:, :4]  # x1y1x2y2
+        else:
+            raise Exception("output_results.shape[1] must be 6 or 7")
         img_h, img_w = img_info[0], img_info[1]
-        scale = min(img_size[0] / float(img_h), img_size[1] / float(img_w))
-        bboxes /= scale
+        if self.image_keep_ratio:
+            scale = min(img_size[0] / float(img_h), img_size[1] / float(img_w))
+            bboxes /= scale
+        else:
+            scale_h = img_size[0] / float(img_h)
+            scale_w = img_size[1] / float(img_w)
+            bboxes[:, 0] /= scale_w
+            bboxes[:, 1] /= scale_h
+            bboxes[:, 2] /= scale_w
+            bboxes[:, 3] /= scale_h
 
         remain_inds = scores > self.args.track_thresh
         inds_low = scores > 0.1