diff --git a/Dockerfile b/Dockerfile index 1848e1f1b..8f25cf380 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,10 +21,10 @@ RUN apt-get update && apt-get install -y \ RUN git clone https://github.com/ifzhang/ByteTrack \ && cd ByteTrack \ - && git checkout 3434c5e8bc6a5ae8ad530528ba8d9a431967f237 \ + && git checkout 8d52fbdf9cd03757d8dd02c0631e526d164bd726 \ && mkdir -p YOLOX_outputs/yolox_x_mix_det/track_vis \ && sed -i 's/torch>=1.7/torch==1.9.1+cu111/g' requirements.txt \ - && sed -i 's/torchvision==0.10.0/torchvision==0.10.1+cu111/g' requirements.txt \ + && sed -i 's/torchvision>=0.10.0/torchvision==0.10.1+cu111/g' requirements.txt \ && sed -i "s/'cuda'/0/g" tools/demo_track.py \ && pip3 install pip --upgrade \ && pip3 install -r requirements.txt -f https://download.pytorch.org/whl/torch_stable.html \ diff --git a/darknet/darknet.py b/darknet/darknet.py new file mode 100644 index 000000000..ebb0eede2 --- /dev/null +++ b/darknet/darknet.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 + +""" +Python 3 wrapper for identifying objects in images + +Running the script requires opencv-python to be installed (`pip install opencv-python`) +Directly viewing or returning bounding-boxed images requires scikit-image to be installed (`pip install scikit-image`) +Use pip3 instead of pip on some systems to be sure to install modules for python3 +""" + +from ctypes import * +import math +import random +import os + + +class BOX(Structure): + _fields_ = [("x", c_float), + ("y", c_float), + ("w", c_float), + ("h", c_float)] + + +class DETECTION(Structure): + _fields_ = [("bbox", BOX), + ("classes", c_int), + ("best_class_idx", c_int), + ("prob", POINTER(c_float)), + ("mask", POINTER(c_float)), + ("objectness", c_float), + ("sort_class", c_int), + ("uc", POINTER(c_float)), + ("points", c_int), + ("embeddings", POINTER(c_float)), + ("embedding_size", c_int), + ("sim", c_float), + ("track_id", c_int)] + +class DETNUMPAIR(Structure): + _fields_ = [("num", c_int), + ("dets", 
POINTER(DETECTION))] + + +class IMAGE(Structure): + _fields_ = [("w", c_int), + ("h", c_int), + ("c", c_int), + ("data", POINTER(c_float))] + + +class METADATA(Structure): + _fields_ = [("classes", c_int), + ("names", POINTER(c_char_p))] + + +def network_width(net): + return lib.network_width(net) + + +def network_height(net): + return lib.network_height(net) + + +def bbox2points(bbox): + """ + From bounding box yolo format + to corner points cv2 rectangle + """ + x, y, w, h = bbox + xmin = int(round(x - (w / 2))) + xmax = int(round(x + (w / 2))) + ymin = int(round(y - (h / 2))) + ymax = int(round(y + (h / 2))) + return xmin, ymin, xmax, ymax + + +def class_colors(names): + """ + Create a dict with one random BGR color for each + class name + """ + return {name: ( + random.randint(0, 255), + random.randint(0, 255), + random.randint(0, 255)) for name in names} + + +def load_network(config_file, data_file, weights, batch_size=1): + """ + load model description and weights from config files + args: + config_file (str): path to .cfg model file + data_file (str): path to .data model file + weights (str): path to weights + returns: + network: trained model + class_names + class_colors + """ + network = load_net_custom( + config_file.encode("ascii"), + weights.encode("ascii"), 0, batch_size) + metadata = load_meta(data_file.encode("ascii")) + class_names = [metadata.names[i].decode("ascii") for i in range(metadata.classes)] + colors = class_colors(class_names) + return network, class_names, colors + + +def print_detections(detections, coordinates=False): + print("\nObjects:") + for label, confidence, bbox in detections: + x, y, w, h = bbox + if coordinates: + print("{}: {}% (left_x: {:.0f} top_y: {:.0f} width: {:.0f} height: {:.0f})".format(label, confidence, x, y, w, h)) + else: + print("{}: {}%".format(label, confidence)) + + +def draw_boxes(detections, image, colors): + import cv2 + for label, confidence, bbox in detections: + left, top, right, bottom = 
bbox2points(bbox) + cv2.rectangle(image, (left, top), (right, bottom), colors[label], 1) + cv2.putText(image, "{} [{:.2f}]".format(label, float(confidence)), + (left, top - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, + colors[label], 2) + return image + + +def decode_detection(detections): + decoded = [] + for label, confidence, bbox in detections: + confidence = str(round(confidence * 100, 2)) + decoded.append((str(label), confidence, bbox)) + return decoded + +# https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/ +# Malisiewicz et al. +def non_max_suppression_fast(detections, overlap_thresh): + boxes = [] + for detection in detections: + _, _, _, (x, y, w, h) = detection + x1 = x - w / 2 + y1 = y - h / 2 + x2 = x + w / 2 + y2 = y + h / 2 + boxes.append(np.array([x1, y1, x2, y2])) + boxes_array = np.array(boxes) + + # initialize the list of picked indexes + pick = [] + # grab the coordinates of the bounding boxes + x1 = boxes_array[:, 0] + y1 = boxes_array[:, 1] + x2 = boxes_array[:, 2] + y2 = boxes_array[:, 3] + # compute the area of the bounding boxes and sort the bounding + # boxes by the bottom-right y-coordinate of the bounding box + area = (x2 - x1 + 1) * (y2 - y1 + 1) + idxs = np.argsort(y2) + # keep looping while some indexes still remain in the indexes + # list + while len(idxs) > 0: + # grab the last index in the indexes list and add the + # index value to the list of picked indexes + last = len(idxs) - 1 + i = idxs[last] + pick.append(i) + # find the largest (x, y) coordinates for the start of + # the bounding box and the smallest (x, y) coordinates + # for the end of the bounding box + xx1 = np.maximum(x1[i], x1[idxs[:last]]) + yy1 = np.maximum(y1[i], y1[idxs[:last]]) + xx2 = np.minimum(x2[i], x2[idxs[:last]]) + yy2 = np.minimum(y2[i], y2[idxs[:last]]) + # compute the width and height of the bounding box + w = np.maximum(0, xx2 - xx1 + 1) + h = np.maximum(0, yy2 - yy1 + 1) + # compute the ratio of overlap + overlap = (w * h) / 
area[idxs[:last]] + # delete all indexes from the index list that have + idxs = np.delete(idxs, np.concatenate(([last], + np.where(overlap > overlap_thresh)[0]))) + # return only the bounding boxes that were picked using the + # integer data type + return [detections[i] for i in pick] + +def remove_negatives(detections, class_names, num): + """ + Remove all classes with 0% confidence within the detection + """ + predictions = [] + for j in range(num): + for idx, name in enumerate(class_names): + if detections[j].prob[idx] > 0: + bbox = detections[j].bbox + bbox = (bbox.x, bbox.y, bbox.w, bbox.h) + predictions.append((name, detections[j].prob[idx], (bbox))) + return predictions + + +def remove_negatives_faster(detections, class_names, num): + """ + Faster version of remove_negatives (very useful when using yolo9000) + """ + predictions = [] + for j in range(num): + if detections[j].best_class_idx == -1: + continue + name = class_names[detections[j].best_class_idx] + bbox = detections[j].bbox + bbox = (bbox.x, bbox.y, bbox.w, bbox.h) + predictions.append((name, detections[j].prob[detections[j].best_class_idx], bbox)) + return predictions + + +def detect_image(network, class_names, image, thresh=.5, hier_thresh=.5, nms=.45): + """ + Returns a list with highest confidence class and their bbox + """ + pnum = pointer(c_int(0)) + predict_image(network, image) + detections = get_network_boxes(network, image.w, image.h, + thresh, hier_thresh, None, 0, pnum, 0) + num = pnum[0] + if nms: + do_nms_sort(detections, num, len(class_names), nms) + predictions = remove_negatives(detections, class_names, num) + predictions = decode_detection(predictions) + free_detections(detections, num) + return sorted(predictions, key=lambda x: x[1]) + + +if os.name == "posix": + cwd = os.path.dirname(__file__) + lib = CDLL(cwd + "/libdarknet.so", RTLD_GLOBAL) +elif os.name == "nt": + cwd = os.path.dirname(__file__) + os.environ['PATH'] = cwd + ';' + os.environ['PATH'] + lib = CDLL("darknet.dll", 
RTLD_GLOBAL) +else: + print("Unsupported OS") + exit + +lib.network_width.argtypes = [c_void_p] +lib.network_width.restype = c_int +lib.network_height.argtypes = [c_void_p] +lib.network_height.restype = c_int + +copy_image_from_bytes = lib.copy_image_from_bytes +copy_image_from_bytes.argtypes = [IMAGE,c_char_p] + +predict = lib.network_predict_ptr +predict.argtypes = [c_void_p, POINTER(c_float)] +predict.restype = POINTER(c_float) + +set_gpu = lib.cuda_set_device +init_cpu = lib.init_cpu + +make_image = lib.make_image +make_image.argtypes = [c_int, c_int, c_int] +make_image.restype = IMAGE + +get_network_boxes = lib.get_network_boxes +get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int), c_int] +get_network_boxes.restype = POINTER(DETECTION) + +make_network_boxes = lib.make_network_boxes +make_network_boxes.argtypes = [c_void_p] +make_network_boxes.restype = POINTER(DETECTION) + +free_detections = lib.free_detections +free_detections.argtypes = [POINTER(DETECTION), c_int] + +free_batch_detections = lib.free_batch_detections +free_batch_detections.argtypes = [POINTER(DETNUMPAIR), c_int] + +free_ptrs = lib.free_ptrs +free_ptrs.argtypes = [POINTER(c_void_p), c_int] + +network_predict = lib.network_predict_ptr +network_predict.argtypes = [c_void_p, POINTER(c_float)] + +reset_rnn = lib.reset_rnn +reset_rnn.argtypes = [c_void_p] + +load_net = lib.load_network +load_net.argtypes = [c_char_p, c_char_p, c_int] +load_net.restype = c_void_p + +load_net_custom = lib.load_network_custom +load_net_custom.argtypes = [c_char_p, c_char_p, c_int, c_int] +load_net_custom.restype = c_void_p + +free_network_ptr = lib.free_network_ptr +free_network_ptr.argtypes = [c_void_p] +free_network_ptr.restype = c_void_p + +do_nms_obj = lib.do_nms_obj +do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float] + +do_nms_sort = lib.do_nms_sort +do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float] + +free_image = 
lib.free_image +free_image.argtypes = [IMAGE] + +letterbox_image = lib.letterbox_image +letterbox_image.argtypes = [IMAGE, c_int, c_int] +letterbox_image.restype = IMAGE + +load_meta = lib.get_metadata +lib.get_metadata.argtypes = [c_char_p] +lib.get_metadata.restype = METADATA + +load_image = lib.load_image_color +load_image.argtypes = [c_char_p, c_int, c_int] +load_image.restype = IMAGE + +rgbgr_image = lib.rgbgr_image +rgbgr_image.argtypes = [IMAGE] + +predict_image = lib.network_predict_image +predict_image.argtypes = [c_void_p, IMAGE] +predict_image.restype = POINTER(c_float) + +predict_image_letterbox = lib.network_predict_image_letterbox +predict_image_letterbox.argtypes = [c_void_p, IMAGE] +predict_image_letterbox.restype = POINTER(c_float) + +network_predict_batch = lib.network_predict_batch +network_predict_batch.argtypes = [c_void_p, IMAGE, c_int, c_int, c_int, + c_float, c_float, POINTER(c_int), c_int, c_int] +network_predict_batch.restype = POINTER(DETNUMPAIR) diff --git a/darknet/libdarknet.so b/darknet/libdarknet.so new file mode 100755 index 000000000..a1078b9ee Binary files /dev/null and b/darknet/libdarknet.so differ diff --git a/exps/example/mot/yolov4.py b/exps/example/mot/yolov4.py new file mode 100644 index 000000000..187f20d05 --- /dev/null +++ b/exps/example/mot/yolov4.py @@ -0,0 +1,147 @@ +# encoding: utf-8 +import os +import random +import torch +import torch.nn as nn +import torch.distributed as dist + +from yolox.exp import Exp as MyExp +from yolox.data import get_yolox_datadir +import darknet.darknet as darknet +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.num_classes = 1 + self.depth = 1.33 + self.width = 1.25 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.train_ann = "train.json" + self.val_ann = "test.json" # change to train.json when running on training set + self.input_size = (608, 608) + self.test_size = (608, 608) + self.random_size = (20, 36) + self.max_epoch = 
80 + self.print_interval = 20 + self.eval_interval = 5 + self.test_conf = 0.001 + self.nmsthre = 0.7 + self.no_aug_epochs = 10 + self.basic_lr_per_img = 0.001 / 64.0 + self.warmup_epochs = 1 + + def get_model(self, config_file, data_file, weights_file): + network, class_names, class_colors = darknet.load_network( + config_file, + data_file, + weights_file, + batch_size=1 + ) + return network, class_names, class_colors + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + MOTDataset, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = MOTDataset( + data_dir=os.path.join(get_yolox_datadir(), "mix_mot20_ch"), + json_file=self.train_ann, + name='', + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=600, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=1200, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import MOTDataset, ValTransform + + valdataset = MOTDataset( + 
data_dir=os.path.join(get_yolox_datadir(), "MOT20"), + json_file=self.val_ann, + img_size=self.test_size, + name='test', # change to train when running on training set + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import COCOEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = COCOEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + testdev=testdev, + ) + return evaluator diff --git a/exps/example/mot/yolov4_tiny_signate.py b/exps/example/mot/yolov4_tiny_signate.py new file mode 100644 index 000000000..4ad46d37e --- /dev/null +++ b/exps/example/mot/yolov4_tiny_signate.py @@ -0,0 +1,147 @@ +# encoding: utf-8 +import os +import random +import torch +import torch.nn as nn +import torch.distributed as dist + +from yolox.exp import Exp as MyExp +from yolox.data import get_yolox_datadir +import darknet.darknet as darknet +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.num_classes = 1 + self.depth = 1.33 + self.width = 1.25 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.train_ann = "train.json" + self.val_ann = "test.json" # change to train.json when running on training set + self.input_size = (416, 416) + self.test_size = (416, 
416) + self.random_size = (20, 36) + self.max_epoch = 80 + self.print_interval = 20 + self.eval_interval = 5 + self.test_conf = 0.001 + self.nmsthre = 0.7 + self.no_aug_epochs = 10 + self.basic_lr_per_img = 0.001 / 64.0 + self.warmup_epochs = 1 + + def get_model(self, config_file, data_file, weights_file): + network, class_names, class_colors = darknet.load_network( + config_file, + data_file, + weights_file, + batch_size=1 + ) + return network, class_names, class_colors + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + MOTDataset, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = MOTDataset( + data_dir=os.path.join(get_yolox_datadir(), "mix_mot20_ch"), + json_file=self.train_ann, + name='', + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=600, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=1200, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import 
MOTDataset, ValTransform + + valdataset = MOTDataset( + data_dir=os.path.join(get_yolox_datadir(), "MOT20"), + json_file=self.val_ann, + img_size=self.test_size, + name='test', # change to train when running on training set + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import COCOEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = COCOEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + testdev=testdev, + ) + return evaluator diff --git a/tools/demo_track.py b/tools/demo_track.py index 4f4e7dc3a..5c2d9bf89 100644 --- a/tools/demo_track.py +++ b/tools/demo_track.py @@ -13,7 +13,7 @@ from yolox.utils.visualize import plot_tracking from yolox.tracker.byte_tracker import BYTETracker from yolox.tracking_utils.timer import Timer - +from tools.predictor import Predictor, get_predictor IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"] @@ -26,6 +26,8 @@ def make_parser(): parser.add_argument("-expn", "--experiment-name", type=str, default=None) parser.add_argument("-n", "--name", type=str, default=None, help="model name") + parser.add_argument("--model_type", type=str, default="yolox", choices=["yolox", "darknet"]) + parser.add_argument( #"--path", 
default="./datasets/mot/train/MOT17-05-FRCNN/img1", help="path to images or video" "--path", default="./videos/palace.mp4", help="path to images or video" @@ -77,16 +79,37 @@ def make_parser(): action="store_true", help="Using TensorRT model for testing.", ) + parser.add_argument( + "--config_file", + help="darknet config file(.cfg)", + default=None + ) + parser.add_argument( + "--data_file", + help="darknet data file(.data)", + default=None + ) + parser.add_argument( + "--weights_file", + help="darknet weights file(.weights)", + default=None + ) # tracking args parser.add_argument("--track_thresh", type=float, default=0.5, help="tracking confidence threshold") parser.add_argument("--track_buffer", type=int, default=30, help="the frames for keep lost tracks") parser.add_argument("--match_thresh", type=float, default=0.8, help="matching threshold for tracking") parser.add_argument( "--aspect_ratio_thresh", type=float, default=1.6, - help="threshold for filtering out boxes of which aspect ratio are above the given value." + help="threshold for filtering out boxes of which aspect ratio are above the given value. -1 means not to use filtering." 
) parser.add_argument('--min_box_area', type=float, default=10, help='filter out tiny boxes') parser.add_argument("--mot20", dest="mot20", default=False, action="store_true", help="test mot20.") + parser.add_argument( + "--tracking_classes", + nargs="+", + type=int, + default=[0] + ) return parser @@ -114,66 +137,6 @@ def write_results(filename, results): logger.info('save results to {}'.format(filename)) -class Predictor(object): - def __init__( - self, - model, - exp, - trt_file=None, - decoder=None, - device=torch.device("cpu"), - fp16=False - ): - self.model = model - self.decoder = decoder - self.num_classes = exp.num_classes - self.confthre = exp.test_conf - self.nmsthre = exp.nmsthre - self.test_size = exp.test_size - self.device = device - self.fp16 = fp16 - if trt_file is not None: - from torch2trt import TRTModule - - model_trt = TRTModule() - model_trt.load_state_dict(torch.load(trt_file)) - - x = torch.ones((1, 3, exp.test_size[0], exp.test_size[1]), device=device) - self.model(x) - self.model = model_trt - self.rgb_means = (0.485, 0.456, 0.406) - self.std = (0.229, 0.224, 0.225) - - def inference(self, img, timer): - img_info = {"id": 0} - if isinstance(img, str): - img_info["file_name"] = osp.basename(img) - img = cv2.imread(img) - else: - img_info["file_name"] = None - - height, width = img.shape[:2] - img_info["height"] = height - img_info["width"] = width - img_info["raw_img"] = img - - img, ratio = preproc(img, self.test_size, self.rgb_means, self.std) - img_info["ratio"] = ratio - img = torch.from_numpy(img).unsqueeze(0).float().to(self.device) - if self.fp16: - img = img.half() # to FP16 - - with torch.no_grad(): - timer.tic() - outputs = self.model(img) - if self.decoder is not None: - outputs = self.decoder(outputs, dtype=outputs.type()) - outputs = postprocess( - outputs, self.num_classes, self.confthre, self.nmsthre - ) - #logger.info("Infer time: {:.4f}s".format(time.time() - t0)) - return outputs, img_info - def image_demo(predictor, 
vis_folder, current_time, args): if osp.isdir(args.path): @@ -196,7 +159,7 @@ def image_demo(predictor, vis_folder, current_time, args): tlwh = t.tlwh tid = t.track_id vertical = tlwh[2] / tlwh[3] > args.aspect_ratio_thresh - if tlwh[2] * tlwh[3] > args.min_box_area and not vertical: + if tlwh[2] * tlwh[3] > args.min_box_area and (not vertical or args.aspect_ratio_thresh < 0): online_tlwhs.append(tlwh) online_ids.append(tid) online_scores.append(t.score) @@ -233,15 +196,16 @@ def image_demo(predictor, vis_folder, current_time, args): logger.info(f"save results to {res_file}") -def imageflow_demo(predictor, vis_folder, current_time, args): - cap = cv2.VideoCapture(args.path if args.demo == "video" else args.camid) +def imageflow_demo(predictor, vis_folder, current_time, args, save_folder=None): + cap = cv2.VideoCapture(args.path if "video" in args.demo else args.camid) width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float fps = cap.get(cv2.CAP_PROP_FPS) timestamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time) - save_folder = osp.join(vis_folder, timestamp) + if save_folder is None: + save_folder = osp.join(vis_folder, timestamp) os.makedirs(save_folder, exist_ok=True) - if args.demo == "video": + if "video" in args.demo: save_path = osp.join(save_folder, args.path.split("/")[-1]) else: save_path = osp.join(save_folder, "camera.mp4") @@ -249,7 +213,10 @@ def imageflow_demo(predictor, vis_folder, current_time, args): vid_writer = cv2.VideoWriter( save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) ) - tracker = BYTETracker(args, frame_rate=30) + image_keep_ratio = True + if args.model_type == "darknet": + image_keep_ratio = False + trackers = [BYTETracker(args, target_class=i, frame_rate=args.fps, image_keep_ratio=image_keep_ratio) for i in args.tracking_classes] timer = Timer() frame_id = 0 results = [] @@ -259,26 +226,35 @@ def imageflow_demo(predictor, vis_folder, current_time, args):
ret_val, frame = cap.read() if ret_val: outputs, img_info = predictor.inference(frame, timer) - if outputs[0] is not None: - online_targets = tracker.update(outputs[0], [img_info['height'], img_info['width']], exp.test_size) - online_tlwhs = [] - online_ids = [] - online_scores = [] - for t in online_targets: - tlwh = t.tlwh - tid = t.track_id - vertical = tlwh[2] / tlwh[3] > args.aspect_ratio_thresh - if tlwh[2] * tlwh[3] > args.min_box_area and not vertical: - online_tlwhs.append(tlwh) - online_ids.append(tid) - online_scores.append(t.score) - results.append( - f"{frame_id},{tid},{tlwh[0]:.2f},{tlwh[1]:.2f},{tlwh[2]:.2f},{tlwh[3]:.2f},{t.score:.2f},-1,-1,-1\n" - ) - timer.toc() - online_im = plot_tracking( - img_info['raw_img'], online_tlwhs, online_ids, frame_id=frame_id + 1, fps=1. / timer.average_time - ) + if outputs[0] is not None and len(outputs[0]) > 0: + tracking_results = [] + for tracker in trackers: + target_class = tracker.target_class + online_targets = tracker.update(outputs[0], [img_info['height'], img_info['width']], exp.test_size) + online_tlwhs = [] + online_ids = [] + online_scores = [] + for t in online_targets: + tlwh = t.tlwh + tid = t.track_id + # args.aspect_ratio_thresh < 0 means not to use aspect ratio filtering + vertical = tlwh[2] / tlwh[3] > args.aspect_ratio_thresh + ok_aspect_ratio = ((not vertical) or args.aspect_ratio_thresh < 0) + ok_box_area = tlwh[2] * tlwh[3] > args.min_box_area + if ok_aspect_ratio and ok_box_area: + online_tlwhs.append(tlwh) + online_ids.append(tid) + online_scores.append(t.score) + results.append( + f"{frame_id},{tid}, {target_class}, {tlwh[0]:.2f},{tlwh[1]:.2f},{tlwh[2]:.2f},{tlwh[3]:.2f},{t.score:.2f},-1,-1,-1\n" + ) + tracking_results.append((online_tlwhs, online_ids, online_scores)) + timer.toc() + online_im = img_info['raw_img'].copy() + for (online_tlwhs, online_ids, online_scores) in tracking_results: + online_im = plot_tracking( + online_im, online_tlwhs, online_ids, frame_id=frame_id + 1,
fps=1. / timer.average_time + ) else: timer.toc() online_im = img_info['raw_img'] @@ -292,7 +268,7 @@ def imageflow_demo(predictor, vis_folder, current_time, args): frame_id += 1 if args.save_result: - res_file = osp.join(vis_folder, f"{timestamp}.txt") + res_file = osp.join(save_folder, args.path.split("/")[-1].replace(".mp4", ".txt")) with open(res_file, 'w') as f: f.writelines(results) logger.info(f"save results to {res_file}") @@ -309,10 +285,6 @@ def main(exp, args): vis_folder = osp.join(output_dir, "track_vis") os.makedirs(vis_folder, exist_ok=True) - if args.trt: - args.device = "gpu" - args.device = torch.device("cuda" if args.device == "gpu" else "cpu") - logger.info("Args: {}".format(args)) if args.conf is not None: @@ -322,51 +294,26 @@ def main(exp, args): if args.tsize is not None: exp.test_size = (args.tsize, args.tsize) - model = exp.get_model().to(args.device) - logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size))) - model.eval() - - if not args.trt: - if args.ckpt is None: - ckpt_file = osp.join(output_dir, "best_ckpt.pth.tar") - else: - ckpt_file = args.ckpt - logger.info("loading checkpoint") - ckpt = torch.load(ckpt_file, map_location="cpu") - # load the model state dict - model.load_state_dict(ckpt["model"]) - logger.info("loaded checkpoint done.") - - if args.fuse: - logger.info("\tFusing model...") - model = fuse_model(model) - - if args.fp16: - model = model.half() # to FP16 - - if args.trt: - assert not args.fuse, "TensorRT model is not support model fusing!" - trt_file = osp.join(output_dir, "model_trt.pth") - assert osp.exists( - trt_file - ), "TensorRT model is not found!\n Run python3 tools/trt.py first!" 
-        model.head.decode_in_inference = False
-        decoder = model.head.decode_outputs
-        logger.info("Using TensorRT to inference")
-    else:
-        trt_file = None
-        decoder = None
-
-    predictor = Predictor(model, exp, trt_file, decoder, args.device, args.fp16)
+    predictor = get_predictor(exp, args)
     current_time = time.localtime()
     if args.demo == "image":
         image_demo(predictor, vis_folder, current_time, args)
     elif args.demo == "video" or args.demo == "webcam":
         imageflow_demo(predictor, vis_folder, current_time, args)
+    elif args.demo == "video_multiple":
+        import glob
+        videos = glob.glob(osp.join(args.path, "*.mp4"))
+        logger.info(f"Found {len(videos)} videos")
+        timestamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
+        save_base_folder = osp.join(vis_folder, timestamp)
+        for video in videos:
+            print(video)
+            args.path = video
+            save_folder = osp.join(save_base_folder, osp.basename(video).replace('.mp4', ''))
+            imageflow_demo(predictor, vis_folder, current_time, args, save_folder)
 
 
 if __name__ == "__main__":
     args = make_parser().parse_args()
     exp = get_exp(args.exp_file, args.name)
 
-    main(exp, args)
diff --git a/tools/make_submit.py b/tools/make_submit.py
new file mode 100644
index 000000000..70cebc429
--- /dev/null
+++ b/tools/make_submit.py
@@ -0,0 +1,94 @@
+import json
+import argparse
+import glob
+import os
+
+
+VIDEO_NUM = 74
+FRAME_WIDTH = 1936
+FRAME_HEIGHT = 1216
+TEST_VIDEO_FRAME_NUM = 150 #5fps * 30sec
+
+class Bbox:
+    def __init__(self, line):
+        elems = line.split(',')
+        elems = [int(float(x.strip())) for x in elems]
+        self.frame_id = elems[0]
+        self.object_id = elems[1]
+        self.category = elems[2]
+        # x1, y1, w, h
+        self.x1 = max(0, elems[3])
+        self.y1 = max(0, elems[4])
+        self.x2 = min(FRAME_WIDTH, elems[3] + elems[5])
+        self.y2 = min(FRAME_HEIGHT, elems[4] + elems[6])
+
+    def to_bboxdic(self):
+        return {
+            "id": self.object_id,
+            "box2d": [self.x1, self.y1, self.x2, self.y2]
+        }
+    @staticmethod
+    def convert_line_to_bboxes(lines):
+        return [Bbox(line) for line in lines]
+
+def count_tracked_frames(bboxes):
+    count_dic = {}
+    for bbox in bboxes:
+        key = (bbox.object_id, bbox.category)
+        if key not in count_dic:
+            count_dic[key] = 0
+        count_dic[key] += 1
+    return count_dic
+
+def remove_few_frame_bboxes(count_dic, bboxes):
+    new_bboxes = []
+    for bbox in bboxes:
+        key = (bbox.object_id, bbox.category)
+        if count_dic[key] < 3:
+            print(f"Removed object_id={bbox.object_id}, category={bbox.category}")
+        else:
+            new_bboxes.append(bbox)
+    return new_bboxes
+
+def bboxes_to_dic(bboxes):
+    # readme.txt on SIGNATE says:
+    # If you do not want to make any prediction in some frames, just write "{}" in the corresponding frames.
+
+    result_list = [{} for i in range(TEST_VIDEO_FRAME_NUM)]
+    for bbox in bboxes:
+        assert(bbox.frame_id < TEST_VIDEO_FRAME_NUM)
+        category_name = ["Car", "Pedestrian"][bbox.category]
+        if category_name not in result_list[bbox.frame_id]:
+            result_list[bbox.frame_id][category_name] = []
+        result_list[bbox.frame_id][category_name].append(bbox.to_bboxdic())
+    return result_list
+
+def process(txt_file_path):
+    bboxes = Bbox.convert_line_to_bboxes(open(txt_file_path).readlines())
+    # 1. count number of tracked frames for each tracked id
+    count_dic = count_tracked_frames(bboxes)
+    # 2. remove bboxes whose tracked frame num is less than 3
+    bboxes = remove_few_frame_bboxes(count_dic, bboxes)
+    # 3. convert to list of dictionary
+    result_list = bboxes_to_dic(bboxes)
+    return result_list
+
+def main(args):
+    txt_file_paths = glob.glob(os.path.join(args.result_dir, "*/test_*.txt"))
+    txt_file_paths = sorted(txt_file_paths)
+    assert(len(txt_file_paths) == VIDEO_NUM)
+
+    result_dic = {}
+    for txt_file_path in txt_file_paths:
+        video_name = os.path.basename(txt_file_path).replace(".txt", ".mp4")
+        result_dic[video_name] = process(txt_file_path)
+
+    open(args.output_path, "w").write(json.dumps(result_dic))
+    print("Done!")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--result_dir", type=str)
+    parser.add_argument("--output_path", type=str, default="./submit.json")
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file
diff --git a/tools/predictor.py b/tools/predictor.py
new file mode 100644
index 000000000..340a12118
--- /dev/null
+++ b/tools/predictor.py
@@ -0,0 +1,182 @@
+import os.path as osp
+import os
+from loguru import logger
+from yolox.data.data_augment import preproc
+from yolox.utils import fuse_model, get_model_info, postprocess
+
+import torch
+import cv2
+import numpy as np
+
+class Predictor(object):
+    def __init__(
+        self,
+        model,
+        exp,
+        trt_file=None,
+        decoder=None,
+        device=torch.device("cpu"),
+        fp16=False
+    ):
+        self.model = model
+        self.decoder = decoder
+        self.num_classes = exp.num_classes
+        self.confthre = exp.test_conf
+        self.nmsthre = exp.nmsthre
+        self.test_size = exp.test_size
+        self.device = device
+        self.fp16 = fp16
+        if trt_file is not None:
+            from torch2trt import TRTModule
+
+            model_trt = TRTModule()
+            model_trt.load_state_dict(torch.load(trt_file))
+
+            x = torch.ones((1, 3, exp.test_size[0], exp.test_size[1]), device=device)
+            self.model(x)
+            self.model = model_trt
+        self.rgb_means = (0.485, 0.456, 0.406)
+        self.std = (0.229, 0.224, 0.225)
+
+    def inference(self, img, timer):
+        img_info = {"id": 0}
+        if isinstance(img, str):
+            img_info["file_name"] = osp.basename(img)
+            img = cv2.imread(img)
+        else:
+            img_info["file_name"] = None
+
+        height, width = img.shape[:2]
+        img_info["height"] = height
+        img_info["width"] = width
+        img_info["raw_img"] = img
+
+        img, ratio = preproc(img, self.test_size, self.rgb_means, self.std)
+        img_info["ratio"] = (ratio, ratio)
+        img = torch.from_numpy(img).unsqueeze(0).float().to(self.device)
+        if self.fp16:
+            img = img.half()  # to FP16
+
+        with torch.no_grad():
+            timer.tic()
+            outputs = self.model(img)
+            if self.decoder is not None:
+                outputs = self.decoder(outputs, dtype=outputs.type())
+            outputs = postprocess(
+                outputs, self.num_classes, self.confthre, self.nmsthre
+            )
+            #logger.info("Infer time: {:.4f}s".format(time.time() - t0))
+        return outputs, img_info
+
+import darknet.darknet as darknet
+
+class YOLOv4Predictor(Predictor):
+    def __init__(
+        self,
+        network,
+        class_names,
+        class_colors,
+        exp
+    ):
+        self.confthre = exp.test_conf
+        self.nmsthre = exp.nmsthre
+        self.network = network
+        self.class_names = class_names
+        self.class_colors = class_colors
+        self.class_name_dic = {x: i for i, x in enumerate(class_names)}
+
+
+    def inference(self, img, timer):
+        net_width = darknet.network_width(self.network)
+        net_height = darknet.network_height(self.network)
+
+        # img_info
+        img_info = {"id": 0}
+        img_info["file_name"] = None
+        height, width = img.shape[:2]
+        img_info["height"] = height
+        img_info["width"] = width
+        img_info["raw_img"] = img
+        img_info["ratio"] = (net_width/width, net_height/height)
+        # preprocessing
+        darknet_image = darknet.make_image(net_width, net_height, 3)
+
+        image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        image_resized = cv2.resize(image_rgb, (net_width, net_height),
+                                   interpolation=cv2.INTER_LINEAR)
+        darknet.copy_image_from_bytes(darknet_image, image_resized.tobytes())
+        # inference
+        timer.tic()
+        detections = darknet.detect_image(self.network, self.class_names, darknet_image, thresh=self.confthre, nms=self.nmsthre)
+        darknet.free_image(darknet_image)
+
+        outputs = []
+        for label, confidence, bbox in detections:
+            class_id = self.class_name_dic[label]
+            left, top, right, bottom = darknet.bbox2points(bbox)
+            confidence = float(confidence)/100.0
+            outputs.append([left, top, right, bottom, confidence, class_id])
+        # wrap as batched list
+        outputs = [outputs]
+        outputs = np.array(outputs)
+        return outputs, img_info
+
+
+def get_yolox_predictor(exp, args, output_dir):
+    if isinstance(args.device, str):
+        args.device = torch.device("cuda" if args.device == "gpu" else "cpu")
+
+    model = exp.get_model().to(args.device)
+    logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
+    model.eval()
+
+    if not args.trt:
+        if args.ckpt is None:
+            ckpt_file = osp.join(output_dir, "best_ckpt.pth.tar")
+        else:
+            ckpt_file = args.ckpt
+        logger.info("loading checkpoint")
+        ckpt = torch.load(ckpt_file, map_location="cpu")
+        # load the model state dict
+        model.load_state_dict(ckpt["model"])
+        logger.info("loaded checkpoint done.")
+
+    if args.fuse:
+        logger.info("\tFusing model...")
+        model = fuse_model(model)
+
+    if args.fp16:
+        model = model.half()  # to FP16
+
+    if args.trt:
+        assert not args.fuse, "TensorRT model is not support model fusing!"
+        trt_file = osp.join(output_dir, "model_trt.pth")
+        assert osp.exists(
+            trt_file
+        ), "TensorRT model is not found!\n Run python3 tools/trt.py first!"
+        model.head.decode_in_inference = False
+        decoder = model.head.decode_outputs
+        logger.info("Using TensorRT to inference")
+    else:
+        trt_file = None
+        decoder = None
+
+    return Predictor(model, exp, trt_file=trt_file, decoder=decoder, device=args.device, fp16=args.fp16)
+
+def get_darknet_predictor(exp, args):
+    network, class_names, class_colors = exp.get_model(
+        args.config_file,
+        args.data_file,
+        args.weights_file
+    )
+    return YOLOv4Predictor(network, class_names, class_colors, exp)
+
+def get_predictor(exp, args) -> Predictor:
+    output_dir = osp.join(exp.output_dir, args.experiment_name)
+    os.makedirs(output_dir, exist_ok=True)
+    if args.model_type == 'yolox':
+        return get_yolox_predictor(exp, args, output_dir)
+    elif args.model_type == 'darknet':
+        return get_darknet_predictor(exp, args)
+    else:
+        raise Exception("unreachable")
diff --git a/tools/test_predictor.py b/tools/test_predictor.py
new file mode 100644
index 000000000..5d9d1f7f3
--- /dev/null
+++ b/tools/test_predictor.py
@@ -0,0 +1,130 @@
+from tools.predictor import get_predictor
+from yolox.exp import get_exp
+from yolox.tracking_utils.timer import Timer
+import cv2
+import torch
+import argparse
+import numpy as np
+
+def make_parser():
+    parser = argparse.ArgumentParser("Test Predictor!")
+    # parser.add_argument(
+    #     "demo", default="image", help="demo type, eg. image, video and webcam"
+    # )
+    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
+
+    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+
+    # parser.add_argument(
+    #     "--path", default="./videos/palace.mp4", help="path to images or video"
+    # )
+    parser.add_argument("--model_type", type=str, default="yolox", choices=["yolox", "darknet"])
+    parser.add_argument(
+        "--img_path", default="./images/image_0090.jpg"
+    )
+    parser.add_argument(
+        "--output_path", default="./bboxes.jpg"
+    )
+    parser.add_argument(
+        "--device",
+        default="gpu",
+        type=str,
+        help="device to run our model, can either be cpu or gpu",
+    )
+    parser.add_argument(
+        "-f",
+        "--exp_file",
+        default=None,
+        type=str,
+        help="pls input your expriment description file",
+    )
+    parser.add_argument(
+        "--trt",
+        dest="trt",
+        default=False,
+        action="store_true",
+        help="Using TensorRT model for testing.",
+    )
+    parser.add_argument(
+        "--fp16",
+        dest="fp16",
+        default=False,
+        action="store_true",
+        help="Adopting mix precision evaluating.",
+    )
+    parser.add_argument(
+        "--fuse",
+        dest="fuse",
+        default=False,
+        action="store_true",
+        help="Fuse conv and bn for testing.",
+    )
+    parser.add_argument(
+        "--conf_thresh",
+        default=0.75,
+        type=float,
+        help="threshold for drawing detection box"
+    )
+    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
+    parser.add_argument(
+        "--config_file",
+        help="darknet config file(.cfg)",
+        default=None
+    )
+    parser.add_argument(
+        "--data_file",
+        help="darknet data file(.data)",
+        default=None
+    )
+    parser.add_argument(
+        "--weights_file",
+        help="darknet weights file(.weights)",
+        default=None
+    )
+
+    return parser
+
+def draw_bboxes(img, outputs, img_info):
+    #assume batch size is 1
+    assert(len(outputs) == 1)
+    outputs = outputs[0]
+    if isinstance(outputs, torch.Tensor):
+        # yoloX
+        # (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
+        outputs = outputs.cpu().numpy()
+        scores = outputs[:, 4] * outputs[:, 5]
+        bboxes = outputs[:, :4]
+    else:
+        # darknet
+        # (x1, y1, x2, y2, bbox_conf, class_pred)
+        scores = outputs[:, 4]
+        bboxes = outputs[:, :4]
+
+    print(outputs.shape)
+    ratio = img_info["ratio"]
+    for ((x1, y1, x2, y2), score) in zip(bboxes, scores):
+        x1, x2 = [x/ratio[0] for x in [x1, x2]]
+        y1, y2 = [y/ratio[1] for y in [y1, y2]]
+        x1, y1, x2, y2 = [int(x) for x in [x1, y1, x2, y2]]
+        if score > args.conf_thresh:
+            print(x1, y1, x2, y2, score)
+            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
+    return img
+
+
+def main(exp, args):
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
+    timer = Timer()
+    img = cv2.imread(args.img_path)
+    predictor = get_predictor(exp, args)
+    outputs, img_info = predictor.inference(img, timer)
+    img = draw_bboxes(img, outputs, img_info)
+    cv2.imwrite(args.output_path, img)
+
+if __name__ == "__main__":
+    args = make_parser().parse_args()
+    exp = get_exp(args.exp_file, args.name)
+
+    main(exp, args)
diff --git a/yolox/tracker/byte_tracker.py b/yolox/tracker/byte_tracker.py
index 2d004599b..ad54f3bdd 100644
--- a/yolox/tracker/byte_tracker.py
+++ b/yolox/tracker/byte_tracker.py
@@ -143,7 +143,7 @@ def __repr__(self):
 
 
 class BYTETracker(object):
-    def __init__(self, args, frame_rate=30):
+    def __init__(self, args, target_class, frame_rate=30, image_keep_ratio=True):
         self.tracked_stracks = []  # type: list[STrack]
         self.lost_stracks = []  # type: list[STrack]
         self.removed_stracks = []  # type: list[STrack]
@@ -155,6 +155,8 @@ def __init__(self, args, frame_rate=30):
         self.buffer_size = int(frame_rate / 30.0 * args.track_buffer)
         self.max_time_lost = self.buffer_size
         self.kalman_filter = KalmanFilter()
+        self.image_keep_ratio = image_keep_ratio
+        self.target_class = target_class
 
     def update(self, output_results, img_info, img_size):
         self.frame_id += 1
@@ -163,16 +165,33 @@ def update(self, output_results, img_info, img_size):
         lost_stracks = []
         removed_stracks = []
 
-        if output_results.shape[1] == 5:
+        if isinstance(output_results, torch.Tensor):
+            output_results = output_results.cpu().numpy()
+        if output_results.shape[1] == 6:
+            # (x1, y1, x2, y2, bbox_conf, class_pred)
+            # extract results where class_pred is target_class
+            output_results = output_results[output_results[:, 5] == self.target_class]
             scores = output_results[:, 4]
             bboxes = output_results[:, :4]
-        else:
-            output_results = output_results.cpu().numpy()
+        elif output_results.shape[1] == 7:
+            # (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
+            # extract results where class_pred is target_class
+            output_results = output_results[output_results[:, 6] == self.target_class]
             scores = output_results[:, 4] * output_results[:, 5]
             bboxes = output_results[:, :4]  # x1y1x2y2
+        else:
+            raise Exception("output_results.shape[1] must be 6 or 7")
         img_h, img_w = img_info[0], img_info[1]
-        scale = min(img_size[0] / float(img_h), img_size[1] / float(img_w))
-        bboxes /= scale
+        if self.image_keep_ratio:
+            scale = min(img_size[0] / float(img_h), img_size[1] / float(img_w))
+            bboxes /= scale
+        else:
+            scale_h = img_size[0] / float(img_h)
+            scale_w = img_size[1] / float(img_w)
+            bboxes[:, 0] /= scale_w
+            bboxes[:, 1] /= scale_h
+            bboxes[:, 2] /= scale_w
+            bboxes[:, 3] /= scale_h
 
         remain_inds = scores > self.args.track_thresh
         inds_low = scores > 0.1