[简体中文](../../../zh-CN/model_zoo/localization/yowo.md) | English

# YOWO

## Content

- [Introduction](#Introduction)
- [Data](#Data)
- [Train](#Train)
- [Test](#Test)
- [Inference](#Inference)
- [Reference](#Reference)


## Introduction

YOWO is a single-stage network with two branches. One branch extracts spatial features of the key frame (i.e., the current frame) via 2D-CNN, while the other branch acquires spatio-temporal features of the clip consisting of previous frames via 3D-CNN. To accurately aggregate these features, YOWO uses a channel fusion and attention mechanism that maximizes the inter-channel dependencies. Finally, the fused features are subjected to frame-level detection.


<div align="center">
<img src="../../../images/yowo.jpg">
</div>
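
To make the description above more concrete, here is a minimal, simplified sketch of the channel fusion and attention step written with PaddlePaddle. It is for illustration only and is not the repository's exact implementation; the tensor shapes and the Gram-matrix form of the channel attention are assumptions based on the paragraph above.

```python
import paddle
import paddle.nn.functional as F

def channel_fusion_attention(feat_2d, feat_3d):
    """Simplified sketch: fuse 2D key-frame features with 3D clip features.

    feat_2d: (B, C_2d, H, W) spatial features of the key frame
    feat_3d: (B, C_3d, H, W) spatio-temporal clip features (temporal dim squeezed)
    """
    # 1. Fuse the two branches along the channel dimension
    fused = paddle.concat([feat_2d, feat_3d], axis=1)       # (B, C, H, W)
    b, c, h, w = fused.shape

    # 2. Channel attention: Gram matrix over flattened spatial positions
    flat = fused.reshape([b, c, h * w])                     # (B, C, N)
    gram = paddle.bmm(flat, flat.transpose([0, 2, 1]))      # (B, C, C) inter-channel dependencies
    attn = F.softmax(gram, axis=-1)

    # 3. Re-weight the channels and add the result back to the fused features
    out = paddle.bmm(attn, flat).reshape([b, c, h, w])
    return fused + out

# Dummy example; the channel counts are illustrative only
f2d = paddle.randn([1, 425, 7, 7])
f3d = paddle.randn([1, 2048, 7, 7])
print(channel_fusion_attention(f2d, f3d).shape)  # [1, 2473, 7, 7]
```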


## Data

For UCF101-24 data download and preparation, please refer to [UCF101-24 data preparation](../../dataset/ucf24.md).


## Train

### UCF101-24 dataset training

#### Download and add pre-trained models

1. Download the pre-trained models [resnext-101-kinetics](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams) and [darknet](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam) as backbone initialization parameters, or download them with the wget commands below:

    ```bash
    wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/darknet.pdparam
    wget -nc https://videotag.bj.bcebos.com/PaddleVideo-release2.3/resnext101_kinetics.pdparams
    ```

2. Open `PaddleVideo/configs/localization/yowo.yaml` and fill in the paths of the downloaded weights after `pretrained_2d:` and `pretrained_3d:` respectively:

    ```yaml
    MODEL:
        framework: "YOWOLocalizer"
        backbone:
            name: "YOWO"
            num_class: 24
            pretrained_2d: fill in the path of the 2D pre-trained model here
            pretrained_3d: fill in the path of the 3D pre-trained model here
    ```

#### Start training

- Training on the UCF101-24 dataset uses a single card. The start command is as follows:

    ```bash
    python3 main.py -c configs/localization/yowo.yaml --validate --seed=1
    ```

- Turn on AMP mixed-precision training to speed up the training process. The start command is as follows:

    ```bash
    python3 main.py --amp -c configs/localization/yowo.yaml --validate --seed=1
    ```

- In addition, you can customize and modify the parameter configuration to train/test on different datasets. It is recommended to name configuration files in the form `model_dataset name_file format_data format_sampling method.yaml`. Please refer to [config](../../tutorials/config.md) for parameter usage.


## Test

- The YOWO model is validated synchronously during training. You can find the keyword `best` in the training log to obtain the model accuracy. An example log entry is as follows:

    ```
    Already save the best model (fsocre)0.8779
    ```

- The metric used in the YOWO model's test mode is **Frame-mAP (@ IoU 0.5)**, which differs from the **fscore** used for validation during training, so the `fscore` recorded in the training log does not represent the final test score. After training is completed, use test mode to evaluate the best model and obtain the final metric. The command is as follows:

    ```bash
    python3 main.py -c configs/localization/yowo.yaml --test --seed=1 -w 'output/YOWO/YOWO_epoch_00005.pdparams'
    ```


    When the test configuration uses the following parameters, the test metrics on the UCF101-24 validation set are as follows:

    | Model | 3D-CNN backbone | 2D-CNN backbone | Dataset | Input | Frame-mAP <br>(@ IoU 0.5) | checkpoints |
    | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: | :-----------: |
    | YOWO | 3D-ResNeXt-101 | Darknet-19 | UCF101-24 | 16 frames, d=1 | 80.94 | [YOWO.pdparams](https://videotag.bj.bcebos.com/PaddleVideo-release2.3/YOWO_epoch_00005.pdparams) |


## Inference

### Export inference model

```bash
python3 tools/export_model.py -c configs/localization/yowo.yaml -p 'output/YOWO/YOWO_epoch_00005.pdparams'
```

The above command will generate the model structure file `YOWO.pdmodel` and the model weight file `YOWO.pdiparams` required for prediction.

- For the meaning of each parameter, please refer to [Model Inference Methods](../../usage.md#2-infer)

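As a supplement to `tools/predict.py` (used in the next subsection), the exported files can also be loaded directly with the Paddle Inference API. The snippet below is only a minimal sketch: the dummy input, its layout `(N, C, T, H, W) = (1, 3, 16, 224, 224)`, and the absence of real preprocessing are assumptions for illustration and may not match the exported model exactly.

```python
import numpy as np
from paddle.inference import Config, create_predictor

# Build a predictor from the exported model structure and weights
config = Config("./inference/YOWO.pdmodel", "./inference/YOWO.pdiparams")
config.disable_gpu()  # or config.enable_use_gpu(8000, 0) if a GPU is available
predictor = create_predictor(config)

# Feed one dummy clip; a real pipeline would decode and normalize 16 frames here.
clip = np.random.rand(1, 3, 16, 224, 224).astype("float32")  # assumed layout, for illustration

input_name = predictor.get_input_names()[0]
predictor.get_input_handle(input_name).copy_from_cpu(clip)

predictor.run()

output_name = predictor.get_output_names()[0]
output = predictor.get_output_handle(output_name).copy_to_cpu()
print(output.shape)  # raw detection tensor, still to be decoded into boxes and scores
```
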
### Use prediction engine inference

- Download the test video [HorseRiding.avi](https://videotag.bj.bcebos.com/Data/HorseRiding.avi) for a quick experience, or fetch it with the wget command below. The downloaded video should be placed in the `data/ucf24` directory:

```bash
wget -nc https://videotag.bj.bcebos.com/Data/HorseRiding.avi
```

- Run the following command for inference:

```bash
python3 tools/predict.py -c configs/localization/yowo.yaml -i 'data/ucf24/HorseRiding.avi' --model_file ./inference/YOWO.pdmodel --params_file ./inference/YOWO.pdiparams
```

- When inference is over, the prediction results will be saved as images in the `inference/YOWO_infer` directory. The image sequence can be converted to a GIF by running the following command to complete the final visualization (a small Python alternative is sketched after the command):

```bash
python3 data/ucf24/visualization.py --frames_dir ./inference/YOWO_infer/HorseRiding --duration 0.04
```
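
If you prefer to assemble the GIF in Python instead of using `visualization.py`, a few lines with `imageio` achieve a similar result. This is only a sketch under assumptions: the frame directory and the `.jpg` extension may need adjusting to match the files actually written to `inference/YOWO_infer`.

```python
import glob
import imageio

# Collect the predicted frames in order (directory and extension are assumptions)
frames = sorted(glob.glob("./inference/YOWO_infer/HorseRiding/*.jpg"))
images = [imageio.imread(f) for f in frames]

# 0.04 s per frame matches the --duration value used above (25 fps)
imageio.mimsave("./inference/YOWO_infer/HorseRiding.gif", images, duration=0.04)
```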

The resulting visualization is as follows:

<div align="center">
  <img src="../../../images/horse_riding.gif" alt="Horse Riding">
</div>

It can be seen that, using the YOWO model trained on UCF101-24 to predict `data/ucf24/HorseRiding.avi`, the predicted category of each output frame is HorseRiding, with a confidence of about 0.80.

## Reference

- [You Only Watch Once: A Unified CNN Architecture for Real-Time Spatiotemporal Action Localization](https://arxiv.org/pdf/1911.06644.pdf), Köpüklü O, Wei X, Rigoll G.