Video processing with YOLOv4 and TensorFlow

In this blog we will show how to process video with YOLOv4 and tensorflow. YOLOv4 is out and it’s hot. YOLOv4 is significantly better than YOLOv3 as can be seen in the pic below.

YOLOv4 video processing with TensorFlow


YOLO v4 Overview

YOLOv4 uses several universal features, like Weighted-Residual-Connections (WRC), Cross-Stage-Partial-connections (CSP), Cross mini-Batch Normalization (CmBN), Self-adversarial-training (SAT) and Mish activation, together with new features such as Mosaic data augmentation, DropBlock regularization, and CIoU loss. This combination of known and new features has enabled the authors to achieve 43.5%
AP (65.7% AP50) for the MS COCO dataset at a realtime speed of ∼65 FPS on Tesla V100. It is faster and more accurate than YOLOv3 and faster than EfficientDet for similar accuracies.

The authors have tried to design a model that can be trained efficiently on a single GPU. It is optimised to work well in production systems. Users can train and implement YOLOv4 based programs on single GPU systems, keeping the cost low. For more details please see the YOLOv4 paper.

Video processing with YOLO v4 and TensorFlow

We will use an existing open-source implementation of YOLO in Python and TensorFlow in our work. The GitHub project provides implementations of both YOLOv3 and YOLOv4. It also has methods to convert YOLO weights files to tflite (TensorFlow Lite models). TensorFlow Lite models are smaller and can be run faster, at a cost of accuracy. We can also use TensorFlow Lite models on edge devices such as mobile phones.

We will first create a development environment using virtualenv.

mkvirtualenv tf-yolo

Clone the project.

git clone https://github.com/hunglc007/tensorflow-yolov4-tflite.git
cd tensorflow-yolov4-tflite

Install the requirements file.

pip install -r requirements.txt

I found an error in the repository, which was fixed by creating an empty __init__.py file in the core directory.

touch core/__init__.py

I have made a new file for processing video using the existing code to process image.

import time

import cv2
import numpy as np
import tensorflow as tf
from absl import app, flags, logging
from absl.flags import FLAGS
from PIL import Image

import imutils
from imutils.video import FPS
from imutils.video import VideoStream

import core
import core.utils as utils
from core.config import cfg
from core.yolov4 import YOLOv4, YOLOv3, YOLOv3_tiny, decode

# Command-line flags: model variant, weights, input/output paths and sizing.
flags.DEFINE_string('framework', 'tf', '(tf, tflite)')
flags.DEFINE_string('weights', '../darknet/yolov4.weights',
                    'path to weights file')
flags.DEFINE_integer('size', 608, 'resize images to')
flags.DEFINE_boolean('tiny', False, 'yolo or yolo-tiny')
flags.DEFINE_string('model', 'yolov4', 'yolov3 or yolov4')
flags.DEFINE_string('video', './data/road.mp4', 'path to input video')
flags.DEFINE_string('output', 'out.avi', 'path to output video')

def main(_argv):
    """Run YOLO object detection over a video and save/display the result.

    Reads frames from FLAGS.video, runs them through a YOLOv3/YOLOv4
    model (TensorFlow Keras or TFLite, per FLAGS.framework), draws the
    detected bounding boxes, shows each annotated frame in a window and
    writes it to FLAGS.output. Processes at most 100 frames, or fewer if
    the stream ends or the user presses 'q'.
    """
    # Select strides/anchors matching the chosen model variant.
    if FLAGS.tiny:
        STRIDES = np.array(cfg.YOLO.STRIDES_TINY)
        ANCHORS = utils.get_anchors(cfg.YOLO.ANCHORS_TINY, FLAGS.tiny)
    else:
        STRIDES = np.array(cfg.YOLO.STRIDES)
        if FLAGS.model == 'yolov4':
            ANCHORS = utils.get_anchors(cfg.YOLO.ANCHORS, FLAGS.tiny)
        else:
            ANCHORS = utils.get_anchors(cfg.YOLO.ANCHORS_V3, FLAGS.tiny)
    NUM_CLASS = len(utils.read_class_names(cfg.YOLO.CLASSES))
    # Per-scale xy scaling used by YOLOv4 box decoding — TODO confirm the
    # config key matches the cloned repository's core/config.py.
    XYSCALE = cfg.YOLO.XYSCALE
    input_size = FLAGS.size
    video_path = FLAGS.video

    fps = FPS().start()  # FPS calculator

    # Input stream; the output writer is created lazily once the first
    # frame's dimensions are known.
    vs = cv2.VideoCapture(video_path)
    writer = None
    model = None
    interpreter = None
    input_details = None
    output_details = None
    cnt = 0  # frame counter; processing is capped at 100 frames

    while cnt < 100:
        grabbed, original_image = vs.read()
        if not grabbed:
            break  # end of stream
        cnt += 1

        original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
        original_image_size = original_image.shape[:2]

        if writer is None:
            # Declare the output file to save the analysed video.
            fourcc = cv2.VideoWriter_fourcc(*"MJPG")
            # VideoWriter wants (width, height); shape[:2] is (height, width).
            writer = cv2.VideoWriter(FLAGS.output, fourcc, 30,
                                     original_image_size[::-1], True)

        image_data = utils.image_preporcess(np.copy(original_image),
                                            [input_size, input_size])
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        if FLAGS.framework == 'tf':
            if model is None:
                # Build the Keras model once, on the first frame.
                input_layer = tf.keras.layers.Input([input_size, input_size, 3])
                if FLAGS.tiny:
                    feature_maps = YOLOv3_tiny(input_layer, NUM_CLASS)
                    bbox_tensors = []
                    for i, fm in enumerate(feature_maps):
                        bbox_tensor = decode(fm, NUM_CLASS, i)
                        bbox_tensors.append(bbox_tensor)
                    model = tf.keras.Model(input_layer, bbox_tensors)
                    utils.load_weights_tiny(model, FLAGS.weights)
                elif FLAGS.model == 'yolov3':
                    feature_maps = YOLOv3(input_layer, NUM_CLASS)
                    bbox_tensors = []
                    for i, fm in enumerate(feature_maps):
                        bbox_tensor = decode(fm, NUM_CLASS, i)
                        bbox_tensors.append(bbox_tensor)
                    model = tf.keras.Model(input_layer, bbox_tensors)
                    utils.load_weights_v3(model, FLAGS.weights)
                elif FLAGS.model == 'yolov4':
                    feature_maps = YOLOv4(input_layer, NUM_CLASS)
                    bbox_tensors = []
                    for i, fm in enumerate(feature_maps):
                        bbox_tensor = decode(fm, NUM_CLASS, i)
                        bbox_tensors.append(bbox_tensor)
                    model = tf.keras.Model(input_layer, bbox_tensors)
                    utils.load_weights(model, FLAGS.weights)

            pred_bbox = model.predict(image_data)
        else:
            if interpreter is None:
                # Load the TFLite model and allocate tensors once.
                interpreter = tf.lite.Interpreter(model_path=FLAGS.weights)
                interpreter.allocate_tensors()
                # Get input and output tensor details.
                input_details = interpreter.get_input_details()
                output_details = interpreter.get_output_details()
            # The input tensor must be set and the interpreter invoked for
            # every frame, not only the first one.
            interpreter.set_tensor(input_details[0]['index'], image_data)
            interpreter.invoke()
            pred_bbox = [interpreter.get_tensor(output_details[i]['index'])
                         for i in range(len(output_details))]

        # YOLOv4 decoding additionally needs the per-scale XYSCALE factors.
        if FLAGS.model == 'yolov4':
            pred_bbox = utils.postprocess_bbbox(pred_bbox, ANCHORS, STRIDES,
                                                XYSCALE)
        else:
            pred_bbox = utils.postprocess_bbbox(pred_bbox, ANCHORS, STRIDES)
        bboxes = utils.postprocess_boxes(pred_bbox, original_image_size,
                                         input_size, 0.25)
        bboxes = utils.nms(bboxes, 0.213, method='nms')

        image = utils.draw_bbox(original_image, bboxes)
        image = Image.fromarray(image)
        # Convert back from RGB to BGR for OpenCV display and writing.
        image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        writer.write(image)  # save the analysed frame
        fps.update()

        cv2.imshow("output", cv2.resize(image, (800, 600)))
        key = cv2.waitKey(1) & 0xFF
        if key == ord("q"):
            break  # user requested quit

    fps.stop()
    print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
    print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

    # do a bit of cleanup

    # release the file pointers
    print("[INFO] cleaning up...")
    if writer is not None:
        writer.release()
    vs.release()
    cv2.destroyAllWindows()

if __name__ == '__main__':
    try:
        # absl parses FLAGS and then calls main().
        app.run(main)
    except SystemExit:
        # absl raises SystemExit on normal termination; swallow it so the
        # script exits quietly.
        pass

Video processing with YOLOv4 and TensorFlow

python detect_video.py --weights ./data/yolov4.weights --framework tf --size 608 --video ./data/road.mp4