libavfilter/vf_dnn_detect: Add yolov3 support

Add yolov3 support. The difference of yolov3 is that it has multiple outputs in different scale to perform better on both large and small object. The model detail refer to: https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/yolo-v3-tf Signed-off-by: Wenbin Chen <wenbin.chen@intel.com> Reviewed-by: Guo Yejun <yejun.guo@intel.com>
author: Wenbin Chen <wenbin.chen@intel.com> 2023-12-12 10:33:33 +0800
committer: Guo Yejun <yejun.guo@intel.com> 2023-12-16 21:50:50 +0800
commit: a882fc029493ef2691360dcba360b0e998c628b4 (patch)
tree: c8d4029c75f44ba776e5a45bce75e5f53fc29258 /libavfilter
parent: da02836b9d204ca002d973ef7b1e6f60a2316cb1 (diff)
download: ffmpeg-a882fc029493ef2691360dcba360b0e998c628b4.tar.gz
1 files changed, 27 insertions, 1 deletions
diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 35c0508c50..4c537cf255 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -35,6 +35,7 @@
 typedef enum {
     DDMT_SSD,
     DDMT_YOLOV1V2,
+    DDMT_YOLOV3
 } DNNDetectionModelType;
 
 typedef struct DnnDetectContext {
@@ -73,6 +74,7 @@ static const AVOption dnn_detect_options[] = {
     { "model_type",  "DNN detection model type",   OFFSET2(model_type),      AV_OPT_TYPE_INT,       { .i64 = DDMT_SSD },    INT_MIN, INT_MAX, FLAGS, "model_type" },
         { "ssd",     "output shape [1, 1, N, 7]",  0,                        AV_OPT_TYPE_CONST,       { .i64 = DDMT_SSD },    0, 0, FLAGS, "model_type" },
         { "yolo",    "output shape [1, N*Cx*Cy*DetectionBox]",  0,           AV_OPT_TYPE_CONST,       { .i64 = DDMT_YOLOV1V2 },    0, 0, FLAGS, "model_type" },
+        { "yolov3",  "outputs shape [1, N*D, Cx, Cy]",  0,                   AV_OPT_TYPE_CONST,       { .i64 = DDMT_YOLOV3 },      0, 0, FLAGS, "model_type" },
     { "cell_w",      "cell width",                 OFFSET2(cell_w),          AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
     { "cell_h",      "cell height",                OFFSET2(cell_h),          AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
     { "nb_classes",  "The number of class",        OFFSET2(nb_classes),      AV_OPT_TYPE_INT,       { .i64 = 0 },    0, INTMAX_MAX, FLAGS },
@@ -151,6 +153,11 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
         cell_h = ctx->cell_h;
         scale_w = cell_w;
         scale_h = cell_h;
+    } else {
+        cell_w = output[output_index].width;
+        cell_h = output[output_index].height;
+        scale_w = ctx->scale_width;
+        scale_h = ctx->scale_height;
     }
     box_size = nb_classes + 5;
 
@@ -178,6 +185,7 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out
                       output[output_index].height *
                       output[output_index].width / box_size / cell_w / cell_h;
 
+    anchors = anchors + (detection_boxes * output_index * 2);
     /**
      * find all candidate bbox
      * yolo output can be reshaped to [B, N*D, Cx, Cy]
@@ -290,6 +298,21 @@ static int dnn_detect_post_proc_yolo(AVFrame *frame, DNNData *output, AVFilterCo
     return 0;
 }
 
+static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output,
+                                       AVFilterContext *filter_ctx, int nb_outputs)
+{
+    int ret = 0;
+    for (int i = 0; i < nb_outputs; i++) {
+        ret = dnn_detect_parse_yolo_output(frame, output, i, filter_ctx);
+        if (ret < 0)
+            return ret;
+    }
+    ret = dnn_detect_fill_side_data(frame, filter_ctx);
+    if (ret < 0)
+        return ret;
+    return 0;
+}
+
 static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
 {
     DnnDetectContext *ctx = filter_ctx->priv;
@@ -386,8 +409,11 @@ static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outpu
         ret = dnn_detect_post_proc_yolo(frame, output, filter_ctx);
         if (ret < 0)
             return ret;
+    case DDMT_YOLOV3:
+        ret = dnn_detect_post_proc_yolov3(frame, output, filter_ctx, nb_outputs);
+        if (ret < 0)
+            return ret;
     }
-
     return 0;
 }
author	Wenbin Chen <wenbin.chen@intel.com>	2023-12-12 10:33:33 +0800
committer	Guo Yejun <yejun.guo@intel.com>	2023-12-16 21:50:50 +0800
commit	a882fc029493ef2691360dcba360b0e998c628b4 (patch)
tree	c8d4029c75f44ba776e5a45bce75e5f53fc29258 /libavfilter
parent	da02836b9d204ca002d973ef7b1e6f60a2316cb1 (diff)
download	ffmpeg-a882fc029493ef2691360dcba360b0e998c628b4.tar.gz