aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/vvc/vvc_sad.asm
blob: 982951a370495f8360e7d539b40f07f9438c6410 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
; /*
; * Provide SIMD DMVR SAD functions for VVC decoding
; *
; * Copyright (c) 2024 Stone Chen
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */

%include "libavutil/x86/x86util.asm"
; Multiplier used in the address math below: rows are stepped in units of
; MAX_PB_SIZE * 2 bytes (the samples are accessed as 16-bit words).
%define MAX_PB_SIZE 128
; Distance, in rows, between two consecutive rows that get summed.
%define ROWS 2

SECTION_RODATA

; Two words of 1 (one dword). Broadcast to a full register and fed to pmaddwd
; so that adjacent 16-bit values are added into 32-bit lanes.
pw_1: times 2 dw 1

; DMVR SAD is only calculated on even rows to reduce complexity
; Additionally the only valid sizes are 8x16, 16x8, and 16x16
SECTION .text

; MIN_MAX_SAD a, b, tmp
; Per-word unsigned absolute difference, computed as max(a, b) - min(a, b).
;   %1  in: first operand    out: |%1 - %2| for every 16-bit lane
;   %2  in: second operand   (not written)
;   %3  scratch              out: min(%1, %2)
%macro MIN_MAX_SAD 3
    pminuw          %3, %1, %2      ; smaller word of each pair
    pmaxuw          %1, %1, %2      ; larger word of each pair
    psubusw         %1, %3          ; larger - smaller
%endmacro

; HORIZ_ADD dst_xmm, lo_xmm, src_ymm
; Adds up the eight dwords d7..d0 of %3 and leaves the total in the lowest
; dword of %1.
;   %1  out: lane 0 holds the total, the other lanes hold partial sums
;   %2  in:  expected to hold the low 128 bits of %3 (the call sites pass the
;            xmm view of the same register); overwritten as scratch
;   %3  in:  256-bit register with the eight dwords to reduce
%macro HORIZ_ADD 3
    vextracti128    %1, %3, 1       ; %1 =  d7     d6     d5     d4
    paddd           %1, %1, %2      ; %1 =  d7+d3  d6+d2  d5+d1  d4+d0
    pshufd          %2, %1, q0032   ; %2 =  -      -      d7+d3  d6+d2
    paddd           %1, %2          ; lane 1 = odd sum (d1 d3 d5 d7), lane 0 = even sum (d0 d2 d4 d6)
    pshufd          %2, %1, q0001   ; %2 lane 0 = odd sum
    paddd           %1, %2          ; %1 lane 0 = d0 + d1 + ... + d7
%endmacro

%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL

INIT_YMM avx2

;-----------------------------------------------------------------------
; vvc_sad(src1, src2, dx, dy, block_w, block_h)  ->  eax
;
; Sum of absolute differences between two blocks of 16-bit samples,
; evaluated on every second row only (ROWS = 2).
;
; In:    src1, src2  sample pointers (accessed as 16-bit words)
;        dx, dy      32-bit signed displacement, sign-extended below
;        block_w     block width in samples
;        block_h     block height in rows
; Out:   eax         accumulated SAD
; Uses:  off1, off2, row_idx as scratch GPRs; m0-m4 as vector registers
;
; Before summing, the pointers are displaced in opposite directions
; (dx' = dx - 2, dy' = dy - 2, offsets in samples):
;        src1 += (2 + dy') * 128 + dx' + 2
;        src2 += (2 - dy') * 128 - dx' + 2
;
; Two code paths:
;   block_w <  16  reads 8 samples per row, two rows per iteration, and
;                  consumes block_h four rows at a time.
;   block_w >= 16  reads block_w / 16 (rounded down) chunks of 16 samples
;                  per row and consumes block_h two rows at a time.
;
; The absolute difference is formed with unsigned min/max, and pmaddwd
; then treats the result as signed words, so each difference is expected
; to stay below 32768.
;-----------------------------------------------------------------------
cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
    movsxdifnidn      dxq, dxd              ; dx/dy feed 64-bit address math
    movsxdifnidn      dyq, dyd

    sub               dxq, 2
    sub               dyq, 2

    ; off1 = (2 + dy) * 128 + dx,  off2 = (2 - dy) * 128 - dx   (in samples)
    mov             off1q, 2
    mov             off2q, 2

    add             off1q, dyq
    sub             off2q, dyq

    shl             off1q, 7                ; * 128, same value as MAX_PB_SIZE
    shl             off2q, 7

    add             off1q, dxq
    sub             off2q, dxq

    ; offsets are scaled by 2 bytes per sample; "+ 2 * 2" skips two more samples
    lea             src1q, [src1q + off1q * 2 + 2 * 2]
    lea             src2q, [src2q + off2q * 2 + 2 * 2]

    pxor               m3, m3               ; m3 = eight dword partial sums
    vpbroadcastd       m4, [pw_1]           ; m4 = all-ones words for pmaddwd

    ; Every width of at least 16 goes to the chunked path. An equality test
    ; here would send wider blocks to the 8-sample path, which reads only the
    ; first 8 columns of each row.
    cmp          block_wd, 16
    jge              .w16

.w8_loop:
    ; low lane = current row, high lane = the row ROWS further down
    movu              xm0, [src1q]
    vinserti128        m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
    movu              xm1, [src2q]
    vinserti128        m1, m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1

    MIN_MAX_SAD        m1, m0, m2           ; m1 = |m0 - m1| per word
    pmaddwd            m1, m4               ; add word pairs into dwords
    paddd              m3, m1

    ; two summed rows per iteration -> advance by 2 * ROWS rows
    add             src1q, 2 * MAX_PB_SIZE * ROWS * 2
    add             src2q, 2 * MAX_PB_SIZE * ROWS * 2

    sub          block_hd, 4
    jg           .w8_loop

    HORIZ_ADD         xm0, xm3, m3
    movd              eax, xm0
    RET

.w16:
    sar          block_wd, 4                ; block_w = number of 16-sample chunks per row
.w16_loop_height:
    mov             off1q, src1q            ; remember the row start; off1/off2 are free again
    mov             off2q, src2q
    mov          row_idxd, block_wd         ; chunks left in this row

.w16_loop_width:
    movu               m0, [src1q]
    movu               m1, [src2q]
    MIN_MAX_SAD        m1, m0, m2           ; m1 = |m0 - m1| per word
    pmaddwd            m1, m4               ; add word pairs into dwords
    paddd              m3, m1

    add             src1q, 32               ; next 16 samples
    add             src2q, 32
    dec          row_idxd
    jg           .w16_loop_width

    ; step from the saved row start to the next summed row
    lea             src1q, [off1q + ROWS * MAX_PB_SIZE * 2]
    lea             src2q, [off2q + ROWS * MAX_PB_SIZE * 2]

    sub          block_hd, 2
    jg           .w16_loop_height

    HORIZ_ADD         xm0, xm3, m3
    movd              eax, xm0
    RET

%endif
%endif