; libavfilter/x86/af_anlmdn.asm

;*****************************************************************************
;* x86-optimized functions for anlmdn filter
;* Copyright (c) 2017 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;------------------------------------------------------------------------------
; float ff_compute_distance_ssd(float *f1, const float *f2, ptrdiff_t len)
;
; Sum of squared differences between f1[k] and f2[k] over the 2*len + 1
; floats k = -len .. len.
;
; Register roles (x86inc names):
;   f1q, f2q : rewound by len floats, so byte offset 0 is element -len
;   lenq     : byte count, first of the whole window, then of the part
;              currently being processed
;   rq       : bytes left over once whole mmsize-byte vectors are taken
;   xq       : running byte offset into both inputs
;   m0       : accumulator (four partial sums, then the scalar total)
;
; The result is left in the low lane of m0; on 32-bit builds it is also
; spilled through r0m and loaded onto the x87 stack.
;------------------------------------------------------------------------------

INIT_XMM sse
cglobal compute_distance_ssd, 3,5,3, f1, f2, len, r, x
    ; Step both pointers back by len floats (len * 4 bytes).
    mov       xq, lenq
    shl       xq, 2
    sub       f1q, xq
    sub       f2q, xq
    xor       xq, xq

    ; lenq = (2 * len + 1) * 4 bytes, rq = lenq modulo mmsize.
    lea       lenq, [lenq * 8 + 4]
    mov       rq, lenq
    and       rq, mmsize - 1

    xorps     m0, m0
    cmp       lenq, mmsize
    jl        .scalar_loop              ; window shorter than one vector
    sub       lenq, rq                  ; lenq = bytes covered by whole vectors
ALIGN 16
.vector_loop:
    ; Unaligned loads: nothing here guarantees mmsize alignment of the inputs.
    movups    m1, [f1q + xq]
    movups    m2, [f2q + xq]
    subps     m1, m2
    mulps     m1, m1
    addps     m0, m1
    add       xq, mmsize
    cmp       xq, lenq
    jl        .vector_loop

    ; Fold the four partial sums into lane 0:
    ; (lane0 + lane2) + (lane1 + lane3).
    movhlps   m1, m0
    addps     m0, m1
    movss     m1, m0
    shufps    m0, m0, 1
    addss     m0, m1

    test      rq, rq
    jz        .done                     ; no leftover floats
    add       lenq, rq                  ; lenq = full window size in bytes again
.scalar_loop:
    ; One float per iteration for whatever the vector loop did not cover.
    movss     m1, [f1q + xq]
    subss     m1, [f2q + xq]
    mulss     m1, m1
    addss     m0, m1
    add       xq, 4
    cmp       xq, lenq
    jl        .scalar_loop
.done:
%if ARCH_X86_64 == 0
    ; Move the float result from m0 onto the x87 stack via memory.
    movss     r0m, m0
    fld       dword r0m
%endif
    RET