1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
/*
* IDCT AArch64 NEON optimisations
*
* Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
// Write an 8x8 block of signed 16-bit coefficients out as unsigned 8-bit
// values, saturating every coefficient to the range 0..255.
// On entry:
//   x0 -> 64 contiguous 16-bit coefficients (8 rows of 8)
//   x1 -> first row of the 8-bit destination
//   x2 =  distance between destination rows, in bytes
// Overwrites v0-v7 and v16-v23; x0 and x1 are advanced past the data
// they have consumed/produced.
// NOTE(review): coefficients are fetched with a byte (.16b) arrangement and
// then used as .8h lanes, which matches the in-memory int16 layout on
// little-endian only - confirm big-endian is not a target.
function ff_put_pixels_clamped_neon, export=1
        // Fetch all 128 bytes of coefficients: rows 0-3, then rows 4-7.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], #64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x0]

        // sqxtun narrows signed 16-bit lanes to unsigned 8-bit with
        // saturation, so negatives become 0 and values above 255 become 255.
        sqxtun          v0.8b,  v16.8h
        sqxtun          v1.8b,  v17.8h
        sqxtun          v2.8b,  v18.8h
        sqxtun          v3.8b,  v19.8h
        sqxtun          v4.8b,  v20.8h

        // Narrowing of the remaining rows is interleaved with the stores of
        // the rows that are already finished.
        st1             {v0.8b}, [x1], x2
        sqxtun          v5.8b,  v21.8h
        st1             {v1.8b}, [x1], x2
        sqxtun          v6.8b,  v22.8h
        st1             {v2.8b}, [x1], x2
        sqxtun          v7.8b,  v23.8h
        st1             {v3.8b}, [x1], x2
        st1             {v4.8b}, [x1], x2
        st1             {v5.8b}, [x1], x2
        st1             {v6.8b}, [x1], x2
        st1             {v7.8b}, [x1]           // last row: no post-increment
        ret
endfunc
// Write an 8x8 block of signed 16-bit coefficients out as 8-bit values:
// each coefficient is saturated to the signed 8-bit range -128..127 and
// then biased by +128, giving a stored byte in the range 0..255.
// On entry:
//   x0 -> 64 contiguous 16-bit coefficients (8 rows of 8)
//   x1 -> first row of the 8-bit destination
//   x2 =  distance between destination rows, in bytes
// Overwrites v0-v7 and v20; x0 and x1 are advanced past the data they
// have consumed/produced.
// NOTE(review): coefficients are fetched with a byte (.16b) arrangement and
// then used as .8h lanes, which matches the in-memory int16 layout on
// little-endian only - confirm big-endian is not a target.
function ff_put_signed_pixels_clamped_neon, export=1
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64     // rows 0-3
        movi            v20.8b, #128                                    // bias in every byte lane
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]          // rows 4-7

        // sqxtn narrows signed 16-bit lanes to signed 8-bit with saturation.
        sqxtn           v0.8b,  v0.8h
        sqxtn           v1.8b,  v1.8h
        sqxtn           v2.8b,  v2.8h
        sqxtn           v3.8b,  v3.8h

        // The byte-wise add wraps modulo 256, so adding 128 maps the signed
        // range -128..127 onto 0..255. Rows 4-7 are narrowed while rows 0-3
        // are being biased.
        sqxtn           v4.8b,  v4.8h
        add             v0.8b,  v0.8b,  v20.8b
        sqxtn           v5.8b,  v5.8h
        add             v1.8b,  v1.8b,  v20.8b
        sqxtn           v6.8b,  v6.8h
        add             v2.8b,  v2.8b,  v20.8b
        sqxtn           v7.8b,  v7.8h
        add             v3.8b,  v3.8b,  v20.8b

        // Store rows 0-3 while the bias is applied to rows 4-7.
        st1             {v0.8b}, [x1], x2
        add             v4.8b,  v4.8b,  v20.8b
        st1             {v1.8b}, [x1], x2
        add             v5.8b,  v5.8b,  v20.8b
        st1             {v2.8b}, [x1], x2
        add             v6.8b,  v6.8b,  v20.8b
        st1             {v3.8b}, [x1], x2
        add             v7.8b,  v7.8b,  v20.8b
        st1             {v4.8b}, [x1], x2
        st1             {v5.8b}, [x1], x2
        st1             {v6.8b}, [x1], x2
        st1             {v7.8b}, [x1]           // last row: no post-increment
        ret
endfunc
// Add an 8x8 block of signed 16-bit coefficients to an 8x8 block of
// unsigned 8-bit values in place, saturating each result to 0..255.
// On entry:
//   x0 -> 64 contiguous 16-bit coefficients (8 rows of 8)
//   x1 -> first row of the 8-bit data (read, then overwritten)
//   x2 =  distance between 8-bit rows, in bytes
// Overwrites v0-v7, v16-v23 and x3; x0 and x1 are advanced.
// All eight rows are read before the first row is written back.
// The sums are formed with uaddw, which does not saturate: a 16-bit sum
// that overflows wraps before the final clamp.
// NOTE(review): presumably callers keep coefficient + pixel within the
// int16 range - confirm.
// NOTE(review): coefficients are fetched with a byte (.16b) arrangement and
// then used as .8h lanes, which matches the in-memory int16 layout on
// little-endian only - confirm big-endian is not a target.
function ff_add_pixels_clamped_neon, export=1
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64     // coefficient rows 0-3
        mov             x3, x1                  // x3 walks the rows for reading, x1 for writing
        ld1             {v16.8b}, [x3], x2      // pixel rows 0-3
        ld1             {v17.8b}, [x3], x2
        ld1             {v18.8b}, [x3], x2
        ld1             {v19.8b}, [x3], x2
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]          // coefficient rows 4-7

        // Rows 0-3: widen each pixel to 16 bits (zero-extended), add it to
        // the coefficient, then narrow with unsigned saturation. The loads
        // of pixel rows 4-7 are interleaved with this arithmetic.
        uaddw           v0.8h,  v0.8h,  v16.8b
        uaddw           v1.8h,  v1.8h,  v17.8b
        uaddw           v2.8h,  v2.8h,  v18.8b
        ld1             {v20.8b}, [x3], x2
        uaddw           v3.8h,  v3.8h,  v19.8b
        ld1             {v21.8b}, [x3], x2
        sqxtun          v0.8b,  v0.8h
        ld1             {v22.8b}, [x3], x2
        sqxtun          v1.8b,  v1.8h
        ld1             {v23.8b}, [x3]          // last row: no post-increment
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h

        // Rows 4-7: same arithmetic, interleaved with the stores of rows 0-3.
        uaddw           v4.8h,  v4.8h,  v20.8b
        st1             {v0.8b}, [x1], x2
        uaddw           v5.8h,  v5.8h,  v21.8b
        st1             {v1.8b}, [x1], x2
        uaddw           v6.8h,  v6.8h,  v22.8b
        st1             {v2.8b}, [x1], x2
        uaddw           v7.8h,  v7.8h,  v23.8b
        sqxtun          v4.8b,  v4.8h
        sqxtun          v5.8b,  v5.8h
        st1             {v3.8b}, [x1], x2
        sqxtun          v6.8b,  v6.8h
        sqxtun          v7.8b,  v7.8h
        st1             {v4.8b}, [x1], x2
        st1             {v5.8b}, [x1], x2
        st1             {v6.8b}, [x1], x2
        st1             {v7.8b}, [x1]           // last row: no post-increment
        ret
endfunc
|