1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
|
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <string.h>
#include "libavutil/intreadwrite.h"
#include "libavutil/macros.h"
#include "libavutil/mem_internal.h"
#include "libavcodec/hevcdsp.h"
#include "checkasm.h"
// Masks applied to each random 32-bit word before it is stored, indexed by
// (bit_depth - 8) >> 1, i.e. one entry each for 8, 10 and 12 bits. The 10 and
// 12 bit entries limit both 16-bit halves of the word to the pixel range.
static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };

// Bytes per pixel; relies on an integer named bit_depth being in scope.
#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
#define BUF_STRIDE (16 * 2)
#define BUF_LINES (16)
// large buffer sizes based on high bit depth
#define BUF_OFFSET (2 * BUF_STRIDE * BUF_LINES)
#define BUF_SIZE (2 * BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)

// Fill buf0 and buf1 with the same masked random data, 4 bytes at a time.
// Both buffers are written with AV_WN32A, and size (in bytes) is stepped in
// units of 4. Relies on bit_depth being in scope. Arguments are parenthesized
// so that arbitrary expressions can be passed safely.
#define randomize_buffers(buf0, buf1, size)                 \
    do {                                                    \
        uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
        int k;                                              \
        for (k = 0; k < (size); k += 4) {                   \
            uint32_t r = rnd() & mask;                      \
            AV_WN32A((buf0) + k, r);                        \
            AV_WN32A((buf1) + k, r);                        \
        }                                                   \
    } while (0)
// Compare the horizontal and vertical chroma loop filters against their
// reference implementations for one bit depth. A non-zero c selects the *_c
// entry points (reported with a "_full" suffix) and lets no_p/no_q be set.
static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth, int c)
{
    // see tctable[] in hevc_filter.c, we check full range
    int32_t tc[2] = { rnd() % 25, rnd() % 25 };
    // no_p, no_q can only be { 0,0 } for the simpler assembly (non *_c
    // variant) functions, see deblocking_filter_CTB() in hevc_filter.c
    uint8_t no_p[2] = { rnd() & c, rnd() & c };
    uint8_t no_q[2] = { rnd() & c, rnd() & c };
    const char *const suffix = c ? "_full" : "";
    LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
    LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
    // The filters are pointed into the middle of the buffers so that the
    // whole-buffer compare also covers the bytes on either side.
    uint8_t *const ref_pix = buf0 + BUF_OFFSET;
    uint8_t *const new_pix = buf1 + BUF_OFFSET;

    declare_func(void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q);

    // horizontal edge
    if (check_func(c ? h->hevc_h_loop_filter_chroma_c : h->hevc_h_loop_filter_chroma,
                   "hevc_h_loop_filter_chroma%d%s", bit_depth, suffix)) {
        randomize_buffers(buf0, buf1, BUF_SIZE);

        call_ref(ref_pix, BUF_STRIDE, tc, no_p, no_q);
        call_new(new_pix, BUF_STRIDE, tc, no_p, no_q);
        if (memcmp(buf0, buf1, BUF_SIZE) != 0)
            fail();
        bench_new(new_pix, BUF_STRIDE, tc, no_p, no_q);
    }

    // vertical edge
    if (check_func(c ? h->hevc_v_loop_filter_chroma_c : h->hevc_v_loop_filter_chroma,
                   "hevc_v_loop_filter_chroma%d%s", bit_depth, suffix)) {
        randomize_buffers(buf0, buf1, BUF_SIZE);

        call_ref(ref_pix, BUF_STRIDE, tc, no_p, no_q);
        call_new(new_pix, BUF_STRIDE, tc, no_p, no_q);
        if (memcmp(buf0, buf1, BUF_SIZE) != 0)
            fail();
        bench_new(new_pix, BUF_STRIDE, tc, no_p, no_q);
    }
}
// Accessors for the eight pixels across the edge on the current line: P3..P0
// sit before buf and Q0..Q3 from buf onwards. They rely on `buf` (a byte
// pointer) and `xstride` (byte step between neighbouring pixels) being in
// scope, and yield an lvalue for the first byte of the pixel.
#define P3 buf[-4 * xstride]
#define P2 buf[-3 * xstride]
#define P1 buf[-2 * xstride]
#define P0 buf[-1 * xstride]
#define Q0 buf[0 * xstride]
#define Q1 buf[1 * xstride]
#define Q2 buf[2 * xstride]
#define Q3 buf[3 * xstride]
// tc[x] * 2.5, rounded up; relies on `tc` being in scope.
#define TC25(x) ((tc[(x)] * 5 + 1) >> 1)
// Truncate x to bit_depth bits. The argument is parenthesized so that
// low-precedence operators inside it (|, ?:, ...) are masked as a whole.
#define MASK(x) ((uint16_t)((x) & ((1 << (bit_depth)) - 1)))
// Read / write a whole pixel (1 or 2 bytes depending on SIZEOF_PIXEL) through
// one of the accessors above.
#define GET(x) ((SIZEOF_PIXEL == 1) ? *(uint8_t*)(&(x)) : *(uint16_t*)(&(x)))
#define SET(x, y) do { \
    uint16_t z = MASK(y); \
    if (SIZEOF_PIXEL == 1) \
        *(uint8_t*)(&(x)) = z; \
    else \
        *(uint16_t*)(&(x)) = z; \
} while (0)
// Random value starting at pixel x minus diff (clipped to the pixel range)
// and spanning 2 * diff values upwards. The sum is not clipped again, but
// SET() masks it when it is stored. The whole expansion is parenthesized so
// it can be combined with other operators.
#define RANDCLIP(x, diff) (av_clip(GET(x) - (diff), 0, \
    (1 << (bit_depth)) - 1) + rnd() % FFMAX(2 * (diff), 1))
// NOTE: this function doesn't work 'correctly' in that it won't always choose
// strong/strong or weak/weak, in most cases it tends to but will sometimes mix
// weak/strong or even skip sometimes. This is more useful to test correctness
// for these functions, though it does make benching them difficult. The easiest
// way to bench these functions is to check an overall decode since there are too
// many paths and ways to trigger the deblock: we would have to bench all
// permutations of weak/strong/skip/nd_q/nd_p/no_q/no_p and it quickly becomes
// too much.
static void randomize_luma_buffers(int type, int *beta, int32_t tc[2],
uint8_t *buf, ptrdiff_t xstride, ptrdiff_t ystride, int bit_depth)
{
int i, j, b3, tc25, tc25diff, b3diff;
// both tc & beta are unscaled inputs
// minimum useful value is 1, full range 0-24
tc[0] = (rnd() % 25) + 1;
tc[1] = (rnd() % 25) + 1;
// minimum useful value for 8bit is 8
*beta = (rnd() % 57) + 8;
switch (type) {
case 0: // strong
for (j = 0; j < 2; j++) {
tc25 = TC25(j) << (bit_depth - 8);
tc25diff = FFMAX(tc25 - 1, 0);
// 4 lines per tc
for (i = 0; i < 4; i++) {
b3 = (*beta << (bit_depth - 8)) >> 3;
SET(P0, rnd() % (1 << bit_depth));
SET(Q0, RANDCLIP(P0, tc25diff));
// p3 - p0 up to beta3 budget
b3diff = rnd() % b3;
SET(P3, RANDCLIP(P0, b3diff));
// q3 - q0, reduced budget
b3diff = rnd() % FFMAX(b3 - b3diff, 1);
SET(Q3, RANDCLIP(Q0, b3diff));
// same concept, budget across 4 pixels
b3 -= b3diff = rnd() % FFMAX(b3, 1);
SET(P2, RANDCLIP(P0, b3diff));
b3 -= b3diff = rnd() % FFMAX(b3, 1);
SET(Q2, RANDCLIP(Q0, b3diff));
// extra reduced budget for weighted pixels
b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
SET(P1, RANDCLIP(P0, b3diff));
b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
SET(Q1, RANDCLIP(Q0, b3diff));
buf += ystride;
}
}
break;
case 1: // weak
for (j = 0; j < 2; j++) {
tc25 = TC25(j) << (bit_depth - 8);
tc25diff = FFMAX(tc25 - 1, 0);
// 4 lines per tc
for (i = 0; i < 4; i++) {
// Weak filtering is signficantly simpler to activate as
// we only need to satisfy d0 + d3 < beta, which
// can be simplified to d0 + d0 < beta. Using the above
// derivations but substiuting b3 for b1 and ensuring
// that P0/Q0 are at least 1/2 tc25diff apart (tending
// towards 1/2 range).
b3 = (*beta << (bit_depth - 8)) >> 1;
SET(P0, rnd() % (1 << bit_depth));
SET(Q0, RANDCLIP(P0, tc25diff >> 1) +
(tc25diff >> 1) * (P0 < (1 << (bit_depth - 1))) ? 1 : -1);
// p3 - p0 up to beta3 budget
b3diff = rnd() % b3;
SET(P3, RANDCLIP(P0, b3diff));
// q3 - q0, reduced budget
b3diff = rnd() % FFMAX(b3 - b3diff, 1);
SET(Q3, RANDCLIP(Q0, b3diff));
// same concept, budget across 4 pixels
b3 -= b3diff = rnd() % FFMAX(b3, 1);
SET(P2, RANDCLIP(P0, b3diff));
b3 -= b3diff = rnd() % FFMAX(b3, 1);
SET(Q2, RANDCLIP(Q0, b3diff));
// extra reduced budget for weighted pixels
b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
SET(P1, RANDCLIP(P0, b3diff));
b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
SET(Q1, RANDCLIP(Q0, b3diff));
buf += ystride;
}
}
break;
case 2: // none
*beta = 0; // ensure skip
for (i = 0; i < 8; i++) {
// we can just fill with completely random data, nothing should be touched.
SET(P3, rnd()); SET(P2, rnd()); SET(P1, rnd()); SET(P0, rnd());
SET(Q0, rnd()); SET(Q1, rnd()); SET(Q2, rnd()); SET(Q3, rnd());
buf += ystride;
}
break;
}
}
// Compare the horizontal and vertical luma loop filters against their
// reference implementations for one bit depth, once per input flavour
// ("strong", "weak", "skip"). A non-zero c selects the *_c entry points
// (reported with a "_full" suffix) and lets no_p/no_q be set.
static void check_deblock_luma(HEVCDSPContext *h, int bit_depth, int c)
{
    const char *const flavours[3] = { "strong", "weak", "skip" };
    const char *const suffix = c ? "_full" : "";
    // 16 pixels per line
    const ptrdiff_t pix_stride = 16 * SIZEOF_PIXEL;
    const ptrdiff_t pix_size   = SIZEOF_PIXEL;
    int beta;
    int32_t tc[2] = {0};
    uint8_t no_p[2] = { rnd() & c, rnd() & c };
    uint8_t no_q[2] = { rnd() & c, rnd() & c };
    LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
    LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
    uint8_t *const ref_pix = buf0 + BUF_OFFSET;
    uint8_t *const new_pix = buf1 + BUF_OFFSET;

    declare_func(void, uint8_t *pix, ptrdiff_t stride, int beta, int32_t *tc, uint8_t *no_p, uint8_t *no_q);

    // Only the pixels next to the edge get randomized below; everything
    // else starts out as zero.
    memset(buf0, 0, BUF_SIZE);

    for (int mode = 0; mode < 3; mode++) {
        const char *const type = flavours[mode];

        // Horizontal edge: neighbours across the edge are one row apart,
        // consecutive lines are one pixel apart.
        if (check_func(c ? h->hevc_h_loop_filter_luma_c : h->hevc_h_loop_filter_luma,
                       "hevc_h_loop_filter_luma%d_%s%s", bit_depth, type, suffix)) {
            randomize_luma_buffers(mode, &beta, tc, ref_pix, pix_stride, pix_size, bit_depth);
            memcpy(buf1, buf0, BUF_SIZE);

            call_ref(ref_pix, pix_stride, beta, tc, no_p, no_q);
            call_new(new_pix, pix_stride, beta, tc, no_p, no_q);
            if (memcmp(buf0, buf1, BUF_SIZE) != 0)
                fail();
            bench_new(new_pix, pix_stride, beta, tc, no_p, no_q);
        }

        // Vertical edge: neighbours across the edge are one pixel apart,
        // consecutive lines are one row apart.
        if (check_func(c ? h->hevc_v_loop_filter_luma_c : h->hevc_v_loop_filter_luma,
                       "hevc_v_loop_filter_luma%d_%s%s", bit_depth, type, suffix)) {
            randomize_luma_buffers(mode, &beta, tc, ref_pix, pix_size, pix_stride, bit_depth);
            memcpy(buf1, buf0, BUF_SIZE);

            call_ref(ref_pix, pix_stride, beta, tc, no_p, no_q);
            call_new(new_pix, pix_stride, beta, tc, no_p, no_q);
            if (memcmp(buf0, buf1, BUF_SIZE) != 0)
                fail();
            bench_new(new_pix, pix_stride, beta, tc, no_p, no_q);
        }
    }
}
// Entry point of the HEVC deblocking tests: runs the chroma and luma checks,
// each first with the plain and then with the "_full" (*_c) entry points, for
// bit depths 8, 10 and 12, and emits one report per group.
void checkasm_check_hevc_deblock(void)
{
    HEVCDSPContext dsp;

    for (int depth = 8; depth <= 12; depth += 2) {
        ff_hevc_dsp_init(&dsp, depth);
        check_deblock_chroma(&dsp, depth, 0);
    }
    report("chroma");

    for (int depth = 8; depth <= 12; depth += 2) {
        ff_hevc_dsp_init(&dsp, depth);
        check_deblock_chroma(&dsp, depth, 1);
    }
    report("chroma_full");

    for (int depth = 8; depth <= 12; depth += 2) {
        ff_hevc_dsp_init(&dsp, depth);
        check_deblock_luma(&dsp, depth, 0);
    }
    report("luma");

    for (int depth = 8; depth <= 12; depth += 2) {
        ff_hevc_dsp_init(&dsp, depth);
        check_deblock_luma(&dsp, depth, 1);
    }
    report("luma_full");
}
|