1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
|
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 code common to several files.
//
// Author: Vincent Rabaud (vrabaud@google.com)
#ifndef WEBP_DSP_COMMON_SSE2_H_
#define WEBP_DSP_COMMON_SSE2_H_
#ifdef __cplusplus
extern "C" {
#endif
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
//------------------------------------------------------------------------------
// Quite useful macro for debugging. Left here for convenience.
#if 0
#include <stdio.h>
static WEBP_INLINE void PrintReg(const __m128i r, const char* const name,
int size) {
int n;
union {
__m128i r;
uint8_t i8[16];
uint16_t i16[8];
uint32_t i32[4];
uint64_t i64[2];
} tmp;
tmp.r = r;
fprintf(stderr, "%s\t: ", name);
if (size == 8) {
for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
} else if (size == 16) {
for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
} else if (size == 32) {
for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
} else {
for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]);
}
fprintf(stderr, "\n");
}
#endif
//------------------------------------------------------------------------------
// Math functions.
// Return the sum of all the 8b in the register.
static WEBP_INLINE int VP8HorizontalAdd8b(const __m128i* const a) {
const __m128i zero = _mm_setzero_si128();
const __m128i sad8x2 = _mm_sad_epu8(*a, zero);
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
const __m128i sum = _mm_add_epi32(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
return _mm_cvtsi128_si32(sum);
}
// Transpose two 4x4 16b matrices horizontally stored in registers.
static WEBP_INLINE void VP8Transpose_2_4x4_16b(
const __m128i* const in0, const __m128i* const in1,
const __m128i* const in2, const __m128i* const in3, __m128i* const out0,
__m128i* const out1, __m128i* const out2, __m128i* const out3) {
// Transpose the two 4x4.
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
const __m128i transpose0_0 = _mm_unpacklo_epi16(*in0, *in1);
const __m128i transpose0_1 = _mm_unpacklo_epi16(*in2, *in3);
const __m128i transpose0_2 = _mm_unpackhi_epi16(*in0, *in1);
const __m128i transpose0_3 = _mm_unpackhi_epi16(*in2, *in3);
// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 a22 b32 b03 b13 b23 b33
*out0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
*out1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
*out2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
*out3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
//------------------------------------------------------------------------------
// Channel mixing.
// Function used several times in VP8PlanarTo24b.
// It samples the in buffer as follows: one every two unsigned char is stored
// at the beginning of the buffer, while the other half is stored at the end.
#define VP8PlanarTo24bHelper(IN, OUT) \
do { \
const __m128i v_mask = _mm_set1_epi16(0x00ff); \
/* Take one every two upper 8b values.*/ \
(OUT##0) = _mm_packus_epi16(_mm_and_si128((IN##0), v_mask), \
_mm_and_si128((IN##1), v_mask)); \
(OUT##1) = _mm_packus_epi16(_mm_and_si128((IN##2), v_mask), \
_mm_and_si128((IN##3), v_mask)); \
(OUT##2) = _mm_packus_epi16(_mm_and_si128((IN##4), v_mask), \
_mm_and_si128((IN##5), v_mask)); \
/* Take one every two lower 8b values.*/ \
(OUT##3) = _mm_packus_epi16(_mm_srli_epi16((IN##0), 8), \
_mm_srli_epi16((IN##1), 8)); \
(OUT##4) = _mm_packus_epi16(_mm_srli_epi16((IN##2), 8), \
_mm_srli_epi16((IN##3), 8)); \
(OUT##5) = _mm_packus_epi16(_mm_srli_epi16((IN##4), 8), \
_mm_srli_epi16((IN##5), 8)); \
} while (0)
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void VP8PlanarTo24b_SSE2(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
// around as follows:
// Input:
// r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
// Split the 6 registers in two sets of 3 registers: the first set as the even
// 8b bytes, the second the odd ones:
// r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
// Repeat the same permutations twice more:
// r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
VP8PlanarTo24bHelper(*in, tmp);
VP8PlanarTo24bHelper(tmp, *in);
VP8PlanarTo24bHelper(*in, tmp);
// We need to do it two more times than the example as we have sixteen bytes.
{
__m128i out0, out1, out2, out3, out4, out5;
VP8PlanarTo24bHelper(tmp, out);
VP8PlanarTo24bHelper(out, *in);
}
}
#undef VP8PlanarTo24bHelper
// Convert four packed four-channel buffers like argbargbargbargb... into the
// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
static WEBP_INLINE void VP8L32bToPlanar_SSE2(__m128i* const in0,
__m128i* const in1,
__m128i* const in2,
__m128i* const in3) {
// Column-wise transpose.
const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1);
const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1);
const __m128i A2 = _mm_unpacklo_epi8(*in2, *in3);
const __m128i A3 = _mm_unpackhi_epi8(*in2, *in3);
const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
const __m128i B2 = _mm_unpacklo_epi8(A2, A3);
const __m128i B3 = _mm_unpackhi_epi8(A2, A3);
// C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0
// C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0
const __m128i C0 = _mm_unpacklo_epi8(B0, B1);
const __m128i C1 = _mm_unpackhi_epi8(B0, B1);
const __m128i C2 = _mm_unpacklo_epi8(B2, B3);
const __m128i C3 = _mm_unpackhi_epi8(B2, B3);
// Gather the channels.
*in0 = _mm_unpackhi_epi64(C1, C3);
*in1 = _mm_unpacklo_epi64(C1, C3);
*in2 = _mm_unpackhi_epi64(C0, C2);
*in3 = _mm_unpacklo_epi64(C0, C2);
}
#endif // WEBP_USE_SSE2
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DSP_COMMON_SSE2_H_
|