aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
author: James Darnley <jdarnley@obe.tv> 2022-12-14 17:16:28 +0100
committer: James Darnley <jdarnley@obe.tv> 2022-12-20 15:02:45 +0100
commit: 6af453ca389c56cb113876628cb173577faa9464 (patch)
tree: 168dc608e5811b6d38ab8bfb61f0335524065828 /libavcodec/x86
parent: f30b4c2f47ab689a570a9b36b2e96e78bb462691 (diff)
download: ffmpeg-6af453ca389c56cb113876628cb173577faa9464.tar.gz
avcodec/x86: add avx512icl function for v210dec
Ice Lake (Xeon Silver 4316): 2.01x faster (1147±36.8 vs. 571±38.2 decicycles) compared with avx2
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/v210-init.c 10
-rw-r--r-- libavcodec/x86/v210.asm 60
2 files changed, 68 insertions, 2 deletions
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
index 5db1fef98c..8b3677b8aa 100644
--- a/libavcodec/x86/v210-init.c
+++ b/libavcodec/x86/v210-init.c
@@ -17,7 +17,7 @@
*/
#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
#include "libavcodec/v210dec.h"
extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
@@ -28,6 +28,8 @@ extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y
extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_avx512icl(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
av_cold void ff_v210_x86_init(V210DecContext *s)
{
#if HAVE_X86ASM
@@ -42,6 +44,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
s->unpack_frame = ff_v210_planar_unpack_aligned_avx2;
+
+ if (EXTERNAL_AVX512ICL(cpu_flags))
+ s->unpack_frame = ff_v210_planar_unpack_avx512icl;
}
else {
if (cpu_flags & AV_CPU_FLAG_SSSE3)
@@ -52,6 +57,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s)
if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2)
s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2;
+
+ if (EXTERNAL_AVX512ICL(cpu_flags))
+ s->unpack_frame = ff_v210_planar_unpack_avx512icl;
}
#endif
}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
index 600a4ddc5f..f247737ed0 100644
--- a/libavcodec/x86/v210.asm
+++ b/libavcodec/x86/v210.asm
@@ -22,7 +22,21 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA 32
+SECTION_RODATA 64
+
+perm_y: ; vpermb byte indices for the y output: 24 words (48 bytes) taken from the blended vector
+ db 0,1, 4,5, 6,7, 8,9, 12,13, 14,15, 16,17, 20,21 ; low word of dword 0, both words of dword 1, low word of dword 2, both words of dword 3, ...
+ db 22,23, 24,25, 28,29, 30,31, 32,33, 36,37, 38,39, 40,41
+ db 44,45, 46,47, 48,49, 52,53, 54,55, 56,57, 60,61, 62,63
+times 16 db 0xff ; align to 64 (pads the 48 index bytes up to one full 64-byte vector)
+
+perm_uv: ; vpermb byte indices for chroma: first 32 bytes feed the u store, second 32 bytes feed the v store
+ db 0,1, 4,5, 10,11, 16,17, 20,21, 26,27, 32,33, 36,37 ; u half: 12 word indices (24 bytes)
+ db 42,43, 48,49, 52,53, 58,59
+times 8 db 0xff ; align to 32 (pads the u half to 32 bytes)
+ db 2,3, 8,9, 12,13, 18,19, 24,25, 28,29, 34,35, 40,41 ; v half: 12 word indices (24 bytes)
+ db 44,45, 50,51, 56,57, 60,61
+times 8 db 0xff ; align to 32 (pads the v half to 32 bytes)
; for AVX2 version only
v210_luma_permute: dd 0,1,2,4,5,6,7,7 ; 32-byte alignment required
@@ -34,6 +48,9 @@ v210_mult: dw 64,4,64,4,64,4,64,4
v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+shift: times 4 dw 6, 2 ; 16 bytes of per-word left-shift counts: 6 for the low word and 2 for the high word of each dword
+kmask: dw 0x5555, 0xaaaa ; two 16-bit opmask values: 0x5555 has bits 0,2,4,... set, 0xaaaa has bits 1,3,5,... set
+
SECTION .text
%macro v210_planar_unpack 1
@@ -127,3 +144,44 @@ v210_planar_unpack aligned
INIT_YMM avx2
v210_planar_unpack aligned
%endif
+
+%if HAVE_AVX512ICL_EXTERNAL
+ ; v210_planar_unpack (avx512icl): each iteration loads 64 bytes (16 dwords) of src and splits the three 10-bit fields of every dword (bits 0-9, 10-19, 20-29) into the y, u and v outputs.
+INIT_ZMM avx512icl
+ ; Arguments: src, y, u, v, w. One iteration yields 24 valid y words and 12 valid u / 12 valid v words, and advances w by 24; the y store is still a full 64 bytes and the u/v stores a full 32 bytes each.
+cglobal v210_planar_unpack, 5, 5, 6, src, y, u, v, w
+ movsxdifnidn wq, wd ; x86inc helper: sign-extend the 32-bit w into wq where the two differ
+ lea yq, [yq+2*wq] ; yq = y + 2*w bytes
+ add uq, wq ; uq = u + w bytes
+ add vq, wq ; vq = v + w bytes
+ neg wq ; wq = -w: negative offset that counts up towards zero
+
+ kmovw k1, [kmask] ; odd dword mask (0x5555: bits 0,2,4,... set, i.e. the 1st, 3rd, 5th... dword)
+ kmovw k2, [kmask+2] ; even dword mask (0xaaaa: bits 1,3,5,... set, i.e. the 2nd, 4th, 6th... dword)
+
+ VBROADCASTI128 m0, [shift] ; per-word shift counts 6,2 replicated into every 128-bit lane
+ mova m1, [perm_y] ; byte-permute indices for the y output
+ mova m2, [perm_uv] ; byte-permute indices for u (low 32 bytes) and v (high 32 bytes)
+
+ .loop:
+ movu m3, [srcq] ; load 64 bytes of packed input
+ vpsllvw m4, m3, m0 ; low word << 6, high word << 2: shifts out the bits above each 10-bit field
+ pslld m5, m3, 12 ; move dword bits 10-19 to the top of each dword
+ psrlw m4, 6 ; m4 words = dword bits 0-9 (low word) and dword bits 20-29 (high word)
+ psrld m5, 22 ; m5 dwords = dword bits 10-19, zero-extended
+
+ vpblendmd m3{k1}, m4, m5 ; take m5 where the k1 bit is set, m4 elsewhere
+ vpermb m3, m1, m3 ; could use vpcompressw
+ movu [yq+2*wq], m3 ; full 64-byte store; perm_y only defines the first 48 bytes
+
+ vpblendmd m5{k2}, m4, m5 ; take m5 where the k2 bit is set, m4 elsewhere
+ vpermb m5, m2, m5 ; pack the u words into the low half and the v words into the high half
+ movu [uq+wq], ym5 ; store the low 32 bytes to u
+ vextracti32x8 [vq+wq], zm5, 1 ; store the high 32 bytes to v
+
+ add srcq, mmsize ; advance src by one vector (mmsize bytes)
+ add wq, (mmsize*3)/8 ; 24 with mmsize = 64
+ jl .loop ; keep going while wq is still negative
+RET
+
+%endif