about summary refs log tree commit diff stats
path: root/libavcodec/x86
diff options
context:
space:
mode:
author Martin Vignali <martin.vignali@gmail.com> 2017-10-22 19:06:50 +0200
committer James Darnley <james.darnley@gmail.com> 2017-10-29 15:21:35 +0100
commit e9930883a26c77261c19ea9ad9b930b065c6f579 (patch)
tree bb64c9d59f3f7d614948a457287761c85e23597a /libavcodec/x86
parent 9b0510a8e35930a22790518270ee6d3fad9f1666 (diff)
download ffmpeg-e9930883a26c77261c19ea9ad9b930b065c6f579.tar.gz
libavcodec/bswapdsp : add AVX2 func for bswap_buf (swap uint32_t)
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/bswapdsp.asm 48
-rw-r--r-- libavcodec/x86/bswapdsp_init.c 3
2 files changed, 38 insertions, 13 deletions
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 56d8083622..219d172481 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -35,14 +35,18 @@ SECTION .text
mov r3d, r2d
sar r2d, 3
jz .left4_%1
+%if cpuflag(avx2)
+ sar r2d, 1
+ jz .left8_%1
+%endif
.loop8_%1:
mov%1 m0, [r1 + 0]
- mov%1 m1, [r1 + 16]
-%if cpuflag(ssse3)
+ mov%1 m1, [r1 + mmsize]
+%if cpuflag(ssse3)||cpuflag(avx2)
pshufb m0, m2
pshufb m1, m2
mov%1 [r0 + 0], m0
- mov%1 [r0 + 16], m1
+ mov%1 [r0 + mmsize], m1
%else
pshuflw m0, m0, 10110001b
pshuflw m1, m1, 10110001b
@@ -59,18 +63,29 @@ SECTION .text
mov%1 [r0 + 0], m2
mov%1 [r0 + 16], m3
%endif
- add r0, 32
- add r1, 32
+ add r0, mmsize*2
+ add r1, mmsize*2
dec r2d
jnz .loop8_%1
+%if cpuflag(avx2)
+.left8_%1:
+ mov r2d, r3d
+ test r3d, 8
+ jz .left4_%1
+ mov%1 m0, [r1]
+ pshufb m0, m2
+ mov%1 [r0 + 0], m0
+ add r1, mmsize
+ add r0, mmsize
+%endif
.left4_%1:
mov r2d, r3d
test r3d, 4
jz .left
- mov%1 m0, [r1]
+ mov%1 xm0, [r1]
%if cpuflag(ssse3)
- pshufb m0, m2
- mov%1 [r0], m0
+ pshufb xm0, xm2
+ mov%1 [r0], xm0
%else
pshuflw m0, m0, 10110001b
pshufhw m0, m0, 10110001b
@@ -86,16 +101,20 @@ SECTION .text
; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
%macro BSWAP32_BUF 0
-%if cpuflag(ssse3)
+%if cpuflag(ssse3)||cpuflag(avx2)
cglobal bswap32_buf, 3,4,3
mov r3, r1
+%if cpuflag(avx2)
+ vbroadcasti128 m2, [pb_bswap32]
+%else
mova m2, [pb_bswap32]
+%endif
%else
cglobal bswap32_buf, 3,4,5
mov r3, r1
%endif
or r3, r0
- test r3, 15
+ test r3, mmsize - 1
jz .start_align
BSWAP_LOOPS u
jmp .left
@@ -105,9 +124,9 @@ cglobal bswap32_buf, 3,4,5
%if cpuflag(ssse3)
test r2d, 2
jz .left1
- movq m0, [r1]
- pshufb m0, m2
- movq [r0], m0
+ movq xm0, [r1]
+ pshufb xm0, xm2
+ movq [r0], xm0
add r1, 8
add r0, 8
.left1:
@@ -137,3 +156,6 @@ BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
+
+INIT_YMM avx2
+BSWAP32_BUF
diff --git a/libavcodec/x86/bswapdsp_init.c b/libavcodec/x86/bswapdsp_init.c
index c042e56371..877bab1a2c 100644
--- a/libavcodec/x86/bswapdsp_init.c
+++ b/libavcodec/x86/bswapdsp_init.c
@@ -25,6 +25,7 @@
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
+void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
{
@@ -34,4 +35,6 @@ av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
c->bswap_buf = ff_bswap32_buf_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
c->bswap_buf = ff_bswap32_buf_ssse3;
+ if (EXTERNAL_AVX2_FAST(cpu_flags))
+ c->bswap_buf = ff_bswap32_buf_avx2;
}