x86/intreadwrite: use intrinsics instead of inline asm for AV_ZERO128

When called inside a loop, the inline asm version results in one pxor unnecessarily emitted per iteration, as the contents of the __asm__() block are opaque to the compiler's instruction scheduler. This is not the case with intrinsics, where pxor will be emitted once with any half-decent compiler. This also has the benefit of removing any SSE -> AVX penalty that may happen when the compiler emits VEX-encoded instructions. Signed-off-by: James Almer <jamrial@gmail.com>
author: James Almer <jamrial@gmail.com> 2022-11-14 02:32:33 -0300
committer: James Almer <jamrial@gmail.com> 2024-07-10 13:25:44 -0300
commit: 4a04cca69af807ccf831da977a94350611967c4c (patch)
tree: 827ec417bfa53c97317296f2808dfc8d1b410139 /configure
parent: 34b4ca8696de64ca756e7aed7bdefa9ff6bb5fac (diff)
download: ffmpeg-4a04cca69af807ccf831da977a94350611967c4c.tar.gz
1 files changed, 3 insertions, 0 deletions
diff --git a/configure b/configure
index fa2e384350..f84fefeaab 100755
--- a/configure
+++ b/configure
@@ -2314,6 +2314,7 @@ HEADERS_LIST="
 
 INTRINSICS_LIST="
     intrinsics_neon
+    intrinsics_sse2
 "
 
 MATH_FUNCS="
@@ -2743,6 +2744,7 @@ armv6t2_deps="arm"
 armv8_deps="aarch64"
 neon_deps_any="aarch64 arm"
 intrinsics_neon_deps="neon"
+intrinsics_sse2_deps="sse2"
 vfp_deps="arm"
 vfpv3_deps="vfp"
 setend_deps="arm"
@@ -6444,6 +6446,7 @@ elif enabled loongarch; then
 fi
 
 check_cc intrinsics_neon arm_neon.h "int16x8_t test = vdupq_n_s16(0)"
+check_cc intrinsics_sse2 emmintrin.h "__m128i test = _mm_setzero_si128()"
 
 check_ldflags -Wl,--as-needed
 check_ldflags -Wl,-z,noexecstack
author	James Almer <jamrial@gmail.com>	2022-11-14 02:32:33 -0300
committer	James Almer <jamrial@gmail.com>	2024-07-10 13:25:44 -0300
commit	4a04cca69af807ccf831da977a94350611967c4c (patch)
tree	827ec417bfa53c97317296f2808dfc8d1b410139 /configure
parent	34b4ca8696de64ca756e7aed7bdefa9ff6bb5fac (diff)
download	ffmpeg-4a04cca69af807ccf831da977a94350611967c4c.tar.gz