aboutsummaryrefslogtreecommitdiffstats
path: root/nihav-duck/src
diff options
context:
space:
mode:
author Kostya Shishkov <kostya.shishkov@gmail.com> 2023-07-27 18:07:18 +0200
committer Kostya Shishkov <kostya.shishkov@gmail.com> 2023-07-27 18:07:18 +0200
commit e510768d44566563d2eb093c38f04ef83327b903 (patch)
tree a03011e0c00774fd46cd007669c33640b8e6e162 /nihav-duck/src
parent e6aaad5c5273cd814b5748b7faf3751835a37217 (diff)
download nihav-e510768d44566563d2eb093c38f04ef83327b903.tar.gz
vp6dsp: SSE2 intrinsics version of bilinear motion compensation
This is an old patch picked up mostly because those intrinsics are stable now.
Diffstat (limited to 'nihav-duck/src')
-rw-r--r-- nihav-duck/src/codecs/vp6dsp.rs 83
1 file changed, 83 insertions, 0 deletions
diff --git a/nihav-duck/src/codecs/vp6dsp.rs b/nihav-duck/src/codecs/vp6dsp.rs
index dd62b86..5a2b47f 100644
--- a/nihav-duck/src/codecs/vp6dsp.rs
+++ b/nihav-duck/src/codecs/vp6dsp.rs
@@ -52,6 +52,7 @@ macro_rules! mc_filter {
}
//#[allow(snake_case)]
+#[cfg(not(target_arch = "x86_64"))]
pub fn mc_bilinear(dst: &mut [u8], dstride: usize, src: &[u8], mut soff: usize, sstride: usize, mx: u16, my: u16) {
if my == 0 {
for dline in dst.chunks_mut(dstride).take(8) {
@@ -84,6 +85,88 @@ pub fn mc_bilinear(dst: &mut [u8], dstride: usize, src: &[u8], mut soff: usize,
}
}
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+/// Bilinear motion compensation of an 8x8 block, implemented with SSE2 intrinsics.
+///
+/// Reads pixels starting at `src[soff]` (rows are `sstride` bytes apart) and writes eight
+/// rows of eight bytes into `dst` (rows are `dstride` bytes apart). `mx` and `my` are the
+/// horizontal and vertical weights in 1/8 units; every filter step computes
+/// `(p0 * (8 - w) + p1 * w + 4) >> 3`. When both weights are non-zero the horizontal
+/// filter is applied first and its rounded result is filtered vertically.
+///
+/// NOTE(review): weights are presumably in `0..8`; a value above 8 overflows `8 - w`.
+///
+/// # Panics
+///
+/// Panics if `dst` is shorter than `7 * dstride + 8` bytes, or if `src` does not contain
+/// the whole window the selected filter reads starting at `soff`
+/// (8 rows x 9 bytes for `my == 0`, 9 rows x 8 bytes for `mx == 0`, 9 rows x 9 bytes otherwise).
+#[cfg(target_arch = "x86_64")]
+pub fn mc_bilinear(dst: &mut [u8], dstride: usize, src: &[u8], soff: usize, sstride: usize, mx: u16, my: u16) {
+    // Number of bytes covered by `rows` rows of `width` bytes placed `stride` bytes apart.
+    let span = |stride: usize, rows: usize, width: usize| {
+        stride
+            .checked_mul(rows - 1)
+            .and_then(|n| n.checked_add(width))
+            .expect("stride is too large")
+    };
+
+    // Every output row is written with one 8-byte store, so the complete 8-row window
+    // has to be inside `dst`; slicing performs that check once, up front.
+    let dst = &mut dst[..span(dstride, 8, 8)];
+    let dptr = dst.as_mut_ptr();
+
+    if my == 0 {
+        // Horizontal-only: 8 rows, each read with 8-byte loads at offsets 0 and 1.
+        let src = &src[soff..][..span(sstride, 8, 9)];
+        let sptr = src.as_ptr();
+        // SAFETY: SSE2 is part of the x86_64 baseline. Row `r` reads bytes
+        // `r * sstride .. r * sstride + 9` of `src` and writes bytes
+        // `r * dstride .. r * dstride + 8` of `dst` for `r < 8`; both ranges lie inside
+        // the slices checked above. The 64-bit load/store intrinsics have no alignment
+        // requirement.
+        unsafe {
+            let bias = _mm_set1_epi16(4);
+            let a = _mm_set1_epi16((8 - mx) as i16);
+            let b = _mm_set1_epi16(mx as i16);
+            let z = _mm_setzero_si128();
+            for row in 0..8 {
+                let line = sptr.add(row * sstride);
+                // Widen the 8 pixels and their right-hand neighbours to 16 bits.
+                let s0 = _mm_unpacklo_epi8(_mm_loadl_epi64(line as *const __m128i), z);
+                let s1 = _mm_unpacklo_epi8(_mm_loadl_epi64(line.add(1) as *const __m128i), z);
+                let s0 = _mm_mullo_epi16(s0, a);
+                let s1 = _mm_mullo_epi16(s1, b);
+                let t = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, bias), s1), 3);
+                let t = _mm_packus_epi16(t, t);
+                _mm_storel_epi64(dptr.add(row * dstride) as *mut __m128i, t);
+            }
+        }
+    } else if mx == 0 {
+        // Vertical-only: 9 rows of 8 bytes (each output row blends two source rows).
+        let src = &src[soff..][..span(sstride, 9, 8)];
+        let sptr = src.as_ptr();
+        // SAFETY: SSE2 is part of the x86_64 baseline. Loads touch bytes
+        // `r * sstride .. r * sstride + 8` of `src` for `r <= 8` and stores touch bytes
+        // `r * dstride .. r * dstride + 8` of `dst` for `r < 8`; both ranges lie inside
+        // the slices checked above.
+        unsafe {
+            let bias = _mm_set1_epi16(4);
+            let a = _mm_set1_epi16((8 - my) as i16);
+            let b = _mm_set1_epi16(my as i16);
+            let z = _mm_setzero_si128();
+            // `last` holds the previous source row already multiplied by `8 - my`.
+            let mut last = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr as *const __m128i), z);
+            last = _mm_mullo_epi16(last, a);
+            for row in 0..8 {
+                let line = sptr.add((row + 1) * sstride);
+                let s = _mm_unpacklo_epi8(_mm_loadl_epi64(line as *const __m128i), z);
+                let s1 = _mm_mullo_epi16(s, b);
+                let t = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(last, bias), s1), 3);
+                last = _mm_mullo_epi16(s, a);
+                let t = _mm_packus_epi16(t, t);
+                _mm_storel_epi64(dptr.add(row * dstride) as *mut __m128i, t);
+            }
+        }
+    } else {
+        // Both directions: 9 rows, each read with 8-byte loads at offsets 0 and 1.
+        let src = &src[soff..][..span(sstride, 9, 9)];
+        let sptr = src.as_ptr();
+        // SAFETY: SSE2 is part of the x86_64 baseline. Loads touch bytes
+        // `r * sstride .. r * sstride + 9` of `src` for `r <= 8` and stores touch bytes
+        // `r * dstride .. r * dstride + 8` of `dst` for `r < 8`; both ranges lie inside
+        // the slices checked above.
+        unsafe {
+            let bias = _mm_set1_epi16(4);
+            let a = _mm_set1_epi16((8 - mx) as i16);
+            let b = _mm_set1_epi16(mx as i16);
+            let c = _mm_set1_epi16((8 - my) as i16);
+            let d = _mm_set1_epi16(my as i16);
+            let z = _mm_setzero_si128();
+
+            // Horizontally filter the first row; `last` keeps it pre-multiplied by `8 - my`.
+            let s0 = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr as *const __m128i), z);
+            let s1 = _mm_unpacklo_epi8(_mm_loadl_epi64(sptr.add(1) as *const __m128i), z);
+            let s0 = _mm_mullo_epi16(s0, a);
+            let s1 = _mm_mullo_epi16(s1, b);
+            let t = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, bias), s1), 3);
+            let mut last = _mm_mullo_epi16(t, c);
+            for row in 0..8 {
+                let line = sptr.add((row + 1) * sstride);
+                let s0 = _mm_unpacklo_epi8(_mm_loadl_epi64(line as *const __m128i), z);
+                let s1 = _mm_unpacklo_epi8(_mm_loadl_epi64(line.add(1) as *const __m128i), z);
+                let s0 = _mm_mullo_epi16(s0, a);
+                let s1 = _mm_mullo_epi16(s1, b);
+                // Horizontal result for the current row ...
+                let t = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, bias), s1), 3);
+                // ... blended vertically with the previous row's horizontal result.
+                let t1 = _mm_add_epi16(_mm_add_epi16(last, bias), _mm_mullo_epi16(t, d));
+                last = _mm_mullo_epi16(t, c);
+                let out = _mm_srai_epi16(t1, 3);
+                _mm_storel_epi64(dptr.add(row * dstride) as *mut __m128i, _mm_packus_epi16(out, out));
+            }
+        }
+    }
+}
+
#[allow(clippy::trivially_copy_pass_by_ref)]
pub fn mc_bicubic(dst: &mut [u8], dstride: usize, src: &[u8], mut soff: usize, sstride: usize, coeffs_w: &[i16; 4], coeffs_h: &[i16; 4]) {
if coeffs_h[1] == 128 {