avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for AVC idct functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for AVC idct functions in new file h264idct_msa.c Adds new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
author: Shivraj Patil <shivraj.patil@imgtec.com> 2015-06-10 19:48:40 +0530
committer: Michael Niedermayer <michaelni@gmx.at> 2015-06-11 17:10:45 +0200
commit: fb92f3ecb4d48a5612ee61aa39bd538cac9d08cf (patch)
tree: f60612a94b011b4a0fefac3e5b5d9d5b5fff886f /libavutil/mips
parent: 1d70b6fe1d9d67a35daf2ec4c653ba3eff5d31b7 (diff)
download: ffmpeg-fb92f3ecb4d48a5612ee61aa39bd538cac9d08cf.tar.gz
1 files changed, 96 insertions, 0 deletions
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index 0f06c80c72..1d2d1d0946 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -403,6 +403,19 @@
 }
 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
 
+/* Description : Load as 4x4 block of signed halfword elements from 1D source
+                 data into 4 vectors (Each vector with 4 signed halfwords)
+   Arguments   : Inputs  - psrc
+                 Outputs - out0, out1, out2, out3
+*/
+#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
+{                                                             \
+    out0 = LD_SH(psrc);                                       \
+    out2 = LD_SH(psrc + 8);                                   \
+    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
+    out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
+}
+
 /* Description : Load 2 vectors of signed word elements with stride
    Arguments   : Inputs  - psrc    (source pointer to load from)
                          - stride
@@ -2025,6 +2038,22 @@
     out1 = in2 - in3;                         \
 }
 
+/* Description : Sign extend halfword elements from right half of the vector
+   Arguments   : Inputs  - in    (input halfword vector)
+                 Outputs - out   (sign extended word vectors)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved with same vector 'in0' to generate
+                 4 word elements keeping sign intact
+*/
+#define UNPCK_R_SH_SW(in, out)                       \
+{                                                    \
+    v8i16 sign_m;                                    \
+                                                     \
+    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
+    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
+}
+
 /* Description : Sign extend byte elements from input vector and return
                  halfword results in pair of vectors
    Arguments   : Inputs  - in           (1 input byte vector)
@@ -2103,6 +2132,25 @@
     out3 = in0 - in3;                                            \
 }
 
+/* Description : Butterfly of 8 input vectors
+   Arguments   : Inputs  - in0 ...  in7
+                 Outputs - out0 .. out7
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
+                    out0, out1, out2, out3, out4, out5, out6, out7)  \
+{                                                                    \
+    out0 = in0 + in7;                                                \
+    out1 = in1 + in6;                                                \
+    out2 = in2 + in5;                                                \
+    out3 = in3 + in4;                                                \
+                                                                     \
+    out4 = in3 - in4;                                                \
+    out5 = in2 - in5;                                                \
+    out6 = in1 - in6;                                                \
+    out7 = in0 - in7;                                                \
+}
+
 /* Description : Transposes input 4x4 byte block
    Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                  Outputs - out0, out1, out2, out3  (output 4x4 byte block)
@@ -2222,6 +2270,22 @@
     out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
 }
 
+/* Description : Transposes 4x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+   Details     :
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                       \
+    v8i16 s0_m, s1_m;                                                   \
+                                                                        \
+    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
+    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
+    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
+    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
+}
+
 /* Description : Transposes 8x8 block with half word elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
@@ -2272,6 +2336,38 @@
     out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
 }
 
+/* Description : Add block 4x4
+   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
+                 Outputs -
+                 Return Type - unsigned bytes
+   Details     : Least significant 4 bytes from each input vector are added to
+                 the destination bytes, clipped between 0-255 and then stored.
+*/
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
+{                                                                 \
+    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
+    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
+    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
+    v16i8 dst0_m = { 0 };                                         \
+    v16i8 dst1_m = { 0 };                                         \
+    v16i8 zero_m = { 0 };                                         \
+                                                                  \
+    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)                \
+    LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
+    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
+    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
+    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
+    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
+    CLIP_SH2_0_255(res0_m, res1_m);                               \
+    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
+                                                                  \
+    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
+    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
+    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
+    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
+    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
+}
+
 /* Description : Pack even elements of input vectors & xor with 128
    Arguments   : Inputs  - in0, in1
                  Outputs - out_m
author	Shivraj Patil <shivraj.patil@imgtec.com>	2015-06-10 19:48:40 +0530
committer	Michael Niedermayer <michaelni@gmx.at>	2015-06-11 17:10:45 +0200
commit	fb92f3ecb4d48a5612ee61aa39bd538cac9d08cf (patch)
tree	f60612a94b011b4a0fefac3e5b5d9d5b5fff886f /libavutil/mips
parent	1d70b6fe1d9d67a35daf2ec4c653ba3eff5d31b7 (diff)
download	ffmpeg-fb92f3ecb4d48a5612ee61aa39bd538cac9d08cf.tar.gz