aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/mips/h264idct_msa.c
diff options
context:
space:
mode:
author gxw <guxiwei-hf@loongson.cn> 2019-08-07 17:52:00 +0800
committer Michael Niedermayer <michael@niedermayer.cc> 2019-08-13 16:48:38 +0200
commit a3e572d96fd1dd6291f6b28e173db858c08ff8d8 (patch)
tree 85807f6ec1442cc362cf8946e67f564c92267e07 /libavcodec/mips/h264idct_msa.c
parent 8f92eb05e063e6c4d6e36521020620d4e6e1c21d (diff)
download ffmpeg-a3e572d96fd1dd6291f6b28e173db858c08ff8d8.tar.gz
avutil/mips: refine msa macros CLIP_*.
The changes are as follows: 1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in the source vector. 2. Refine the implementation of the macros 'CLIP_SH_0_255' and 'CLIP_SW_0_255'. VP8 decoding performance has improved by about 1.1% (from 7.03x to 7.11x). H264 decoding performance has improved by about 0.5% (from 4.35x to 4.37x). Theora decoding performance has improved by about 0.7% (from 5.79x to 5.83x). 3. Remove the redundant macros 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255' instead, because there is no difference in the effect of these two macros. Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libavcodec/mips/h264idct_msa.c')
-rw-r--r-- libavcodec/mips/h264idct_msa.c | 7
1 files changed, 3 insertions, 4 deletions
diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
index 7851bfdf4b..fbf7795e27 100644
--- a/libavcodec/mips/h264idct_msa.c
+++ b/libavcodec/mips/h264idct_msa.c
@@ -233,8 +233,7 @@ static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
res0, res1, res2, res3);
ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
res4, res5, res6, res7);
- CLIP_SH4_0_255(res0, res1, res2, res3);
- CLIP_SH4_0_255(res4, res5, res6, res7);
+ CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7);
PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
dst0, dst1, dst2, dst3);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
@@ -263,8 +262,8 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
dst0_r, dst1_r, dst2_r, dst3_r);
ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
dst4_r, dst5_r, dst6_r, dst7_r);
- CLIP_SH4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
- CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
+ CLIP_SH8_0_255(dst0_r, dst1_r, dst2_r, dst3_r,
+ dst4_r, dst5_r, dst6_r, dst7_r);
PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
dst0, dst1, dst2, dst3);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)