author:    周晓勇 (Zhou Xiaoyong) <zhouxiaoyong@loongson.cn>, 2015-08-06 21:27:52 +0800
committer: Michael Niedermayer <michael@niedermayer.cc>, 2015-08-09 20:43:54 +0200
commit:    7fbafd0b1bfb8bd1f318f68baa78667c82cc8c71
tree:      7e39a252d2012641fdb76189ffe3bbdc06202408
parent:    0f9d46b70d2ac6ca7fd883f19f8aa540a55ae9b8
avcodec: loongson optimize h264qpel with mmi v1
Signed-off-by: ZhouXiaoyong <zhouxiaoyong@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
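
Editor's note: the patch adds 96 MMI (Loongson SIMD) functions, put and avg variants for 16x16, 8x8 and 4x4 blocks at all 16 quarter-sample positions, and wires them into H264QpelContext through ff_h264qpel_init_mips(). As a reading aid for the mc<x><y> naming used throughout the diff (the snippet below is illustrative only, not part of the patch): x and y are the horizontal and vertical quarter-sample offsets (0..3), and the function for that offset sits at table index x + 4*y.

    #include "libavcodec/h264qpel.h"

    /* Illustration only: how the tables filled by h264qpel_init_mmi() are used.
     * The first index selects the block width: 0 = 16, 1 = 8, 2 = 4.
     * The second index is x + 4*y for the quarter-sample offset (x, y). */
    static void copy_qpel16_block(H264QpelContext *c, uint8_t *dst,
                                  const uint8_t *src, ptrdiff_t stride,
                                  int x, int y)
    {
        c->put_h264_qpel_pixels_tab[0][x + 4 * y](dst, src, stride);
    }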
-rw-r--r--  libavcodec/mips/Makefile             |    1
-rw-r--r--  libavcodec/mips/constants.c          |    6
-rw-r--r--  libavcodec/mips/constants.h          |    4
-rw-r--r--  libavcodec/mips/h264dsp_mips.h       |  200
-rw-r--r--  libavcodec/mips/h264qpel_init_mips.c |  113
-rw-r--r--  libavcodec/mips/h264qpel_mmi.c       | 2637
6 files changed, 2960 insertions, 1 deletion
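
For orientation before the patch body: the *_h_lowpass and *_v_lowpass routines below are MMI vectorizations of H.264's 6-tap (1, -5, 20, 20, -5, 1) luma half-sample filter; the paddsh/pmullh/psrah sequences built around the packed constants ff_pw_20, ff_pw_5 and ff_pw_16 compute this per pixel. A minimal scalar sketch of the horizontal case, for reference only and not part of the patch:

    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* Reference for the horizontal lowpass loops below:
     * dst[x] = clip((src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
     *                - 5*src[x+2] + src[x+3] + 16) >> 5) */
    static void put_h264_qpel_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                            int dstStride, int srcStride,
                                            int width, int height)
    {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                int v = src[x - 2] - 5 * src[x - 1] + 20 * src[x] +
                        20 * src[x + 1] - 5 * src[x + 2] + src[x + 3];
                dst[x] = clip_uint8((v + 16) >> 5);
            }
            dst += dstStride;
            src += srcStride;
        }
    }

The remaining quarter-sample positions are then formed by the pixels*_l2 helpers, which take a rounding average of this half-sample output with either the integer samples or a second half-sample result.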
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index a359788a09..2e8b1ee479 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -76,3 +76,4 @@ MMI-OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_mmi.o \
 MMI-OBJS-$(CONFIG_MPEG4_DECODER) += mips/xvid_idct_mmi.o
 MMI-OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_mmi.o
 MMI-OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_mmi.o
+MMI-OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_mmi.o
diff --git a/libavcodec/mips/constants.c b/libavcodec/mips/constants.c
index a25fd24ae0..f8130d9ee1 100644
--- a/libavcodec/mips/constants.c
+++ b/libavcodec/mips/constants.c
@@ -24,18 +24,21 @@
 #include "constants.h"

 DECLARE_ALIGNED(8, const uint64_t, ff_pw_1) = {0x0001000100010001ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_3) = {0x0003000300030003ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_4) = {0x0004000400040004ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_5) = {0x0005000500050005ULL};
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_3) = {0x0003000300030003ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_8) = {0x0008000800080008ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_9) = {0x0009000900090009ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_10) = {0x000A000A000A000AULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_16) = {0x0010001000100010ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_18) = {0x0012001200120012ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = {0x0014001400140014ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_28) = {0x001C001C001C001CULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_32) = {0x0020002000200020ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = {0x0035003500350035ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_64) = {0x0040004000400040ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = {0x0080008000800080ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_512) = {0x0200020002000200ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_m8tom5) = {0xFFFBFFFAFFF9FFF8ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_m4tom1) = {0xFFFFFFFEFFFDFFFCULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pw_1to4) = {0x0004000300020001ULL};
@@ -49,6 +52,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pb_1) = {0x0101010101010101ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3) = {0x0303030303030303ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pb_80) = {0x8080808080808080ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1) = {0xA1A1A1A1A1A1A1A1ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_FE) = {0xFEFEFEFEFEFEFEFEULL};

 DECLARE_ALIGNED(8, const uint64_t, ff_rnd) = {0x0004000400040004ULL};
 DECLARE_ALIGNED(8, const uint64_t, ff_rnd2) = {0x0040004000400040ULL};
diff --git a/libavcodec/mips/constants.h b/libavcodec/mips/constants.h
index 571002f508..0a4effdaa9 100644
--- a/libavcodec/mips/constants.h
+++ b/libavcodec/mips/constants.h
@@ -30,13 +30,16 @@ extern const uint64_t ff_pw_4;
 extern const uint64_t ff_pw_5;
 extern const uint64_t ff_pw_8;
 extern const uint64_t ff_pw_9;
+extern const uint64_t ff_pw_10;
 extern const uint64_t ff_pw_16;
 extern const uint64_t ff_pw_18;
+extern const uint64_t ff_pw_20;
 extern const uint64_t ff_pw_28;
 extern const uint64_t ff_pw_32;
 extern const uint64_t ff_pw_53;
 extern const uint64_t ff_pw_64;
 extern const uint64_t ff_pw_128;
+extern const uint64_t ff_pw_512;
 extern const uint64_t ff_pw_m8tom5;
 extern const uint64_t ff_pw_m4tom1;
 extern const uint64_t ff_pw_1to4;
@@ -50,6 +53,7 @@ extern const uint64_t ff_pb_1;
 extern const uint64_t ff_pb_3;
 extern const uint64_t ff_pb_80;
 extern const uint64_t ff_pb_A1;
+extern const uint64_t
ff_pb_FE; extern const uint64_t ff_rnd; extern const uint64_t ff_rnd2; diff --git a/libavcodec/mips/h264dsp_mips.h b/libavcodec/mips/h264dsp_mips.h index 0bc5dde696..0b72c2e3e9 100644 --- a/libavcodec/mips/h264dsp_mips.h +++ b/libavcodec/mips/h264dsp_mips.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + Zhou Xiaoyong <zhouxiaoyong@loongson.cn> * * This file is part of FFmpeg. * @@ -22,6 +23,7 @@ #define H264_DSP_MIPS_H #include "libavcodec/h264.h" +#include "constants.h" void ff_h264_h_lpf_luma_inter_msa(uint8_t *src, int stride, int alpha, int beta, int8_t *tc0); @@ -333,4 +335,202 @@ void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, int weightd, int weights, int offset); +void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); 
+void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void 
ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + #endif // #ifndef H264_DSP_MIPS_H diff --git a/libavcodec/mips/h264qpel_init_mips.c b/libavcodec/mips/h264qpel_init_mips.c index a202304346..92219f8877 100644 --- a/libavcodec/mips/h264qpel_init_mips.c +++ b/libavcodec/mips/h264qpel_init_mips.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com) + * Zhou Xiaoyong <zhouxiaoyong@loongson.cn> * * This file is part of FFmpeg. 
* @@ -128,9 +129,121 @@ static av_cold void h264qpel_init_msa(H264QpelContext *c, int bit_depth) } #endif // #if HAVE_MSA +#if HAVE_MMI +static av_cold void h264qpel_init_mmi(H264QpelContext *c, int bit_depth) +{ + if (8 == bit_depth) { + c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_mmi; + c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_mmi; + c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_mmi; + c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_mmi; + c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_mmi; + c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_mmi; + c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_mmi; + c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_mmi; + c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_mmi; + c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_mmi; + c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_mmi; + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_mmi; + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_mmi; + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_mmi; + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_mmi; + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_mmi; + + c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_mmi; + c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_mmi; + c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_mmi; + c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_mmi; + c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_mmi; + c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_mmi; + c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_mmi; + c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_mmi; + c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_mmi; + c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_mmi; + c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_mmi; + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_mmi; + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_mmi; + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_mmi; + c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_mmi; + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_mmi; + + c->put_h264_qpel_pixels_tab[2][0] = ff_put_h264_qpel4_mc00_mmi; + c->put_h264_qpel_pixels_tab[2][1] = ff_put_h264_qpel4_mc10_mmi; + c->put_h264_qpel_pixels_tab[2][2] = ff_put_h264_qpel4_mc20_mmi; + c->put_h264_qpel_pixels_tab[2][3] = ff_put_h264_qpel4_mc30_mmi; + c->put_h264_qpel_pixels_tab[2][4] = ff_put_h264_qpel4_mc01_mmi; + c->put_h264_qpel_pixels_tab[2][5] = ff_put_h264_qpel4_mc11_mmi; + c->put_h264_qpel_pixels_tab[2][6] = ff_put_h264_qpel4_mc21_mmi; + c->put_h264_qpel_pixels_tab[2][7] = ff_put_h264_qpel4_mc31_mmi; + c->put_h264_qpel_pixels_tab[2][8] = ff_put_h264_qpel4_mc02_mmi; + c->put_h264_qpel_pixels_tab[2][9] = ff_put_h264_qpel4_mc12_mmi; + c->put_h264_qpel_pixels_tab[2][10] = ff_put_h264_qpel4_mc22_mmi; + c->put_h264_qpel_pixels_tab[2][11] = ff_put_h264_qpel4_mc32_mmi; + c->put_h264_qpel_pixels_tab[2][12] = ff_put_h264_qpel4_mc03_mmi; + c->put_h264_qpel_pixels_tab[2][13] = ff_put_h264_qpel4_mc13_mmi; + c->put_h264_qpel_pixels_tab[2][14] = ff_put_h264_qpel4_mc23_mmi; + c->put_h264_qpel_pixels_tab[2][15] = ff_put_h264_qpel4_mc33_mmi; + + c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_mmi; + c->avg_h264_qpel_pixels_tab[0][1] = 
ff_avg_h264_qpel16_mc10_mmi; + c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_mmi; + c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_mmi; + c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_mmi; + c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_mmi; + c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_mmi; + c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_mmi; + c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_mmi; + c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_mmi; + c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_mmi; + c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_mmi; + c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_mmi; + c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_mmi; + c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_mmi; + c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_mmi; + + c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_mmi; + c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_mmi; + c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_mmi; + c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_mmi; + c->avg_h264_qpel_pixels_tab[1][4] = ff_avg_h264_qpel8_mc01_mmi; + c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_mmi; + c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_mmi; + c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_mmi; + c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_mmi; + c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_mmi; + c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_mmi; + c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_mmi; + c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_mmi; + c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_mmi; + c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_mmi; + c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_mmi; + + c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_h264_qpel4_mc00_mmi; + c->avg_h264_qpel_pixels_tab[2][1] = ff_avg_h264_qpel4_mc10_mmi; + c->avg_h264_qpel_pixels_tab[2][2] = ff_avg_h264_qpel4_mc20_mmi; + c->avg_h264_qpel_pixels_tab[2][3] = ff_avg_h264_qpel4_mc30_mmi; + c->avg_h264_qpel_pixels_tab[2][4] = ff_avg_h264_qpel4_mc01_mmi; + c->avg_h264_qpel_pixels_tab[2][5] = ff_avg_h264_qpel4_mc11_mmi; + c->avg_h264_qpel_pixels_tab[2][6] = ff_avg_h264_qpel4_mc21_mmi; + c->avg_h264_qpel_pixels_tab[2][7] = ff_avg_h264_qpel4_mc31_mmi; + c->avg_h264_qpel_pixels_tab[2][8] = ff_avg_h264_qpel4_mc02_mmi; + c->avg_h264_qpel_pixels_tab[2][9] = ff_avg_h264_qpel4_mc12_mmi; + c->avg_h264_qpel_pixels_tab[2][10] = ff_avg_h264_qpel4_mc22_mmi; + c->avg_h264_qpel_pixels_tab[2][11] = ff_avg_h264_qpel4_mc32_mmi; + c->avg_h264_qpel_pixels_tab[2][12] = ff_avg_h264_qpel4_mc03_mmi; + c->avg_h264_qpel_pixels_tab[2][13] = ff_avg_h264_qpel4_mc13_mmi; + c->avg_h264_qpel_pixels_tab[2][14] = ff_avg_h264_qpel4_mc23_mmi; + c->avg_h264_qpel_pixels_tab[2][15] = ff_avg_h264_qpel4_mc33_mmi; + } +} +#endif /* HAVE_MMI */ + av_cold void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth) { #if HAVE_MSA h264qpel_init_msa(c, bit_depth); #endif // #if HAVE_MSA +#if HAVE_MMI + h264qpel_init_mmi(c, bit_depth); +#endif /* HAVE_MMI */ } diff --git a/libavcodec/mips/h264qpel_mmi.c b/libavcodec/mips/h264qpel_mmi.c new file mode 100644 index 0000000000..e04a2d5936 --- /dev/null +++ b/libavcodec/mips/h264qpel_mmi.c @@ -0,0 +1,2637 @@ +/* + * 
Loongson SIMD optimized h264qpel + * + * Copyright (c) 2015 Loongson Technology Corporation Limited + * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264dsp_mips.h" +#include "libavcodec/bit_depth_template.c" + +static inline void copy_block4_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride, int h) +{ + __asm__ volatile ( + "1: \r\n" + "gslwlc1 $f2, 3(%[src]) \r\n" + "gslwrc1 $f2, 0(%[src]) \r\n" + "gsswlc1 $f2, 3(%[dst]) \r\n" + "gsswrc1 $f2, 0(%[dst]) \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "dadd %[dst], %[dst], %[dstStride] \r\n" + "daddi %[h], %[h], -1 \r\n" + "bnez %[h], 1b \r\n" + : [dst]"+&r"(dst),[src]"+&r"(src) + : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),[h]"r"(h) + : "$f2" + ); +} + +static inline void copy_block8_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride, int h) +{ + __asm__ volatile ( + "1: \r\n" + "gsldlc1 $f2, 7(%[src]) \r\n" + "gsldrc1 $f2, 0(%[src]) \r\n" + "gssdlc1 $f2, 7(%[dst]) \r\n" + "gssdrc1 $f2, 0(%[dst]) \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "dadd %[dst], %[dst], %[dstStride] \r\n" + "daddi %[h], %[h], -1 \r\n" + "bnez %[h], 1b \r\n" + : [dst]"+&r"(dst),[src]"+&r"(src) + : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),[h]"r"(h) + : "$f2" + ); +} + +static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride, int h) +{ + __asm__ volatile ( + "1: \r\n" + "gsldlc1 $f2, 7(%[src]) \r\n" + "gsldrc1 $f2, 0(%[src]) \r\n" + "gsldlc1 $f4, 15(%[src]) \r\n" + "gsldrc1 $f4, 8(%[src]) \r\n" + "gssdlc1 $f2, 7(%[dst]) \r\n" + "gssdrc1 $f2, 0(%[dst]) \r\n" + "gssdlc1 $f4, 15(%[dst]) \r\n" + "gssdrc1 $f4, 8(%[dst]) \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "dadd %[dst], %[dst], %[dstStride] \r\n" + "daddi %[h], %[h], -1 \r\n" + "bnez %[h], 1b \r\n" + : [dst]"+&r"(dst),[src]"+&r"(src) + : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),[h]"r"(h) + : "$f2","$f4" + ); +} + +#define op_put(a, b) a = b +#define op_avg(a, b) a = rnd_avg_pixel4(a, b) +static inline void put_pixels4_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + __asm__ volatile ( + "1: \r\n" + "gslwlc1 $f2, 3(%[pixels]) \r\n" + "gslwrc1 $f2, 0(%[pixels]) \r\n" + "gsswlc1 $f2, 3(%[block]) \r\n" + "gsswrc1 $f2, 0(%[block]) \r\n" + "dadd %[pixels], %[pixels], %[line_size]\r\n" + "dadd %[block], %[block], %[line_size] \r\n" + "daddi %[h], %[h], -1 \r\n" + "bnez %[h], 1b \r\n" + : [block]"+&r"(block),[pixels]"+&r"(pixels) + : [line_size]"r"(line_size),[h]"r"(h) + : "$f2" + ); +} + +static inline void put_pixels8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + __asm__ volatile ( + "1: \r\n" + "gsldlc1 $f2, 7(%[pixels]) \r\n" + "gsldrc1 
$f2, 0(%[pixels]) \r\n" + "gssdlc1 $f2, 7(%[block]) \r\n" + "gssdrc1 $f2, 0(%[block]) \r\n" + "dadd %[pixels], %[pixels], %[line_size]\r\n" + "dadd %[block], %[block], %[line_size] \r\n" + "daddi %[h], %[h], -1 \r\n" + "bnez %[h], 1b \r\n" + : [block]"+&r"(block),[pixels]"+&r"(pixels) + : [line_size]"r"(line_size),[h]"r"(h) + : "$f2" + ); +} + +static inline void put_pixels16_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + __asm__ volatile ( + "1: \r\n" + "gsldlc1 $f2, 7(%[pixels]) \r\n" + "gsldrc1 $f2, 0(%[pixels]) \r\n" + "gsldlc1 $f4, 15(%[pixels]) \r\n" + "gsldrc1 $f4, 8(%[pixels]) \r\n" + "gssdlc1 $f2, 7(%[block]) \r\n" + "gssdrc1 $f2, 0(%[block]) \r\n" + "gssdlc1 $f4, 15(%[block]) \r\n" + "gssdrc1 $f4, 8(%[block]) \r\n" + "dadd %[pixels], %[pixels], %[line_size]\r\n" + "dadd %[block], %[block], %[line_size] \r\n" + "daddi %[h], %[h], -1 \r\n" + "bnez %[h], 1b \r\n" + : [block]"+&r"(block),[pixels]"+&r"(pixels) + : [line_size]"r"(line_size),[h]"r"(h) + : "$f2","$f4" + ); +} + +static inline void avg_pixels4_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + __asm__ volatile ( + "1: \r\n" + "gslwlc1 $f2, 3(%[pixels]) \r\n" + "gslwrc1 $f2, 0(%[pixels]) \r\n" + "gslwlc1 $f4, 3(%[block]) \r\n" + "gslwrc1 $f4, 0(%[block]) \r\n" + "pavgb $f2, $f2, $f4 \r\n" + "gsswlc1 $f2, 3(%[block]) \r\n" + "gsswrc1 $f2, 0(%[block]) \r\n" + "dadd %[pixels], %[pixels], %[line_size]\r\n" + "dadd %[block], %[block], %[line_size] \r\n" + "daddi %[h], %[h], -1 \r\n" + "bnez %[h], 1b \r\n" + : [block]"+&r"(block),[pixels]"+&r"(pixels) + : [line_size]"r"(line_size),[h]"r"(h) + : "$f2","$f4" + ); +} + +static inline void avg_pixels8_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + __asm__ volatile ( + "1: \r\n" + "gsldlc1 $f2, 7(%[block]) \r\n" + "gsldrc1 $f2, 0(%[block]) \r\n" + "gsldlc1 $f4, 7(%[pixels]) \r\n" + "gsldrc1 $f4, 0(%[pixels]) \r\n" + "pavgb $f2, $f2, $f4 \r\n" + "gssdlc1 $f2, 7(%[block]) \r\n" + "gssdrc1 $f2, 0(%[block]) \r\n" + "dadd %[pixels], %[pixels], %[line_size]\r\n" + "dadd %[block], %[block], %[line_size] \r\n" + "daddi %[h], %[h], -1 \r\n" + "bnez %[h], 1b \r\n" + : [block]"+&r"(block),[pixels]"+&r"(pixels) + : [line_size]"r"(line_size),[h]"r"(h) + : "$f2","$f4" + ); +} + +static inline void avg_pixels16_mmi(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + __asm__ volatile ( + "1: \r\n" + "gsldlc1 $f2, 7(%[block]) \r\n" + "gsldrc1 $f2, 0(%[block]) \r\n" + "gsldlc1 $f4, 15(%[block]) \r\n" + "gsldrc1 $f4, 8(%[block]) \r\n" + "gsldlc1 $f6, 7(%[pixels]) \r\n" + "gsldrc1 $f6, 0(%[pixels]) \r\n" + "gsldlc1 $f8, 15(%[pixels]) \r\n" + "gsldrc1 $f8, 8(%[pixels]) \r\n" + "pavgb $f2, $f2, $f6 \r\n" + "pavgb $f4, $f4, $f8 \r\n" + "gssdlc1 $f2, 7(%[block]) \r\n" + "gssdrc1 $f2, 0(%[block]) \r\n" + "gssdlc1 $f4, 15(%[block]) \r\n" + "gssdrc1 $f4, 8(%[block]) \r\n" + "dadd %[pixels], %[pixels], %[line_size]\r\n" + "dadd %[block], %[block], %[line_size] \r\n" + "daddi %[h], %[h], -1 \r\n" + "bnez %[h], 1b \r\n" + : [block]"+&r"(block),[pixels]"+&r"(pixels) + : [line_size]"r"(line_size),[h]"r"(h) + : "$f2","$f4","$f6","$f8" + ); +} + +static inline void put_pixels4_l2_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + int i; + for (i = 0; i < h; i++) { + pixel4 a, b; + a = AV_RN4P(&src1[i * src_stride1]); + b = AV_RN4P(&src2[i * src_stride2]); + op_put(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b)); + } +} + 
+static inline void put_pixels8_l2_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + int i; + for (i = 0; i < h; i++) { + pixel4 a, b; + a = AV_RN4P(&src1[i * src_stride1]); + b = AV_RN4P(&src2[i * src_stride2]); + op_put(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b)); + a = AV_RN4P(&src1[i * src_stride1 + 4]); + b = AV_RN4P(&src2[i * src_stride2 + 4]); + op_put(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b)); + } +} + +static inline void put_pixels16_l2_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + int i; + for (i = 0; i < h; i++) { + pixel4 a, b; + a = AV_RN4P(&src1[i * src_stride1]); + b = AV_RN4P(&src2[i * src_stride2]); + op_put(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b)); + a = AV_RN4P(&src1[i * src_stride1 + 4]); + b = AV_RN4P(&src2[i * src_stride2 + 4]); + op_put(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b)); + a = AV_RN4P(&src1[i * src_stride1 + 8]); + b = AV_RN4P(&src2[i * src_stride2 + 8]); + op_put(*((pixel4 *) &dst[i * dst_stride + 8]), rnd_avg_pixel4(a, b)); + a = AV_RN4P(&src1[i * src_stride1 + 12]); + b = AV_RN4P(&src2[i * src_stride2 + 12]); + op_put(*((pixel4 *) &dst[i * dst_stride + 12]), rnd_avg_pixel4(a, b)); + } +} + +static inline void avg_pixels4_l2_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + int i; + for (i = 0; i < h; i++) { + pixel4 a, b; + a = AV_RN4P(&src1[i * src_stride1]); + b = AV_RN4P(&src2[i * src_stride2]); + op_avg(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b)); + } +} + +static inline void avg_pixels8_l2_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + int i; + for (i = 0; i < h; i++) { + pixel4 a, b; + a = AV_RN4P(&src1[i * src_stride1]); + b = AV_RN4P(&src2[i * src_stride2]); + op_avg(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b)); + a = AV_RN4P(&src1[i * src_stride1 + 4]); + b = AV_RN4P(&src2[i * src_stride2 + 4]); + op_avg(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b)); + } +} + +static inline void avg_pixels16_l2_mmi(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, + int h) +{ + int i; + for (i = 0; i < h; i++) { + pixel4 a, b; + a = AV_RN4P(&src1[i * src_stride1]); + b = AV_RN4P(&src2[i * src_stride2]); + op_avg(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b)); + a = AV_RN4P(&src1[i * src_stride1 + 4]); + b = AV_RN4P(&src2[i * src_stride2 + 4]); + op_avg(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b)); + a = AV_RN4P(&src1[i * src_stride1 + 8]); + b = AV_RN4P(&src2[i * src_stride2 + 8]); + op_avg(*((pixel4 *) &dst[i * dst_stride + 8]), rnd_avg_pixel4(a, b)); + a = AV_RN4P(&src1[i * src_stride1 + 12]); + b = AV_RN4P(&src2[i * src_stride2 + 12]); + op_avg(*((pixel4 *) &dst[i * dst_stride + 12]), rnd_avg_pixel4(a, b)); + + } +} +#undef op_put +#undef op_avg + +#define op2_avg(a, b) a = (((a)+CLIP(((b) + 512)>>10)+1)>>1) +#define op2_put(a, b) a = CLIP(((b) + 512)>>10) +static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "dli $8, 4 \r\n" + "1: \r\n" + "gslwlc1 $f2, 1(%[src]) \r\n" + "gslwrc1 $f2, -2(%[src]) \r\n" + "gslwlc1 $f4, 2(%[src]) \r\n" + "gslwrc1 $f4, 
-1(%[src]) \r\n" + "gslwlc1 $f6, 3(%[src]) \r\n" + "gslwrc1 $f6, 0(%[src]) \r\n" + "gslwlc1 $f8, 4(%[src]) \r\n" + "gslwrc1 $f8, 1(%[src]) \r\n" + "gslwlc1 $f10, 5(%[src]) \r\n" + "gslwrc1 $f10, 2(%[src]) \r\n" + "gslwlc1 $f12, 6(%[src]) \r\n" + "gslwrc1 $f12, 3(%[src]) \r\n" + "punpcklbh $f2, $f2, $f0 \r\n" + "punpcklbh $f4, $f4, $f0 \r\n" + "punpcklbh $f6, $f6, $f0 \r\n" + "punpcklbh $f8, $f8, $f0 \r\n" + "punpcklbh $f10, $f10, $f0 \r\n" + "punpcklbh $f12, $f12, $f0 \r\n" + "paddsh $f14, $f6, $f8 \r\n" + "paddsh $f16, $f4, $f10 \r\n" + "paddsh $f18, $f2, $f12 \r\n" + "pmullh $f14, $f14, %[ff_pw_20] \r\n" + "pmullh $f16, $f16, %[ff_pw_5] \r\n" + "psubsh $f14, $f14, $f16 \r\n" + "paddsh $f18, $f14, $f18 \r\n" + "paddsh $f18, $f18, %[ff_pw_16] \r\n" + "psrah $f18, $f18, %[ff_pw_5] \r\n" + "packushb $f18, $f18, $f0 \r\n" + "gsswlc1 $f18, 3(%[dst]) \r\n" + "gsswrc1 $f18, 0(%[dst]) \r\n" + "dadd %[dst], %[dst], %[dstStride] \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "daddi $8, $8, -1 \r\n" + "bnez $8, 1b \r\n" + : [dst]"+&r"(dst),[src]"+&r"(src) + : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride), + [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16) + : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16", + "$f18" + ); +} + +static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "dli $8, 8 \r\n" + "1: \r\n" + "gsldlc1 $f2, 5(%[src]) \r\n" + "gsldrc1 $f2, -2(%[src]) \r\n" + "gsldlc1 $f4, 6(%[src]) \r\n" + "gsldrc1 $f4, -1(%[src]) \r\n" + "gsldlc1 $f6, 7(%[src]) \r\n" + "gsldrc1 $f6, 0(%[src]) \r\n" + "gsldlc1 $f8, 8(%[src]) \r\n" + "gsldrc1 $f8, 1(%[src]) \r\n" + "gsldlc1 $f10, 9(%[src]) \r\n" + "gsldrc1 $f10, 2(%[src]) \r\n" + "gsldlc1 $f12, 10(%[src]) \r\n" + "gsldrc1 $f12, 3(%[src]) \r\n" + "punpcklbh $f14, $f6, $f0 \r\n" + "punpckhbh $f16, $f6, $f0 \r\n" + "punpcklbh $f18, $f8, $f0 \r\n" + "punpckhbh $f20, $f8, $f0 \r\n" + "paddsh $f6, $f14, $f18 \r\n" + "paddsh $f8, $f16, $f20 \r\n" + "pmullh $f6, $f6, %[ff_pw_20] \r\n" + "pmullh $f8, $f8, %[ff_pw_20] \r\n" + "punpcklbh $f14, $f4, $f0 \r\n" + "punpckhbh $f16, $f4, $f0 \r\n" + "punpcklbh $f18, $f10, $f0 \r\n" + "punpckhbh $f20, $f10, $f0 \r\n" + "paddsh $f4, $f14, $f18 \r\n" + "paddsh $f10, $f16, $f20 \r\n" + "pmullh $f4, $f4, %[ff_pw_5] \r\n" + "pmullh $f10, $f10, %[ff_pw_5] \r\n" + "punpcklbh $f14, $f2, $f0 \r\n" + "punpckhbh $f16, $f2, $f0 \r\n" + "punpcklbh $f18, $f12, $f0 \r\n" + "punpckhbh $f20, $f12, $f0 \r\n" + "paddsh $f2, $f14, $f18 \r\n" + "paddsh $f12, $f16, $f20 \r\n" + "psubsh $f6, $f6, $f4 \r\n" + "psubsh $f8, $f8, $f10 \r\n" + "paddsh $f6, $f6, $f2 \r\n" + "paddsh $f8, $f8, $f12 \r\n" + "paddsh $f6, $f6, %[ff_pw_16] \r\n" + "paddsh $f8, $f8, %[ff_pw_16] \r\n" + "psrah $f6, $f6, %[ff_pw_5] \r\n" + "psrah $f8, $f8, %[ff_pw_5] \r\n" + "packushb $f18, $f6, $f8 \r\n" + "sdc1 $f18, 0(%[dst]) \r\n" + "dadd %[dst], %[dst], %[dstStride] \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "daddi $8, $8, -1 \r\n" + "bnez $8, 1b \r\n" + : [dst]"+&r"(dst),[src]"+&r"(src) + : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride), + [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16) + : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16", + "$f18","$f20" + ); +} + +static void put_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride); + 
put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride); + put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "dli $8, 4 \r\n" + "1: \r\n" + "gslwlc1 $f2, 1(%[src]) \r\n" + "gslwrc1 $f2, -2(%[src]) \r\n" + "gslwlc1 $f4, 2(%[src]) \r\n" + "gslwrc1 $f4, -1(%[src]) \r\n" + "gslwlc1 $f6, 3(%[src]) \r\n" + "gslwrc1 $f6, 0(%[src]) \r\n" + "gslwlc1 $f8, 4(%[src]) \r\n" + "gslwrc1 $f8, 1(%[src]) \r\n" + "gslwlc1 $f10, 5(%[src]) \r\n" + "gslwrc1 $f10, 2(%[src]) \r\n" + "gslwlc1 $f12, 6(%[src]) \r\n" + "gslwrc1 $f12, 3(%[src]) \r\n" + "punpcklbh $f2, $f2, $f0 \r\n" + "punpcklbh $f4, $f4, $f0 \r\n" + "punpcklbh $f6, $f6, $f0 \r\n" + "punpcklbh $f8, $f8, $f0 \r\n" + "punpcklbh $f10, $f10, $f0 \r\n" + "punpcklbh $f12, $f12, $f0 \r\n" + "paddsh $f14, $f6, $f8 \r\n" + "paddsh $f16, $f4, $f10 \r\n" + "paddsh $f18, $f2, $f12 \r\n" + "pmullh $f14, $f14, %[ff_pw_20] \r\n" + "pmullh $f16, $f16, %[ff_pw_5] \r\n" + "psubsh $f14, $f14, $f16 \r\n" + "paddsh $f18, $f14, $f18 \r\n" + "paddsh $f18, $f18, %[ff_pw_16] \r\n" + "psrah $f18, $f18, %[ff_pw_5] \r\n" + "packushb $f18, $f18, $f0 \r\n" + "lwc1 $f20, 0(%[dst]) \r\n" + "pavgb $f18, $f18, $f20 \r\n" + "gsswlc1 $f18, 3(%[dst]) \r\n" + "gsswrc1 $f18, 0(%[dst]) \r\n" + "dadd %[dst], %[dst], %[dstStride] \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "daddi $8, $8, -1 \r\n" + "bnez $8, 1b \r\n" + : [dst]"+&r"(dst),[src]"+&r"(src) + : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride), + [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16) + : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16", + "$f18","$f20" + ); +} + +static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "dli $8, 8 \r\n" + "1: \r\n" + "gsldlc1 $f2, 5(%[src]) \r\n" + "gsldrc1 $f2, -2(%[src]) \r\n" + "gsldlc1 $f4, 6(%[src]) \r\n" + "gsldrc1 $f4, -1(%[src]) \r\n" + "gsldlc1 $f6, 7(%[src]) \r\n" + "gsldrc1 $f6, 0(%[src]) \r\n" + "gsldlc1 $f8, 8(%[src]) \r\n" + "gsldrc1 $f8, 1(%[src]) \r\n" + "gsldlc1 $f10, 9(%[src]) \r\n" + "gsldrc1 $f10, 2(%[src]) \r\n" + "gsldlc1 $f12, 10(%[src]) \r\n" + "gsldrc1 $f12, 3(%[src]) \r\n" + "punpcklbh $f14, $f6, $f0 \r\n" + "punpckhbh $f16, $f6, $f0 \r\n" + "punpcklbh $f18, $f8, $f0 \r\n" + "punpckhbh $f20, $f8, $f0 \r\n" + "paddsh $f6, $f14, $f18 \r\n" + "paddsh $f8, $f16, $f20 \r\n" + "pmullh $f6, $f6, %[ff_pw_20] \r\n" + "pmullh $f8, $f8, %[ff_pw_20] \r\n" + "punpcklbh $f14, $f4, $f0 \r\n" + "punpckhbh $f16, $f4, $f0 \r\n" + "punpcklbh $f18, $f10, $f0 \r\n" + "punpckhbh $f20, $f10, $f0 \r\n" + "paddsh $f4, $f14, $f18 \r\n" + "paddsh $f10, $f16, $f20 \r\n" + "pmullh $f4, $f4, %[ff_pw_5] \r\n" + "pmullh $f10, $f10, %[ff_pw_5] \r\n" + "punpcklbh $f14, $f2, $f0 \r\n" + "punpckhbh $f16, $f2, $f0 \r\n" + "punpcklbh $f18, $f12, $f0 \r\n" + "punpckhbh $f20, $f12, $f0 \r\n" + "paddsh $f2, $f14, $f18 \r\n" + "paddsh $f12, $f16, $f20 \r\n" + "psubsh $f6, $f6, $f4 \r\n" + "psubsh $f8, $f8, $f10 \r\n" + "paddsh $f6, $f6, $f2 \r\n" + "paddsh $f8, $f8, $f12 \r\n" + "paddsh $f6, $f6, %[ff_pw_16] \r\n" + "paddsh $f8, $f8, %[ff_pw_16] \r\n" + "psrah $f6, $f6, %[ff_pw_5] \r\n" + "psrah $f8, $f8, %[ff_pw_5] \r\n" + "packushb $f18, $f6, $f8 \r\n" + "ldc1 
$f20, 0(%[dst]) \r\n" + "pavgb $f18, $f18, $f20 \r\n" + "sdc1 $f18, 0(%[dst]) \r\n" + "dadd %[dst], %[dst], %[dstStride] \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "daddi $8, $8, -1 \r\n" + "bnez $8, 1b \r\n" + : [dst]"+&r"(dst),[src]"+&r"(src) + : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride), + [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16) + : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16", + "$f18","$f20" + ); +} + +static void avg_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "gslwlc1 $f2, 3(%[srcB]) \r\n" + "gslwrc1 $f2, 0(%[srcB]) \r\n" + "gslwlc1 $f4, 3(%[srcA]) \r\n" + "gslwrc1 $f4, 0(%[srcA]) \r\n" + "gslwlc1 $f6, 3(%[src0]) \r\n" + "gslwrc1 $f6, 0(%[src0]) \r\n" + "gslwlc1 $f8, 3(%[src1]) \r\n" + "gslwrc1 $f8, 0(%[src1]) \r\n" + "gslwlc1 $f10, 3(%[src2]) \r\n" + "gslwrc1 $f10, 0(%[src2]) \r\n" + "gslwlc1 $f12, 3(%[src3]) \r\n" + "gslwrc1 $f12, 0(%[src3]) \r\n" + "gslwlc1 $f14, 3(%[src4]) \r\n" + "gslwrc1 $f14, 0(%[src4]) \r\n" + "gslwlc1 $f16, 3(%[src5]) \r\n" + "gslwrc1 $f16, 0(%[src5]) \r\n" + "gslwlc1 $f18, 3(%[src6]) \r\n" + "gslwrc1 $f18, 0(%[src6]) \r\n" + "punpcklbh $f2, $f2, $f0 \r\n" + "punpcklbh $f4, $f4, $f0 \r\n" + "punpcklbh $f6, $f6, $f0 \r\n" + "punpcklbh $f8, $f8, $f0 \r\n" + "punpcklbh $f10, $f10, $f0 \r\n" + "punpcklbh $f12, $f12, $f0 \r\n" + "punpcklbh $f14, $f14, $f0 \r\n" + "punpcklbh $f16, $f16, $f0 \r\n" + "punpcklbh $f18, $f18, $f0 \r\n" + "paddsh $f20, $f6, $f8 \r\n" + "pmullh $f20, $f20, %[ff_pw_20] \r\n" + "paddsh $f22, $f4, $f10 \r\n" + "pmullh $f22, $f22, %[ff_pw_5] \r\n" + "psubsh $f24, $f20, $f22 \r\n" + "paddsh $f24, $f24, $f2 \r\n" + "paddsh $f24, $f24, $f12 \r\n" + "paddsh $f20, $f8, $f10 \r\n" + "pmullh $f20, $f20, %[ff_pw_20] \r\n" + "paddsh $f22, $f6, $f12 \r\n" + "pmullh $f22, $f22, %[ff_pw_5] \r\n" + "psubsh $f26, $f20, $f22 \r\n" + "paddsh $f26, $f26, $f4 \r\n" + "paddsh $f26, $f26, $f14 \r\n" + "paddsh $f20, $f10, $f12 \r\n" + "pmullh $f20, $f20, %[ff_pw_20] \r\n" + "paddsh $f22, $f8, $f14 \r\n" + "pmullh $f22, $f22, %[ff_pw_5] \r\n" + "psubsh $f28, $f20, $f22 \r\n" + "paddsh $f28, $f28, $f6 \r\n" + "paddsh $f28, $f28, $f16 \r\n" + "paddsh $f20, $f12, $f14 \r\n" + "pmullh $f20, $f20, %[ff_pw_20] \r\n" + "paddsh $f22, $f10, $f16 \r\n" + "pmullh $f22, $f22, %[ff_pw_5] \r\n" + "psubsh $f30, $f20, $f22 \r\n" + "paddsh $f30, $f30, $f8 \r\n" + "paddsh $f30, $f30, $f18 \r\n" + "paddsh $f24, $f24, %[ff_pw_16] \r\n" + "paddsh $f26, $f26, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "paddsh $f30, $f30, %[ff_pw_16] \r\n" + "psrah $f24, $f24, %[ff_pw_5] \r\n" + "psrah $f26, $f26, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "psrah $f30, $f30, %[ff_pw_5] \r\n" + "packushb $f24, $f24, $f0 \r\n" + "packushb $f26, $f26, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "packushb $f30, $f30, $f0 \r\n" + "swc1 $f24, 0(%[dst0]) \r\n" + "swc1 $f26, 0(%[dst1]) \r\n" + "swc1 $f28, 0(%[dst2]) \r\n" + "swc1 $f30, 0(%[dst3]) \r\n" + ::[dst0]"r"(dst), [dst1]"r"(dst+dstStride), 
+ [dst2]"r"(dst+2*dstStride), [dst3]"r"(dst+3*dstStride), + [srcB]"r"(src-2*srcStride), [srcA]"r"(src-srcStride), + [src0]"r"(src), [src1]"r"(src+srcStride), + [src2]"r"(src+2*srcStride), [src3]"r"(src+3*srcStride), + [src4]"r"(src+4*srcStride), [src5]"r"(src+5*srcStride), + [src6]"r"(src+6*srcStride), [ff_pw_20]"f"(ff_pw_20), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18", + "$f20","$f22","$f24","$f26","$f28","$f30" + ); +} + +static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "gsldlc1 $f2, 7(%[srcB]) \r\n" + "gsldrc1 $f2, 0(%[srcB]) \r\n" + "gsldlc1 $f4, 7(%[srcA]) \r\n" + "gsldrc1 $f4, 0(%[srcA]) \r\n" + "gsldlc1 $f6, 7(%[src0]) \r\n" + "gsldrc1 $f6, 0(%[src0]) \r\n" + "gsldlc1 $f8, 7(%[src1]) \r\n" + "gsldrc1 $f8, 0(%[src1]) \r\n" + "gsldlc1 $f10, 7(%[src2]) \r\n" + "gsldrc1 $f10, 0(%[src2]) \r\n" + "gsldlc1 $f12, 7(%[src3]) \r\n" + "gsldrc1 $f12, 0(%[src3]) \r\n" + "gsldlc1 $f14, 7(%[src4]) \r\n" + "gsldrc1 $f14, 0(%[src4]) \r\n" + "gsldlc1 $f16, 7(%[src5]) \r\n" + "gsldrc1 $f16, 0(%[src5]) \r\n" + "gsldlc1 $f18, 7(%[src6]) \r\n" + "gsldrc1 $f18, 0(%[src6]) \r\n" + "gsldlc1 $f20, 7(%[src7]) \r\n" + "gsldrc1 $f20, 0(%[src7]) \r\n" + "gsldlc1 $f22, 7(%[src8]) \r\n" + "gsldrc1 $f22, 0(%[src8]) \r\n" + "gsldlc1 $f24, 7(%[src9]) \r\n" + "gsldrc1 $f24, 0(%[src9]) \r\n" + "gsldlc1 $f26, 7(%[src10]) \r\n" + "gsldrc1 $f26, 0(%[src10]) \r\n" + "punpcklbh $f1, $f2, $f0 \r\n" + "punpckhbh $f2, $f2, $f0 \r\n" + "punpcklbh $f3, $f4, $f0 \r\n" + "punpckhbh $f4, $f4, $f0 \r\n" + "punpcklbh $f5, $f6, $f0 \r\n" + "punpckhbh $f6, $f6, $f0 \r\n" + "punpcklbh $f7, $f8, $f0 \r\n" + "punpckhbh $f8, $f8, $f0 \r\n" + "punpcklbh $f9, $f10, $f0 \r\n" + "punpckhbh $f10, $f10, $f0 \r\n" + "punpcklbh $f11, $f12, $f0 \r\n" + "punpckhbh $f12, $f12, $f0 \r\n" + "punpcklbh $f13, $f14, $f0 \r\n" + "punpckhbh $f14, $f14, $f0 \r\n" + "punpcklbh $f15, $f16, $f0 \r\n" + "punpckhbh $f16, $f16, $f0 \r\n" + "punpcklbh $f17, $f18, $f0 \r\n" + "punpckhbh $f18, $f18, $f0 \r\n" + "punpcklbh $f19, $f20, $f0 \r\n" + "punpckhbh $f20, $f20, $f0 \r\n" + "punpcklbh $f21, $f22, $f0 \r\n" + "punpckhbh $f22, $f22, $f0 \r\n" + "punpcklbh $f23, $f24, $f0 \r\n" + "punpckhbh $f24, $f24, $f0 \r\n" + "punpcklbh $f25, $f26, $f0 \r\n" + "punpckhbh $f26, $f26, $f0 \r\n" + "paddsh $f27, $f5, $f7 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f6, $f8 \r\n"//src0+src1 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f3 \r\n" + "psubsh $f28, $f28, $f4 \r\n" + "psubsh $f27, $f27, $f9 \r\n" + "psubsh $f28, $f28, $f10 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f1 \r\n" + "paddsh $f28, $f28, $f2 \r\n" + "paddsh $f27, $f27, $f11 \r\n" + "paddsh $f28, $f28, $f12 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f2, $f27, $f28 \r\n" + "sdc1 $f2, 0(%[dst0]) \r\n" + "paddsh $f27, $f7, $f9 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f8, $f10 \r\n"//src1+src2 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f5 \r\n" + "psubsh $f28, $f28, $f6 \r\n" + "psubsh $f27, $f27, $f11 \r\n" + "psubsh $f28, $f28, $f12 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, 
%[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f3 \r\n" + "paddsh $f28, $f28, $f4 \r\n" + "paddsh $f27, $f27, $f13 \r\n" + "paddsh $f28, $f28, $f14 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f4, $f27, $f28 \r\n" + "sdc1 $f4, 0(%[dst1]) \r\n" + "paddsh $f27, $f9, $f11 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f10, $f12 \r\n"//src2+src3 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f7 \r\n" + "psubsh $f28, $f28, $f8 \r\n" + "psubsh $f27, $f27, $f13 \r\n" + "psubsh $f28, $f28, $f14 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f5 \r\n" + "paddsh $f28, $f28, $f6 \r\n" + "paddsh $f27, $f27, $f15 \r\n" + "paddsh $f28, $f28, $f16 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f6, $f27, $f28 \r\n" + "sdc1 $f6, 0(%[dst2]) \r\n" + "paddsh $f27, $f11, $f13 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f12, $f14 \r\n"//src3+src4 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f9 \r\n" + "psubsh $f28, $f28, $f10 \r\n" + "psubsh $f27, $f27, $f15 \r\n" + "psubsh $f28, $f28, $f16 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f7 \r\n" + "paddsh $f28, $f28, $f8 \r\n" + "paddsh $f27, $f27, $f17 \r\n" + "paddsh $f28, $f28, $f18 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f8, $f27, $f28 \r\n" + "sdc1 $f8, 0(%[dst3]) \r\n" + "paddsh $f27, $f13, $f15 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f14, $f16 \r\n"//src4+src5 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f11 \r\n" + "psubsh $f28, $f28, $f12 \r\n" + "psubsh $f27, $f27, $f17 \r\n" + "psubsh $f28, $f28, $f18 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f9 \r\n" + "paddsh $f28, $f28, $f10 \r\n" + "paddsh $f27, $f27, $f19 \r\n" + "paddsh $f28, $f28, $f20 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f10, $f27, $f28 \r\n" + "sdc1 $f10, 0(%[dst4]) \r\n" + + "paddsh $f27, $f15, $f17 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f16, $f18 \r\n"//src5+src6 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f13 \r\n" + "psubsh $f28, $f28, $f14 \r\n" + "psubsh $f27, $f27, $f19 \r\n" + "psubsh $f28, $f28, $f20 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f11 \r\n" + "paddsh $f28, $f28, $f12 \r\n" + "paddsh $f27, $f27, $f21 \r\n" + "paddsh $f28, $f28, $f22 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f12, $f27, $f28 \r\n" + "sdc1 
$f12, 0(%[dst5]) \r\n" + "paddsh $f27, $f17, $f19 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f18, $f20 \r\n"//src6+src7 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f15 \r\n" + "psubsh $f28, $f28, $f16 \r\n" + "psubsh $f27, $f27, $f21 \r\n" + "psubsh $f28, $f28, $f22 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f13 \r\n" + "paddsh $f28, $f28, $f14 \r\n" + "paddsh $f27, $f27, $f23 \r\n" + "paddsh $f28, $f28, $f24 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f14, $f27, $f28 \r\n" + "sdc1 $f14, 0(%[dst6]) \r\n" + "paddsh $f27, $f19, $f21 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f20, $f22 \r\n"//src7+src8 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f17 \r\n" + "psubsh $f28, $f28, $f18 \r\n" + "psubsh $f27, $f27, $f23 \r\n" + "psubsh $f28, $f28, $f24 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f15 \r\n" + "paddsh $f28, $f28, $f16 \r\n" + "paddsh $f27, $f27, $f25 \r\n" + "paddsh $f28, $f28, $f26 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f16, $f27, $f28 \r\n" + "sdc1 $f16, 0(%[dst7]) \r\n" + ::[dst0]"r"(dst), [dst1]"r"(dst+dstStride), + [dst2]"r"(dst+2*dstStride), [dst3]"r"(dst+3*dstStride), + [dst4]"r"(dst+4*dstStride), [dst5]"r"(dst+5*dstStride), + [dst6]"r"(dst+6*dstStride), [dst7]"r"(dst+7*dstStride), + [srcB]"r"(src-2*srcStride), [srcA]"r"(src-srcStride), + [src0]"r"(src), [src1]"r"(src+srcStride), + [src2]"r"(src+2*srcStride), [src3]"r"(src+3*srcStride), + [src4]"r"(src+4*srcStride), [src5]"r"(src+5*srcStride), + [src6]"r"(src+6*srcStride), [src7]"r"(src+7*srcStride), + [src8]"r"(src+8*srcStride), [src9]"r"(src+9*srcStride), + [src10]"r"(src+10*srcStride), [ff_pw_4]"f"(ff_pw_4), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9","$f10", + "$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18","$f19", + "$f20","$f21","$f22","$f23","$f24","$f25","$f26","$f27","$f28" + ); +} + +static void put_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride); + put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride); + put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "gslwlc1 $f2, 3(%[srcB]) \r\n" + "gslwrc1 $f2, 0(%[srcB]) \r\n" + "gslwlc1 $f4, 3(%[srcA]) \r\n" + "gslwrc1 $f4, 0(%[srcA]) \r\n" + "gslwlc1 $f6, 3(%[src0]) \r\n" + "gslwrc1 $f6, 0(%[src0]) \r\n" + "gslwlc1 $f8, 3(%[src1]) \r\n" + "gslwrc1 $f8, 0(%[src1]) \r\n" + "gslwlc1 $f10, 3(%[src2]) \r\n" + "gslwrc1 $f10, 0(%[src2]) \r\n" + "gslwlc1 $f12, 3(%[src3]) \r\n" + "gslwrc1 $f12, 0(%[src3]) \r\n" + "gslwlc1 $f14, 3(%[src4]) \r\n" + "gslwrc1 $f14, 0(%[src4]) \r\n" + "gslwlc1 $f16, 3(%[src5]) \r\n" 
+ "gslwrc1 $f16, 0(%[src5]) \r\n" + "gslwlc1 $f18, 3(%[src6]) \r\n" + "gslwrc1 $f18, 0(%[src6]) \r\n" + "punpcklbh $f2, $f2, $f0 \r\n" + "punpcklbh $f4, $f4, $f0 \r\n" + "punpcklbh $f6, $f6, $f0 \r\n" + "punpcklbh $f8, $f8, $f0 \r\n" + "punpcklbh $f10, $f10, $f0 \r\n" + "punpcklbh $f12, $f12, $f0 \r\n" + "punpcklbh $f14, $f14, $f0 \r\n" + "punpcklbh $f16, $f16, $f0 \r\n" + "punpcklbh $f18, $f18, $f0 \r\n" + "paddsh $f20, $f6, $f8 \r\n" + "pmullh $f20, $f20, %[ff_pw_20] \r\n" + "paddsh $f22, $f4, $f10 \r\n" + "pmullh $f22, $f22, %[ff_pw_5] \r\n" + "psubsh $f24, $f20, $f22 \r\n" + "paddsh $f24, $f24, $f2 \r\n" + "paddsh $f24, $f24, $f12 \r\n" + "paddsh $f20, $f8, $f10 \r\n" + "pmullh $f20, $f20, %[ff_pw_20] \r\n" + "paddsh $f22, $f6, $f12 \r\n" + "pmullh $f22, $f22, %[ff_pw_5] \r\n" + "psubsh $f26, $f20, $f22 \r\n" + "paddsh $f26, $f26, $f4 \r\n" + "paddsh $f26, $f26, $f14 \r\n" + "paddsh $f20, $f10, $f12 \r\n" + "pmullh $f20, $f20, %[ff_pw_20] \r\n" + "paddsh $f22, $f8, $f14 \r\n" + "pmullh $f22, $f22, %[ff_pw_5] \r\n" + "psubsh $f28, $f20, $f22 \r\n" + "paddsh $f28, $f28, $f6 \r\n" + "paddsh $f28, $f28, $f16 \r\n" + "paddsh $f20, $f12, $f14 \r\n" + "pmullh $f20, $f20, %[ff_pw_20] \r\n" + "paddsh $f22, $f10, $f16 \r\n" + "pmullh $f22, $f22, %[ff_pw_5] \r\n" + "psubsh $f30, $f20, $f22 \r\n" + "paddsh $f30, $f30, $f8 \r\n" + "paddsh $f30, $f30, $f18 \r\n" + "paddsh $f24, $f24, %[ff_pw_16] \r\n" + "paddsh $f26, $f26, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "paddsh $f30, $f30, %[ff_pw_16] \r\n" + "psrah $f24, $f24, %[ff_pw_5] \r\n" + "psrah $f26, $f26, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "psrah $f30, $f30, %[ff_pw_5] \r\n" + "packushb $f24, $f24, $f0 \r\n" + "packushb $f26, $f26, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "packushb $f30, $f30, $f0 \r\n" + "lwc1 $f2, 0(%[dst0]) \r\n" + "lwc1 $f4, 0(%[dst1]) \r\n" + "lwc1 $f6, 0(%[dst2]) \r\n" + "lwc1 $f8, 0(%[dst3]) \r\n" + "pavgb $f24, $f2, $f24 \r\n" + "pavgb $f26, $f4, $f26 \r\n" + "pavgb $f28, $f6, $f28 \r\n" + "pavgb $f30, $f8, $f30 \r\n" + "swc1 $f24, 0(%[dst0]) \r\n" + "swc1 $f26, 0(%[dst1]) \r\n" + "swc1 $f28, 0(%[dst2]) \r\n" + "swc1 $f30, 0(%[dst3]) \r\n" + ::[dst0]"r"(dst), [dst1]"r"(dst+dstStride), + [dst2]"r"(dst+2*dstStride), [dst3]"r"(dst+3*dstStride), + [srcB]"r"(src-2*srcStride), [srcA]"r"(src-srcStride), + [src0]"r"(src), [src1]"r"(src+srcStride), + [src2]"r"(src+2*srcStride), [src3]"r"(src+3*srcStride), + [src4]"r"(src+4*srcStride), [src5]"r"(src+5*srcStride), + [src6]"r"(src+6*srcStride), [ff_pw_20]"f"(ff_pw_20), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18", + "$f20","$f22","$f24","$f26","$f28","$f30" + ); +} + +static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "gsldlc1 $f2, 7(%[srcB]) \r\n" + "gsldrc1 $f2, 0(%[srcB]) \r\n" + "gsldlc1 $f4, 7(%[srcA]) \r\n" + "gsldrc1 $f4, 0(%[srcA]) \r\n" + "gsldlc1 $f6, 7(%[src0]) \r\n" + "gsldrc1 $f6, 0(%[src0]) \r\n" + "gsldlc1 $f8, 7(%[src1]) \r\n" + "gsldrc1 $f8, 0(%[src1]) \r\n" + "gsldlc1 $f10, 7(%[src2]) \r\n" + "gsldrc1 $f10, 0(%[src2]) \r\n" + "gsldlc1 $f12, 7(%[src3]) \r\n" + "gsldrc1 $f12, 0(%[src3]) \r\n" + "gsldlc1 $f14, 7(%[src4]) \r\n" + "gsldrc1 $f14, 0(%[src4]) \r\n" + "gsldlc1 $f16, 7(%[src5]) \r\n" + "gsldrc1 $f16, 0(%[src5]) \r\n" + "gsldlc1 $f18, 7(%[src6]) \r\n" + "gsldrc1 $f18, 0(%[src6]) \r\n" + "gsldlc1 $f20, 7(%[src7]) \r\n" + "gsldrc1 $f20, 
0(%[src7]) \r\n" + "gsldlc1 $f22, 7(%[src8]) \r\n" + "gsldrc1 $f22, 0(%[src8]) \r\n" + "gsldlc1 $f24, 7(%[src9]) \r\n" + "gsldrc1 $f24, 0(%[src9]) \r\n" + "gsldlc1 $f26, 7(%[src10]) \r\n" + "gsldrc1 $f26, 0(%[src10]) \r\n" + "punpcklbh $f1, $f2, $f0 \r\n" + "punpckhbh $f2, $f2, $f0 \r\n" + "punpcklbh $f3, $f4, $f0 \r\n" + "punpckhbh $f4, $f4, $f0 \r\n" + "punpcklbh $f5, $f6, $f0 \r\n" + "punpckhbh $f6, $f6, $f0 \r\n" + "punpcklbh $f7, $f8, $f0 \r\n" + "punpckhbh $f8, $f8, $f0 \r\n" + "punpcklbh $f9, $f10, $f0 \r\n" + "punpckhbh $f10, $f10, $f0 \r\n" + "punpcklbh $f11, $f12, $f0 \r\n" + "punpckhbh $f12, $f12, $f0 \r\n" + "punpcklbh $f13, $f14, $f0 \r\n" + "punpckhbh $f14, $f14, $f0 \r\n" + "punpcklbh $f15, $f16, $f0 \r\n" + "punpckhbh $f16, $f16, $f0 \r\n" + "punpcklbh $f17, $f18, $f0 \r\n" + "punpckhbh $f18, $f18, $f0 \r\n" + "punpcklbh $f19, $f20, $f0 \r\n" + "punpckhbh $f20, $f20, $f0 \r\n" + "punpcklbh $f21, $f22, $f0 \r\n" + "punpckhbh $f22, $f22, $f0 \r\n" + "punpcklbh $f23, $f24, $f0 \r\n" + "punpckhbh $f24, $f24, $f0 \r\n" + "punpcklbh $f25, $f26, $f0 \r\n" + "punpckhbh $f26, $f26, $f0 \r\n" + "paddsh $f27, $f5, $f7 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f6, $f8 \r\n"//src0+src1 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f3 \r\n" + "psubsh $f28, $f28, $f4 \r\n" + "psubsh $f27, $f27, $f9 \r\n" + "psubsh $f28, $f28, $f10 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f1 \r\n" + "paddsh $f28, $f28, $f2 \r\n" + "paddsh $f27, $f27, $f11 \r\n" + "paddsh $f28, $f28, $f12 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f2, $f27, $f28 \r\n" + "ldc1 $f28, 0(%[dst0]) \r\n" + "pavgb $f2, $f2, $f28 \r\n" + "sdc1 $f2, 0(%[dst0]) \r\n" + "paddsh $f27, $f7, $f9 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f8, $f10 \r\n"//src1+src2 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f5 \r\n" + "psubsh $f28, $f28, $f6 \r\n" + "psubsh $f27, $f27, $f11 \r\n" + "psubsh $f28, $f28, $f12 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f3 \r\n" + "paddsh $f28, $f28, $f4 \r\n" + "paddsh $f27, $f27, $f13 \r\n" + "paddsh $f28, $f28, $f14 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f4, $f27, $f28 \r\n" + "ldc1 $f28, 0(%[dst1]) \r\n" + "pavgb $f4, $f4, $f28 \r\n" + "sdc1 $f4, 0(%[dst1]) \r\n" + "paddsh $f27, $f9, $f11 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f10, $f12 \r\n"//src2+src3 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f7 \r\n" + "psubsh $f28, $f28, $f8 \r\n" + "psubsh $f27, $f27, $f13 \r\n" + "psubsh $f28, $f28, $f14 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f5 \r\n" + "paddsh $f28, $f28, $f6 \r\n" + "paddsh $f27, $f27, $f15 \r\n" + "paddsh $f28, $f28, $f16 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f6, $f27, $f28 \r\n" + "ldc1 
$f28, 0(%[dst2]) \r\n" + "pavgb $f6, $f6, $f28 \r\n" + "sdc1 $f6, 0(%[dst2]) \r\n" + "paddsh $f27, $f11, $f13 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f12, $f14 \r\n"//src3+src4 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f9 \r\n" + "psubsh $f28, $f28, $f10 \r\n" + "psubsh $f27, $f27, $f15 \r\n" + "psubsh $f28, $f28, $f16 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f7 \r\n" + "paddsh $f28, $f28, $f8 \r\n" + "paddsh $f27, $f27, $f17 \r\n" + "paddsh $f28, $f28, $f18 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f8, $f27, $f28 \r\n" + "ldc1 $f28, 0(%[dst3]) \r\n" + "pavgb $f8, $f8, $f28 \r\n" + "sdc1 $f8, 0(%[dst3]) \r\n" + "paddsh $f27, $f13, $f15 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f14, $f16 \r\n"//src4+src5 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f11 \r\n" + "psubsh $f28, $f28, $f12 \r\n" + "psubsh $f27, $f27, $f17 \r\n" + "psubsh $f28, $f28, $f18 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f9 \r\n" + "paddsh $f28, $f28, $f10 \r\n" + "paddsh $f27, $f27, $f19 \r\n" + "paddsh $f28, $f28, $f20 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f10, $f27, $f28 \r\n" + "ldc1 $f28, 0(%[dst4]) \r\n" + "pavgb $f10, $f10, $f28 \r\n" + "sdc1 $f10, 0(%[dst4]) \r\n" + "paddsh $f27, $f15, $f17 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f16, $f18 \r\n"//src5+src6 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f13 \r\n" + "psubsh $f28, $f28, $f14 \r\n" + "psubsh $f27, $f27, $f19 \r\n" + "psubsh $f28, $f28, $f20 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f11 \r\n" + "paddsh $f28, $f28, $f12 \r\n" + "paddsh $f27, $f27, $f21 \r\n" + "paddsh $f28, $f28, $f22 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f12, $f27, $f28 \r\n" + "ldc1 $f28, 0(%[dst5]) \r\n" + "pavgb $f12, $f12, $f28 \r\n" + "sdc1 $f12, 0(%[dst5]) \r\n" + "paddsh $f27, $f17, $f19 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + "paddsh $f28, $f18, $f20 \r\n"//src6+src7 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f15 \r\n" + "psubsh $f28, $f28, $f16 \r\n" + "psubsh $f27, $f27, $f21 \r\n" + "psubsh $f28, $f28, $f22 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f13 \r\n" + "paddsh $f28, $f28, $f14 \r\n" + "paddsh $f27, $f27, $f23 \r\n" + "paddsh $f28, $f28, $f24 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f14, $f27, $f28 \r\n" + "ldc1 $f28, 0(%[dst6]) \r\n" + "pavgb $f14, $f14, $f28 \r\n" + "sdc1 $f14, 0(%[dst6]) \r\n" + "paddsh $f27, $f19, $f21 \r\n" + "pmullh $f27, $f27, %[ff_pw_4] \r\n" + 
"paddsh $f28, $f20, $f22 \r\n"//src7+src8 + "pmullh $f28, $f28, %[ff_pw_4] \r\n" + "psubsh $f27, $f27, $f17 \r\n" + "psubsh $f28, $f28, $f18 \r\n" + "psubsh $f27, $f27, $f23 \r\n" + "psubsh $f28, $f28, $f24 \r\n" + "pmullh $f27, $f27, %[ff_pw_5] \r\n" + "pmullh $f28, $f28, %[ff_pw_5] \r\n" + "paddsh $f27, $f27, $f15 \r\n" + "paddsh $f28, $f28, $f16 \r\n" + "paddsh $f27, $f27, $f25 \r\n" + "paddsh $f28, $f28, $f26 \r\n" + "paddsh $f27, $f27, %[ff_pw_16] \r\n" + "paddsh $f28, $f28, %[ff_pw_16] \r\n" + "psrah $f27, $f27, %[ff_pw_5] \r\n" + "psrah $f28, $f28, %[ff_pw_5] \r\n" + "packushb $f27, $f27, $f0 \r\n" + "packushb $f28, $f28, $f0 \r\n" + "punpcklwd $f16, $f27, $f28 \r\n" + "ldc1 $f28, 0(%[dst7]) \r\n" + "pavgb $f16, $f16, $f28 \r\n" + "sdc1 $f16, 0(%[dst7]) \r\n" + ::[dst0]"r"(dst), [dst1]"r"(dst+dstStride), + [dst2]"r"(dst+2*dstStride), [dst3]"r"(dst+3*dstStride), + [dst4]"r"(dst+4*dstStride), [dst5]"r"(dst+5*dstStride), + [dst6]"r"(dst+6*dstStride), [dst7]"r"(dst+7*dstStride), + [srcB]"r"(src-2*srcStride), [srcA]"r"(src-srcStride), + [src0]"r"(src), [src1]"r"(src+srcStride), + [src2]"r"(src+2*srcStride), [src3]"r"(src+3*srcStride), + [src4]"r"(src+4*srcStride), [src5]"r"(src+5*srcStride), + [src6]"r"(src+6*srcStride), [src7]"r"(src+7*srcStride), + [src8]"r"(src+8*srcStride), [src9]"r"(src+9*srcStride), + [src10]"r"(src+10*srcStride), [ff_pw_4]"f"(ff_pw_4), + [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16) + : "$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9","$f10", + "$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18","$f19", + "$f20","$f21","$f22","$f23","$f24","$f25","$f26","$f27","$f28" + ); +} + +static void avg_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + int i; + int16_t _tmp[36]; + int16_t *tmp = _tmp; + src -= 2*srcStride; + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "dli $8, 9 \r\n" + "1: \r\n" + "gslwlc1 $f2, 1(%[src]) \r\n" + "gslwrc1 $f2, -2(%[src]) \r\n" + "gslwlc1 $f4, 2(%[src]) \r\n" + "gslwrc1 $f4, -1(%[src]) \r\n" + "gslwlc1 $f6, 3(%[src]) \r\n" + "gslwrc1 $f6, 0(%[src]) \r\n" + "gslwlc1 $f8, 4(%[src]) \r\n" + "gslwrc1 $f8, 1(%[src]) \r\n" + "gslwlc1 $f10, 5(%[src]) \r\n" + "gslwrc1 $f10, 2(%[src]) \r\n" + "gslwlc1 $f12, 6(%[src]) \r\n" + "gslwrc1 $f12, 3(%[src]) \r\n" + "punpcklbh $f2, $f2, $f0 \r\n" + "punpcklbh $f4, $f4, $f0 \r\n" + "punpcklbh $f6, $f6, $f0 \r\n" + "punpcklbh $f8, $f8, $f0 \r\n" + "punpcklbh $f10, $f10, $f0 \r\n" + "punpcklbh $f12, $f12, $f0 \r\n" + "paddsh $f14, $f6, $f8 \r\n" + "paddsh $f16, $f4, $f10 \r\n" + "paddsh $f18, $f2, $f12 \r\n" + "pmullh $f14, $f14, %[ff_pw_20] \r\n" + "pmullh $f16, $f16, %[ff_pw_5] \r\n" + "psubsh $f14, $f14, $f16 \r\n" + "paddsh $f18, $f14, $f18 \r\n" + "sdc1 $f18, 0(%[tmp]) \r\n" + "dadd %[tmp], %[tmp], %[tmpStride] \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "daddi $8, $8, -1 \r\n" + "bnez $8, 1b \r\n" + : [tmp]"+&r"(tmp),[src]"+&r"(src) + : [tmpStride]"r"(8),[srcStride]"r"(srcStride), + [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5) + : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18" + ); + + tmp -= 
28; + + for(i=0; i<4; i++) { + const int16_t tmpB= tmp[-8]; + const int16_t tmpA= tmp[-4]; + const int16_t tmp0= tmp[ 0]; + const int16_t tmp1= tmp[ 4]; + const int16_t tmp2= tmp[ 8]; + const int16_t tmp3= tmp[12]; + const int16_t tmp4= tmp[16]; + const int16_t tmp5= tmp[20]; + const int16_t tmp6= tmp[24]; + op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); + op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4)); + op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5)); + op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6)); + dst++; + tmp++; + } +} + +static void put_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + int16_t _tmp[104]; + int16_t *tmp = _tmp; + int i; + src -= 2*srcStride; + + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "dli $8, 13 \r\n" + "1: \r\n" + "gsldlc1 $f2, 5(%[src]) \r\n" + "gsldrc1 $f2, -2(%[src]) \r\n" + "gsldlc1 $f4, 6(%[src]) \r\n" + "gsldrc1 $f4, -1(%[src]) \r\n" + "gsldlc1 $f6, 7(%[src]) \r\n" + "gsldrc1 $f6, 0(%[src]) \r\n" + "gsldlc1 $f8, 8(%[src]) \r\n" + "gsldrc1 $f8, 1(%[src]) \r\n" + "gsldlc1 $f10, 9(%[src]) \r\n" + "gsldrc1 $f10, 2(%[src]) \r\n" + "gsldlc1 $f12, 10(%[src]) \r\n" + "gsldrc1 $f12, 3(%[src]) \r\n" + "punpcklbh $f1, $f2, $f0 \r\n" + "punpcklbh $f3, $f4, $f0 \r\n" + "punpcklbh $f5, $f6, $f0 \r\n" + "punpcklbh $f7, $f8, $f0 \r\n" + "punpcklbh $f9, $f10, $f0 \r\n" + "punpcklbh $f11, $f12, $f0 \r\n" + "punpckhbh $f2, $f2, $f0 \r\n" + "punpckhbh $f4, $f4, $f0 \r\n" + "punpckhbh $f6, $f6, $f0 \r\n" + "punpckhbh $f8, $f8, $f0 \r\n" + "punpckhbh $f10, $f10, $f0 \r\n" + "punpckhbh $f12, $f12, $f0 \r\n" + "paddsh $f13, $f5, $f7 \r\n" + "paddsh $f15, $f3, $f9 \r\n" + "paddsh $f17, $f1, $f11 \r\n" + "pmullh $f13, $f13, %[ff_pw_20] \r\n" + "pmullh $f15, $f15, %[ff_pw_5] \r\n" + "psubsh $f13, $f13, $f15 \r\n" + "paddsh $f17, $f13, $f17 \r\n" + "paddsh $f14, $f6, $f8 \r\n" + "paddsh $f16, $f4, $f10 \r\n" + "paddsh $f18, $f2, $f12 \r\n" + "pmullh $f14, $f14, %[ff_pw_20] \r\n" + "pmullh $f16, $f16, %[ff_pw_5] \r\n" + "psubsh $f14, $f14, $f16 \r\n" + "paddsh $f18, $f14, $f18 \r\n" + "sdc1 $f17, 0(%[tmp]) \r\n" + "sdc1 $f18, 8(%[tmp]) \r\n" + "dadd %[tmp], %[tmp], %[tmpStride] \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "daddi $8, $8, -1 \r\n" + "bnez $8, 1b \r\n" + : [tmp]"+&r"(tmp),[src]"+&r"(src) + : [tmpStride]"r"(16),[srcStride]"r"(srcStride), + [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5) + : "$8","$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9", + "$f10","$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18" + ); + + tmp -= 88; + + for(i=0; i<8; i++) { + const int tmpB= tmp[-16]; + const int tmpA= tmp[ -8]; + const int tmp0= tmp[ 0]; + const int tmp1= tmp[ 8]; + const int tmp2= tmp[ 16]; + const int tmp3= tmp[ 24]; + const int tmp4= tmp[ 32]; + const int tmp5= tmp[ 40]; + const int tmp6= tmp[ 48]; + const int tmp7= tmp[ 56]; + const int tmp8= tmp[ 64]; + const int tmp9= tmp[ 72]; + const int tmp10=tmp[ 80]; + op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); + op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4)); + op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5)); + op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6)); + op2_put(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7)); + op2_put(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8)); + op2_put(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 
+ (tmp4+tmp9)); + op2_put(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10)); + dst++; + tmp++; + } +} + +static void put_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + put_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride); + put_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + put_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride); + put_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + int i; + int16_t _tmp[36]; + int16_t *tmp = _tmp; + src -= 2*srcStride; + + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "dli $8, 9 \r\n" + "1: \r\n" + "gslwlc1 $f2, 1(%[src]) \r\n" + "gslwrc1 $f2, -2(%[src]) \r\n" + "gslwlc1 $f4, 2(%[src]) \r\n" + "gslwrc1 $f4, -1(%[src]) \r\n" + "gslwlc1 $f6, 3(%[src]) \r\n" + "gslwrc1 $f6, 0(%[src]) \r\n" + "gslwlc1 $f8, 4(%[src]) \r\n" + "gslwrc1 $f8, 1(%[src]) \r\n" + "gslwlc1 $f10, 5(%[src]) \r\n" + "gslwrc1 $f10, 2(%[src]) \r\n" + "gslwlc1 $f12, 6(%[src]) \r\n" + "gslwrc1 $f12, 3(%[src]) \r\n" + "punpcklbh $f2, $f2, $f0 \r\n" + "punpcklbh $f4, $f4, $f0 \r\n" + "punpcklbh $f6, $f6, $f0 \r\n" + "punpcklbh $f8, $f8, $f0 \r\n" + "punpcklbh $f10, $f10, $f0 \r\n" + "punpcklbh $f12, $f12, $f0 \r\n" + "paddsh $f14, $f6, $f8 \r\n" + "paddsh $f16, $f4, $f10 \r\n" + "paddsh $f18, $f2, $f12 \r\n" + "pmullh $f14, $f14, %[ff_pw_20] \r\n" + "pmullh $f16, $f16, %[ff_pw_5] \r\n" + "psubsh $f14, $f14, $f16 \r\n" + "paddsh $f18, $f14, $f18 \r\n" + "sdc1 $f18, 0(%[tmp]) \r\n" + "dadd %[tmp], %[tmp], %[tmpStride] \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "daddi $8, $8, -1 \r\n" + "bnez $8, 1b \r\n" + : [tmp]"+&r"(tmp),[src]"+&r"(src) + : [tmpStride]"r"(8),[srcStride]"r"(srcStride), + [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5) + : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18" + ); + + tmp -= 28; + + for(i=0; i<4; i++) + { + const int16_t tmpB= tmp[-8]; + const int16_t tmpA= tmp[-4]; + const int16_t tmp0= tmp[ 0]; + const int16_t tmp1= tmp[ 4]; + const int16_t tmp2= tmp[ 8]; + const int16_t tmp3= tmp[12]; + const int16_t tmp4= tmp[16]; + const int16_t tmp5= tmp[20]; + const int16_t tmp6= tmp[24]; + op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); + op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4)); + op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5)); + op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6)); + dst++; + tmp++; + } +} + +static void avg_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + int16_t _tmp[104]; + int16_t *tmp = _tmp; + int i; + src -= 2*srcStride; + + __asm__ volatile ( + "xor $f0, $f0, $f0 \r\n" + "dli $8, 13 \r\n" + "1: \r\n" + "gsldlc1 $f2, 5(%[src]) \r\n" + "gsldrc1 $f2, -2(%[src]) \r\n" + "gsldlc1 $f4, 6(%[src]) \r\n" + "gsldrc1 $f4, -1(%[src]) \r\n" + "gsldlc1 $f6, 7(%[src]) \r\n" + "gsldrc1 $f6, 0(%[src]) \r\n" + "gsldlc1 $f8, 8(%[src]) \r\n" + "gsldrc1 $f8, 1(%[src]) \r\n" + "gsldlc1 $f10, 9(%[src]) \r\n" + "gsldrc1 $f10, 2(%[src]) \r\n" + "gsldlc1 $f12, 10(%[src]) \r\n" + "gsldrc1 $f12, 3(%[src]) \r\n" + "punpcklbh $f1, $f2, $f0 \r\n" + "punpcklbh $f3, $f4, $f0 \r\n" + "punpcklbh $f5, $f6, $f0 \r\n" + "punpcklbh $f7, $f8, $f0 \r\n" + "punpcklbh $f9, $f10, $f0 \r\n" + "punpcklbh $f11, $f12, $f0 \r\n" + "punpckhbh $f2, $f2, 
$f0 \r\n" + "punpckhbh $f4, $f4, $f0 \r\n" + "punpckhbh $f6, $f6, $f0 \r\n" + "punpckhbh $f8, $f8, $f0 \r\n" + "punpckhbh $f10, $f10, $f0 \r\n" + "punpckhbh $f12, $f12, $f0 \r\n" + "paddsh $f13, $f5, $f7 \r\n" + "paddsh $f15, $f3, $f9 \r\n" + "paddsh $f17, $f1, $f11 \r\n" + "pmullh $f13, $f13, %[ff_pw_20] \r\n" + "pmullh $f15, $f15, %[ff_pw_5] \r\n" + "psubsh $f13, $f13, $f15 \r\n" + "paddsh $f17, $f13, $f17 \r\n" + "paddsh $f14, $f6, $f8 \r\n" + "paddsh $f16, $f4, $f10 \r\n" + "paddsh $f18, $f2, $f12 \r\n" + "pmullh $f14, $f14, %[ff_pw_20] \r\n" + "pmullh $f16, $f16, %[ff_pw_5] \r\n" + "psubsh $f14, $f14, $f16 \r\n" + "paddsh $f18, $f14, $f18 \r\n" + + "sdc1 $f17, 0(%[tmp]) \r\n" + "sdc1 $f18, 8(%[tmp]) \r\n" + "dadd %[tmp], %[tmp], %[tmpStride] \r\n" + "dadd %[src], %[src], %[srcStride] \r\n" + "daddi $8, $8, -1 \r\n" + "bnez $8, 1b \r\n" + : [tmp]"+&r"(tmp),[src]"+&r"(src) + : [tmpStride]"r"(16),[srcStride]"r"(srcStride), + [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5) + : "$8","$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9", + "$f10","$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18" + ); + + tmp -= 88; + + for(i=0; i<8; i++) { + const int tmpB= tmp[-16]; + const int tmpA= tmp[ -8]; + const int tmp0= tmp[ 0]; + const int tmp1= tmp[ 8]; + const int tmp2= tmp[ 16]; + const int tmp3= tmp[ 24]; + const int tmp4= tmp[ 32]; + const int tmp5= tmp[ 40]; + const int tmp6= tmp[ 48]; + const int tmp7= tmp[ 56]; + const int tmp8= tmp[ 64]; + const int tmp9= tmp[ 72]; + const int tmp10=tmp[ 80]; + op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); + op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4)); + op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5)); + op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6)); + op2_avg(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7)); + op2_avg(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8)); + op2_avg(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9)); + op2_avg(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10)); + dst++; + tmp++; + } +} + +static void avg_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride){ + avg_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + avg_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride); + avg_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride); +} + +//DEF_H264_MC_MMI(put_, 4) +void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_pixels4_mmi(dst, src, stride, 4); +} + +void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[16]; + put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride); + put_pixels4_l2_mmi(dst, src, half, stride, stride, 4, 4); +} + +void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[16]; + put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride); + put_pixels4_l2_mmi(dst, src+1, half, stride, stride, 4, 4); +} + +void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t half[16]; + copy_block4_mmi(full, 
src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4); + put_pixels4_l2_mmi(dst, full_mid, half, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4); +} + +void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t half[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4); + put_pixels4_l2_mmi(dst, full_mid+4, half, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[16]; + uint8_t halfHV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + put_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[16]; + uint8_t halfHV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + put_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfV[16]; + uint8_t halfHV[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + 
put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + put_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4); +} + +void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfV[16]; + uint8_t halfHV[16]; + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + put_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4); +} + +//DEF_H264_MC_MMI(avg_, 4) +void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_pixels4_mmi(dst, src, stride, 4); +} + +void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[16]; + put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride); + avg_pixels4_l2_mmi(dst, src, half, stride, stride, 4, 4); +} + +void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[16]; + put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride); + avg_pixels4_l2_mmi(dst, src+1, half, stride, stride, 4, 4); +} + +void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t half[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4); + avg_pixels4_l2_mmi(dst, full_mid, half, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + avg_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4); +} + +void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t half[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4); + avg_pixels4_l2_mmi(dst, full_mid+4, half, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void 
ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfH[16]; + uint8_t halfV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride); +} + +void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[16]; + uint8_t halfHV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + avg_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[16]; + uint8_t halfHV[16]; + put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + avg_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfV[16]; + uint8_t halfHV[16]; + copy_block4_mmi(full, src - stride*2, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + avg_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4); +} + +void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[36]; + uint8_t * const full_mid= full + 8; + uint8_t halfV[16]; + uint8_t halfHV[16]; + copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9); + put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4); + put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride); + avg_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4); +} + +//DEF_H264_MC_MMI(put_, 8) +void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_pixels8_mmi(dst, src, stride, 8); +} + +void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride); + put_pixels8_l2_mmi(dst, src, half, stride, stride, 8, 8); +} + +void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride); + put_pixels8_l2_mmi(dst, src+1, half, stride, stride, 8, 8); +} + +void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t half[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8); + put_pixels8_l2_mmi(dst, full_mid, half, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8); +} + +void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; 
+ uint8_t * const full_mid= full + 16; + uint8_t half[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8); + put_pixels8_l2_mmi(dst, full_mid+8, half, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel8_hv_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfHV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride); + put_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfHV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride); + put_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfV[64]; + uint8_t halfHV[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride); + put_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8); +} + +void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfV[64]; + uint8_t halfHV[64]; + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride); + put_pixels8_l2_mmi(dst, halfV, 
halfHV, stride, 8, 8, 8); +} + +//DEF_H264_MC_MMI(avg_, 8) +void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_pixels8_mmi(dst, src, stride, 8); +} + +void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride); + avg_pixels8_l2_mmi(dst, src, half, stride, stride, 8, 8); +} + +void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride); + avg_pixels8_l2_mmi(dst, src+1, half, stride, stride, 8, 8); +} + +void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t half[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8); + avg_pixels8_l2_mmi(dst, full_mid, half, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + avg_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8); +} + +void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t half[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8); + avg_pixels8_l2_mmi(dst, full_mid+8, half, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfH[64]; + uint8_t halfV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src, + 
ptrdiff_t stride) +{ + avg_h264_qpel8_hv_lowpass_mmi(dst, src, stride, stride); +} + +void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfHV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride); + put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride); + avg_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfHV[64]; + put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride); + put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride); + avg_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfV[64]; + uint8_t halfHV[64]; + copy_block8_mmi(full, src - stride*2, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride); + avg_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8); +} + +void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[104]; + uint8_t * const full_mid= full + 16; + uint8_t halfV[64]; + uint8_t halfHV[64]; + copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13); + put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8); + put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride); + avg_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8); +} + +//DEF_H264_MC_MMI(put_, 16) +void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_pixels16_mmi(dst, src, stride, 16); +} + +void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride); + put_pixels16_l2_mmi(dst, src, half, stride, stride, 16, 16); +} + +void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride); + put_pixels16_l2_mmi(dst, src+1, half, stride, stride, 16, 16); +} + +void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t half[256]; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16); + put_pixels16_l2_mmi(dst, full_mid, half, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16); +} + +void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t half[256]; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16); + put_pixels16_l2_mmi(dst, full_mid+16, half, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + 
put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride); + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride); + copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride); + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride); + copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel16_hv_lowpass_mmi(dst, src, stride, stride); +} + +void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[256]; + uint8_t halfHV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride); + put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride); + put_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[256]; + uint8_t halfHV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride); + put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride); + put_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfV[256]; + uint8_t halfHV[256]; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride); + put_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16); +} + +void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfV[256]; + uint8_t halfHV[256]; + copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride); + put_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16); +} + +//DEF_H264_MC_MMI(avg_, 16) +void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_pixels16_mmi(dst, src, stride, 16); +} + +void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + 
put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride); + avg_pixels16_l2_mmi(dst, src, half, stride, stride, 16, 16); +} + +void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride); +} + +void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride); + avg_pixels16_l2_mmi(dst, src+1, half, stride, stride, 16, 16); +} + +void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t half[256]; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16); + avg_pixels16_l2_mmi(dst, full_mid, half, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + avg_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16); +} + +void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t half[256]; + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16); + avg_pixels16_l2_mmi(dst, full_mid+16, half, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride); + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride); + copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride); + copy_block16_mmi(full, src - stride*2, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t full[336]; + uint8_t * const full_mid= full + 32; + uint8_t halfH[256]; + uint8_t halfV[256]; + put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride); + copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21); + put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16); + avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16); +} + +void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel16_hv_lowpass_mmi(dst, src, stride, stride); +} + +void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[256]; + uint8_t 
halfHV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t halfH[256];
+    uint8_t halfHV[256];
+    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfV[256];
+    uint8_t halfHV[256];
+    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
+}
+
+void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
+        ptrdiff_t stride)
+{
+    uint8_t full[336];
+    uint8_t * const full_mid= full + 32;
+    uint8_t halfV[256];
+    uint8_t halfHV[256];
+    copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21);
+    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
+    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
+    avg_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
+}
+
+#undef op2_avg
+#undef op2_put
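Note (not part of the patch): every lowpass kernel added above evaluates the H.264 half-sample 6-tap filter (1, -5, 20, 20, -5, 1) with rounding and a shift by 5, and the ff_{put,avg}_h264_qpelN_mcXY entry points combine the horizontal, vertical and hv kernels according to the quarter-pel position XY. Below is a minimal plain-C sketch of the vertical case; the names clip_uint8_ref and put_qpel4_v_lowpass_ref are illustrative and do not exist in the patch, they only show what the MMI code computes per output pixel.

/*
 * Plain-C sketch (illustrative, not part of the patch) of the 6-tap filter
 * implemented by the MMI vertical lowpass kernels:
 *     out = clip(((p0 + p1)*20 - (pm1 + p2)*5 + (pm2 + p3) + 16) >> 5)
 * where pm2..p3 are six vertically adjacent source samples.
 */
#include <stdint.h>

static inline uint8_t clip_uint8_ref(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Scalar reference for one 4-pixel-wide column, h rows. */
static void put_qpel4_v_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                    int dstStride, int srcStride, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 4; x++) {
            const uint8_t *s = src + x;
            int sum = (s[0] + s[srcStride]) * 20
                    - (s[-srcStride] + s[2 * srcStride]) * 5
                    + (s[-2 * srcStride] + s[3 * srcStride]);
            dst[x] = clip_uint8_ref((sum + 16) >> 5);
        }
        dst += dstStride;
        src += srcStride;
    }
}

The hv kernels instead keep the un-rounded filter outputs in the int16_t tmp buffer, filter them a second time in the other direction, and round only once at the end; op2_put and op2_avg, which are defined earlier in the file and #undef'd above, presumably perform that final ((x + 512) >> 10) rounding with clipping, with op2_avg additionally averaging the result against the existing destination pixel.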