diff options
author | Shivraj Patil <shivraj.patil@imgtec.com> | 2015-06-29 20:57:12 +0530 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2015-07-01 17:32:47 +0200 |
commit | 2eb28e889d9c16914e547cc128db521b5d6c5390 (patch) | |
tree | a291e53a91c163b8fb1b31f0358e79d8311e1c70 /libavcodec/mips/mpegvideo_msa.c | |
parent | 53fd70579bc2314f2f5d528bed96914179b9a209 (diff) | |
download | ffmpeg-2eb28e889d9c16914e547cc128db521b5d6c5390.tar.gz |
avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for mpegvideo functions
This patch adds MSA (MIPS-SIMD-Arch) optimizations for mpegvideo functions in new file mpegvideo_msa.c
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/mips/mpegvideo_msa.c')
-rw-r--r-- | libavcodec/mips/mpegvideo_msa.c | 250 |
1 files changed, 250 insertions, 0 deletions
diff --git a/libavcodec/mips/mpegvideo_msa.c b/libavcodec/mips/mpegvideo_msa.c new file mode 100644 index 0000000000..aa9ef770eb --- /dev/null +++ b/libavcodec/mips/mpegvideo_msa.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mips/generic_macros_msa.h" +#include "h263dsp_mips.h" + +static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul, + int16_t qadd, int8_t n_coeffs, + uint8_t loop_start) +{ + int16_t *block_dup = block; + int32_t level, cnt; + v8i16 block_vec, qmul_vec, qadd_vec, sub; + v8i16 add, mask, mul, zero_mask; + + qmul_vec = __msa_fill_h(qmul); + qadd_vec = __msa_fill_h(qadd); + for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) { + block_vec = LD_SH(block_dup + loop_start); + mask = __msa_clti_s_h(block_vec, 0); + zero_mask = __msa_ceqi_h(block_vec, 0); + mul = block_vec * qmul_vec; + sub = mul - qadd_vec; + add = mul + qadd_vec; + add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask); + block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec, + (v16u8) zero_mask); + ST_SH(block_vec, block_dup + loop_start); + block_dup += 8; + } + + cnt = ((n_coeffs >> 3) * 8) + loop_start; + + for (; cnt <= n_coeffs; cnt++) { + level = block[cnt]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[cnt] = level; + } + } +} + +static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block, + int32_t qscale, + const int16_t *quant_matrix) +{ + int32_t cnt, sum_res = -1; + v8i16 block_vec, block_neg, qscale_vec, mask; + v8i16 block_org0, block_org1, block_org2, block_org3; + v8i16 quant_m0, quant_m1, quant_m2, quant_m3; + v8i16 sum, mul, zero_mask; + v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l; + v4i32 block_l, block_r, sad; + + qscale_vec = __msa_fill_h(qscale); + for (cnt = 0; cnt < 2; cnt++) { + LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3); + LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3); + mask = __msa_clti_s_h(block_org0, 0); + zero_mask = __msa_ceqi_h(block_org0, 0); + block_neg = -block_org0; + block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg, + (v16u8) mask); + block_vec <<= 1; + block_vec += 1; + UNPCK_SH_SW(block_vec, block_r, block_l); + UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l); + UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l); + mul_vec = block_l * qscale_l; + mul_vec *= quant_m_l; + block_l = mul_vec >> 4; + mul_vec = block_r * qscale_r; + mul_vec *= quant_m_r; + block_r = mul_vec >> 4; + mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r); + block_neg = - mul; + sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg, + (v16u8) mask); + sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0, + (v16u8) zero_mask); + ST_SH(sum, block); + block += 8; + quant_matrix += 8; + sad = __msa_hadd_s_w(sum, sum); + sum_res += HADD_SW_S32(sad); + mask = __msa_clti_s_h(block_org1, 0); + zero_mask = __msa_ceqi_h(block_org1, 0); + block_neg = - block_org1; + block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg, + (v16u8) mask); + block_vec <<= 1; + block_vec += 1; + UNPCK_SH_SW(block_vec, block_r, block_l); + UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l); + UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l); + mul_vec = block_l * qscale_l; + mul_vec *= quant_m_l; + block_l = mul_vec >> 4; + mul_vec = block_r * qscale_r; + mul_vec *= quant_m_r; + block_r = mul_vec >> 4; + mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r); + block_neg = - mul; + sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg, + (v16u8) mask); + sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1, + (v16u8) zero_mask); + ST_SH(sum, block); + + block += 8; + quant_matrix += 8; + sad = __msa_hadd_s_w(sum, sum); + sum_res += HADD_SW_S32(sad); + mask = __msa_clti_s_h(block_org2, 0); + zero_mask = __msa_ceqi_h(block_org2, 0); + block_neg = - block_org2; + block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg, + (v16u8) mask); + block_vec <<= 1; + block_vec += 1; + UNPCK_SH_SW(block_vec, block_r, block_l); + UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l); + UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l); + mul_vec = block_l * qscale_l; + mul_vec *= quant_m_l; + block_l = mul_vec >> 4; + mul_vec = block_r * qscale_r; + mul_vec *= quant_m_r; + block_r = mul_vec >> 4; + mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r); + block_neg = - mul; + sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg, + (v16u8) mask); + sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2, + (v16u8) zero_mask); + ST_SH(sum, block); + + block += 8; + quant_matrix += 8; + sad = __msa_hadd_s_w(sum, sum); + sum_res += HADD_SW_S32(sad); + mask = __msa_clti_s_h(block_org3, 0); + zero_mask = __msa_ceqi_h(block_org3, 0); + block_neg = - block_org3; + block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg, + (v16u8) mask); + block_vec <<= 1; + block_vec += 1; + UNPCK_SH_SW(block_vec, block_r, block_l); + UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l); + UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l); + mul_vec = block_l * qscale_l; + mul_vec *= quant_m_l; + block_l = mul_vec >> 4; + mul_vec = block_r * qscale_r; + mul_vec *= quant_m_r; + block_r = mul_vec >> 4; + mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r); + block_neg = - mul; + sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg, + (v16u8) mask); + sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3, + (v16u8) zero_mask); + ST_SH(sum, block); + + block += 8; + quant_matrix += 8; + sad = __msa_hadd_s_w(sum, sum); + sum_res += HADD_SW_S32(sad); + } + + return sum_res; +} + +void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, + int16_t *block, int32_t index, + int32_t qscale) +{ + int32_t qmul, qadd; + int32_t nCoeffs; + + av_assert2(s->block_last_index[index] >= 0 || s->h263_aic); + + qmul = qscale << 1; + + if (!s->h263_aic) { + block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale; + qadd = (qscale - 1) | 1; + } else { + qadd = 0; + } + if (s->ac_pred) + nCoeffs = 63; + else + nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]]; + + h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1); +} + +void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, + int16_t *block, int32_t index, + int32_t qscale) +{ + int32_t qmul, qadd; + int32_t nCoeffs; + + av_assert2(s->block_last_index[index] >= 0); + + qadd = (qscale - 1) | 1; + qmul = qscale << 1; + + nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]]; + + h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0); +} + +void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, + int16_t *block, int32_t index, + int32_t qscale) +{ + const uint16_t *quant_matrix; + int32_t sum = -1; + + quant_matrix = s->inter_matrix; + + sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix); + + block[63] ^= sum & 1; +} |