diff options
author | Shiyou Yin <yinshiyou-hf@loongson.cn> | 2019-07-17 17:35:00 +0800 |
---|---|---|
committer | Michael Niedermayer <michael@niedermayer.cc> | 2019-07-19 01:23:23 +0200 |
commit | 153c60752558369b98dce0b7a0ca7acc687fa630 (patch) | |
tree | b1dc51fc656ac92b1ac3f377ee75c6808bec0369 /libavcodec/mips/hevc_mc_biw_msa.c | |
parent | 00ed04d6149691a9abf486b2f88172fd6341d801 (diff) | |
download | ffmpeg-153c60752558369b98dce0b7a0ca7acc687fa630.tar.gz |
avutil/mips: refactor msa load and store macros.
Replace STnxm_UB and LDnxm_SH with new macros ST_{H/W/D}{1/2/4/8}.
The old macros are difficult to use because they don't follow the same parameter passing rules.
Details of the changes are as follows:
1. remove LD4x4_SH.
2. replace ST2x4_UB with ST_H4.
3. replace ST4x2_UB with ST_W2.
4. replace ST4x4_UB with ST_W4.
5. replace ST4x8_UB with ST_W8.
6. replace ST6x4_UB with ST_W2 and ST_H2.
7. replace ST8x1_UB with ST_D1.
8. replace ST8x2_UB with ST_D2.
9. replace ST8x4_UB with ST_D4.
10. replace ST8x8_UB with ST_D8.
11. replace ST12x4_UB with ST_D4 and ST_W4.
Example of a new macro: ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
ST_H4 stores four half-word elements from vector 'in' to pdst with stride.
About the macro name:
1) 'ST' means store operation.
2) 'H/W/D' means type of vector element is 'half-word/word/double-word'.
3) Number '1/2/4/8' means how many elements will be stored.
About the macro parameters:
1) 'in0, in1...' are 128-bit vectors.
2) 'idx0, idx1...' are element indices.
3) 'pdst' is the destination pointer to store to.
4) 'stride' is the stride of each store operation.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libavcodec/mips/hevc_mc_biw_msa.c')
-rw-r--r-- | libavcodec/mips/hevc_mc_biw_msa.c | 128 |
1 files changed, 67 insertions, 61 deletions
diff --git a/libavcodec/mips/hevc_mc_biw_msa.c b/libavcodec/mips/hevc_mc_biw_msa.c index ea65f00497..68f122ea48 100644 --- a/libavcodec/mips/hevc_mc_biw_msa.c +++ b/libavcodec/mips/hevc_mc_biw_msa.c @@ -126,7 +126,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); dst0 = CLIP_SH_0_255_MAX_SATU(dst0); out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0); - ST4x2_UB(out0, dst, dst_stride); + ST_W2(out0, 0, 1, dst, dst_stride); } else if (4 == height) { LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); INSERT_W4_SB(tp0, tp1, tp2, tp3, src0); @@ -138,7 +138,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec, dst0, dst1); out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); - ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out0, 0, 1, 2, 3, dst, dst_stride); } else if (0 == height % 8) { for (loop_cnt = (height >> 3); loop_cnt--;) { LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); @@ -162,7 +162,7 @@ static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, in3, weight_vec, rnd_vec, offset_vec, dst0, dst1, dst2, dst3); PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); - ST4x8_UB(out0, out1, dst, dst_stride); + ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); dst += (8 * dst_stride); } } @@ -214,7 +214,10 @@ static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr, weight_vec, rnd_vec, offset_vec, dst0, dst1, dst2, dst3); PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); - ST6x4_UB(out0, out1, dst, dst_stride); + ST_W2(out0, 0, 2, dst, dst_stride); + ST_H2(out0, 2, 6, dst + 4, dst_stride); + ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride); + ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); dst += (4 * dst_stride); } } @@ -261,7 +264,7 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, dst0, dst1); out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); - ST8x2_UB(out0, dst, 
dst_stride); + ST_D2(out0, 0, 1, dst, dst_stride); } else if (6 == height) { LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3); src0_ptr += 4 * src_stride; @@ -281,9 +284,8 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec, offset_vec, dst4, dst5); PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - ST8x2_UB(out2, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); + ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); } else if (0 == height % 4) { uint32_t loop_cnt; @@ -302,7 +304,7 @@ static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, in3, weight_vec, rnd_vec, offset_vec, dst0, dst1, dst2, dst3); PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -361,7 +363,8 @@ static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr, HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec, offset_vec, dst4, dst5); PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); - ST12x4_UB(out0, out1, out2, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); + ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride); dst += (4 * dst_stride); } } @@ -480,7 +483,7 @@ static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr, PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2); PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5); ST_UB4(out0, out1, out3, out4, dst, dst_stride); - ST8x4_UB(out2, out5, dst + 16, dst_stride); + ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride); dst += (4 * dst_stride); } } @@ -720,7 +723,7 @@ static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, out0, out1); out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0); - ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out0, 0, 1, 2, 3, dst, dst_stride); 
dst += (4 * dst_stride); } } @@ -800,7 +803,7 @@ static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr, out0, out1, out2, out3); PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -876,7 +879,7 @@ static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr, weight_vec, rnd_vec, offset_vec, out0, out1, out2, out3); PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3); src0_ptr += (4 * src_stride); @@ -895,7 +898,7 @@ static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr, HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec, out0, out1); out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0); - ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride); + ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride); dst += (4 * dst_stride); } } @@ -1483,7 +1486,7 @@ static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr, out0, out1, out2, out3); PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); - ST4x8_UB(out0, out1, dst, dst_stride); + ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); dst += (8 * dst_stride); src2110 = src10998; @@ -1568,7 +1571,7 @@ static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr, out0, out1, out2, out3); PCKEV_B2_SH(out1, out0, out3, out2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); src10_r = src54_r; @@ -1674,8 +1677,8 @@ static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr, dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r); out2 = CLIP_SH_0_255(dst2_r); PCKEV_B2_SH(out1, out0, out2, out2, out0, out2); - ST8x2_UB(out0, dst, dst_stride); - ST4x2_UB(out2, dst + 8, dst_stride); + ST_D2(out0, 0, 1, dst, dst_stride); + ST_W2(out2, 0, 1, dst + 8, dst_stride); dst += (2 * dst_stride); 
src10_r = src32_r; @@ -2048,7 +2051,7 @@ static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr, CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3); PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); dst10 = dst54; @@ -2226,7 +2229,7 @@ static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr, CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r); PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); - ST8x2_UB(out, dst_tmp, dst_stride); + ST_D2(out, 0, 1, dst_tmp, dst_stride); dst_tmp += (2 * dst_stride); dst0 = dst2; @@ -2412,7 +2415,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2); PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); - ST8x2_UB(out, dst_tmp, dst_stride); + ST_D2(out, 0, 1, dst_tmp, dst_stride); dst_tmp += (2 * dst_stride); dsth0 = dsth2; @@ -2503,7 +2506,7 @@ static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3); PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); dst10 = dst54; @@ -2683,7 +2686,7 @@ static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r); out0 = CLIP_SH_0_255(dst0_r); out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0); - ST4x2_UB(out0, dst, dst_stride); + ST_W2(out0, 0, 1, dst, dst_stride); } static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, @@ -2743,7 +2746,7 @@ static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, dst0, dst1); dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); - 
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); + ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride); } static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, @@ -2816,7 +2819,7 @@ static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, dst0, dst1, dst2, dst3); PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); - ST4x8_UB(dst0, dst1, dst, dst_stride); + ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); dst += (8 * dst_stride); } } @@ -2918,7 +2921,10 @@ static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr, dst0, dst1, dst2, dst3); PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); - ST6x4_UB(dst0, dst1, dst, dst_stride); + ST_W2(dst0, 0, 2, dst, dst_stride); + ST_H2(dst0, 2, 6, dst + 4, dst_stride); + ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride); + ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); dst += (4 * dst_stride); } } @@ -2976,7 +2982,7 @@ static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr, dst0, dst1); dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); - ST8x2_UB(dst0, dst, dst_stride); + ST_D2(dst0, 0, 1, dst, dst_stride); } static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr, @@ -3049,9 +3055,8 @@ static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr, PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); - ST8x4_UB(dst0, dst1, dst, dst_stride); - dst += (4 * dst_stride); - ST8x2_UB(dst3, dst, dst_stride); + ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); + ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride); } static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, @@ -3119,7 +3124,7 @@ static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, dst0, dst1, dst2, dst3); PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1); - ST8x4_UB(dst0, dst1, dst, dst_stride); + ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -3235,7 +3240,8 @@ static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr, PCKEV_B2_SH(dst1, dst0, dst3, 
dst2, dst0, dst1); dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4); - ST12x4_UB(dst0, dst1, dst3, dst, dst_stride); + ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); + ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride); dst += (4 * dst_stride); } } @@ -3411,7 +3417,7 @@ static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr, dst0, dst1); dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0); - ST8x2_UB(dst0, (dst + 16), dst_stride); + ST_D2(dst0, 0, 1, (dst + 16), dst_stride); dst += (2 * dst_stride); } } @@ -3551,7 +3557,7 @@ static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr, dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r); out = CLIP_SH_0_255(dst10_r); out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out); - ST4x2_UB(out, dst, dst_stride); + ST_W2(out, 0, 1, dst, dst_stride); } static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr, @@ -3617,7 +3623,7 @@ static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr, dst10, dst32); dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10); - ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride); + ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); } @@ -3702,7 +3708,7 @@ static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, dst10, dst32, dst54, dst76); PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32); - ST4x8_UB(dst10, dst32, dst, dst_stride); + ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); dst += (8 * dst_stride); } } @@ -3807,7 +3813,10 @@ static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr, tmp0, tmp1, tmp2, tmp3); PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); - ST6x4_UB(tmp0, tmp1, dst, dst_stride); + ST_W2(tmp0, 0, 2, dst, dst_stride); + ST_H2(tmp0, 2, 6, dst + 4, dst_stride); + ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride); + ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride); dst += (4 * dst_stride); } } @@ -3866,7 +3875,7 @@ static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr, tmp0, tmp1); tmp0 = (v8i16) 
__msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); - ST8x2_UB(tmp0, dst, dst_stride); + ST_D2(tmp0, 0, 1, dst, dst_stride); } static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr, @@ -3936,9 +3945,8 @@ static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr, PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); - dst += (4 * dst_stride); - ST8x2_UB(tmp3, dst, dst_stride); + ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride); } static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, @@ -4010,7 +4018,7 @@ static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, tmp0, tmp1, tmp2, tmp3); PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); + ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -4132,7 +4140,8 @@ static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr, PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); - ST12x4_UB(tmp0, tmp1, tmp2, dst, dst_stride); + ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride); dst += (4 * dst_stride); } } @@ -4323,7 +4332,7 @@ static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr, /* 8width */ tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2); ST_SH2(tmp0, tmp1, dst, dst_stride); - ST8x2_UB(tmp2, dst + 16, dst_stride); + ST_D2(tmp2, 0, 1, dst + 16, dst_stride); dst += (2 * dst_stride); /* 16width */ @@ -4363,7 +4372,7 @@ static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr, /* 8width */ tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2); ST_SH2(tmp0, tmp1, dst, dst_stride); - ST8x2_UB(tmp2, dst + 16, dst_stride); + ST_D2(tmp2, 0, 1, dst + 16, dst_stride); dst += (2 * dst_stride); } } @@ -4568,7 +4577,7 @@ static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr, tmp = __msa_pckev_h((v8i16) dst1, (v8i16) 
dst0); tmp = CLIP_SH_0_255_MAX_SATU(tmp); out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp); - ST4x2_UB(out, dst, dst_stride); + ST_W2(out, 0, 1, dst, dst_stride); } static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, @@ -4665,7 +4674,7 @@ static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1); CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1); out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); } static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, @@ -4803,7 +4812,7 @@ static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, tmp2, tmp3); CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST4x8_UB(out0, out1, dst, dst_stride); + ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); dst += (8 * dst_stride); dst10_r = dst98_r; @@ -5001,7 +5010,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr, tmp2, tmp3); CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST4x8_UB(out0, out1, dst, dst_stride); + ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5); @@ -5023,9 +5032,7 @@ static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr, CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5); out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); - ST2x4_UB(out2, 0, dst + 4, dst_stride); - dst += 4 * dst_stride; - ST2x4_UB(out2, 4, dst + 4, dst_stride); + ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride); } static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr, @@ -5121,7 +5128,7 @@ static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr, PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1); CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1); out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); - ST8x2_UB(out, dst, dst_stride); + ST_D2(out, 0, 1, dst, dst_stride); } 
static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr, @@ -5243,7 +5250,7 @@ static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr, tmp0, tmp1, tmp2, tmp3); CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); dst += 8; } } @@ -5394,9 +5401,8 @@ static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr, PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5); CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5); out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - ST8x2_UB(out2, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); + ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride); } static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr, @@ -5533,7 +5539,7 @@ static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr, tmp0, tmp1, tmp2, tmp3); CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst_tmp, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); dst_tmp += (4 * dst_stride); dst10_r = dst54_r; @@ -5720,7 +5726,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr, tmp0, tmp1, tmp2, tmp3); CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst_tmp, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride); dst_tmp += (4 * dst_stride); dst10_r = dst54_r; @@ -5816,7 +5822,7 @@ static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr, tmp0, tmp1, tmp2, tmp3); CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST4x8_UB(out0, out1, dst, dst_stride); + ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); dst += (8 * dst_stride); dst10_r = dst98_r; |