author     Kaustubh Raste <kaustubh.raste@imgtec.com>       2017-09-12 16:45:12 +0530
committer  Michael Niedermayer <michael@niedermayer.cc>     2017-09-15 01:47:14 +0200
commit     f692e55aab79729ca6d6b00c2671cac116903958 (patch)
tree       9cab85a1da3cbb186bc6bee09c9d8b740749e58b
parent     197d298ab3b27d1ec2ee7bf568debca105881a54 (diff)
download   ffmpeg-f692e55aab79729ca6d6b00c2671cac116903958.tar.gz
avcodec/mips: Improve hevc lpf msa functions
Separate the filter processing into the all-strong, all-weak and strong + weak cases.

Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
Reviewed-by: Manojkumar Bhosale <Manojkumar.Bhosale@imgtec.com>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
-rw-r--r--    libavcodec/mips/hevc_lpf_sao_msa.c    750
1 file changed, 556 insertions(+), 194 deletions(-)
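
The commit message describes a three-way dispatch: instead of always computing both strong and weak filter results for an 8-column edge segment and blending at the end, the patch branches once per segment on the two half-segment strong flags (flag0, flag1 in the diff). As a rough scalar illustration only — the names half_is_strong and dispatch, and the toy inputs, are assumptions for this sketch, not the vectorized MSA code below:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* HEVC strong/weak decision for one 4-column half of an 8-column
 * horizontal luma edge. p3/p0 are rows above the edge, q0/q3 rows
 * below; c0 and c3 are the first and last column of this half, and
 * d0/d3 are the local activity sums (dp + dq) for those columns,
 * mirroring d00/d30 and d04/d34 in the diff. */
static int half_is_strong(const uint8_t *p3, const uint8_t *p0,
                          const uint8_t *q0, const uint8_t *q3,
                          int c0, int c3, int d0, int d3,
                          int beta, int tc)
{
    int beta20 = beta >> 2;
    int beta30 = beta >> 3;
    int tc25   = (tc * 5 + 1) >> 1;

    return abs(p3[c0] - p0[c0]) + abs(q3[c0] - q0[c0]) < beta30 &&
           abs(p0[c0] - q0[c0]) < tc25 &&
           abs(p3[c3] - p0[c3]) + abs(q3[c3] - q0[c3]) < beta30 &&
           abs(p0[c3] - q0[c3]) < tc25 &&
           (d0 << 1) < beta20 && (d3 << 1) < beta20;
}

/* The per-segment dispatch this patch introduces: one branch up front
 * instead of computing every result and blending per pixel. */
static const char *dispatch(int flag0, int flag1)
{
    if (flag0 && flag1)
        return "strong only";    /* run just the strong filter       */
    else if (flag0 == flag1)
        return "weak only";      /* both halves weak (0 == 0)        */
    else
        return "strong + weak";  /* mixed: compute both, select half */
}

int main(void)
{
    /* Toy rows: smooth step in the left half, sharp one on the right. */
    uint8_t p3[8] = {40, 40, 40, 40,  90,  90,  90,  90};
    uint8_t p0[8] = {42, 42, 42, 42,  91,  91,  91,  91};
    uint8_t q0[8] = {44, 44, 44, 44, 160, 160, 160, 160};
    uint8_t q3[8] = {46, 46, 46, 46, 200, 200, 200, 200};
    int beta = 64, tc = 18;

    int flag0 = half_is_strong(p3, p0, q0, q3, 0, 3, 4, 4, beta, tc);
    int flag1 = half_is_strong(p3, p0, q0, q3, 4, 7, 4, 4, beta, tc);
    printf("flag0=%d flag1=%d -> %s path\n",
           flag0, flag1, dispatch(flag0, flag1));
    return 0;
}

The point of the separation is that all-strong and all-weak segments, presumably the common cases, each skip one entire filter computation; only genuinely mixed segments still compute both results and select per half via the cmp2 mask, as in the diff below.
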
diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c
index da1db51ef5..79b156fe5c 100644
--- a/libavcodec/mips/hevc_lpf_sao_msa.c
+++ b/libavcodec/mips/hevc_lpf_sao_msa.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
*
* This file is part of FFmpeg.
*
@@ -35,12 +35,14 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
uint8_t *q3 = src + (stride << 1) + stride;
uint8_t flag0, flag1;
int32_t dp00, dq00, dp30, dq30, d00, d30;
+ int32_t d0030, d0434;
int32_t dp04, dq04, dp34, dq34, d04, d34;
int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
uint64_t dst_val0, dst_val1;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+ v2i64 cmp3;
v8u16 temp0, temp1;
v8i16 temp2;
v8i16 tc_pos, tc_neg;
@@ -54,62 +56,86 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
d00 = dp00 + dq00;
d30 = dp30 + dq30;
- p_is_pcm0 = p_is_pcm[0];
- q_is_pcm0 = q_is_pcm[0];
dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
d04 = dp04 + dq04;
d34 = dp34 + dq34;
+
+ p_is_pcm0 = p_is_pcm[0];
p_is_pcm4 = p_is_pcm[1];
+ q_is_pcm0 = q_is_pcm[0];
q_is_pcm4 = q_is_pcm[1];
- if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
- if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
- p3_src = LD_UH(p3);
- p2_src = LD_UH(p2);
- p1_src = LD_UH(p1);
- p0_src = LD_UH(p0);
- q0_src = LD_UH(q0);
- q1_src = LD_UH(q1);
- q2_src = LD_UH(q2);
- q3_src = LD_UH(q3);
-
- tc0 = tc[0];
- beta30 = beta >> 3;
- beta20 = beta >> 2;
- tc250 = ((tc0 * 5 + 1) >> 1);
- tc4 = tc[1];
- tc254 = ((tc4 * 5 + 1) >> 1);
-
- flag0 = (abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
- abs(p0[0] - q0[0]) < tc250 &&
- abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
- abs(p0[3] - q0[3]) < tc250 &&
- (d00 << 1) < beta20 && (d30 << 1) < beta20);
- cmp0 = __msa_fill_d(flag0);
-
- flag1 = (abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
- abs(p0[4] - q0[4]) < tc254 &&
- abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
- abs(p0[7] - q0[7]) < tc254 &&
- (d04 << 1) < beta20 && (d34 << 1) < beta20);
- cmp1 = __msa_fill_d(flag1);
- cmp2 = __msa_ilvev_d(cmp1, cmp0);
- cmp2 = __msa_ceqi_d(cmp2, 0);
-
- ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
- zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
- p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
- q3_src);
-
- cmp0 = (v2i64) __msa_fill_h(tc0);
- cmp1 = (v2i64) __msa_fill_h(tc4);
- tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+ cmp0 = __msa_fill_d(p_is_pcm0);
+ cmp1 = __msa_fill_d(p_is_pcm4);
+ p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+ p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+ d0030 = (d00 + d30) >= beta;
+ d0434 = (d04 + d34) >= beta;
+
+ cmp0 = (v2i64) __msa_fill_w(d0030);
+ cmp1 = (v2i64) __msa_fill_w(d0434);
+ cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
+ cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);
+
+ if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+ (!d0030 || !d0434)) {
+ p3_src = LD_UH(p3);
+ p2_src = LD_UH(p2);
+ p1_src = LD_UH(p1);
+ p0_src = LD_UH(p0);
+
+ cmp0 = __msa_fill_d(q_is_pcm0);
+ cmp1 = __msa_fill_d(q_is_pcm4);
+ q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+ q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+ tc0 = tc[0];
+ beta30 = beta >> 3;
+ beta20 = beta >> 2;
+ tc250 = ((tc0 * 5 + 1) >> 1);
+ tc4 = tc[1];
+ tc254 = ((tc4 * 5 + 1) >> 1);
+
+ cmp0 = (v2i64) __msa_fill_h(tc0);
+ cmp1 = (v2i64) __msa_fill_h(tc4);
+
+ ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
+ p3_src, p2_src, p1_src, p0_src);
+ q0_src = LD_UH(q0);
+ q1_src = LD_UH(q1);
+ q2_src = LD_UH(q2);
+ q3_src = LD_UH(q3);
+
+ flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
+ abs(p0[0] - q0[0]) < tc250;
+ flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
+ abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
+ (d30 << 1) < beta20);
+
+ tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+ ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
+ q0_src, q1_src, q2_src, q3_src);
+ flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
+ abs(p0[4] - q0[4]) < tc254;
+ flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
+ abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
+ (d34 << 1) < beta20);
+
+ cmp0 = (v2i64) __msa_fill_w(flag0);
+ cmp1 = (v2i64) __msa_fill_w(flag1);
+ cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
+ cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0);
+
+ if (flag0 && flag1) { /* strong only */
+ /* strong filter */
tc_pos <<= 1;
tc_neg = -tc_pos;
+ /* p part */
temp0 = (p1_src + p0_src + q0_src);
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
@@ -129,15 +155,11 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
dst2 = (v16u8) (temp2 + (v8i16) p0_src);
- cmp0 = __msa_fill_d(p_is_pcm0);
- cmp1 = __msa_fill_d(p_is_pcm4);
- p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
- p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
-
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+ /* q part */
temp0 = (q1_src + p0_src + q0_src);
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
@@ -158,15 +180,176 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
dst3 = (v16u8) (temp2 + (v8i16) q0_src);
- cmp0 = __msa_fill_d(q_is_pcm0);
- cmp1 = __msa_fill_d(q_is_pcm4);
- q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
- q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+ dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+ dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+ dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+
+ /* pack results to 8 bit */
+ PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+
+ /* pack src to 8 bit */
+ PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
+ dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
+
+ dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
+ dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
+ dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
+
+ dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
+ dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
+
+ ST8x4_UB(dst0, dst1, p2, stride);
+ p2 += (4 * stride);
+ SD(dst_val0, p2);
+ p2 += stride;
+ SD(dst_val1, p2);
+ /* strong filter ends */
+ } else if (flag0 == flag1) { /* weak only */
+ /* weak filter */
+ tc_neg = -tc_pos;
+
+ diff0 = (v8i16) (q0_src - p0_src);
+ diff1 = (v8i16) (q1_src - p1_src);
+ diff0 = (diff0 << 3) + diff0;
+ diff1 = (diff1 << 1) + diff1;
+ delta0 = diff0 - diff1;
+ delta0 = __msa_srari_h(delta0, 4);
+
+ temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+ abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+ abs_delta0 = (v8u16) abs_delta0 < temp1;
+
+ delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+
+ temp0 = (v8u16) (delta0 + p0_src);
+ temp0 = (v8u16) CLIP_SH_0_255(temp0);
+ temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+ (v16u8) p_is_pcm_vec);
+
+ temp2 = (v8i16) (q0_src - delta0);
+ temp2 = CLIP_SH_0_255(temp2);
+ temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+ (v16u8) q_is_pcm_vec);
+
+ p_is_pcm_vec = ~p_is_pcm_vec;
+ q_is_pcm_vec = ~q_is_pcm_vec;
+ tmp = (beta + (beta >> 1)) >> 3;
+ cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
+ cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
+ cmp0 = __msa_ilvev_d(cmp1, cmp0);
+ cmp0 = __msa_ceqi_d(cmp0, 0);
+ p_is_pcm_vec = p_is_pcm_vec | cmp0;
+
+ cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
+ cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
+ cmp0 = __msa_ilvev_d(cmp1, cmp0);
+ cmp0 = __msa_ceqi_d(cmp0, 0);
+ q_is_pcm_vec = q_is_pcm_vec | cmp0;
+
+ tc_pos >>= 1;
+ tc_neg = -tc_pos;
+
+ delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+ delta1 -= (v8i16) p1_src;
+ delta1 += delta0;
+ delta1 >>= 1;
+ delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+ delta1 = (v8i16) p1_src + (v8i16) delta1;
+ delta1 = CLIP_SH_0_255(delta1);
+ delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+ (v16u8) p_is_pcm_vec);
+
+ delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+ delta2 = delta2 - (v8i16) q1_src;
+ delta2 = delta2 - delta0;
+ delta2 = delta2 >> 1;
+ delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+ delta2 = (v8i16) q1_src + (v8i16) delta2;
+ delta2 = CLIP_SH_0_255(delta2);
+ delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+ (v16u8) q_is_pcm_vec);
+
+ dst1 = (v16u8) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+ (v16u8) abs_delta0);
+ dst2 = (v16u8) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+ (v16u8) abs_delta0);
+ dst3 = (v16u8) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+ (v16u8) abs_delta0);
+ dst4 = (v16u8) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+ (v16u8) abs_delta0);
+ /* pack results to 8 bit */
+ PCKEV_B2_UB(dst2, dst1, dst4, dst3, dst0, dst1);
+
+ /* pack src to 8 bit */
+ PCKEV_B2_UB(p0_src, p1_src, q1_src, q0_src, dst2, dst3);
+
+ dst0 = __msa_bmz_v(dst0, dst2, (v16u8) cmp3);
+ dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
+
+ p2 += stride;
+ ST8x4_UB(dst0, dst1, p2, stride);
+ /* weak filter ends */
+ } else { /* strong + weak */
+ /* strong filter */
+ tc_pos <<= 1;
+ tc_neg = -tc_pos;
+
+ /* p part */
+ temp0 = (p1_src + p0_src + q0_src);
+ temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+ temp2 = (v8i16) (temp1 - p2_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+ temp1 = temp0 + p2_src;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+ temp2 = (v8i16) (temp1 - p1_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+ temp1 = (temp0 << 1) + p2_src + q1_src;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+ temp2 = (v8i16) (temp1 - p0_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+
+ dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+ dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+ dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+
+ /* q part */
+ temp0 = (q1_src + p0_src + q0_src);
+
+ temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+ temp2 = (v8i16) (temp1 - q2_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+ temp1 = temp0 + q2_src;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+ temp2 = (v8i16) (temp1 - q1_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+ temp1 = (temp0 << 1) + p1_src + q2_src;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+ temp2 = (v8i16) (temp1 - q0_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst3 = (v16u8) (temp2 + (v8i16) q0_src);
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+ /* pack strong results to 8 bit */
+ PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+ dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+ /* strong filter ends */
+
+ /* weak filter */
tc_pos >>= 1;
tc_neg = -tc_pos;
@@ -193,16 +376,18 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
(v16u8) q_is_pcm_vec);
+ p_is_pcm_vec = ~p_is_pcm_vec;
+ q_is_pcm_vec = ~q_is_pcm_vec;
tmp = (beta + (beta >> 1)) >> 3;
- cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
- cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
- p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
- p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+ cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
+ cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
+ cmp0 = __msa_ilvev_d(cmp1, cmp0);
+ p_is_pcm_vec = p_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
- cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
- cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
- q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
- q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+ cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
+ cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
+ cmp0 = __msa_ilvev_d(cmp1, cmp0);
+ q_is_pcm_vec = q_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
tc_pos >>= 1;
tc_neg = -tc_pos;
@@ -235,28 +420,24 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
(v16u8) abs_delta0);
delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
(v16u8) abs_delta0);
+ /* weak filter ends */
- dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
- dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
- dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
- dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
- dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
- dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
+ /* pack weak results to 8 bit */
+ PCKEV_B2_UB(delta1, p2_src, temp2, temp0, dst3, dst4);
+ dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) delta2);
- cmp0 = __msa_fill_d(d00 + d30 >= beta);
- cmp1 = __msa_fill_d(d04 + d34 >= beta);
- cmp0 = __msa_ilvev_d(cmp1, cmp0);
- cmp0 = __msa_ceqi_d(cmp0, 0);
+ /* select between weak or strong */
+ dst0 = __msa_bmnz_v(dst0, dst3, (v16u8) cmp2);
+ dst1 = __msa_bmnz_v(dst1, dst4, (v16u8) cmp2);
+ dst2 = __msa_bmnz_v(dst2, dst5, (v16u8) cmp2);
- dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp0);
- dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp0);
- dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp0);
- dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp0);
- dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp0);
- dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp0);
+ /* pack src to 8 bit */
+ PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
+ dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
- PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
- dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
+ dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
+ dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
+ dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
@@ -282,11 +463,13 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
uint16_t tmp0, tmp1;
uint32_t tmp2, tmp3;
int32_t dp00, dq00, dp30, dq30, d00, d30;
+ int32_t d0030, d0434;
int32_t dp04, dq04, dp34, dq34, d04, d34;
int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+ v2i64 cmp3;
v8u16 temp0, temp1;
v8i16 temp2;
v8i16 tc_pos, tc_neg;
@@ -312,51 +495,71 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
p_is_pcm4 = p_is_pcm[1];
q_is_pcm4 = q_is_pcm[1];
- if (!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) {
- if (!(d00 + d30 >= beta) || !(d04 + d34 >= beta)) {
- src -= 4;
- LD_UH8(src, stride,
- p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
- q3_src);
-
- tc0 = tc[0];
- beta30 = beta >> 3;
- beta20 = beta >> 2;
- tc250 = ((tc0 * 5 + 1) >> 1);
-
- tc4 = tc[1];
- tc254 = ((tc4 * 5 + 1) >> 1);
-
- TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
- q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
- q0_src, q1_src, q2_src, q3_src);
-
- flag0 = (abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
- abs(p3[-1] - p3[0]) < tc250 &&
- abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
- abs(p2[-1] - p2[0]) < tc250 &&
- (d00 << 1) < beta20 && (d30 << 1) < beta20);
- cmp0 = __msa_fill_d(flag0);
-
- flag1 = (abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
- abs(p1[-1] - p1[0]) < tc254 &&
- abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
- abs(p0[-1] - p0[0]) < tc254 &&
- (d04 << 1) < beta20 && (d34 << 1) < beta20);
- cmp1 = __msa_fill_d(flag1);
- cmp2 = __msa_ilvev_d(cmp1, cmp0);
- cmp2 = __msa_ceqi_d(cmp2, 0);
-
- ILVR_B8_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
- zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
- p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src,
- q3_src);
-
- cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
- cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
- tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+ cmp0 = __msa_fill_d(p_is_pcm0);
+ cmp1 = __msa_fill_d(p_is_pcm4);
+ p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+ p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+ d0030 = (d00 + d30) >= beta;
+ d0434 = (d04 + d34) >= beta;
+
+ cmp0 = __msa_fill_d(d0030);
+ cmp1 = __msa_fill_d(d0434);
+ cmp3 = __msa_ilvev_d(cmp1, cmp0);
+ cmp3 = (v2i64) __msa_ceqi_d(cmp3, 0);
+
+ if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+ (!d0030 || !d0434)) {
+ src -= 4;
+ LD_UH8(src, stride, p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+ q2_src, q3_src);
+
+ cmp0 = __msa_fill_d(q_is_pcm0);
+ cmp1 = __msa_fill_d(q_is_pcm4);
+ q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+ q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+ tc0 = tc[0];
+ beta30 = beta >> 3;
+ beta20 = beta >> 2;
+ tc250 = ((tc0 * 5 + 1) >> 1);
+
+ tc4 = tc[1];
+ tc254 = ((tc4 * 5 + 1) >> 1);
+ cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
+ cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
+ tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
+
+ TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+ q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
+ q0_src, q1_src, q2_src, q3_src);
+
+ flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
+ abs(p3[-1] - p3[0]) < tc250;
+ flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
+ abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
+ (d30 << 1) < beta20);
+ cmp0 = __msa_fill_d(flag0);
+ ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
+ p3_src, p2_src, p1_src, p0_src);
+
+ flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
+ abs(p1[-1] - p1[0]) < tc254;
+ flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
+ abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
+ (d34 << 1) < beta20);
+ ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
+ q0_src, q1_src, q2_src, q3_src);
+
+ cmp1 = __msa_fill_d(flag1);
+ cmp2 = __msa_ilvev_d(cmp1, cmp0);
+ cmp2 = __msa_ceqi_d(cmp2, 0);
+
+ if (flag0 && flag1) { /* strong only */
+ /* strong filter */
tc_neg = -tc_pos;
+ /* p part */
temp0 = (p1_src + p0_src + q0_src);
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
@@ -377,15 +580,11 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
dst2 = (v16u8) (temp2 + (v8i16) p0_src);
- cmp0 = __msa_fill_d(p_is_pcm0);
- cmp1 = __msa_fill_d(p_is_pcm4);
- p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
- p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
-
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+ /* q part */
temp0 = (q1_src + p0_src + q0_src);
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
@@ -405,22 +604,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
dst3 = (v16u8) (temp2 + (v8i16) q0_src);
- cmp0 = __msa_fill_d(q_is_pcm0);
- cmp1 = __msa_fill_d(q_is_pcm4);
- q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
- q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
-
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
-
+ /* strong filter ends */
+ } else if (flag0 == flag1) { /* weak only */
+ /* weak filter */
tc_pos >>= 1;
tc_neg = -tc_pos;
diff0 = (v8i16) (q0_src - p0_src);
diff1 = (v8i16) (q1_src - p1_src);
- diff0 = (v8i16) (diff0 << 3) + diff0;
- diff1 = (v8i16) (diff1 << 1) + diff1;
+ diff0 = (diff0 << 3) + diff0;
+ diff1 = (diff1 << 1) + diff1;
delta0 = diff0 - diff1;
delta0 = __msa_srari_h(delta0, 4);
@@ -429,19 +625,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
abs_delta0 = (v8u16) abs_delta0 < temp1;
delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
- temp0 = (v8u16) delta0 + p0_src;
+ temp0 = (v8u16) (delta0 + p0_src);
temp0 = (v8u16) CLIP_SH_0_255(temp0);
temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
(v16u8) p_is_pcm_vec);
- temp2 = (v8i16) q0_src - delta0;
+ temp2 = (v8i16) (q0_src - delta0);
temp2 = CLIP_SH_0_255(temp2);
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
(v16u8) q_is_pcm_vec);
tmp = ((beta + (beta >> 1)) >> 3);
- cmp0 = __msa_fill_d(!p_is_pcm0 && (dp00 + dp30 < tmp));
- cmp1 = __msa_fill_d(!p_is_pcm4 && (dp04 + dp34 < tmp));
+ cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
+ cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
@@ -472,86 +668,252 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
delta2 = CLIP_SH_0_255(delta2);
delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
(v16u8) q_is_pcm_vec);
- delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
- (v16u8) abs_delta0);
- temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
- (v16u8) abs_delta0);
- temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
- (v16u8) abs_delta0);
- delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
- (v16u8) abs_delta0);
- dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
- dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
- dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
- dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
- dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
- dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
+ dst0 = __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+ (v16u8) abs_delta0);
+ dst1 = __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+ (v16u8) abs_delta0);
+ dst2 = __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+ (v16u8) abs_delta0);
+ dst3 = __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+ (v16u8) abs_delta0);
+ /* weak filter ends */
- cmp0 = __msa_fill_d(d00 + d30 >= beta);
- dst7 = (v16u8) __msa_fill_d(d04 + d34 >= beta);
- cmp0 = __msa_ilvev_d((v2i64) dst7, cmp0);
- dst6 = (v16u8) __msa_ceqi_d(cmp0, 0);
+ dst0 = __msa_bmz_v(dst0, (v16u8) p1_src, (v16u8) cmp3);
+ dst1 = __msa_bmz_v(dst1, (v16u8) p0_src, (v16u8) cmp3);
+ dst2 = __msa_bmz_v(dst2, (v16u8) q0_src, (v16u8) cmp3);
+ dst3 = __msa_bmz_v(dst3, (v16u8) q1_src, (v16u8) cmp3);
- dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, dst6);
- dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, dst6);
- dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, dst6);
- dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, dst6);
- dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, dst6);
- dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, dst6);
+ PCKEV_B2_UB(dst2, dst0, dst3, dst1, dst0, dst1);
- PCKEV_B4_UB(dst0, dst0, dst1, dst1, dst2, dst2, dst3, dst3,
- dst0, dst1, dst2, dst3);
- PCKEV_B2_UB(dst4, dst4, dst5, dst5, dst4, dst5);
+ /* transpose */
+ ILVRL_B2_UB(dst1, dst0, dst4, dst5);
+ ILVRL_H2_UB(dst5, dst4, dst0, dst1);
- TRANSPOSE8x8_UB_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
- dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-
- src += 1;
+ src += 2;
tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
- tmp0 = __msa_copy_u_h((v8i16) dst0, 2);
- tmp3 = __msa_copy_u_w((v4i32) dst1, 0);
- tmp1 = __msa_copy_u_h((v8i16) dst1, 2);
+ tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
SW(tmp2, src);
- SH(tmp0, src + 4);
src += stride;
SW(tmp3, src);
- SH(tmp1, src + 4);
src += stride;
- tmp2 = __msa_copy_u_w((v4i32) dst2, 0);
- tmp0 = __msa_copy_u_h((v8i16) dst2, 2);
- tmp3 = __msa_copy_u_w((v4i32) dst3, 0);
- tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
+ tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
+ tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
SW(tmp2, src);
- SH(tmp0, src + 4);
src += stride;
SW(tmp3, src);
- SH(tmp1, src + 4);
src += stride;
- tmp2 = __msa_copy_u_w((v4i32) dst4, 0);
- tmp0 = __msa_copy_u_h((v8i16) dst4, 2);
- tmp3 = __msa_copy_u_w((v4i32) dst5, 0);
- tmp1 = __msa_copy_u_h((v8i16) dst5, 2);
+ tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
+ tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
SW(tmp2, src);
- SH(tmp0, src + 4);
src += stride;
SW(tmp3, src);
- SH(tmp1, src + 4);
src += stride;
- tmp2 = __msa_copy_u_w((v4i32) dst6, 0);
- tmp0 = __msa_copy_u_h((v8i16) dst6, 2);
- tmp3 = __msa_copy_u_w((v4i32) dst7, 0);
- tmp1 = __msa_copy_u_h((v8i16) dst7, 2);
+ tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
+ tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
SW(tmp2, src);
- SH(tmp0, src + 4);
src += stride;
SW(tmp3, src);
- SH(tmp1, src + 4);
+
+ return;
+ } else { /* strong + weak */
+ /* strong filter */
+ tc_neg = -tc_pos;
+
+ /* p part */
+ temp0 = (p1_src + p0_src + q0_src);
+
+ temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+ temp2 = (v8i16) (temp1 - p2_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst0 = (v16u8) (temp2 + (v8i16) p2_src);
+
+ temp1 = temp0 + p2_src;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+ temp2 = (v8i16) (temp1 - p1_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst1 = (v16u8) (temp2 + (v8i16) p1_src);
+
+ temp1 = (temp0 << 1) + p2_src + q1_src;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+ temp2 = (v8i16) (temp1 - p0_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst2 = (v16u8) (temp2 + (v8i16) p0_src);
+
+ dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
+ dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
+ dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
+
+ /* q part */
+ temp0 = (q1_src + p0_src + q0_src);
+ temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+ temp2 = (v8i16) (temp1 - q2_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst5 = (v16u8) (temp2 + (v8i16) q2_src);
+
+ temp1 = temp0 + q2_src;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
+ temp2 = (v8i16) (temp1 - q1_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst4 = (v16u8) (temp2 + (v8i16) q1_src);
+
+ temp1 = (temp0 << 1) + p1_src + q2_src;
+ temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
+ temp2 = (v8i16) (temp1 - q0_src);
+ temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ dst3 = (v16u8) (temp2 + (v8i16) q0_src);
+
+ dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
+ dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
+ dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
+ /* strong filter ends */
+
+ /* weak filter */
+ tc_pos >>= 1;
+ tc_neg = -tc_pos;
+
+ diff0 = (v8i16) (q0_src - p0_src);
+ diff1 = (v8i16) (q1_src - p1_src);
+ diff0 = (diff0 << 3) + diff0;
+ diff1 = (diff1 << 1) + diff1;
+ delta0 = diff0 - diff1;
+ delta0 = __msa_srari_h(delta0, 4);
+
+ temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
+ abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
+ abs_delta0 = (v8u16) abs_delta0 < temp1;
+
+ delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+
+ temp0 = (v8u16) (delta0 + p0_src);
+ temp0 = (v8u16) CLIP_SH_0_255(temp0);
+ temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+ (v16u8) p_is_pcm_vec);
+
+ temp2 = (v8i16) (q0_src - delta0);
+ temp2 = CLIP_SH_0_255(temp2);
+ temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+ (v16u8) q_is_pcm_vec);
+
+ tmp = (beta + (beta >> 1)) >> 3;
+ cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
+ cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
+ p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+ p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
+
+ cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
+ cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
+ q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
+ q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
+
+ tc_pos >>= 1;
+ tc_neg = -tc_pos;
+
+ delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
+ delta1 -= (v8i16) p1_src;
+ delta1 += delta0;
+ delta1 >>= 1;
+ delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+ delta1 = (v8i16) p1_src + (v8i16) delta1;
+ delta1 = CLIP_SH_0_255(delta1);
+ delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
+ (v16u8) p_is_pcm_vec);
+
+ delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
+ delta2 = delta2 - (v8i16) q1_src;
+ delta2 = delta2 - delta0;
+ delta2 = delta2 >> 1;
+ delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+ delta2 = (v8i16) q1_src + (v8i16) delta2;
+ delta2 = CLIP_SH_0_255(delta2);
+ delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
+ (v16u8) q_is_pcm_vec);
+ delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
+ (v16u8) abs_delta0);
+ temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+ (v16u8) abs_delta0);
+ temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
+ (v16u8) abs_delta0);
+ delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
+ (v16u8) abs_delta0);
+ /* weak filter ends*/
+
+ /* select between weak or strong */
+ dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
+ dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
+ dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
+ dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
+ dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
+ dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
}
+
+ dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp3);
+ dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp3);
+ dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp3);
+ dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp3);
+ dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp3);
+ dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp3);
+
+ /* pack results to 8 bit */
+ PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst4, dst4, dst5, dst5, dst0, dst1,
+ dst2, dst3);
+
+ /* transpose */
+ ILVRL_B2_UB(dst1, dst0, dst4, dst5);
+ ILVRL_B2_UB(dst3, dst2, dst6, dst7);
+ ILVRL_H2_UB(dst5, dst4, dst0, dst1);
+ ILVRL_H2_UB(dst7, dst6, dst2, dst3);
+
+ src += 1;
+
+ tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
+ tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
+ tmp0 = __msa_copy_u_h((v8i16) dst2, 0);
+ tmp1 = __msa_copy_u_h((v8i16) dst2, 2);
+ SW(tmp2, src);
+ SH(tmp0, src + 4);
+ src += stride;
+ SW(tmp3, src);
+ SH(tmp1, src + 4);
+ src += stride;
+
+ tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
+ tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
+ tmp0 = __msa_copy_u_h((v8i16) dst2, 4);
+ tmp1 = __msa_copy_u_h((v8i16) dst2, 6);
+ SW(tmp2, src);
+ SH(tmp0, src + 4);
+ src += stride;
+ SW(tmp3, src);
+ SH(tmp1, src + 4);
+ src += stride;
+
+ tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
+ tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
+ tmp0 = __msa_copy_u_h((v8i16) dst3, 0);
+ tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
+ SW(tmp2, src);
+ SH(tmp0, src + 4);
+ src += stride;
+ SW(tmp3, src);
+ SH(tmp1, src + 4);
+ src += stride;
+
+ tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
+ tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
+ tmp0 = __msa_copy_u_h((v8i16) dst3, 4);
+ tmp1 = __msa_copy_u_h((v8i16) dst3, 6);
+ SW(tmp2, src);
+ SH(tmp0, src + 4);
+ src += stride;
+ SW(tmp3, src);
+ SH(tmp1, src + 4);
}
}