/*
 * Loongson LSX/LASX optimized h264chroma
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"

/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                   int h, int x, int y) */
function ff_put_h264_chroma_mc8_lsx
    li.d             t8,      8
    sub.d            t1,      t8,     a4     // 8-x
    sub.d            t2,      t8,     a5     // 8-y
    mul.d            t3,      t1,     t2     // A
    mul.d            t4,      a4,     t2     // B
    mul.d            t5,      t1,     a5     // C
    mul.d            t6,      a4,     a5     // D
    add.d            t0,      t4,     t5     // E
    vreplgr2vr.b     vr0,     t3
    vreplgr2vr.b     vr1,     t4
    vreplgr2vr.b     vr2,     t5
    vreplgr2vr.b     vr3,     t6
    vreplgr2vr.b     vr4,     t0
    slli.d           t2,      a2,     1      // stride * 2
    add.d            t3,      t2,     a2     // stride * 3
    slli.d           t4,      a2,     2      // stride * 4

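    /* D != 0 (x != 0 and y != 0): full 2-D path.  vr9 = {A,B,A,B,...} and
     * vr10 = {C,D,C,D,...}; each output pixel is
     * (A*cur[i] + B*cur[i+1] + C*next[i] + D*next[i+1] + 32) >> 6,
     * computed with even/odd widening multiply-adds on the interleaved
     * byte pairs. */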
    bge              zero,    t6,     .ENDLOOP_D
    move             t1,      a3
    vilvl.b          vr9,     vr1,    vr0
    vilvl.b          vr10,    vr3,    vr2
.LOOP_D:
    vld              vr5,     a1,     0
    vld              vr6,     a1,     1
    add.d            a1,      a1,     a2
    vld              vr7,     a1,     0
    vld              vr8,     a1,     1
    vilvl.b          vr11,    vr6,    vr5
    vilvl.b          vr12,    vr8,    vr7
    vmulwev.h.bu     vr13,    vr9,    vr11
    vmaddwod.h.bu    vr13,    vr9,    vr11
    vmulwev.h.bu     vr14,    vr10,   vr12
    vmaddwod.h.bu    vr14,    vr10,   vr12
    vadd.h           vr13,    vr13,   vr14
    vsrarni.b.h      vr13,    vr13,   6
    vstelm.d         vr13,    a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vld              vr6,     a1,     1
    vilvl.b          vr11,    vr8,    vr7
    vilvl.b          vr12,    vr6,    vr5
    vmulwev.h.bu     vr13,    vr9,    vr11
    vmaddwod.h.bu    vr13,    vr9,    vr11
    vmulwev.h.bu     vr14,    vr10,   vr12
    vmaddwod.h.bu    vr14,    vr10,   vr12
    vadd.h           vr13,    vr13,   vr14
    vsrarni.b.h      vr13,    vr13,   6
    vstelm.d         vr13,    a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr7,     a1,     0
    vld              vr8,     a1,     1
    vilvl.b          vr11,    vr6,    vr5
    vilvl.b          vr12,    vr8,    vr7
    vmulwev.h.bu     vr13,    vr9,    vr11
    vmaddwod.h.bu    vr13,    vr9,    vr11
    vmulwev.h.bu     vr14,    vr10,   vr12
    vmaddwod.h.bu    vr14,    vr10,   vr12
    vadd.h           vr13,    vr13,   vr14
    vsrarni.b.h      vr13,    vr13,   6
    vstelm.d         vr13,    a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vld              vr6,     a1,     1
    vilvl.b          vr11,    vr8,    vr7
    vilvl.b          vr12,    vr6,    vr5
    vmulwev.h.bu     vr13,    vr9,    vr11
    vmaddwod.h.bu    vr13,    vr9,    vr11
    vmulwev.h.bu     vr14,    vr10,   vr12
    vmaddwod.h.bu    vr14,    vr10,   vr12
    vadd.h           vr13,    vr13,   vr14
    vsrarni.b.h      vr13,    vr13,   6
    vstelm.d         vr13,    a0,     0,    0
    add.d            a0,      a0,     a2

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOP_D
    b                .ENDLOOP
.ENDLOOP_D:

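    /* D == 0 but E = B + C != 0: exactly one of x, y is zero, so the filter
     * degenerates to two taps with weights A and E (vr7 = {A,E,A,E,...}).
     * t7 is the offset of the second tap: stride for a vertical offset
     * (C != 0), 1 for a horizontal one. */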
    bge              zero,    t0,     .ENDLOOP_E
    move             t1,      a3
    li.d             t7,      1
    slt              t8,      zero,   t5
    maskeqz          t5,      a2,     t8
    masknez          t7,      t7,     t8
    or               t7,      t7,     t5
    vilvl.b          vr7,     vr4,    vr0
.LOOP_E:
    vld              vr5,     a1,     0
    vldx             vr6,     a1,     t7
    vilvl.b          vr5,     vr6,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vstelm.d         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vldx             vr6,     a1,     t7
    vilvl.b          vr5,     vr6,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vstelm.d         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vldx             vr6,     a1,     t7
    vilvl.b          vr5,     vr6,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vstelm.d         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vldx             vr6,     a1,     t7
    vilvl.b          vr5,     vr6,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vstelm.d         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOP_E
    b                .ENDLOOP
.ENDLOOP_E:

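    /* x == y == 0: only A = 64 is non-zero, so (64 * src[i] + 32) >> 6 is a
     * plain copy of the source rows. */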
    move             t1,      a3
.LOOP:
    vld              vr5,     a1,     0
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vsrarni.b.h      vr7,     vr7,    6
    vilvl.b          vr6,     vr7,    vr6
    vstelm.d         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    vldx             vr5,     a1,     a2
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vsrarni.b.h      vr7,     vr7,    6
    vilvl.b          vr6,     vr7,    vr6
    vstelm.d         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    vldx             vr5,     a1,     t2
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vsrarni.b.h      vr7,     vr7,    6
    vilvl.b          vr6,     vr7,    vr6
    vstelm.d         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    vldx             vr5,     a1,     t3
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vsrarni.b.h      vr7,     vr7,    6
    vilvl.b          vr6,     vr7,    vr6
    vstelm.d         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     t4

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOP
.ENDLOOP:
endfunc

/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                   int h, int x, int y) */
function ff_avg_h264_chroma_mc8_lsx
    li.d             t8,      8
    sub.d            t1,      t8,     a4     // 8-x
    sub.d            t2,      t8,     a5     // 8-y
    mul.d            t3,      t1,     t2     // A
    mul.d            t4,      a4,     t2     // B
    mul.d            t5,      t1,     a5     // C
    mul.d            t6,      a4,     a5     // D
    add.d            t0,      t4,     t5     // E
    vreplgr2vr.b     vr0,     t3
    vreplgr2vr.b     vr1,     t4
    vreplgr2vr.b     vr2,     t5
    vreplgr2vr.b     vr3,     t6
    vreplgr2vr.b     vr4,     t0
    slli.d           t2,      a2,     1      // stride * 2
    add.d            t3,      t2,     a2     // stride * 3
    slli.d           t4,      a2,     2      // stride * 4

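    /* Same three paths as ff_put_h264_chroma_mc8_lsx above, but each filtered
     * row is averaged with the bytes already in dst:
     * res = (sum + 32) >> 6; dst[i] = (dst[i] + res + 1) >> 1,
     * i.e. vsrari.h by 6, widen dst with vsllwil.hu.bu, vadd.h, then
     * vsrarni.b.h by 1. */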
    bge              zero,    t6,     .ENDLOOPD
    move             t1,      a3
    vilvl.b          vr9,     vr1,    vr0
    vilvl.b          vr10,    vr3,    vr2
.LOOPD:
    vld              vr5,     a1,     0
    vld              vr6,     a1,     1
    add.d            a1,      a1,     a2
    vld              vr7,     a1,     0
    vld              vr8,     a1,     1
    vld              vr11,    a0,     0
    vilvl.b          vr12,    vr6,    vr5
    vilvl.b          vr13,    vr8,    vr7
    vmulwev.h.bu     vr14,    vr9,    vr12
    vmaddwod.h.bu    vr14,    vr9,    vr12
    vmulwev.h.bu     vr15,    vr10,   vr13
    vmaddwod.h.bu    vr15,    vr10,   vr13
    vadd.h           vr14,    vr14,   vr15
    vsrari.h         vr14,    vr14,   6
    vsllwil.hu.bu    vr11,    vr11,   0
    vadd.h           vr11,    vr14,   vr11
    vsrarni.b.h      vr11,    vr11,   1
    vstelm.d         vr11,    a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vld              vr6,     a1,     1
    vld              vr11,    a0,     0
    vilvl.b          vr12,    vr8,    vr7
    vilvl.b          vr13,    vr6,    vr5
    vmulwev.h.bu     vr14,    vr9,    vr12
    vmaddwod.h.bu    vr14,    vr9,    vr12
    vmulwev.h.bu     vr15,    vr10,   vr13
    vmaddwod.h.bu    vr15,    vr10,   vr13
    vadd.h           vr14,    vr14,   vr15
    vsrari.h         vr14,    vr14,   6
    vsllwil.hu.bu    vr11,    vr11,   0
    vadd.h           vr11,    vr14,   vr11
    vsrarni.b.h      vr11,    vr11,   1
    vstelm.d         vr11,    a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr7,     a1,     0
    vld              vr8,     a1,     1
    vld              vr11,    a0,     0
    vilvl.b          vr12,    vr6,    vr5
    vilvl.b          vr13,    vr8,    vr7
    vmulwev.h.bu     vr14,    vr9,    vr12
    vmaddwod.h.bu    vr14,    vr9,    vr12
    vmulwev.h.bu     vr15,    vr10,   vr13
    vmaddwod.h.bu    vr15,    vr10,   vr13
    vadd.h           vr14,    vr14,   vr15
    vsrari.h         vr14,    vr14,   6
    vsllwil.hu.bu    vr11,    vr11,   0
    vadd.h           vr11,    vr14,   vr11
    vsrarni.b.h      vr11,    vr11,   1
    vstelm.d         vr11,    a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vld              vr6,     a1,     1
    vld              vr11,    a0,     0
    vilvl.b          vr12,    vr8,    vr7
    vilvl.b          vr13,    vr6,    vr5
    vmulwev.h.bu     vr14,    vr9,    vr12
    vmaddwod.h.bu    vr14,    vr9,    vr12
    vmulwev.h.bu     vr15,    vr10,   vr13
    vmaddwod.h.bu    vr15,    vr10,   vr13
    vadd.h           vr14,    vr14,   vr15
    vsrari.h         vr14,    vr14,   6
    vsllwil.hu.bu    vr11,    vr11,   0
    vadd.h           vr11,    vr14,   vr11
    vsrarni.b.h      vr11,    vr11,   1
    vstelm.d         vr11,    a0,     0,    0
    add.d            a0,      a0,     a2

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOPD
    b                .ENDLOOPELSE
.ENDLOOPD:

    bge              zero,    t0,     .ENDLOOPE
    move             t1,      a3
    li.d             t7,      1
    slt              t8,      zero,   t5
    maskeqz          t5,      a2,     t8
    masknez          t7,      t7,     t8
    or               t7,      t7,     t5
    vilvl.b          vr7,     vr4,    vr0
.LOOPE:
    vld              vr5,     a1,     0
    vldx             vr6,     a1,     t7
    vld              vr8,     a0,     0
    vilvl.b          vr5,     vr6,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrari.h         vr6,     vr6,    6
    vsllwil.hu.bu    vr8,     vr8,    0
    vadd.h           vr8,     vr6,    vr8
    vsrarni.b.h      vr8,     vr8,    1
    vstelm.d         vr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vldx             vr6,     a1,     t7
    vld              vr8,     a0,     0
    vilvl.b          vr5,     vr6,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrari.h         vr6,     vr6,    6
    vsllwil.hu.bu    vr8,     vr8,    0
    vadd.h           vr8,     vr6,    vr8
    vsrarni.b.h      vr8,     vr8,    1
    vstelm.d         vr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vldx             vr6,     a1,     t7
    vld              vr8,     a0,     0
    vilvl.b          vr5,     vr6,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrari.h         vr6,     vr6,    6
    vsllwil.hu.bu    vr8,     vr8,    0
    vadd.h           vr8,     vr6,    vr8
    vsrarni.b.h      vr8,     vr8,    1
    vstelm.d         vr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    vld              vr5,     a1,     0
    vldx             vr6,     a1,     t7
    vld              vr8,     a0,     0
    vilvl.b          vr5,     vr6,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrari.h         vr6,     vr6,    6
    vsllwil.hu.bu    vr8,     vr8,    0
    vadd.h           vr8,     vr6,    vr8
    vsrarni.b.h      vr8,     vr8,    1
    vstelm.d         vr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOPE
    b                .ENDLOOPELSE
.ENDLOOPE:

    move             t1,      a3
.LOOPELSE:
    vld              vr5,     a1,     0
    vld              vr8,     a0,     0
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vilvl.h          vr6,     vr7,    vr6
    vsrari.h         vr6,     vr6,    6
    vsllwil.hu.bu    vr8,     vr8,    0
    vadd.h           vr8,     vr6,    vr8
    vsrarni.b.h      vr8,     vr8,    1
    vstelm.d         vr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    vldx             vr5,     a1,     a2
    vld              vr8,     a0,     0
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vilvl.h          vr6,     vr7,    vr6
    vsrari.h         vr6,     vr6,    6
    vsllwil.hu.bu    vr8,     vr8,    0
    vadd.h           vr8,     vr6,    vr8
    vsrarni.b.h      vr8,     vr8,    1
    vstelm.d         vr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    vldx             vr5,     a1,     t2
    vld              vr8,     a0,     0
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vilvl.h          vr6,     vr7,    vr6
    vsrari.h         vr6,     vr6,    6
    vsllwil.hu.bu    vr8,     vr8,    0
    vadd.h           vr8,     vr6,    vr8
    vsrarni.b.h      vr8,     vr8,    1
    vstelm.d         vr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    vldx             vr5,     a1,     t3
    vld              vr8,     a0,     0
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vilvl.h          vr6,     vr7,    vr6
    vsrari.h         vr6,     vr6,    6
    vsllwil.hu.bu    vr8,     vr8,    0
    vadd.h           vr8,     vr6,    vr8
    vsrarni.b.h      vr8,     vr8,    1
    vstelm.d         vr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     t4

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOPELSE
.ENDLOOPELSE:
endfunc

/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                   int h, int x, int y) */
function ff_put_h264_chroma_mc4_lsx
    li.d             t8,      8
    sub.d            t1,      t8,     a4     // 8-x
    sub.d            t2,      t8,     a5     // 8-y
    mul.d            t3,      t1,     t2     // A
    mul.d            t4,      a4,     t2     // B
    mul.d            t5,      t1,     a5     // C
    mul.d            t6,      a4,     a5     // D
    add.d            t0,      t4,     t5     // E
    slli.d           t8,      a2,     1      // stride * 2
    vreplgr2vr.b     vr0,     t3
    vreplgr2vr.b     vr1,     t4
    vreplgr2vr.b     vr2,     t5
    vreplgr2vr.b     vr3,     t6
    vreplgr2vr.b     vr4,     t0

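    /* 4-pixel-wide version of ff_put_h264_chroma_mc8_lsx: two consecutive
     * output rows are packed into one vector with vilvl.d, so every loop
     * iteration filters and stores two rows. */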
    bge              zero,    t6,     .ENDPUT_D
    move             t1,      a3
    vilvl.b          vr9,     vr1,    vr0
    vilvl.b          vr10,    vr3,    vr2
.PUT_D:
    vld              vr5,     a1,     0
    vld              vr6,     a1,     1
    add.d            a1,      a1,     a2
    vld              vr7,     a1,     0
    vld              vr8,     a1,     1
    add.d            a1,      a1,     a2
    vld              vr11,    a1,     0
    vld              vr12,    a1,     1
    vilvl.b          vr5,     vr6,    vr5
    vilvl.b          vr7,     vr8,    vr7
    vilvl.b          vr13,    vr12,   vr11
    vilvl.d          vr5,     vr7,    vr5
    vilvl.d          vr13,    vr13,   vr7
    vmulwev.h.bu     vr14,    vr9,    vr5
    vmaddwod.h.bu    vr14,    vr9,    vr5
    vmulwev.h.bu     vr15,    vr10,   vr13
    vmaddwod.h.bu    vr15,    vr10,   vr13
    vadd.h           vr14,    vr14,   vr15
    vsrarni.b.h      vr14,    vr14,   6
    vstelm.w         vr14,    a0,     0,    0
    add.d            a0,      a0,     a2
    vstelm.w         vr14,    a0,     0,    1
    add.d            a0,      a0,     a2
    addi.d           t1,      t1,     -2
    blt              zero,    t1,     .PUT_D
    b                .ENDPUT
.ENDPUT_D:

    bge              zero,    t0,     .ENDPUT_E
    move             t1,      a3
    li.d             t7,      1
    slt              t8,      zero,   t5
    maskeqz          t5,      a2,     t8
    masknez          t7,      t7,     t8
    or               t7,      t7,     t5
    vilvl.b          vr7,     vr4,    vr0
.PUT_E:
    vld              vr5,     a1,     0
    vldx             vr6,     a1,     t7
    vilvl.b          vr5,     vr6,    vr5
    add.d            a1,      a1,     a2
    vld              vr8,     a1,     0
    vldx             vr9,     a1,     t7
    vilvl.b          vr8,     vr9,    vr8
    vilvl.d          vr5,     vr8,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vstelm.w         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    vstelm.w         vr6,     a0,     0,    1
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    addi.d           t1,      t1,     -2
    blt              zero,    t1,     .PUT_E
    b                .ENDPUT
.ENDPUT_E:

    move             t1,      a3
.PUT:
    vld              vr5,     a1,     0
    vldx             vr8,     a1,     a2
    vilvl.w          vr5,     vr8,    vr5
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vsrarni.b.h      vr7,     vr7,    6
    vilvl.b          vr6,     vr7,    vr6
    vstelm.w         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    vstelm.w         vr6,     a0,     0,    1
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     t8
    addi.d           t1,      t1,     -2
    blt              zero,    t1,     .PUT
.ENDPUT:
endfunc

/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int h, int x, int y) */
function ff_put_h264_chroma_mc8_lasx
    li.d             t8,      8
    sub.d            t1,      t8,     a4     // 8-x
    sub.d            t2,      t8,     a5     // 8-y
    mul.d            t3,      t1,     t2     // A
    mul.d            t4,      a4,     t2     // B
    mul.d            t5,      t1,     a5     // C
    mul.d            t6,      a4,     a5     // D
    add.d            t0,      t4,     t5     // E
    xvreplgr2vr.b    xr0,     t3
    xvreplgr2vr.b    xr1,     t4
    xvreplgr2vr.b    xr2,     t5
    xvreplgr2vr.b    xr3,     t6
    xvreplgr2vr.b    xr4,     t0
    slli.d           t2,      a2,     1      // stride * 2
    add.d            t3,      t2,     a2     // stride * 3
    slli.d           t4,      a2,     2      // stride * 4

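    /* LASX (256-bit) version of ff_put_h264_chroma_mc8_lsx: the same three
     * paths, but two interleaved 8-pixel rows are packed per xr register
     * with xvpermi.q, so one widening multiply-add pass filters two output
     * rows and each loop iteration produces four. */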
    bge              zero,    t6,     .ENDLOOP_DA
    move             t1,      a3
    xvilvl.b         xr9,     xr1,    xr0
    xvilvl.b         xr10,    xr3,    xr2
.LOOP_DA:
    fld.d            f5,      a1,     0
    fld.d            f6,      a1,     1
    add.d            a1,      a1,     a2
    fld.d            f7,      a1,     0
    fld.d            f8,      a1,     1
    add.d            a1,      a1,     a2
    fld.d            f13,     a1,     0
    fld.d            f14,     a1,     1
    add.d            a1,      a1,     a2
    fld.d            f15,     a1,     0
    fld.d            f16,     a1,     1
    add.d            a1,      a1,     a2
    fld.d            f17,     a1,     0
    fld.d            f18,     a1,     1
    vilvl.b          vr11,    vr6,    vr5
    vilvl.b          vr12,    vr8,    vr7
    vilvl.b          vr14,    vr14,   vr13
    vilvl.b          vr15,    vr16,   vr15
    vilvl.b          vr16,    vr18,   vr17
    xvpermi.q        xr11,    xr12,   0x02
    xvpermi.q        xr12,    xr14,   0x02
    xvpermi.q        xr14,    xr15,   0x02
    xvpermi.q        xr15,    xr16,   0x02

    xvmulwev.h.bu    xr19,    xr9,    xr11
    xvmaddwod.h.bu   xr19,    xr9,    xr11
    xvmulwev.h.bu    xr20,    xr10,   xr12
    xvmaddwod.h.bu   xr20,    xr10,   xr12
    xvadd.h          xr21,    xr19,   xr20
    xvsrarni.b.h     xr21,    xr21,   6
    vstelm.d         vr21,    a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr21,    a0,     0,    2
    add.d            a0,      a0,     a2
    xvmulwev.h.bu    xr13,    xr9,    xr14
    xvmaddwod.h.bu   xr13,    xr9,    xr14
    xvmulwev.h.bu    xr14,    xr10,   xr15
    xvmaddwod.h.bu   xr14,    xr10,   xr15
    xvadd.h          xr13,    xr13,   xr14
    xvsrarni.b.h     xr13,    xr13,   6
    vstelm.d         vr13,    a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr13,    a0,     0,    2
    add.d            a0,      a0,     a2

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOP_DA
    b                .ENDLOOPA
.ENDLOOP_DA:

    bge              zero,    t0,     .ENDLOOP_EA
    move             t1,      a3
    li.d             t7,      1
    slt              t8,      zero,   t5
    maskeqz          t5,      a2,     t8
    masknez          t7,      t7,     t8
    or               t7,      t7,     t5
    xvilvl.b         xr7,     xr4,    xr0
.LOOP_EA:
    fld.d            f5,      a1,     0
    fldx.d           f6,      a1,     t7
    add.d            a1,      a1,     a2
    fld.d            f9,      a1,     0
    fldx.d           f10,     a1,     t7
    add.d            a1,      a1,     a2
    fld.d            f11,     a1,     0
    fldx.d           f12,     a1,     t7
    add.d            a1,      a1,     a2
    fld.d            f13,     a1,     0
    fldx.d           f14,     a1,     t7
    vilvl.b          vr5,     vr6,    vr5
    vilvl.b          vr9,     vr10,   vr9
    vilvl.b          vr11,    vr12,   vr11
    vilvl.b          vr13,    vr14,   vr13
    xvpermi.q        xr5,     xr9,    0x02
    xvpermi.q        xr11,    xr13,   0x02

    xvmulwev.h.bu    xr8,     xr7,    xr5
    xvmaddwod.h.bu   xr8,     xr7,    xr5
    xvmulwev.h.bu    xr6,     xr7,    xr11
    xvmaddwod.h.bu   xr6,     xr7,    xr11
    xvsrarni.b.h     xr8,     xr8,    6
    vstelm.d         vr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr8,     a0,     0,    2
    add.d            a0,      a0,     a2
    xvsrarni.b.h     xr6,     xr6,    6
    vstelm.d         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr6,     a0,     0,    2
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOP_EA
    b                .ENDLOOPA
.ENDLOOP_EA:

    move             t1,      a3
.LOOPA:
    fld.d            f5,      a1,     0
    fldx.d           f6,      a1,     a2
    fldx.d           f7,      a1,     t2
    fldx.d           f8,      a1,     t3
    vilvl.d          vr5,     vr6,    vr5
    vilvl.d          vr7,     vr8,    vr7
    xvpermi.q        xr5,     xr7,    0x02
    xvmulwev.h.bu    xr6,     xr0,    xr5
    xvmulwod.h.bu    xr7,     xr0,    xr5
    xvilvl.h         xr8,     xr7,    xr6
    xvilvh.h         xr9,     xr7,    xr6
    xvsrarni.b.h     xr9,     xr8,    6
    vstelm.d         vr9,     a0,     0,    0
    add.d            a0,      a0,     a2
    vstelm.d         vr9,     a0,     0,    1
    add.d            a0,      a0,     a2
    xvstelm.d        xr9,     a0,     0,    2
    add.d            a0,      a0,     a2
    xvstelm.d        xr9,     a0,     0,    3
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     t4

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOPA
.ENDLOOPA:
endfunc

/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int h, int x, int y) */
function ff_avg_h264_chroma_mc8_lasx
    li.d             t8,      8
    sub.d            t1,      t8,     a4     // 8-x
    sub.d            t2,      t8,     a5     // 8-y
    mul.d            t3,      t1,     t2     // A
    mul.d            t4,      a4,     t2     // B
    mul.d            t5,      t1,     a5     // C
    mul.d            t6,      a4,     a5     // D
    add.d            t0,      t4,     t5     // E
    xvreplgr2vr.b    xr0,     t3
    xvreplgr2vr.b    xr1,     t4
    xvreplgr2vr.b    xr2,     t5
    xvreplgr2vr.b    xr3,     t6
    xvreplgr2vr.b    xr4,     t0
    slli.d           t2,      a2,     1      // stride * 2
    add.d            t3,      t2,     a2     // stride * 3
    slli.d           t4,      a2,     2      // stride * 4

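    /* LASX version of ff_avg_h264_chroma_mc8_lsx: row pairs are packed with
     * xvpermi.q as in the put function above, and the filtered pair is
     * rounding-averaged with the corresponding pair of dst rows. */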
    bge              zero,    t6,     .ENDLOOPDA
    move             t1,      a3
    xvilvl.b         xr9,     xr1,    xr0
    xvilvl.b         xr10,    xr3,    xr2
.LOOPDA:
    fld.d            f5,      a1,     0
    fld.d            f6,      a1,     1
    add.d            a1,      a1,     a2
    fld.d            f7,      a1,     0
    fld.d            f8,      a1,     1
    add.d            a1,      a1,     a2
    fld.d            f11,     a1,     0
    fld.d            f12,     a1,     1
    add.d            a1,      a1,     a2
    fld.d            f13,     a1,     0
    fld.d            f14,     a1,     1
    add.d            a1,      a1,     a2
    fld.d            f15,     a1,     0
    fld.d            f16,     a1,     1
    fld.d            f17,     a0,     0
    fldx.d           f18,     a0,     a2
    fldx.d           f19,     a0,     t2
    fldx.d           f20,     a0,     t3
    vilvl.b          vr5,     vr6,    vr5
    vilvl.b          vr7,     vr8,    vr7
    vilvl.b          vr11,    vr12,   vr11
    vilvl.b          vr13,    vr14,   vr13
    vilvl.b          vr16,    vr16,   vr15
    xvpermi.q        xr5,     xr7,    0x02
    xvpermi.q        xr7,     xr11,   0x02
    xvpermi.q        xr11,    xr13,   0x02
    xvpermi.q        xr13,    xr16,   0x02
    xvpermi.q        xr17,    xr18,   0x02
    xvpermi.q        xr19,    xr20,   0x02

    xvmulwev.h.bu    xr14,    xr9,    xr5
    xvmaddwod.h.bu   xr14,    xr9,    xr5
    xvmulwev.h.bu    xr15,    xr10,   xr7
    xvmaddwod.h.bu   xr15,    xr10,   xr7
    xvadd.h          xr14,    xr14,   xr15
    xvsrari.h        xr14,    xr14,   6
    xvsllwil.hu.bu   xr17,    xr17,   0
    xvadd.h          xr20,    xr14,   xr17
    xvsrarni.b.h     xr20,    xr20,   1
    xvstelm.d        xr20,    a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr20,    a0,     0,    2
    add.d            a0,      a0,     a2
    xvmulwev.h.bu    xr14,    xr9,    xr11
    xvmaddwod.h.bu   xr14,    xr9,    xr11
    xvmulwev.h.bu    xr15,    xr10,   xr13
    xvmaddwod.h.bu   xr15,    xr10,   xr13
    xvadd.h          xr14,    xr14,   xr15
    xvsrari.h        xr14,    xr14,   6
    xvsllwil.hu.bu   xr19,    xr19,   0
    xvadd.h          xr21,    xr14,   xr19
    xvsrarni.b.h     xr21,    xr21,   1
    xvstelm.d        xr21,    a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr21,    a0,     0,    2
    add.d            a0,      a0,     a2

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOPDA
    b                .ENDLOOPELSEA
.ENDLOOPDA:

    bge              zero,    t0,     .ENDLOOPEA
    move             t1,      a3
    li.d             t7,      1
    slt              t8,      zero,   t5
    maskeqz          t5,      a2,     t8
    masknez          t7,      t7,     t8
    or               t7,      t7,     t5
    xvilvl.b         xr7,     xr4,    xr0
.LOOPEA:
    fld.d            f5,      a1,     0
    fldx.d           f6,      a1,     t7
    add.d            a1,      a1,     a2
    fld.d            f8,      a1,     0
    fldx.d           f9,      a1,     t7
    add.d            a1,      a1,     a2
    fld.d            f10,     a1,     0
    fldx.d           f11,     a1,     t7
    add.d            a1,      a1,     a2
    fld.d            f12,     a1,     0
    fldx.d           f13,     a1,     t7
    add.d            a1,      a1,     a2
    fld.d            f14,     a0,     0
    fldx.d           f15,     a0,     a2
    fldx.d           f16,     a0,     t2
    fldx.d           f17,     a0,     t3
    vilvl.b          vr5,     vr6,    vr5
    vilvl.b          vr8,     vr9,    vr8
    vilvl.b          vr10,    vr11,   vr10
    vilvl.b          vr12,    vr13,   vr12
    xvpermi.q        xr5,     xr8,    0x02
    xvpermi.q        xr10,    xr12,   0x02
    xvpermi.q        xr14,    xr15,   0x02
    xvpermi.q        xr16,    xr17,   0x02

    xvmulwev.h.bu    xr6,     xr7,    xr5
    xvmaddwod.h.bu   xr6,     xr7,    xr5
    xvsrari.h        xr6,     xr6,    6
    xvsllwil.hu.bu   xr14,    xr14,   0
    xvadd.h          xr8,     xr6,    xr14
    xvsrarni.b.h     xr8,     xr8,    1
    xvstelm.d        xr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr8,     a0,     0,    2
    add.d            a0,      a0,     a2
    xvmulwev.h.bu    xr6,     xr7,    xr10
    xvmaddwod.h.bu   xr6,     xr7,    xr10
    xvsrari.h        xr6,     xr6,    6
    xvsllwil.hu.bu   xr16,    xr16,   0
    xvadd.h          xr8,     xr6,    xr16
    xvsrarni.b.h     xr8,     xr8,    1
    xvstelm.d        xr8,     a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr8,     a0,     0,    2
    add.d            a0,      a0,     a2

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOPEA
    b                .ENDLOOPELSEA
.ENDLOOPEA:

    move             t1,      a3
.LOOPELSEA:
    fld.d            f5,      a1,     0
    fldx.d           f6,      a1,     a2
    fldx.d           f7,      a1,     t2
    fldx.d           f8,      a1,     t3
    fld.d            f9,      a0,     0
    fldx.d           f10,     a0,     a2
    fldx.d           f11,     a0,     t2
    fldx.d           f12,     a0,     t3
    xvpermi.q        xr5,     xr6,    0x02
    xvpermi.q        xr7,     xr8,    0x02
    xvpermi.q        xr9,     xr10,   0x02
    xvpermi.q        xr11,    xr12,   0x02

    xvmulwev.h.bu    xr12,    xr0,    xr5
    xvmulwod.h.bu    xr13,    xr0,    xr5
    xvilvl.h         xr12,    xr13,   xr12
    xvsrari.h        xr12,    xr12,   6
    xvsllwil.hu.bu   xr9,     xr9,    0
    xvadd.h          xr9,     xr12,   xr9
    xvsrarni.b.h     xr9,     xr9,    1
    xvstelm.d        xr9,     a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr9,     a0,     0,    2
    add.d            a0,      a0,     a2
    xvmulwev.h.bu    xr12,    xr0,    xr7
    xvmulwod.h.bu    xr13,    xr0,    xr7
    xvilvl.h         xr12,    xr13,   xr12
    xvsrari.h        xr12,    xr12,   6
    xvsllwil.hu.bu   xr11,    xr11,   0
    xvadd.h          xr13,    xr12,   xr11
    xvsrarni.b.h     xr13,    xr13,   1
    xvstelm.d        xr13,    a0,     0,    0
    add.d            a0,      a0,     a2
    xvstelm.d        xr13,    a0,     0,    2
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     t4

    addi.d           t1,      t1,     -4
    blt              zero,    t1,     .LOOPELSEA
.ENDLOOPELSEA:
endfunc

/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                    int h, int x, int y) */
function ff_put_h264_chroma_mc4_lasx
    li.d             t8,      8
    sub.d            t1,      t8,     a4     // 8-x
    sub.d            t2,      t8,     a5     // 8-y
    mul.d            t3,      t1,     t2     // A
    mul.d            t4,      a4,     t2     // B
    mul.d            t5,      t1,     a5     // C
    mul.d            t6,      a4,     a5     // D
    add.d            t0,      t4,     t5     // E
    slli.d           t8,      a2,     1      // stride * 2
    vreplgr2vr.b     vr0,     t3
    vreplgr2vr.b     vr1,     t4
    vreplgr2vr.b     vr2,     t5
    vreplgr2vr.b     vr3,     t6
    vreplgr2vr.b     vr4,     t0

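    /* 4-pixel-wide LASX entry point: a pair of 4-pixel rows already fits in
     * a 128-bit register, so the loops below are essentially the same LSX
     * code as ff_put_h264_chroma_mc4_lsx. */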
    bge              zero,    t6,     .ENDPUT_DA
    move             t1,      a3
    vilvl.b          vr9,     vr1,    vr0
    vilvl.b          vr10,    vr3,    vr2
.PUT_DA:
    fld.d            f5,      a1,     0
    fld.d            f6,      a1,     1
    add.d            a1,      a1,     a2
    fld.d            f7,      a1,     0
    fld.d            f8,      a1,     1
    add.d            a1,      a1,     a2
    fld.d            f11,     a1,     0
    fld.d            f12,     a1,     1
    vilvl.b          vr5,     vr6,    vr5
    vilvl.b          vr7,     vr8,    vr7
    vilvl.b          vr13,    vr12,   vr11
    vilvl.d          vr5,     vr7,    vr5
    vilvl.d          vr13,    vr13,   vr7
    vmulwev.h.bu     vr14,    vr9,    vr5
    vmaddwod.h.bu    vr14,    vr9,    vr5
    vmulwev.h.bu     vr15,    vr10,   vr13
    vmaddwod.h.bu    vr15,    vr10,   vr13
    xvadd.h          xr14,    xr14,   xr15   // only the low 128 bits are used
    vsrarni.b.h      vr16,    vr14,   6
    vstelm.w         vr16,    a0,     0,    0
    add.d            a0,      a0,     a2
    vstelm.w         vr16,    a0,     0,    1
    add.d            a0,      a0,     a2
    addi.d           t1,      t1,     -2
    blt              zero,    t1,     .PUT_DA
    b                .ENDPUTA
.ENDPUT_DA:

    bge              zero,    t0,     .ENDPUT_EA
    move             t1,      a3
    li.d             t7,      1
    slt              t8,      zero,   t5
    maskeqz          t5,      a2,     t8
    masknez          t7,      t7,     t8
    or               t7,      t7,     t5
    vilvl.b          vr7,     vr4,    vr0
.PUT_EA:
    fld.d            f5,      a1,     0
    fldx.d           f6,      a1,     t7
    vilvl.b          vr5,     vr6,    vr5
    add.d            a1,      a1,     a2
    fld.d            f8,      a1,     0
    fldx.d           f9,      a1,     t7
    vilvl.b          vr8,     vr9,    vr8
    vilvl.d          vr5,     vr8,    vr5
    vmulwev.h.bu     vr6,     vr7,    vr5
    vmaddwod.h.bu    vr6,     vr7,    vr5
    vsrarni.b.h      vr6,     vr6,    6
    vstelm.w         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    vstelm.w         vr6,     a0,     0,    1
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     a2
    addi.d           t1,      t1,     -2
    blt              zero,    t1,     .PUT_EA
    b                .ENDPUTA
.ENDPUT_EA:

    move             t1,      a3
.PUTA:
    fld.d            f5,      a1,     0
    fldx.d           f8,      a1,     a2
    vilvl.w          vr5,     vr8,    vr5
    vmulwev.h.bu     vr6,     vr0,    vr5
    vmulwod.h.bu     vr7,     vr0,    vr5
    vilvl.h          vr6,     vr7,    vr6
    vsrarni.b.h      vr6,     vr6,    6
    vstelm.w         vr6,     a0,     0,    0
    add.d            a0,      a0,     a2
    vstelm.w         vr6,     a0,     0,    1
    add.d            a0,      a0,     a2
    add.d            a1,      a1,     t8
    addi.d           t1,      t1,     -2
    blt              zero,    t1,     .PUTA
.ENDPUTA:
endfunc