/*
 * Loongson LSX optimized add_residual functions for HEVC decoding
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by jinbo <jinbo@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"

/*
 * void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 */
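/*
 * Two 4-pixel rows fit in the low half of one vector, so each macro
 * invocation packs rows 0-1 together, widens to u16, adds the residual
 * and narrows back with a single vadd.h/vssrani.bu.h pair.
 */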
.macro ADD_RES_LSX_4x4_8
    vldrepl.w      vr0,    a0,     0             // dst row 0 (4 bytes)
    add.d          t0,     a0,     a2
    vldrepl.w      vr1,    t0,     0             // dst row 1
    vld            vr2,    a1,     0             // res rows 0-1 (8 x int16)

    vilvl.w        vr1,    vr1,    vr0           // pack rows 0-1 into low 8 bytes
    vsllwil.hu.bu  vr1,    vr1,    0             // widen u8 -> u16
    vadd.h         vr1,    vr1,    vr2           // dst + res
    vssrani.bu.h   vr1,    vr1,    0             // narrow to u8 with saturation

    vstelm.w       vr1,    a0,     0,    0       // store row 0
    vstelm.w       vr1,    t0,     0,    1       // store row 1
.endm

function ff_hevc_add_residual4x4_8_lsx
    ADD_RES_LSX_4x4_8
    alsl.d         a0,     a2,     a0,   1       // dst += 2 * stride
    addi.d         a1,     a1,     16            // res += 2 rows (8 x int16)
    ADD_RES_LSX_4x4_8
endfunc

/*
 * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 */
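/*
 * Four 8-pixel rows per macro invocation, one row per vector; rows are
 * narrowed back in pairs and stored as 64-bit elements.
 */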
.macro ADD_RES_LSX_8x8_8
    vldrepl.d      vr0,    a0,     0             // dst row 0 (8 bytes)
    add.d          t0,     a0,     a2
    vldrepl.d      vr1,    t0,     0             // dst row 1
    add.d          t1,     t0,     a2
    vldrepl.d      vr2,    t1,     0             // dst row 2
    add.d          t2,     t1,     a2
    vldrepl.d      vr3,    t2,     0             // dst row 3

    vld            vr4,    a1,     0             // res row 0 (8 x int16)
    vld            vr5,    a1,     16            // res row 1
    vld            vr6,    a1,     32            // res row 2
    vld            vr7,    a1,     48            // res row 3

    vsllwil.hu.bu  vr0,    vr0,    0             // widen u8 -> u16
    vsllwil.hu.bu  vr1,    vr1,    0
    vsllwil.hu.bu  vr2,    vr2,    0
    vsllwil.hu.bu  vr3,    vr3,    0
    vadd.h         vr0,    vr0,    vr4           // dst + res
    vadd.h         vr1,    vr1,    vr5
    vadd.h         vr2,    vr2,    vr6
    vadd.h         vr3,    vr3,    vr7
    vssrani.bu.h   vr1,    vr0,    0             // narrow rows 0-1 to u8 with saturation
    vssrani.bu.h   vr3,    vr2,    0             // narrow rows 2-3

    vstelm.d       vr1,    a0,     0,     0      // store row 0
    vstelm.d       vr1,    t0,     0,     1      // store row 1
    vstelm.d       vr3,    t1,     0,     0      // store row 2
    vstelm.d       vr3,    t2,     0,     1      // store row 3
.endm

function ff_hevc_add_residual8x8_8_lsx
    ADD_RES_LSX_8x8_8
    alsl.d         a0,     a2,     a0,    2      // dst += 4 * stride
    addi.d         a1,     a1,     64            // res += 4 rows (32 x int16)
    ADD_RES_LSX_8x8_8
endfunc

/*
 * void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 */
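/*
 * Two full 16-pixel rows per iteration. Each row widens to two u16
 * vectors: the low byte half via vsllwil.hu.bu, the high half via
 * vexth.hu.bu.
 */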
function ff_hevc_add_residual16x16_8_lsx
.rept 8
    vld            vr0,    a0,     0             // dst row 0 (16 bytes)
    vldx           vr2,    a0,     a2            // dst row 1

    vld            vr4,    a1,     0             // res row 0, pixels 0-7
    vld            vr5,    a1,     16            // res row 0, pixels 8-15
    vld            vr6,    a1,     32            // res row 1, pixels 0-7
    vld            vr7,    a1,     48            // res row 1, pixels 8-15

    vexth.hu.bu    vr1,    vr0                   // widen high 8 bytes u8 -> u16
    vsllwil.hu.bu  vr0,    vr0,    0             // widen low 8 bytes
    vexth.hu.bu    vr3,    vr2
    vsllwil.hu.bu  vr2,    vr2,    0
    vadd.h         vr0,    vr0,    vr4           // dst + res
    vadd.h         vr1,    vr1,    vr5
    vadd.h         vr2,    vr2,    vr6
    vadd.h         vr3,    vr3,    vr7

    vssrani.bu.h   vr1,    vr0,    0             // narrow row 0 to u8 with saturation
    vssrani.bu.h   vr3,    vr2,    0             // narrow row 1

    vst            vr1,    a0,     0             // store row 0
    vstx           vr3,    a0,     a2            // store row 1

    alsl.d         a0,     a2,     a0,   1       // dst += 2 * stride
    addi.d         a1,     a1,     64            // res += 2 rows (32 x int16)
.endr
endfunc

/*
 * void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 */
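/*
 * One 32-pixel row per iteration, split into two 16-byte halves that
 * are widened, added and narrowed independently.
 */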
function ff_hevc_add_residual32x32_8_lsx
.rept 32
    vld            vr0,    a0,     0             // dst pixels 0-15
    vld            vr2,    a0,     16            // dst pixels 16-31

    vld            vr4,    a1,     0             // res pixels 0-7
    vld            vr5,    a1,     16            // res pixels 8-15
    vld            vr6,    a1,     32            // res pixels 16-23
    vld            vr7,    a1,     48            // res pixels 24-31

    vexth.hu.bu    vr1,    vr0                   // widen high 8 bytes u8 -> u16
    vsllwil.hu.bu  vr0,    vr0,    0             // widen low 8 bytes
    vexth.hu.bu    vr3,    vr2
    vsllwil.hu.bu  vr2,    vr2,    0
    vadd.h         vr0,    vr0,    vr4           // dst + res
    vadd.h         vr1,    vr1,    vr5
    vadd.h         vr2,    vr2,    vr6
    vadd.h         vr3,    vr3,    vr7

    vssrani.bu.h   vr1,    vr0,    0             // narrow pixels 0-15 to u8 with saturation
    vssrani.bu.h   vr3,    vr2,    0             // narrow pixels 16-31

    vst            vr1,    a0,     0             // store pixels 0-15
    vst            vr3,    a0,     16            // store pixels 16-31

    add.d          a0,     a0,     a2            // dst += stride
    addi.d         a1,     a1,     64            // res += one row (32 x int16)
.endr
endfunc