/*
 * software YUV to RGB converter
 *
 * Copyright (C) 2001-2007 Michael Niedermayer
 * Copyright (C) 2009-2010 Konstantin Shishkov
 *
 * MMX/MMXEXT template stuff (needed for fast movntq support),
 * 1,4,8bpp support and context / deglobalize stuff
 * by Michael Niedermayer (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"

#if HAVE_X86ASM

#define YUV2RGB_LOOP(depth)                                          \
    h_size = (c->opts.dst_w + 7) & ~7;                               \
    if (h_size * depth > FFABS(dstStride[0]))                        \
        h_size -= 8;                                                 \
                                                                     \
    vshift = c->opts.src_format != AV_PIX_FMT_YUV422P;               \
                                                                     \
    for (y = 0; y < srcSliceH; y++) {                                \
        uint8_t *image    = dst[0] + (y + srcSliceY) * dstStride[0]; \
        const uint8_t *py = src[0] +               y * srcStride[0]; \
        const uint8_t *pu = src[1] +   (y >> vshift) * srcStride[1]; \
        const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
        x86_reg index = -h_size / 2;                                 \

extern void ff_yuv_420_rgb24_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                   const uint8_t *py_2index);
extern void ff_yuv_420_bgr24_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                   const uint8_t *py_2index);

extern void ff_yuv_420_rgb15_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                   const uint8_t *py_2index);
extern void ff_yuv_420_rgb16_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                   const uint8_t *py_2index);
extern void ff_yuv_420_rgb32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                   const uint8_t *py_2index);
extern void ff_yuv_420_bgr32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                   const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                   const uint8_t *py_2index);
extern void ff_yuva_420_rgb32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                    const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                    const uint8_t *py_2index, const uint8_t *pa_2index);
extern void ff_yuva_420_bgr32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                    const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                    const uint8_t *py_2index, const uint8_t *pa_2index);
#if ARCH_X86_64
extern void ff_yuv_420_gbrp24_ssse3(x86_reg index, uint8_t *image, uint8_t *dst_b, uint8_t *dst_r,
                                    const uint8_t *pu_index, const uint8_t *pv_index,
                                    const uint64_t *pointer_c_dither,
                                    const uint8_t *py_2index);
#endif

static inline int yuv420_rgb15_ssse3(SwsInternal *c, const uint8_t *const src[],
                                     const int srcStride[],
                                     int srcSliceY, int srcSliceH,
                                     uint8_t *const dst[], const int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(2)

        c->blueDither  = ff_dither8[y       & 1];
        c->greenDither = ff_dither8[y       & 1];
        c->redDither   = ff_dither8[(y + 1) & 1];

        ff_yuv_420_rgb15_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

static inline int yuv420_rgb16_ssse3(SwsInternal *c, const uint8_t *const src[],
                                     const int srcStride[],
                                     int srcSliceY, int srcSliceH,
                                     uint8_t *const dst[], const int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(2)

        c->blueDither  = ff_dither8[y       & 1];
        c->greenDither = ff_dither4[y       & 1];
        c->redDither   = ff_dither8[(y + 1) & 1];

        ff_yuv_420_rgb16_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

static inline int yuv420_rgb32_ssse3(SwsInternal *c, const uint8_t *const src[],
                                     const int srcStride[],
                                     int srcSliceY, int srcSliceH,
                                     uint8_t *const dst[], const int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(4)

        ff_yuv_420_rgb32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

static inline int yuv420_bgr32_ssse3(SwsInternal *c, const uint8_t *const src[],
                                     const int srcStride[],
                                     int srcSliceY, int srcSliceH,
                                     uint8_t *const dst[], const int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(4)

        ff_yuv_420_bgr32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

static inline int yuva420_rgb32_ssse3(SwsInternal *c, const uint8_t *const src[],
                                      const int srcStride[],
                                      int srcSliceY, int srcSliceH,
                                      uint8_t *const dst[], const int dstStride[])
{
    int y, h_size, vshift;
    YUV2RGB_LOOP(4)

        const uint8_t *pa = src[3] + y * srcStride[3];
        ff_yuva_420_rgb32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
    }
    return srcSliceH;
}

static inline int yuva420_bgr32_ssse3(SwsInternal *c, const uint8_t *const src[],
                                      const int srcStride[],
                                      int srcSliceY, int srcSliceH,
                                      uint8_t *const dst[], const int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(4)

        const uint8_t *pa = src[3] + y * srcStride[3];
        ff_yuva_420_bgr32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
    }
    return srcSliceH;
}

static inline int yuv420_rgb24_ssse3(SwsInternal *c, const uint8_t *const src[],
                                     const int srcStride[],
                                     int srcSliceY, int srcSliceH,
                                     uint8_t *const dst[], const int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(3)

        ff_yuv_420_rgb24_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

static inline int yuv420_bgr24_ssse3(SwsInternal *c, const uint8_t *const src[],
                                     const int srcStride[],
                                     int srcSliceY, int srcSliceH,
                                     uint8_t *const dst[], const int dstStride[])
{
    int y, h_size, vshift;

    YUV2RGB_LOOP(3)

        ff_yuv_420_bgr24_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}

#if ARCH_X86_64
static inline int yuv420_gbrp_ssse3(SwsInternal *c, const uint8_t *const src[],
                                    const int srcStride[],
                                    int srcSliceY, int srcSliceH,
                                    uint8_t *const dst[], const int dstStride[])
{
    int y, h_size, vshift;

    h_size = (c->opts.dst_w + 7) & ~7;
    if (h_size * 3 > FFABS(dstStride[0]))
        h_size -= 8;

    vshift = c->opts.src_format != AV_PIX_FMT_YUV422P;

    for (y = 0; y < srcSliceH; y++) {
        uint8_t *dst_g    = dst[0] + (y + srcSliceY) * dstStride[0];
        uint8_t *dst_b    = dst[1] + (y + srcSliceY) * dstStride[1];
        uint8_t *dst_r    = dst[2] + (y + srcSliceY) * dstStride[2];
        const uint8_t *py = src[0] +               y * srcStride[0];
        const uint8_t *pu = src[1] +   (y >> vshift) * srcStride[1];
        const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2];
        x86_reg index = -h_size / 2;

        ff_yuv_420_gbrp24_ssse3(index, dst_g, dst_b, dst_r, pu - index, pv - index, &(c->redDither), py - 2 * index);
    }
    return srcSliceH;
}
#endif

#endif /* HAVE_X86ASM */

av_cold SwsFunc ff_yuv2rgb_init_x86(SwsInternal *c)
{
#if HAVE_X86ASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSSE3(cpu_flags)) {
        switch (c->opts.dst_format) {
        case AV_PIX_FMT_RGB32:
            if (c->opts.src_format == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
                return yuva420_rgb32_ssse3;
#endif
                break;
            } else
                return yuv420_rgb32_ssse3;
        case AV_PIX_FMT_BGR32:
            if (c->opts.src_format == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
                return yuva420_bgr32_ssse3;
#endif
                break;
            } else
                return yuv420_bgr32_ssse3;
        case AV_PIX_FMT_RGB24:
            return yuv420_rgb24_ssse3;
        case AV_PIX_FMT_BGR24:
            return yuv420_bgr24_ssse3;
        case AV_PIX_FMT_RGB565:
            return yuv420_rgb16_ssse3;
        case AV_PIX_FMT_RGB555:
            return yuv420_rgb15_ssse3;
#if ARCH_X86_64
        case AV_PIX_FMT_GBRP:
            return yuv420_gbrp_ssse3;
#endif
        }
    }

#endif /* HAVE_X86ASM */
    return NULL;
}