diff options
author | Diego Biurrun <diego@biurrun.de> | 2014-01-07 12:23:13 +0100 |
---|---|---|
committer | Diego Biurrun <diego@biurrun.de> | 2014-05-27 08:52:34 -0700 |
commit | 0d439fbede03854eac8a978cccf21a3425a3c82d (patch) | |
tree | 91ecc54b480f3011ffda2ad950a0904a0e8df35d /libavcodec/x86 | |
parent | 888dcd86755d37e55fd74166f6d38ad66d41db58 (diff) | |
download | ffmpeg-0d439fbede03854eac8a978cccf21a3425a3c82d.tar.gz |
dsputil: Split off HuffYUV decoding bits into their own context
Also shorten HuffYUV context member names to avoid clutter.
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/Makefile | 6 | ||||
-rw-r--r-- | libavcodec/x86/dsputil.asm | 140 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_init.c | 24 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 26 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_x86.h | 6 | ||||
-rw-r--r-- | libavcodec/x86/huffyuvdsp.asm | 165 | ||||
-rw-r--r-- | libavcodec/x86/huffyuvdsp.h | 30 | ||||
-rw-r--r-- | libavcodec/x86/huffyuvdsp_init.c | 63 | ||||
-rw-r--r-- | libavcodec/x86/huffyuvdsp_mmx.c (renamed from libavcodec/x86/dsputil_x86.c) | 34 |
9 files changed, 292 insertions, 202 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 8da5a91313..a354dd8a8e 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -3,8 +3,7 @@ OBJS += x86/constants.o \ OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o OBJS-$(CONFIG_DCT) += x86/dct_init.o -OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o \ - x86/dsputil_x86.o +OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \ x86/fdct.o \ x86/motion_est.o @@ -15,6 +14,7 @@ OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o +OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o OBJS-$(CONFIG_LPC) += x86/lpc.o OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o @@ -48,6 +48,7 @@ MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ x86/simple_idct.o MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \ x86/hpeldsp_mmx.o +MMX-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_mmx.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o @@ -80,6 +81,7 @@ YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \ x86/qpel.o YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \ x86/hpeldsp.o +YASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index a8c8fdac2f..684f09b7fc 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -22,11 +22,6 @@ %include "libavutil/x86/x86util.asm" SECTION_RODATA -pb_f: times 16 db 15 -pb_zzzzzzzz77777777: times 8 db -1 -pb_7: times 8 db 7 -pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 -pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 SECTION_TEXT @@ -203,141 +198,6 @@ SCALARPRODUCT_LOOP 0 RET -; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, -; const uint8_t *diff, int w, -; int *left, int *left_top) -INIT_MMX mmxext -cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top - movq mm0, [topq] - movq mm2, mm0 - movd mm4, [left_topq] - psllq mm2, 8 - movq mm1, mm0 - por mm4, mm2 - movd mm3, [leftq] - psubb mm0, mm4 ; t-tl - add dstq, wq - add topq, wq - add diffq, wq - neg wq - jmp .skip -.loop: - movq mm4, [topq+wq] - movq mm0, mm4 - psllq mm4, 8 - por mm4, mm1 - movq mm1, mm0 ; t - psubb mm0, mm4 ; t-tl -.skip: - movq mm2, [diffq+wq] -%assign i 0 -%rep 8 - movq mm4, mm0 - paddb mm4, mm3 ; t-tl+l - movq mm5, mm3 - pmaxub mm3, mm1 - pminub mm5, mm1 - pminub mm3, mm4 - pmaxub mm3, mm5 ; median - paddb mm3, mm2 ; +residual -%if i==0 - movq mm7, mm3 - psllq mm7, 56 -%else - movq mm6, mm3 - psrlq mm7, 8 - psllq mm6, 56 - por mm7, mm6 -%endif -%if i<7 - psrlq mm0, 8 - psrlq mm1, 8 - psrlq mm2, 8 -%endif -%assign i i+1 -%endrep - movq [dstq+wq], mm7 - add wq, 8 - jl .loop - movzx r2d, byte [dstq-1] - mov [leftq], r2d - movzx r2d, byte [topq-1] - mov [left_topq], r2d - RET - - -%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned - add srcq, wq - add dstq, wq - neg wq -%%.loop: -%if %2 - mova m1, [srcq+wq] -%else - movu m1, [srcq+wq] -%endif - mova m2, m1 - psllw m1, 8 - paddb m1, m2 - mova m2, m1 - pshufb m1, m3 - paddb m1, m2 - pshufb m0, m5 - mova m2, m1 - pshufb m1, m4 - paddb m1, m2 -%if mmsize == 16 - mova m2, m1 - pshufb m1, m6 - paddb m1, m2 -%endif - paddb m0, m1 -%if %1 - mova [dstq+wq], m0 -%else - movq [dstq+wq], m0 - movhps [dstq+wq+8], m0 -%endif - add wq, mmsize - jl %%.loop - mov eax, mmsize-1 - sub eax, wd - movd m1, eax - pshufb m0, m1 - movd eax, m0 - RET -%endmacro - -; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, -; int w, int left) -INIT_MMX ssse3 -cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left -.skip_prologue: - mova m5, [pb_7] - mova m4, [pb_zzzz3333zzzzbbbb] - mova m3, [pb_zz11zz55zz99zzdd] - movd m0, leftm - psllq m0, 56 - ADD_HFYU_LEFT_LOOP 1, 1 - -INIT_XMM sse4 -cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left - mova m5, [pb_f] - mova m6, [pb_zzzzzzzz77777777] - mova m4, [pb_zzzz3333zzzzbbbb] - mova m3, [pb_zz11zz55zz99zzdd] - movd m0, leftm - pslldq m0, 15 - test srcq, 15 - jnz .src_unaligned - test dstq, 15 - jnz .dst_unaligned - ADD_HFYU_LEFT_LOOP 1, 1 -.dst_unaligned: - ADD_HFYU_LEFT_LOOP 0, 1 -.src_unaligned: - ADD_HFYU_LEFT_LOOP 0, 0 - ;----------------------------------------------------------------------------- ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, ; int32_t max, unsigned int len) diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c index d0832b1c1f..10fa166db4 100644 --- a/libavcodec/x86/dsputil_init.c +++ b/libavcodec/x86/dsputil_init.c @@ -20,7 +20,6 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/internal.h" -#include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/dsputil.h" @@ -90,14 +89,6 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w); void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w); -void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, - const uint8_t *diff, int w, - int *left, int *left_top); -int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, - int w, int left); -int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, - int w, int left); - void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len); void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src, @@ -549,8 +540,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, } c->gmc = ff_gmc_mmx; - - c->add_bytes = ff_add_bytes_mmx; #endif /* HAVE_MMX_INLINE */ #if HAVE_MMX_EXTERNAL @@ -578,10 +567,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); - /* slower than cmov version on AMD */ - if (!(cpu_flags & AV_CPU_FLAG_3DNOW)) - c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext; - c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext; c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext; #endif /* HAVE_MMXEXT_EXTERNAL */ @@ -636,10 +621,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, int cpu_flags, unsigned high_bit_depth) { #if HAVE_SSSE3_EXTERNAL - c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; - if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe - c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; - if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; c->bswap_buf = ff_bswap32_buf_ssse3; @@ -659,11 +640,6 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx, { int cpu_flags = av_get_cpu_flags(); -#if HAVE_7REGS && HAVE_INLINE_ASM - if (cpu_flags & AV_CPU_FLAG_CMOV) - c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov; -#endif - if (X86_MMX(cpu_flags)) dsputil_init_mmx(c, avctx, cpu_flags, high_bit_depth); diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index c0b3edd874..c17f8d00d5 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -222,32 +222,6 @@ void ff_clear_blocks_sse(int16_t *blocks) : "%"REG_a); } -void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) -{ - x86_reg i = 0; - - __asm__ volatile ( - "jmp 2f \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq (%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, (%2, %0) \n\t" - "movq 8(%1, %0), %%mm0 \n\t" - "movq 8(%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "2: \n\t" - "cmp %3, %0 \n\t" - "js 1b \n\t" - : "+r" (i) - : "r" (src), "r" (dst), "r" ((x86_reg) w - 15)); - - for (; i < w; i++) - dst[i + 0] += src[i + 0]; -} - /* Draw the edges of width 'w' of an image of size width, height * this MMX version can only handle w == 8 || w == 16. */ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h index 5f6aca46be..a4bc8c2730 100644 --- a/libavcodec/x86/dsputil_x86.h +++ b/libavcodec/x86/dsputil_x86.h @@ -43,12 +43,6 @@ void ff_clear_block_sse(int16_t *block); void ff_clear_blocks_mmx(int16_t *blocks); void ff_clear_blocks_sse(int16_t *blocks); -void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w); - -void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, - const uint8_t *diff, int w, - int *left, int *left_top); - void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides); diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm new file mode 100644 index 0000000000..436abc8b75 --- /dev/null +++ b/libavcodec/x86/huffyuvdsp.asm @@ -0,0 +1,165 @@ +;****************************************************************************** +;* SIMD-optimized HuffYUV functions +;* Copyright (c) 2008 Loren Merritt +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA +pb_f: times 16 db 15 +pb_zzzzzzzz77777777: times 8 db -1 +pb_7: times 8 db 7 +pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 +pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 + +SECTION_TEXT + +; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top, +; const uint8_t *diff, int w, +; int *left, int *left_top) +INIT_MMX mmxext +cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top + movq mm0, [topq] + movq mm2, mm0 + movd mm4, [left_topq] + psllq mm2, 8 + movq mm1, mm0 + por mm4, mm2 + movd mm3, [leftq] + psubb mm0, mm4 ; t-tl + add dstq, wq + add topq, wq + add diffq, wq + neg wq + jmp .skip +.loop: + movq mm4, [topq+wq] + movq mm0, mm4 + psllq mm4, 8 + por mm4, mm1 + movq mm1, mm0 ; t + psubb mm0, mm4 ; t-tl +.skip: + movq mm2, [diffq+wq] +%assign i 0 +%rep 8 + movq mm4, mm0 + paddb mm4, mm3 ; t-tl+l + movq mm5, mm3 + pmaxub mm3, mm1 + pminub mm5, mm1 + pminub mm3, mm4 + pmaxub mm3, mm5 ; median + paddb mm3, mm2 ; +residual +%if i==0 + movq mm7, mm3 + psllq mm7, 56 +%else + movq mm6, mm3 + psrlq mm7, 8 + psllq mm6, 56 + por mm7, mm6 +%endif +%if i<7 + psrlq mm0, 8 + psrlq mm1, 8 + psrlq mm2, 8 +%endif +%assign i i+1 +%endrep + movq [dstq+wq], mm7 + add wq, 8 + jl .loop + movzx r2d, byte [dstq-1] + mov [leftq], r2d + movzx r2d, byte [topq-1] + mov [left_topq], r2d + RET + + +%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned + add srcq, wq + add dstq, wq + neg wq +%%.loop: +%if %2 + mova m1, [srcq+wq] +%else + movu m1, [srcq+wq] +%endif + mova m2, m1 + psllw m1, 8 + paddb m1, m2 + mova m2, m1 + pshufb m1, m3 + paddb m1, m2 + pshufb m0, m5 + mova m2, m1 + pshufb m1, m4 + paddb m1, m2 +%if mmsize == 16 + mova m2, m1 + pshufb m1, m6 + paddb m1, m2 +%endif + paddb m0, m1 +%if %1 + mova [dstq+wq], m0 +%else + movq [dstq+wq], m0 + movhps [dstq+wq+8], m0 +%endif + add wq, mmsize + jl %%.loop + mov eax, mmsize-1 + sub eax, wd + movd m1, eax + pshufb m0, m1 + movd eax, m0 + RET +%endmacro + +; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left) +INIT_MMX ssse3 +cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left +.skip_prologue: + mova m5, [pb_7] + mova m4, [pb_zzzz3333zzzzbbbb] + mova m3, [pb_zz11zz55zz99zzdd] + movd m0, leftm + psllq m0, 56 + ADD_HFYU_LEFT_LOOP 1, 1 + +INIT_XMM sse4 +cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left + mova m5, [pb_f] + mova m6, [pb_zzzzzzzz77777777] + mova m4, [pb_zzzz3333zzzzbbbb] + mova m3, [pb_zz11zz55zz99zzdd] + movd m0, leftm + pslldq m0, 15 + test srcq, 15 + jnz .src_unaligned + test dstq, 15 + jnz .dst_unaligned + ADD_HFYU_LEFT_LOOP 1, 1 +.dst_unaligned: + ADD_HFYU_LEFT_LOOP 0, 1 +.src_unaligned: + ADD_HFYU_LEFT_LOOP 0, 0 diff --git a/libavcodec/x86/huffyuvdsp.h b/libavcodec/x86/huffyuvdsp.h new file mode 100644 index 0000000000..6be3e5afd0 --- /dev/null +++ b/libavcodec/x86/huffyuvdsp.h @@ -0,0 +1,30 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_HUFFYUVDSP_H +#define AVCODEC_X86_HUFFYUVDSP_H + +#include <stdint.h> + +void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w); + +void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top, + const uint8_t *diff, int w, + int *left, int *left_top); + +#endif /* AVCODEC_X86_HUFFYUVDSP_H */ diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c new file mode 100644 index 0000000000..20c7cb1cc6 --- /dev/null +++ b/libavcodec/x86/huffyuvdsp_init.c @@ -0,0 +1,63 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/huffyuvdsp.h" + +void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w); + +void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top, + const uint8_t *diff, int w, + int *left, int *left_top); +void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top, + const uint8_t *diff, int w, + int *left, int *left_top); + +int ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src, + int w, int left); +int ff_add_hfyu_left_pred_sse4(uint8_t *dst, const uint8_t *src, + int w, int left); + +av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_7REGS && HAVE_INLINE_ASM + if (cpu_flags & AV_CPU_FLAG_CMOV) + c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov; +#endif + + if (INLINE_MMX(cpu_flags)) + c->add_bytes = ff_add_bytes_mmx; + + if (EXTERNAL_MMXEXT(cpu_flags)) { + /* slower than cmov version on AMD */ + if (!(cpu_flags & AV_CPU_FLAG_3DNOW)) + c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext; + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3; + if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe + c->add_hfyu_left_pred = ff_add_hfyu_left_pred_sse4; + } +} diff --git a/libavcodec/x86/dsputil_x86.c b/libavcodec/x86/huffyuvdsp_mmx.c index 144339be64..ad5306c21e 100644 --- a/libavcodec/x86/dsputil_x86.c +++ b/libavcodec/x86/huffyuvdsp_mmx.c @@ -20,14 +20,14 @@ #include "config.h" #include "libavutil/x86/asm.h" -#include "dsputil_x86.h" +#include "huffyuvdsp.h" #if HAVE_INLINE_ASM #if HAVE_7REGS -void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, - const uint8_t *diff, int w, - int *left, int *left_top) +void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top, + const uint8_t *diff, int w, + int *left, int *left_top) { x86_reg w2 = -w; x86_reg x; @@ -62,4 +62,30 @@ void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, } #endif +void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) +{ + x86_reg i = 0; + + __asm__ volatile ( + "jmp 2f \n\t" + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq (%2, %0), %%mm1 \n\t" + "paddb %%mm0, %%mm1 \n\t" + "movq %%mm1, (%2, %0) \n\t" + "movq 8(%1, %0), %%mm0 \n\t" + "movq 8(%2, %0), %%mm1 \n\t" + "paddb %%mm0, %%mm1 \n\t" + "movq %%mm1, 8(%2, %0) \n\t" + "add $16, %0 \n\t" + "2: \n\t" + "cmp %3, %0 \n\t" + "js 1b \n\t" + : "+r" (i) + : "r" (src), "r" (dst), "r" ((x86_reg) w - 15)); + + for (; i < w; i++) + dst[i + 0] += src[i + 0]; +} + #endif /* HAVE_INLINE_ASM */ |