aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
author: Michael Niedermayer <michaelni@gmx.at> 2012-04-10 22:06:53 +0200
committer: Michael Niedermayer <michaelni@gmx.at> 2012-04-10 22:53:25 +0200
commit: e387c9d5dd56e1f29470ee933027ee3d92f9cfd6 (patch)
tree: daa5876aa5b6515b3c92b6ee45e552852345e35b /libavcodec/x86
parent: b1ef4dc406e8a0bd9acea40d880aa4e74412075b (diff)
parent: 2130bd8f5b6504ea14cd41e33f5d4f431eb724f3 (diff)
download: ffmpeg-e387c9d5dd56e1f29470ee933027ee3d92f9cfd6.tar.gz
Merge remote-tracking branch 'qatar/master'
* qatar/master: (22 commits)
  rv40dsp x86: use only one register, for both increment and loop counter
  rv40dsp: implement prescaled versions for biweight.
  avconv: use default channel layouts when they are unknown
  avconv: parse channel layout string
  nutdec: K&R formatting cosmetics
  vda: Signal 4 byte NAL headers to the decoder regardless of what's in the extradata
  mem: Consistently return NULL for av_malloc(0)
  vf_overlay: implement poll_frame()
  vf_scale: support named constants for sws flags.
  lavc doxy: add all installed headers to doxy groups.
  lavc doxy: add avfft to the main lavc group.
  lavc doxy: add remaining avcodec.h functions to a misc doxygen group.
  lavc doxy: add AVPicture functions to a doxy group.
  lavc doxy: add resampling functions to a doxy group.
  lavc doxy: replace \ with /
  lavc doxy: add encoding functions to a doxy group.
  lavc doxy: add decoding functions to a doxy group.
  lavc doxy: fix formatting of AV_PKT_DATA_{PARAM_CHANGE,H263_MB_INFO}
  lavc doxy: add AVPacket-related stuff to a separate doxy group.
  lavc doxy: add core functions/definitions to a doxy group.
  ...

Conflicts:
  ffmpeg.c
  libavcodec/avcodec.h
  libavcodec/vda.c
  libavcodec/x86/rv40dsp.asm
  libavfilter/vf_scale.c
  libavformat/nutdec.c
  libavutil/mem.c
  tests/ref/acodec/pcm_s24daud

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/rv40dsp.asm 107
-rw-r--r-- libavcodec/x86/rv40dsp_init.c 30
2 files changed, 68 insertions, 69 deletions
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index c13e9f03d9..e8acfb25fe 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -32,13 +32,14 @@ SECTION .text
; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2
%macro RV40_WCORE 4-5
- movh m4, [%3 + 0]
- movh m5, [%4 + 0]
+ movh m4, [%3 + r6 + 0]
+ movh m5, [%4 + r6 + 0]
%if %0 == 4
-%define OFFSET mmsize / 2
+%define OFFSET r6 + mmsize / 2
%else
; 8x8 block and sse2, stride was provided
-%define OFFSET %5
+%define OFFSET r6
+ add r6, r5
%endif
movh m6, [%3 + OFFSET]
movh m7, [%4 + OFFSET]
@@ -99,10 +100,12 @@ SECTION .text
packuswb m4, m6
%if %0 == 5
; Only called for 8x8 blocks and sse2
- movh [%2 + 0], m4
- movhps [%2 + %5], m4
+ sub r6, r5
+ movh [%2 + r6], m4
+ add r6, r5
+ movhps [%2 + r6], m4
%else
- mova [%2], m4
+ mova [%2 + r6], m4
%endif
%endmacro
@@ -115,93 +118,79 @@ SECTION .text
%endif
; Prepare for next loop
- add r0, r5
- add r1, r5
- add r2, r5
+ add r6, r5
%else
%ifidn %1, 8
RV40_WCORE %2, r0, r1, r2, r5
; Prepare 2 next lines
- lea r0, [r0 + 2 * r5]
- lea r1, [r1 + 2 * r5]
- lea r2, [r2 + 2 * r5]
+ add r6, r5
%else
RV40_WCORE %2, r0, r1, r2
; Prepare single next line
- add r0, r5
- add r1, r5
- add r2, r5
+ add r6, r5
%endif
%endif
- dec r6
%endmacro
; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
; %1=rnd/nornd variant %2=size %3=shift applied to stride (log2 of size)
-%macro RV40_WEIGHT 2
-cglobal rv40_weight_func_%1, 6, 7, %2
+; The weights are FP0.14 notation of fractions depending on pts.
+; For timebases without rounding error (i.e. PAL), the fractions
+; can be simplified, and several operations can be avoided.
+; Therefore, we check here whether they are multiples of 2^9 for
+; those simplifications to occur.
+%macro RV40_WEIGHT 3
+cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
mova m1, [shift_round]
%else
mova m1, [pw_16]
%endif
pxor m0, m0
- mov r6, r3
- or r6, r4
- ; The weights are FP0.14 notation of fractions depending on pts.
- ; For timebases without rounding error (i.e. PAL), the fractions
- ; can be simplified, and several operations can be avoided.
- ; Therefore, we check here whether they are multiples of 2^9 for
- ; those simplifications to occur.
- and r6, 0x1FF
; Set loop counter and increments
-%if mmsize == 8
- mov r6, %1
-%else
- mov r6, (%1 * %1) / mmsize
-%endif
+ mov r6, r5
+ shl r6, %3
+ add r0, r6
+ add r1, r6
+ add r2, r6
+ neg r6
- ; Use result of test now
- jz .loop_512
movd m2, r3d
movd m3, r4d
+%ifidn %1,rnd
+%define RND 0
SPLATW m2, m2
- SPLATW m3, m3
-
-.loop:
- MAIN_LOOP %1, 0
- jnz .loop
- REP_RET
-
- ; Weights are multiple of 512, which allows some shortcuts
-.loop_512:
- sar r3, 9
- sar r4, 9
- movd m2, r3d
- movd m3, r4d
+%else
+%define RND 1
%if cpuflag(ssse3)
punpcklbw m3, m2
- SPLATW m3, m3
%else
SPLATW m2, m2
- SPLATW m3, m3
%endif
-.loop2:
- MAIN_LOOP %1, 1
- jnz .loop2
- REP_RET
+%endif
+ SPLATW m3, m3
+.loop:
+ MAIN_LOOP %2, RND
+ jnz .loop
+ REP_RET
%endmacro
INIT_MMX mmx
-RV40_WEIGHT 8, 0
-RV40_WEIGHT 16, 0
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
INIT_XMM sse2
-RV40_WEIGHT 8, 8
-RV40_WEIGHT 16, 8
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
INIT_XMM ssse3
-RV40_WEIGHT 8, 8
-RV40_WEIGHT 16, 8
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 79c70f78c3..df468aa9e5 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -41,10 +41,14 @@ void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \
-void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
- int w1, int w2, ptrdiff_t stride); \
-void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
- int w1, int w2, ptrdiff_t stride);
+void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(mmx)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)
@@ -57,8 +61,10 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
if (mm_flags & AV_CPU_FLAG_MMX) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
- c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx;
- c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx;
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
+ c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
+ c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
}
if (mm_flags & AV_CPU_FLAG_MMX2) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
@@ -68,12 +74,16 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
}
if (mm_flags & AV_CPU_FLAG_SSE2) {
- c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2;
- c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2;
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
+ c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
+ c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
}
if (mm_flags & AV_CPU_FLAG_SSSE3) {
- c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3;
- c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3;
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
+ c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
+ c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
}
#endif
}