diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-03-07 02:57:53 +0100 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-03-07 03:22:49 +0100 |
commit | 6df42f98746be06c883ce683563e07c9a2af983f (patch) | |
tree | 6bb893aaf179526515cfb3b1cc933721317dcf6f /libswscale/x86/scale.asm | |
parent | 57986c501e8c97d4bd2e1b7ce9e9037c4ae06245 (diff) | |
parent | b5161908e06b4497bf663510fb495ba97a6fd2b5 (diff) | |
download | ffmpeg-6df42f98746be06c883ce683563e07c9a2af983f.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master:
SBR DSP: fix SSE code to not use SSE2 instructions.
cpu: initialize mask to -1, so that by default, optimizations are used.
error_resilience: initialize s->block_index[].
svq3: protect against negative quantizers.
Don't use ff_cropTbl[] for IDCT.
swscale: make filterPos 32bit.
FATE: add CPUFLAGS variable, mapping to -cpuflags avconv option.
avconv: add -cpuflags option for setting supported cpuflags.
cpu: add av_set_cpu_flags_mask().
libx264: Allow overriding the sliced threads option
avconv: fix counting encoded video size.
Conflicts:
doc/APIchanges
doc/fate.texi
doc/ffmpeg.texi
ffmpeg.c
libavcodec/h264idct_template.c
libavcodec/svq3.c
libavutil/avutil.h
libavutil/cpu.c
libavutil/cpu.h
libswscale/swscale.c
tests/Makefile
tests/fate-run.sh
tests/regression-funcs.sh
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libswscale/x86/scale.asm')
-rw-r--r-- | libswscale/x86/scale.asm | 31 |
1 files changed, 17 insertions, 14 deletions
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index 3c8b6fa00e..b8cea49783 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -38,7 +38,7 @@ SECTION .text ; (SwsContext *c, int{16,32}_t *dst, ; int dstW, const uint{8,16}_t *src, ; const int16_t *filter, -; const int16_t *filterPos, int filterSize); +; const int32_t *filterPos, int filterSize); ; ; Scale one horizontal line. Input is either 8-bits width or 16-bits width ; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to @@ -53,6 +53,9 @@ SECTION .text cglobal hscale%1to%2_%4_%5, %6, 7, %7 %if ARCH_X86_64 movsxd r2, r2d +%define mov32 movsxd +%else ; x86-32 +%define mov32 mov %endif ; x86-64 %if %2 == 19 %if mmsize == 8 ; mmx @@ -95,14 +98,14 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 %else ; %2 == 19 lea r1, [r1+r2*(4>>r2shr)] %endif ; %2 == 15/19 - lea r5, [r5+r2*(2>>r2shr)] + lea r5, [r5+r2*(4>>r2shr)] neg r2 .loop: %if %3 == 4 ; filterSize == 4 scaling ; load 2x4 or 4x4 source pixels into m0/m1 - movsx r0, word [r5+r2*2+0] ; filterPos[0] - movsx r6, word [r5+r2*2+2] ; filterPos[1] + mov32 r0, dword [r5+r2*4+0] ; filterPos[0] + mov32 r6, dword [r5+r2*4+4] ; filterPos[1] movlh m0, [r3+r0*srcmul] ; src[filterPos[0] + {0,1,2,3}] %if mmsize == 8 movlh m1, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] @@ -112,8 +115,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 %else ; %1 == 8 movd m4, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] %endif - movsx r0, word [r5+r2*2+4] ; filterPos[2] - movsx r6, word [r5+r2*2+6] ; filterPos[3] + mov32 r0, dword [r5+r2*4+8] ; filterPos[2] + mov32 r6, dword [r5+r2*4+12] ; filterPos[3] movlh m1, [r3+r0*srcmul] ; src[filterPos[2] + {0,1,2,3}] %if %1 > 8 movhps m1, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}] @@ -156,8 +159,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 %endif ; mmx/sse2/ssse3/sse4 %else ; %3 == 8, i.e. filterSize == 8 scaling ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5 - movsx r0, word [r5+r2*1+0] ; filterPos[0] - movsx r6, word [r5+r2*1+2] ; filterPos[1] + mov32 r0, dword [r5+r2*2+0] ; filterPos[0] + mov32 r6, dword [r5+r2*2+4] ; filterPos[1] movbh m0, [r3+ r0 *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}] %if mmsize == 8 movbh m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}] @@ -165,8 +168,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 movbh m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}] %else ; mmsize == 16 movbh m1, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}] - movsx r0, word [r5+r2*1+4] ; filterPos[2] - movsx r6, word [r5+r2*1+6] ; filterPos[3] + mov32 r0, dword [r5+r2*2+8] ; filterPos[2] + mov32 r6, dword [r5+r2*2+12] ; filterPos[3] movbh m4, [r3+ r0 *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}] movbh m5, [r3+ r6 *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}] %endif ; mmsize == 8/16 @@ -251,7 +254,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 %define r1x r1 %define filter2 r6m %endif ; x86-32/64 - lea r5, [r5+r2*2] + lea r5, [r5+r2*4] %if %2 == 15 lea r1, [r1+r2*2] %else ; %2 == 19 @@ -261,8 +264,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 neg r2 .loop: - movsx r0, word [r5+r2*2+0] ; filterPos[0] - movsx r1x, word [r5+r2*2+2] ; filterPos[1] + mov32 r0, dword [r5+r2*4+0] ; filterPos[0] + mov32 r1x, dword [r5+r2*4+4] ; filterPos[1] ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)? pxor m4, m4 pxor m5, m5 @@ -293,7 +296,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 jl .innerloop %ifidn %4, X4 - movsx r1x, word [r5+r2*2+2] ; filterPos[1] + mov32 r1x, dword [r5+r2*4+4] ; filterPos[1] movlh m0, [src_reg+r0 *srcmul] ; split last 4 srcpx of dstpx[0] sub r1x, r6 ; and first 4 srcpx of dstpx[1] %if %1 > 8 |