author     Michael Niedermayer <michaelni@gmx.at>  2011-10-22 01:03:27 +0200
committer  Michael Niedermayer <michaelni@gmx.at>  2011-10-22 01:16:41 +0200
commit     aedc908601de7396751a9a4504e064782d9f6a0b (patch)
tree       8f04b899142439893bac426ac83d05c4068b099c /libavcodec
parent     1a7090bfafe986d4470ba8059c815939171ddb74 (diff)
parent     f4b51d061f0f34e36be876b562b8abe47f4b9c1c (diff)
download   ffmpeg-aedc908601de7396751a9a4504e064782d9f6a0b.tar.gz
Merge remote-tracking branch 'qatar/master'
* qatar/master: (35 commits)
  flvdec: Do not call parse_keyframes_index with a NULL stream
  libspeexdec: include system headers before local headers
  libspeexdec: return meaningful error codes
  libspeexdec: cosmetics: reindent
  libspeexdec: decode one frame at a time.
  swscale: fix signed shift overflows in ff_yuv2rgb_c_init_tables()
  Move timefilter code from lavf to lavd.
  mov: add support for hdvd and pgapmetadata atoms
  mov: rename function _stik, some indentation cosmetics
  mov: rename function _int8 to remove ambiguity, some indentation cosmetics
  mov: parse the gnre atom
  mp3on4: check for allocation failures in decode_init_mp3on4()
  mp3on4: create a separate flush function for MP3onMP4.
  mp3on4: ensure that the frame channel count does not exceed the codec channel count.
  mp3on4: set channel layout
  mp3on4: fix the output channel order
  mp3on4: allocate temp buffer with av_malloc() instead of on the stack.
  mp3on4: copy MPADSPContext from first context to all contexts.
  fmtconvert: port float_to_int16_interleave() 2-channel x86 inline asm to yasm
  fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm
  ...

Conflicts:
    libavcodec/arm/h264dsp_init_arm.c
    libavcodec/h264.c
    libavcodec/h264.h
    libavcodec/h264_cabac.c
    libavcodec/h264_cavlc.c
    libavcodec/h264_ps.c
    libavcodec/h264dsp_template.c
    libavcodec/h264idct_template.c
    libavcodec/h264pred.c
    libavcodec/h264pred_template.c
    libavcodec/x86/h264dsp_mmx.c
    libavdevice/Makefile
    libavdevice/jack_audio.c
    libavformat/Makefile
    libavformat/flvdec.c
    libavformat/flvenc.c
    libavutil/pixfmt.h
    libswscale/utils.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r--  libavcodec/arm/h264dsp_init_arm.c      78
-rw-r--r--  libavcodec/arm/h264dsp_neon.S          86
-rw-r--r--  libavcodec/fmtconvert.h                10
-rw-r--r--  libavcodec/h264.c                     329
-rw-r--r--  libavcodec/h264_cabac.c                80
-rw-r--r--  libavcodec/h264_ps.c                    2
-rw-r--r--  libavcodec/h264dsp.c                   28
-rw-r--r--  libavcodec/h264dsp.h                   10
-rw-r--r--  libavcodec/h264dsp_template.c          30
-rw-r--r--  libavcodec/h264idct_template.c         27
-rw-r--r--  libavcodec/h264pred.c                  13
-rw-r--r--  libavcodec/h264pred_template.c         38
-rw-r--r--  libavcodec/libspeexdec.c               68
-rw-r--r--  libavcodec/mpegaudiodec.c              92
-rw-r--r--  libavcodec/mpegaudiodec_float.c         2
-rw-r--r--  libavcodec/ppc/h264_altivec.c          44
-rw-r--r--  libavcodec/utils.c                      2
-rw-r--r--  libavcodec/vp8.c                       68
-rw-r--r--  libavcodec/vp8.h                       11
-rw-r--r--  libavcodec/x86/dsputil_yasm.asm         8
-rw-r--r--  libavcodec/x86/fmtconvert.asm         140
-rw-r--r--  libavcodec/x86/fmtconvert_mmx.c       213
-rw-r--r--  libavcodec/x86/h264_weight.asm        210
-rw-r--r--  libavcodec/x86/h264_weight_10bit.asm  145
-rw-r--r--  libavcodec/x86/h264dsp_mmx.c          177
25 files changed, 967 insertions, 944 deletions
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index e51026800d..cc4c688c8b 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
-void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
- int weight, int offset);
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
-void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
- int log2_den, int weightd, int weights,
- int offset);
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@@ -101,23 +76,14 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
}
- c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
- c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
- c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
- c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
- c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
- c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
- c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
- c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
- c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
- c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
- c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
- c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
- c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
- c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
- c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
- c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
+ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
+
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
+ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
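
The net effect of the table change above is that the NEON weight/biweight entries are indexed by block width only (16/8/4) and take the block height as a run-time argument instead of being specialized per block size. A hypothetical caller-side sketch of the new indexing follows; apply_weight() and its width-to-index mapping are invented here for illustration, and only the function-pointer signature comes from the updated h264_weight_func typedef in h264dsp.h:

    #include <stdint.h>

    /* Illustration only: not part of the patch. The pointer signature matches
     * the updated h264_weight_func typedef (stride, height, log2_denom, ...). */
    typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
                                     int log2_denom, int weight, int offset);

    static void apply_weight(h264_weight_func *weight_tab, uint8_t *dst,
                             int stride, int width, int height,
                             int log2_denom, int weight, int offset)
    {
        /* weight_tab[0] -> 16-wide, [1] -> 8-wide, [2] -> 4-wide blocks */
        int idx = width == 16 ? 0 : width == 8 ? 1 : 2;
        weight_tab[idx](dst, stride, height, log2_denom, weight, offset);
    }
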
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 338de6f643..6426f46637 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1592,7 +1592,7 @@ endfunc
vdup.8 d1, r5
vmov q2, q8
vmov q3, q8
-1: subs ip, ip, #2
+1: subs r3, r3, #2
vld1.8 {d20-d21},[r0,:128], r2
\macd q2, d0, d20
pld [r0]
@@ -1632,7 +1632,7 @@ endfunc
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
-1: subs ip, ip, #2
+1: subs r3, r3, #2
vld1.8 {d4},[r0,:64], r2
\macd q1, d0, d4
pld [r0]
@@ -1662,7 +1662,7 @@ endfunc
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
-1: subs ip, ip, #4
+1: subs r3, r3, #4
vld1.32 {d4[0]},[r0,:32], r2
vld1.32 {d4[1]},[r0,:32], r2
\macd q1, d0, d4
@@ -1700,16 +1700,17 @@ endfunc
.endm
.macro biweight_func w
-function biweight_h264_pixels_\w\()_neon
+function ff_biweight_h264_pixels_\w\()_neon, export=1
push {r4-r6, lr}
- add r4, sp, #16
+ ldr r12, [sp, #16]
+ add r4, sp, #20
ldm r4, {r4-r6}
lsr lr, r4, #31
add r6, r6, #1
eors lr, lr, r5, lsr #30
orr r6, r6, #1
- vdup.16 q9, r3
- lsl r6, r6, r3
+ vdup.16 q9, r12
+ lsl r6, r6, r12
vmvn q9, q9
vdup.16 q8, r6
mov r6, r0
@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
endfunc
.endm
- .macro biweight_entry w, h, b=1
-function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
- mov ip, #\h
-.if \b
- b biweight_h264_pixels_\w\()_neon
-.endif
-endfunc
- .endm
-
- biweight_entry 16, 8
- biweight_entry 16, 16, b=0
biweight_func 16
-
- biweight_entry 8, 16
- biweight_entry 8, 4
- biweight_entry 8, 8, b=0
biweight_func 8
-
- biweight_entry 4, 8
- biweight_entry 4, 2
- biweight_entry 4, 4, b=0
biweight_func 4
@ Weighted prediction
.macro weight_16 add
- vdup.8 d0, r3
-1: subs ip, ip, #2
+ vdup.8 d0, r12
+1: subs r2, r2, #2
vld1.8 {d20-d21},[r0,:128], r1
vmull.u8 q2, d0, d20
pld [r0]
@@ -1785,8 +1767,8 @@ endfunc
.endm
.macro weight_8 add
- vdup.8 d0, r3
-1: subs ip, ip, #2
+ vdup.8 d0, r12
+1: subs r2, r2, #2
vld1.8 {d4},[r0,:64], r1
vmull.u8 q1, d0, d4
pld [r0]
@@ -1806,10 +1788,10 @@ endfunc
.endm
.macro weight_4 add
- vdup.8 d0, r3
+ vdup.8 d0, r12
vmov q1, q8
vmov q10, q8
-1: subs ip, ip, #4
+1: subs r2, r2, #4
vld1.32 {d4[0]},[r0,:32], r1
vld1.32 {d4[1]},[r0,:32], r1
vmull.u8 q1, d0, d4
@@ -1842,50 +1824,32 @@ endfunc
.endm
.macro weight_func w
-function weight_h264_pixels_\w\()_neon
+function ff_weight_h264_pixels_\w\()_neon, export=1
push {r4, lr}
- ldr r4, [sp, #8]
- cmp r2, #1
- lsl r4, r4, r2
+ ldr r12, [sp, #8]
+ ldr r4, [sp, #12]
+ cmp r3, #1
+ lsl r4, r4, r3
vdup.16 q8, r4
mov r4, r0
ble 20f
- rsb lr, r2, #1
+ rsb lr, r3, #1
vdup.16 q9, lr
- cmp r3, #0
+ cmp r12, #0
blt 10f
weight_\w vhadd.s16
-10: rsb r3, r3, #0
+10: rsb r12, r12, #0
weight_\w vhsub.s16
-20: rsb lr, r2, #0
+20: rsb lr, r3, #0
vdup.16 q9, lr
- cmp r3, #0
+ cmp r12, #0
blt 10f
weight_\w vadd.s16
-10: rsb r3, r3, #0
+10: rsb r12, r12, #0
weight_\w vsub.s16
endfunc
.endm
- .macro weight_entry w, h, b=1
-function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
- mov ip, #\h
-.if \b
- b weight_h264_pixels_\w\()_neon
-.endif
-endfunc
- .endm
-
- weight_entry 16, 8
- weight_entry 16, 16, b=0
weight_func 16
-
- weight_entry 8, 16
- weight_entry 8, 4
- weight_entry 8, 8, b=0
weight_func 8
-
- weight_entry 4, 8
- weight_entry 4, 2
- weight_entry 4, 4, b=0
weight_func 4
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index 825422bed6..c0584753cd 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -70,7 +70,15 @@ typedef struct FmtConvertContext {
long len, int channels);
/**
- * Convert an array of interleaved float to multiple arrays of float.
+ * Convert multiple arrays of float to an array of interleaved float.
+ *
+ * @param dst destination array of interleaved float.
+ * constraints: 16-byte aligned
+ * @param src source array of float arrays, one for each channel.
+ * constraints: 16-byte aligned
+ * @param len number of elements to convert.
+ * constraints: multiple of 8
+ * @param channels number of channels
*/
void (*float_interleave)(float *dst, const float **src, unsigned int len,
int channels);
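
As a reference for the corrected documentation above, the interleaving it describes can be written as a plain scalar loop. This is only a sketch of the documented behaviour, assuming len is the per-channel sample count; the actual FmtConvertContext member points at optimized implementations that rely on the alignment and length constraints listed in the comment:

    /* Illustrative scalar version of float_interleave(); not the lavc code. */
    static void float_interleave_ref(float *dst, const float **src,
                                     unsigned int len, int channels)
    {
        unsigned int i;
        int ch;
        for (i = 0; i < len; i++)               /* len samples per channel    */
            for (ch = 0; ch < channels; ch++)   /* written interleaved to dst */
                dst[i * channels + ch] = src[ch][i];
    }
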
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index addebce07d..4906f92ea8 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -460,11 +460,14 @@ static void chroma_dc_dct_c(DCTELEM *block){
}
#endif
-static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
- uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
- int src_x_offset, int src_y_offset,
- qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
- int pixel_shift, int chroma444){
+static av_always_inline void
+mc_dir_part(H264Context *h, Picture *pic, int n, int square,
+ int height, int delta, int list,
+ uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ int src_x_offset, int src_y_offset,
+ qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
+ int pixel_shift, int chroma_idc)
+{
MpegEncContext * const s = &h->s;
const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
@@ -479,6 +482,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
const int full_my= my>>2;
const int pic_width = 16*s->mb_width;
const int pic_height = 16*s->mb_height >> MB_FIELD;
+ int ysh;
if(mx&7) extra_width -= 3;
if(my&7) extra_height -= 3;
@@ -487,7 +491,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
|| full_my < 0-extra_height
|| full_mx + 16/*FIXME*/ > pic_width + extra_width
|| full_my + 16/*FIXME*/ > pic_height + extra_height){
- s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
+ s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
+ 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
src_y= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
emu=1;
}
@@ -499,7 +504,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
- if(chroma444){
+ if(chroma_idc == 3 /* yuv444 */){
src_cb = pic->f.data[1] + offset;
if(emu){
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
@@ -524,42 +529,55 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
return;
}
- if(MB_FIELD){
+ ysh = 3 - (chroma_idc == 2 /* yuv422 */);
+ if(chroma_idc == 1 /* yuv420 */ && MB_FIELD){
// chroma offset when predicting from a field of opposite parity
my += 2 * ((s->mb_y & 1) - (pic->f.reference - 1));
emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
}
- src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize;
- src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize;
+
+ src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize;
+ src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize;
if(emu){
- s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+ s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize,
+ 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
+ pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
src_cb= s->edge_emu_buffer;
}
- chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
+ chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
+ mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7);
if(emu){
- s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+ s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize,
+ 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
+ pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
src_cr= s->edge_emu_buffer;
}
- chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
+ chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
+ mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7);
}
-static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
- uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
- int x_offset, int y_offset,
- qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
- qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
- int list0, int list1, int pixel_shift, int chroma444){
+static av_always_inline void
+mc_part_std(H264Context *h, int n, int square, int height, int delta,
+ uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ int x_offset, int y_offset,
+ qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+ qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+ int list0, int list1, int pixel_shift, int chroma_idc)
+{
MpegEncContext * const s = &h->s;
qpel_mc_func *qpix_op= qpix_put;
h264_chroma_mc_func chroma_op= chroma_put;
dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
- if(chroma444){
+ if (chroma_idc == 3 /* yuv444 */) {
dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
- }else{
+ } else if (chroma_idc == 2 /* yuv422 */) {
+ dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+ dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+ } else /* yuv420 */ {
dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
}
@@ -568,9 +586,9 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
if(list0){
Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
- mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
+ mc_dir_part(h, ref, n, square, height, delta, 0,
dest_y, dest_cb, dest_cr, x_offset, y_offset,
- qpix_op, chroma_op, pixel_shift, chroma444);
+ qpix_op, chroma_op, pixel_shift, chroma_idc);
qpix_op= qpix_avg;
chroma_op= chroma_avg;
@@ -578,28 +596,36 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
if(list1){
Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
- mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
+ mc_dir_part(h, ref, n, square, height, delta, 1,
dest_y, dest_cb, dest_cr, x_offset, y_offset,
- qpix_op, chroma_op, pixel_shift, chroma444);
+ qpix_op, chroma_op, pixel_shift, chroma_idc);
}
}
-static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
- uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
- int x_offset, int y_offset,
- qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
- h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
- h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
- int list0, int list1, int pixel_shift, int chroma444){
+static av_always_inline void
+mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
+ uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ int x_offset, int y_offset,
+ qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+ h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
+ h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
+ int list0, int list1, int pixel_shift, int chroma_idc){
MpegEncContext * const s = &h->s;
+ int chroma_height;
dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
- if(chroma444){
+ if (chroma_idc == 3 /* yuv444 */) {
+ chroma_height = height;
chroma_weight_avg = luma_weight_avg;
chroma_weight_op = luma_weight_op;
dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
- }else{
+ } else if (chroma_idc == 2 /* yuv422 */) {
+ chroma_height = height;
+ dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+ dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+ } else /* yuv420 */ {
+ chroma_height = height >> 1;
dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
}
@@ -615,27 +641,32 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
int refn0 = h->ref_cache[0][ scan8[n] ];
int refn1 = h->ref_cache[1][ scan8[n] ];
- mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
+ mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
dest_y, dest_cb, dest_cr,
- x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
- mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
+ x_offset, y_offset, qpix_put, chroma_put,
+ pixel_shift, chroma_idc);
+ mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
tmp_y, tmp_cb, tmp_cr,
- x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
+ x_offset, y_offset, qpix_put, chroma_put,
+ pixel_shift, chroma_idc);
if(h->use_weight == 2){
int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
int weight1 = 64 - weight0;
- luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
- chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
- chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
+ luma_weight_avg( dest_y, tmp_y, h-> mb_linesize,
+ height, 5, weight0, weight1, 0);
+ chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
+ chroma_height, 5, weight0, weight1, 0);
+ chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
+ chroma_height, 5, weight0, weight1, 0);
}else{
- luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
+ luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
- chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+ chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
- chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+ chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
}
@@ -643,42 +674,46 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
int list = list1 ? 1 : 0;
int refn = h->ref_cache[list][ scan8[n] ];
Picture *ref= &h->ref_list[list][refn];
- mc_dir_part(h, ref, n, square, chroma_height, delta, list,
+ mc_dir_part(h, ref, n, square, height, delta, list,
dest_y, dest_cb, dest_cr, x_offset, y_offset,
- qpix_put, chroma_put, pixel_shift, chroma444);
+ qpix_put, chroma_put, pixel_shift, chroma_idc);
- luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
+ luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
if(h->use_weight_chroma){
- chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+ chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
- chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+ chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
}
}
}
-static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
- uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
- int x_offset, int y_offset,
- qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
- qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
- h264_weight_func *weight_op, h264_biweight_func *weight_avg,
- int list0, int list1, int pixel_shift, int chroma444){
+static av_always_inline void
+mc_part(H264Context *h, int n, int square, int height, int delta,
+ uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ int x_offset, int y_offset,
+ qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+ qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+ h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+ int list0, int list1, int pixel_shift, int chroma_idc)
+{
if((h->use_weight==2 && list0 && list1
&& (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
|| h->use_weight==1)
- mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+ mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
x_offset, y_offset, qpix_put, chroma_put,
- weight_op[0], weight_op[3], weight_avg[0],
- weight_avg[3], list0, list1, pixel_shift, chroma444);
+ weight_op[0], weight_op[1], weight_avg[0],
+ weight_avg[1], list0, list1, pixel_shift, chroma_idc);
else
- mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+ mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
- chroma_avg, list0, list1, pixel_shift, chroma444);
+ chroma_avg, list0, list1, pixel_shift, chroma_idc);
}
-static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma444){
+static av_always_inline void
+prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc)
+{
/* fetch pixels for estimated mv 4 macroblocks ahead
* optimized for 64byte cache lines */
MpegEncContext * const s = &h->s;
@@ -689,7 +724,7 @@ static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, in
uint8_t **src = h->ref_list[list][refn].f.data;
int off= (mx << pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize + (64 << pixel_shift);
s->dsp.prefetch(src[0]+off, s->linesize, 4);
- if(chroma444){
+ if (chroma_idc == 3 /* yuv444 */) {
s->dsp.prefetch(src[1]+off, s->linesize, 4);
s->dsp.prefetch(src[2]+off, s->linesize, 4);
}else{
@@ -703,7 +738,8 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
h264_weight_func *weight_op, h264_biweight_func *weight_avg,
- int pixel_shift, int chroma444){
+ int pixel_shift, int chroma_idc)
+{
MpegEncContext * const s = &h->s;
const int mb_xy= h->mb_xy;
const int mb_type = s->current_picture.f.mb_type[mb_xy];
@@ -712,36 +748,36 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
if(HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
await_references(h);
- prefetch_motion(h, 0, pixel_shift, chroma444);
+ prefetch_motion(h, 0, pixel_shift, chroma_idc);
if(IS_16X16(mb_type)){
- mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
+ mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
weight_op, weight_avg,
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
- pixel_shift, chroma444);
+ pixel_shift, chroma_idc);
}else if(IS_16X8(mb_type)){
- mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
+ mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
- &weight_op[1], &weight_avg[1],
+ weight_op, weight_avg,
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
- pixel_shift, chroma444);
- mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
+ pixel_shift, chroma_idc);
+ mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
- &weight_op[1], &weight_avg[1],
+ weight_op, weight_avg,
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
- pixel_shift, chroma444);
+ pixel_shift, chroma_idc);
}else if(IS_8X16(mb_type)){
- mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
+ mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
- &weight_op[2], &weight_avg[2],
+ &weight_op[1], &weight_avg[1],
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
- pixel_shift, chroma444);
- mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
+ pixel_shift, chroma_idc);
+ mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
- &weight_op[2], &weight_avg[2],
+ &weight_op[1], &weight_avg[1],
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
- pixel_shift, chroma444);
+ pixel_shift, chroma_idc);
}else{
int i;
@@ -754,50 +790,72 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
int y_offset= (i&2)<<1;
if(IS_SUB_8X8(sub_mb_type)){
- mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+ mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
- &weight_op[3], &weight_avg[3],
+ &weight_op[1], &weight_avg[1],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
- pixel_shift, chroma444);
+ pixel_shift, chroma_idc);
}else if(IS_SUB_8X4(sub_mb_type)){
- mc_part(h, n , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+ mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
- &weight_op[4], &weight_avg[4],
+ &weight_op[1], &weight_avg[1],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
- pixel_shift, chroma444);
- mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
+ pixel_shift, chroma_idc);
+ mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
- &weight_op[4], &weight_avg[4],
+ &weight_op[1], &weight_avg[1],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
- pixel_shift, chroma444);
+ pixel_shift, chroma_idc);
}else if(IS_SUB_4X8(sub_mb_type)){
- mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+ mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
- &weight_op[5], &weight_avg[5],
+ &weight_op[2], &weight_avg[2],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
- pixel_shift, chroma444);
- mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
+ pixel_shift, chroma_idc);
+ mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
- &weight_op[5], &weight_avg[5],
+ &weight_op[2], &weight_avg[2],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
- pixel_shift, chroma444);
+ pixel_shift, chroma_idc);
}else{
int j;
assert(IS_SUB_4X4(sub_mb_type));
for(j=0; j<4; j++){
int sub_x_offset= x_offset + 2*(j&1);
int sub_y_offset= y_offset + (j&2);
- mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
+ mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
- &weight_op[6], &weight_avg[6],
+ &weight_op[2], &weight_avg[2],
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
- pixel_shift, chroma444);
+ pixel_shift, chroma_idc);
}
}
}
}
- prefetch_motion(h, 1, pixel_shift, chroma444);
+ prefetch_motion(h, 1, pixel_shift, chroma_idc);
+}
+
+static av_always_inline void
+hl_motion_420(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+ qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+ h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+ int pixel_shift)
+{
+ hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
+ qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1);
+}
+
+static av_always_inline void
+hl_motion_422(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+ qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+ h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+ int pixel_shift)
+{
+ hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
+ qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2);
}
static void free_tables(H264Context *h, int free_rbsp){
@@ -1468,7 +1526,10 @@ static void decode_postinit(H264Context *h, int setup_finished){
ff_thread_finish_setup(s->avctx);
}
-static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
+static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y,
+ uint8_t *src_cb, uint8_t *src_cr,
+ int linesize, int uvlinesize, int simple)
+{
MpegEncContext * const s = &h->s;
uint8_t *top_border;
int top_idx = 1;
@@ -1813,7 +1874,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
}
}
-static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){
+static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift)
+{
MpegEncContext * const s = &h->s;
const int mb_x= s->mb_x;
const int mb_y= s->mb_y;
@@ -1827,7 +1889,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
/* is_h264 should always be true if SVQ3 is disabled. */
const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
- const int block_h = 16>>s->chroma_y_shift;
+ const int block_h = 16 >> s->chroma_y_shift;
+ const int chroma422 = CHROMA422;
dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16;
dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h;
@@ -1844,8 +1907,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
block_offset = &h->block_offset[48];
if(mb_y&1){ //FIXME move out of this function?
dest_y -= s->linesize*15;
- dest_cb-= s->uvlinesize*(block_h-1);
- dest_cr-= s->uvlinesize*(block_h-1);
+ dest_cb-= s->uvlinesize * (block_h - 1);
+ dest_cr-= s->uvlinesize * (block_h - 1);
}
if(FRAME_MBAFF) {
int list;
@@ -1884,7 +1947,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
}
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
if (!h->sps.chroma_format_idc) {
- for (i = 0; i < 8; i++) {
+ for (i = 0; i < block_h; i++) {
uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
for (j = 0; j < 8; j++) {
@@ -1911,13 +1974,13 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
if (!h->sps.chroma_format_idc) {
for (i=0; i<8; i++) {
- memset(dest_cb+ i*uvlinesize, 1 << (bit_depth - 1), 8);
- memset(dest_cr+ i*uvlinesize, 1 << (bit_depth - 1), 8);
+ memset(dest_cb + i*uvlinesize, 1 << (bit_depth - 1), 8);
+ memset(dest_cr + i*uvlinesize, 1 << (bit_depth - 1), 8);
}
} else {
for (i=0; i<block_h; i++) {
- memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4, 8);
- memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4, 8);
+ memcpy(dest_cb + i*uvlinesize, h->mb + 128 + i*4, 8);
+ memcpy(dest_cr + i*uvlinesize, h->mb + 160 + i*4, 8);
}
}
}
@@ -1937,11 +2000,21 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
if(h->deblocking_filter)
xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift);
}else if(is_h264){
- hl_motion(h, dest_y, dest_cb, dest_cr,
- s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
- s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
- h->h264dsp.weight_h264_pixels_tab,
- h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 0);
+ if (chroma422) {
+ hl_motion_422(h, dest_y, dest_cb, dest_cr,
+ s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+ s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+ h->h264dsp.weight_h264_pixels_tab,
+ h->h264dsp.biweight_h264_pixels_tab,
+ pixel_shift);
+ } else {
+ hl_motion_420(h, dest_y, dest_cb, dest_cr,
+ s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+ s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+ h->h264dsp.weight_h264_pixels_tab,
+ h->h264dsp.biweight_h264_pixels_tab,
+ pixel_shift);
+ }
}
hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
@@ -1959,14 +2032,20 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
idct_add (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
}
+ if (chroma422) {
+ for(i=j*16+4; i<j*16+8; i++){
+ if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+ idct_add (dest[j-1] + block_offset[i+4], h->mb + (i*16 << pixel_shift), uvlinesize);
+ }
+ }
}
}
}else{
if(is_h264){
int qp[2];
- if (CHROMA422) {
- qp[0] = h->chroma_qp[0]+3;
- qp[1] = h->chroma_qp[1]+3;
+ if (chroma422) {
+ qp[0] = h->chroma_qp[0] + 3;
+ qp[1] = h->chroma_qp[1] + 3;
} else {
qp[0] = h->chroma_qp[0];
qp[1] = h->chroma_qp[1];
@@ -2086,7 +2165,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
h->h264dsp.weight_h264_pixels_tab,
- h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 1);
+ h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3);
}
for (p = 0; p < plane_count; p++)
@@ -2690,6 +2769,8 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
case 9 :
if (CHROMA444)
s->avctx->pix_fmt = PIX_FMT_YUV444P9;
+ else if (CHROMA422)
+ s->avctx->pix_fmt = PIX_FMT_YUV422P9;
else
s->avctx->pix_fmt = PIX_FMT_YUV420P9;
break;
@@ -2708,7 +2789,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
s->avctx->pix_fmt = PIX_FMT_GBR24P;
av_log(h->s.avctx, AV_LOG_DEBUG, "Detected GBR colorspace.\n");
}
- }else if (CHROMA422) {
+ } else if (CHROMA422) {
s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ422P : PIX_FMT_YUV422P;
}else{
s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
@@ -3384,7 +3465,7 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
const int end_mb_y= s->mb_y + FRAME_MBAFF;
const int old_slice_type= h->slice_type;
const int pixel_shift = h->pixel_shift;
- const int block_h = 16>>s->chroma_y_shift;
+ const int block_h = 16 >> s->chroma_y_shift;
if(h->deblocking_filter) {
for(mb_x= start_x; mb_x<end_x; mb_x++){
@@ -3401,8 +3482,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
s->mb_x= mb_x;
s->mb_y= mb_y;
dest_y = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize ) * 16;
- dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*(8<<CHROMA444) + mb_y * s->uvlinesize * block_h;
- dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift)*(8<<CHROMA444) + mb_y * s->uvlinesize * block_h;
+ dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
+ dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
//FIXME simplify above
if (MB_FIELD) {
@@ -3410,8 +3491,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
if(mb_y&1){ //FIXME move out of this function?
dest_y -= s->linesize*15;
- dest_cb-= s->uvlinesize*(block_h-1);
- dest_cr-= s->uvlinesize*(block_h-1);
+ dest_cb-= s->uvlinesize * (block_h - 1);
+ dest_cr-= s->uvlinesize * (block_h - 1);
}
} else {
linesize = h->mb_linesize = s->linesize;
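
Throughout this file the former chroma444 flag is replaced by a three-valued chroma_idc (1 = yuv420, 2 = yuv422, 3 = yuv444), and the chroma block height and vertical motion-vector shift are derived from it. The helpers below are not in the patch; they are a small sketch restating the arithmetic used in mc_dir_part() and mc_part_weighted():

    /* Sketch of the chroma_idc arithmetic; helper names are invented. */
    static int chroma_block_height(int luma_height, int chroma_idc)
    {
        /* only 4:2:0 halves the chroma height: "height >> (chroma_idc == 1)" */
        return luma_height >> (chroma_idc == 1);
    }

    static int chroma_mv_y_shift(int chroma_idc)
    {
        /* "ysh = 3 - (chroma_idc == 2)": quarter-pel my selects chroma rows
         * with a shift of 3 for 4:2:0 and 2 for 4:2:2 (4:4:4 chroma is
         * handled earlier with the luma-sized path) */
        return 3 - (chroma_idc == 2);
    }
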
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index f4cae4621f..31c2658a6b 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -1565,7 +1565,12 @@ DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
-static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
+static av_always_inline void
+decode_cabac_residual_internal(H264Context *h, DCTELEM *block,
+ int cat, int n, const uint8_t *scantable,
+ const uint32_t *qmul, int max_coeff,
+ int is_dc, int chroma422)
+{
static const int significant_coeff_flag_offset[2][14] = {
{ 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 },
{ 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 }
@@ -1593,7 +1598,10 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
* map node ctx => cabac ctx for level=1 */
static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
/* map node ctx => cabac ctx for level>1 */
- static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
+ static const uint8_t coeff_abs_levelgt1_ctx[2][8] = {
+ { 5, 5, 5, 5, 6, 7, 8, 9 },
+ { 5, 5, 5, 5, 6, 7, 8, 8 }, // 422/dc case
+ };
static const uint8_t coeff_abs_level_transition[2][8] = {
/* update node ctx after decoding a level=1 */
{ 1, 2, 3, 3, 4, 5, 6, 7 },
@@ -1652,7 +1660,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index,
last_coeff_ctx_base, sig_off);
} else {
- if (is_dc && max_coeff == 8) { // dc 422
+ if (is_dc && chroma422) { // dc 422
DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]);
} else {
coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index,
@@ -1661,7 +1669,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
#else
DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
} else {
- if (is_dc && max_coeff == 8) { // dc 422
+ if (is_dc && chroma422) { // dc 422
DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]);
} else {
DECODE_SIGNIFICANCE(max_coeff - 1, last, last);
@@ -1701,9 +1709,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
} \
} else { \
int coeff_abs = 2; \
- if (is_dc && max_coeff == 8) \
- node_ctx = FFMIN(node_ctx, 6); \
- ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; \
+ ctx = coeff_abs_levelgt1_ctx[is_dc && chroma422][node_ctx] + abs_level_m1_ctx_base; \
node_ctx = coeff_abs_level_transition[1][node_ctx]; \
\
while( coeff_abs < 15 && get_cabac( CC, ctx ) ) { \
@@ -1745,11 +1751,18 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
}
static void decode_cabac_residual_dc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
- decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1);
+ decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 0);
+}
+
+static void decode_cabac_residual_dc_internal_422(H264Context *h, DCTELEM *block,
+ int cat, int n, const uint8_t *scantable,
+ int max_coeff)
+{
+ decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 1);
}
static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
- decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
+ decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0, 0);
}
/* cat: 0-> DC 16x16 n = 0
@@ -1773,6 +1786,19 @@ static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *
decode_cabac_residual_dc_internal( h, block, cat, n, scantable, max_coeff );
}
+static av_always_inline void
+decode_cabac_residual_dc_422(H264Context *h, DCTELEM *block,
+ int cat, int n, const uint8_t *scantable,
+ int max_coeff)
+{
+ /* read coded block flag */
+ if (get_cabac(&h->cabac, &h->cabac_state[get_cabac_cbf_ctx(h, cat, n, max_coeff, 1)]) == 0) {
+ h->non_zero_count_cache[scan8[n]] = 0;
+ return;
+ }
+ decode_cabac_residual_dc_internal_422(h, block, cat, n, scantable, max_coeff);
+}
+
static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
/* read coded block flag */
if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) {
@@ -2325,17 +2351,14 @@ decode_intra_mb:
if(CHROMA444){
decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1);
decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2);
- } else {
- const int num_c8x8 = h->sps.chroma_format_idc;
-
+ } else if (CHROMA422) {
if( cbp&0x30 ){
int c;
for( c = 0; c < 2; c++ ) {
//av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
- decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3,
- CHROMA_DC_BLOCK_INDEX+c,
- CHROMA422 ? chroma422_dc_scan : chroma_dc_scan,
- 4*num_c8x8);
+ decode_cabac_residual_dc_422(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3,
+ CHROMA_DC_BLOCK_INDEX + c,
+ chroma422_dc_scan, 8);
}
}
@@ -2344,7 +2367,7 @@ decode_intra_mb:
for( c = 0; c < 2; c++ ) {
DCTELEM *mb = h->mb + (16*(16 + 16*c) << pixel_shift);
qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
- for (i8x8 = 0; i8x8 < num_c8x8; i8x8++) {
+ for (i8x8 = 0; i8x8 < 2; i8x8++) {
for (i = 0; i < 4; i++) {
const int index = 16 + 16 * c + 8*i8x8 + i;
//av_log(s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16);
@@ -2357,6 +2380,29 @@ decode_intra_mb:
fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
}
+ } else /* yuv420 */ {
+ if( cbp&0x30 ){
+ int c;
+ for( c = 0; c < 2; c++ ) {
+ //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
+ decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
+ }
+ }
+
+ if( cbp&0x20 ) {
+ int c, i;
+ for( c = 0; c < 2; c++ ) {
+ qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
+ for( i = 0; i < 4; i++ ) {
+ const int index = 16 + 16 * c + i;
+ //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
+ decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
+ }
+ }
+ } else {
+ fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
+ fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
+ }
}
} else {
fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
index 93697a83c1..a7de122c53 100644
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@@ -415,7 +415,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
#endif
sps->crop= get_bits1(&s->gb);
if(sps->crop){
- int crop_vertical_limit = sps->chroma_format_idc & 2 ? 16 : 8;
+ int crop_vertical_limit = sps->chroma_format_idc & 2 ? 16 : 8;
int crop_horizontal_limit = sps->chroma_format_idc == 3 ? 16 : 8;
sps->crop_left = get_ue_golomb(&s->gb);
sps->crop_right = get_ue_golomb(&s->gb);
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index cf0067b8f0..bd35aa3065 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
else\
c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
\
- c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
- c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
- c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
- c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
- c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
- c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
- c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
- c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
- c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
- c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
- c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
- c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
- c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
- c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
- c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
- c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
- c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
- c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
- c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
- c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
+ c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
+ c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
+ c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
+ c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
+ c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
+ c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
+ c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
+ c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
\
c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index c79ab0a625..490a936310 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -31,16 +31,18 @@
#include "dsputil.h"
//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
-typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
+typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
+ int log2_denom, int weight, int offset);
+typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
+ int log2_denom, int weightd, int weights, int offset);
/**
* Context for storing H.264 DSP functions
*/
typedef struct H264DSPContext{
/* weighted MC */
- h264_weight_func weight_h264_pixels_tab[10];
- h264_biweight_func biweight_h264_pixels_tab[10];
+ h264_weight_func weight_h264_pixels_tab[4];
+ h264_biweight_func biweight_h264_pixels_tab[4];
/* loop filter */
void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c
index 3023541634..4d5faf01c0 100644
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@@ -29,14 +29,16 @@
#define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
-#define H264_WEIGHT(W,H) \
-static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *p_block, int stride, int log2_denom, int weight, int offset){ \
+#define H264_WEIGHT(W) \
+static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
+ int log2_denom, int weight, int offset) \
+{ \
int y; \
- pixel *block = (pixel*)p_block; \
+ pixel *block = (pixel*)_block; \
stride >>= sizeof(pixel)-1; \
offset <<= (log2_denom + (BIT_DEPTH-8)); \
if(log2_denom) offset += 1<<(log2_denom-1); \
- for(y=0; y<H; y++, block += stride){ \
+ for (y = 0; y < height; y++, block += stride) { \
op_scale1(0); \
op_scale1(1); \
if(W==2) continue; \
@@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *p_block, int strid
op_scale1(15); \
} \
} \
-static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
+ int log2_denom, int weightd, int weights, int offset) \
+{ \
int y; \
pixel *dst = (pixel*)_dst; \
pixel *src = (pixel*)_src; \
stride >>= sizeof(pixel)-1; \
offset <<= (BIT_DEPTH-8); \
offset = ((offset + 1) | 1) << log2_denom; \
- for(y=0; y<H; y++, dst += stride, src += stride){ \
+ for (y = 0; y < height; y++, dst += stride, src += stride) { \
op_scale2(0); \
op_scale2(1); \
if(W==2) continue; \
@@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
} \
}
-H264_WEIGHT(16,16)
-H264_WEIGHT(16,8)
-H264_WEIGHT(8,16)
-H264_WEIGHT(8,8)
-H264_WEIGHT(8,4)
-H264_WEIGHT(4,8)
-H264_WEIGHT(4,4)
-H264_WEIGHT(4,2)
-H264_WEIGHT(2,4)
-H264_WEIGHT(2,2)
+H264_WEIGHT(16)
+H264_WEIGHT(8)
+H264_WEIGHT(4)
+H264_WEIGHT(2)
#undef op_scale1
#undef op_scale2
diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
index 64bc70dd47..c59976a1d9 100644
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@@ -228,16 +228,6 @@ void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *
void FUNCC(ff_h264_idct_add8_422)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
int i, j;
-#if 0
- av_log(NULL, AV_LOG_INFO, "idct\n");
- int32_t *b = block;
- for (int i = 0; i < 256; i++) {
- av_log(NULL, AV_LOG_INFO, "%5d ", b[i+256]);
- if (!((i+1) % 16))
- av_log(NULL, AV_LOG_INFO, "\n");
- }
-#endif
-
for(j=1; j<3; j++){
for(i=j*16; i<j*16+4; i++){
if(nnzc[ scan8[i] ])
@@ -296,13 +286,13 @@ void FUNCC(ff_h264_luma_dc_dequant_idct)(DCTELEM *p_output, DCTELEM *p_input, in
#undef stride
}
-void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *p_block, int qmul){
+void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *_block, int qmul){
const int stride= 16*2;
const int xStride= 16;
int i;
int temp[8];
static const uint8_t x_offset[2]={0, 16};
- dctcoef *block = (dctcoef*)p_block;
+ dctcoef *block = (dctcoef*)_block;
for(i=0; i<4; i++){
temp[2*i+0] = block[stride*i + xStride*0] + block[stride*i + xStride*1];
@@ -321,22 +311,13 @@ void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *p_block, int qmul){
block[stride*2+offset]= ((z1 - z2)*qmul + 128) >> 8;
block[stride*3+offset]= ((z0 - z3)*qmul + 128) >> 8;
}
-
-#if 0
- av_log(NULL, AV_LOG_INFO, "after chroma dc\n");
- for (int i = 0; i < 256; i++) {
- av_log(NULL, AV_LOG_INFO, "%5d ", block[i]);
- if (!((i+1) % 16))
- av_log(NULL, AV_LOG_INFO, "\n");
- }
-#endif
}
-void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *p_block, int qmul){
+void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *_block, int qmul){
const int stride= 16*2;
const int xStride= 16;
int a,b,c,d,e;
- dctcoef *block = (dctcoef*)p_block;
+ dctcoef *block = (dctcoef*)_block;
a= block[stride*0 + xStride*0];
b= block[stride*0 + xStride*1];
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index dce29f9f0b..a174b4ca3c 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -462,10 +462,10 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co
h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x16_dc , depth);\
h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc , depth);\
h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc , depth);\
- h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\
- h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\
- h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\
- h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\
+ h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l0t, depth);\
+ h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0lt, depth);\
+ h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l00, depth);\
+ h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0l0, depth);\
}\
}else{\
h->pred8x8[DC_PRED8x8 ]= FUNCD(pred8x8_dc_rv40);\
@@ -510,8 +510,13 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co
h->pred4x4_add [ HOR_PRED ]= FUNCC(pred4x4_horizontal_add , depth);\
h->pred8x8l_add [VERT_PRED ]= FUNCC(pred8x8l_vertical_add , depth);\
h->pred8x8l_add [ HOR_PRED ]= FUNCC(pred8x8l_horizontal_add , depth);\
+ if (chroma_format_idc == 1) {\
h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x8_vertical_add , depth);\
h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x8_horizontal_add , depth);\
+ } else {\
+ h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x16_vertical_add , depth);\
+ h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x16_horizontal_add , depth);\
+ }\
h->pred16x16_add[VERT_PRED8x8]= FUNCC(pred16x16_vertical_add , depth);\
h->pred16x16_add[ HOR_PRED8x8]= FUNCC(pred16x16_horizontal_add , depth);\
diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index 8021895378..074fad50ca 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -663,23 +663,45 @@ static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
FUNCC(pred4x4_dc)(src, NULL, stride);
}
+static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, int stride){
+ FUNCC(pred8x16_top_dc)(src, stride);
+ FUNCC(pred4x4_dc)(src, NULL, stride);
+}
+
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
FUNCC(pred8x8_dc)(src, stride);
FUNCC(pred4x4_top_dc)(src, NULL, stride);
}
+static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, int stride){
+ FUNCC(pred8x16_dc)(src, stride);
+ FUNCC(pred4x4_top_dc)(src, NULL, stride);
+}
+
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
FUNCC(pred8x8_left_dc)(src, stride);
FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride);
FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}
+static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, int stride){
+ FUNCC(pred8x16_left_dc)(src, stride);
+ FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride);
+ FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
+}
+
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
FUNCC(pred8x8_left_dc)(src, stride);
FUNCC(pred4x4_128_dc)(src , NULL, stride);
FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}
+static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, int stride){
+ FUNCC(pred8x16_left_dc)(src, stride);
+ FUNCC(pred4x4_128_dc)(src , NULL, stride);
+ FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
+}
+
static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
int j, k;
int a;
@@ -1126,8 +1148,24 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, c
FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
}
+static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+ int i;
+ for(i=0; i<4; i++)
+ FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+ for(i=4; i<8; i++)
+ FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
+}
+
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
int i;
for(i=0; i<4; i++)
FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
}
+
+static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+ int i;
+ for(i=0; i<4; i++)
+ FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+ for(i=4; i<8; i++)
+ FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
+}
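
The new pred8x16 variants above exist because 4:2:2 chroma macroblocks are 8 samples wide but 16 tall, so each 8x8 prediction is extended to cover two stacked 8x8 halves. As a quick reference, the chroma block dimensions per chroma_format_idc (values as in the H.264 spec; the helper name is illustrative only):

    /* Hypothetical helper: chroma macroblock size per chroma_format_idc. */
    static void chroma_mb_size(int chroma_format_idc, int *w, int *h)
    {
        switch (chroma_format_idc) {
        case 1:  *w = 8;  *h = 8;  break;   /* 4:2:0: halved both ways         */
        case 2:  *w = 8;  *h = 16; break;   /* 4:2:2: halved horizontally only */
        case 3:  *w = 16; *h = 16; break;   /* 4:4:4: full resolution          */
        default: *w = 0;  *h = 0;  break;   /* monochrome: no chroma planes    */
        }
    }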
diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index 7ee53b04e5..91f190525d 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -18,11 +18,11 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "avcodec.h"
#include <speex/speex.h>
#include <speex/speex_header.h>
#include <speex/speex_stereo.h>
#include <speex/speex_callbacks.h>
+#include "avcodec.h"
typedef struct {
SpeexBits bits;
@@ -60,14 +60,14 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
mode = speex_lib_get_mode(s->header->mode);
if (!mode) {
av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", s->header->mode);
- return -1;
+ return AVERROR_INVALIDDATA;
}
} else
av_log(avctx, AV_LOG_INFO, "Missing Speex header, assuming defaults.\n");
if (avctx->channels > 2) {
av_log(avctx, AV_LOG_ERROR, "Only stereo and mono are supported.\n");
- return -1;
+ return AVERROR(EINVAL);
}
speex_bits_init(&s->bits);
@@ -99,32 +99,42 @@ static int libspeex_decode_frame(AVCodecContext *avctx,
uint8_t *buf = avpkt->data;
int buf_size = avpkt->size;
LibSpeexContext *s = avctx->priv_data;
- int16_t *output = data, *end;
- int i, num_samples;
-
- num_samples = s->frame_size * avctx->channels;
- end = output + *data_size / sizeof(*output);
-
- speex_bits_read_from(&s->bits, buf, buf_size);
-
- for (i = 0; speex_bits_remaining(&s->bits) && output + num_samples < end; i++) {
- int ret = speex_decode_int(s->dec_state, &s->bits, output);
- if (ret <= -2) {
- av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
- return -1;
- } else if (ret == -1)
- // end of stream
- break;
+ int16_t *output = data;
+ int out_size, ret, consumed = 0;
+
+ /* check output buffer size */
+ out_size = s->frame_size * avctx->channels *
+ av_get_bytes_per_sample(avctx->sample_fmt);
+ if (*data_size < out_size) {
+ av_log(avctx, AV_LOG_ERROR, "Output buffer is too small\n");
+ return AVERROR(EINVAL);
+ }
- if (avctx->channels == 2)
- speex_decode_stereo_int(output, s->frame_size, &s->stereo);
+ /* if there is not enough data left for the smallest possible frame,
+ reset the libspeex buffer using the current packet, otherwise ignore
+ the current packet and keep decoding frames from the libspeex buffer. */
+ if (speex_bits_remaining(&s->bits) < 43) {
+ /* check for flush packet */
+ if (!buf || !buf_size) {
+ *data_size = 0;
+ return buf_size;
+ }
+ /* set new buffer */
+ speex_bits_read_from(&s->bits, buf, buf_size);
+ consumed = buf_size;
+ }
- output += num_samples;
+ /* decode a single frame */
+ ret = speex_decode_int(s->dec_state, &s->bits, output);
+ if (ret <= -2) {
+ av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
+ return AVERROR_INVALIDDATA;
}
+ if (avctx->channels == 2)
+ speex_decode_stereo_int(output, s->frame_size, &s->stereo);
- avctx->frame_size = s->frame_size * i;
- *data_size = avctx->channels * avctx->frame_size * sizeof(*output);
- return buf_size;
+ *data_size = out_size;
+ return consumed;
}
static av_cold int libspeex_decode_close(AVCodecContext *avctx)
@@ -138,6 +148,12 @@ static av_cold int libspeex_decode_close(AVCodecContext *avctx)
return 0;
}
+static av_cold void libspeex_decode_flush(AVCodecContext *avctx)
+{
+ LibSpeexContext *s = avctx->priv_data;
+ speex_bits_reset(&s->bits);
+}
+
AVCodec ff_libspeex_decoder = {
.name = "libspeex",
.type = AVMEDIA_TYPE_AUDIO,
@@ -146,5 +162,7 @@ AVCodec ff_libspeex_decoder = {
.init = libspeex_decode_init,
.close = libspeex_decode_close,
.decode = libspeex_decode_frame,
+ .flush = libspeex_decode_flush,
+ .capabilities = CODEC_CAP_SUBFRAMES | CODEC_CAP_DELAY,
.long_name = NULL_IF_CONFIG_SMALL("libspeex Speex"),
};
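
With this change the decoder emits one Speex frame per call and may report zero bytes consumed while it drains its internal bit buffer, hence the new CODEC_CAP_SUBFRAMES | CODEC_CAP_DELAY flags. A hedged caller-side sketch for the avcodec_decode_audio3() API of that era (assumes an output buffer of at least AVCODEC_MAX_AUDIO_FRAME_SIZE bytes and a writable AVPacket copy):

    #include <libavcodec/avcodec.h>

    static int drain_packet(AVCodecContext *avctx, AVPacket *pkt, int16_t *samples)
    {
        for (;;) {
            int out_size = AVCODEC_MAX_AUDIO_FRAME_SIZE; /* in: buffer size, out: bytes written */
            int used     = avcodec_decode_audio3(avctx, samples, &out_size, pkt);
            if (used < 0)
                return used;              /* decoding error */
            pkt->data += used;
            pkt->size -= used;
            if (out_size <= 0)
                break;                    /* decoder produced nothing more */
            /* ...forward out_size bytes of interleaved int16 samples... */
        }
        return 0;
    }

The loop keeps calling, eventually with an empty packet, until the decoder reports no output, which happens once its internal buffer holds less than one minimal frame (the 43-bit check above).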
diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index b5ba285bc9..f5f169a8e3 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -1893,24 +1893,50 @@ typedef struct MP3On4DecodeContext {
int syncword; ///< syncword patch
const uint8_t *coff; ///< channels offsets in output buffer
MPADecodeContext *mp3decctx[5]; ///< MPADecodeContext for every decoder instance
+ OUT_INT *decoded_buf; ///< output buffer for decoded samples
} MP3On4DecodeContext;
#include "mpeg4audio.h"
/* Next 3 arrays are indexed by channel config number (passed via codecdata) */
static const uint8_t mp3Frames[8] = {0,1,1,2,3,3,4,5}; /* number of mp3 decoder instances */
-/* offsets into output buffer, assume output order is FL FR BL BR C LFE */
+/* offsets into output buffer, assume output order is FL FR C LFE BL BR SL SR */
static const uint8_t chan_offset[8][5] = {
{0},
{0}, // C
{0}, // FLR
{2,0}, // C FLR
{2,0,3}, // C FLR BS
- {4,0,2}, // C FLR BLRS
- {4,0,2,5}, // C FLR BLRS LFE
- {4,0,2,6,5}, // C FLR BLRS BLR LFE
+ {2,0,3}, // C FLR BLRS
+ {2,0,4,3}, // C FLR BLRS LFE
+ {2,0,6,4,3}, // C FLR BLRS BLR LFE
};
+/* mp3on4 channel layouts */
+static const int16_t chan_layout[8] = {
+ 0,
+ AV_CH_LAYOUT_MONO,
+ AV_CH_LAYOUT_STEREO,
+ AV_CH_LAYOUT_SURROUND,
+ AV_CH_LAYOUT_4POINT0,
+ AV_CH_LAYOUT_5POINT0,
+ AV_CH_LAYOUT_5POINT1,
+ AV_CH_LAYOUT_7POINT1
+};
+
+static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
+{
+ MP3On4DecodeContext *s = avctx->priv_data;
+ int i;
+
+ for (i = 0; i < s->frames; i++)
+ av_free(s->mp3decctx[i]);
+
+ av_freep(&s->decoded_buf);
+
+ return 0;
+}
+
static int decode_init_mp3on4(AVCodecContext * avctx)
{
@@ -1931,6 +1957,7 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
s->frames = mp3Frames[cfg.chan_config];
s->coff = chan_offset[cfg.chan_config];
avctx->channels = ff_mpeg4audio_channels[cfg.chan_config];
+ avctx->channel_layout = chan_layout[cfg.chan_config];
if (cfg.sample_rate < 16000)
s->syncword = 0xffe00000;
@@ -1944,6 +1971,8 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
*/
// Allocate zeroed memory for the first decoder context
s->mp3decctx[0] = av_mallocz(sizeof(MPADecodeContext));
+ if (!s->mp3decctx[0])
+ goto alloc_fail;
// Put decoder context in place to make init_decode() happy
avctx->priv_data = s->mp3decctx[0];
decode_init(avctx);
@@ -1956,23 +1985,38 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
*/
for (i = 1; i < s->frames; i++) {
s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext));
+ if (!s->mp3decctx[i])
+ goto alloc_fail;
s->mp3decctx[i]->adu_mode = 1;
s->mp3decctx[i]->avctx = avctx;
+ s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;
+ }
+
+ /* Allocate buffer for multi-channel output if needed */
+ if (s->frames > 1) {
+ s->decoded_buf = av_malloc(MPA_FRAME_SIZE * MPA_MAX_CHANNELS *
+ sizeof(*s->decoded_buf));
+ if (!s->decoded_buf)
+ goto alloc_fail;
}
return 0;
+alloc_fail:
+ decode_close_mp3on4(avctx);
+ return AVERROR(ENOMEM);
}
-static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
+static void flush_mp3on4(AVCodecContext *avctx)
{
- MP3On4DecodeContext *s = avctx->priv_data;
int i;
+ MP3On4DecodeContext *s = avctx->priv_data;
- for (i = 0; i < s->frames; i++)
- av_free(s->mp3decctx[i]);
-
- return 0;
+ for (i = 0; i < s->frames; i++) {
+ MPADecodeContext *m = s->mp3decctx[i];
+ memset(m->synth_buf, 0, sizeof(m->synth_buf));
+ m->last_buf_size = 0;
+ }
}
@@ -1987,12 +2031,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
int fsize, len = buf_size, out_size = 0;
uint32_t header;
OUT_INT *out_samples = data;
- OUT_INT decoded_buf[MPA_FRAME_SIZE * MPA_MAX_CHANNELS];
OUT_INT *outptr, *bp;
- int fr, j, n;
+ int fr, j, n, ch;
- if(*data_size < MPA_FRAME_SIZE * MPA_MAX_CHANNELS * s->frames * sizeof(OUT_INT))
- return -1;
+ if (*data_size < MPA_FRAME_SIZE * avctx->channels * sizeof(OUT_INT)) {
+ av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n");
+ return AVERROR(EINVAL);
+ }
*data_size = 0;
// Discard too short frames
@@ -2000,10 +2045,11 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
return -1;
// If only one decoder interleave is not needed
- outptr = s->frames == 1 ? out_samples : decoded_buf;
+ outptr = s->frames == 1 ? out_samples : s->decoded_buf;
avctx->bit_rate = 0;
+ ch = 0;
for (fr = 0; fr < s->frames; fr++) {
fsize = AV_RB16(buf) >> 4;
fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE);
@@ -2016,6 +2062,14 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
break;
avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header);
+
+ if (ch + m->nb_channels > avctx->channels) {
+ av_log(avctx, AV_LOG_ERROR, "frame channel count exceeds codec "
+ "channel count\n");
+ return AVERROR_INVALIDDATA;
+ }
+ ch += m->nb_channels;
+
out_size += mp_decode_frame(m, outptr, buf, fsize);
buf += fsize;
len -= fsize;
@@ -2026,13 +2080,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
bp = out_samples + s->coff[fr];
if(m->nb_channels == 1) {
for(j = 0; j < n; j++) {
- *bp = decoded_buf[j];
+ *bp = s->decoded_buf[j];
bp += avctx->channels;
}
} else {
for(j = 0; j < n; j++) {
- bp[0] = decoded_buf[j++];
- bp[1] = decoded_buf[j];
+ bp[0] = s->decoded_buf[j++];
+ bp[1] = s->decoded_buf[j];
bp += avctx->channels;
}
}
@@ -2110,7 +2164,7 @@ AVCodec ff_mp3on4_decoder = {
.init = decode_init_mp3on4,
.close = decode_close_mp3on4,
.decode = decode_frame_mp3on4,
- .flush = flush,
+ .flush = flush_mp3on4,
.long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"),
};
#endif
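
The reworked chan_offset[] table gives, for each MP3 sub-stream, the index of its first channel in the FL FR C LFE BL BR SL SR output order, and the per-sample copy in decode_frame_mp3on4() then reduces to the following hedged, hypothetical helper (each sub-stream is mono or stereo):

    #include <stdint.h>

    /* Copy one decoded sub-stream into its slot of the interleaved
     * multi-channel output.  first_channel comes from the chan_offset[]
     * table, sub_channels is 1 or 2. */
    static void interleave_substream(int16_t *out, const int16_t *dec,
                                     int nb_samples, int out_channels,
                                     int sub_channels, int first_channel)
    {
        int i, c;
        for (i = 0; i < nb_samples; i++)
            for (c = 0; c < sub_channels; c++)
                out[i * out_channels + first_channel + c] =
                    dec[i * sub_channels + c];
    }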
diff --git a/libavcodec/mpegaudiodec_float.c b/libavcodec/mpegaudiodec_float.c
index 2fde46a5cd..312b84278f 100644
--- a/libavcodec/mpegaudiodec_float.c
+++ b/libavcodec/mpegaudiodec_float.c
@@ -83,7 +83,7 @@ AVCodec ff_mp3on4float_decoder = {
.init = decode_init_mp3on4,
.close = decode_close_mp3on4,
.decode = decode_frame_mp3on4,
- .flush = flush,
+ .flush = flush_mp3on4,
.long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"),
};
#endif
diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c
index 00fb0a73d9..bf4fd0f016 100644
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
}
static av_always_inline
-void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
+void weight_h264_W_altivec(uint8_t *block, int stride, int height,
+ int log2_denom, int weight, int offset, int w)
{
int y, aligned;
vec_u8 vblock;
@@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
voffset = vec_splat(vtemp, 5);
aligned = !((unsigned long)block & 0xf);
- for (y=0; y<h; y++) {
+ for (y = 0; y < height; y++) {
vblock = vec_ld(0, block);
v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
@@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
}
static av_always_inline
-void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
- int weightd, int weights, int offset, int w, int h)
+void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
+ int log2_denom, int weightd, int weights, int offset, int w)
{
int y, dst_aligned, src_aligned;
vec_u8 vsrc, vdst;
@@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
dst_aligned = !((unsigned long)dst & 0xf);
src_aligned = !((unsigned long)src & 0xf);
- for (y=0; y<h; y++) {
+ for (y = 0; y < height; y++) {
vdst = vec_ld(0, dst);
vsrc = vec_ld(0, src);
@@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
}
}
-#define H264_WEIGHT(W,H) \
-static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
- weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
+#define H264_WEIGHT(W) \
+static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
+ int log2_denom, int weight, int offset){ \
+ weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
}\
-static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
- biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
+ int log2_denom, int weightd, int weights, int offset){ \
+ biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
}
-H264_WEIGHT(16,16)
-H264_WEIGHT(16, 8)
-H264_WEIGHT( 8,16)
-H264_WEIGHT( 8, 8)
-H264_WEIGHT( 8, 4)
+H264_WEIGHT(16)
+H264_WEIGHT( 8)
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
- c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
- c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
- c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
- c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
- c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
- c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
- c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
- c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
- c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
- c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
}
}
}
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 205e3600c5..405fc31318 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -158,6 +158,8 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, int l
case PIX_FMT_YUV420P9BE:
case PIX_FMT_YUV420P10LE:
case PIX_FMT_YUV420P10BE:
+ case PIX_FMT_YUV422P9LE:
+ case PIX_FMT_YUV422P9BE:
case PIX_FMT_YUV422P10LE:
case PIX_FMT_YUV422P10BE:
case PIX_FMT_YUV444P9LE:
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 2643b34d67..8c0380315e 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -41,24 +41,57 @@ static void free_buffers(VP8Context *s)
av_freep(&s->top_nnz);
av_freep(&s->edge_emu_buffer);
av_freep(&s->top_border);
- av_freep(&s->segmentation_map);
s->macroblocks = NULL;
}
-static void vp8_decode_flush(AVCodecContext *avctx)
+static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
+{
+ int ret;
+ if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
+ return ret;
+ if (!s->maps_are_invalid && s->num_maps_to_be_freed) {
+ f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
+ } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
+ ff_thread_release_buffer(s->avctx, f);
+ return AVERROR(ENOMEM);
+ }
+ return 0;
+}
+
+static void vp8_release_frame(VP8Context *s, AVFrame *f, int is_close)
+{
+ if (!is_close) {
+ if (f->ref_index[0]) {
+ assert(s->num_maps_to_be_freed < FF_ARRAY_ELEMS(s->segmentation_maps));
+ s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
+ f->ref_index[0] = NULL;
+ }
+ } else {
+ av_freep(&f->ref_index[0]);
+ }
+ ff_thread_release_buffer(s->avctx, f);
+}
+
+static void vp8_decode_flush_impl(AVCodecContext *avctx, int force, int is_close)
{
VP8Context *s = avctx->priv_data;
int i;
- if (!avctx->is_copy) {
+ if (!avctx->is_copy || force) {
for (i = 0; i < 5; i++)
if (s->frames[i].data[0])
- ff_thread_release_buffer(avctx, &s->frames[i]);
+ vp8_release_frame(s, &s->frames[i], is_close);
}
memset(s->framep, 0, sizeof(s->framep));
free_buffers(s);
+ s->maps_are_invalid = 1;
+}
+
+static void vp8_decode_flush(AVCodecContext *avctx)
+{
+ vp8_decode_flush_impl(avctx, 0, 0);
}
static int update_dimensions(VP8Context *s, int width, int height)
@@ -68,7 +101,7 @@ static int update_dimensions(VP8Context *s, int width, int height)
if (av_image_check_size(width, height, 0, s->avctx))
return AVERROR_INVALIDDATA;
- vp8_decode_flush(s->avctx);
+ vp8_decode_flush_impl(s->avctx, 1, 0);
avcodec_set_dimensions(s->avctx, width, height);
}
@@ -81,10 +114,9 @@ static int update_dimensions(VP8Context *s, int width, int height)
s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
s->top_nnz = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
s->top_border = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
- s->segmentation_map = av_mallocz(s->mb_width*s->mb_height);
if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
- !s->top_nnz || !s->top_border || !s->segmentation_map)
+ !s->top_nnz || !s->top_border)
return AVERROR(ENOMEM);
s->macroblocks = s->macroblocks_base + 1;
@@ -1508,6 +1540,14 @@ static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
}
}
+static void release_queued_segmaps(VP8Context *s, int is_close)
+{
+ int leave_behind = is_close ? 0 : !s->maps_are_invalid;
+ while (s->num_maps_to_be_freed > leave_behind)
+ av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
+ s->maps_are_invalid = 0;
+}
+
static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
AVPacket *avpkt)
{
@@ -1516,6 +1556,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
enum AVDiscard skip_thresh;
AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT];
+ release_queued_segmaps(s, 0);
+
if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
return ret;
@@ -1538,7 +1580,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
&s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
&s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
&s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
- ff_thread_release_buffer(avctx, &s->frames[i]);
+ vp8_release_frame(s, &s->frames[i], 0);
// find a free buffer
for (i = 0; i < 5; i++)
@@ -1559,8 +1601,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
curframe->key_frame = s->keyframe;
curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
curframe->reference = referenced ? 3 : 0;
- curframe->ref_index[0] = s->segmentation_map;
- if ((ret = ff_thread_get_buffer(avctx, curframe))) {
+ if ((ret = vp8_alloc_frame(s, curframe))) {
av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
return ret;
}
@@ -1652,8 +1693,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
- decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy,
- prev_frame ? prev_frame->ref_index[0] + mb_xy : NULL);
+ decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
+ prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
@@ -1736,7 +1777,8 @@ static av_cold int vp8_decode_init(AVCodecContext *avctx)
static av_cold int vp8_decode_free(AVCodecContext *avctx)
{
- vp8_decode_flush(avctx);
+ vp8_decode_flush_impl(avctx, 0, 1);
+ release_queued_segmaps(avctx->priv_data, 1);
return 0;
}
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index 468e28e8d5..36c21df217 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -130,7 +130,6 @@ typedef struct {
uint8_t *intra4x4_pred_mode_top;
uint8_t intra4x4_pred_mode_left[4];
- uint8_t *segmentation_map;
/**
* Macroblocks can have one of 4 different quants in a frame when
@@ -237,6 +236,16 @@ typedef struct {
H264PredContext hpc;
vp8_mc_func put_pixels_tab[3][3][3];
AVFrame frames[5];
+
+ /**
+ * A list of segmentation_map buffers that are to be free()'ed in
+ * the next decoding iteration. We can't free() them right away
+ * because the map may still be used by subsequent decoding threads.
+ * Unused if frame threading is off.
+ */
+ uint8_t *segmentation_maps[5];
+ int num_maps_to_be_freed;
+ int maps_are_invalid;
} VP8Context;
#endif /* AVCODEC_VP8_H */
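
The deferred-free scheme described in the comment above amounts to a small buffer-parking pool: maps released while another frame thread may still be reading them are queued, then recycled or freed at the start of the next decode call. A minimal sketch under illustrative names (not the decoder's own API):

    #include <stdint.h>
    #include <stdlib.h>

    #define MAX_PARKED 5    /* one slot per reference frame, as in VP8Context */

    typedef struct {
        uint8_t *parked[MAX_PARKED];
        int      num_parked;
    } SegmapPool;

    /* Park a map whose readers may still be running; free it only if the
     * pool is unexpectedly full. */
    static void pool_park(SegmapPool *p, uint8_t *map)
    {
        if (p->num_parked < MAX_PARKED)
            p->parked[p->num_parked++] = map;
        else
            free(map);
    }

    /* Reuse a parked map when one is available, otherwise allocate fresh. */
    static uint8_t *pool_get(SegmapPool *p, size_t size)
    {
        if (p->num_parked)
            return p->parked[--p->num_parked];
        return calloc(1, size);
    }

    /* Called at the start of the next decode call (or on close): now it is
     * safe to actually free the parked maps. */
    static void pool_drain(SegmapPool *p)
    {
        while (p->num_parked)
            free(p->parked[--p->num_parked]);
    }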
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 70f1b86c44..6627d21bd8 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1055,14 +1055,6 @@ emu_edge mmx
; int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
-%macro SPLATD_MMX 1
- punpckldq %1, %1
-%endmacro
-
-%macro SPLATD_SSE2 1
- pshufd %1, %1, 0
-%endmacro
-
%macro VECTOR_CLIP_INT32 4
cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
%ifidn %1, sse2
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 2deb577ca6..37e7a094ce 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -24,6 +24,146 @@
SECTION_TEXT
+;---------------------------------------------------------------------------------
+; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
+;---------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_SCALAR 2
+%ifdef ARCH_X86_64
+cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
+%else
+cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+ movss m0, mulm
+%endif
+ SPLATD m0
+ shl lenq, 2
+ add srcq, lenq
+ add dstq, lenq
+ neg lenq
+.loop:
+%ifidn %1, sse2
+ cvtdq2ps m1, [srcq+lenq ]
+ cvtdq2ps m2, [srcq+lenq+16]
+%else
+ cvtpi2ps m1, [srcq+lenq ]
+ cvtpi2ps m3, [srcq+lenq+ 8]
+ cvtpi2ps m2, [srcq+lenq+16]
+ cvtpi2ps m4, [srcq+lenq+24]
+ movlhps m1, m3
+ movlhps m2, m4
+%endif
+ mulps m1, m0
+ mulps m2, m0
+ mova [dstq+lenq ], m1
+ mova [dstq+lenq+16], m2
+ add lenq, 32
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM
+%define SPLATD SPLATD_SSE
+%define movdqa movaps
+INT32_TO_FLOAT_FMUL_SCALAR sse, 5
+%undef movdqa
+%define SPLATD SPLATD_SSE2
+INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
+%undef SPLATD
+
+
+;------------------------------------------------------------------------------
+; void ff_float_to_int16(int16_t *dst, const float *src, long len);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16 2
+cglobal float_to_int16_%1, 3,3,%2, dst, src, len
+ add lenq, lenq
+ lea srcq, [srcq+2*lenq]
+ add dstq, lenq
+ neg lenq
+.loop:
+%ifidn %1, sse2
+ cvtps2dq m0, [srcq+2*lenq ]
+ cvtps2dq m1, [srcq+2*lenq+16]
+ packssdw m0, m1
+ mova [dstq+lenq], m0
+%else
+ cvtps2pi m0, [srcq+2*lenq ]
+ cvtps2pi m1, [srcq+2*lenq+ 8]
+ cvtps2pi m2, [srcq+2*lenq+16]
+ cvtps2pi m3, [srcq+2*lenq+24]
+ packssdw m0, m1
+ packssdw m2, m3
+ mova [dstq+lenq ], m0
+ mova [dstq+lenq+8], m2
+%endif
+ add lenq, 16
+ js .loop
+%ifnidn %1, sse2
+ emms
+%endif
+ REP_RET
+%endmacro
+
+INIT_XMM
+FLOAT_TO_INT16 sse2, 2
+INIT_MMX
+FLOAT_TO_INT16 sse, 0
+%define cvtps2pi pf2id
+FLOAT_TO_INT16 3dnow, 0
+%undef cvtps2pi
+
+
+;-------------------------------------------------------------------------------
+; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
+;-------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_INTERLEAVE2 1
+cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
+ lea lenq, [4*r2q]
+ mov src1q, [src0q+gprsize]
+ mov src0q, [src0q]
+ add dstq, lenq
+ add src0q, lenq
+ add src1q, lenq
+ neg lenq
+.loop:
+%ifidn %1, sse2
+ cvtps2dq m0, [src0q+lenq]
+ cvtps2dq m1, [src1q+lenq]
+ packssdw m0, m1
+ movhlps m1, m0
+ punpcklwd m0, m1
+ mova [dstq+lenq], m0
+%else
+ cvtps2pi m0, [src0q+lenq ]
+ cvtps2pi m1, [src0q+lenq+8]
+ cvtps2pi m2, [src1q+lenq ]
+ cvtps2pi m3, [src1q+lenq+8]
+ packssdw m0, m1
+ packssdw m2, m3
+ mova m1, m0
+ punpcklwd m0, m2
+ punpckhwd m1, m2
+ mova [dstq+lenq ], m0
+ mova [dstq+lenq+8], m1
+%endif
+ add lenq, 16
+ js .loop
+%ifnidn %1, sse2
+ emms
+%endif
+ REP_RET
+%endmacro
+
+INIT_MMX
+%define cvtps2pi pf2id
+FLOAT_TO_INT16_INTERLEAVE2 3dnow
+%undef cvtps2pi
+%define movdqa movaps
+FLOAT_TO_INT16_INTERLEAVE2 sse
+%undef movdqa
+INIT_XMM
+FLOAT_TO_INT16_INTERLEAVE2 sse2
+
+
%macro PSWAPD_SSE 2
pshufw %1, %2, 0x4e
%endmacro
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index ba2c2c9bd5..a3d8f89816 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -26,133 +26,32 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtpi2ps (%2,%0), %%xmm0 \n"
- "cvtpi2ps 8(%2,%0), %%xmm1 \n"
- "cvtpi2ps 16(%2,%0), %%xmm2 \n"
- "cvtpi2ps 24(%2,%0), %%xmm3 \n"
- "movlhps %%xmm1, %%xmm0 \n"
- "movlhps %%xmm3, %%xmm2 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm2 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm2, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtdq2ps (%2,%0), %%xmm0 \n"
- "cvtdq2ps 16(%2,%0), %%xmm1 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm1 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm1, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
+#if HAVE_YASM
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- // not bit-exact: pf2id uses different rounding than C and SSE
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "pf2id (%2,%0,2) , %%mm0 \n\t"
- "pf2id 8(%2,%0,2) , %%mm1 \n\t"
- "pf2id 16(%2,%0,2) , %%mm2 \n\t"
- "pf2id 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "femms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
+void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
+void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
- "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
- "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
- "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "emms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
+void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
+void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
- "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
- "packssdw %%xmm1 , %%xmm0 \n\t"
- "movdqa %%xmm0 , (%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
+void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
-#if !HAVE_YASM
-#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#endif
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
DECLARE_ALIGNED(16, int16_t, tmp)[len];\
int i,j,c;\
for(c=0; c<channels; c++){\
- float_to_int16_##cpu(tmp, src[c], len);\
+ ff_float_to_int16_##cpu(tmp, src[c], len);\
for(i=0, j=c; i<len; i++, j+=channels)\
dst[j] = tmp[i];\
}\
@@ -160,73 +59,18 @@ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
if(channels==1)\
- float_to_int16_##cpu(dst, src[0], len);\
+ ff_float_to_int16_##cpu(dst, src[0], len);\
else if(channels==2){\
- x86_reg reglen = len; \
- const float *src0 = src[0];\
- const float *src1 = src[1];\
- __asm__ volatile(\
- "shl $2, %0 \n"\
- "add %0, %1 \n"\
- "add %0, %2 \n"\
- "add %0, %3 \n"\
- "neg %0 \n"\
- body\
- :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
- );\
+ ff_float_to_int16_interleave2_##cpu(dst, src, len);\
}else if(channels==6){\
ff_float_to_int16_interleave6_##cpu(dst, src, len);\
}else\
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
- "1: \n"
- "pf2id (%2,%0), %%mm0 \n"
- "pf2id 8(%2,%0), %%mm1 \n"
- "pf2id (%3,%0), %%mm2 \n"
- "pf2id 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "femms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse,
- "1: \n"
- "cvtps2pi (%2,%0), %%mm0 \n"
- "cvtps2pi 8(%2,%0), %%mm1 \n"
- "cvtps2pi (%3,%0), %%mm2 \n"
- "cvtps2pi 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "emms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse2,
- "1: \n"
- "cvtps2dq (%2,%0), %%xmm0 \n"
- "cvtps2dq (%3,%0), %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "movhlps %%xmm0, %%xmm1 \n"
- "punpcklwd %%xmm1, %%xmm0 \n"
- "movdqa %%xmm0, (%1,%0) \n"
- "add $16, %0 \n"
- "js 1b \n"
-)
+FLOAT_TO_INT16_INTERLEAVE(3dnow)
+FLOAT_TO_INT16_INTERLEAVE(sse)
+FLOAT_TO_INT16_INTERLEAVE(sse2)
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
if(channels==6)
@@ -235,7 +79,6 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
float_to_int16_interleave_3dnow(dst, src, len, channels);
}
-#if HAVE_YASM
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
@@ -269,34 +112,32 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
- if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_YASM
+ if (mm_flags & AV_CPU_FLAG_MMX) {
c->float_interleave = float_interleave_mmx;
-#endif
- if(mm_flags & AV_CPU_FLAG_3DNOW){
+ if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->float_to_int16 = float_to_int16_3dnow;
+ c->float_to_int16 = ff_float_to_int16_3dnow;
c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
}
}
- if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
+ if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) {
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
}
}
- if(mm_flags & AV_CPU_FLAG_SSE){
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
- c->float_to_int16 = float_to_int16_sse;
+ if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+ c->float_to_int16 = ff_float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
-#if HAVE_YASM
c->float_interleave = float_interleave_sse;
-#endif
}
- if(mm_flags & AV_CPU_FLAG_SSE2){
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
- c->float_to_int16 = float_to_int16_sse2;
+ if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+ c->float_to_int16 = ff_float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
}
}
+#endif
}
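
For reference, the scalar behaviour the new yasm kernels implement looks roughly like the sketch below; this is hedged and mirrors the generic C code in spirit rather than verbatim. As noted in the removed inline asm, the 3DNow! path is not bit-exact with this because pf2id rounds differently, which is why it stays disabled under CODEC_FLAG_BITEXACT:

    #include <math.h>
    #include <stdint.h>

    static inline int16_t clip_int16_sketch(long v)
    {
        return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v;
    }

    static void int32_to_float_fmul_scalar_c(float *dst, const int32_t *src,
                                             float mul, int len)
    {
        int i;
        for (i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }

    /* Two-channel interleave: src[0]/src[1] hold len samples each, already
     * scaled to the int16 range. */
    static void float_to_int16_interleave2_c(int16_t *dst, const float *const *src,
                                             long len)
    {
        long i;
        for (i = 0; i < len; i++) {
            dst[2 * i]     = clip_int16_sketch(lrintf(src[0][i]));
            dst[2 * i + 1] = clip_int16_sketch(lrintf(src[1][i]));
        }
    }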
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index bb0af86097..cc96cb1f3b 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -28,21 +28,20 @@ SECTION .text
;-----------------------------------------------------------------------------
; biweight pred:
;
-; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
-; int log2_denom, int weightd, int weights,
-; int offset);
+; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
+; int height, int log2_denom, int weightd,
+; int weights, int offset);
; and
-; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
-; int log2_denom, int weight,
-; int offset);
+; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
+; int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
- add r4, r4
- inc r4
- movd m3, r3d
- movd m5, r4d
- movd m6, r2d
+ add r5, r5
+ inc r5
+ movd m3, r4d
+ movd m5, r5d
+ movd m6, r3d
pslld m5, m6
psrld m5, 1
%if mmsize == 16
@@ -71,60 +70,41 @@ SECTION .text
packuswb m0, m1
%endmacro
-%macro WEIGHT_FUNC_DBL_MM 1
-cglobal h264_weight_16x%1_mmx2, 5, 5, 0
+INIT_MMX
+cglobal h264_weight_16_mmx2, 6, 6, 0
WEIGHT_SETUP
- mov r2, %1
-%if %1 == 16
.nextrow
WEIGHT_OP 0, 4
mova [r0 ], m0
WEIGHT_OP 8, 12
mova [r0+8], m0
add r0, r1
- dec r2
+ dec r2d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
-%endif
-%endmacro
-INIT_MMX
-WEIGHT_FUNC_DBL_MM 16
-WEIGHT_FUNC_DBL_MM 8
-
-%macro WEIGHT_FUNC_MM 4
-cglobal h264_weight_%1x%2_%4, 7, 7, %3
+%macro WEIGHT_FUNC_MM 3
+cglobal h264_weight_%1_%3, 6, 6, %2
WEIGHT_SETUP
- mov r2, %2
-%if %2 == 16
.nextrow
WEIGHT_OP 0, mmsize/2
mova [r0], m0
add r0, r1
- dec r2
+ dec r2d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
-%endif
%endmacro
INIT_MMX
-WEIGHT_FUNC_MM 8, 16, 0, mmx2
-WEIGHT_FUNC_MM 8, 8, 0, mmx2
-WEIGHT_FUNC_MM 8, 4, 0, mmx2
+WEIGHT_FUNC_MM 8, 0, mmx2
INIT_XMM
-WEIGHT_FUNC_MM 16, 16, 8, sse2
-WEIGHT_FUNC_MM 16, 8, 8, sse2
+WEIGHT_FUNC_MM 16, 8, sse2
-%macro WEIGHT_FUNC_HALF_MM 5
-cglobal h264_weight_%1x%2_%5, 5, 5, %4
+%macro WEIGHT_FUNC_HALF_MM 3
+cglobal h264_weight_%1_%3, 6, 6, %2
WEIGHT_SETUP
- mov r2, %2/2
+ sar r2d, 1
lea r3, [r1*2]
-%if %2 == mmsize
.nextrow
WEIGHT_OP 0, r1
movh [r0], m0
@@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
movh [r0+r1], m0
%endif
add r0, r3
- dec r2
+ dec r2d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
-%endif
%endmacro
INIT_MMX
-WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
INIT_XMM
-WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SETUP 0
- add r6, 1
- or r6, 1
- add r3, 1
- movd m3, r4d
- movd m4, r5d
- movd m5, r6d
- movd m6, r3d
+%ifdef ARCH_X86_64
+%define off_regd r11d
+%else
+%define off_regd r3d
+%endif
+ mov off_regd, r7m
+ add off_regd, 1
+ or off_regd, 1
+ add r4, 1
+ movd m3, r5d
+ movd m4, r6d
+ movd m5, off_regd
+ movd m6, r4d
pslld m5, m6
psrld m5, 1
%if mmsize == 16
@@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
packuswb m0, m1
%endmacro
-%macro BIWEIGHT_FUNC_DBL_MM 1
-cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
+INIT_MMX
+cglobal h264_biweight_16_mmx2, 7, 7, 0
BIWEIGHT_SETUP
- mov r3, %1
-%if %1 == 16
+ movifnidn r3d, r3m
.nextrow
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, 4
@@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
mova [r0+8], m0
add r0, r2
add r1, r2
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
-%endif
-%endmacro
-INIT_MMX
-BIWEIGHT_FUNC_DBL_MM 16
-BIWEIGHT_FUNC_DBL_MM 8
-
-%macro BIWEIGHT_FUNC_MM 4
-cglobal h264_biweight_%1x%2_%4, 7, 7, %3
+%macro BIWEIGHT_FUNC_MM 3
+cglobal h264_biweight_%1_%3, 7, 7, %2
BIWEIGHT_SETUP
- mov r3, %2
-%if %2 == 16
+ movifnidn r3d, r3m
.nextrow
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, mmsize/2
@@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
mova [r0], m0
add r0, r2
add r1, r2
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
-%endif
%endmacro
INIT_MMX
-BIWEIGHT_FUNC_MM 8, 16, 0, mmx2
-BIWEIGHT_FUNC_MM 8, 8, 0, mmx2
-BIWEIGHT_FUNC_MM 8, 4, 0, mmx2
+BIWEIGHT_FUNC_MM 8, 0, mmx2
INIT_XMM
-BIWEIGHT_FUNC_MM 16, 16, 8, sse2
-BIWEIGHT_FUNC_MM 16, 8, 8, sse2
+BIWEIGHT_FUNC_MM 16, 8, sse2
-%macro BIWEIGHT_FUNC_HALF_MM 5
-cglobal h264_biweight_%1x%2_%5, 7, 7, %4
+%macro BIWEIGHT_FUNC_HALF_MM 3
+cglobal h264_biweight_%1_%3, 7, 7, %2
BIWEIGHT_SETUP
- mov r3, %2/2
+ movifnidn r3d, r3m
+ sar r3, 1
lea r4, [r2*2]
-%if %2 == mmsize
.nextrow
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, r2
@@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
%endif
add r0, r4
add r1, r4
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
-%endif
%endmacro
INIT_MMX
-BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
+BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
INIT_XMM
-BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
+BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SSSE3_SETUP 0
- add r6, 1
- or r6, 1
- add r3, 1
- movd m4, r4d
- movd m0, r5d
- movd m5, r6d
- movd m6, r3d
+%ifdef ARCH_X86_64
+%define off_regd r11d
+%else
+%define off_regd r3d
+%endif
+ mov off_regd, r7m
+ add off_regd, 1
+ or off_regd, 1
+ add r4, 1
+ movd m4, r5d
+ movd m0, r6d
+ movd m5, off_regd
+ movd m6, r4d
pslld m5, m6
psrld m5, 1
punpcklbw m4, m0
@@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
packuswb m0, m2
%endmacro
-%macro BIWEIGHT_SSSE3_16 1
-cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
+INIT_XMM
+cglobal h264_biweight_16_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_SETUP
- mov r3, %1
+ movifnidn r3d, r3m
-%if %1 == 16
.nextrow
movh m0, [r0]
movh m2, [r0+8]
@@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
mova [r0], m0
add r0, r2
add r1, r2
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
-%endif
-%endmacro
INIT_XMM
-BIWEIGHT_SSSE3_16 16
-BIWEIGHT_SSSE3_16 8
-
-%macro BIWEIGHT_SSSE3_8 1
-cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
+cglobal h264_biweight_8_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_SETUP
- mov r3, %1/2
+ movifnidn r3d, r3m
+ sar r3, 1
lea r4, [r2*2]
-%if %1 == 16
.nextrow
movh m0, [r0]
movh m1, [r1]
@@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
movhps [r0+r2], m0
add r0, r4
add r1, r4
- dec r3
+ dec r3d
jnz .nextrow
REP_RET
-%else
- jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
-%endif
-%endmacro
-
-INIT_XMM
-BIWEIGHT_SSSE3_8 16
-BIWEIGHT_SSSE3_8 8
-BIWEIGHT_SSSE3_8 4
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 1c58d72d94..20df6fbab5 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -36,33 +36,26 @@ cextern pw_1
SECTION .text
;-----------------------------------------------------------------------------
-; void h264_weight(uint8_t *dst, int stride, int log2_denom,
+; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
; int weight, int offset);
;-----------------------------------------------------------------------------
-%ifdef ARCH_X86_32
-DECLARE_REG_TMP 2
-%else
-DECLARE_REG_TMP 10
-%endif
-
-%macro WEIGHT_PROLOGUE 1
- mov t0, %1
+%macro WEIGHT_PROLOGUE 0
.prologue
- PROLOGUE 0,5,8
+ PROLOGUE 0,6,8
movifnidn r0, r0mp
movifnidn r1d, r1m
- movifnidn r3d, r3m
movifnidn r4d, r4m
+ movifnidn r5d, r5m
%endmacro
%macro WEIGHT_SETUP 1
mova m0, [pw_1]
- movd m2, r2m
+ movd m2, r3m
pslld m0, m2 ; 1<<log2_denom
SPLATW m0, m0
- shl r4, 19 ; *8, move to upper half of dword
- lea r4, [r4+r3*2+0x10000]
- movd m3, r4d ; weight<<1 | 1+(offset<<(3))
+ shl r5, 19 ; *8, move to upper half of dword
+ lea r5, [r5+r4*2+0x10000]
+ movd m3, r5d ; weight<<1 | 1+(offset<<(3))
pshufd m3, m3, 0
mova m4, [pw_pixel_max]
paddw m2, [sq_1] ; log2_denom+1
@@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
%endmacro
%macro WEIGHT_FUNC_DBL 1
-cglobal h264_weight_16x16_10_%1
- WEIGHT_PROLOGUE 16
+cglobal h264_weight_16_10_%1
+ WEIGHT_PROLOGUE
WEIGHT_SETUP %1
.nextrow
WEIGHT_OP %1, 0
@@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
WEIGHT_OP %1, 16
mova [r0+16], m5
add r0, r1
- dec t0
+ dec r2d
jnz .nextrow
REP_RET
-
-cglobal h264_weight_16x8_10_%1
- mov t0, 8
- jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
%endmacro
INIT_XMM
@@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
%macro WEIGHT_FUNC_MM 1
-cglobal h264_weight_8x16_10_%1
- WEIGHT_PROLOGUE 16
+cglobal h264_weight_8_10_%1
+ WEIGHT_PROLOGUE
WEIGHT_SETUP %1
.nextrow
WEIGHT_OP %1, 0
mova [r0], m5
add r0, r1
- dec t0
+ dec r2d
jnz .nextrow
REP_RET
-
-cglobal h264_weight_8x8_10_%1
- mov t0, 8
- jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
-
-cglobal h264_weight_8x4_10_%1
- mov t0, 4
- jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
%endmacro
INIT_XMM
@@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
%macro WEIGHT_FUNC_HALF_MM 1
-cglobal h264_weight_4x8_10_%1
- WEIGHT_PROLOGUE 4
+cglobal h264_weight_4_10_%1
+ WEIGHT_PROLOGUE
+ sar r2d, 1
WEIGHT_SETUP %1
lea r3, [r1*2]
.nextrow
@@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
movh [r0], m5
movhps [r0+r1], m5
add r0, r3
- dec t0
+ dec r2d
jnz .nextrow
REP_RET
-
-cglobal h264_weight_4x4_10_%1
- mov t0, 2
- jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
-
-cglobal h264_weight_4x2_10_%1
- mov t0, 1
- jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
%endmacro
INIT_XMM
@@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
;-----------------------------------------------------------------------------
-; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
-; int weightd, int weights, int offset);
+; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
+; int log2_denom, int weightd, int weights, int offset);
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_32
-DECLARE_REG_TMP 2,3
+DECLARE_REG_TMP 3
%else
-DECLARE_REG_TMP 10,2
+DECLARE_REG_TMP 10
%endif
-%macro BIWEIGHT_PROLOGUE 1
- mov t0, %1
+%macro BIWEIGHT_PROLOGUE 0
.prologue
PROLOGUE 0,7,8
movifnidn r0, r0mp
movifnidn r1, r1mp
- movifnidn t1d, r2m
- movifnidn r4d, r4m
+ movifnidn r2d, r2m
movifnidn r5d, r5m
movifnidn r6d, r6m
+ movifnidn t0d, r7m
%endmacro
%macro BIWEIGHT_SETUP 1
- lea r6, [r6*4+1] ; (offset<<2)+1
- or r6, 1
- shl r5, 16
- or r4, r5
- movd m4, r4d ; weightd | weights
- movd m5, r6d ; (offset+1)|1
- movd m6, r3m ; log2_denom
+ lea t0, [t0*4+1] ; (offset<<2)+1
+ or t0, 1
+ shl r6, 16
+ or r5, r6
+ movd m4, r5d ; weightd | weights
+ movd m5, t0d ; (offset+1)|1
+ movd m6, r4m ; log2_denom
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
paddd m6, [sq_1]
pshufd m4, m4, 0
pshufd m5, m5, 0
mova m3, [pw_pixel_max]
+ movifnidn r3d, r3m
%ifnidn %1, sse4
pxor m7, m7
%endif
@@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
%endmacro
%macro BIWEIGHT_FUNC_DBL 1
-cglobal h264_biweight_16x16_10_%1
- BIWEIGHT_PROLOGUE 16
+cglobal h264_biweight_16_10_%1
+ BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1
.nextrow
BIWEIGHT %1, 0
mova [r0 ], m0
BIWEIGHT %1, 16
mova [r0+16], m0
- add r0, t1
- add r1, t1
- dec t0
+ add r0, r2
+ add r1, r2
+ dec r3d
jnz .nextrow
REP_RET
-
-cglobal h264_biweight_16x8_10_%1
- mov t0, 8
- jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
%endmacro
INIT_XMM
@@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
BIWEIGHT_FUNC_DBL sse4
%macro BIWEIGHT_FUNC 1
-cglobal h264_biweight_8x16_10_%1
- BIWEIGHT_PROLOGUE 16
+cglobal h264_biweight_8_10_%1
+ BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1
.nextrow
BIWEIGHT %1, 0
mova [r0], m0
- add r0, t1
- add r1, t1
- dec t0
+ add r0, r2
+ add r1, r2
+ dec r3d
jnz .nextrow
REP_RET
-
-cglobal h264_biweight_8x8_10_%1
- mov t0, 8
- jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
-
-cglobal h264_biweight_8x4_10_%1
- mov t0, 4
- jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
%endmacro
INIT_XMM
@@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
BIWEIGHT_FUNC sse4
%macro BIWEIGHT_FUNC_HALF 1
-cglobal h264_biweight_4x8_10_%1
- BIWEIGHT_PROLOGUE 4
+cglobal h264_biweight_4_10_%1
+ BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1
- lea r4, [t1*2]
+ sar r3d, 1
+ lea r4, [r2*2]
.nextrow
- BIWEIGHT %1, 0, t1
+ BIWEIGHT %1, 0, r2
movh [r0 ], m0
- movhps [r0+t1], m0
+ movhps [r0+r2], m0
add r0, r4
add r1, r4
- dec t0
+ dec r3d
jnz .nextrow
REP_RET
-
-cglobal h264_biweight_4x4_10_%1
- mov t0, 2
- jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
-
-cglobal h264_biweight_4x2_10_%1
- mov t0, 1
- jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
%endmacro
INIT_XMM
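
The 10-bit functions differ from the 8-bit ones mainly in the pixel type, the clipping range, and the offset scaling (offset <<= BIT_DEPTH - 8, i.e. << 2 here). A hedged C sketch of the unidirectional case with hypothetical names, mirroring the template's formula rather than the packed arithmetic used by the assembly:

    #include <stdint.h>

    static inline uint16_t clip_pixel10(int v)
    {
        const int pixel_max = (1 << 10) - 1;
        return v < 0 ? 0 : v > pixel_max ? pixel_max : v;
    }

    /* Explicit weighted prediction, 10-bit samples; stride counted in pixels
     * here (the assembly works on byte strides). */
    static void weight_pixels10_c(uint16_t *block, int stride, int width,
                                  int height, int log2_denom, int weight,
                                  int offset)
    {
        int x, y;
        offset <<= 2;                       /* offset <<= BIT_DEPTH - 8 */
        for (y = 0; y < height; y++, block += stride)
            for (x = 0; x < width; x++) {
                int v = block[x] * weight;
                if (log2_denom)
                    v = (v + (1 << (log2_denom - 1))) >> log2_denom;
                block[x] = clip_pixel10(v + offset);
            }
    }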
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 71beb262c9..b337462aec 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -298,63 +298,53 @@ LF_IFUNC(v, luma_intra, 10, mmxext)
/***********************************/
/* weighted prediction */
-#define H264_WEIGHT(W, H, OPT) \
-void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
- int stride, int log2_denom, int weight, int offset);
+#define H264_WEIGHT(W, OPT) \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
+ int stride, int height, int log2_denom, int weight, int offset);
-#define H264_BIWEIGHT(W, H, OPT) \
-void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
- uint8_t *src, int stride, int log2_denom, int weightd, \
+#define H264_BIWEIGHT(W, OPT) \
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
+ uint8_t *src, int stride, int height, int log2_denom, int weightd, \
int weights, int offset);
-#define H264_BIWEIGHT_MMX(W,H) \
-H264_WEIGHT (W, H, mmx2) \
-H264_BIWEIGHT(W, H, mmx2)
-
-#define H264_BIWEIGHT_MMX_SSE(W,H) \
-H264_BIWEIGHT_MMX(W, H) \
-H264_WEIGHT (W, H, sse2) \
-H264_BIWEIGHT (W, H, sse2) \
-H264_BIWEIGHT (W, H, ssse3)
-
-H264_BIWEIGHT_MMX_SSE(16, 16)
-H264_BIWEIGHT_MMX_SSE(16, 8)
-H264_BIWEIGHT_MMX_SSE( 8, 16)
-H264_BIWEIGHT_MMX_SSE( 8, 8)
-H264_BIWEIGHT_MMX_SSE( 8, 4)
-H264_BIWEIGHT_MMX ( 4, 8)
-H264_BIWEIGHT_MMX ( 4, 4)
-H264_BIWEIGHT_MMX ( 4, 2)
-
-#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
-void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
- int stride, int log2_denom, int weight, int offset);
-
-#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
-void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
- (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
+#define H264_BIWEIGHT_MMX(W) \
+H264_WEIGHT (W, mmx2) \
+H264_BIWEIGHT(W, mmx2)
+
+#define H264_BIWEIGHT_MMX_SSE(W) \
+H264_BIWEIGHT_MMX(W) \
+H264_WEIGHT (W, sse2) \
+H264_BIWEIGHT (W, sse2) \
+H264_BIWEIGHT (W, ssse3)
+
+H264_BIWEIGHT_MMX_SSE(16)
+H264_BIWEIGHT_MMX_SSE( 8)
+H264_BIWEIGHT_MMX ( 4)
+
+#define H264_WEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+ int stride, int height, int log2_denom, int weight, int offset);
+
+#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
+ (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
int weightd, int weights, int offset);
-#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
-H264_WEIGHT_10 (W, H, DEPTH, sse2) \
-H264_WEIGHT_10 (W, H, DEPTH, sse4) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse4)
-
-H264_BIWEIGHT_10_SSE(16, 16, 10)
-H264_BIWEIGHT_10_SSE(16, 8, 10)
-H264_BIWEIGHT_10_SSE( 8, 16, 10)
-H264_BIWEIGHT_10_SSE( 8, 8, 10)
-H264_BIWEIGHT_10_SSE( 8, 4, 10)
-H264_BIWEIGHT_10_SSE( 4, 8, 10)
-H264_BIWEIGHT_10_SSE( 4, 4, 10)
-H264_BIWEIGHT_10_SSE( 4, 2, 10)
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
+H264_WEIGHT_10 (W, DEPTH, sse2) \
+H264_WEIGHT_10 (W, DEPTH, sse4) \
+H264_BIWEIGHT_10(W, DEPTH, sse2) \
+H264_BIWEIGHT_10(W, DEPTH, sse4)
+
+H264_BIWEIGHT_10_SSE(16, 10)
+H264_BIWEIGHT_10_SSE( 8, 10)
+H264_BIWEIGHT_10_SSE( 4, 10)
void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
{
int mm_flags = av_get_cpu_flags();
- if (mm_flags & AV_CPU_FLAG_MMX2) {
+ if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) {
c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
}
@@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif
- c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
- c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
- c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
- c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
- c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
- c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
- c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
- c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
-
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
- c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
- c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
- c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
- c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
- c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
- c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
+ c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
+
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
+ c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
if (mm_flags&AV_CPU_FLAG_SSE2) {
c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
@@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
- c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
- c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
- c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
- c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
- c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
- c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
- c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
- c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
@@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
#endif
}
if (mm_flags&AV_CPU_FLAG_SSSE3) {
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
- c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
- c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
- c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
}
if (HAVE_AVX && mm_flags&AV_CPU_FLAG_AVX) {
#if HAVE_ALIGNED_STACK
@@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif
- c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2;
- c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2;
- c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2;
- c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
- c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
- c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
- c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
- c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
-
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
- c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
- c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
- c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
- c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
- c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
- c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
+ c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
+ c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
+ c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
+
+ c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
+ c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
+ c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
@@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
#endif
}
if (mm_flags&AV_CPU_FLAG_SSE4) {
- c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4;
- c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4;
- c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4;
- c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
- c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
- c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
- c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
- c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
-
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
- c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
- c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
- c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
- c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
- c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
- c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
+ c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
+ c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
+ c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
+
+ c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
+ c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
+ c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
}
#if HAVE_AVX
if (mm_flags&AV_CPU_FLAG_AVX) {