aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael Niedermayer <michaelni@gmx.at>2011-04-10 17:04:13 +0200
committerMichael Niedermayer <michaelni@gmx.at>2011-04-10 22:33:42 +0200
commita50f0bea25a3da605cd547fe3bdfd36c8764b847 (patch)
tree4f0babca53336d46d717b445a5d2a5ea6f778d92
parente7077f5e7b509c4fec62620d136a80b676428bb1 (diff)
downloadffmpeg-a50f0bea25a3da605cd547fe3bdfd36c8764b847.tar.gz
H264: Split out hl_motion and template it, this seems a bit faster
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r--libavcodec/Makefile6
-rw-r--r--libavcodec/h264.c419
-rw-r--r--libavcodec/h264.h6
-rw-r--r--libavcodec/h264_hl_motion.c164
-rw-r--r--libavcodec/h264_hl_motion.h282
5 files changed, 456 insertions, 421 deletions
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 0730124870..1cb8ee5264 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -168,7 +168,7 @@ OBJS-$(CONFIG_H263_ENCODER) += mpegvideo_enc.o mpeg4video.o \
ratecontrol.o h263.o ituh263enc.o \
flvenc.o mpeg12data.o \
mpegvideo.o error_resilience.o
-OBJS-$(CONFIG_H264_DECODER) += h264.o \
+OBJS-$(CONFIG_H264_DECODER) += h264.o h264_hl_motion.o \
h264_loopfilter.o h264_direct.o \
cabac.o h264_sei.o h264_ps.o \
h264_refs.o h264_cavlc.o h264_cabac.o\
@@ -356,7 +356,7 @@ OBJS-$(CONFIG_SVQ1_ENCODER) += svq1enc.o svq1.o \
mpegvideo.o error_resilience.o \
ituh263enc.o mpegvideo_enc.o \
ratecontrol.o mpeg12data.o
-OBJS-$(CONFIG_SVQ3_DECODER) += h264.o svq3.o \
+OBJS-$(CONFIG_SVQ3_DECODER) += h264.o svq3.o h264_hl_motion.o \
h264_loopfilter.o h264_direct.o \
h264_sei.o h264_ps.o h264_refs.o \
h264_cavlc.o h264_cabac.o cabac.o \
@@ -594,7 +594,7 @@ OBJS-$(CONFIG_DVDSUB_PARSER) += dvdsub_parser.o
OBJS-$(CONFIG_FLAC_PARSER) += flac_parser.o flacdata.o flac.o
OBJS-$(CONFIG_H261_PARSER) += h261_parser.o
OBJS-$(CONFIG_H263_PARSER) += h263_parser.o
-OBJS-$(CONFIG_H264_PARSER) += h264_parser.o h264.o \
+OBJS-$(CONFIG_H264_PARSER) += h264_parser.o h264.o h264_hl_motion.o \
cabac.o \
h264_refs.o h264_sei.o h264_direct.o \
h264_loopfilter.o h264_cabac.o \
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 1fa333c83e..6365afb303 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -250,141 +250,6 @@ static int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
return 0;
}
-static inline int get_lowest_part_list_y(H264Context *h, Picture *pic, int n, int height,
- int y_offset, int list){
- int raw_my= h->mv_cache[list][ scan8[n] ][1];
- int filter_height= (raw_my&3) ? 2 : 0;
- int full_my= (raw_my>>2) + y_offset;
- int top = full_my - filter_height, bottom = full_my + height + filter_height;
-
- return FFMAX(abs(top), bottom);
-}
-
-static inline void get_lowest_part_y(H264Context *h, int refs[2][48], int n, int height,
- int y_offset, int list0, int list1, int *nrefs){
- MpegEncContext * const s = &h->s;
- int my;
-
- y_offset += 16*(s->mb_y >> MB_FIELD);
-
- if(list0){
- int ref_n = h->ref_cache[0][ scan8[n] ];
- Picture *ref= &h->ref_list[0][ref_n];
-
- // Error resilience puts the current picture in the ref list.
- // Don't try to wait on these as it will cause a deadlock.
- // Fields can wait on each other, though.
- if(ref->thread_opaque != s->current_picture.thread_opaque ||
- (ref->reference&3) != s->picture_structure) {
- my = get_lowest_part_list_y(h, ref, n, height, y_offset, 0);
- if (refs[0][ref_n] < 0) nrefs[0] += 1;
- refs[0][ref_n] = FFMAX(refs[0][ref_n], my);
- }
- }
-
- if(list1){
- int ref_n = h->ref_cache[1][ scan8[n] ];
- Picture *ref= &h->ref_list[1][ref_n];
-
- if(ref->thread_opaque != s->current_picture.thread_opaque ||
- (ref->reference&3) != s->picture_structure) {
- my = get_lowest_part_list_y(h, ref, n, height, y_offset, 1);
- if (refs[1][ref_n] < 0) nrefs[1] += 1;
- refs[1][ref_n] = FFMAX(refs[1][ref_n], my);
- }
- }
-}
-
-/**
- * Wait until all reference frames are available for MC operations.
- *
- * @param h the H264 context
- */
-static void await_references(H264Context *h){
- MpegEncContext * const s = &h->s;
- const int mb_xy= h->mb_xy;
- const int mb_type= s->current_picture.mb_type[mb_xy];
- int refs[2][48];
- int nrefs[2] = {0};
- int ref, list;
-
- memset(refs, -1, sizeof(refs));
-
- if(IS_16X16(mb_type)){
- get_lowest_part_y(h, refs, 0, 16, 0,
- IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
- }else if(IS_16X8(mb_type)){
- get_lowest_part_y(h, refs, 0, 8, 0,
- IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
- get_lowest_part_y(h, refs, 8, 8, 8,
- IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
- }else if(IS_8X16(mb_type)){
- get_lowest_part_y(h, refs, 0, 16, 0,
- IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
- get_lowest_part_y(h, refs, 4, 16, 0,
- IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
- }else{
- int i;
-
- assert(IS_8X8(mb_type));
-
- for(i=0; i<4; i++){
- const int sub_mb_type= h->sub_mb_type[i];
- const int n= 4*i;
- int y_offset= (i&2)<<2;
-
- if(IS_SUB_8X8(sub_mb_type)){
- get_lowest_part_y(h, refs, n , 8, y_offset,
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
- }else if(IS_SUB_8X4(sub_mb_type)){
- get_lowest_part_y(h, refs, n , 4, y_offset,
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
- get_lowest_part_y(h, refs, n+2, 4, y_offset+4,
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
- }else if(IS_SUB_4X8(sub_mb_type)){
- get_lowest_part_y(h, refs, n , 8, y_offset,
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
- get_lowest_part_y(h, refs, n+1, 8, y_offset,
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
- }else{
- int j;
- assert(IS_SUB_4X4(sub_mb_type));
- for(j=0; j<4; j++){
- int sub_y_offset= y_offset + 2*(j&2);
- get_lowest_part_y(h, refs, n+j, 4, sub_y_offset,
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
- }
- }
- }
- }
-
- for(list=h->list_count-1; list>=0; list--){
- for(ref=0; ref<48 && nrefs[list]; ref++){
- int row = refs[list][ref];
- if(row >= 0){
- Picture *ref_pic = &h->ref_list[list][ref];
- int ref_field = ref_pic->reference - 1;
- int ref_field_picture = ref_pic->field_picture;
- int pic_height = 16*s->mb_height >> ref_field_picture;
-
- row <<= MB_MBAFF;
- nrefs[list]--;
-
- if(!FIELD_PICTURE && ref_field_picture){ // frame referencing two fields
- ff_thread_await_progress((AVFrame*)ref_pic, FFMIN((row >> 1) - !(row&1), pic_height-1), 1);
- ff_thread_await_progress((AVFrame*)ref_pic, FFMIN((row >> 1) , pic_height-1), 0);
- }else if(FIELD_PICTURE && !ref_field_picture){ // field referencing one field of a frame
- ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row*2 + ref_field , pic_height-1), 0);
- }else if(FIELD_PICTURE){
- ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row, pic_height-1), ref_field);
- }else{
- ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row, pic_height-1), 0);
- }
- }
- }
- }
-}
-
#if 0
/**
* DCT transforms the 16 dc values.
@@ -451,288 +316,6 @@ static void chroma_dc_dct_c(DCTELEM *block){
}
#endif
-static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
- uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
- int src_x_offset, int src_y_offset,
- qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
- MpegEncContext * const s = &h->s;
- const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
- int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
- const int luma_xy= (mx&3) + ((my&3)<<2);
- uint8_t * src_y = pic->data[0] + ((mx>>2)<<h->pixel_shift) + (my>>2)*h->mb_linesize;
- uint8_t * src_cb, * src_cr;
- int extra_width= h->emu_edge_width;
- int extra_height= h->emu_edge_height;
- int emu=0;
- const int full_mx= mx>>2;
- const int full_my= my>>2;
- const int pic_width = 16*s->mb_width;
- const int pic_height = 16*s->mb_height >> MB_FIELD;
-
- if(mx&7) extra_width -= 3;
- if(my&7) extra_height -= 3;
-
- if( full_mx < 0-extra_width
- || full_my < 0-extra_height
- || full_mx + 16/*FIXME*/ > pic_width + extra_width
- || full_my + 16/*FIXME*/ > pic_height + extra_height){
- s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2<<h->pixel_shift) - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
- src_y= s->edge_emu_buffer + (2<<h->pixel_shift) + 2*h->mb_linesize;
- emu=1;
- }
-
- qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
- if(!square){
- qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
- }
-
- if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
-
- if(MB_FIELD){
- // chroma offset when predicting from a field of opposite parity
- my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
- emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
- }
- src_cb= pic->data[1] + ((mx>>3)<<h->pixel_shift) + (my>>3)*h->mb_uvlinesize;
- src_cr= pic->data[2] + ((mx>>3)<<h->pixel_shift) + (my>>3)*h->mb_uvlinesize;
-
- if(emu){
- s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
- src_cb= s->edge_emu_buffer;
- }
- chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
-
- if(emu){
- s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
- src_cr= s->edge_emu_buffer;
- }
- chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
-}
-
-static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
- uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
- int x_offset, int y_offset,
- qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
- qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
- int list0, int list1){
- MpegEncContext * const s = &h->s;
- qpel_mc_func *qpix_op= qpix_put;
- h264_chroma_mc_func chroma_op= chroma_put;
-
- dest_y += (2*x_offset<<h->pixel_shift) + 2*y_offset*h-> mb_linesize;
- dest_cb += ( x_offset<<h->pixel_shift) + y_offset*h->mb_uvlinesize;
- dest_cr += ( x_offset<<h->pixel_shift) + y_offset*h->mb_uvlinesize;
- x_offset += 8*s->mb_x;
- y_offset += 8*(s->mb_y >> MB_FIELD);
-
- if(list0){
- Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
- mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
- dest_y, dest_cb, dest_cr, x_offset, y_offset,
- qpix_op, chroma_op);
-
- qpix_op= qpix_avg;
- chroma_op= chroma_avg;
- }
-
- if(list1){
- Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
- mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
- dest_y, dest_cb, dest_cr, x_offset, y_offset,
- qpix_op, chroma_op);
- }
-}
-
-static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
- uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
- int x_offset, int y_offset,
- qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
- h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
- h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
- int list0, int list1){
- MpegEncContext * const s = &h->s;
-
- dest_y += (2*x_offset<<h->pixel_shift) + 2*y_offset*h-> mb_linesize;
- dest_cb += ( x_offset<<h->pixel_shift) + y_offset*h->mb_uvlinesize;
- dest_cr += ( x_offset<<h->pixel_shift) + y_offset*h->mb_uvlinesize;
- x_offset += 8*s->mb_x;
- y_offset += 8*(s->mb_y >> MB_FIELD);
-
- if(list0 && list1){
- /* don't optimize for luma-only case, since B-frames usually
- * use implicit weights => chroma too. */
- uint8_t *tmp_cb = s->obmc_scratchpad;
- uint8_t *tmp_cr = s->obmc_scratchpad + (8<<h->pixel_shift);
- uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
- int refn0 = h->ref_cache[0][ scan8[n] ];
- int refn1 = h->ref_cache[1][ scan8[n] ];
-
- mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
- dest_y, dest_cb, dest_cr,
- x_offset, y_offset, qpix_put, chroma_put);
- mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
- tmp_y, tmp_cb, tmp_cr,
- x_offset, y_offset, qpix_put, chroma_put);
-
- if(h->use_weight == 2){
- int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
- int weight1 = 64 - weight0;
- luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
- chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
- chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
- }else{
- luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
- h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
- h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
- chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
- h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
- h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
- chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
- h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
- h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
- }
- }else{
- int list = list1 ? 1 : 0;
- int refn = h->ref_cache[list][ scan8[n] ];
- Picture *ref= &h->ref_list[list][refn];
- mc_dir_part(h, ref, n, square, chroma_height, delta, list,
- dest_y, dest_cb, dest_cr, x_offset, y_offset,
- qpix_put, chroma_put);
-
- luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
- h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
- if(h->use_weight_chroma){
- chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
- h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
- chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
- h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
- }
- }
-}
-
-static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
- uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
- int x_offset, int y_offset,
- qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
- qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
- h264_weight_func *weight_op, h264_biweight_func *weight_avg,
- int list0, int list1){
- if((h->use_weight==2 && list0 && list1
- && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
- || h->use_weight==1)
- mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
- x_offset, y_offset, qpix_put, chroma_put,
- weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
- else
- mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
- x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
-}
-
-static inline void prefetch_motion(H264Context *h, int list){
- /* fetch pixels for estimated mv 4 macroblocks ahead
- * optimized for 64byte cache lines */
- MpegEncContext * const s = &h->s;
- const int refn = h->ref_cache[list][scan8[0]];
- if(refn >= 0){
- const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
- const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
- uint8_t **src= h->ref_list[list][refn].data;
- int off= ((mx+64)<<h->pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize;
- s->dsp.prefetch(src[0]+off, s->linesize, 4);
- off= (((mx>>1)+64)<<h->pixel_shift) + ((my>>1) + (s->mb_x&7))*s->uvlinesize;
- s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
- }
-}
-
-static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
- qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
- qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
- h264_weight_func *weight_op, h264_biweight_func *weight_avg){
- MpegEncContext * const s = &h->s;
- const int mb_xy= h->mb_xy;
- const int mb_type= s->current_picture.mb_type[mb_xy];
-
- assert(IS_INTER(mb_type));
-
- if(HAVE_PTHREADS && s->avctx->active_thread_type&FF_THREAD_FRAME)
- await_references(h);
- prefetch_motion(h, 0);
-
- if(IS_16X16(mb_type)){
- mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
- qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
- weight_op, weight_avg,
- IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
- }else if(IS_16X8(mb_type)){
- mc_part(h, 0, 0, 4, (8<<h->pixel_shift), dest_y, dest_cb, dest_cr, 0, 0,
- qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
- &weight_op[1], &weight_avg[1],
- IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
- mc_part(h, 8, 0, 4, (8<<h->pixel_shift), dest_y, dest_cb, dest_cr, 0, 4,
- qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
- &weight_op[1], &weight_avg[1],
- IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
- }else if(IS_8X16(mb_type)){
- mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
- qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
- &weight_op[2], &weight_avg[2],
- IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
- mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
- qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
- &weight_op[2], &weight_avg[2],
- IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
- }else{
- int i;
-
- assert(IS_8X8(mb_type));
-
- for(i=0; i<4; i++){
- const int sub_mb_type= h->sub_mb_type[i];
- const int n= 4*i;
- int x_offset= (i&1)<<2;
- int y_offset= (i&2)<<1;
-
- if(IS_SUB_8X8(sub_mb_type)){
- mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
- qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
- &weight_op[3], &weight_avg[3],
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
- }else if(IS_SUB_8X4(sub_mb_type)){
- mc_part(h, n , 0, 2, (4<<h->pixel_shift), dest_y, dest_cb, dest_cr, x_offset, y_offset,
- qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
- &weight_op[4], &weight_avg[4],
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
- mc_part(h, n+2, 0, 2, (4<<h->pixel_shift), dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
- qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
- &weight_op[4], &weight_avg[4],
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
- }else if(IS_SUB_4X8(sub_mb_type)){
- mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
- qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
- &weight_op[5], &weight_avg[5],
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
- mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
- qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
- &weight_op[5], &weight_avg[5],
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
- }else{
- int j;
- assert(IS_SUB_4X4(sub_mb_type));
- for(j=0; j<4; j++){
- int sub_x_offset= x_offset + 2*(j&1);
- int sub_y_offset= y_offset + (j&2);
- mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
- qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
- &weight_op[6], &weight_avg[6],
- IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
- }
- }
- }
- }
-
- prefetch_motion(h, 1);
-}
-
static void free_tables(H264Context *h, int free_rbsp){
int i;
@@ -1692,7 +1275,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
if(h->deblocking_filter)
xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
}else if(is_h264){
- hl_motion(h, dest_y, dest_cb, dest_cr,
+ ff_hl_motion(h, dest_y, dest_cb, dest_cr,
s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
h->h264dsp.weight_h264_pixels_tab, h->h264dsp.biweight_h264_pixels_tab);
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index c9a2f0073b..ab02de1dfb 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -717,6 +717,12 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
void ff_h264_reset_sei(H264Context *h);
+void ff_hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+ qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+ h264_weight_func *weight_op, h264_biweight_func *weight_avg);
+
+
/*
o-o o-o
/ / /
diff --git a/libavcodec/h264_hl_motion.c b/libavcodec/h264_hl_motion.c
new file mode 100644
index 0000000000..654b8b8444
--- /dev/null
+++ b/libavcodec/h264_hl_motion.c
@@ -0,0 +1,164 @@
+
+#include "h264.h"
+#include "thread.h"
+
+static inline int get_lowest_part_list_y(H264Context *h, Picture *pic, int n, int height,
+ int y_offset, int list){
+ int raw_my= h->mv_cache[list][ scan8[n] ][1];
+ int filter_height= (raw_my&3) ? 2 : 0;
+ int full_my= (raw_my>>2) + y_offset;
+ int top = full_my - filter_height, bottom = full_my + height + filter_height;
+
+ return FFMAX(abs(top), bottom);
+}
+
+static inline void get_lowest_part_y(H264Context *h, int refs[2][48], int n, int height,
+ int y_offset, int list0, int list1, int *nrefs){
+ MpegEncContext * const s = &h->s;
+ int my;
+
+ y_offset += 16*(s->mb_y >> MB_FIELD);
+
+ if(list0){
+ int ref_n = h->ref_cache[0][ scan8[n] ];
+ Picture *ref= &h->ref_list[0][ref_n];
+
+ // Error resilience puts the current picture in the ref list.
+ // Don't try to wait on these as it will cause a deadlock.
+ // Fields can wait on each other, though.
+ if(ref->thread_opaque != s->current_picture.thread_opaque ||
+ (ref->reference&3) != s->picture_structure) {
+ my = get_lowest_part_list_y(h, ref, n, height, y_offset, 0);
+ if (refs[0][ref_n] < 0) nrefs[0] += 1;
+ refs[0][ref_n] = FFMAX(refs[0][ref_n], my);
+ }
+ }
+
+ if(list1){
+ int ref_n = h->ref_cache[1][ scan8[n] ];
+ Picture *ref= &h->ref_list[1][ref_n];
+
+ if(ref->thread_opaque != s->current_picture.thread_opaque ||
+ (ref->reference&3) != s->picture_structure) {
+ my = get_lowest_part_list_y(h, ref, n, height, y_offset, 1);
+ if (refs[1][ref_n] < 0) nrefs[1] += 1;
+ refs[1][ref_n] = FFMAX(refs[1][ref_n], my);
+ }
+ }
+}
+
+/**
+ * Wait until all reference frames are available for MC operations.
+ *
+ * @param h the H264 context
+ */
+static void await_references(H264Context *h){
+ MpegEncContext * const s = &h->s;
+ const int mb_xy= h->mb_xy;
+ const int mb_type= s->current_picture.mb_type[mb_xy];
+ int refs[2][48];
+ int nrefs[2] = {0};
+ int ref, list;
+
+ memset(refs, -1, sizeof(refs));
+
+ if(IS_16X16(mb_type)){
+ get_lowest_part_y(h, refs, 0, 16, 0,
+ IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
+ }else if(IS_16X8(mb_type)){
+ get_lowest_part_y(h, refs, 0, 8, 0,
+ IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
+ get_lowest_part_y(h, refs, 8, 8, 8,
+ IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
+ }else if(IS_8X16(mb_type)){
+ get_lowest_part_y(h, refs, 0, 16, 0,
+ IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
+ get_lowest_part_y(h, refs, 4, 16, 0,
+ IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
+ }else{
+ int i;
+
+ assert(IS_8X8(mb_type));
+
+ for(i=0; i<4; i++){
+ const int sub_mb_type= h->sub_mb_type[i];
+ const int n= 4*i;
+ int y_offset= (i&2)<<2;
+
+ if(IS_SUB_8X8(sub_mb_type)){
+ get_lowest_part_y(h, refs, n , 8, y_offset,
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
+ }else if(IS_SUB_8X4(sub_mb_type)){
+ get_lowest_part_y(h, refs, n , 4, y_offset,
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
+ get_lowest_part_y(h, refs, n+2, 4, y_offset+4,
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
+ }else if(IS_SUB_4X8(sub_mb_type)){
+ get_lowest_part_y(h, refs, n , 8, y_offset,
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
+ get_lowest_part_y(h, refs, n+1, 8, y_offset,
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
+ }else{
+ int j;
+ assert(IS_SUB_4X4(sub_mb_type));
+ for(j=0; j<4; j++){
+ int sub_y_offset= y_offset + 2*(j&2);
+ get_lowest_part_y(h, refs, n+j, 4, sub_y_offset,
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), nrefs);
+ }
+ }
+ }
+ }
+
+ for(list=h->list_count-1; list>=0; list--){
+ for(ref=0; ref<48 && nrefs[list]; ref++){
+ int row = refs[list][ref];
+ if(row >= 0){
+ Picture *ref_pic = &h->ref_list[list][ref];
+ int ref_field = ref_pic->reference - 1;
+ int ref_field_picture = ref_pic->field_picture;
+ int pic_height = 16*s->mb_height >> ref_field_picture;
+
+ row <<= MB_MBAFF;
+ nrefs[list]--;
+
+ if(!FIELD_PICTURE && ref_field_picture){ // frame referencing two fields
+ ff_thread_await_progress((AVFrame*)ref_pic, FFMIN((row >> 1) - !(row&1), pic_height-1), 1);
+ ff_thread_await_progress((AVFrame*)ref_pic, FFMIN((row >> 1) , pic_height-1), 0);
+ }else if(FIELD_PICTURE && !ref_field_picture){ // field referencing one field of a frame
+ ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row*2 + ref_field , pic_height-1), 0);
+ }else if(FIELD_PICTURE){
+ ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row, pic_height-1), ref_field);
+ }else{
+ ff_thread_await_progress((AVFrame*)ref_pic, FFMIN(row, pic_height-1), 0);
+ }
+ }
+ }
+ }
+}
+
+#define FUNC(a) a ## _8
+#define PIXEL_SHIFT 0
+#include "h264_hl_motion.h"
+
+#undef PIXEL_SHIFT
+#undef FUNC
+#define FUNC(a) a ## _16
+#define PIXEL_SHIFT 1
+#include "h264_hl_motion.h"
+
+void ff_hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+ qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+ h264_weight_func *weight_op, h264_biweight_func *weight_avg){
+ if(h->pixel_shift){
+ hl_motion_16(h, dest_y, dest_cb, dest_cr,
+ qpix_put, chroma_put,
+ qpix_avg, chroma_avg,
+ weight_op, weight_avg);
+ }else
+ hl_motion_8(h, dest_y, dest_cb, dest_cr,
+ qpix_put, chroma_put,
+ qpix_avg, chroma_avg,
+ weight_op, weight_avg);
+}
diff --git a/libavcodec/h264_hl_motion.h b/libavcodec/h264_hl_motion.h
new file mode 100644
index 0000000000..f354251fc9
--- /dev/null
+++ b/libavcodec/h264_hl_motion.h
@@ -0,0 +1,282 @@
+
+static inline void FUNC(mc_dir_part)(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
+ uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ int src_x_offset, int src_y_offset,
+ qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
+ MpegEncContext * const s = &h->s;
+ const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
+ int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
+ const int luma_xy= (mx&3) + ((my&3)<<2);
+ uint8_t * src_y = pic->data[0] + ((mx>>2)<<PIXEL_SHIFT) + (my>>2)*h->mb_linesize;
+ uint8_t * src_cb, * src_cr;
+ int extra_width= h->emu_edge_width;
+ int extra_height= h->emu_edge_height;
+ int emu=0;
+ const int full_mx= mx>>2;
+ const int full_my= my>>2;
+ const int pic_width = 16*s->mb_width;
+ const int pic_height = 16*s->mb_height >> MB_FIELD;
+
+ if(mx&7) extra_width -= 3;
+ if(my&7) extra_height -= 3;
+
+ if( full_mx < 0-extra_width
+ || full_my < 0-extra_height
+ || full_mx + 16/*FIXME*/ > pic_width + extra_width
+ || full_my + 16/*FIXME*/ > pic_height + extra_height){
+ s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2<<PIXEL_SHIFT) - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
+ src_y= s->edge_emu_buffer + (2<<PIXEL_SHIFT) + 2*h->mb_linesize;
+ emu=1;
+ }
+
+ qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
+ if(!square){
+ qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
+ }
+
+ if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
+
+ if(MB_FIELD){
+ // chroma offset when predicting from a field of opposite parity
+ my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
+ emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
+ }
+ src_cb= pic->data[1] + ((mx>>3)<<PIXEL_SHIFT) + (my>>3)*h->mb_uvlinesize;
+ src_cr= pic->data[2] + ((mx>>3)<<PIXEL_SHIFT) + (my>>3)*h->mb_uvlinesize;
+
+ if(emu){
+ s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+ src_cb= s->edge_emu_buffer;
+ }
+ chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
+
+ if(emu){
+ s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+ src_cr= s->edge_emu_buffer;
+ }
+ chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
+}
+
+static inline void FUNC(mc_part_std)(H264Context *h, int n, int square, int chroma_height, int delta,
+ uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ int x_offset, int y_offset,
+ qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+ qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+ int list0, int list1){
+ MpegEncContext * const s = &h->s;
+ qpel_mc_func *qpix_op= qpix_put;
+ h264_chroma_mc_func chroma_op= chroma_put;
+
+ dest_y += (2*x_offset<<PIXEL_SHIFT) + 2*y_offset*h-> mb_linesize;
+ dest_cb += ( x_offset<<PIXEL_SHIFT) + y_offset*h->mb_uvlinesize;
+ dest_cr += ( x_offset<<PIXEL_SHIFT) + y_offset*h->mb_uvlinesize;
+ x_offset += 8*s->mb_x;
+ y_offset += 8*(s->mb_y >> MB_FIELD);
+
+ if(list0){
+ Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
+ FUNC(mc_dir_part)(h, ref, n, square, chroma_height, delta, 0,
+ dest_y, dest_cb, dest_cr, x_offset, y_offset,
+ qpix_op, chroma_op);
+
+ qpix_op= qpix_avg;
+ chroma_op= chroma_avg;
+ }
+
+ if(list1){
+ Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
+ FUNC(mc_dir_part)(h, ref, n, square, chroma_height, delta, 1,
+ dest_y, dest_cb, dest_cr, x_offset, y_offset,
+ qpix_op, chroma_op);
+ }
+}
+
+static inline void FUNC(mc_part_weighted)(H264Context *h, int n, int square, int chroma_height, int delta,
+ uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ int x_offset, int y_offset,
+ qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+ h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
+ h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
+ int list0, int list1){
+ MpegEncContext * const s = &h->s;
+
+ dest_y += (2*x_offset<<PIXEL_SHIFT) + 2*y_offset*h-> mb_linesize;
+ dest_cb += ( x_offset<<PIXEL_SHIFT) + y_offset*h->mb_uvlinesize;
+ dest_cr += ( x_offset<<PIXEL_SHIFT) + y_offset*h->mb_uvlinesize;
+ x_offset += 8*s->mb_x;
+ y_offset += 8*(s->mb_y >> MB_FIELD);
+
+ if(list0 && list1){
+ /* don't optimize for luma-only case, since B-frames usually
+ * use implicit weights => chroma too. */
+ uint8_t *tmp_cb = s->obmc_scratchpad;
+ uint8_t *tmp_cr = s->obmc_scratchpad + (8<<PIXEL_SHIFT);
+ uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
+ int refn0 = h->ref_cache[0][ scan8[n] ];
+ int refn1 = h->ref_cache[1][ scan8[n] ];
+
+ FUNC(mc_dir_part)(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
+ dest_y, dest_cb, dest_cr,
+ x_offset, y_offset, qpix_put, chroma_put);
+ FUNC(mc_dir_part)(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
+ tmp_y, tmp_cb, tmp_cr,
+ x_offset, y_offset, qpix_put, chroma_put);
+
+ if(h->use_weight == 2){
+ int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
+ int weight1 = 64 - weight0;
+ luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
+ chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
+ chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
+ }else{
+ luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
+ h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
+ h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
+ chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+ h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
+ h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
+ chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+ h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
+ h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
+ }
+ }else{
+ int list = list1 ? 1 : 0;
+ int refn = h->ref_cache[list][ scan8[n] ];
+ Picture *ref= &h->ref_list[list][refn];
+ FUNC(mc_dir_part)(h, ref, n, square, chroma_height, delta, list,
+ dest_y, dest_cb, dest_cr, x_offset, y_offset,
+ qpix_put, chroma_put);
+
+ luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
+ h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
+ if(h->use_weight_chroma){
+ chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+ h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
+ chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+ h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
+ }
+ }
+}
+
+static inline void FUNC(mc_part)(H264Context *h, int n, int square, int chroma_height, int delta,
+ uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ int x_offset, int y_offset,
+ qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+ qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+ h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+ int list0, int list1){
+ if((h->use_weight==2 && list0 && list1
+ && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
+ || h->use_weight==1)
+ FUNC(mc_part_weighted)(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+ x_offset, y_offset, qpix_put, chroma_put,
+ weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
+ else
+ FUNC(mc_part_std)(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+ x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
+}
+
+static inline void FUNC(prefetch_motion)(H264Context *h, int list){
+ /* fetch pixels for estimated mv 4 macroblocks ahead
+ * optimized for 64byte cache lines */
+ MpegEncContext * const s = &h->s;
+ const int refn = h->ref_cache[list][scan8[0]];
+ if(refn >= 0){
+ const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
+ const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
+ uint8_t **src= h->ref_list[list][refn].data;
+ int off= ((mx+64)<<PIXEL_SHIFT) + (my + (s->mb_x&3)*4)*h->mb_linesize;
+ s->dsp.prefetch(src[0]+off, s->linesize, 4);
+ off= (((mx>>1)+64)<<PIXEL_SHIFT) + ((my>>1) + (s->mb_x&7))*s->uvlinesize;
+ s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
+ }
+}
+
+static void FUNC(hl_motion)(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+ qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+ qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+ h264_weight_func *weight_op, h264_biweight_func *weight_avg){
+ MpegEncContext * const s = &h->s;
+ const int mb_xy= h->mb_xy;
+ const int mb_type= s->current_picture.mb_type[mb_xy];
+
+ assert(IS_INTER(mb_type));
+
+ if(HAVE_PTHREADS && s->avctx->active_thread_type&FF_THREAD_FRAME)
+ await_references(h);
+ FUNC(prefetch_motion)(h, 0);
+
+ if(IS_16X16(mb_type)){
+ FUNC(mc_part)(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
+ qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
+ weight_op, weight_avg,
+ IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+ }else if(IS_16X8(mb_type)){
+ FUNC(mc_part)(h, 0, 0, 4, (8<<PIXEL_SHIFT), dest_y, dest_cb, dest_cr, 0, 0,
+ qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
+ &weight_op[1], &weight_avg[1],
+ IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+ FUNC(mc_part)(h, 8, 0, 4, (8<<PIXEL_SHIFT), dest_y, dest_cb, dest_cr, 0, 4,
+ qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
+ &weight_op[1], &weight_avg[1],
+ IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
+ }else if(IS_8X16(mb_type)){
+ FUNC(mc_part)(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
+ qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+ &weight_op[2], &weight_avg[2],
+ IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+ FUNC(mc_part)(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
+ qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+ &weight_op[2], &weight_avg[2],
+ IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
+ }else{
+ int i;
+
+ assert(IS_8X8(mb_type));
+
+ for(i=0; i<4; i++){
+ const int sub_mb_type= h->sub_mb_type[i];
+ const int n= 4*i;
+ int x_offset= (i&1)<<2;
+ int y_offset= (i&2)<<1;
+
+ if(IS_SUB_8X8(sub_mb_type)){
+ FUNC(mc_part)(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+ qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+ &weight_op[3], &weight_avg[3],
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+ }else if(IS_SUB_8X4(sub_mb_type)){
+ FUNC(mc_part)(h, n , 0, 2, (4<<PIXEL_SHIFT), dest_y, dest_cb, dest_cr, x_offset, y_offset,
+ qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
+ &weight_op[4], &weight_avg[4],
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+ FUNC(mc_part)(h, n+2, 0, 2, (4<<PIXEL_SHIFT), dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
+ qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
+ &weight_op[4], &weight_avg[4],
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+ }else if(IS_SUB_4X8(sub_mb_type)){
+ FUNC(mc_part)(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+ qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+ &weight_op[5], &weight_avg[5],
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+ FUNC(mc_part)(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
+ qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+ &weight_op[5], &weight_avg[5],
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+ }else{
+ int j;
+ assert(IS_SUB_4X4(sub_mb_type));
+ for(j=0; j<4; j++){
+ int sub_x_offset= x_offset + 2*(j&1);
+ int sub_y_offset= y_offset + (j&2);
+ FUNC(mc_part)(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
+ qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+ &weight_op[6], &weight_avg[6],
+ IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+ }
+ }
+ }
+ }
+
+ FUNC(prefetch_motion)(h, 1);
+}