diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2011-10-12 05:33:52 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-10-12 05:40:57 +0200 |
commit | b81f8880e010ccdef3604d9beb681d3c4c6a7bc0 (patch) | |
tree | 0a6534f8fd2d53e84f6b5716047e473b1598cd3d /libavcodec | |
parent | b75d89a4784e027cec99236d58e9bd4121ec4309 (diff) | |
parent | 5f3fb599536dd5bceb1d45cb73cd0b0ce3e5560c (diff) | |
download | ffmpeg-b81f8880e010ccdef3604d9beb681d3c4c6a7bc0.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master: (23 commits)
fix AC3ENC_OPT_MODE_ON/OFF
h264: fix HRD parameters parsing
prores: implement multithreading.
prores: idct sse2/sse4 optimizations.
swscale: use aligned move for storage into temporary buffer.
prores: extract idct into its own dspcontext and merge with put_pixels.
h264: fix invalid shifts in init_cavlc_level_tab()
intfloat_readwrite: fix signed addition overflows
mov: do not misreport empty stts
mov: cosmetics, fix for and if spacing
id3v2: fix NULL pointer dereference
mov: read album_artist atom
mov: fix disc/track numbers and totals
doc: fix references to obsolete presets directories for avconv/ffmpeg
flashsv: return more meaningful error value
flashsv: fix typo in av_log() message
smacker: validate channels and sample format.
smacker: check buffer size before reading output size
smacker: validate number of channels
smacker: Separate audio flags from sample rates in smacker demuxer.
...
Conflicts:
cmdutils.h
doc/ffmpeg.texi
libavcodec/Makefile
libavcodec/motion_est_template.c
libavformat/id3v2.c
libavformat/mov.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/Makefile | 2 | ||||
-rw-r--r-- | libavcodec/ac3enc.h | 4 | ||||
-rw-r--r-- | libavcodec/dsputil.c | 65 | ||||
-rw-r--r-- | libavcodec/dsputil.h | 2 | ||||
-rw-r--r-- | libavcodec/flashsv.c | 4 | ||||
-rw-r--r-- | libavcodec/golomb.h | 14 | ||||
-rw-r--r-- | libavcodec/h264_cavlc.c | 9 | ||||
-rw-r--r-- | libavcodec/h264_ps.c | 6 | ||||
-rw-r--r-- | libavcodec/motion_est.c | 2 | ||||
-rw-r--r-- | libavcodec/motion_est_template.c | 36 | ||||
-rw-r--r-- | libavcodec/mpegvideo.h | 2 | ||||
-rw-r--r-- | libavcodec/proresdec_lgpl.c | 159 | ||||
-rw-r--r-- | libavcodec/proresdsp.c | 63 | ||||
-rw-r--r-- | libavcodec/proresdsp.h | 40 | ||||
-rw-r--r-- | libavcodec/simple_idct.c | 17 | ||||
-rw-r--r-- | libavcodec/simple_idct.h | 6 | ||||
-rw-r--r-- | libavcodec/sipr.c | 6 | ||||
-rw-r--r-- | libavcodec/smacker.c | 17 | ||||
-rw-r--r-- | libavcodec/x86/Makefile | 2 | ||||
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 2 | ||||
-rw-r--r-- | libavcodec/x86/proresdsp-init.c | 54 | ||||
-rw-r--r-- | libavcodec/x86/proresdsp.asm | 432 |
22 files changed, 788 insertions, 156 deletions
diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 2978329801..78a0a300b3 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -314,7 +314,7 @@ OBJS-$(CONFIG_PNG_ENCODER) += png.o pngenc.o OBJS-$(CONFIG_PPM_DECODER) += pnmdec.o pnm.o OBJS-$(CONFIG_PPM_ENCODER) += pnmenc.o pnm.o OBJS-$(CONFIG_PRORES_GPL_DECODER) += proresdec_gpl.o -OBJS-$(CONFIG_PRORES_LGPL_DECODER) += proresdec_lgpl.o +OBJS-$(CONFIG_PRORES_LGPL_DECODER) += proresdec_lgpl.o proresdsp.o OBJS-$(CONFIG_PTX_DECODER) += ptx.o OBJS-$(CONFIG_QCELP_DECODER) += qcelpdec.o celp_math.o \ celp_filters.o acelp_vectors.o \ diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h index bf5ccea19f..4a017498f6 100644 --- a/libavcodec/ac3enc.h +++ b/libavcodec/ac3enc.h @@ -73,8 +73,8 @@ typedef int64_t CoefSumType; #define AC3ENC_OPT_OFF 0 #define AC3ENC_OPT_ON 1 #define AC3ENC_OPT_NOT_INDICATED 0 -#define AC3ENC_OPT_MODE_ON 1 -#define AC3ENC_OPT_MODE_OFF 2 +#define AC3ENC_OPT_MODE_ON 2 +#define AC3ENC_OPT_MODE_OFF 1 /* specific option values */ #define AC3ENC_OPT_LARGE_ROOM 1 diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index ebce93039f..c451c97155 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -144,6 +144,41 @@ void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_s } } +void ff_init_scantable_permutation(uint8_t *idct_permutation, + int idct_permutation_type) +{ + int i; + + switch(idct_permutation_type){ + case FF_NO_IDCT_PERM: + for(i=0; i<64; i++) + idct_permutation[i]= i; + break; + case FF_LIBMPEG2_IDCT_PERM: + for(i=0; i<64; i++) + idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); + break; + case FF_SIMPLE_IDCT_PERM: + for(i=0; i<64; i++) + idct_permutation[i]= simple_mmx_permutation[i]; + break; + case FF_TRANSPOSE_IDCT_PERM: + for(i=0; i<64; i++) + idct_permutation[i]= ((i&7)<<3) | (i>>3); + break; + case FF_PARTTRANS_IDCT_PERM: + for(i=0; i<64; i++) + idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3); + break; + 
case FF_SSE2_IDCT_PERM: + for(i=0; i<64; i++) + idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7]; + break; + default: + av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n"); + } +} + static int pix_sum_c(uint8_t * pix, int line_size) { int s, i, j; @@ -3107,32 +3142,6 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i]; } - switch(c->idct_permutation_type){ - case FF_NO_IDCT_PERM: - for(i=0; i<64; i++) - c->idct_permutation[i]= i; - break; - case FF_LIBMPEG2_IDCT_PERM: - for(i=0; i<64; i++) - c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); - break; - case FF_SIMPLE_IDCT_PERM: - for(i=0; i<64; i++) - c->idct_permutation[i]= simple_mmx_permutation[i]; - break; - case FF_TRANSPOSE_IDCT_PERM: - for(i=0; i<64; i++) - c->idct_permutation[i]= ((i&7)<<3) | (i>>3); - break; - case FF_PARTTRANS_IDCT_PERM: - for(i=0; i<64; i++) - c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3); - break; - case FF_SSE2_IDCT_PERM: - for(i=0; i<64; i++) - c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7]; - break; - default: - av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n"); - } + ff_init_scantable_permutation(c->idct_permutation, + c->idct_permutation_type); } diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 057c41cca5..22c51a0962 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -204,6 +204,8 @@ typedef struct ScanTable{ } ScanTable; void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable); +void ff_init_scantable_permutation(uint8_t *idct_permutation, + int idct_permutation_type); #define EMULATED_EDGE(depth) \ void ff_emulated_edge_mc_ ## depth (uint8_t *buf, const uint8_t *src, int linesize,\ diff --git a/libavcodec/flashsv.c b/libavcodec/flashsv.c index 6b649bb874..3861344cb7 100644 --- a/libavcodec/flashsv.c +++ b/libavcodec/flashsv.c @@ -301,7 +301,7 @@ static 
int flashsv_decode_frame(AVCodecContext *avctx, void *data, /* check for changes of image width and image height */ if (avctx->width != s->image_width || avctx->height != s->image_height) { av_log(avctx, AV_LOG_ERROR, - "Frame width or height differs from first frames!\n"); + "Frame width or height differs from first frame!\n"); av_log(avctx, AV_LOG_ERROR, "fh = %d, fv %d vs ch = %d, cv = %d\n", avctx->height, avctx->width, s->image_height, s->image_width); return AVERROR_INVALIDDATA; @@ -367,7 +367,7 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data, if (s->color_depth != 0 && s->color_depth != 2) { av_log(avctx, AV_LOG_ERROR, "%dx%d invalid color depth %d\n", i, j, s->color_depth); - return -1; + return AVERROR_INVALIDDATA; } if (has_diff) { diff --git a/libavcodec/golomb.h b/libavcodec/golomb.h index 90eeb30b54..8dff0322a7 100644 --- a/libavcodec/golomb.h +++ b/libavcodec/golomb.h @@ -75,6 +75,20 @@ static inline int get_ue_golomb(GetBitContext *gb){ } } +/** + * Read an unsigned Exp-Golomb code in the range 0 to UINT32_MAX-1. + */ +static inline unsigned get_ue_golomb_long(GetBitContext *gb) +{ + unsigned buf, log; + + buf = show_bits_long(gb, 32); + log = 31 - av_log2(buf); + skip_bits_long(gb, log); + + return get_bits_long(gb, log + 1) - 1; +} + /** * read unsigned exp golomb code, constraint to a max of 31. * the return value is undefined if the stored value exceeds 31. 
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c index d2b979fc2c..b0dc999132 100644 --- a/libavcodec/h264_cavlc.c +++ b/libavcodec/h264_cavlc.c @@ -298,17 +298,18 @@ static inline int pred_non_zero_count(H264Context *h, int n){ } static av_cold void init_cavlc_level_tab(void){ - int suffix_length, mask; + int suffix_length; unsigned int i; for(suffix_length=0; suffix_length<7; suffix_length++){ for(i=0; i<(1<<LEVEL_TAB_BITS); i++){ int prefix= LEVEL_TAB_BITS - av_log2(2*i); - int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length); - mask= -(level_code&1); - level_code= (((2+level_code)>>1) ^ mask) - mask; if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){ + int level_code = (prefix << suffix_length) + + (i >> (av_log2(i) - suffix_length)) - (1 << suffix_length); + int mask = -(level_code&1); + level_code = (((2 + level_code) >> 1) ^ mask) - mask; cavlc_level_tab[suffix_length][i][0]= level_code; cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length; }else if(prefix + 1 <= LEVEL_TAB_BITS){ diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c index f0ec0ff9a9..158ca3205a 100644 --- a/libavcodec/h264_ps.c +++ b/libavcodec/h264_ps.c @@ -143,8 +143,8 @@ static inline int decode_hrd_parameters(H264Context *h, SPS *sps){ get_bits(&s->gb, 4); /* bit_rate_scale */ get_bits(&s->gb, 4); /* cpb_size_scale */ for(i=0; i<cpb_count; i++){ - get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */ - get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */ + get_ue_golomb_long(&s->gb); /* bit_rate_value_minus1 */ + get_ue_golomb_long(&s->gb); /* cpb_size_value_minus1 */ get_bits1(&s->gb); /* cbr_flag */ } sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1; @@ -494,6 +494,7 @@ int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){ unsigned int pps_id= get_ue_golomb(&s->gb); PPS *pps; const int qp_bd_offset = 6*(h->sps.bit_depth_luma-8); + int bits_left; if(pps_id >= MAX_PPS_COUNT) { 
av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id); @@ -570,6 +571,7 @@ int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){ memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4)); memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8)); + bits_left = bit_length - get_bits_count(&s->gb); if(get_bits_count(&s->gb) < bit_length){ pps->transform_8x8_mode= get_bits1(&s->gb); decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8); diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c index c8af093653..319aaf97ca 100644 --- a/libavcodec/motion_est.c +++ b/libavcodec/motion_est.c @@ -52,7 +52,7 @@ static inline int sad_hpel_motion_search(MpegEncContext * s, int src_index, int ref_index, int size, int h); -static inline int update_map_generation(MotionEstContext *c) +static inline unsigned update_map_generation(MotionEstContext *c) { c->map_generation+= 1<<(ME_MAP_MV_BITS*2); if(c->map_generation==0){ diff --git a/libavcodec/motion_est_template.c b/libavcodec/motion_est_template.c index 72150b4092..b7b7b6d521 100644 --- a/libavcodec/motion_est_template.c +++ b/libavcodec/motion_est_template.c @@ -90,8 +90,8 @@ static int hpel_motion_search(MpegEncContext * s, + (mv_penalty[bx - pred_x] + mv_penalty[by+2 - pred_y])*c->penalty_factor; #if 1 - int key; - int map_generation= c->map_generation; + unsigned key; + unsigned map_generation= c->map_generation; #ifndef NDEBUG uint32_t *map= c->map; #endif @@ -210,7 +210,7 @@ static int qpel_motion_search(MpegEncContext * s, const int mx = *mx_ptr; const int my = *my_ptr; const int penalty_factor= c->sub_penalty_factor; - const int map_generation= c->map_generation; + const unsigned map_generation = c->map_generation; const int subpel_quality= c->avctx->me_subpel_quality; uint32_t *map= c->map; me_cmp_func cmpf, chroma_cmpf; @@ -356,7 
+356,7 @@ static int qpel_motion_search(MpegEncContext * s, #define CHECK_MV(x,y)\ {\ - const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\ + const unsigned key = ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\ const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\ assert((x) >= xmin);\ assert((x) <= xmax);\ @@ -384,7 +384,7 @@ static int qpel_motion_search(MpegEncContext * s, #define CHECK_MV_DIR(x,y,new_dir)\ {\ - const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\ + const unsigned key = ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\ const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\ /*printf("check_mv_dir %d %d %d\n", x, y, new_dir);*/\ if(map[index]!=key){\ @@ -422,13 +422,13 @@ static av_always_inline int small_diamond_search(MpegEncContext * s, int *best, int next_dir=-1; LOAD_COMMON LOAD_COMMON2 - int map_generation= c->map_generation; + unsigned map_generation = c->map_generation; cmpf= s->dsp.me_cmp[size]; chroma_cmpf= s->dsp.me_cmp[size+1]; { /* ensure that the best point is in the MAP as h/qpel refinement needs it */ - const int key= (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation; + const unsigned key = (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation; const int index= ((best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1); if(map[index]!=key){ //this will be executed only very rarey score_map[index]= cmp(s, best[0], best[1], 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags); @@ -464,7 +464,7 @@ static int funny_diamond_search(MpegEncContext * s, int *best, int dmin, int dia_size; LOAD_COMMON LOAD_COMMON2 - int map_generation= c->map_generation; + unsigned map_generation = c->map_generation; cmpf= s->dsp.me_cmp[size]; chroma_cmpf= s->dsp.me_cmp[size+1]; @@ -505,7 +505,7 @@ static int hex_search(MpegEncContext * s, int *best, int dmin, me_cmp_func cmpf, chroma_cmpf; LOAD_COMMON LOAD_COMMON2 - int map_generation= c->map_generation; + unsigned map_generation = c->map_generation; int x,y,d; const int 
dec= dia_size & (dia_size-1); @@ -539,7 +539,7 @@ static int l2s_dia_search(MpegEncContext * s, int *best, int dmin, me_cmp_func cmpf, chroma_cmpf; LOAD_COMMON LOAD_COMMON2 - int map_generation= c->map_generation; + unsigned map_generation = c->map_generation; int x,y,i,d; int dia_size= c->dia_size&0xFF; const int dec= dia_size & (dia_size-1); @@ -577,7 +577,7 @@ static int umh_search(MpegEncContext * s, int *best, int dmin, me_cmp_func cmpf, chroma_cmpf; LOAD_COMMON LOAD_COMMON2 - int map_generation= c->map_generation; + unsigned map_generation = c->map_generation; int x,y,x2,y2, i, j, d; const int dia_size= c->dia_size&0xFE; static const int hex[16][2]={{-4,-2}, {-4,-1}, {-4, 0}, {-4, 1}, {-4, 2}, @@ -624,7 +624,7 @@ static int full_search(MpegEncContext * s, int *best, int dmin, me_cmp_func cmpf, chroma_cmpf; LOAD_COMMON LOAD_COMMON2 - int map_generation= c->map_generation; + unsigned map_generation = c->map_generation; int x,y, d; const int dia_size= c->dia_size&0xFF; @@ -653,7 +653,7 @@ static int full_search(MpegEncContext * s, int *best, int dmin, #define SAB_CHECK_MV(ax,ay)\ {\ - const int key= ((ay)<<ME_MAP_MV_BITS) + (ax) + map_generation;\ + const unsigned key = ((ay)<<ME_MAP_MV_BITS) + (ax) + map_generation;\ const int index= (((ay)<<ME_MAP_SHIFT) + (ax))&(ME_MAP_SIZE-1);\ /*printf("sab check %d %d\n", ax, ay);*/\ if(map[index]!=key){\ @@ -692,7 +692,7 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin, int i, j; LOAD_COMMON LOAD_COMMON2 - int map_generation= c->map_generation; + unsigned map_generation = c->map_generation; cmpf= s->dsp.me_cmp[size]; chroma_cmpf= s->dsp.me_cmp[size+1]; @@ -777,7 +777,7 @@ static int var_diamond_search(MpegEncContext * s, int *best, int dmin, int dia_size; LOAD_COMMON LOAD_COMMON2 - int map_generation= c->map_generation; + unsigned map_generation = c->map_generation; cmpf= s->dsp.me_cmp[size]; chroma_cmpf= s->dsp.me_cmp[size+1]; @@ -869,7 +869,7 @@ static av_always_inline int 
epzs_motion_search_internal(MpegEncContext * s, int int d; ///< the score (cmp + penalty) of any given mv int dmin; /**< the best value of d, i.e. the score corresponding to the mv stored in best[]. */ - int map_generation; + unsigned map_generation; int penalty_factor; const int ref_mv_stride= s->mb_stride; //pass as arg FIXME const int ref_mv_xy= s->mb_x + s->mb_y*ref_mv_stride; //add to last_mv beforepassing FIXME @@ -997,7 +997,7 @@ static int epzs_motion_search4(MpegEncContext * s, MotionEstContext * const c= &s->me; int best[2]={0, 0}; int d, dmin; - int map_generation; + unsigned map_generation; const int penalty_factor= c->penalty_factor; const int size=1; const int h=8; @@ -1057,7 +1057,7 @@ static int epzs_motion_search2(MpegEncContext * s, MotionEstContext * const c= &s->me; int best[2]={0, 0}; int d, dmin; - int map_generation; + unsigned map_generation; const int penalty_factor= c->penalty_factor; const int size=0; //FIXME pass as arg const int h=8; diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h index 7d6c8d24b8..f640c8e315 100644 --- a/libavcodec/mpegvideo.h +++ b/libavcodec/mpegvideo.h @@ -156,7 +156,7 @@ typedef struct MotionEstContext{ int best_bits; uint32_t *map; ///< map to avoid duplicate evaluations uint32_t *score_map; ///< map to store the scores - int map_generation; + unsigned map_generation; int pre_penalty_factor; int penalty_factor; /**< an estimate of the bits required to code a given mv value, e.g. 
(1,0) takes diff --git a/libavcodec/proresdec_lgpl.c b/libavcodec/proresdec_lgpl.c index 89e5582a37..156f87530a 100644 --- a/libavcodec/proresdec_lgpl.c +++ b/libavcodec/proresdec_lgpl.c @@ -34,17 +34,19 @@ #include "libavutil/intmath.h" #include "avcodec.h" -#include "dsputil.h" +#include "proresdsp.h" #include "get_bits.h" -#define BITS_PER_SAMPLE 10 ///< output precision of that decoder -#define BIAS (1 << (BITS_PER_SAMPLE - 1)) ///< bias value for converting signed pixels into unsigned ones -#define CLIP_MIN (1 << (BITS_PER_SAMPLE - 8)) ///< minimum value for clipping resulting pixels -#define CLIP_MAX (1 << BITS_PER_SAMPLE) - CLIP_MIN - 1 ///< maximum value for clipping resulting pixels - +typedef struct { + const uint8_t *index; ///< pointers to the data of this slice + int slice_num; + int x_pos, y_pos; + int slice_width; + DECLARE_ALIGNED(16, DCTELEM, blocks[8 * 4 * 64]); +} ProresThreadData; typedef struct { - DSPContext dsp; + ProresDSPContext dsp; AVFrame picture; ScanTable scantable; int scantable_type; ///< -1 = uninitialized, 0 = progressive, 1/2 = interlaced @@ -57,9 +59,9 @@ typedef struct { int prev_slice_sf; ///< scalefactor of the previous decoded slice DECLARE_ALIGNED(16, int16_t, qmat_luma_scaled[64]); DECLARE_ALIGNED(16, int16_t, qmat_chroma_scaled[64]); - DECLARE_ALIGNED(16, DCTELEM, blocks[8 * 4 * 64]); int total_slices; ///< total number of slices in a picture - const uint8_t **slice_data_index; ///< array of pointers to the data of each slice + ProresThreadData *slice_data; + int pic_num; int chroma_factor; int mb_chroma_factor; int num_chroma_blocks; ///< number of chrominance blocks in a macroblock @@ -100,12 +102,12 @@ static av_cold int decode_init(AVCodecContext *avctx) ProresContext *ctx = avctx->priv_data; ctx->total_slices = 0; - ctx->slice_data_index = 0; + ctx->slice_data = NULL; avctx->pix_fmt = PIX_FMT_YUV422P10; // set default pixel format - avctx->bits_per_raw_sample = BITS_PER_SAMPLE; - dsputil_init(&ctx->dsp, avctx); + 
avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE; + ff_proresdsp_init(&ctx->dsp); avctx->coded_frame = &ctx->picture; avcodec_get_frame_defaults(&ctx->picture); @@ -271,9 +273,9 @@ static int decode_picture_header(ProresContext *ctx, const uint8_t *buf, } if (ctx->total_slices != num_slices) { - av_freep(&ctx->slice_data_index); - ctx->slice_data_index = av_malloc((num_slices + 1) * sizeof(uint8_t*)); - if (!ctx->slice_data_index) + av_freep(&ctx->slice_data); + ctx->slice_data = av_malloc((num_slices + 1) * sizeof(ctx->slice_data[0])); + if (!ctx->slice_data) return AVERROR(ENOMEM); ctx->total_slices = num_slices; } @@ -288,10 +290,10 @@ static int decode_picture_header(ProresContext *ctx, const uint8_t *buf, data_ptr = index_ptr + num_slices * 2; for (i = 0; i < num_slices; i++) { - ctx->slice_data_index[i] = data_ptr; + ctx->slice_data[i].index = data_ptr; data_ptr += AV_RB16(index_ptr + i * 2); } - ctx->slice_data_index[i] = data_ptr; + ctx->slice_data[i].index = data_ptr; if (data_ptr > buf + data_size) { av_log(avctx, AV_LOG_ERROR, "out of slice data\n"); @@ -449,52 +451,11 @@ static inline void decode_ac_coeffs(GetBitContext *gb, DCTELEM *out, } -#define CLIP_AND_BIAS(x) (av_clip((x) + BIAS, CLIP_MIN, CLIP_MAX)) - -/** - * Add bias value, clamp and output pixels of a slice - */ -static void put_pixels(const DCTELEM *in, uint16_t *out, int stride, - int mbs_per_slice, int blocks_per_mb) -{ - int mb, x, y, src_offset, dst_offset; - const DCTELEM *src1, *src2; - uint16_t *dst1, *dst2; - - src1 = in; - src2 = in + (blocks_per_mb << 5); - dst1 = out; - dst2 = out + (stride << 3); - - for (mb = 0; mb < mbs_per_slice; mb++) { - for (y = 0, dst_offset = 0; y < 8; y++, dst_offset += stride) { - for (x = 0; x < 8; x++) { - src_offset = (y << 3) + x; - - dst1[dst_offset + x] = CLIP_AND_BIAS(src1[src_offset]); - dst2[dst_offset + x] = CLIP_AND_BIAS(src2[src_offset]); - - if (blocks_per_mb > 2) { - dst1[dst_offset + x + 8] = - CLIP_AND_BIAS(src1[src_offset + 64]); - 
dst2[dst_offset + x + 8] = - CLIP_AND_BIAS(src2[src_offset + 64]); - } - } - } - - src1 += blocks_per_mb << 6; - src2 += blocks_per_mb << 6; - dst1 += blocks_per_mb << 2; - dst2 += blocks_per_mb << 2; - } -} - - /** * Decode a slice plane (luma or chroma). */ -static void decode_slice_plane(ProresContext *ctx, const uint8_t *buf, +static void decode_slice_plane(ProresContext *ctx, ProresThreadData *td, + const uint8_t *buf, int data_size, uint16_t *out_ptr, int linesize, int mbs_per_slice, int blocks_per_mb, int plane_size_factor, @@ -502,43 +463,47 @@ static void decode_slice_plane(ProresContext *ctx, const uint8_t *buf, { GetBitContext gb; DCTELEM *block_ptr; - int i, blk_num, blocks_per_slice; + int mb_num, blocks_per_slice; blocks_per_slice = mbs_per_slice * blocks_per_mb; - memset(ctx->blocks, 0, 8 * 4 * 64 * sizeof(*ctx->blocks)); + memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks)); init_get_bits(&gb, buf, data_size << 3); - decode_dc_coeffs(&gb, ctx->blocks, blocks_per_slice); + decode_dc_coeffs(&gb, td->blocks, blocks_per_slice); - decode_ac_coeffs(&gb, ctx->blocks, blocks_per_slice, + decode_ac_coeffs(&gb, td->blocks, blocks_per_slice, plane_size_factor, ctx->scantable.permutated); /* inverse quantization, inverse transform and output */ - block_ptr = ctx->blocks; - - for (blk_num = 0; blk_num < blocks_per_slice; blk_num++, block_ptr += 64) { - /* TODO: the correct solution shoud be (block_ptr[i] * qmat[i]) >> 1 - * and the input of the inverse transform should be scaled by 2 - * in order to avoid rounding errors. - * Due to the fact the existing Libav transforms are incompatible with - * that input I temporally introduced the coarse solution below... 
*/ - for (i = 0; i < 64; i++) - block_ptr[i] = (block_ptr[i] * qmat[i]) >> 2; - - ctx->dsp.idct(block_ptr); + block_ptr = td->blocks; + + for (mb_num = 0; mb_num < mbs_per_slice; mb_num++, out_ptr += blocks_per_mb * 4) { + ctx->dsp.idct_put(out_ptr, linesize, block_ptr, qmat); + block_ptr += 64; + if (blocks_per_mb > 2) { + ctx->dsp.idct_put(out_ptr + 8, linesize, block_ptr, qmat); + block_ptr += 64; + } + ctx->dsp.idct_put(out_ptr + linesize * 4, linesize, block_ptr, qmat); + block_ptr += 64; + if (blocks_per_mb > 2) { + ctx->dsp.idct_put(out_ptr + linesize * 4 + 8, linesize, block_ptr, qmat); + block_ptr += 64; + } } - - put_pixels(ctx->blocks, out_ptr, linesize >> 1, mbs_per_slice, - blocks_per_mb); } -static int decode_slice(ProresContext *ctx, int pic_num, int slice_num, - int mb_x_pos, int mb_y_pos, int mbs_per_slice, - AVCodecContext *avctx) +static int decode_slice(AVCodecContext *avctx, ProresThreadData *td) { + ProresContext *ctx = avctx->priv_data; + int mb_x_pos = td->x_pos; + int mb_y_pos = td->y_pos; + int pic_num = ctx->pic_num; + int slice_num = td->slice_num; + int mbs_per_slice = td->slice_width; const uint8_t *buf; uint8_t *y_data, *u_data, *v_data; AVFrame *pic = avctx->coded_frame; @@ -546,8 +511,8 @@ static int decode_slice(ProresContext *ctx, int pic_num, int slice_num, int slice_data_size, hdr_size, y_data_size, u_data_size, v_data_size; int y_linesize, u_linesize, v_linesize; - buf = ctx->slice_data_index[slice_num]; - slice_data_size = ctx->slice_data_index[slice_num + 1] - buf; + buf = ctx->slice_data[slice_num].index; + slice_data_size = ctx->slice_data[slice_num + 1].index - buf; slice_width_factor = av_log2(mbs_per_slice); @@ -593,20 +558,20 @@ static int decode_slice(ProresContext *ctx, int pic_num, int slice_num, if (ctx->qmat_changed || sf != ctx->prev_slice_sf) { ctx->prev_slice_sf = sf; for (i = 0; i < 64; i++) { - ctx->qmat_luma_scaled[i] = ctx->qmat_luma[i] * sf; - ctx->qmat_chroma_scaled[i] = ctx->qmat_chroma[i] * sf; + 
ctx->qmat_luma_scaled[ctx->dsp.idct_permutation[i]] = ctx->qmat_luma[i] * sf; + ctx->qmat_chroma_scaled[ctx->dsp.idct_permutation[i]] = ctx->qmat_chroma[i] * sf; } } /* decode luma plane */ - decode_slice_plane(ctx, buf + hdr_size, y_data_size, + decode_slice_plane(ctx, td, buf + hdr_size, y_data_size, (uint16_t*) (y_data + (mb_y_pos << 4) * y_linesize + (mb_x_pos << 5)), y_linesize, mbs_per_slice, 4, slice_width_factor + 2, ctx->qmat_luma_scaled); /* decode U chroma plane */ - decode_slice_plane(ctx, buf + hdr_size + y_data_size, u_data_size, + decode_slice_plane(ctx, td, buf + hdr_size + y_data_size, u_data_size, (uint16_t*) (u_data + (mb_y_pos << 4) * u_linesize + (mb_x_pos << ctx->mb_chroma_factor)), u_linesize, mbs_per_slice, ctx->num_chroma_blocks, @@ -614,7 +579,7 @@ static int decode_slice(ProresContext *ctx, int pic_num, int slice_num, ctx->qmat_chroma_scaled); /* decode V chroma plane */ - decode_slice_plane(ctx, buf + hdr_size + y_data_size + u_data_size, + decode_slice_plane(ctx, td, buf + hdr_size + y_data_size + u_data_size, v_data_size, (uint16_t*) (v_data + (mb_y_pos << 4) * v_linesize + (mb_x_pos << ctx->mb_chroma_factor)), @@ -633,6 +598,7 @@ static int decode_picture(ProresContext *ctx, int pic_num, slice_num = 0; + ctx->pic_num = pic_num; for (y_pos = 0; y_pos < ctx->num_y_mbs; y_pos++) { slice_width = 1 << ctx->slice_width_factor; @@ -641,15 +607,18 @@ static int decode_picture(ProresContext *ctx, int pic_num, while (ctx->num_x_mbs - x_pos < slice_width) slice_width >>= 1; - if (decode_slice(ctx, pic_num, slice_num, x_pos, y_pos, - slice_width, avctx) < 0) - return -1; + ctx->slice_data[slice_num].slice_num = slice_num; + ctx->slice_data[slice_num].x_pos = x_pos; + ctx->slice_data[slice_num].y_pos = y_pos; + ctx->slice_data[slice_num].slice_width = slice_width; slice_num++; } } - return 0; + return avctx->execute(avctx, (void *) decode_slice, + ctx->slice_data, NULL, slice_num, + sizeof(ctx->slice_data[0])); } @@ -712,7 +681,7 @@ static av_cold 
int decode_close(AVCodecContext *avctx) if (ctx->picture.data[0]) avctx->release_buffer(avctx, &ctx->picture); - av_freep(&ctx->slice_data_index); + av_freep(&ctx->slice_data); return 0; } @@ -726,6 +695,6 @@ AVCodec ff_prores_lgpl_decoder = { .init = decode_init, .close = decode_close, .decode = decode_frame, - .capabilities = CODEC_CAP_DR1, + .capabilities = CODEC_CAP_DR1 | CODEC_CAP_SLICE_THREADS, .long_name = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)") }; diff --git a/libavcodec/proresdsp.c b/libavcodec/proresdsp.c new file mode 100644 index 0000000000..7e753e9dc7 --- /dev/null +++ b/libavcodec/proresdsp.c @@ -0,0 +1,63 @@ +/* + * Apple ProRes compatible decoder + * + * Copyright (c) 2010-2011 Maxim Poliakovski + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "proresdsp.h" +#include "simple_idct.h" + +#define BIAS (1 << (PRORES_BITS_PER_SAMPLE - 1)) ///< bias value for converting signed pixels into unsigned ones +#define CLIP_MIN (1 << (PRORES_BITS_PER_SAMPLE - 8)) ///< minimum value for clipping resulting pixels +#define CLIP_MAX ((1 << PRORES_BITS_PER_SAMPLE) - CLIP_MIN - 1) ///< maximum value for clipping resulting pixels (parenthesized so it expands safely inside larger expressions) + +#define CLIP_AND_BIAS(x) (av_clip((x) + BIAS, CLIP_MIN, CLIP_MAX)) + +/** + * Add bias value, clamp and output pixels of a slice + */ +static void put_pixels(uint16_t *dst, int stride, const DCTELEM *in) +{ + int x, y, src_offset, dst_offset; + + for (y = 0, dst_offset = 0; y < 8; y++, dst_offset += stride) { + for (x = 0; x < 8; x++) { + src_offset = (y << 3) + x; + + dst[dst_offset + x] = CLIP_AND_BIAS(in[src_offset]); + } + } +} + +static void prores_idct_put_c(uint16_t *out, int linesize, DCTELEM *block, const int16_t *qmat) +{ + ff_prores_idct(block, qmat); + put_pixels(out, linesize >> 1, block); +} + +void ff_proresdsp_init(ProresDSPContext *dsp) +{ + dsp->idct_put = prores_idct_put_c; + dsp->idct_permutation_type = FF_NO_IDCT_PERM; + + if (HAVE_MMX) ff_proresdsp_x86_init(dsp); + + ff_init_scantable_permutation(dsp->idct_permutation, + dsp->idct_permutation_type); +} diff --git a/libavcodec/proresdsp.h b/libavcodec/proresdsp.h new file mode 100644 index 0000000000..8b864faabd --- /dev/null +++ b/libavcodec/proresdsp.h @@ -0,0 +1,40 @@ +/* + * Apple ProRes compatible decoder + * + * Copyright (c) 2010-2011 Maxim Poliakovski + * + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_PRORESDSP_H +#define AVCODEC_PRORESDSP_H + +#include "dsputil.h" + +#define PRORES_BITS_PER_SAMPLE 10 ///< output precision of prores decoder + +typedef struct { + int idct_permutation_type; + uint8_t idct_permutation[64]; + void (* idct_put) (uint16_t *out, int linesize, DCTELEM *block, const int16_t *qmat); +} ProresDSPContext; + +void ff_proresdsp_init(ProresDSPContext *dsp); + +void ff_proresdsp_x86_init(ProresDSPContext *dsp); + +#endif /* AVCODEC_PRORESDSP_H */ diff --git a/libavcodec/simple_idct.c b/libavcodec/simple_idct.c index ffe5a0b070..b4e763a704 100644 --- a/libavcodec/simple_idct.c +++ b/libavcodec/simple_idct.c @@ -221,3 +221,20 @@ void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block) idct4col_add(dest + i, line_size, block + i); } } + +void ff_prores_idct(DCTELEM *block, const int16_t *qmat) +{ + int i; + + for (i = 0; i < 64; i++) + block[i] *= qmat[i]; + + for (i = 0; i < 8; i++) + idctRowCondDC_10(block + i*8); + + for (i = 0; i < 64; i++) + block[i] >>= 2; + + for (i = 0; i < 8; i++) + idctSparseCol_10(block + i); +} diff --git a/libavcodec/simple_idct.h b/libavcodec/simple_idct.h index 10ac7da5e6..64d3c2ac31 100644 --- a/libavcodec/simple_idct.h +++ b/libavcodec/simple_idct.h @@ 
-38,6 +38,12 @@ void ff_simple_idct_8(DCTELEM *block); void ff_simple_idct_put_10(uint8_t *dest, int line_size, DCTELEM *block); void ff_simple_idct_add_10(uint8_t *dest, int line_size, DCTELEM *block); void ff_simple_idct_10(DCTELEM *block); +/** + * Special version of ff_simple_idct_10() which does dequantization + * and scales by a factor of 2 more between the two IDCTs to account + * for larger scale of input coefficients. + */ +void ff_prores_idct(DCTELEM *block, const int16_t *qmat); void ff_simple_idct_mmx(int16_t *block); void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block); diff --git a/libavcodec/sipr.c b/libavcodec/sipr.c index a2e2fe4267..098d1074a7 100644 --- a/libavcodec/sipr.c +++ b/libavcodec/sipr.c @@ -194,14 +194,16 @@ static void decode_parameters(SiprParameters* parms, GetBitContext *pgb, { int i, j; - parms->ma_pred_switch = get_bits(pgb, p->ma_predictor_bits); + if (p->ma_predictor_bits) + parms->ma_pred_switch = get_bits(pgb, p->ma_predictor_bits); for (i = 0; i < 5; i++) parms->vq_indexes[i] = get_bits(pgb, p->vq_indexes_bits[i]); for (i = 0; i < p->subframe_count; i++) { parms->pitch_delay[i] = get_bits(pgb, p->pitch_delay_bits[i]); - parms->gp_index[i] = get_bits(pgb, p->gp_index_bits); + if (p->gp_index_bits) + parms->gp_index[i] = get_bits(pgb, p->gp_index_bits); for (j = 0; j < p->number_of_fc_indexes; j++) parms->fc_indexes[i][j] = get_bits(pgb, p->fc_index_bits[j]); diff --git a/libavcodec/smacker.c b/libavcodec/smacker.c index 3a4ba1a114..c7eafbc261 100644 --- a/libavcodec/smacker.c +++ b/libavcodec/smacker.c @@ -560,6 +560,10 @@ static av_cold int decode_end(AVCodecContext *avctx) static av_cold int smka_decode_init(AVCodecContext *avctx) { + if (avctx->channels < 1 || avctx->channels > 2) { + av_log(avctx, AV_LOG_ERROR, "invalid number of channels\n"); + return AVERROR(EINVAL); + } avctx->channel_layout = (avctx->channels==2) ? 
AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO; avctx->sample_fmt = avctx->bits_per_coded_sample == 8 ? AV_SAMPLE_FMT_U8 : AV_SAMPLE_FMT_S16; return 0; @@ -583,6 +587,11 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data, int *data_size, int bits, stereo; int pred[2] = {0, 0}; + if (buf_size <= 4) { + av_log(avctx, AV_LOG_ERROR, "packet is too small\n"); + return AVERROR(EINVAL); + } + unp_size = AV_RL32(buf); init_get_bits(&gb, buf + 4, (buf_size - 4) * 8); @@ -598,6 +607,14 @@ static int smka_decode_frame(AVCodecContext *avctx, void *data, int *data_size, av_log(avctx, AV_LOG_ERROR, "Frame is too large to fit in buffer\n"); return -1; } + if (stereo ^ (avctx->channels != 1)) { + av_log(avctx, AV_LOG_ERROR, "channels mismatch\n"); + return AVERROR(EINVAL); + } + if (bits && avctx->sample_fmt == AV_SAMPLE_FMT_U8) { + av_log(avctx, AV_LOG_ERROR, "sample format mismatch\n"); + return AVERROR(EINVAL); + } memset(vlc, 0, sizeof(VLC) * 4); memset(h, 0, sizeof(HuffContext) * 4); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index bce2c3ecaf..d031f6505b 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -34,6 +34,8 @@ MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o +YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o +MMX-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp-init.o MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 96983773da..2322a29fb7 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -64,6 +64,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x00400 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; DECLARE_ALIGNED(8, const 
uint64_t, ff_pw_128) = 0x0080008000800080ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL}; diff --git a/libavcodec/x86/proresdsp-init.c b/libavcodec/x86/proresdsp-init.c new file mode 100644 index 0000000000..9760105ab6 --- /dev/null +++ b/libavcodec/x86/proresdsp-init.c @@ -0,0 +1,54 @@ +/* + * Apple ProRes compatible decoder + * + * Copyright (c) 2010-2011 Maxim Poliakovski + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/proresdsp.h" + +void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize, + DCTELEM *block, const int16_t *qmat); +void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize, + DCTELEM *block, const int16_t *qmat); +void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize, + DCTELEM *block, const int16_t *qmat); + +void ff_proresdsp_x86_init(ProresDSPContext *dsp) +{ +#if ARCH_X86_64 + int flags = av_get_cpu_flags(); + + if (flags & AV_CPU_FLAG_SSE2) { + dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + dsp->idct_put = ff_prores_idct_put_10_sse2; + } + + if (flags & AV_CPU_FLAG_SSE4) { + dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + dsp->idct_put = ff_prores_idct_put_10_sse4; + } + +#if HAVE_AVX + if (flags & AV_CPU_FLAG_AVX) { + dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + dsp->idct_put = ff_prores_idct_put_10_avx; + } +#endif +#endif +} diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm new file mode 100644 index 0000000000..9365bf1ea4 --- /dev/null +++ b/libavcodec/x86/proresdsp.asm @@ -0,0 +1,432 @@ +;****************************************************************************** +;* x86-SIMD-optimized IDCT for prores +;* this is identical to "simple" IDCT except for the clip range +;* +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1 +%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1 +%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2 +%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1 +%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1 +%define W6sh2 8867 ; W6 = 35468 = 8867<<2 +%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1 + +%ifdef ARCH_X86_64 + +SECTION_RODATA + +w4_plus_w2: times 4 dw W4sh2, +W2sh2 +w4_min_w2: times 4 dw W4sh2, -W2sh2 +w4_plus_w6: times 4 dw W4sh2, +W6sh2 +w4_min_w6: times 4 dw W4sh2, -W6sh2 +w1_plus_w3: times 4 dw W1sh2, +W3sh2 +w3_min_w1: times 4 dw W3sh2, -W1sh2 +w7_plus_w3: times 4 dw W7sh2, +W3sh2 +w3_min_w7: times 4 dw W3sh2, -W7sh2 +w1_plus_w5: times 4 dw W1sh2, +W5sh2 +w5_min_w1: times 4 dw W5sh2, -W1sh2 +w5_plus_w7: times 4 dw W5sh2, +W7sh2 +w7_min_w5: times 4 dw W7sh2, -W5sh2 +row_round: times 8 dw (1<<14) + +cextern pw_4 +cextern pw_8 +cextern pw_512 +cextern pw_1019 + +section .text align=16 + +; interleave data while maintaining source +; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave +%macro SBUTTERFLY3 5 + punpckl%1 m%2, m%4, m%5 + punpckh%1 m%3, m%4, m%5 +%endmacro + +; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift +; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6 +; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3) +%macro SUMSUB_SHPK 7 + psubd %3, %1, %5 ; { a0 - b0 }[0-3] + psubd %4, %2, %6 ; { a0 - b0 }[4-7] + paddd %1, %5 ; { a0 + b0 }[0-3] + paddd %2, %6 ; { a0 + b0 }[4-7] + psrad %1, %7 + psrad %2, %7 + psrad %3, %7 + psrad %4, %7 + packssdw %1, %2 ; row[0] + packssdw %3, %4 ; row[7] +%endmacro + +; %1 = 
row or col (for rounding variable) +; %2 = number of bits to shift at the end +; %3 = optimization +%macro IDCT_1D 3 + ; a0 = (W4 * row[0]) + (1 << (15 - 1)); + ; a1 = a0; + ; a2 = a0; + ; a3 = a0; + ; a0 += W2 * row[2]; + ; a1 += W6 * row[2]; + ; a2 -= W6 * row[2]; + ; a3 -= W2 * row[2]; +%ifidn %1, col + paddw m10,[pw_8] +%endif + SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] +%ifidn %1, row + psubw m10,[row_round] +%endif + SIGNEXTEND m8, m9, m14 ; { row[2] }[0-3] / [4-7] + SIGNEXTEND m10, m11, m14 ; { row[0] }[0-3] / [4-7] + pmaddwd m2, m0, [w4_plus_w6] + pmaddwd m3, m1, [w4_plus_w6] + pmaddwd m4, m0, [w4_min_w6] + pmaddwd m5, m1, [w4_min_w6] + pmaddwd m6, m0, [w4_min_w2] + pmaddwd m7, m1, [w4_min_w2] + pmaddwd m0, [w4_plus_w2] + pmaddwd m1, [w4_plus_w2] + pslld m2, 2 + pslld m3, 2 + pslld m4, 2 + pslld m5, 2 + pslld m6, 2 + pslld m7, 2 + pslld m0, 2 + pslld m1, 2 + + ; a0: -1*row[0]-1*row[2] + ; a1: -1*row[0] + ; a2: -1*row[0] + ; a3: -1*row[0]+1*row[2] + psubd m2, m10 ; a1[0-3] + psubd m3, m11 ; a1[4-7] + psubd m4, m10 ; a2[0-3] + psubd m5, m11 ; a2[4-7] + psubd m0, m10 + psubd m1, m11 + psubd m6, m10 + psubd m7, m11 + psubd m0, m8 ; a0[0-3] + psubd m1, m9 ; a0[4-7] + paddd m6, m8 ; a3[0-3] + paddd m7, m9 ; a3[4-7] + + ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] + ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] + ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] + ; a3 += W4*row[4] - W6*row[6]; i.e. 
-1*row[4] + SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7] + SIGNEXTEND m13, m14, m10 ; { row[4] }[0-3] / [4-7] + pmaddwd m10, m8, [w4_plus_w6] + pmaddwd m11, m9, [w4_plus_w6] + pslld m10, 2 + pslld m11, 2 + psubd m10, m13 + psubd m11, m14 + paddd m0, m10 ; a0[0-3] + paddd m1, m11 ; a0[4-7] + pmaddwd m10, m8, [w4_min_w6] + pmaddwd m11, m9, [w4_min_w6] + pslld m10, 2 + pslld m11, 2 + psubd m10, m13 + psubd m11, m14 + paddd m6, m10 ; a3[0-3] + paddd m7, m11 ; a3[4-7] + pmaddwd m10, m8, [w4_min_w2] + pmaddwd m11, m9, [w4_min_w2] + pmaddwd m8, [w4_plus_w2] + pmaddwd m9, [w4_plus_w2] + pslld m10, 2 + pslld m11, 2 + pslld m8, 2 + pslld m9, 2 + psubd m10, m13 + psubd m11, m14 + psubd m8, m13 + psubd m9, m14 + psubd m4, m10 ; a2[0-3] intermediate + psubd m5, m11 ; a2[4-7] intermediate + psubd m2, m8 ; a1[0-3] intermediate + psubd m3, m9 ; a1[4-7] intermediate + SIGNEXTEND m12, m13, m10 ; { row[6] }[0-3] / [4-7] + psubd m4, m12 ; a2[0-3] + psubd m5, m13 ; a2[4-7] + paddd m2, m12 ; a1[0-3] + paddd m3, m13 ; a1[4-7] + + ; load/store + mova [r2+ 0], m0 + mova [r2+ 32], m2 + mova [r2+ 64], m4 + mova [r2+ 96], m6 + mova m10,[r2+ 16] ; { row[1] }[0-7] + mova m8, [r2+ 48] ; { row[3] }[0-7] + mova m13,[r2+ 80] ; { row[5] }[0-7] + mova m14,[r2+112] ; { row[7] }[0-7] + mova [r2+ 16], m1 + mova [r2+ 48], m3 + mova [r2+ 80], m5 + mova [r2+112], m7 +%ifidn %1, row + pmullw m10,[r3+ 16] + pmullw m8, [r3+ 48] + pmullw m13,[r3+ 80] + pmullw m14,[r3+112] +%endif + + ; b0 = MUL(W1, row[1]); + ; MAC(b0, W3, row[3]); + ; b1 = MUL(W3, row[1]); + ; MAC(b1, -W7, row[3]); + ; b2 = MUL(W5, row[1]); + ; MAC(b2, -W1, row[3]); + ; b3 = MUL(W7, row[1]); + ; MAC(b3, -W5, row[3]); + SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7] + SIGNEXTEND m10, m11, m12 ; { row[1] }[0-3] / [4-7] + SIGNEXTEND m8, m9, m12 ; { row[3] }[0-3] / [4-7] + pmaddwd m2, m0, [w3_min_w7] + pmaddwd m3, m1, [w3_min_w7] + pmaddwd m4, m0, [w5_min_w1] + pmaddwd m5, m1, [w5_min_w1] + pmaddwd m6, m0, [w7_min_w5] 
+ pmaddwd m7, m1, [w7_min_w5] + pmaddwd m0, [w1_plus_w3] + pmaddwd m1, [w1_plus_w3] + pslld m2, 2 + pslld m3, 2 + pslld m4, 2 + pslld m5, 2 + pslld m6, 2 + pslld m7, 2 + pslld m0, 2 + pslld m1, 2 + + ; b0: +1*row[1]+2*row[3] + ; b1: +2*row[1]-1*row[3] + ; b2: -1*row[1]-1*row[3] + ; b3: +1*row[1]+1*row[3] + psubd m2, m8 + psubd m3, m9 + paddd m0, m8 + paddd m1, m9 + paddd m8, m10 ; { row[1] + row[3] }[0-3] + paddd m9, m11 ; { row[1] + row[3] }[4-7] + paddd m10, m10 + paddd m11, m11 + paddd m0, m8 ; b0[0-3] + paddd m1, m9 ; b0[4-7] + paddd m2, m10 ; b1[0-3] + paddd m3, m11 ; b1[4-7] + psubd m4, m8 ; b2[0-3] + psubd m5, m9 ; b2[4-7] + paddd m6, m8 ; b3[0-3] + paddd m7, m9 ; b3[4-7] + + ; MAC(b0, W5, row[5]); + ; MAC(b0, W7, row[7]); + ; MAC(b1, -W1, row[5]); + ; MAC(b1, -W5, row[7]); + ; MAC(b2, W7, row[5]); + ; MAC(b2, W3, row[7]); + ; MAC(b3, W3, row[5]); + ; MAC(b3, -W1, row[7]); + SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7] + SIGNEXTEND m13, m12, m11 ; { row[5] }[0-3] / [4-7] + SIGNEXTEND m14, m11, m10 ; { row[7] }[0-3] / [4-7] + + ; b0: -1*row[5]+1*row[7] + ; b1: -1*row[5]+1*row[7] + ; b2: +1*row[5]+2*row[7] + ; b3: +2*row[5]-1*row[7] + paddd m4, m13 + paddd m5, m12 + paddd m6, m13 + paddd m7, m12 + psubd m13, m14 ; { row[5] - row[7] }[0-3] + psubd m12, m11 ; { row[5] - row[7] }[4-7] + paddd m14, m14 + paddd m11, m11 + psubd m0, m13 + psubd m1, m12 + psubd m2, m13 + psubd m3, m12 + paddd m4, m14 + paddd m5, m11 + paddd m6, m13 + paddd m7, m12 + + pmaddwd m10, m8, [w1_plus_w5] + pmaddwd m11, m9, [w1_plus_w5] + pmaddwd m12, m8, [w5_plus_w7] + pmaddwd m13, m9, [w5_plus_w7] + pslld m10, 2 + pslld m11, 2 + pslld m12, 2 + pslld m13, 2 + psubd m2, m10 ; b1[0-3] + psubd m3, m11 ; b1[4-7] + paddd m0, m12 ; b0[0-3] + paddd m1, m13 ; b0[4-7] + pmaddwd m12, m8, [w7_plus_w3] + pmaddwd m13, m9, [w7_plus_w3] + pmaddwd m8, [w3_min_w1] + pmaddwd m9, [w3_min_w1] + pslld m12, 2 + pslld m13, 2 + pslld m8, 2 + pslld m9, 2 + paddd m4, m12 ; b2[0-3] + paddd m5, m13 ; 
b2[4-7] + paddd m6, m8 ; b3[0-3] + paddd m7, m9 ; b3[4-7] + + ; row[0] = (a0 + b0) >> 15; + ; row[7] = (a0 - b0) >> 15; + ; row[1] = (a1 + b1) >> 15; + ; row[6] = (a1 - b1) >> 15; + ; row[2] = (a2 + b2) >> 15; + ; row[5] = (a2 - b2) >> 15; + ; row[3] = (a3 + b3) >> 15; + ; row[4] = (a3 - b3) >> 15; + mova m8, [r2+ 0] ; a0[0-3] + mova m9, [r2+16] ; a0[4-7] + SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2 + mova m0, [r2+32] ; a1[0-3] + mova m1, [r2+48] ; a1[4-7] + SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2 + mova m1, [r2+64] ; a2[0-3] + mova m2, [r2+80] ; a2[4-7] + SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2 + mova m2, [r2+96] ; a3[0-3] + mova m3, [r2+112] ; a3[4-7] + SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2 +%endmacro + +; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride, +; DCTELEM *block, const int16_t *qmat); +%macro idct_put_fn 2 +cglobal prores_idct_put_10_%1, 4, 4, %2 + movsxd r1, r1d + pxor m15, m15 ; zero + + ; for (i = 0; i < 8; i++) + ; idctRowCondDC(block + i*8); + mova m10,[r2+ 0] ; { row[0] }[0-7] + mova m8, [r2+32] ; { row[2] }[0-7] + mova m13,[r2+64] ; { row[4] }[0-7] + mova m12,[r2+96] ; { row[6] }[0-7] + + pmullw m10,[r3+ 0] + pmullw m8, [r3+32] + pmullw m13,[r3+64] + pmullw m12,[r3+96] + + IDCT_1D row, 17, %1 + + ; transpose for second part of IDCT + TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 + mova [r2+ 16], m0 + mova [r2+ 48], m2 + mova [r2+ 80], m11 + mova [r2+112], m10 + SWAP 8, 10 + SWAP 1, 8 + SWAP 4, 13 + SWAP 9, 12 + + ; for (i = 0; i < 8; i++) + ; idctSparseColAdd(dest + i, line_size, block + i); + IDCT_1D col, 20, %1 + + ; clip/store + mova m6, [pw_512] + mova m3, [pw_4] + mova m5, [pw_1019] + paddw m8, m6 + paddw m0, m6 + paddw m1, m6 + paddw m2, m6 + paddw m4, m6 + paddw m11, m6 + paddw m9, m6 + paddw m10, m6 + pmaxsw m8, m3 + pmaxsw m0, m3 + pmaxsw m1, m3 + pmaxsw m2, m3 + pmaxsw m4, m3 + pmaxsw m11, m3 + pmaxsw m9, m3 + pmaxsw m10, m3 + pminsw m8, m5 + pminsw m0, m5 + pminsw m1, m5 + pminsw m2, m5 + pminsw m4, m5 + pminsw m11, m5 + pminsw 
m9, m5 + pminsw m10, m5 + + lea r2, [r1*3] + mova [r0 ], m8 + mova [r0+r1 ], m0 + mova [r0+r1*2], m1 + mova [r0+r2 ], m2 + lea r0, [r0+r1*4] + mova [r0 ], m4 + mova [r0+r1 ], m11 + mova [r0+r1*2], m9 + mova [r0+r2 ], m10 + RET +%endmacro + +%macro signextend_sse2 3 ; dstlow, dsthigh, tmp + pxor %3, %3 + pcmpgtw %3, %1 + mova %2, %1 + punpcklwd %1, %3 + punpckhwd %2, %3 +%endmacro + +%macro signextend_sse4 2-3 ; dstlow, dsthigh + movhlps %2, %1 + pmovsxwd %1, %1 + pmovsxwd %2, %2 +%endmacro + +INIT_XMM +%define SIGNEXTEND signextend_sse2 +idct_put_fn sse2, 16 +INIT_XMM +%define SIGNEXTEND signextend_sse4 +idct_put_fn sse4, 16 +INIT_AVX +idct_put_fn avx, 16 + +%endif |