lavc: add a HEVC decoder.

Initially written by Guillaume Martres <[email protected]> as a GSoC project. Further contributions by the OpenHEVC project and other developers, namely: Mickaël Raulet <[email protected]> Seppo Tomperi <[email protected]> Gildas Cocherel <[email protected]> Khaled Jerbi <[email protected]> Wassim Hamidouche <[email protected]> Vittorio Giovara <[email protected]> Jan Ekström <[email protected]> Anton Khirnov <[email protected]> Martin Storsjö <[email protected]> Luca Barbato <[email protected]> Yusuke Nakamura <[email protected]> Signed-off-by: Anton Khirnov <[email protected]> Signed-off-by: Michael Niedermayer <[email protected]>
author: Guillaume Martres <[email protected]> 2013-10-12 11:55:48 +0200
committer: Michael Niedermayer <[email protected]> 2013-10-15 22:13:02 +0200
commit: c8dd048ab8cff815c9f4b16a62db0b74df011f0a (patch)
tree: e9167d50e3b802a195b6fcfb4c042332f0d2b469
parent: 2a19fcc12311f71f55eab7129b764d4cb800c934 (diff)
20 files changed, 11023 insertions, 1 deletions
diff --git a/configure b/configure
index 1d32ca93d9..101954e3e4 100755
--- a/configure
+++ b/configure
@@ -1809,6 +1809,7 @@ h263i_decoder_select="h263_decoder"
 h263p_encoder_select="h263_encoder"
 h264_decoder_select="golomb h264chroma h264dsp h264pred h264qpel videodsp"
 h264_decoder_suggest="error_resilience"
+hevc_decoder_select="dsputil golomb videodsp"
 huffyuv_decoder_select="dsputil"
 huffyuv_encoder_select="dsputil huffman"
 iac_decoder_select="dsputil fft mdct sinewin"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 230c3f3ae4..d46c5462a5 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -233,6 +233,10 @@ OBJS-$(CONFIG_H264_DECODER)            += h264.o                               \
                                           cabac.o h264_sei.o h264_ps.o         \
                                           h264_refs.o h264_cavlc.o h264_cabac.o
 OBJS-$(CONFIG_H264_VDA_DECODER)        += vda_h264_dec.o
+OBJS-$(CONFIG_HEVC_DECODER)            += hevc.o hevc_mvs.o hevc_ps.o hevc_sei.o \
+                                          hevc_cabac.o hevc_refs.o hevcpred.o    \
+                                          hevcdsp.o hevc_filter.o cabac.o
+
 OBJS-$(CONFIG_HUFFYUV_DECODER)         += huffyuv.o huffyuvdec.o
 OBJS-$(CONFIG_HUFFYUV_ENCODER)         += huffyuv.o huffyuvenc.o
 OBJS-$(CONFIG_IAC_DECODER)             += imc.o
@@ -758,6 +762,7 @@ OBJS-$(CONFIG_H264_PARSER)             += h264_parser.o h264.o            \
                                           h264_refs.o h264_sei.o h264_direct.o \
                                           h264_loopfilter.o h264_cabac.o \
                                           h264_cavlc.o h264_ps.o
+OBJS-$(CONFIG_HEVC_PARSER)             += hevc_parser.o
 OBJS-$(CONFIG_MJPEG_PARSER)            += mjpeg_parser.o
 OBJS-$(CONFIG_MLP_PARSER)              += mlp_parser.o mlp.o
 OBJS-$(CONFIG_MPEG4VIDEO_PARSER)       += mpeg4video_parser.o h263.o \
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 8e08a2349d..e20c9cd38a 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -164,6 +164,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER(H264_CRYSTALHD,    h264_crystalhd);
     REGISTER_DECODER(H264_VDA,          h264_vda);
     REGISTER_DECODER(H264_VDPAU,        h264_vdpau);
+    REGISTER_DECODER(HEVC,              hevc);
     REGISTER_ENCDEC (HUFFYUV,           huffyuv);
     REGISTER_DECODER(IDCIN,             idcin);
     REGISTER_DECODER(IFF_BYTERUN1,      iff_byterun1);
@@ -534,6 +535,7 @@ void avcodec_register_all(void)
     REGISTER_PARSER(H261,               h261);
     REGISTER_PARSER(H263,               h263);
     REGISTER_PARSER(H264,               h264);
+    REGISTER_PARSER(HEVC,               hevc);
     REGISTER_PARSER(MJPEG,              mjpeg);
     REGISTER_PARSER(MLP,                mlp);
     REGISTER_PARSER(MPEG4VIDEO,         mpeg4video);
diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index 8a7275d9de..b84258bfa7 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -160,4 +160,24 @@ static int av_unused get_cabac_terminate(CABACContext *c){
     }
 }
 
+/**
+ * Skip @p n bytes, counted from c->bytestream after the low-bit adjustments below, and restart the decoder.
+ * @return pointer to the first skipped byte, or NULL when fewer than @p n bytes remain
+ */
+static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
+    const uint8_t *first = c->bytestream;
+    int            left;
+    if (c->low & 0x1)
+        first--;
+#if CABAC_BITS == 16
+    if (c->low & 0x1FF)
+        first--;
+#endif
+    left = (int) (c->bytestream_end - first);
+    if (left < n)
+        return NULL;  // decoder state is left untouched in this case
+    ff_init_cabac_decoder(c, first + n, c->bytestream_end - first - n);
+    return first;
+}
+
 #endif /* AVCODEC_CABAC_FUNCTIONS_H */
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
new file mode 100644
index 0000000000..0b784b593f
--- /dev/null
+++ b/libavcodec/hevc.c
@@ -0,0 +1,3118 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Mickael Raulet
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ * Copyright (C) 2012 - 2013 Wassim Hamidouche
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/md5.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "bytestream.h"
+#include "cabac_functions.h"
+#include "dsputil.h"
+#include "golomb.h"
+#include "hevc.h"
+
+const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 2 };
+const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 3, 4, 4 };
+const uint8_t ff_hevc_qpel_extra[4]        = { 0, 6, 7, 6 };
+
+static const uint8_t scan_1x1[1] = {
+    0,
+};
+
+static const uint8_t horiz_scan2x2_x[4] = {
+    0, 1, 0, 1,
+};
+
+static const uint8_t horiz_scan2x2_y[4] = {
+    0, 0, 1, 1
+};
+
+static const uint8_t horiz_scan4x4_x[16] = {
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+    0, 1, 2, 3,
+};
+
+static const uint8_t horiz_scan4x4_y[16] = {
+    0, 0, 0, 0,
+    1, 1, 1, 1,
+    2, 2, 2, 2,
+    3, 3, 3, 3,
+};
+
+static const uint8_t horiz_scan8x8_inv[8][8] = {
+    {  0,  1,  2,  3, 16, 17, 18, 19, },
+    {  4,  5,  6,  7, 20, 21, 22, 23, },
+    {  8,  9, 10, 11, 24, 25, 26, 27, },
+    { 12, 13, 14, 15, 28, 29, 30, 31, },
+    { 32, 33, 34, 35, 48, 49, 50, 51, },
+    { 36, 37, 38, 39, 52, 53, 54, 55, },
+    { 40, 41, 42, 43, 56, 57, 58, 59, },
+    { 44, 45, 46, 47, 60, 61, 62, 63, },
+};
+
+static const uint8_t diag_scan4x1_x[4] = {
+    0, 1, 2, 3,
+};
+
+static const uint8_t diag_scan1x4_y[4] = {
+    0, 1, 2, 3,
+};
+
+static const uint8_t diag_scan2x2_x[4] = {
+    0, 0, 1, 1,
+};
+
+static const uint8_t diag_scan2x2_y[4] = {
+    0, 1, 0, 1,
+};
+
+static const uint8_t diag_scan2x2_inv[2][2] = {
+    { 0, 2, },
+    { 1, 3, },
+};
+
+static const uint8_t diag_scan8x2_x[16] = {
+    0, 0, 1, 1,
+    2, 2, 3, 3,
+    4, 4, 5, 5,
+    6, 6, 7, 7,
+};
+
+static const uint8_t diag_scan8x2_y[16] = {
+    0, 1, 0, 1,
+    0, 1, 0, 1,
+    0, 1, 0, 1,
+    0, 1, 0, 1,
+};
+
+static const uint8_t diag_scan8x2_inv[2][8] = {
+    { 0, 2, 4, 6, 8, 10, 12, 14, },
+    { 1, 3, 5, 7, 9, 11, 13, 15, },
+};
+
+static const uint8_t diag_scan2x8_x[16] = {
+    0, 0, 1, 0,
+    1, 0, 1, 0,
+    1, 0, 1, 0,
+    1, 0, 1, 1,
+};
+
+static const uint8_t diag_scan2x8_y[16] = {
+    0, 1, 0, 2,
+    1, 3, 2, 4,
+    3, 5, 4, 6,
+    5, 7, 6, 7,
+};
+
+static const uint8_t diag_scan2x8_inv[8][2] = {
+    {  0,  2, },
+    {  1,  4, },
+    {  3,  6, },
+    {  5,  8, },
+    {  7, 10, },
+    {  9, 12, },
+    { 11, 14, },
+    { 13, 15, },
+};
+
+const uint8_t ff_hevc_diag_scan4x4_x[16] = {
+    0, 0, 1, 0,
+    1, 2, 0, 1,
+    2, 3, 1, 2,
+    3, 2, 3, 3,
+};
+
+const uint8_t ff_hevc_diag_scan4x4_y[16] = {
+    0, 1, 0, 2,
+    1, 0, 3, 2,
+    1, 0, 3, 2,
+    1, 3, 2, 3,
+};
+
+static const uint8_t diag_scan4x4_inv[4][4] = {
+    { 0,  2,  5,  9, },
+    { 1,  4,  8, 12, },
+    { 3,  7, 11, 14, },
+    { 6, 10, 13, 15, },
+};
+
+const uint8_t ff_hevc_diag_scan8x8_x[64] = {
+    0, 0, 1, 0,
+    1, 2, 0, 1,
+    2, 3, 0, 1,
+    2, 3, 4, 0,
+    1, 2, 3, 4,
+    5, 0, 1, 2,
+    3, 4, 5, 6,
+    0, 1, 2, 3,
+    4, 5, 6, 7,
+    1, 2, 3, 4,
+    5, 6, 7, 2,
+    3, 4, 5, 6,
+    7, 3, 4, 5,
+    6, 7, 4, 5,
+    6, 7, 5, 6,
+    7, 6, 7, 7,
+};
+
+const uint8_t ff_hevc_diag_scan8x8_y[64] = {
+    0, 1, 0, 2,
+    1, 0, 3, 2,
+    1, 0, 4, 3,
+    2, 1, 0, 5,
+    4, 3, 2, 1,
+    0, 6, 5, 4,
+    3, 2, 1, 0,
+    7, 6, 5, 4,
+    3, 2, 1, 0,
+    7, 6, 5, 4,
+    3, 2, 1, 7,
+    6, 5, 4, 3,
+    2, 7, 6, 5,
+    4, 3, 7, 6,
+    5, 4, 7, 6,
+    5, 7, 6, 7,
+};
+
+static const uint8_t diag_scan8x8_inv[8][8] = {
+    {  0,  2,  5,  9, 14, 20, 27, 35, },
+    {  1,  4,  8, 13, 19, 26, 34, 42, },
+    {  3,  7, 12, 18, 25, 33, 41, 48, },
+    {  6, 11, 17, 24, 32, 40, 47, 53, },
+    { 10, 16, 23, 31, 39, 46, 52, 57, },
+    { 15, 22, 30, 38, 45, 51, 56, 60, },
+    { 21, 29, 37, 44, 50, 55, 59, 62, },
+    { 28, 36, 43, 49, 54, 58, 61, 63, },
+};
+
+/**
+ * NOTE: Each function hls_foo corresponds to the function foo in the
+ * specification (HLS stands for High Level Syntax).
+ */
+
+/**
+ * Section 5.7
+ */
+
+/* Release every table and buffer pool set up by pic_arrays_init(), in reverse order. */
+static void pic_arrays_free(HEVCContext *s)
+{
+    /* buffer pools */
+    av_buffer_pool_uninit(&s->rpl_tab_pool);
+    av_buffer_pool_uninit(&s->tab_mvf_pool);
+
+    /* plain tables */
+    av_freep(&s->vertical_bs);
+    av_freep(&s->horizontal_bs);
+    av_freep(&s->qp_y_tab);
+    av_freep(&s->tab_slice_address);
+    av_freep(&s->filter_slice_edges);
+
+    av_freep(&s->is_pcm);
+    av_freep(&s->cbf_luma);
+    av_freep(&s->tab_ipm);
+
+    av_freep(&s->tab_ct_depth);
+    av_freep(&s->skip_flag);
+    av_freep(&s->split_cu_flag);
+    av_freep(&s->deblock);
+    av_freep(&s->sao);
+}
+
+/* Allocate the tables and buffer pools sized from the active SPS (s->sps); returns 0, or AVERROR(ENOMEM) after freeing any partial allocation. */
+static int pic_arrays_init(HEVCContext *s)
+{
+    int log2_min_cb_size     = s->sps->log2_min_coding_block_size;
+    int width                = s->sps->width;
+    int height               = s->sps->height;
+    int pic_size             = width * height;
+    int pic_size_in_ctb      = ((width  >> log2_min_cb_size) + 1) *
+                               ((height >> log2_min_cb_size) + 1); // counted in min coding blocks, one extra per dimension
+    int ctb_count            = s->sps->ctb_width * s->sps->ctb_height;
+    int pic_width_in_min_pu  = width  >> s->sps->log2_min_pu_size;
+    int pic_height_in_min_pu = height >> s->sps->log2_min_pu_size;
+    int pic_size_in_min_pu   = pic_width_in_min_pu * pic_height_in_min_pu;
+    int pic_width_in_min_tu  = width  >> s->sps->log2_min_transform_block_size;
+    int pic_height_in_min_tu = height >> s->sps->log2_min_transform_block_size;
+
+    s->bs_width  = width  >> 3;
+    s->bs_height = height >> 3;
+
+    s->sao           = av_mallocz_array(ctb_count, sizeof(*s->sao));
+    s->deblock       = av_mallocz_array(ctb_count, sizeof(*s->deblock));
+    s->split_cu_flag = av_malloc(pic_size);
+    if (!s->sao || !s->deblock || !s->split_cu_flag)
+        goto fail;
+
+    s->skip_flag    = av_malloc(pic_size_in_ctb);
+    s->tab_ct_depth = av_malloc_array(s->sps->min_cb_height, s->sps->min_cb_width); // overflow-checked product
+    if (!s->skip_flag || !s->tab_ct_depth)
+        goto fail;
+
+    s->tab_ipm  = av_malloc(pic_size_in_min_pu);
+    s->cbf_luma = av_malloc_array(pic_width_in_min_tu, pic_height_in_min_tu);
+    s->is_pcm   = av_malloc(pic_size_in_min_pu);
+    if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm)
+        goto fail;
+
+    s->filter_slice_edges = av_malloc(ctb_count);
+    s->tab_slice_address = av_malloc_array(pic_size_in_ctb, sizeof(*s->tab_slice_address));
+    s->qp_y_tab = av_malloc_array(pic_size_in_ctb, sizeof(*s->qp_y_tab));
+    if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
+        goto fail;
+
+    s->horizontal_bs = av_mallocz(2 * s->bs_width * (s->bs_height + 1));
+    s->vertical_bs   = av_mallocz(2 * s->bs_width * (s->bs_height + 1));
+    if (!s->horizontal_bs || !s->vertical_bs)
+        goto fail;
+
+    s->tab_mvf_pool = av_buffer_pool_init(pic_size_in_min_pu * sizeof(MvField),
+                                          av_buffer_alloc);
+    if (!s->tab_mvf_pool)
+        goto fail;
+
+    s->rpl_tab_pool = av_buffer_pool_init(ctb_count * sizeof(RefPicListTab),
+                                          av_buffer_allocz);
+    if (!s->rpl_tab_pool)
+        goto fail;
+
+    return 0;
+fail:
+    pic_arrays_free(s); // releases whatever was allocated before the failure
+    return AVERROR(ENOMEM);
+}
+
+/* Read the prediction weight table from gb into the s->sh luma/chroma weights and offsets for L0, and for L1 on B slices. */
+static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
+{
+    int i = 0, j = 0;
+    uint8_t luma_weight_l0_flag[16];
+    uint8_t chroma_weight_l0_flag[16];
+    uint8_t luma_weight_l1_flag[16];
+    uint8_t chroma_weight_l1_flag[16];
+
+    s->sh.luma_log2_weight_denom = av_clip_c(get_ue_golomb(gb), 0, 7); // used as a shift count below, so keep it in 0..7
+    if (s->sps->chroma_format_idc != 0) {
+        int delta = get_se_golomb(gb);
+        s->sh.chroma_log2_weight_denom = av_clip_c(s->sh.luma_log2_weight_denom + delta, 0, 7);
+    }
+
+    for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+        luma_weight_l0_flag[i] = get_bits1(gb);
+        if (!luma_weight_l0_flag[i]) { // no explicit weight: unit weight, zero offset
+            s->sh.luma_weight_l0[i] = 1 << s->sh.luma_log2_weight_denom;
+            s->sh.luma_offset_l0[i] = 0;
+        }
+    }
+    if (s->sps->chroma_format_idc != 0) { // FIXME: invert "if" and "for"
+        for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+            chroma_weight_l0_flag[i] = get_bits1(gb);
+        }
+    } else {
+        for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+            chroma_weight_l0_flag[i] = 0;
+        }
+    }
+    for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+        if (luma_weight_l0_flag[i]) {
+            int delta_luma_weight_l0 = get_se_golomb(gb);
+            s->sh.luma_weight_l0[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l0;
+            s->sh.luma_offset_l0[i] = get_se_golomb(gb);
+        }
+        if (chroma_weight_l0_flag[i]) {
+            for (j = 0; j < 2; j++) { // j selects the chroma component
+                int delta_chroma_weight_l0 = get_se_golomb(gb);
+                int delta_chroma_offset_l0 = get_se_golomb(gb);
+                s->sh.chroma_weight_l0[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l0;
+                s->sh.chroma_offset_l0[i][j] = av_clip_c((delta_chroma_offset_l0 - ((128 * s->sh.chroma_weight_l0[i][j])
+                                                                                     >> s->sh.chroma_log2_weight_denom) + 128), -128, 127);
+            }
+        } else {
+            s->sh.chroma_weight_l0[i][0] = 1 << s->sh.chroma_log2_weight_denom;
+            s->sh.chroma_offset_l0[i][0] = 0;
+            s->sh.chroma_weight_l0[i][1] = 1 << s->sh.chroma_log2_weight_denom;
+            s->sh.chroma_offset_l0[i][1] = 0;
+        }
+    }
+    if (s->sh.slice_type == B_SLICE) { // same layout again for list L1
+        for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+            luma_weight_l1_flag[i] = get_bits1(gb);
+            if (!luma_weight_l1_flag[i]) {
+                s->sh.luma_weight_l1[i] = 1 << s->sh.luma_log2_weight_denom;
+                s->sh.luma_offset_l1[i] = 0;
+            }
+        }
+        if (s->sps->chroma_format_idc != 0) {
+            for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+                chroma_weight_l1_flag[i] = get_bits1(gb);
+            }
+        } else {
+            for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+                chroma_weight_l1_flag[i] = 0;
+            }
+        }
+        for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+            if (luma_weight_l1_flag[i]) {
+                int delta_luma_weight_l1 = get_se_golomb(gb);
+                s->sh.luma_weight_l1[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l1;
+                s->sh.luma_offset_l1[i] = get_se_golomb(gb);
+            }
+            if (chroma_weight_l1_flag[i]) {
+                for (j = 0; j < 2; j++) {
+                    int delta_chroma_weight_l1 = get_se_golomb(gb);
+                    int delta_chroma_offset_l1 = get_se_golomb(gb);
+                    s->sh.chroma_weight_l1[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l1;
+                    s->sh.chroma_offset_l1[i][j] = av_clip_c((delta_chroma_offset_l1 - ((128 * s->sh.chroma_weight_l1[i][j])
+                                                                                         >> s->sh.chroma_log2_weight_denom) + 128), -128, 127);
+                }
+            } else {
+                s->sh.chroma_weight_l1[i][0] = 1 << s->sh.chroma_log2_weight_denom;
+                s->sh.chroma_offset_l1[i][0] = 0;
+                s->sh.chroma_weight_l1[i][1] = 1 << s->sh.chroma_log2_weight_denom;
+                s->sh.chroma_offset_l1[i][1] = 0;
+            }
+        }
+    }
+}
+
+/* Read the long-term reference picture entries of a slice header into rps.
+ * Returns 0 on success (rps->nb_refs stays 0 when the SPS has no long-term
+ * refs) or AVERROR_INVALIDDATA when the signalled counts are out of range. */
+static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb)
+{
+    const HEVCSPS *sps = s->sps;
+    int max_poc_lsb = 1 << sps->log2_max_poc_lsb;
+    int prev_delta_msb = 0;
+    int nb_sps = 0, nb_sh;
+    int i;
+
+    rps->nb_refs = 0;
+    if (!sps->long_term_ref_pics_present_flag)
+        return 0;
+
+    if (sps->num_long_term_ref_pics_sps > 0)
+        nb_sps = get_ue_golomb(gb);
+    nb_sh = get_ue_golomb(gb);
+    /* validate each count on its own so a negative one cannot cancel a large one */
+    if (nb_sps < 0 || nb_sps > sps->num_long_term_ref_pics_sps ||
+        nb_sh  < 0 || nb_sh > (int)FF_ARRAY_ELEMS(rps->poc) - nb_sps)
+        return AVERROR_INVALIDDATA;
+    rps->nb_refs = nb_sh + nb_sps;
+
+    for (i = 0; i < rps->nb_refs; i++) {
+        uint8_t delta_poc_msb_present;
+
+        if (i < nb_sps) { // the first nb_sps entries are taken from the SPS tables
+            uint8_t lt_idx_sps = 0;
+            if (sps->num_long_term_ref_pics_sps > 1)
+                lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
+            rps->poc[i]  = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
+            rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
+        } else {          // the remaining entries are coded directly in the slice header
+            rps->poc[i]  = get_bits(gb, sps->log2_max_poc_lsb);
+            rps->used[i] = get_bits1(gb);
+        }
+
+        delta_poc_msb_present = get_bits1(gb);
+        if (delta_poc_msb_present) {
+            int delta = get_ue_golomb(gb);
+            if (i && i != nb_sps) // deltas accumulate, restarting at the first entry of each group
+                delta += prev_delta_msb;
+
+            rps->poc[i] += s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
+            prev_delta_msb = delta;
+        }
+    }
+
+    return 0;
+}
+
+/* Parse the slice segment header from s->HEVClc.gb into s->sh, switching s->pps/s->sps as referenced; returns 0 or a negative AVERROR code. */
+static int hls_slice_header(HEVCContext *s)
+{
+    GetBitContext *gb = &s->HEVClc.gb;
+    SliceHeader   *sh = &s->sh;
+    int i, ret;
+
+    // Coded parameters
+    sh->first_slice_in_pic_flag = get_bits1(gb);
+    if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
+        s->seq_decode = (s->seq_decode + 1) & 0xff;
+        s->max_ra = INT_MAX;
+        if (IS_IDR(s))
+            ff_hevc_clear_refs(s);
+    }
+    if (s->nal_unit_type >= 16 && s->nal_unit_type <= 23)
+        sh->no_output_of_prior_pics_flag = get_bits1(gb);
+
+    sh->pps_id = get_ue_golomb(gb);
+    if (sh->pps_id >= MAX_PPS_COUNT || !s->pps_list[sh->pps_id]) {
+        av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
+        return AVERROR_INVALIDDATA;
+    }
+    s->pps = (HEVCPPS*)s->pps_list[sh->pps_id]->data;
+
+    if (s->sps != (HEVCSPS*)s->sps_list[s->pps->sps_id]->data) { // SPS changed: rebuild everything sized from it
+        s->sps = (HEVCSPS*)s->sps_list[s->pps->sps_id]->data;
+        s->vps = s->vps_list[s->sps->vps_id];
+
+        pic_arrays_free(s);
+        ret = pic_arrays_init(s);
+        if (ret < 0) {
+            s->sps = NULL;
+            return AVERROR(ENOMEM);
+        }
+
+        s->width  = s->sps->width;
+        s->height = s->sps->height;
+
+        s->avctx->coded_width  = s->sps->width;
+        s->avctx->coded_height = s->sps->height;
+        s->avctx->width        = s->sps->output_width;
+        s->avctx->height       = s->sps->output_height;
+        s->avctx->pix_fmt      = s->sps->pix_fmt;
+        s->avctx->sample_aspect_ratio = s->sps->vui.sar;
+        s->avctx->has_b_frames = s->sps->temporal_layer[s->sps->max_sub_layers - 1].num_reorder_pics;
+
+        if (s->sps->chroma_format_idc == 0 || s->sps->separate_colour_plane_flag) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "TODO: s->sps->chroma_format_idc == 0 || "
+                   "s->sps->separate_colour_plane_flag\n");
+            return AVERROR_PATCHWELCOME;
+        }
+
+        ff_hevc_pred_init(&s->hpc,     s->sps->bit_depth);
+        ff_hevc_dsp_init (&s->hevcdsp, s->sps->bit_depth);
+        ff_videodsp_init (&s->vdsp,    s->sps->bit_depth);
+
+        if (s->sps->sao_enabled) {
+            av_frame_unref(s->tmp_frame);
+            ret = ff_get_buffer(s->avctx, s->tmp_frame, 0);
+            if (ret < 0)
+                return ret;
+            s->frame = s->tmp_frame;
+        }
+    }
+
+    sh->dependent_slice_segment_flag = 0;
+    if (!sh->first_slice_in_pic_flag) {
+        int slice_address_length;
+
+        if (s->pps->dependent_slice_segments_enabled_flag)
+            sh->dependent_slice_segment_flag = get_bits1(gb);
+
+        slice_address_length = av_ceil_log2(s->sps->ctb_width *
+                                              s->sps->ctb_height);
+        sh->slice_segment_addr = get_bits(gb, slice_address_length);
+        if (sh->slice_segment_addr >= s->sps->ctb_width * s->sps->ctb_height) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid slice segment address: %u.\n",
+                   sh->slice_segment_addr);
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (!sh->dependent_slice_segment_flag) {
+            sh->slice_addr = sh->slice_segment_addr;
+            s->slice_idx++;
+        }
+    } else {
+        sh->slice_segment_addr = sh->slice_addr = 0;
+        s->slice_idx = 0;
+        s->slice_initialized = 0;
+    }
+
+    if (!sh->dependent_slice_segment_flag) { // independent segment: the full header is coded
+        s->slice_initialized = 0;
+
+        for (i = 0; i < s->pps->num_extra_slice_header_bits; i++)
+            skip_bits(gb, 1); // slice_reserved_undetermined_flag[]
+
+        sh->slice_type = get_ue_golomb(gb);
+        if (!(sh->slice_type == I_SLICE || sh->slice_type == P_SLICE ||
+              sh->slice_type == B_SLICE)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
+                   sh->slice_type);
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (s->pps->output_flag_present_flag)
+            sh->pic_output_flag = get_bits1(gb);
+
+        if (s->sps->separate_colour_plane_flag)
+            sh->colour_plane_id = get_bits(gb, 2);
+
+        if (!IS_IDR(s)) {
+            int short_term_ref_pic_set_sps_flag;
+            int poc;
+
+            sh->pic_order_cnt_lsb = get_bits(gb, s->sps->log2_max_poc_lsb);
+            poc = ff_hevc_compute_poc(s, sh->pic_order_cnt_lsb);
+            if (!sh->first_slice_in_pic_flag && poc != s->poc) {
+                av_log(s->avctx, AV_LOG_WARNING,
+                       "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+                poc = s->poc;
+            }
+            s->poc = poc;
+
+            short_term_ref_pic_set_sps_flag = get_bits1(gb);
+            if (!short_term_ref_pic_set_sps_flag) {
+                ret = ff_hevc_decode_short_term_rps(s, &sh->slice_rps, s->sps, 1);
+                if (ret < 0)
+                    return ret;
+
+                sh->short_term_rps = &sh->slice_rps;
+            } else {
+                int numbits, rps_idx;
+
+                if (!s->sps->nb_st_rps) {
+                    av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
+                    return AVERROR_INVALIDDATA;
+                }
+
+                numbits = av_ceil_log2(s->sps->nb_st_rps);
+                rps_idx = (numbits > 0) ? get_bits(gb, numbits) : 0;
+                sh->short_term_rps = &s->sps->st_rps[rps_idx];
+            }
+
+            ret = decode_lt_rps(s, &sh->long_term_rps, gb);
+            if (ret < 0) {
+                av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return AVERROR_INVALIDDATA;
+            }
+
+            if (s->sps->sps_temporal_mvp_enabled_flag)
+                sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
+            else
+                sh->slice_temporal_mvp_enabled_flag = 0;
+        } else {
+            s->sh.short_term_rps = NULL;
+            s->poc = 0;
+        }
+
+        if (s->temporal_id == 0 && // pocTid0 must not come from RASL/RADL or sub-layer non-reference pictures
+            s->nal_unit_type != NAL_TRAIL_N &&
+            s->nal_unit_type != NAL_TSA_N &&
+            s->nal_unit_type != NAL_STSA_N &&
+            s->nal_unit_type != NAL_RASL_N &&
+            s->nal_unit_type != NAL_RADL_N &&
+            s->nal_unit_type != NAL_RADL_R &&
+            s->nal_unit_type != NAL_RASL_R)
+            s->pocTid0 = s->poc;
+
+        if (s->sps->sao_enabled) {
+            sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
+            sh->slice_sample_adaptive_offset_flag[2] =
+            sh->slice_sample_adaptive_offset_flag[1] = get_bits1(gb);
+        } else {
+            sh->slice_sample_adaptive_offset_flag[0] = 0;
+            sh->slice_sample_adaptive_offset_flag[1] = 0;
+            sh->slice_sample_adaptive_offset_flag[2] = 0;
+        }
+
+        sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
+        if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE) {
+            int nb_refs;
+
+            sh->nb_refs[L0] = s->pps->num_ref_idx_l0_default_active;
+            if (sh->slice_type == B_SLICE)
+                sh->nb_refs[L1] = s->pps->num_ref_idx_l1_default_active;
+
+            if (get_bits1(gb)) { // num_ref_idx_active_override_flag
+                sh->nb_refs[L0] = get_ue_golomb(gb) + 1;
+                if (sh->slice_type == B_SLICE)
+                    sh->nb_refs[L1] = get_ue_golomb(gb) + 1;
+            }
+            if (sh->nb_refs[L0] > MAX_REFS || sh->nb_refs[L1] > MAX_REFS) {
+                av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
+                       sh->nb_refs[L0], sh->nb_refs[L1]);
+                return AVERROR_INVALIDDATA;
+            }
+
+            sh->rpl_modification_flag[0] = 0;
+            sh->rpl_modification_flag[1] = 0;
+            nb_refs = ff_hevc_frame_nb_refs(s);
+            if (!nb_refs) {
+                av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            if (s->pps->lists_modification_present_flag && nb_refs > 1) {
+                sh->rpl_modification_flag[0] = get_bits1(gb);
+                if (sh->rpl_modification_flag[0]) {
+                    for (i = 0; i < sh->nb_refs[L0]; i++)
+                        sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
+                }
+
+                if (sh->slice_type == B_SLICE) {
+                    sh->rpl_modification_flag[1] = get_bits1(gb);
+                    if (sh->rpl_modification_flag[1] == 1)
+                        for (i = 0; i < sh->nb_refs[L1]; i++)
+                            sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
+                }
+            }
+
+            if (sh->slice_type == B_SLICE)
+                sh->mvd_l1_zero_flag = get_bits1(gb);
+
+            if (s->pps->cabac_init_present_flag)
+                sh->cabac_init_flag = get_bits1(gb);
+            else
+                sh->cabac_init_flag = 0;
+
+            sh->collocated_ref_idx = 0;
+            if (sh->slice_temporal_mvp_enabled_flag) {
+                sh->collocated_list = L0;
+                if (sh->slice_type == B_SLICE)
+                    sh->collocated_list = !get_bits1(gb);
+
+                if (sh->nb_refs[sh->collocated_list] > 1) {
+                    sh->collocated_ref_idx = get_ue_golomb(gb);
+                    if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
+                        av_log(s->avctx, AV_LOG_ERROR,
+                               "Invalid collocated_ref_idx: %d.\n", sh->collocated_ref_idx);
+                        return AVERROR_INVALIDDATA;
+                    }
+                }
+            }
+
+            if ((s->pps->weighted_pred_flag   && sh->slice_type == P_SLICE) ||
+                (s->pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
+                pred_weight_table(s, gb);
+            }
+
+            sh->max_num_merge_cand = 5 - get_ue_golomb(gb);
+        }
+
+        sh->slice_qp_delta = get_se_golomb(gb);
+        if (s->pps->pic_slice_level_chroma_qp_offsets_present_flag) {
+            sh->slice_cb_qp_offset = get_se_golomb(gb);
+            sh->slice_cr_qp_offset = get_se_golomb(gb);
+        } else {
+            sh->slice_cb_qp_offset = 0;
+            sh->slice_cr_qp_offset = 0;
+        }
+
+        if (s->pps->deblocking_filter_control_present_flag) {
+            int deblocking_filter_override_flag = 0;
+
+            if (s->pps->deblocking_filter_override_enabled_flag)
+                deblocking_filter_override_flag = get_bits1(gb);
+
+            if (deblocking_filter_override_flag) {
+                sh->disable_deblocking_filter_flag = get_bits1(gb);
+                if (!sh->disable_deblocking_filter_flag) {
+                    sh->beta_offset = get_se_golomb(gb) * 2;
+                    sh->tc_offset   = get_se_golomb(gb) * 2;
+                }
+            } else { // no override: inherit the PPS deblocking settings
+                sh->disable_deblocking_filter_flag = s->pps->pps_disable_deblocking_filter_flag;
+                sh->beta_offset = s->pps->beta_offset;
+                sh->tc_offset = s->pps->tc_offset;
+            }
+        } else {
+            sh->disable_deblocking_filter_flag = 0;
+            sh->beta_offset = 0;
+            sh->tc_offset = 0;
+        }
+
+        if (s->pps->seq_loop_filter_across_slices_enabled_flag &&
+            (sh->slice_sample_adaptive_offset_flag[0] ||
+             sh->slice_sample_adaptive_offset_flag[1] ||
+             !sh->disable_deblocking_filter_flag)) {
+            sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
+        } else {
+            sh->slice_loop_filter_across_slices_enabled_flag = s->pps->seq_loop_filter_across_slices_enabled_flag;
+        }
+    } else if (!s->slice_initialized) {
+        av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    sh->num_entry_point_offsets = 0;
+    if (s->pps->tiles_enabled_flag || s->pps->entropy_coding_sync_enabled_flag) {
+        sh->num_entry_point_offsets = get_ue_golomb(gb);
+        if (sh->num_entry_point_offsets > 0) {
+            int offset_len = get_ue_golomb(gb) + 1;
+
+            for (i = 0; i < sh->num_entry_point_offsets; i++)
+                skip_bits(gb, offset_len); // the entry point offsets themselves are read and discarded
+        }
+    }
+
+    if (s->pps->slice_header_extension_present_flag) {
+        int length = get_ue_golomb(gb);
+        for (i = 0; i < length; i++)
+            skip_bits(gb, 8); // slice_header_extension_data_byte
+    }
+
+    // Inferred parameters
+    sh->slice_qp = 26 + s->pps->pic_init_qp_minus26 + sh->slice_qp_delta;
+    sh->slice_ctb_addr_rs = sh->slice_segment_addr;
+
+    s->HEVClc.first_qp_group = !s->sh.dependent_slice_segment_flag;
+
+    if (!s->pps->cu_qp_delta_enabled_flag)
+        s->HEVClc.qp_y = ((s->sh.slice_qp + 52 + 2 * s->sps->qp_bd_offset) %
+                          (52 + s->sps->qp_bd_offset)) - s->sps->qp_bd_offset;
+
+    s->slice_initialized = 1;
+
+    return 0;
+}
+
+/* Entry of the per-CTB table `tab` at CTB coordinates (x, y); rows are
+ * s->sps->ctb_width entries long. Requires `s` to be in scope. */
+#define CTB(tab, x, y) ((tab)[(y) * s->sps->ctb_width + (x)])
+
+/*
+ * Set sao->elem for the current CTB.
+ * Without any merge flag, `value` is evaluated and stored. With the left merge
+ * flag the element is copied from the CTB at (rx - 1, ry); with the up merge
+ * flag it is copied from the CTB at (rx, ry - 1). `value` is not evaluated on
+ * the merge paths. The final else cannot be reached: once the first two tests
+ * fail, sao_merge_up_flag is necessarily set.
+ * Requires `sao`, `s`, `rx`, `ry` and both merge flags to be in scope.
+ */
+#define SET_SAO(elem, value)                            \
+do {                                                    \
+    if (!sao_merge_up_flag && !sao_merge_left_flag)     \
+        sao->elem = value;                              \
+    else if (sao_merge_left_flag)                       \
+        sao->elem = CTB(s->sao, rx-1, ry).elem;         \
+    else if (sao_merge_up_flag)                         \
+        sao->elem = CTB(s->sao, rx, ry-1).elem;         \
+    else                                                \
+        sao->elem = 0;                                  \
+} while (0)
+
+/**
+ * Fill the SAO parameters of one CTB, either by decoding them or by copying
+ * them from the left/up neighbour when a merge flag is set.
+ *
+ * @param s HEVC decoding context
+ * @param rx horizontal CTB coordinate, used as column index into s->sao
+ * @param ry vertical CTB coordinate, used as row index into s->sao
+ */
+static void hls_sao_param(HEVCContext *s, int rx, int ry)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    int sao_merge_left_flag = 0;
+    int sao_merge_up_flag   = 0;
+    // offsets are scaled up only for bit depths above 10
+    int shift = s->sps->bit_depth - FFMIN(s->sps->bit_depth, 10);
+    SAOParams *sao = &CTB(s->sao, rx, ry);
+    int c_idx, i;
+
+    // Merge flags are only read when SAO is enabled for at least one of the
+    // two flagged component groups and the neighbour is flagged as usable
+    // (lc->ctb_left_flag / lc->ctb_up_flag). The up flag is skipped when the
+    // left merge was selected.
+    if (s->sh.slice_sample_adaptive_offset_flag[0] ||
+        s->sh.slice_sample_adaptive_offset_flag[1]) {
+        if (rx > 0) {
+            if (lc->ctb_left_flag)
+                sao_merge_left_flag = ff_hevc_sao_merge_flag_decode(s);
+        }
+        if (ry > 0 && !sao_merge_left_flag) {
+            if (lc->ctb_up_flag)
+                sao_merge_up_flag = ff_hevc_sao_merge_flag_decode(s);
+        }
+    }
+
+    for (c_idx = 0; c_idx < 3; c_idx++) {
+        if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
+            sao->type_idx[c_idx] = SAO_NOT_APPLIED;
+            continue;
+        }
+
+        // The third component reuses type and edge class of the second one;
+        // only its offsets (and band position) are handled separately below.
+        if (c_idx == 2) {
+            sao->type_idx[2] = sao->type_idx[1];
+            sao->eo_class[2] = sao->eo_class[1];
+        } else {
+            SET_SAO(type_idx[c_idx], ff_hevc_sao_type_idx_decode(s));
+        }
+
+        if (sao->type_idx[c_idx] == SAO_NOT_APPLIED)
+            continue;
+
+        for (i = 0; i < 4; i++)
+            SET_SAO(offset_abs[c_idx][i], ff_hevc_sao_offset_abs_decode(s));
+
+        if (sao->type_idx[c_idx] == SAO_BAND) {
+            // Band offset: a sign is handled only for non-zero magnitudes.
+            for (i = 0; i < 4; i++) {
+                if (sao->offset_abs[c_idx][i]) {
+                    SET_SAO(offset_sign[c_idx][i], ff_hevc_sao_offset_sign_decode(s));
+                } else {
+                    sao->offset_sign[c_idx][i] = 0;
+                }
+            }
+            SET_SAO(band_position[c_idx], ff_hevc_sao_band_position_decode(s));
+        } else if (c_idx != 2) {
+            SET_SAO(eo_class[c_idx], ff_hevc_sao_eo_class_decode(s));
+        }
+
+        // Inferred parameters
+        sao->offset_val[c_idx][0] = 0;   //avoid undefined values
+        // offset_val[1..4] hold the scaled magnitudes; for edge offset the
+        // last two entries are negated, otherwise the sign flag decides.
+        for (i = 0; i < 4; i++) {
+            sao->offset_val[c_idx][i + 1] = sao->offset_abs[c_idx][i] << shift;
+            if (sao->type_idx[c_idx] == SAO_EDGE) {
+                if (i > 1)
+                    sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
+            } else if (sao->offset_sign[c_idx][i]) {
+                sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
+            }
+        }
+    }
+}
+
+#undef SET_SAO
+#undef CTB
+
+/**
+ * Parse the coefficients of one transform block, scale them (unless the CU
+ * uses transquant bypass) and hand them, together with a pointer into plane
+ * c_idx of s->frame, to the matching hevcdsp routine.
+ *
+ * @param s HEVC decoding context
+ * @param x0 horizontal block position; shifted right by sps->hshift[c_idx]
+ *           to address the plane
+ * @param y0 vertical block position; shifted right by sps->vshift[c_idx]
+ *           to address the plane
+ * @param log2_trafo_size log2 of the block size; coefficients are stored
+ *                        with a row pitch of 1 << log2_trafo_size
+ * @param scan_idx coefficient scan order (SCAN_DIAG, SCAN_HORIZ; any other
+ *                 value is handled as SCAN_VERT)
+ * @param c_idx plane index
+ */
+static void hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                int log2_trafo_size, enum ScanType scan_idx,
+                                int c_idx)
+{
+// Set x_c/y_c to the in-block coordinates of scan position n inside the
+// coefficient group selected by offset (offset >> 4 is the group index).
+#define GET_COORD(offset, n)                                    \
+    do {                                                        \
+        x_c = (scan_x_cg[offset >> 4] << 2) + scan_x_off[n];    \
+        y_c = (scan_y_cg[offset >> 4] << 2) + scan_y_off[n];    \
+    } while (0)
+    HEVCLocalContext *lc = &s->HEVClc;
+    int transform_skip_flag = 0;
+
+    int last_significant_coeff_x, last_significant_coeff_y;
+    int last_scan_pos;
+    int n_end;
+    int num_coeff = 0;
+    int num_last_subset;
+    int x_cg_last_sig, y_cg_last_sig;
+
+    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+
+    ptrdiff_t stride = s->frame->linesize[c_idx];
+    int hshift = s->sps->hshift[c_idx];
+    int vshift = s->sps->vshift[c_idx];
+    uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+                                           ((x0 >> hshift) << s->sps->pixel_shift)];
+    DECLARE_ALIGNED( 16, int16_t, coeffs[MAX_TB_SIZE * MAX_TB_SIZE] ) = {0};
+
+    int trafo_size = 1 << log2_trafo_size;
+    int i;
+    int qp,shift,add,scale,scale_m;
+    const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
+    // only assigned (and only read) when sps->scaling_list_enable_flag is set
+    const uint8_t *scale_matrix;
+    uint8_t dc_scale;
+
+    // Derive QP for dequant
+    if (!lc->cu.cu_transquant_bypass_flag) {
+        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+        int qp_y = lc->qp_y;
+
+        if (c_idx == 0) {
+            qp = qp_y + s->sps->qp_bd_offset;
+        } else {
+            int qp_i, offset;
+
+            if (c_idx == 1)
+                offset = s->pps->cb_qp_offset + s->sh.slice_cb_qp_offset;
+            else
+                offset = s->pps->cr_qp_offset + s->sh.slice_cr_qp_offset;
+
+            // chroma QP: identity below 30, table lookup for 30..43,
+            // qp_i - 6 above
+            qp_i = av_clip_c(qp_y + offset, - s->sps->qp_bd_offset, 57);
+            if (qp_i < 30)
+                qp = qp_i;
+            else if (qp_i > 43)
+                qp = qp_i - 6;
+            else
+                qp = qp_c[qp_i - 30];
+
+            qp += s->sps->qp_bd_offset;
+        }
+
+        shift    = s->sps->bit_depth + log2_trafo_size - 5;
+        add      = 1 << (shift-1);
+        scale    = level_scale[qp%6] << (qp/6);
+        scale_m  = 16; // default when no custom scaling lists.
+        dc_scale = 16;
+
+        if (s->sps->scaling_list_enable_flag) {
+            // the PPS scaling list takes precedence over the SPS one
+            const ScalingList *sl = s->pps->pps_scaling_list_data_present_flag ?
+                                    &s->pps->scaling_list : &s->sps->scaling_list;
+            int matrix_id = lc->cu.pred_mode != MODE_INTRA;
+
+            if (log2_trafo_size != 5)
+                matrix_id = 3 * matrix_id + c_idx;
+
+            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+            if (log2_trafo_size >= 4)
+                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
+        }
+    }
+
+    memset(lc->rc.significant_coeff_group_flag, 0, 8 * 8);
+
+    // transform_skip_flag is only read for 4x4 blocks
+    if (s->pps->transform_skip_enabled_flag && !lc->cu.cu_transquant_bypass_flag &&
+        log2_trafo_size == 2) {
+        transform_skip_flag = ff_hevc_transform_skip_flag_decode(s, c_idx);
+    }
+
+    // Position of the last significant coefficient: both prefixes are read
+    // first, then a suffix for each prefix larger than 3.
+    last_significant_coeff_x =
+        ff_hevc_last_significant_coeff_x_prefix_decode(s, c_idx, log2_trafo_size);
+    last_significant_coeff_y =
+        ff_hevc_last_significant_coeff_y_prefix_decode(s, c_idx, log2_trafo_size);
+
+    if (last_significant_coeff_x > 3) {
+        int suffix = ff_hevc_last_significant_coeff_suffix_decode(s, last_significant_coeff_x);
+        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
+                                   (2 + (last_significant_coeff_x & 1)) +
+                                   suffix;
+    }
+
+    if (last_significant_coeff_y > 3) {
+        int suffix = ff_hevc_last_significant_coeff_suffix_decode(s, last_significant_coeff_y);
+        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
+                                   (2 + (last_significant_coeff_y & 1)) +
+                                   suffix;
+    }
+
+    if (scan_idx == SCAN_VERT)
+        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
+
+    // coefficient group (4x4) containing the last significant coefficient
+    x_cg_last_sig = last_significant_coeff_x >> 2;
+    y_cg_last_sig = last_significant_coeff_y >> 2;
+
+    // Select the scan tables (group order and order inside a group) and turn
+    // the last position into a scan index; num_coeff becomes index + 1 below.
+    switch (scan_idx) {
+    case SCAN_DIAG: {
+        int last_x_c = last_significant_coeff_x & 3;
+        int last_y_c = last_significant_coeff_y & 3;
+
+        scan_x_off = ff_hevc_diag_scan4x4_x;
+        scan_y_off = ff_hevc_diag_scan4x4_y;
+        num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
+        if (trafo_size == 4) {
+            scan_x_cg = scan_1x1;
+            scan_y_cg = scan_1x1;
+        } else if (trafo_size == 8) {
+            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = diag_scan2x2_x;
+            scan_y_cg = diag_scan2x2_y;
+        } else if (trafo_size == 16) {
+            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = ff_hevc_diag_scan4x4_x;
+            scan_y_cg = ff_hevc_diag_scan4x4_y;
+        } else { // trafo_size == 32
+            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+            scan_x_cg = ff_hevc_diag_scan8x8_x;
+            scan_y_cg = ff_hevc_diag_scan8x8_y;
+        }
+        break;
+    }
+    case SCAN_HORIZ:
+        scan_x_cg = horiz_scan2x2_x;
+        scan_y_cg = horiz_scan2x2_y;
+        scan_x_off = horiz_scan4x4_x;
+        scan_y_off = horiz_scan4x4_y;
+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
+        break;
+    default: //SCAN_VERT
+        // the vertical scan reuses the horizontal tables with x and y swapped
+        scan_x_cg = horiz_scan2x2_y;
+        scan_y_cg = horiz_scan2x2_x;
+        scan_x_off = horiz_scan4x4_y;
+        scan_y_off = horiz_scan4x4_x;
+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
+        break;
+    }
+    num_coeff++;
+
+    num_last_subset = (num_coeff - 1) >> 4;
+
+    // Walk the 16-coefficient subsets from the one holding the last
+    // significant coefficient down to subset 0.
+    for (i = num_last_subset; i >= 0; i--) {
+        int n, m;
+        int first_nz_pos_in_cg, last_nz_pos_in_cg, num_sig_coeff, first_greater1_coeff_idx;
+        int sign_hidden;
+        int sum_abs;
+        int x_cg, y_cg, x_c, y_c, pos;
+        int implicit_non_zero_coeff = 0;
+        int64_t trans_coeff_level;
+
+        int offset = i << 4;
+
+        // scan positions (within this subset) of the significant
+        // coefficients, in decoding order
+        uint8_t significant_coeff_flag_idx[16] = {0};
+        uint8_t coeff_abs_level_greater1_flag[16] = {0};
+        uint8_t coeff_abs_level_greater2_flag[16] = {0};
+        // sign bits, left-aligned so the next sign is always bit 15
+        uint16_t coeff_sign_flag;
+        uint8_t nb_significant_coeff_flag = 0;
+
+        int first_elem;
+
+        x_cg = scan_x_cg[i];
+        y_cg = scan_y_cg[i];
+
+        // The group flag is read only for the subsets strictly between the
+        // first and the last one; for those two it is inferred.
+        if ((i < num_last_subset) && (i > 0)) {
+            lc->rc.significant_coeff_group_flag[x_cg][y_cg] =
+            ff_hevc_significant_coeff_group_flag_decode(s, c_idx, x_cg, y_cg,
+                                                        log2_trafo_size);
+            implicit_non_zero_coeff = 1;
+        } else {
+            lc->rc.significant_coeff_group_flag[x_cg][y_cg] =
+            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
+             (x_cg == 0 && y_cg == 0));
+        }
+
+        last_scan_pos = num_coeff - offset - 1;
+
+        if (i == num_last_subset) {
+            // the last significant coefficient itself needs no flag
+            n_end = last_scan_pos - 1;
+            significant_coeff_flag_idx[0] = last_scan_pos;
+            nb_significant_coeff_flag = 1;
+        } else {
+            n_end = 15;
+        }
+        for (n = n_end; n >= 0; n--) {
+            GET_COORD(offset, n);
+
+            // Position 0 of a group whose flag was read is not coded when no
+            // other coefficient of the group turned out significant.
+            if (lc->rc.significant_coeff_group_flag[x_cg][y_cg] &&
+                (n > 0 || implicit_non_zero_coeff == 0)) {
+                if (ff_hevc_significant_coeff_flag_decode(s, c_idx, x_c, y_c, log2_trafo_size, scan_idx) == 1) {
+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
+                    nb_significant_coeff_flag = nb_significant_coeff_flag + 1;
+                    implicit_non_zero_coeff = 0;
+                }
+            } else {
+                // uncoded top-left position of a flagged group: significant
+                int last_cg = (x_c == (x_cg << 2) && y_c == (y_cg << 2));
+                if (last_cg && implicit_non_zero_coeff && lc->rc.significant_coeff_group_flag[x_cg][y_cg]) {
+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
+                    nb_significant_coeff_flag = nb_significant_coeff_flag + 1;
+                }
+            }
+        }
+
+        n_end = nb_significant_coeff_flag;
+
+        first_nz_pos_in_cg = 16;
+        last_nz_pos_in_cg = -1;
+        num_sig_coeff = 0;
+        first_greater1_coeff_idx = -1;
+        // greater1 flags are read for at most the first 8 significant
+        // coefficients of the subset
+        for (m = 0; m < n_end; m++) {
+            n = significant_coeff_flag_idx[m];
+            if (num_sig_coeff < 8) {
+                coeff_abs_level_greater1_flag[n] =
+                ff_hevc_coeff_abs_level_greater1_flag_decode(s, c_idx, i, n,
+                                                             (num_sig_coeff == 0),
+                                                             (i == num_last_subset));
+                num_sig_coeff++;
+                if (coeff_abs_level_greater1_flag[n] &&
+                    first_greater1_coeff_idx == -1)
+                    first_greater1_coeff_idx = n;
+            }
+            if (last_nz_pos_in_cg == -1)
+                last_nz_pos_in_cg = n;
+            first_nz_pos_in_cg = n;
+        }
+
+        sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 &&
+                       !lc->cu.cu_transquant_bypass_flag);
+        // a single greater2 flag, for the first coefficient with greater1 set
+        if (first_greater1_coeff_idx != -1) {
+            coeff_abs_level_greater2_flag[first_greater1_coeff_idx] =
+            ff_hevc_coeff_abs_level_greater2_flag_decode(s, c_idx, i, first_greater1_coeff_idx);
+        }
+        // with sign data hiding active one sign fewer is read
+        if (!s->pps->sign_data_hiding_flag || !sign_hidden ) {
+            coeff_sign_flag = ff_hevc_coeff_sign_flag(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
+        } else {
+            coeff_sign_flag = ff_hevc_coeff_sign_flag(s, nb_significant_coeff_flag-1) << (16 - (nb_significant_coeff_flag - 1));
+        }
+
+        num_sig_coeff = 0;
+        sum_abs = 0;
+        first_elem = 1;
+        // Second pass: finish the absolute levels, apply signs, scale and
+        // store the coefficients.
+        for (m = 0; m < n_end; m++) {
+            n = significant_coeff_flag_idx[m];
+            GET_COORD(offset, n);
+            trans_coeff_level = 1 + coeff_abs_level_greater1_flag[n] +
+                                coeff_abs_level_greater2_flag[n];
+            // a remainder follows when the flags read so far could not
+            // express a larger level
+            if (trans_coeff_level == ((num_sig_coeff < 8) ?
+                                      ((n == first_greater1_coeff_idx) ? 3 : 2) : 1)) {
+                trans_coeff_level += ff_hevc_coeff_abs_level_remaining(s, first_elem, trans_coeff_level);
+                first_elem = 0;
+            }
+            // hidden sign: taken from the parity of the sum of levels
+            if (s->pps->sign_data_hiding_flag && sign_hidden) {
+                sum_abs += trans_coeff_level;
+                if (n == first_nz_pos_in_cg && ((sum_abs&1) == 1))
+                    trans_coeff_level = -trans_coeff_level;
+            }
+            if (coeff_sign_flag >> 15)
+                trans_coeff_level = -trans_coeff_level;
+            coeff_sign_flag <<= 1;
+            num_sig_coeff++;
+
+            if (!lc->cu.cu_transquant_bypass_flag) {
+                if(s->sps->scaling_list_enable_flag) {
+                    // the DC coefficient of 16x16 and larger blocks uses
+                    // dc_scale; everything else indexes the scaling matrix
+                    if(y_c || x_c || log2_trafo_size < 4) {
+                        switch(log2_trafo_size) {
+                            case 3: pos = (y_c << 3) + x_c; break;
+                            case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
+                            case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
+                            default: pos = (y_c << 2) + x_c;
+                        }
+                        scale_m = scale_matrix[pos];
+                    } else
+                        scale_m = dc_scale;
+                }
+                trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
+                // saturate to the int16_t range
+                // NOTE(review): the mask in the negative branch has 15 hex
+                // digits (top nibble clear) while the positive one has 16;
+                // looks unintentional - confirm.
+                if (trans_coeff_level < 0) {
+                    if((~trans_coeff_level) & 0xFffffffffff8000)
+                        trans_coeff_level = -32768;
+                } else {
+                    if(trans_coeff_level & 0xffffffffffff8000)
+                        trans_coeff_level = 32767;
+                }
+            }
+
+            coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+        }
+    }
+
+    // Pick the hevcdsp routine: bypass, transform skip, the dedicated 4x4
+    // intra luma transform, or the regular transform for this block size.
+    if (lc->cu.cu_transquant_bypass_flag) {
+        s->hevcdsp.transquant_bypass[log2_trafo_size-2](dst, coeffs, stride);
+    } else {
+        if (transform_skip_flag)
+            s->hevcdsp.transform_skip(dst, coeffs, stride);
+        else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2)
+            s->hevcdsp.transform_4x4_luma_add(dst, coeffs, stride);
+        else
+            s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+    }
+}
+
+/**
+ * Handle one transform unit: run intra prediction for intra CUs, then, when
+ * any coded block flag is set, read the CU QP delta (unless
+ * lc->tu.is_cu_qp_delta_coded is already set) and decode the residuals.
+ *
+ * For luma blocks larger than 4x4 the chroma blocks are handled together
+ * with the luma block at (x0, y0); for 4x4 luma blocks chroma is handled
+ * once, at (xBase, yBase), when blk_idx is 3.
+ *
+ * @param s HEVC decoding context
+ * @param x0 horizontal position of the transform unit
+ * @param y0 vertical position of the transform unit
+ * @param xBase horizontal position of the parent transform tree node
+ * @param yBase vertical position of the parent transform tree node
+ * @param cb_xBase horizontal position passed on to ff_hevc_set_qPy()
+ * @param cb_yBase vertical position passed on to ff_hevc_set_qPy()
+ * @param log2_cb_size log2 size passed on to ff_hevc_set_qPy()
+ * @param log2_trafo_size log2 of the luma transform block size
+ * @param trafo_depth depth used to index the chroma CBF tables
+ * @param blk_idx index of this unit among its siblings
+ */
+static void hls_transform_unit(HEVCContext *s, int x0, int  y0, int xBase, int yBase, int cb_xBase, int cb_yBase,
+                               int log2_cb_size, int log2_trafo_size, int trafo_depth, int blk_idx)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    int luma_scan   = SCAN_DIAG;
+    int chroma_scan = SCAN_DIAG;
+
+    /* Prediction comes first so the residual can be applied on top of it. */
+    if (lc->cu.pred_mode == MODE_INTRA) {
+        int pred_size = 1 << log2_trafo_size;
+
+        ff_hevc_set_neighbour_available(s, x0, y0, pred_size, pred_size);
+        s->hpc.intra_pred(s, x0, y0, log2_trafo_size, 0);
+
+        if (log2_trafo_size > 2) {
+            pred_size <<= s->sps->hshift[1] - 1;
+            ff_hevc_set_neighbour_available(s, x0, y0, pred_size, pred_size);
+            s->hpc.intra_pred(s, x0, y0, log2_trafo_size - 1, 1);
+            s->hpc.intra_pred(s, x0, y0, log2_trafo_size - 1, 2);
+        } else if (blk_idx == 3) {
+            pred_size <<= s->sps->hshift[1];
+            ff_hevc_set_neighbour_available(s, xBase, yBase, pred_size, pred_size);
+            s->hpc.intra_pred(s, xBase, yBase, log2_trafo_size, 1);
+            s->hpc.intra_pred(s, xBase, yBase, log2_trafo_size, 2);
+        }
+    }
+
+    /* Nothing else to parse when no component carries a residual. */
+    if (!lc->tt.cbf_luma &&
+        !SAMPLE_CBF(lc->tt.cbf_cb[trafo_depth], x0, y0) &&
+        !SAMPLE_CBF(lc->tt.cbf_cr[trafo_depth], x0, y0))
+        return;
+
+    if (s->pps->cu_qp_delta_enabled_flag && !lc->tu.is_cu_qp_delta_coded) {
+        lc->tu.cu_qp_delta = ff_hevc_cu_qp_delta_abs(s);
+        if (lc->tu.cu_qp_delta != 0 && ff_hevc_cu_qp_delta_sign_flag(s) == 1)
+            lc->tu.cu_qp_delta = -lc->tu.cu_qp_delta;
+        lc->tu.is_cu_qp_delta_coded = 1;
+        ff_hevc_set_qPy(s, x0, y0, cb_xBase, cb_yBase, log2_cb_size);
+    }
+
+    /* Small intra blocks pick the scan from the prediction mode:
+     * modes 22..30 scan horizontally, modes 6..14 vertically. */
+    if (lc->cu.pred_mode == MODE_INTRA && log2_trafo_size < 4) {
+        if (lc->tu.cur_intra_pred_mode >= 22 &&
+            lc->tu.cur_intra_pred_mode <= 30)
+            luma_scan = SCAN_HORIZ;
+        else if (lc->tu.cur_intra_pred_mode >= 6 &&
+                 lc->tu.cur_intra_pred_mode <= 14)
+            luma_scan = SCAN_VERT;
+
+        if (lc->pu.intra_pred_mode_c >= 22 &&
+            lc->pu.intra_pred_mode_c <= 30)
+            chroma_scan = SCAN_HORIZ;
+        else if (lc->pu.intra_pred_mode_c >= 6 &&
+                 lc->pu.intra_pred_mode_c <= 14)
+            chroma_scan = SCAN_VERT;
+    }
+
+    if (lc->tt.cbf_luma)
+        hls_residual_coding(s, x0, y0, log2_trafo_size, luma_scan, 0);
+
+    if (log2_trafo_size > 2) {
+        if (SAMPLE_CBF(lc->tt.cbf_cb[trafo_depth], x0, y0))
+            hls_residual_coding(s, x0, y0, log2_trafo_size - 1, chroma_scan, 1);
+        if (SAMPLE_CBF(lc->tt.cbf_cr[trafo_depth], x0, y0))
+            hls_residual_coding(s, x0, y0, log2_trafo_size - 1, chroma_scan, 2);
+    } else if (blk_idx == 3) {
+        if (SAMPLE_CBF(lc->tt.cbf_cb[trafo_depth], xBase, yBase))
+            hls_residual_coding(s, xBase, yBase, log2_trafo_size, chroma_scan, 1);
+        if (SAMPLE_CBF(lc->tt.cbf_cr[trafo_depth], xBase, yBase))
+            hls_residual_coding(s, xBase, yBase, log2_trafo_size, chroma_scan, 2);
+    }
+}
+
+/**
+ * Store the value 2 in s->is_pcm for every minimum-size PU covered by the
+ * block at (x0, y0), clipped to the picture dimensions.
+ *
+ * @param s HEVC decoding context
+ * @param x0 horizontal position of the block
+ * @param y0 vertical position of the block
+ * @param log2_cb_size log2 of the block size
+ */
+static void set_deblocking_bypass(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{
+    const int log2_pu = s->sps->log2_min_pu_size;
+    const int row_len = s->sps->width >> s->sps->log2_min_pu_size;
+    const int block   = 1 << log2_cb_size;
+    const int x_stop  = FFMIN(x0 + block, s->sps->width)  >> log2_pu;
+    const int y_stop  = FFMIN(y0 + block, s->sps->height) >> log2_pu;
+    int px, py;
+
+    for (py = y0 >> log2_pu; py < y_stop; py++) {
+        for (px = x0 >> log2_pu; px < x_stop; px++)
+            s->is_pcm[px + py * row_len] = 2;
+    }
+}
+
+/**
+ * Parse one node of the transform tree: derive or decode the split flag and
+ * the chroma coded block flags, then either recurse into the four quadrants
+ * or process the node as a transform unit.
+ *
+ * @param s HEVC decoding context
+ * @param x0 horizontal position of this node
+ * @param y0 vertical position of this node
+ * @param xBase horizontal position of the parent node
+ * @param yBase vertical position of the parent node
+ * @param cb_xBase horizontal position forwarded to hls_transform_unit()
+ * @param cb_yBase vertical position forwarded to hls_transform_unit()
+ * @param log2_cb_size log2 size forwarded to hls_transform_unit()
+ * @param log2_trafo_size log2 of the size of this node
+ * @param trafo_depth depth of this node (0 for the root)
+ * @param blk_idx index of this node among its siblings
+ */
+static void hls_transform_tree(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase,
+                               int log2_cb_size, int log2_trafo_size, int trafo_depth, int blk_idx)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    const int node_size          = 1 << log2_trafo_size;
+    const int forced_intra_split = lc->cu.intra_split_flag && trafo_depth == 0;
+    uint8_t split;
+
+    /* A 4x4 node below the root inherits the chroma CBFs of its parent;
+     * every other node starts from zero. */
+    if (trafo_depth > 0 && log2_trafo_size == 2) {
+        SAMPLE_CBF(lc->tt.cbf_cb[trafo_depth], x0, y0) =
+            SAMPLE_CBF(lc->tt.cbf_cb[trafo_depth - 1], xBase, yBase);
+        SAMPLE_CBF(lc->tt.cbf_cr[trafo_depth], x0, y0) =
+            SAMPLE_CBF(lc->tt.cbf_cr[trafo_depth - 1], xBase, yBase);
+    } else {
+        SAMPLE_CBF(lc->tt.cbf_cr[trafo_depth], x0, y0) = 0;
+        SAMPLE_CBF(lc->tt.cbf_cb[trafo_depth], x0, y0) = 0;
+    }
+
+    /* Intra mode in effect for the luma blocks below this node. */
+    if (!lc->cu.intra_split_flag)
+        lc->tu.cur_intra_pred_mode = lc->pu.intra_pred_mode[0];
+    else if (trafo_depth == 1)
+        lc->tu.cur_intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
+
+    lc->tt.cbf_luma = 1;
+
+    lc->tt.inter_split_flag = (s->sps->max_transform_hierarchy_depth_inter == 0 &&
+                               lc->cu.pred_mode == MODE_INTER &&
+                               lc->cu.part_mode != PART_2Nx2N && trafo_depth == 0);
+
+    /* The split flag is read only when both outcomes are allowed;
+     * otherwise it is inferred. */
+    if (log2_trafo_size <= s->sps->log2_max_trafo_size &&
+        log2_trafo_size > s->sps->log2_min_transform_block_size &&
+        trafo_depth < lc->cu.max_trafo_depth &&
+        !forced_intra_split)
+        split = ff_hevc_split_transform_flag_decode(s, log2_trafo_size);
+    else
+        split = (log2_trafo_size > s->sps->log2_max_trafo_size ||
+                 forced_intra_split ||
+                 lc->tt.inter_split_flag);
+
+    /* Chroma CBFs are read at the root, or when the parent had them set. */
+    if (log2_trafo_size > 2) {
+        if (trafo_depth == 0 ||
+            SAMPLE_CBF(lc->tt.cbf_cb[trafo_depth - 1], xBase, yBase))
+            SAMPLE_CBF(lc->tt.cbf_cb[trafo_depth], x0, y0) =
+                ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+
+        if (trafo_depth == 0 ||
+            SAMPLE_CBF(lc->tt.cbf_cr[trafo_depth - 1], xBase, yBase))
+            SAMPLE_CBF(lc->tt.cbf_cr[trafo_depth], x0, y0) =
+                ff_hevc_cbf_cb_cr_decode(s, trafo_depth);
+    }
+
+    if (split) {
+        const int half = node_size >> 1;
+        int quad;
+
+        /* quadrants in the order top-left, top-right, bottom-left,
+         * bottom-right */
+        for (quad = 0; quad < 4; quad++)
+            hls_transform_tree(s, x0 + (quad & 1) * half, y0 + (quad >> 1) * half,
+                               x0, y0, cb_xBase, cb_yBase, log2_cb_size,
+                               log2_trafo_size - 1, trafo_depth + 1, quad);
+    } else {
+        const int log2_min_tu = s->sps->log2_min_transform_block_size;
+        const int min_tu      = 1 << s->sps->log2_min_transform_block_size;
+        const int tu_row_len  = s->sps->width >> log2_min_tu;
+        int dx, dy;
+
+        if (lc->cu.pred_mode == MODE_INTRA || trafo_depth != 0 ||
+            SAMPLE_CBF(lc->tt.cbf_cb[trafo_depth], x0, y0) ||
+            SAMPLE_CBF(lc->tt.cbf_cr[trafo_depth], x0, y0))
+            lc->tt.cbf_luma = ff_hevc_cbf_luma_decode(s, trafo_depth);
+
+        hls_transform_unit(s, x0, y0, xBase, yBase, cb_xBase, cb_yBase,
+                           log2_cb_size, log2_trafo_size, trafo_depth, blk_idx);
+
+        // TODO: store cbf_luma somewhere else
+        if (lc->tt.cbf_luma) {
+            for (dy = 0; dy < node_size; dy += min_tu) {
+                const int tu_row = (y0 + dy) >> log2_min_tu;
+
+                for (dx = 0; dx < node_size; dx += min_tu) {
+                    const int tu_col = (x0 + dx) >> log2_min_tu;
+
+                    s->cbf_luma[tu_row * tu_row_len + tu_col] = 1;
+                }
+            }
+        }
+
+        if (!s->sh.disable_deblocking_filter_flag) {
+            ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_trafo_size,
+                                                  lc->slice_or_tiles_up_boundary,
+                                                  lc->slice_or_tiles_left_boundary);
+            if (s->pps->transquant_bypass_enable_flag && lc->cu.cu_transquant_bypass_flag)
+                set_deblocking_bypass(s, x0, y0, log2_trafo_size);
+        }
+    }
+}
+
+/**
+ * Read the raw PCM samples of one coding block and store them in the three
+ * planes of s->frame.
+ *
+ * The chroma planes are written as (cb_size / 2) squares, i.e. 4:2:0 layout.
+ * NOTE(review): sps->pcm.bit_depth is used for all three planes - confirm
+ * that no separate chroma PCM bit depth is expected here.
+ *
+ * @param s HEVC decoding context
+ * @param x0 horizontal position of the coding block
+ * @param y0 vertical position of the coding block
+ * @param log2_cb_size log2 of the coding block size
+ * @return 0 on success, the error returned by init_get_bits() otherwise
+ */
+static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{
+    //TODO: non-4:2:0 support
+    HEVCLocalContext *lc = &s->HEVClc;
+    GetBitContext gb;
+    const int cb_size    = 1 << log2_cb_size;
+    const int luma_count = cb_size * cb_size;
+    /* one full-size plane plus two quarter-size planes, in bits */
+    const int length     = luma_count * s->sps->pcm.bit_depth +
+                           (luma_count >> 1) * s->sps->pcm.bit_depth;
+    int luma_stride = s->frame->linesize[0];
+    int cb_stride   = s->frame->linesize[1];
+    int cr_stride   = s->frame->linesize[2];
+    uint8_t *luma = &s->frame->data[0][y0 * luma_stride + (x0 << s->sps->pixel_shift)];
+    uint8_t *cb   = &s->frame->data[1][(y0 >> s->sps->vshift[1]) * cb_stride +
+                                       ((x0 >> s->sps->hshift[1]) << s->sps->pixel_shift)];
+    uint8_t *cr   = &s->frame->data[2][(y0 >> s->sps->vshift[2]) * cr_stride +
+                                       ((x0 >> s->sps->hshift[2]) << s->sps->pixel_shift)];
+    /* advance past the PCM payload and keep a pointer to its start */
+    const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
+    int err;
+
+    ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size,
+                                          lc->slice_or_tiles_up_boundary,
+                                          lc->slice_or_tiles_left_boundary);
+
+    err = init_get_bits(&gb, pcm, length);
+    if (err < 0)
+        return err;
+
+    s->hevcdsp.put_pcm(luma, luma_stride, cb_size,     &gb, s->sps->pcm.bit_depth);
+    s->hevcdsp.put_pcm(cb,   cb_stride,   cb_size / 2, &gb, s->sps->pcm.bit_depth);
+    s->hevcdsp.put_pcm(cr,   cr_stride,   cb_size / 2, &gb, s->sps->pcm.bit_depth);
+    return 0;
+}
+
+/**
+ * Decode a motion vector difference into lc->pu.mvd.
+ *
+ * Both greater0 flags are read first, then the greater1 flags of the
+ * non-zero components, then the values themselves (x before y).
+ * x0, y0 and log2_cb_size are not used.
+ *
+ * @param s HEVC decoding context
+ */
+static void hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    /* 0: component is zero, 1: only a sign follows, 2: full value follows */
+    int kind_x = ff_hevc_abs_mvd_greater0_flag_decode(s);
+    int kind_y = ff_hevc_abs_mvd_greater0_flag_decode(s);
+
+    if (kind_x)
+        kind_x += ff_hevc_abs_mvd_greater1_flag_decode(s);
+    if (kind_y)
+        kind_y += ff_hevc_abs_mvd_greater1_flag_decode(s);
+
+    if (kind_x == 2)
+        lc->pu.mvd.x = ff_hevc_mvd_decode(s);
+    else if (kind_x == 1)
+        lc->pu.mvd.x = ff_hevc_mvd_sign_flag_decode(s);
+    else if (kind_x == 0)
+        lc->pu.mvd.x = 0;
+
+    if (kind_y == 2)
+        lc->pu.mvd.y = ff_hevc_mvd_decode(s);
+    else if (kind_y == 1)
+        lc->pu.mvd.y = ff_hevc_mvd_sign_flag_decode(s);
+    else if (kind_y == 0)
+        lc->pu.mvd.y = 0;
+}
+
+/**
+ * 8.5.3.2.2.1 Luma sample interpolation process
+ *
+ * Fetches the block the motion vector points at from plane 0 of the
+ * reference and runs the quarter-sample filter selected by the fractional
+ * part of the vector. When the filter footprint leaves the picture, the
+ * source is first copied through emulated_edge_mc() into lc->edge_emu_buffer.
+ *
+ * @param s HEVC decoding context
+ * @param dst target buffer for block data at block position
+ * @param dststride stride of the dst buffer
+ * @param ref reference picture buffer at origin (0, 0)
+ * @param mv motion vector (relative to block position) to get pixel data from
+ * @param x_off horizontal position of block from origin (0, 0)
+ * @param y_off vertical position of block from origin (0, 0)
+ * @param block_w width of block
+ * @param block_h height of block
+ */
+static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride, AVFrame *ref,
+                    const Mv *mv, int x_off, int y_off, int block_w, int block_h)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    ptrdiff_t srcstride  = ref->linesize[0];
+    uint8_t *src         = ref->data[0];
+    int width            = s->sps->width;
+    int height           = s->sps->height;
+
+    /* fractional (quarter-sample) part of the vector */
+    int frac_x = mv->x & 3;
+    int frac_y = mv->y & 3;
+    /* samples the filter needs to the left of / above the block */
+    int pad_left = ff_hevc_qpel_extra_before[frac_x];
+    int pad_top  = ff_hevc_qpel_extra_before[frac_y];
+    int outside;
+
+    /* integer part of the vector moves the block position */
+    x_off += mv->x >> 2;
+    y_off += mv->y >> 2;
+    src   += y_off * srcstride + (x_off << s->sps->pixel_shift);
+
+    outside = x_off < pad_left ||
+              x_off >= width - block_w - ff_hevc_qpel_extra_after[frac_x] ||
+              y_off < pad_top ||
+              y_off >= height - block_h - ff_hevc_qpel_extra_after[frac_y];
+
+    if (outside) {
+        int offset = pad_top * srcstride + (pad_left << s->sps->pixel_shift);
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, srcstride, src - offset, srcstride,
+                                 block_w + ff_hevc_qpel_extra[frac_x],
+                                 block_h + ff_hevc_qpel_extra[frac_y],
+                                 x_off - pad_left, y_off - pad_top,
+                                 width, height);
+        src = lc->edge_emu_buffer + offset;
+    }
+
+    s->hevcdsp.put_hevc_qpel[frac_y][frac_x](dst, dststride, src, srcstride,
+                                             block_w, block_h, lc->mc_buffer);
+}
+
+/**
+ * 8.5.3.2.2.2 Chroma sample interpolation process
+ *
+ * The picture size is halved in both directions and the vector is split
+ * into a 3-bit fraction and an integer part, i.e. the code handles 4:2:0
+ * chroma only. When the filter footprint leaves the picture, each plane is
+ * first copied through emulated_edge_mc() into lc->edge_emu_buffer; the
+ * buffer is shared, so the first plane is filtered before the second one
+ * is copied.
+ *
+ * @param s HEVC decoding context
+ * @param dst1 target buffer for block data at block position (U plane)
+ * @param dst2 target buffer for block data at block position (V plane)
+ * @param dststride stride of the dst1 and dst2 buffers
+ * @param ref reference picture buffer at origin (0, 0)
+ * @param mv motion vector (relative to block position) to get pixel data from
+ * @param x_off horizontal position of block from origin (0, 0)
+ * @param y_off vertical position of block from origin (0, 0)
+ * @param block_w width of block
+ * @param block_h height of block
+ */
+static void chroma_mc(HEVCContext *s, int16_t *dst1, int16_t *dst2, ptrdiff_t dststride, AVFrame *ref,
+                      const Mv *mv, int x_off, int y_off, int block_w, int block_h)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    uint8_t *src1 = ref->data[1];
+    uint8_t *src2 = ref->data[2];
+    ptrdiff_t src1stride = ref->linesize[1];
+    ptrdiff_t src2stride = ref->linesize[2];
+    int pic_width  = s->sps->width >> 1;
+    int pic_height = s->sps->height >> 1;
+
+    // fractional part of the vector, passed on to the filter
+    int mx = mv->x & 7;
+    int my = mv->y & 7;
+
+    x_off += mv->x >> 3;
+    y_off += mv->y >> 3;
+    src1 += y_off * src1stride + (x_off << s->sps->pixel_shift);
+    src2 += y_off * src2stride + (x_off << s->sps->pixel_shift);
+
+    // The filter reads EPEL_EXTRA_BEFORE samples before the block and
+    // EPEL_EXTRA_AFTER after it, in both directions. The top edge is tested
+    // against EPEL_EXTRA_BEFORE, matching the left edge and the origin
+    // passed to emulated_edge_mc() below.
+    if (x_off < EPEL_EXTRA_BEFORE || x_off >= pic_width - block_w - EPEL_EXTRA_AFTER ||
+        y_off < EPEL_EXTRA_BEFORE || y_off >= pic_height - block_h - EPEL_EXTRA_AFTER) {
+        // distance from the top-left corner of the padded block to the block
+        int offset1 = EPEL_EXTRA_BEFORE * (src1stride + (1 << s->sps->pixel_shift));
+        int offset2 = EPEL_EXTRA_BEFORE * (src2stride + (1 << s->sps->pixel_shift));
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src1stride, src1 - offset1, src1stride,
+                                 block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
+                                 x_off - EPEL_EXTRA_BEFORE, y_off - EPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+
+        src1 = lc->edge_emu_buffer + offset1;
+        s->hevcdsp.put_hevc_epel[!!my][!!mx](dst1, dststride, src1, src1stride,
+                                             block_w, block_h, mx, my, lc->mc_buffer);
+
+        s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src2stride, src2 - offset2, src2stride,
+                                 block_w + EPEL_EXTRA, block_h + EPEL_EXTRA,
+                                 x_off - EPEL_EXTRA_BEFORE, y_off - EPEL_EXTRA_BEFORE,
+                                 pic_width, pic_height);
+        src2 = lc->edge_emu_buffer + offset2;
+        s->hevcdsp.put_hevc_epel[!!my][!!mx](dst2, dststride, src2, src2stride,
+                                             block_w, block_h, mx, my, lc->mc_buffer);
+    } else {
+        s->hevcdsp.put_hevc_epel[!!my][!!mx](dst1, dststride, src1, src1stride,
+                                             block_w, block_h, mx, my, lc->mc_buffer);
+        s->hevcdsp.put_hevc_epel[!!my][!!mx](dst2, dststride, src2, src2stride,
+                                             block_w, block_h, mx, my, lc->mc_buffer);
+    }
+}
+
+/**
+ * Block until the reference frame can be used for motion compensation.
+ *
+ * The wait is issued on ref->tf for progress value INT_MAX, i.e. it does
+ * not depend on which rows the motion vector actually touches.
+ *
+ * @param s   decoder context (currently unused)
+ * @param ref reference frame to wait for
+ * @param mv  motion vector pointing into ref (currently unused)
+ * @param y0  vertical position of the predicted block (currently unused)
+ */
+static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref,
+                                const Mv *mv, int y0)
+{
+    /* TODO: wait only for the row the vector reaches, (mv->y >> 2) + y0,
+     * clamped to the picture height, instead of the maximum progress value */
+    ff_thread_await_progress(&ref->tf, INT_MAX, 0);
+}
+
+static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW, int nPbH, int log2_cb_size, int partIdx)
+{ /* Decode the motion data of the prediction unit at (x0, y0) with size
+   * nPbW x nPbH, store it in s->ref->tab_mvf for every minimum PU the unit
+   * covers, wait for the selected reference frame(s) and write the
+   * motion-compensated prediction into the three planes of s->frame.
+   * Returns early, without predicting, if a selected reference is NULL. */
+#define POS(c_idx, x, y)                                                              \
+    &s->frame->data[c_idx][((y) >> s->sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
+                           (((x) >> s->sps->hshift[c_idx]) << s->sps->pixel_shift)]
+    HEVCLocalContext *lc = &s->HEVClc;
+    int merge_idx = 0;
+    enum InterPredIdc inter_pred_idc = PRED_L0;
+    struct MvField current_mv = {{{ 0 }}};
+
+    int pic_width_in_min_pu = s->sps->width >> s->sps->log2_min_pu_size;
+
+    MvField *tab_mvf = s->ref->tab_mvf;
+    RefPicList  *refPicList = s->ref->refPicList;
+    HEVCFrame *ref0, *ref1;
+
+    int tmpstride = MAX_PB_SIZE; /* stride passed along with the tmp* prediction buffers below */
+
+    uint8_t *dst0 = POS(0, x0, y0);
+    uint8_t *dst1 = POS(1, x0, y0);
+    uint8_t *dst2 = POS(2, x0, y0);
+    int log2_min_cb_size = s->sps->log2_min_coding_block_size;
+    int pic_width_in_ctb = s->sps->width>>log2_min_cb_size; /* NOTE(review): not named again below; presumably picked up by the SAMPLE_CTB macro - confirm */
+    int x_cb             = x0 >> log2_min_cb_size;
+    int y_cb             = y0 >> log2_min_cb_size;
+    int ref_idx[2];
+    int mvp_flag[2];
+    int x_pu, y_pu;
+    int i, j;
+
+    if (SAMPLE_CTB(s->skip_flag, x_cb, y_cb)) { /* skipped CU: motion comes from merge mode only */
+        if (s->sh.max_num_merge_cand > 1)
+            merge_idx = ff_hevc_merge_idx_decode(s);
+        else
+            merge_idx = 0;
+
+        ff_hevc_luma_mv_merge_mode(s, x0, y0, 1 << log2_cb_size, 1 << log2_cb_size,
+                                   log2_cb_size, partIdx, merge_idx, &current_mv);
+        x_pu = x0 >> s->sps->log2_min_pu_size;
+        y_pu = y0 >> s->sps->log2_min_pu_size;
+
+        for (i = 0; i < nPbW >> s->sps->log2_min_pu_size; i++) /* replicate the motion into every covered min-PU */
+            for (j = 0; j < nPbH >> s->sps->log2_min_pu_size; j++)
+                tab_mvf[(y_pu + j) * pic_width_in_min_pu + x_pu + i] = current_mv;
+    } else { /* MODE_INTER */
+        lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
+        if (lc->pu.merge_flag) { /* merge mode signalled for this PU */
+            if (s->sh.max_num_merge_cand > 1)
+                merge_idx = ff_hevc_merge_idx_decode(s);
+            else
+                merge_idx = 0;
+
+            ff_hevc_luma_mv_merge_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
+                                       partIdx, merge_idx, &current_mv);
+            x_pu = x0 >> s->sps->log2_min_pu_size;
+            y_pu = y0 >> s->sps->log2_min_pu_size;
+
+            for (i = 0; i < nPbW >> s->sps->log2_min_pu_size; i++)
+                for (j = 0; j < nPbH >> s->sps->log2_min_pu_size; j++)
+                    tab_mvf[(y_pu + j) * pic_width_in_min_pu + x_pu + i] = current_mv;
+        } else { /* explicit motion: per list a reference index, an MVD and an MVP flag */
+            ff_hevc_set_neighbour_available(s, x0, y0, nPbW, nPbH);
+            if (s->sh.slice_type == B_SLICE) /* otherwise inter_pred_idc keeps its PRED_L0 initial value */
+                inter_pred_idc = ff_hevc_inter_pred_idc_decode(s, nPbW, nPbH);
+
+            if (inter_pred_idc != PRED_L1) { /* list 0 is used */
+                if (s->sh.nb_refs[L0]) {
+                    ref_idx[0] = ff_hevc_ref_idx_lx_decode(s, s->sh.nb_refs[L0]);
+                    current_mv.ref_idx[0] = ref_idx[0];
+                }
+                current_mv.pred_flag[0] = 1;
+                hls_mvd_coding(s, x0, y0, 0);
+                mvp_flag[0] = ff_hevc_mvp_lx_flag_decode(s);
+                ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
+                                         partIdx, merge_idx, &current_mv, mvp_flag[0], 0);
+                current_mv.mv[0].x += lc->pu.mvd.x; /* final vector = predictor + decoded difference */
+                current_mv.mv[0].y += lc->pu.mvd.y;
+            }
+
+            if (inter_pred_idc != PRED_L0) { /* list 1 is used */
+                if (s->sh.nb_refs[L1]) {
+                    ref_idx[1] = ff_hevc_ref_idx_lx_decode(s, s->sh.nb_refs[L1]);
+                    current_mv.ref_idx[1] = ref_idx[1];
+                }
+
+                if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) { /* no L1 MVD is parsed in this case; it is forced to zero */
+                    lc->pu.mvd.x = 0;
+                    lc->pu.mvd.y = 0;
+                } else {
+                    hls_mvd_coding(s, x0, y0, 1);
+                }
+
+                current_mv.pred_flag[1] = 1;
+                mvp_flag[1] = ff_hevc_mvp_lx_flag_decode(s);
+                ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size,
+                                         partIdx, merge_idx, &current_mv, mvp_flag[1], 1);
+                current_mv.mv[1].x += lc->pu.mvd.x;
+                current_mv.mv[1].y += lc->pu.mvd.y;
+            }
+
+            x_pu = x0 >> s->sps->log2_min_pu_size;
+            y_pu = y0 >> s->sps->log2_min_pu_size;
+
+            for (i = 0; i < nPbW >> s->sps->log2_min_pu_size; i++)
+                for(j = 0; j < nPbH >> s->sps->log2_min_pu_size; j++)
+                    tab_mvf[(y_pu + j) * pic_width_in_min_pu + x_pu + i] = current_mv;
+        }
+    }
+
+    if (current_mv.pred_flag[0]) { /* look up the list-0 reference and wait until it can be read */
+        ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
+        if (!ref0) /* missing reference: leave this PU unpredicted */
+            return;
+        hevc_await_progress(s, ref0, &current_mv.mv[0], y0);
+    }
+    if (current_mv.pred_flag[1]) { /* same for the list-1 reference */
+        ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
+        if (!ref1)
+            return;
+        hevc_await_progress(s, ref1, &current_mv.mv[1], y0);
+    }
+
+    if (current_mv.pred_flag[0] && !current_mv.pred_flag[1]) { /* uni-prediction from list 0 */
+        DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
+        DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
+
+        luma_mc(s, tmp, tmpstride, ref0->frame,
+                &current_mv.mv[0], x0, y0, nPbW, nPbH);
+
+        if ((s->sh.slice_type == P_SLICE && s->pps->weighted_pred_flag) ||
+            (s->sh.slice_type == B_SLICE && s->pps->weighted_bipred_flag)) { /* explicit weights from the slice header */
+            s->hevcdsp.weighted_pred(s->sh.luma_log2_weight_denom,
+                                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+                                     s->sh.luma_offset_l0[current_mv.ref_idx[0]],
+                                     dst0, s->frame->linesize[0], tmp, tmpstride, nPbW, nPbH);
+        } else {
+            s->hevcdsp.put_unweighted_pred(dst0, s->frame->linesize[0], tmp, tmpstride, nPbW, nPbH);
+        }
+        chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame,
+                  &current_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2); /* position and size are halved for the chroma planes; tmp is reused for the first chroma plane */
+
+        if ((s->sh.slice_type == P_SLICE && s->pps->weighted_pred_flag) ||
+            (s->sh.slice_type == B_SLICE && s->pps->weighted_bipred_flag)) {
+            s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom,
+                                     s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0],
+                                     s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0],
+                                     dst1, s->frame->linesize[1], tmp, tmpstride,
+                                     nPbW / 2, nPbH / 2);
+            s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom,
+                                     s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1],
+                                     s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1],
+                                     dst2, s->frame->linesize[2], tmp2, tmpstride,
+                                     nPbW / 2, nPbH / 2);
+        } else {
+            s->hevcdsp.put_unweighted_pred(dst1, s->frame->linesize[1], tmp, tmpstride, nPbW/2, nPbH/2);
+            s->hevcdsp.put_unweighted_pred(dst2, s->frame->linesize[2], tmp2, tmpstride, nPbW/2, nPbH/2);
+        }
+    } else if (!current_mv.pred_flag[0] && current_mv.pred_flag[1]) { /* uni-prediction from list 1 */
+        DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
+        DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
+
+        if (!ref1) /* already checked when ref1 was looked up above */
+            return;
+
+        luma_mc(s, tmp, tmpstride, ref1->frame,
+                &current_mv.mv[1], x0, y0, nPbW, nPbH);
+
+        if ((s->sh.slice_type == P_SLICE && s->pps->weighted_pred_flag) ||
+            (s->sh.slice_type == B_SLICE && s->pps->weighted_bipred_flag)) {
+            s->hevcdsp.weighted_pred(s->sh.luma_log2_weight_denom,
+                                      s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+                                      s->sh.luma_offset_l1[current_mv.ref_idx[1]],
+                                      dst0, s->frame->linesize[0], tmp, tmpstride,
+                                      nPbW, nPbH);
+        } else {
+            s->hevcdsp.put_unweighted_pred(dst0, s->frame->linesize[0], tmp, tmpstride, nPbW, nPbH);
+        }
+
+        chroma_mc(s, tmp, tmp2, tmpstride, ref1->frame,
+                  &current_mv.mv[1], x0/2, y0/2, nPbW/2, nPbH/2);
+
+        if ((s->sh.slice_type == P_SLICE && s->pps->weighted_pred_flag) ||
+            (s->sh.slice_type == B_SLICE && s->pps->weighted_bipred_flag)) {
+            s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom,
+                                     s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0],
+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0],
+                                     dst1, s->frame->linesize[1], tmp, tmpstride, nPbW/2, nPbH/2);
+            s->hevcdsp.weighted_pred(s->sh.chroma_log2_weight_denom,
+                                     s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1],
+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1],
+                                     dst2, s->frame->linesize[2], tmp2, tmpstride, nPbW/2, nPbH/2);
+        } else {
+            s->hevcdsp.put_unweighted_pred(dst1, s->frame->linesize[1], tmp, tmpstride, nPbW/2, nPbH/2);
+            s->hevcdsp.put_unweighted_pred(dst2, s->frame->linesize[2], tmp2, tmpstride, nPbW/2, nPbH/2);
+        }
+    } else if (current_mv.pred_flag[0] && current_mv.pred_flag[1]) { /* bi-prediction: both lists contribute */
+        DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
+        DECLARE_ALIGNED(16, int16_t, tmp2[MAX_PB_SIZE * MAX_PB_SIZE]);
+        DECLARE_ALIGNED(16, int16_t, tmp3[MAX_PB_SIZE * MAX_PB_SIZE]);
+        DECLARE_ALIGNED(16, int16_t, tmp4[MAX_PB_SIZE * MAX_PB_SIZE]);
+        HEVCFrame *ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; /* shadows the function-scope ref0 with the same lookup */
+        HEVCFrame *ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; /* shadows the function-scope ref1 with the same lookup */
+
+        if (!ref0 || !ref1)
+            return;
+
+        luma_mc(s, tmp, tmpstride, ref0->frame,
+                &current_mv.mv[0], x0, y0, nPbW, nPbH);
+        luma_mc(s, tmp2, tmpstride, ref1->frame,
+                &current_mv.mv[1], x0, y0, nPbW, nPbH);
+
+        if ((s->sh.slice_type == P_SLICE && s->pps->weighted_pred_flag) ||
+            (s->sh.slice_type == B_SLICE && s->pps->weighted_bipred_flag)){
+            s->hevcdsp.weighted_pred_avg(s->sh.luma_log2_weight_denom,
+                                         s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+                                         s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+                                         s->sh.luma_offset_l0[current_mv.ref_idx[0]],
+                                         s->sh.luma_offset_l1[current_mv.ref_idx[1]],
+                                         dst0, s->frame->linesize[0], tmp, tmp2, tmpstride, nPbW, nPbH);
+        } else {
+            s->hevcdsp.put_weighted_pred_avg(dst0, s->frame->linesize[0], tmp, tmp2, tmpstride, nPbW, nPbH);
+        }
+
+        chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame,
+                  &current_mv.mv[0], x0/2, y0/2, nPbW/2, nPbH/2); /* tmp/tmp2: the two chroma planes predicted from ref0 */
+        chroma_mc(s, tmp3, tmp4, tmpstride, ref1->frame,
+                  &current_mv.mv[1], x0/2, y0/2, nPbW/2, nPbH/2); /* tmp3/tmp4: the two chroma planes predicted from ref1 */
+
+        if ((s->sh.slice_type == P_SLICE && s->pps->weighted_pred_flag) ||
+            (s->sh.slice_type == B_SLICE && s->pps->weighted_bipred_flag)) {
+            s->hevcdsp.weighted_pred_avg(s->sh.chroma_log2_weight_denom ,
+                                         s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0],
+                                         s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0],
+                                         s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0],
+                                         s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0],
+                                         dst1, s->frame->linesize[1], tmp, tmp3, tmpstride, nPbW/2, nPbH/2);
+            s->hevcdsp.weighted_pred_avg(s->sh.chroma_log2_weight_denom ,
+                                         s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1],
+                                         s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1],
+                                         s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1],
+                                         s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1],
+                                         dst2, s->frame->linesize[2], tmp2, tmp4, tmpstride, nPbW/2, nPbH/2);
+        } else {
+            s->hevcdsp.put_weighted_pred_avg(dst1, s->frame->linesize[1], tmp, tmp3, tmpstride, nPbW/2, nPbH/2);
+            s->hevcdsp.put_weighted_pred_avg(dst2, s->frame->linesize[2], tmp2, tmp4, tmpstride, nPbW/2, nPbH/2);
+        }
+    }
+}
+
+/**
+ * 8.4.1
+ *
+ * Derive the luma intra prediction mode of the block at (x0, y0).
+ *
+ * Three candidate modes are built from the modes stored in s->tab_ipm for
+ * the left and the upper neighbour; INTRA_DC stands in for a neighbour
+ * that may not be used. The mode is then either one of the candidates
+ * (selected by lc->pu.mpm_idx) or lc->pu.rem_intra_luma_pred_mode stepped
+ * past every candidate that is not larger than it.
+ *
+ * As a side effect the mode is written to s->tab_ipm for every minimum PU
+ * of the block, and the matching s->ref->tab_mvf entries are marked intra
+ * with their motion data cleared.
+ *
+ * @param s                         decoder context
+ * @param x0                        horizontal position of the block
+ * @param y0                        vertical position of the block
+ * @param pu_size                   width and height of the (square) block
+ * @param prev_intra_luma_pred_flag nonzero to pick the candidate indexed by
+ *                                  lc->pu.mpm_idx, zero to derive the mode
+ *                                  from lc->pu.rem_intra_luma_pred_mode
+ * @return the derived intra prediction mode
+ */
+static int luma_intra_pred_mode(HEVCContext *s, int x0, int y0, int pu_size,
+                                int prev_intra_luma_pred_flag)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    int x_pu = x0 >> s->sps->log2_min_pu_size;
+    int y_pu = y0 >> s->sps->log2_min_pu_size;
+    int pic_width_in_min_pu = s->sps->width >> s->sps->log2_min_pu_size;
+    int size_in_pus = pu_size >> s->sps->log2_min_pu_size;
+    /* position of the block inside its CTB; zero means "on the CTB edge" */
+    int x0b = x0 & ((1 << s->sps->log2_ctb_size) - 1);
+    int y0b = y0 & ((1 << s->sps->log2_ctb_size) - 1);
+
+    int cand_up   = (lc->ctb_up_flag || y0b) ? s->tab_ipm[(y_pu-1)*pic_width_in_min_pu+x_pu] : INTRA_DC ;
+    int cand_left = (lc->ctb_left_flag || x0b) ? s->tab_ipm[y_pu*pic_width_in_min_pu+x_pu-1] : INTRA_DC ;
+
+    int y_ctb = (y0 >> (s->sps->log2_ctb_size)) << (s->sps->log2_ctb_size);
+    MvField *tab_mvf = s->ref->tab_mvf;
+    int intra_pred_mode;
+    int candidate[3];
+    int i, j;
+
+    // intra_pred_mode prediction does not cross vertical CTB boundaries
+    if ((y0 - 1) < y_ctb)
+        cand_up = INTRA_DC;
+
+    if (cand_left == cand_up) {
+        if (cand_left < 2) {
+            /* both neighbours are non-angular: use a fixed candidate set */
+            candidate[0] = INTRA_PLANAR;
+            candidate[1] = INTRA_DC;
+            candidate[2] = INTRA_ANGULAR_26;
+        } else {
+            /* angular mode: add its two neighbours, wrapping within 2..33 */
+            candidate[0] = cand_left;
+            candidate[1] = 2 + ((cand_left - 2 - 1 + 32) & 31);
+            candidate[2] = 2 + ((cand_left - 2 + 1) & 31);
+        }
+    } else {
+        candidate[0] = cand_left;
+        candidate[1] = cand_up;
+        /* third candidate: first of planar, DC, angular 26 not yet present */
+        if (candidate[0] != INTRA_PLANAR && candidate[1] != INTRA_PLANAR) {
+            candidate[2] = INTRA_PLANAR;
+        } else if (candidate[0] != INTRA_DC && candidate[1] != INTRA_DC) {
+            candidate[2] = INTRA_DC;
+        } else {
+            candidate[2] = INTRA_ANGULAR_26;
+        }
+    }
+
+    if (prev_intra_luma_pred_flag) {
+        intra_pred_mode = candidate[lc->pu.mpm_idx];
+    } else {
+        /* sort the candidates in ascending order; candidate[] holds ints,
+         * so the swap temporary has to be an int as well */
+        if (candidate[0] > candidate[1])
+            FFSWAP(int, candidate[0], candidate[1]);
+        if (candidate[0] > candidate[2])
+            FFSWAP(int, candidate[0], candidate[2]);
+        if (candidate[1] > candidate[2])
+            FFSWAP(int, candidate[1], candidate[2]);
+
+        /* the coded value skips the candidates: step over each one that is
+         * not larger than the running mode */
+        intra_pred_mode = lc->pu.rem_intra_luma_pred_mode;
+        for (i = 0; i < 3; i++) {
+            if (intra_pred_mode >= candidate[i])
+                intra_pred_mode++;
+        }
+    }
+
+    /* write the intra prediction units into the mv array */
+    if(!size_in_pus)
+        size_in_pus = 1;
+    for (i = 0; i < size_in_pus; i++) {
+        /* i walks the rows of the block, j the columns within a row */
+        MvField *mvf_row = &tab_mvf[(y_pu + i) * pic_width_in_min_pu + x_pu];
+
+        memset(&s->tab_ipm[(y_pu + i) * pic_width_in_min_pu + x_pu],
+               intra_pred_mode, size_in_pus);
+
+        for (j = 0; j < size_in_pus; j++) {
+            mvf_row[j].is_intra     = 1;
+            mvf_row[j].pred_flag[0] = 0;
+            mvf_row[j].pred_flag[1] = 0;
+            mvf_row[j].ref_idx[0]   = 0;
+            mvf_row[j].ref_idx[1]   = 0;
+            mvf_row[j].mv[0].x      = 0;
+            mvf_row[j].mv[0].y      = 0;
+            mvf_row[j].mv[1].x      = 0;
+            mvf_row[j].mv[1].y      = 0;
+        }
+    }
+
+    return intra_pred_mode;
+}
+
+/**
+ * Store ct_depth in s->tab_ct_depth for every minimum coding block covered
+ * by the coding block of size (1 << log2_cb_size) located at (x0, y0).
+ */
+static av_always_inline void set_ct_depth(HEVCContext *s, int x0, int y0,
+                                          int log2_cb_size, int ct_depth)
+{
+    /* block size and position, expressed in minimum coding blocks */
+    int span    = (1 << log2_cb_size) >> s->sps->log2_min_coding_block_size;
+    int col     = x0 >> s->sps->log2_min_coding_block_size;
+    int row     = y0 >> s->sps->log2_min_coding_block_size;
+    int row_end = row + span;
+
+    /* one memset per table row, each covering `span` entries */
+    while (row < row_end) {
+        memset(&s->tab_ct_depth[row * s->sps->min_cb_width + col],
+               ct_depth, span);
+        row++;
+    }
+}
+
+/**
+ * Decode the intra prediction modes of a coding unit.
+ *
+ * With PART_NxN the unit holds 2x2 prediction blocks, otherwise a single
+ * one. All prev_intra_luma_pred flags are read first; afterwards each
+ * block reads either mpm_idx or rem_intra_luma_pred_mode and has its luma
+ * mode derived. The chroma mode is read last and stored in
+ * lc->pu.intra_pred_mode_c.
+ */
+static void intra_prediction_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{
+    static const uint8_t intra_chroma_table[4] = {0, 26, 10, 1};
+    HEVCLocalContext *lc = &s->HEVClc;
+    uint8_t use_mpm[4];
+    int is_nxn          = lc->cu.part_mode == PART_NxN;
+    int blocks_per_side = is_nxn + 1;
+    int pb_size         = (1 << log2_cb_size) >> is_nxn;
+    int chroma_mode;
+    int row, col;
+
+    /* first pass: one flag per prediction block */
+    for (row = 0; row < blocks_per_side; row++) {
+        for (col = 0; col < blocks_per_side; col++)
+            use_mpm[2 * row + col] = ff_hevc_prev_intra_luma_pred_flag_decode(s);
+    }
+
+    /* second pass: mode syntax and derivation, in the same block order */
+    for (row = 0; row < blocks_per_side; row++) {
+        for (col = 0; col < blocks_per_side; col++) {
+            int idx = 2 * row + col;
+
+            if (!use_mpm[idx])
+                lc->pu.rem_intra_luma_pred_mode = ff_hevc_rem_intra_luma_pred_mode_decode(s);
+            else
+                lc->pu.mpm_idx = ff_hevc_mpm_idx_decode(s);
+
+            lc->pu.intra_pred_mode[idx] =
+                luma_intra_pred_mode(s, x0 + pb_size * col, y0 + pb_size * row,
+                                     pb_size, use_mpm[idx]);
+        }
+    }
+
+    chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s);
+    if (chroma_mode == 4) {
+        /* value 4: chroma reuses the mode of the first luma block */
+        lc->pu.intra_pred_mode_c = lc->pu.intra_pred_mode[0];
+    } else if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) {
+        /* the table entry collides with the luma mode: substitute mode 34 */
+        lc->pu.intra_pred_mode_c = 34;
+    } else {
+        lc->pu.intra_pred_mode_c = intra_chroma_table[chroma_mode];
+    }
+}
+
+/**
+ * Fill the per-min-PU tables for a coding block that carries no explicit
+ * intra mode: s->tab_ipm is set to INTRA_DC and the is_intra field of the
+ * matching s->ref->tab_mvf entries records whether the current CU's
+ * prediction mode is MODE_INTRA.
+ */
+static void intra_prediction_unit_default_value(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    MvField *tab_mvf     = s->ref->tab_mvf;
+    int pus_per_row      = s->sps->width >> s->sps->log2_min_pu_size;
+    int first_col        = x0 >> s->sps->log2_min_pu_size;
+    int first_row        = y0 >> s->sps->log2_min_pu_size;
+    int span             = (1 << log2_cb_size) >> s->sps->log2_min_pu_size;
+    int intra            = lc->cu.pred_mode == MODE_INTRA;
+    int row, col;
+
+    /* always touch at least one entry */
+    if (!span)
+        span = 1;
+
+    for (row = first_row; row < first_row + span; row++) {
+        memset(&s->tab_ipm[row * pus_per_row + first_col], INTRA_DC, span);
+        for (col = first_col; col < first_col + span; col++)
+            tab_mvf[row * pus_per_row + col].is_intra = intra;
+    }
+}
+
+static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
+{ /* Parse one coding unit of size (1 << log2_cb_size) at (x0, y0): the
+   * skip flag, prediction and partition modes, the prediction units (or
+   * PCM samples) and the residual transform tree, then record the QP and
+   * coding-tree depth for the area it covers. Returns 0, or the negative
+   * value returned by hls_pcm_sample(). */
+    int cb_size          = 1 << log2_cb_size;
+    HEVCLocalContext *lc = &s->HEVClc;
+    int log2_min_cb_size = s->sps->log2_min_coding_block_size;
+    int length           = cb_size >> log2_min_cb_size; /* CU size in minimum coding blocks */
+    int pic_width_in_ctb = s->sps->width >> log2_min_cb_size; /* despite the name, computed in minimum coding blocks */
+    int x_cb             = x0 >> log2_min_cb_size;
+    int y_cb             = y0 >> log2_min_cb_size;
+    int x, y;
+
+    lc->cu.x = x0;
+    lc->cu.y = y0;
+    lc->cu.rqt_root_cbf = 1; /* stays 1 unless the flag is parsed further down */
+
+    lc->cu.pred_mode        = MODE_INTRA; /* defaults, overridden by the syntax parsed below */
+    lc->cu.part_mode        = PART_2Nx2N;
+    lc->cu.intra_split_flag = 0;
+    lc->cu.pcm_flag         = 0;
+    SAMPLE_CTB(s->skip_flag, x_cb, y_cb) = 0;
+    for (x = 0; x < 4; x++)
+        lc->pu.intra_pred_mode[x] = 1; /* presumably INTRA_DC - confirm against the enum */
+    if (s->pps->transquant_bypass_enable_flag) {
+        lc->cu.cu_transquant_bypass_flag = ff_hevc_cu_transquant_bypass_flag_decode(s);
+        if (lc->cu.cu_transquant_bypass_flag)
+            set_deblocking_bypass(s, x0, y0, log2_cb_size);
+    } else
+        lc->cu.cu_transquant_bypass_flag = 0;
+
+
+    if (s->sh.slice_type != I_SLICE) { /* the skip flag is only parsed outside I slices */
+        uint8_t skip_flag = ff_hevc_skip_flag_decode(s, x0, y0, x_cb, y_cb);
+
+        lc->cu.pred_mode = MODE_SKIP; /* NOTE(review): reassigned a few lines below; nothing visible in between reads it */
+        x = y_cb * pic_width_in_ctb + x_cb;
+        for (y = 0; y < length; y++) { /* store the flag for every minimum coding block of the CU */
+            memset(&s->skip_flag[x], skip_flag, length);
+            x += pic_width_in_ctb;
+        }
+        lc->cu.pred_mode = skip_flag ? MODE_SKIP : MODE_INTER;
+    }
+
+    if (SAMPLE_CTB(s->skip_flag, x_cb, y_cb)) { /* skipped CU: a single PU, no transform tree is parsed */
+        hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0);
+        intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
+
+        if (!s->sh.disable_deblocking_filter_flag)
+            ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size,
+                                                  lc->slice_or_tiles_up_boundary,
+                                                  lc->slice_or_tiles_left_boundary);
+    } else {
+        if (s->sh.slice_type != I_SLICE)
+            lc->cu.pred_mode = ff_hevc_pred_mode_decode(s);
+        if (lc->cu.pred_mode != MODE_INTRA ||
+            log2_cb_size == s->sps->log2_min_coding_block_size) { /* otherwise part_mode keeps its PART_2Nx2N default */
+            lc->cu.part_mode = ff_hevc_part_mode_decode(s, log2_cb_size);
+            lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
+                                      lc->cu.pred_mode == MODE_INTRA;
+        }
+
+        if (lc->cu.pred_mode == MODE_INTRA) {
+            if (lc->cu.part_mode == PART_2Nx2N && s->sps->pcm_enabled_flag &&
+                log2_cb_size >= s->sps->pcm.log2_min_pcm_cb_size &&
+                log2_cb_size <= s->sps->pcm.log2_max_pcm_cb_size) { /* PCM is only signalled within the SPS size range */
+                lc->cu.pcm_flag = ff_hevc_pcm_flag_decode(s);
+            }
+            if (lc->cu.pcm_flag) {
+                int ret;
+                intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
+                ret = hls_pcm_sample(s, x0, y0, log2_cb_size);
+                if(s->sps->pcm.loop_filter_disable_flag)
+                    set_deblocking_bypass(s, x0, y0, log2_cb_size);
+
+                if (ret < 0) /* checked only after the deblocking bypass has been recorded */
+                    return ret;
+            } else {
+                intra_prediction_unit(s, x0, y0, log2_cb_size);
+            }
+        } else {
+            intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
+            switch (lc->cu.part_mode) { /* one hls_prediction_unit() call per partition, partIdx counting up from 0 */
+            case PART_2Nx2N:
+                hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0);
+                break;
+            case PART_2NxN:
+                hls_prediction_unit(s, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0);
+                hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size, cb_size/2, log2_cb_size, 1);
+                break;
+            case PART_Nx2N:
+                hls_prediction_unit(s, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1);
+                break;
+            case PART_2NxnU:
+                hls_prediction_unit(s, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0);
+                hls_prediction_unit(s, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1);
+                break;
+            case PART_2NxnD:
+                hls_prediction_unit(s, x0, y0, cb_size, cb_size * 3 / 4, log2_cb_size, 0);
+                hls_prediction_unit(s, x0, y0 + cb_size * 3 / 4, cb_size, cb_size / 4, log2_cb_size, 1);
+                break;
+            case PART_nLx2N:
+                hls_prediction_unit(s, x0, y0, cb_size / 4, cb_size, log2_cb_size,0);
+                hls_prediction_unit(s, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1);
+                break;
+            case PART_nRx2N:
+                hls_prediction_unit(s, x0, y0, cb_size * 3 / 4, cb_size, log2_cb_size,0);
+                hls_prediction_unit(s, x0 + cb_size * 3 / 4, y0, cb_size/4, cb_size, log2_cb_size, 1);
+                break;
+            case PART_NxN:
+                hls_prediction_unit(s, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1);
+                hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2);
+                hls_prediction_unit(s, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3);
+                break;
+            }
+        }
+
+        if (!lc->cu.pcm_flag) { /* PCM units carry no transform tree */
+            if (lc->cu.pred_mode != MODE_INTRA &&
+                !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
+                lc->cu.rqt_root_cbf = ff_hevc_no_residual_syntax_flag_decode(s);
+            }
+            if (lc->cu.rqt_root_cbf) {
+                lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
+                                        s->sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
+                                        s->sps->max_transform_hierarchy_depth_inter;
+                hls_transform_tree(s, x0, y0, x0, y0, x0, y0, log2_cb_size,
+                                   log2_cb_size, 0, 0);
+            } else { /* no residual: only the deblocking strengths remain to be derived */
+                if (!s->sh.disable_deblocking_filter_flag)
+                    ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size,
+                                                          lc->slice_or_tiles_up_boundary,
+                                                          lc->slice_or_tiles_left_boundary);
+            }
+        }
+    }
+
+    if (s->pps->cu_qp_delta_enabled_flag && lc->tu.is_cu_qp_delta_coded == 0) /* no QP delta was coded for this CU */
+        ff_hevc_set_qPy(s, x0, y0, x0, y0, log2_cb_size);
+
+    x = y_cb * pic_width_in_ctb + x_cb;
+    for (y = 0; y < length; y++) { /* record the luma QP for every minimum coding block of the CU */
+        memset(&s->qp_y_tab[x], lc->qp_y, length);
+        x += pic_width_in_ctb;
+    }
+
+    set_ct_depth(s, x0, y0, log2_cb_size, lc->ct.depth);
+
+    return 0;
+}
+
+/**
+ * Parse one node of the coding quadtree rooted at (x0, y0).
+ *
+ * The node is either split into up to four sub-nodes, each half the size,
+ * or parsed as a coding unit. Sub-nodes that start outside the picture are
+ * not parsed.
+ *
+ * @param s            decoder context
+ * @param x0           horizontal position of the node
+ * @param y0           vertical position of the node
+ * @param log2_cb_size log2 of the node size
+ * @param cb_depth     depth of the node in the quadtree
+ * @return a negative error code if parsing a coding unit failed, 0 when the
+ *         end of the slice was reached, a positive value when more data
+ *         follows
+ */
+static int hls_coding_quadtree(HEVCContext *s, int x0, int y0, int log2_cb_size, int cb_depth)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    int ret;
+
+    lc->ct.depth = cb_depth;
+    if ((x0 + (1 << log2_cb_size) <= s->sps->width) &&
+        (y0 + (1 << log2_cb_size) <= s->sps->height) &&
+        log2_cb_size > s->sps->log2_min_coding_block_size) {
+        SAMPLE(s->split_cu_flag, x0, y0) =
+            ff_hevc_split_coding_unit_flag_decode(s, cb_depth, x0, y0);
+    } else {
+        /* the flag is not coded: a node that crosses the picture border is
+         * split for as long as it is larger than the minimum size */
+        SAMPLE(s->split_cu_flag, x0, y0) =
+            (log2_cb_size > s->sps->log2_min_coding_block_size);
+    }
+    if (s->pps->cu_qp_delta_enabled_flag &&
+        log2_cb_size >= s->sps->log2_ctb_size - s->pps->diff_cu_qp_delta_depth) {
+        /* this node starts a new QP-delta group */
+        lc->tu.is_cu_qp_delta_coded = 0;
+        lc->tu.cu_qp_delta          = 0;
+    }
+
+    if (SAMPLE(s->split_cu_flag, x0, y0)) {
+        int more_data;
+        int cb_size = (1 << (log2_cb_size)) >> 1;
+        int x1 = x0 + cb_size;
+        int y1 = y0 + cb_size;
+
+        /* a negative result from any sub-node is an error and must be
+         * returned as is, not be mistaken for "more data" */
+        more_data = hls_coding_quadtree(s, x0, y0, log2_cb_size - 1, cb_depth + 1);
+        if (more_data < 0)
+            return more_data;
+
+        if (more_data && x1 < s->sps->width) {
+            more_data = hls_coding_quadtree(s, x1, y0, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
+        if (more_data && y1 < s->sps->height) {
+            more_data = hls_coding_quadtree(s, x0, y1, log2_cb_size - 1, cb_depth + 1);
+            if (more_data < 0)
+                return more_data;
+        }
+        if (more_data && x1 < s->sps->width &&
+            y1 < s->sps->height) {
+            return hls_coding_quadtree(s, x1, y1, log2_cb_size - 1, cb_depth + 1);
+        }
+        if (more_data)
+            return ((x1 + cb_size) < s->sps->width ||
+                    (y1 + cb_size) < s->sps->height);
+        else
+            return 0;
+    } else {
+        ret = hls_coding_unit(s, x0, y0, log2_cb_size);
+        if (ret < 0)
+            return ret;
+        /* end_of_slice_flag is read only after a coding unit that ends on a
+         * CTB boundary or at the picture border, in both directions */
+        if ((!((x0 + (1 << log2_cb_size)) %
+               (1 << (s->sps->log2_ctb_size))) ||
+             (x0 + (1 << log2_cb_size) >= s->sps->width)) &&
+            (!((y0 + (1 << log2_cb_size)) %
+               (1 << (s->sps->log2_ctb_size))) ||
+             (y0 + (1 << log2_cb_size) >= s->sps->height))) {
+            int end_of_slice_flag = ff_hevc_end_of_slice_flag_decode(s);
+            return !end_of_slice_flag;
+        } else {
+            return 1;
+        }
+    }
+}
+
+/**
+ * 7.3.4
+ *
+ * Set up the per-CTB neighbourhood state before the CTB at (x_ctb, y_ctb)
+ * is parsed: the slice address of the CTB, the horizontal and vertical
+ * limits of the current tile, and flags telling whether the left, upper,
+ * upper-right and upper-left CTBs may be used as neighbours.
+ *
+ * In slice_or_tiles_left_boundary and slice_or_tiles_up_boundary, bit 0 is
+ * set when the neighbour fails the same-slice test and bit 1 when it fails
+ * the same-tile test.
+ */
+
+static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, int ctb_addr_ts)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    int ctb_size         = 1 << s->sps->log2_ctb_size;
+    int rs               = s->pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+    int addr_in_slice    = rs - s->sh.slice_addr;
+
+    /* nonzero when the neighbouring CTB shares the tile / slice */
+    int same_tile_left;
+    int same_tile_up;
+    int same_slice_left;
+    int same_slice_up;
+
+    s->tab_slice_address[rs] = s->sh.slice_addr;
+
+    if (s->pps->entropy_coding_sync_enabled_flag || !s->pps->tiles_enabled_flag) {
+        /* with entropy coding sync, a new QP group starts at the first CTB
+         * of each CTB row */
+        if (s->pps->entropy_coding_sync_enabled_flag &&
+            x_ctb == 0 && (y_ctb & (ctb_size - 1)) == 0)
+            lc->first_qp_group = 1;
+        lc->end_of_tiles_x = s->sps->width;
+    } else if (ctb_addr_ts &&
+               s->pps->tile_id[ctb_addr_ts] != s->pps->tile_id[ctb_addr_ts - 1]) {
+        /* first CTB of a new tile: refresh the horizontal tile limits */
+        int idxX = s->pps->col_idxX[x_ctb >> s->sps->log2_ctb_size];
+        lc->start_of_tiles_x = x_ctb;
+        lc->end_of_tiles_x   = x_ctb + (s->pps->column_width[idxX]<< s->sps->log2_ctb_size);
+        lc->first_qp_group   = 1;
+    }
+
+    lc->end_of_tiles_y = FFMIN(y_ctb + ctb_size, s->sps->height);
+
+    if (!s->pps->tiles_enabled_flag) {
+        same_tile_left  = 1;
+        same_tile_up    = 1;
+        same_slice_left = addr_in_slice > 0;
+        same_slice_up   = addr_in_slice >= s->sps->ctb_width;
+    } else {
+        same_tile_left  = 0;
+        same_slice_left = 0;
+        same_tile_up    = 0;
+        same_slice_up   = 0;
+
+        /* the tables are consulted only when the neighbour exists */
+        if (x_ctb > 0) {
+            same_tile_left  = s->pps->tile_id[ctb_addr_ts] ==
+                              s->pps->tile_id[s->pps->ctb_addr_rs_to_ts[rs - 1]];
+            same_slice_left = s->tab_slice_address[rs] ==
+                              s->tab_slice_address[rs - 1];
+        }
+        if (y_ctb > 0) {
+            same_tile_up  = s->pps->tile_id[ctb_addr_ts] ==
+                            s->pps->tile_id[s->pps->ctb_addr_rs_to_ts[rs - s->sps->ctb_width]];
+            same_slice_up = s->tab_slice_address[rs] ==
+                            s->tab_slice_address[rs - s->sps->ctb_width];
+        }
+    }
+
+    lc->slice_or_tiles_left_boundary = (!same_slice_left) + (!same_tile_left << 1);
+    lc->slice_or_tiles_up_boundary   = (!same_slice_up) + (!same_tile_up << 1);
+
+    lc->ctb_left_flag = x_ctb > 0 && addr_in_slice > 0 && same_tile_left;
+    lc->ctb_up_flag   = y_ctb > 0 && addr_in_slice >= s->sps->ctb_width && same_tile_up;
+    lc->ctb_up_right_flag =
+        y_ctb > 0 &&
+        addr_in_slice + 1 >= s->sps->ctb_width &&
+        s->pps->tile_id[ctb_addr_ts] ==
+        s->pps->tile_id[s->pps->ctb_addr_rs_to_ts[rs + 1 - s->sps->ctb_width]];
+    lc->ctb_up_left_flag =
+        x_ctb > 0 && y_ctb > 0 &&
+        addr_in_slice - 1 >= s->sps->ctb_width &&
+        s->pps->tile_id[ctb_addr_ts] ==
+        s->pps->tile_id[s->pps->ctb_addr_rs_to_ts[rs - 1 - s->sps->ctb_width]];
+}
+
+/**
+ * Decode the CTBs of the current slice segment, walking them in tile-scan
+ * order starting at the slice's first CTB.
+ *
+ * @return the tile-scan address following the last decoded CTB, or the
+ *         negative value returned by hls_coding_quadtree() on failure
+ */
+static int hls_slice_data(HEVCContext *s)
+{
+    const int log2_ctb     = s->sps->log2_ctb_size;
+    int ctb_size           = 1 << log2_ctb;
+    /* picture width expressed in CTBs, rounded up */
+    const int ctbs_per_row = (s->sps->width + (ctb_size - 1)) >> log2_ctb;
+    int ctb_addr_ts        = s->pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+    int more_data          = 1;
+    int x_ctb              = 0;
+    int y_ctb              = 0;
+
+    while (more_data && ctb_addr_ts < s->sps->ctb_size) {
+        int ctb_addr_rs = s->pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+
+        /* raster address -> top-left luma position of the CTB */
+        x_ctb = (ctb_addr_rs % ctbs_per_row) << log2_ctb;
+        y_ctb = (ctb_addr_rs / ctbs_per_row) << log2_ctb;
+
+        hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts);
+        ff_hevc_cabac_init(s, ctb_addr_ts);
+        hls_sao_param(s, x_ctb >> log2_ctb, y_ctb >> log2_ctb);
+
+        /* record the slice-level loop filter settings for this CTB */
+        s->deblock[ctb_addr_rs].disable     = s->sh.disable_deblocking_filter_flag;
+        s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
+        s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
+        s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+
+        more_data = hls_coding_quadtree(s, x_ctb, y_ctb, log2_ctb, 0);
+        if (more_data < 0)
+            return more_data;
+
+        /* the CABAC state is saved under the address of the *next* CTB */
+        ctb_addr_ts++;
+        ff_hevc_save_states(s, ctb_addr_ts);
+        ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+    }
+
+    /* the last decoded CTB sits in the bottom-right corner of the picture */
+    if (x_ctb + ctb_size >= s->sps->width &&
+        y_ctb + ctb_size >= s->sps->height)
+        ff_hevc_hls_filter(s, x_ctb, y_ctb);
+
+    return ctb_addr_ts;
+}
+
+/**
+ * Parse the 16-bit NAL unit header from s->HEVClc.gb.
+ *
+ * On success the parsed values are stored in s->nal_unit_type and
+ * s->temporal_id.
+ *
+ * @return AVERROR_INVALIDDATA if the packet is not a valid NAL unit,
+ * 0 if the unit should be skipped, 1 otherwise
+ */
+static int hls_nal_unit(HEVCContext *s)
+{
+    GetBitContext *gb = &s->HEVClc.gb;
+    int nuh_layer_id;
+
+    /* the leading bit has to be zero */
+    if (get_bits1(gb) != 0)
+        return AVERROR_INVALIDDATA;
+
+    s->nal_unit_type = get_bits(gb, 6);
+
+    nuh_layer_id   = get_bits(gb, 6);
+    /* the field is coded as temporal id plus one, so a coded 0 is invalid */
+    s->temporal_id = get_bits(gb, 3) - 1;
+    if (s->temporal_id < 0)
+        return AVERROR_INVALIDDATA;
+
+    av_log(s->avctx, AV_LOG_DEBUG,
+           "nal_unit_type: %d, nuh_layer_id: %d, temporal_id: %d\n",
+           s->nal_unit_type, nuh_layer_id, s->temporal_id);
+
+    /* only units with layer id 0 are decoded */
+    return (nuh_layer_id == 0);
+}
+
+/**
+ * Copy the samples of every minimum-size PU flagged in s->is_pcm from
+ * s->frame into s->sao_frame, for each of the three planes.
+ */
+static void restore_tqb_pixels(HEVCContext *s)
+{
+    int log2_min_pu          = s->sps->log2_min_pu_size;
+    int pic_width_in_min_pu  = s->sps->width  >> log2_min_pu;
+    int pic_height_in_min_pu = s->sps->height >> log2_min_pu;
+    int min_pu_size          = 1 << log2_min_pu;
+    int x, y, c_idx;
+
+    for (c_idx = 0; c_idx < 3; c_idx++) {
+        /* NOTE(review): one stride is applied to both frames, which assumes
+         * s->frame and s->sao_frame share their linesizes -- confirm */
+        ptrdiff_t stride = s->frame->linesize[c_idx];
+        int hshift = s->sps->hshift[c_idx];
+        int vshift = s->sps->vshift[c_idx];
+        /* Row length in BYTES: the sample count has to be scaled by
+         * pixel_shift just like the horizontal offset below, otherwise
+         * only half of each row is restored when pixel_shift is 1. */
+        int len  = (min_pu_size >> hshift) << s->sps->pixel_shift;
+        int rows = min_pu_size >> vshift;
+
+        for (y = 0; y < pic_height_in_min_pu; y++) {
+            for (x = 0; x < pic_width_in_min_pu; x++) {
+                ptrdiff_t offset;
+                const uint8_t *src;
+                uint8_t *dst;
+                int n;
+
+                if (!s->is_pcm[y * pic_width_in_min_pu + x])
+                    continue;
+
+                /* byte offset of the block's top-left sample in this plane */
+                offset = ((y << log2_min_pu) >> vshift) * stride +
+                         (((x << log2_min_pu) >> hshift) << s->sps->pixel_shift);
+                src = &s->frame->data[c_idx][offset];
+                dst = &s->sao_frame->data[c_idx][offset];
+
+                for (n = 0; n < rows; n++) {
+                    memcpy(dst, src, len);
+                    src += stride;
+                    dst += stride;
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Prepare the decoder state for a new picture: clear the per-picture tables,
+ * set up the new reference frame and the frame RPS, and fetch a frame for
+ * output if one is available.
+ *
+ * @return 0 on success, a negative error code otherwise; on failure s->ref
+ *         is reset to NULL after its progress has been reported as complete
+ */
+static int hevc_frame_start(HEVCContext *s)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    int min_pu_count = (s->sps->width  >> s->sps->log2_min_pu_size) *
+                       (s->sps->height >> s->sps->log2_min_pu_size);
+    int min_tu_count = (s->sps->width  >> s->sps->log2_min_transform_block_size) *
+                       (s->sps->height >> s->sps->log2_min_transform_block_size);
+    int ret;
+
+    /* wipe the tables that are filled again while decoding this picture */
+    memset(s->horizontal_bs, 0, 2 * s->bs_width * (s->bs_height + 1));
+    memset(s->vertical_bs,   0, 2 * s->bs_width * (s->bs_height + 1));
+    memset(s->cbf_luma,      0, min_tu_count);
+    memset(s->is_pcm,        0, min_pu_count);
+
+    lc->start_of_tiles_x = 0;
+    s->is_decoded        = 0;
+
+    if (s->pps->tiles_enabled_flag)
+        lc->end_of_tiles_x = s->pps->column_width[0] << s->sps->log2_ctb_size;
+
+    /* with SAO enabled the new reference is attached to sao_frame instead */
+    if (s->sps->sao_enabled)
+        ret = ff_hevc_set_new_ref(s, &s->sao_frame, s->poc);
+    else
+        ret = ff_hevc_set_new_ref(s, &s->frame, s->poc);
+    if (ret < 0)
+        goto fail;
+
+    av_fast_malloc(&lc->edge_emu_buffer, &lc->edge_emu_buffer_size,
+                   (MAX_PB_SIZE + 7) * s->ref->frame->linesize[0]);
+    if (!lc->edge_emu_buffer) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ret = ff_hevc_frame_rps(s);
+    if (ret < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
+        goto fail;
+    }
+
+    av_frame_unref(s->output_frame);
+    ret = ff_hevc_output_frame(s, s->output_frame, 0);
+    if (ret < 0)
+        goto fail;
+
+    ff_thread_finish_setup(s->avctx);
+
+    return 0;
+
+fail:
+    /* report the frame as finished before dropping it */
+    if (s->ref)
+        ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+    s->ref = NULL;
+    return ret;
+}
+
+/**
+ * Decode a single NAL unit whose escaped bytes have already been removed.
+ *
+ * Parameter sets and SEI messages are forwarded to their parsers; slice NAL
+ * units are decoded into the current frame; EOS/EOB units reset the sequence
+ * state; all other types are skipped.
+ *
+ * @param s      decoder context
+ * @param nal    start of the NAL unit (header included)
+ * @param length size of the NAL unit in bytes
+ * @return 0 on success or when the unit is skipped, a negative error code
+ *         otherwise
+ */
+static int decode_nal_unit(HEVCContext *s, const uint8_t *nal, int length)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    GetBitContext *gb = &lc->gb;
+    int ctb_addr_ts;
+    int ret;
+
+    ret = init_get_bits8(gb, nal, length);
+    if (ret < 0)
+        return ret;
+
+    ret = hls_nal_unit(s);
+    if (ret < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid NAL unit %d, skipping.\n",
+                s->nal_unit_type);
+        if (s->avctx->err_recognition & AV_EF_EXPLODE)
+            return ret;
+        return 0;
+    } else if (!ret)
+        return 0;
+
+    switch (s->nal_unit_type) {
+    case NAL_VPS:
+        ret = ff_hevc_decode_nal_vps(s);
+        if (ret < 0)
+            return ret;
+        break;
+    case NAL_SPS:
+        ret = ff_hevc_decode_nal_sps(s);
+        if (ret < 0)
+            return ret;
+        break;
+    case NAL_PPS:
+        ret = ff_hevc_decode_nal_pps(s);
+        if (ret < 0)
+            return ret;
+        break;
+    case NAL_SEI_PREFIX:
+    case NAL_SEI_SUFFIX:
+        ret = ff_hevc_decode_nal_sei(s);
+        if (ret < 0)
+            return ret;
+        break;
+    case NAL_TRAIL_R:
+    case NAL_TRAIL_N:
+    case NAL_TSA_N:
+    case NAL_TSA_R:
+    case NAL_STSA_N:
+    case NAL_STSA_R:
+    case NAL_BLA_W_LP:
+    case NAL_BLA_W_RADL:
+    case NAL_BLA_N_LP:
+    case NAL_IDR_W_RADL:
+    case NAL_IDR_N_LP:
+    case NAL_CRA_NUT:
+    case NAL_RADL_N:
+    case NAL_RADL_R:
+    case NAL_RASL_N:
+    case NAL_RASL_R:
+        ret = hls_slice_header(s);
+        if (ret < 0)
+            return ret;
+
+        /* max_ra == INT_MAX means no random access point has been seen yet.
+         * CRA and all three BLA types record their POC in max_ra; an IDR
+         * sets it to INT_MIN so that no later RASL picture is dropped. */
+        if (s->max_ra == INT_MAX) {
+            if (s->nal_unit_type == NAL_CRA_NUT    ||
+                s->nal_unit_type == NAL_BLA_W_LP   ||
+                s->nal_unit_type == NAL_BLA_W_RADL ||
+                s->nal_unit_type == NAL_BLA_N_LP) {
+                s->max_ra = s->poc;
+            } else {
+                if (IS_IDR(s))
+                    s->max_ra = INT_MIN;
+            }
+        }
+
+        /* skip RASL pictures whose POC does not exceed max_ra */
+        if ((s->nal_unit_type == NAL_RASL_R || s->nal_unit_type == NAL_RASL_N) &&
+            s->poc <= s->max_ra) {
+            s->is_decoded = 0;
+            break;
+        } else {
+            if (s->nal_unit_type == NAL_RASL_R && s->poc > s->max_ra)
+                s->max_ra = INT_MIN;
+        }
+
+        if (s->sh.first_slice_in_pic_flag) {
+            ret = hevc_frame_start(s);
+            if (ret < 0)
+                return ret;
+        } else if (!s->ref) {
+            av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
+            return AVERROR_INVALIDDATA;
+        }
+
+        if (!s->sh.dependent_slice_segment_flag &&
+            s->sh.slice_type != I_SLICE) {
+            ret = ff_hevc_slice_rpl(s);
+            if (ret < 0) {
+                av_log(s->avctx, AV_LOG_WARNING, "Error constructing the reference "
+                       "lists for the current slice.\n");
+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
+                    return ret;
+            }
+        }
+
+        ctb_addr_ts = hls_slice_data(s);
+        if (ctb_addr_ts < 0)
+            return ctb_addr_ts;
+
+        /* every CTB of the picture has been decoded */
+        if (ctb_addr_ts >= (s->sps->ctb_width * s->sps->ctb_height)) {
+            s->is_decoded = 1;
+            if ((s->pps->transquant_bypass_enable_flag ||
+                 (s->sps->pcm.loop_filter_disable_flag && s->sps->pcm_enabled_flag)) &&
+                s->sps->sao_enabled)
+                restore_tqb_pixels(s);
+        }
+        break;
+    case NAL_EOS_NUT:
+    case NAL_EOB_NUT:
+        /* start a new sequence: bump the 8-bit counter, forget the RAP */
+        s->seq_decode = (s->seq_decode + 1) & 0xff;
+        s->max_ra     = INT_MAX;
+        break;
+    case NAL_AUD:
+    case NAL_FD_NUT:
+        break;
+    default:
+        av_log(s->avctx, AV_LOG_INFO, "Skipping NAL unit %d\n", s->nal_unit_type);
+    }
+
+    return 0;
+}
+
+/* FIXME: This is adapted from ff_h264_decode_nal, avoiding duplication
+   between these functions would be nice. */
+/**
+ * Extract one NAL unit payload from src, removing the escape byte of every
+ * 00 00 03 sequence.
+ *
+ * When the input holds no escape sequence, nal->data points directly into
+ * src; otherwise the unescaped bytes are written to nal->rbsp_buffer, which
+ * is (re)allocated as needed and zero-padded at the end.
+ *
+ * @param src    input bytes, starting at the NAL unit header
+ * @param length number of bytes available in src
+ * @param nal    receives the data pointer and the size of the extracted unit
+ * @return the number of bytes of src that were consumed, or AVERROR(ENOMEM)
+ */
+static int extract_rbsp(const uint8_t *src, int length,
+                        HEVCNAL *nal)
+{
+    int i, si, di;
+    uint8_t *dst;
+
+#define STARTCODE_TEST                                                  \
+        if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) {     \
+            if (src[i + 2] != 3) {                                      \
+                /* startcode, so we must be past the end */             \
+                length = i;                                             \
+            }                                                           \
+            break;                                                      \
+        }
+    /* The loops below look for the first 00 00 0x sequence with x <= 3 and
+     * leave i at its position.
+     * NOTE(review): the word-sized variants read 8 (or 4) bytes at src + i
+     * without comparing against length, so they appear to rely on the input
+     * being padded -- confirm with the callers. */
+#if HAVE_FAST_UNALIGNED
+#define FIND_FIRST_ZERO                                                 \
+        if (i > 0 && !src[i])                                           \
+            i--;                                                        \
+        while (src[i])                                                  \
+            i++
+#if HAVE_FAST_64BIT
+    for (i = 0; i + 1 < length; i += 9) {
+        if (!((~AV_RN64A(src + i) &
+               (AV_RN64A(src + i) - 0x0100010001000101ULL)) &
+              0x8000800080008080ULL))
+            continue;
+        FIND_FIRST_ZERO;
+        STARTCODE_TEST;
+        i -= 7;
+    }
+#else
+    for (i = 0; i + 1 < length; i += 5) {
+        if (!((~AV_RN32A(src + i) &
+               (AV_RN32A(src + i) - 0x01000101U)) &
+              0x80008080U))
+            continue;
+        FIND_FIRST_ZERO;
+        STARTCODE_TEST;
+        i -= 3;
+    }
+#endif
+#else
+    for (i = 0; i + 1 < length; i += 2) {
+        if (src[i])
+            continue;
+        if (i > 0 && src[i - 1] == 0)
+            i--;
+        STARTCODE_TEST;
+    }
+#endif
+
+    /* Nothing to unescape: either no candidate was found, or a start code
+     * was hit and length was truncated to its position. */
+    if (i >= length - 1) { // no escaped 0
+        nal->data = src;
+        nal->size = length;
+        return length;
+    }
+
+    av_fast_malloc(&nal->rbsp_buffer, &nal->rbsp_buffer_size,
+                   length + FF_INPUT_BUFFER_PADDING_SIZE);
+    if (!nal->rbsp_buffer)
+        return AVERROR(ENOMEM);
+
+    dst = nal->rbsp_buffer;
+
+    /* everything before the first escape candidate is copied verbatim */
+    memcpy(dst, src, i);
+    si = di = i;
+    while (si + 2 < length) {
+        // remove escapes (very rare 1:2^22)
+        if (src[si + 2] > 3) {
+            /* no escape or start code can begin at si or si + 1, so these two
+             * bytes plus the one copied after the chain go through at once */
+            dst[di++] = src[si++];
+            dst[di++] = src[si++];
+        } else if (src[si] == 0 && src[si + 1] == 0) {
+            if (src[si + 2] == 3) { // escape
+                dst[di++]  = 0;
+                dst[di++]  = 0;
+                si        += 3;
+
+                continue;
+            } else // next start code
+                goto nsc;
+        }
+
+        dst[di++] = src[si++];
+    }
+    /* fewer than three bytes are left, so no further escape is possible */
+    while (si < length)
+        dst[di++] = src[si++];
+nsc:
+
+    memset(dst + di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
+
+    nal->data = dst;
+    nal->size = di;
+    return si;
+}
+
+/**
+ * Split a packet into NAL units and decode them in order.
+ *
+ * The first pass unescapes every unit into s->nals and notes whether an end
+ * of sequence/bitstream unit is present (s->eos); the second pass decodes
+ * the units. Decoding errors are only fatal when AV_EF_EXPLODE is set.
+ *
+ * @param s      decoder context
+ * @param buf    packet data, in Annex B or length-prefixed form depending on
+ *               s->is_nalff
+ * @param length size of buf in bytes
+ * @return 0 or a positive value on success, a negative error code otherwise
+ */
+static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
+{
+    int i, consumed, ret = 0;
+
+    s->ref = NULL;
+    s->eos = 0;
+
+    /* split the input packet into NAL units, so we know the upper bound on the
+     * number of slices in the frame */
+    s->nb_nals = 0;
+    while (length >= 4) {
+        HEVCNAL *nal;
+        int extract_length = 0;
+
+        if (s->disable_au == 0) {
+            if (s->is_nalff) {
+                int j;
+                /* big-endian length prefix of nal_length_size bytes */
+                for (j = 0; j < s->nal_length_size; j++)
+                    extract_length = (extract_length << 8) | buf[j];
+                buf    += s->nal_length_size;
+                length -= s->nal_length_size;
+
+                if (extract_length > length) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid NAL unit size.\n");
+                    ret = AVERROR_INVALIDDATA;
+                    goto fail;
+                }
+            } else {
+                /* skip zero bytes in front of the 00 00 01 start code */
+                if (buf[2] == 0) {
+                    length--;
+                    buf++;
+                    continue;
+                }
+                if (buf[0] != 0 || buf[1] != 0 || buf[2] != 1) {
+                    ret = AVERROR_INVALIDDATA;
+                    goto fail;
+                }
+
+                buf    += 3;
+                length -= 3;
+            }
+        }
+        if (!s->is_nalff || s->disable_au)
+            extract_length = length;
+
+        if (s->nals_allocated < s->nb_nals + 1) {
+            int new_size = s->nals_allocated + 1;
+            HEVCNAL *tmp = av_realloc_array(s->nals, new_size, sizeof(*tmp));
+            if (!tmp) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
+            s->nals = tmp;
+            /* new entries must start out with a NULL rbsp_buffer */
+            memset(s->nals + s->nals_allocated, 0, (new_size - s->nals_allocated) * sizeof(*tmp));
+            s->nals_allocated = new_size;
+        }
+        nal = &s->nals[s->nb_nals++];
+
+        consumed = extract_rbsp(buf, extract_length, nal);
+        if (consumed < 0) {
+            ret = consumed;
+            goto fail;
+        }
+
+        /* peek at the header only to learn the unit type */
+        ret = init_get_bits8(&s->HEVClc.gb, nal->data, nal->size);
+        if (ret < 0)
+            goto fail;
+        hls_nal_unit(s);
+
+        /* both end of sequence and end of bitstream terminate the sequence */
+        if (s->nal_unit_type == NAL_EOB_NUT || s->nal_unit_type == NAL_EOS_NUT)
+            s->eos = 1;
+
+        buf    += consumed;
+        length -= consumed;
+    }
+
+    /* parse the NAL units */
+    for (i = 0; i < s->nb_nals; i++) {
+        int err = decode_nal_unit(s, s->nals[i].data, s->nals[i].size);
+        if (err < 0) {
+            av_log(s->avctx, AV_LOG_WARNING, "Error parsing NAL unit #%d.\n", i);
+            if (s->avctx->err_recognition & AV_EF_EXPLODE) {
+                /* hand the failure to the caller instead of dropping it */
+                ret = err;
+                goto fail;
+            }
+        }
+    }
+
+fail:
+    if (s->ref)
+        ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+
+    return ret;
+}
+
+/** Log the 16 bytes of an MD5 digest as lowercase hex at the given level. */
+static void print_md5(void *log_ctx, int level,  uint8_t md5[16])
+{
+    const uint8_t *byte = md5;
+    const uint8_t *end  = md5 + 16;
+
+    while (byte < end) {
+        av_log(log_ctx, level, "%02"PRIx8, *byte);
+        byte++;
+    }
+}
+
+/**
+ * Compare the MD5 of every plane of frame with the digests in s->md5.
+ *
+ * @param s     decoder context holding the expected digests
+ * @param frame decoded frame to check
+ * @return 0 if all planes match, AVERROR_INVALIDDATA on the first mismatch,
+ *         AVERROR(EINVAL) if the pixel format has no descriptor,
+ *         AVERROR(ENOMEM) if the byteswap buffer cannot be allocated
+ */
+static int verify_md5(HEVCContext *s, AVFrame *frame)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+    int pixel_shift;
+    int i, j;
+
+    /* desc has to be validated before it is dereferenced below */
+    if (!desc)
+        return AVERROR(EINVAL);
+
+    /* 1 when samples are wider than 8 bits, i.e. take two bytes each */
+    pixel_shift = desc->comp[0].depth_minus1 > 7;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
+           s->poc);
+
+    /* the checksums are LE, so we have to byteswap for >8bpp formats
+     * on BE arches */
+#if HAVE_BIGENDIAN
+    if (pixel_shift && !s->checksum_buf) {
+        av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
+                       FFMAX3(frame->linesize[0], frame->linesize[1],
+                              frame->linesize[2]));
+        if (!s->checksum_buf)
+            return AVERROR(ENOMEM);
+    }
+#endif
+
+    for (i = 0; frame->data[i]; i++) {
+        int width  = s->avctx->coded_width;
+        int height = s->avctx->coded_height;
+        /* planes 1 and 2 are subsampled by the chroma shifts */
+        int w = (i == 1 || i == 2) ? (width  >> desc->log2_chroma_w) : width;
+        int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
+        uint8_t md5[16];
+
+        av_md5_init(s->md5_ctx);
+        for (j = 0; j < h; j++) {
+            const uint8_t *src = frame->data[i] + j * frame->linesize[i];
+#if HAVE_BIGENDIAN
+            if (pixel_shift) {
+                s->dsp.bswap16_buf((uint16_t*)s->checksum_buf,
+                                   (const uint16_t*)src, w);
+                src = s->checksum_buf;
+            }
+#endif
+            /* hash only the visible bytes of the row, not the padding */
+            av_md5_update(s->md5_ctx, src, w << pixel_shift);
+        }
+        av_md5_final(s->md5_ctx, md5);
+
+        if (!memcmp(md5, s->md5[i], 16)) {
+            av_log   (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
+            print_md5(s->avctx, AV_LOG_DEBUG, md5);
+            av_log   (s->avctx, AV_LOG_DEBUG, "; ");
+        } else {
+            av_log   (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
+            print_md5(s->avctx, AV_LOG_ERROR, md5);
+            av_log   (s->avctx, AV_LOG_ERROR, " != ");
+            print_md5(s->avctx, AV_LOG_ERROR, s->md5[i]);
+            av_log   (s->avctx, AV_LOG_ERROR, "\n");
+            return AVERROR_INVALIDDATA;
+        }
+    }
+
+    av_log(s->avctx, AV_LOG_DEBUG, "\n");
+
+    return 0;
+}
+
+/**
+ * Decode one packet.
+ *
+ * An empty packet drains a buffered frame via ff_hevc_output_frame().
+ * Otherwise the packet's NAL units are decoded, the SEI checksum is
+ * verified when requested, and a pending output frame is moved into data.
+ *
+ * @return 0 for an empty packet, the packet size on success, a negative
+ *         error code otherwise
+ */
+static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
+                             AVPacket *avpkt)
+{
+    HEVCContext *s = avctx->priv_data;
+    int ret;
+
+    //av_log(avctx, AV_LOG_WARNING, "decode size %d\n", avpkt->size);
+
+    if (avpkt->size == 0) {
+        /* drain: the return value tells whether a frame was produced */
+        ret = ff_hevc_output_frame(s, data, 1);
+        if (ret < 0)
+            return ret;
+
+        *got_output = ret;
+        return 0;
+    }
+
+    s->ref = NULL;
+    ret    = decode_nal_units(s, avpkt->data, avpkt->size);
+    if (ret < 0)
+        return ret;
+
+    /* verify the SEI checksum */
+    if ((avctx->err_recognition & AV_EF_CRCCHECK) && s->is_decoded &&
+        s->is_md5) {
+        ret = verify_md5(s, s->ref->frame);
+        if (ret < 0 && (avctx->err_recognition & AV_EF_EXPLODE)) {
+            ff_hevc_unref_frame(s, s->ref, ~0);
+            return ret;
+        }
+    }
+    s->is_md5 = 0;
+
+    if (s->is_decoded) {
+        av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
+        s->is_decoded = 0;
+    }
+
+    /* a frame fetched at frame start is waiting to be returned */
+    if (s->output_frame->buf[0]) {
+        av_frame_move_ref(data, s->output_frame);
+        *got_output = 1;
+    }
+
+    return avpkt->size;
+}
+
+/**
+ * Make dst a new reference to src: the frame and its side buffers are
+ * referenced, the plain fields are copied.
+ *
+ * @return 0 on success, the error from ff_thread_ref_frame() if that fails,
+ *         AVERROR(ENOMEM) if a buffer reference cannot be created (dst is
+ *         unreferenced again in that case)
+ */
+static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src)
+{
+    int ret = ff_thread_ref_frame(&dst->tf, &src->tf);
+
+    if (ret < 0)
+        return ret;
+
+    /* motion vector field table */
+    dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
+    if (!dst->tab_mvf_buf)
+        goto nomem;
+    dst->tab_mvf = src->tab_mvf;
+
+    /* reference picture list table */
+    dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
+    if (!dst->rpl_tab_buf)
+        goto nomem;
+    dst->rpl_tab = src->rpl_tab;
+
+    dst->rpl_buf = av_buffer_ref(src->rpl_buf);
+    if (!dst->rpl_buf)
+        goto nomem;
+
+    /* plain values are copied only once every reference succeeded */
+    dst->poc       = src->poc;
+    dst->ctb_count = src->ctb_count;
+    dst->window    = src->window;
+    dst->flags     = src->flags;
+    dst->sequence  = src->sequence;
+
+    return 0;
+
+nomem:
+    ff_hevc_unref_frame(s, dst, ~0);
+    return AVERROR(ENOMEM);
+}
+
+/**
+ * Release everything owned by the decoder context. Also used as the cleanup
+ * path of hevc_init_context(), so it has to cope with a partially
+ * initialized context.
+ *
+ * @return always 0
+ */
+static av_cold int hevc_decode_free(AVCodecContext *avctx)
+{
+    HEVCContext *s = avctx->priv_data;
+    int n;
+
+    pic_arrays_free(s);
+
+    av_freep(&s->HEVClc.edge_emu_buffer);
+    av_freep(&s->md5_ctx);
+
+    av_frame_free(&s->tmp_frame);
+    av_frame_free(&s->output_frame);
+
+    /* decoded picture buffer: drop the references, then the frames */
+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
+        ff_hevc_unref_frame(s, &s->DPB[n], ~0);
+        av_frame_free(&s->DPB[n].frame);
+    }
+
+    /* parameter sets */
+    for (n = 0; n < FF_ARRAY_ELEMS(s->vps_list); n++)
+        av_freep(&s->vps_list[n]);
+    for (n = 0; n < FF_ARRAY_ELEMS(s->sps_list); n++)
+        av_buffer_unref(&s->sps_list[n]);
+    for (n = 0; n < FF_ARRAY_ELEMS(s->pps_list); n++)
+        av_buffer_unref(&s->pps_list[n]);
+
+    /* per-NAL unescape buffers and the NAL array itself */
+    for (n = 0; n < s->nals_allocated; n++)
+        av_freep(&s->nals[n].rbsp_buffer);
+    av_freep(&s->nals);
+    s->nals_allocated = 0;
+
+    return 0;
+}
+
+/**
+ * Allocate the frames and helper contexts owned by the decoder context.
+ *
+ * @return 0 on success; AVERROR(ENOMEM) if any allocation fails, in which
+ *         case everything allocated so far is freed again
+ */
+static av_cold int hevc_init_context(AVCodecContext *avctx)
+{
+    HEVCContext *s = avctx->priv_data;
+    int idx;
+
+    s->avctx = avctx;
+
+    if (!(s->tmp_frame = av_frame_alloc()))
+        goto nomem;
+
+    if (!(s->output_frame = av_frame_alloc()))
+        goto nomem;
+
+    for (idx = 0; idx < FF_ARRAY_ELEMS(s->DPB); idx++) {
+        if (!(s->DPB[idx].frame = av_frame_alloc()))
+            goto nomem;
+        /* the ThreadFrame wraps the same AVFrame */
+        s->DPB[idx].tf.f = s->DPB[idx].frame;
+    }
+
+    /* INT_MAX: no random access point seen yet */
+    s->max_ra = INT_MAX;
+
+    if (!(s->md5_ctx = av_md5_alloc()))
+        goto nomem;
+
+    ff_dsputil_init(&s->dsp, avctx);
+
+    s->context_initialized = 1;
+
+    return 0;
+
+nomem:
+    hevc_decode_free(avctx);
+    return AVERROR(ENOMEM);
+}
+
+/**
+ * Frame-threading callback: bring the context of dst in line with src by
+ * re-referencing the DPB frames and SPS/PPS buffers and copying the
+ * sequence state.
+ *
+ * @return 0 on success, a negative error code otherwise
+ */
+static int hevc_update_thread_context(AVCodecContext *dst,
+                                      const AVCodecContext *src)
+{
+    HEVCContext *s  = dst->priv_data;
+    HEVCContext *s0 = src->priv_data;
+    int n, ret;
+
+    if (!s->context_initialized) {
+        ret = hevc_init_context(dst);
+        if (ret < 0)
+            return ret;
+    }
+
+    /* mirror the decoded picture buffer */
+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
+        ff_hevc_unref_frame(s, &s->DPB[n], ~0);
+        if (!s0->DPB[n].frame->buf[0])
+            continue;
+        ret = hevc_ref_frame(s, &s->DPB[n], &s0->DPB[n]);
+        if (ret < 0)
+            return ret;
+    }
+
+    /* mirror the sequence parameter sets */
+    for (n = 0; n < FF_ARRAY_ELEMS(s->sps_list); n++) {
+        av_buffer_unref(&s->sps_list[n]);
+        if (!s0->sps_list[n])
+            continue;
+        s->sps_list[n] = av_buffer_ref(s0->sps_list[n]);
+        if (!s->sps_list[n])
+            return AVERROR(ENOMEM);
+    }
+
+    /* mirror the picture parameter sets */
+    for (n = 0; n < FF_ARRAY_ELEMS(s->pps_list); n++) {
+        av_buffer_unref(&s->pps_list[n]);
+        if (!s0->pps_list[n])
+            continue;
+        s->pps_list[n] = av_buffer_ref(s0->pps_list[n]);
+        if (!s->pps_list[n])
+            return AVERROR(ENOMEM);
+    }
+
+    s->seq_decode      = s0->seq_decode;
+    s->seq_output      = s0->seq_output;
+    s->pocTid0         = s0->pocTid0;
+    s->max_ra          = s0->max_ra;
+    s->is_nalff        = s0->is_nalff;
+    s->nal_length_size = s0->nal_length_size;
+
+    /* the source thread saw an end of sequence: start a new one here */
+    if (s0->eos) {
+        s->seq_decode = (s->seq_decode + 1) & 0xff;
+        s->max_ra     = INT_MAX;
+    }
+
+    return 0;
+}
+
+/**
+ * Decode the parameter sets stored in avctx->extradata, which is either in
+ * hvcC form or a plain Annex B byte stream. Sets s->is_nalff and, for hvcC,
+ * s->nal_length_size.
+ *
+ * @return 0 on success, a negative error code otherwise
+ */
+static int hevc_decode_extradata(HEVCContext *s)
+{
+    AVCodecContext *avctx = s->avctx;
+    GetByteContext gb;
+    int array_idx, nal_idx, num_arrays;
+    int nal_len_size;
+    int ret;
+
+    bytestream2_init(&gb, avctx->extradata, avctx->extradata_size);
+
+    /* Annex B data starts with 00 00 00 or 00 00 01 */
+    if (!(avctx->extradata_size > 3 &&
+          (avctx->extradata[0] || avctx->extradata[1] ||
+           avctx->extradata[2] > 1))) {
+        s->is_nalff = 0;
+        ret = decode_nal_units(s, avctx->extradata, avctx->extradata_size);
+        return ret < 0 ? ret : 0;
+    }
+
+    /* It seems the extradata is encoded as hvcC format.
+     * Temporarily, we support configurationVersion==0 until 14496-15 3rd finalized.
+     * When finalized, configurationVersion will be 1 and we can recognize hvcC by
+     * checking if avctx->extradata[0]==1 or not. */
+    s->is_nalff = 1;
+
+    bytestream2_skip(&gb, 21);
+    nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1;
+    num_arrays   = bytestream2_get_byte(&gb);
+
+    /* nal units in the hvcC always have length coded with 2 bytes,
+     * so put a fake nal_length_size = 2 while parsing them */
+    s->nal_length_size = 2;
+
+    /* Decode nal units from hvcC. */
+    for (array_idx = 0; array_idx < num_arrays; array_idx++) {
+        int type = bytestream2_get_byte(&gb) & 0x3f;
+        int cnt  = bytestream2_get_be16(&gb);
+
+        for (nal_idx = 0; nal_idx < cnt; nal_idx++) {
+            // +2 for the nal size field
+            int nalsize = bytestream2_peek_be16(&gb) + 2;
+
+            if (bytestream2_get_bytes_left(&gb) < nalsize) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid NAL unit size in extradata.\n");
+                return AVERROR_INVALIDDATA;
+            }
+
+            ret = decode_nal_units(s, gb.buffer, nalsize);
+            if (ret < 0) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Decoding nal unit %d %d from hvcC failed\n", type, array_idx);
+                return ret;
+            }
+            bytestream2_skip(&gb, nalsize);
+        }
+    }
+
+    /* Now store right nal length size, that will be used to parse all other nals */
+    s->nal_length_size = nal_len_size;
+
+    return 0;
+}
+
+/**
+ * Codec init callback: set up the CABAC tables and the decoder context,
+ * then decode the extradata if there is any.
+ *
+ * @return 0 on success, a negative error code otherwise
+ */
+static av_cold int hevc_decode_init(AVCodecContext *avctx)
+{
+    HEVCContext *s = avctx->priv_data;
+    int ret;
+
+    ff_init_cabac_states();
+
+    avctx->internal->allocate_progress = 1;
+
+    ret = hevc_init_context(avctx);
+    if (ret < 0)
+        return ret;
+
+    /* nothing more to do without extradata */
+    if (avctx->extradata_size <= 0 || !avctx->extradata)
+        return 0;
+
+    ret = hevc_decode_extradata(s);
+    if (ret < 0) {
+        hevc_decode_free(avctx);
+        return ret;
+    }
+
+    return 0;
+}
+
+/**
+ * Frame-threading callback: reset the private context of a thread copy to
+ * zero and initialize it from scratch.
+ *
+ * @return 0 on success, a negative error code otherwise
+ */
+static av_cold int hevc_init_thread_copy(AVCodecContext *avctx)
+{
+    HEVCContext *s = avctx->priv_data;
+    int ret;
+
+    memset(s, 0, sizeof(*s));
+
+    ret = hevc_init_context(avctx);
+
+    return ret < 0 ? ret : 0;
+}
+
+/** Flush callback: empty the DPB and forget the last random access point. */
+static void hevc_decode_flush(AVCodecContext *avctx)
+{
+    HEVCContext *ctx = avctx->priv_data;
+
+    ff_hevc_flush_dpb(ctx);
+    /* INT_MAX: no random access point seen yet */
+    ctx->max_ra = INT_MAX;
+}
+
+#define OFFSET(x) offsetof(HEVCContext, x)
+#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+/* Private decoder options; both are integer switches limited to 0..1 and
+ * default to 0. */
+static const AVOption options[] = {
+    { "disable-au", "disable read frame AU by AU", OFFSET(disable_au),
+        AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, PAR },
+    { "strict-displaywin", "strictly apply default display window size", OFFSET(strict_def_disp_win),
+        AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, PAR },
+    { NULL },
+};
+
+/* AVClass tying the private option table to the decoder context. */
+static const AVClass hevc_decoder_class = {
+    .class_name = "HEVC decoder",
+    .version    = LIBAVUTIL_VERSION_INT,
+    .item_name  = av_default_item_name,
+    .option     = options,
+};
+
+/* Codec descriptor exposing the HEVC decoder and its callbacks. */
+AVCodec ff_hevc_decoder = {
+    /* identification */
+    .name                  = "hevc",
+    .long_name             = NULL_IF_CONFIG_SMALL("HEVC (High Efficiency Video Coding)"),
+    .type                  = AVMEDIA_TYPE_VIDEO,
+    .id                    = AV_CODEC_ID_HEVC,
+    .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_FRAME_THREADS,
+    /* private context */
+    .priv_data_size        = sizeof(HEVCContext),
+    .priv_class            = &hevc_decoder_class,
+    /* callbacks */
+    .init                  = hevc_decode_init,
+    .close                 = hevc_decode_free,
+    .decode                = hevc_decode_frame,
+    .flush                 = hevc_decode_flush,
+    .update_thread_context = hevc_update_thread_context,
+    .init_thread_copy      = hevc_init_thread_copy,
+};
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
new file mode 100644
index 0000000000..1aaf584260
--- /dev/null
+++ b/libavcodec/hevc.h
@@ -0,0 +1,988 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HEVC_H
+#define AVCODEC_HEVC_H
+
+#include "libavutil/buffer.h"
+#include "libavutil/md5.h"
+
+#include "avcodec.h"
+#include "cabac.h"
+#include "dsputil.h"
+#include "get_bits.h"
+#include "hevcpred.h"
+#include "hevcdsp.h"
+#include "internal.h"
+#include "thread.h"
+#include "videodsp.h"
+
+#define MAX_DPB_SIZE 16 // A.4.1
+#define MAX_REFS 16 // capacity of the ref[]/list[]/isLongTerm[] arrays in RefPicList
+
+/**
+ * 7.4.2.1
+ */
+#define MAX_SUB_LAYERS 7 // sizes the per-sub-layer arrays in PTL, VPS and HEVCSPS
+#define MAX_VPS_COUNT 16 // size of HEVCContext.vps_list
+#define MAX_SPS_COUNT 32 // size of HEVCContext.sps_list
+#define MAX_PPS_COUNT 256 // size of HEVCContext.pps_list
+#define MAX_SHORT_TERM_RPS_COUNT 64 // size of HEVCSPS.st_rps
+#define MAX_CU_SIZE 128 // row stride of SAMPLE_CBF; TransformTree cbf arrays hold MAX_CU_SIZE * MAX_CU_SIZE entries per depth
+
+//TODO: check if this is really the maximum
+#define MAX_TRANSFORM_DEPTH 5
+
+#define MAX_TB_SIZE 32
+#define MAX_PB_SIZE 64 // sizes HEVCLocalContext.mc_buffer
+#define MAX_LOG2_CTB_SIZE 6
+#define MAX_QP 51
+#define DEFAULT_INTRA_TC_OFFSET 2
+
+#define HEVC_CONTEXTS 183 // length of the cabac_state arrays in HEVCLocalContext and HEVCContext
+
+#define MRG_MAX_NUM_CANDS     5
+
+#define L0 0
+#define L1 1
+
+#define EPEL_EXTRA_BEFORE 1
+#define EPEL_EXTRA_AFTER  2
+#define EPEL_EXTRA        3 // == EPEL_EXTRA_BEFORE + EPEL_EXTRA_AFTER
+
+/**
+ * Value at position (x, y) in the 2D array tab, row stride s->sps->width; a variable named s must be in scope.
+ */
+#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
+#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * pic_width_in_ctb + (x)]) // row stride pic_width_in_ctb, taken from the expanding scope
+#define SAMPLE_CBF(tab, x, y) ((tab)[((y) & ((1<<log2_trafo_size)-1)) * MAX_CU_SIZE + ((x) & ((1<<log2_trafo_size)-1))]) // x, y masked to (1 << log2_trafo_size) - 1; log2_trafo_size comes from the expanding scope
+
+#define IS_IDR(s) ((s)->nal_unit_type == NAL_IDR_W_RADL || (s)->nal_unit_type == NAL_IDR_N_LP) // nonzero for the two NAL_IDR_* types; s may be any pointer expression but is evaluated more than once
+#define IS_BLA(s) ((s)->nal_unit_type == NAL_BLA_W_RADL || (s)->nal_unit_type == NAL_BLA_W_LP || \
+                   (s)->nal_unit_type == NAL_BLA_N_LP) // nonzero for the three NAL_BLA_* types; same multiple-evaluation caveat
+
+/**
+ * Table 7-3: NAL unit type codes
+ */
+enum NALUnitType { // type of HEVCContext.nal_unit_type; tested by IS_IDR() / IS_BLA()
+    NAL_TRAIL_N     =  0,
+    NAL_TRAIL_R     =  1,
+    NAL_TSA_N       =  2,
+    NAL_TSA_R       =  3,
+    NAL_STSA_N      =  4,
+    NAL_STSA_R      =  5,
+    NAL_RADL_N      =  6,
+    NAL_RADL_R      =  7,
+    NAL_RASL_N      =  8,
+    NAL_RASL_R      =  9,
+    NAL_BLA_W_LP    = 16, // values 10..15 are not named in this enum
+    NAL_BLA_W_RADL  = 17,
+    NAL_BLA_N_LP    = 18,
+    NAL_IDR_W_RADL  = 19,
+    NAL_IDR_N_LP    = 20,
+    NAL_CRA_NUT     = 21,
+    NAL_VPS         = 32, // values 22..31 are not named in this enum
+    NAL_SPS         = 33,
+    NAL_PPS         = 34,
+    NAL_AUD         = 35,
+    NAL_EOS_NUT     = 36,
+    NAL_EOB_NUT     = 37,
+    NAL_FD_NUT      = 38,
+    NAL_SEI_PREFIX  = 39,
+    NAL_SEI_SUFFIX  = 40,
+};
+
+enum RPSType { // kinds of reference picture set; HEVCContext.rps[] has one RefPicList per kind (presumably indexed by this enum — confirm)
+    ST_CURR_BEF = 0,
+    ST_CURR_AFT,
+    ST_FOLL,
+    LT_CURR,
+    LT_FOLL,
+    NB_RPS_TYPE, // count of the entries above (evaluates to 5)
+};
+
+enum SliceType { // type of SliceHeader.slice_type
+    B_SLICE = 0,
+    P_SLICE = 1,
+    I_SLICE = 2,
+};
+
+enum SyntaxElement { // NOTE(review): num_bins_in_se[] in hevc_cabac.c is commented in this same order, so it appears to be indexed by this enum — keep both in sync
+    SAO_MERGE_FLAG = 0,
+    SAO_TYPE_IDX,
+    SAO_EO_CLASS,
+    SAO_BAND_POSITION,
+    SAO_OFFSET_ABS,
+    SAO_OFFSET_SIGN,
+    END_OF_SLICE_FLAG,
+    SPLIT_CODING_UNIT_FLAG,
+    CU_TRANSQUANT_BYPASS_FLAG,
+    SKIP_FLAG,
+    CU_QP_DELTA,
+    PRED_MODE_FLAG,
+    PART_MODE,
+    PCM_FLAG,
+    PREV_INTRA_LUMA_PRED_FLAG,
+    MPM_IDX,
+    REM_INTRA_LUMA_PRED_MODE,
+    INTRA_CHROMA_PRED_MODE,
+    MERGE_FLAG,
+    MERGE_IDX,
+    INTER_PRED_IDC,
+    REF_IDX_L0,
+    REF_IDX_L1,
+    ABS_MVD_GREATER0_FLAG,
+    ABS_MVD_GREATER1_FLAG,
+    ABS_MVD_MINUS2,
+    MVD_SIGN_FLAG,
+    MVP_LX_FLAG,
+    NO_RESIDUAL_DATA_FLAG,
+    SPLIT_TRANSFORM_FLAG,
+    CBF_LUMA,
+    CBF_CB_CR,
+    TRANSFORM_SKIP_FLAG,
+    LAST_SIGNIFICANT_COEFF_X_PREFIX,
+    LAST_SIGNIFICANT_COEFF_Y_PREFIX,
+    LAST_SIGNIFICANT_COEFF_X_SUFFIX,
+    LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
+    SIGNIFICANT_COEFF_GROUP_FLAG,
+    SIGNIFICANT_COEFF_FLAG,
+    COEFF_ABS_LEVEL_GREATER1_FLAG,
+    COEFF_ABS_LEVEL_GREATER2_FLAG,
+    COEFF_ABS_LEVEL_REMAINING,
+    COEFF_SIGN_FLAG,
+};
+
+enum PartMode { // type of CodingUnit.part_mode
+    PART_2Nx2N = 0,
+    PART_2NxN  = 1,
+    PART_Nx2N  = 2,
+    PART_NxN   = 3,
+    PART_2NxnU = 4,
+    PART_2NxnD = 5,
+    PART_nLx2N = 6,
+    PART_nRx2N = 7,
+};
+
+enum PredMode { // type of CodingUnit.pred_mode
+    MODE_INTER = 0,
+    MODE_INTRA,
+    MODE_SKIP,
+};
+
+enum InterPredIdc { // presumably the result domain of ff_hevc_inter_pred_idc_decode() — confirm
+    PRED_L0 = 0,
+    PRED_L1,
+    PRED_BI,
+};
+
+enum IntraPredMode { // 35 modes, values 0..34; each INTRA_ANGULAR_n has the value n
+    INTRA_PLANAR = 0,
+    INTRA_DC,
+    INTRA_ANGULAR_2,
+    INTRA_ANGULAR_3,
+    INTRA_ANGULAR_4,
+    INTRA_ANGULAR_5,
+    INTRA_ANGULAR_6,
+    INTRA_ANGULAR_7,
+    INTRA_ANGULAR_8,
+    INTRA_ANGULAR_9,
+    INTRA_ANGULAR_10,
+    INTRA_ANGULAR_11,
+    INTRA_ANGULAR_12,
+    INTRA_ANGULAR_13,
+    INTRA_ANGULAR_14,
+    INTRA_ANGULAR_15,
+    INTRA_ANGULAR_16,
+    INTRA_ANGULAR_17,
+    INTRA_ANGULAR_18,
+    INTRA_ANGULAR_19,
+    INTRA_ANGULAR_20,
+    INTRA_ANGULAR_21,
+    INTRA_ANGULAR_22,
+    INTRA_ANGULAR_23,
+    INTRA_ANGULAR_24,
+    INTRA_ANGULAR_25,
+    INTRA_ANGULAR_26,
+    INTRA_ANGULAR_27,
+    INTRA_ANGULAR_28,
+    INTRA_ANGULAR_29,
+    INTRA_ANGULAR_30,
+    INTRA_ANGULAR_31,
+    INTRA_ANGULAR_32,
+    INTRA_ANGULAR_33,
+    INTRA_ANGULAR_34,
+};
+
+enum SAOType { // presumably the values stored in SAOParams.type_idx[] — confirm
+    SAO_NOT_APPLIED = 0,
+    SAO_BAND,
+    SAO_EDGE,
+};
+
+enum SAOEOClass { // presumably the values stored in SAOParams.eo_class[] — confirm
+    SAO_EO_HORIZ = 0,
+    SAO_EO_VERT,
+    SAO_EO_135D,
+    SAO_EO_45D,
+};
+
+enum ScanType { // presumably the scan_idx argument of ff_hevc_significant_coeff_flag_decode() — confirm
+    SCAN_DIAG = 0,
+    SCAN_HORIZ,
+    SCAN_VERT,
+};
+
+typedef struct ShortTermRPS { // short-term reference picture set; stored in HEVCSPS.st_rps[] and SliceHeader.slice_rps
+    int num_negative_pics;
+    int num_delta_pocs;
+    int32_t delta_poc[32]; ///< presumably num_delta_pocs valid entries — confirm
+    uint8_t used[32];
+} ShortTermRPS;
+
+typedef struct LongTermRPS { // long-term reference picture set; stored in SliceHeader.long_term_rps
+    int     poc[32];
+    uint8_t used[32];
+    uint8_t nb_refs; ///< presumably the number of valid entries in poc[] / used[] — confirm
+} LongTermRPS;
+
+typedef struct RefPicList { // list of up to MAX_REFS reference frames; used by HEVCContext.rps[], HEVCFrame.refPicList and RefPicListTab
+    struct HEVCFrame *ref[MAX_REFS];
+    int list[MAX_REFS];
+    int isLongTerm[MAX_REFS];
+    int nb_refs; ///< presumably the number of valid entries in the arrays above — confirm
+} RefPicList;
+
+typedef struct RefPicListTab { // pair of reference lists; HEVCFrame.rpl_tab points to an array of pointers to these
+    RefPicList refPicList[2]; ///< presumably indexed by L0 / L1 — confirm
+} RefPicListTab;
+
+typedef struct HEVCWindow { // four edge offsets; used for pic_conf_win, output_window, def_disp_win and HEVCFrame.window
+    int left_offset;
+    int right_offset;
+    int top_offset;
+    int bottom_offset;
+} HEVCWindow;
+
+typedef struct VUI { // VUI fields; embedded in HEVCSPS as .vui
+    AVRational sar;
+
+    int overscan_info_present_flag;
+    int overscan_appropriate_flag;
+
+    int video_signal_type_present_flag;
+    int video_format;
+    int video_full_range_flag;
+    int colour_description_present_flag;
+    uint8_t colour_primaries;
+    uint8_t transfer_characteristic;
+    uint8_t matrix_coeffs;
+
+    int chroma_loc_info_present_flag;
+    int chroma_sample_loc_type_top_field;
+    int chroma_sample_loc_type_bottom_field;
+    int neutra_chroma_indication_flag; // NOTE(review): name looks like a typo of "neutral_..."; renaming would touch every user of the field
+
+    int field_seq_flag;
+    int frame_field_info_present_flag;
+
+    int default_display_window_flag;
+    HEVCWindow def_disp_win;
+
+    int vui_timing_info_present_flag;
+    uint32_t vui_num_units_in_tick;
+    uint32_t vui_time_scale;
+    int vui_poc_proportional_to_timing_flag;
+    int vui_num_ticks_poc_diff_one_minus1;
+    int vui_hrd_parameters_present_flag;
+
+    int bitstream_restriction_flag;
+    int tiles_fixed_structure_flag;
+    int motion_vectors_over_pic_boundaries_flag;
+    int restricted_ref_pic_lists_flag;
+    int min_spatial_segmentation_idc;
+    int max_bytes_per_pic_denom;
+    int max_bits_per_min_cu_denom;
+    int log2_max_mv_length_horizontal;
+    int log2_max_mv_length_vertical;
+} VUI;
+
+typedef struct PTL { // profile, tier and level fields (general + per sub-layer); embedded in VPS and HEVCSPS as .ptl
+    int general_profile_space;
+    uint8_t general_tier_flag;
+    int general_profile_idc;
+    int general_profile_compatibility_flag[32]; // NOTE(review): int here, while the sub_layer counterpart below is uint8_t and spelled "..._flags"
+    int general_level_idc;
+
+    uint8_t sub_layer_profile_present_flag[MAX_SUB_LAYERS];
+    uint8_t sub_layer_level_present_flag[MAX_SUB_LAYERS];
+
+    int sub_layer_profile_space[MAX_SUB_LAYERS];
+    uint8_t sub_layer_tier_flag[MAX_SUB_LAYERS];
+    int sub_layer_profile_idc[MAX_SUB_LAYERS];
+    uint8_t sub_layer_profile_compatibility_flags[MAX_SUB_LAYERS][32];
+    int sub_layer_level_idc[MAX_SUB_LAYERS];
+} PTL;
+
+typedef struct VPS { // VPS data; HEVCContext keeps them in vps_list[] with .vps pointing to one
+    uint8_t vps_temporal_id_nesting_flag;
+    int vps_max_layers;
+    int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1
+
+    PTL ptl;
+    int vps_sub_layer_ordering_info_present_flag;
+    unsigned int vps_max_dec_pic_buffering[MAX_SUB_LAYERS];
+    unsigned int vps_num_reorder_pics[MAX_SUB_LAYERS];
+    unsigned int vps_max_latency_increase[MAX_SUB_LAYERS];
+    int vps_max_layer_id;
+    int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1
+    uint8_t vps_timing_info_present_flag;
+    uint32_t vps_num_units_in_tick;
+    uint32_t vps_time_scale;
+    uint8_t vps_poc_proportional_to_timing_flag;
+    int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1
+    int vps_num_hrd_parameters;
+} VPS;
+
+typedef struct ScalingList { // scaling list storage; embedded in both HEVCSPS and HEVCPPS as .scaling_list
+    // This is a little wasteful, since sizeID 0 only needs 8 coeffs, and size ID 3 only has 2 arrays, not 6.
+    uint8_t sl[4][6][64]; ///< first index is the size ID (per the note above); presumably [size ID][matrix][coefficient] — confirm
+    uint8_t sl_dc[2][6];
+} ScalingList;
+
+typedef struct HEVCSPS { // SPS data; HEVCContext.sps points to one, sps_list[] holds them as AVBufferRef
+    int vps_id;
+    int chroma_format_idc;
+    uint8_t separate_colour_plane_flag;
+
+    /// output (i.e. cropped) values
+    int output_width, output_height;
+    HEVCWindow output_window;
+
+    HEVCWindow pic_conf_win;
+
+    int bit_depth;
+    int pixel_shift;
+    enum AVPixelFormat pix_fmt;
+
+    unsigned int log2_max_poc_lsb;
+    int pcm_enabled_flag;
+
+    int max_sub_layers;
+    struct {
+        int max_dec_pic_buffering;
+        int num_reorder_pics;
+        int max_latency_increase;
+    } temporal_layer[MAX_SUB_LAYERS];
+
+    VUI vui;
+    PTL ptl;
+
+    uint8_t scaling_list_enable_flag;
+    ScalingList scaling_list;
+
+    unsigned int nb_st_rps; ///< presumably the number of valid entries in st_rps[] — confirm
+    ShortTermRPS st_rps[MAX_SHORT_TERM_RPS_COUNT];
+
+    uint8_t amp_enabled_flag;
+    uint8_t sao_enabled;
+
+    uint8_t long_term_ref_pics_present_flag;
+    uint16_t lt_ref_pic_poc_lsb_sps[32];
+    uint8_t used_by_curr_pic_lt_sps_flag[32];
+    uint8_t num_long_term_ref_pics_sps;
+
+    struct {
+        uint8_t bit_depth;
+        unsigned int log2_min_pcm_cb_size;
+        unsigned int log2_max_pcm_cb_size;
+        uint8_t loop_filter_disable_flag;
+    } pcm;
+    uint8_t sps_temporal_mvp_enabled_flag;
+    uint8_t sps_strong_intra_smoothing_enable_flag;
+
+    unsigned int log2_min_coding_block_size;
+    unsigned int log2_diff_max_min_coding_block_size;
+    unsigned int log2_min_transform_block_size;
+    unsigned int log2_max_trafo_size;
+    unsigned int log2_ctb_size;
+    unsigned int log2_min_pu_size;
+
+    int max_transform_hierarchy_depth_inter;
+    int max_transform_hierarchy_depth_intra;
+
+    /// coded frame dimension in various units
+    int width; // also the row stride used by the SAMPLE() macro
+    int height;
+    int ctb_width;
+    int ctb_height;
+    int ctb_size;
+    int min_cb_width;
+    int min_cb_height;
+    int min_tb_width;
+    int min_tb_height;
+
+    int hshift[3];
+    int vshift[3];
+
+    int qp_bd_offset;
+} HEVCSPS;
+
+typedef struct HEVCPPS { // PPS data; HEVCContext.pps points to one, pps_list[] holds them as AVBufferRef
+    int sps_id; ///< seq_parameter_set_id
+
+    uint8_t sign_data_hiding_flag;
+
+    uint8_t cabac_init_present_flag;
+
+    int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1
+    int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1
+    int pic_init_qp_minus26;
+
+    uint8_t constrained_intra_pred_flag;
+    uint8_t transform_skip_enabled_flag;
+
+    uint8_t cu_qp_delta_enabled_flag;
+    int diff_cu_qp_delta_depth;
+
+    int cb_qp_offset;
+    int cr_qp_offset;
+    uint8_t pic_slice_level_chroma_qp_offsets_present_flag;
+    uint8_t weighted_pred_flag;
+    uint8_t weighted_bipred_flag;
+    uint8_t output_flag_present_flag;
+    uint8_t transquant_bypass_enable_flag;
+
+    uint8_t dependent_slice_segments_enabled_flag;
+    uint8_t tiles_enabled_flag;
+    uint8_t entropy_coding_sync_enabled_flag;
+
+    int num_tile_columns; ///< num_tile_columns_minus1 + 1
+    int num_tile_rows; ///< num_tile_rows_minus1 + 1
+    uint8_t uniform_spacing_flag;
+    uint8_t loop_filter_across_tiles_enabled_flag;
+
+    uint8_t seq_loop_filter_across_slices_enabled_flag;
+
+    uint8_t deblocking_filter_control_present_flag;
+    uint8_t deblocking_filter_override_enabled_flag;
+    uint8_t pps_disable_deblocking_filter_flag;
+    int beta_offset; ///< beta_offset_div2 * 2
+    int tc_offset; ///< tc_offset_div2 * 2
+
+    int pps_scaling_list_data_present_flag;
+    ScalingList scaling_list;
+
+    uint8_t lists_modification_present_flag;
+    int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
+    int num_extra_slice_header_bits;
+    uint8_t slice_header_extension_present_flag;
+
+    uint8_t pps_extension_flag;
+    uint8_t pps_extension_data_flag;
+
+    // Inferred parameters (pointer members; presumably released by ff_hevc_pps_free() — confirm)
+    int *column_width; ///< ColumnWidth
+    int *row_height; ///< RowHeight
+    int *col_bd; ///< ColBd
+    int *row_bd; ///< RowBd
+    int *col_idxX;
+
+    int *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
+    int *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
+    int *tile_id; ///< TileId
+    int *tile_pos_rs; ///< TilePosRS
+    int *min_cb_addr_zs; ///< MinCbAddrZS
+    int *min_tb_addr_zs; ///< MinTbAddrZS
+} HEVCPPS;
+
+typedef struct SliceHeader { // slice header fields; embedded in HEVCContext as .sh
+    int pps_id;
+
+    /// address (in raster order) of the first block in the current slice segment
+    unsigned int   slice_segment_addr;
+    /// address (in raster order) of the first block in the current slice
+    unsigned int   slice_addr;
+
+    enum SliceType slice_type;
+
+    int pic_order_cnt_lsb;
+
+    uint8_t first_slice_in_pic_flag;
+    uint8_t dependent_slice_segment_flag;
+    uint8_t pic_output_flag;
+    uint8_t colour_plane_id;
+
+    /// RPS coded in the slice header itself is stored here
+    ShortTermRPS slice_rps;
+    const ShortTermRPS *short_term_rps; ///< presumably points at slice_rps or at an entry of HEVCSPS.st_rps[] — confirm
+    LongTermRPS long_term_rps;
+    uint8_t rpl_modification_flag[2];
+    unsigned int list_entry_lx[2][32];
+
+    uint8_t no_output_of_prior_pics_flag;
+
+    uint8_t slice_sample_adaptive_offset_flag[3];
+
+    uint8_t slice_temporal_mvp_enabled_flag;
+    unsigned int nb_refs[2];
+
+    uint8_t mvd_l1_zero_flag;
+    uint8_t cabac_init_flag;
+    uint8_t collocated_list;
+    unsigned int collocated_ref_idx;
+    int slice_qp_delta;
+    int slice_cb_qp_offset;
+    int slice_cr_qp_offset;
+
+    uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
+    int beta_offset; ///< beta_offset_div2 * 2
+    int tc_offset; ///< tc_offset_div2 * 2
+
+    int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
+
+    uint8_t slice_loop_filter_across_slices_enabled_flag;
+
+    int num_entry_point_offsets;
+
+    uint8_t luma_log2_weight_denom;
+    int16_t chroma_log2_weight_denom;
+
+    int16_t luma_weight_l0[16]; // weight/offset tables hold 16 entries each (same value as MAX_REFS, written literally)
+    int16_t chroma_weight_l0[16][2];
+    int16_t chroma_weight_l1[16][2];
+    int16_t luma_weight_l1[16];
+
+
+    int16_t luma_offset_l0[16];
+    int16_t chroma_offset_l0[16][2];
+
+    int16_t luma_offset_l1[16];
+    int16_t chroma_offset_l1[16][2];
+
+    // Inferred parameters
+    int8_t slice_qp;
+    int    slice_ctb_addr_rs;
+} SliceHeader;
+
+typedef struct CodingTree { // embedded in HEVCLocalContext as .ct
+    int depth; ///< ctDepth
+} CodingTree;
+
+typedef struct CodingUnit { // coding unit fields; embedded in HEVCLocalContext as .cu
+    uint8_t cu_transquant_bypass_flag;
+
+    enum PredMode pred_mode; ///< PredMode
+    enum PartMode part_mode; ///< PartMode
+    uint8_t rqt_root_cbf;
+
+    uint8_t pcm_flag;
+
+    // Inferred parameters
+    uint8_t intra_split_flag; ///< IntraSplitFlag
+    uint8_t max_trafo_depth; ///< MaxTrafoDepth
+
+    int x;
+    int y;
+
+} CodingUnit;
+
+typedef struct Mv { // motion vector as a pair of 16-bit components; used by MvField.mv[] and PredictionUnit.mvd
+    int16_t x;     ///< horizontal component of motion vector
+    int16_t y;     ///< vertical component of motion vector
+} Mv;
+
+typedef struct MvField { // motion data record; HEVCFrame.tab_mvf points to these
+      Mv  mv[2]; ///< the three [2] arrays are presumably indexed by reference list (L0, L1) — confirm
+      int8_t ref_idx[2];
+      int8_t pred_flag[2];
+      uint8_t is_intra;
+} MvField;
+
+typedef struct NeighbourAvailable { // embedded in HEVCLocalContext as .na; presumably filled by ff_hevc_set_neighbour_available() — confirm
+    int cand_bottom_left;
+    int cand_left;
+    int cand_up;
+    int cand_up_left;
+    int cand_up_right;
+    int cand_up_right_sap;
+} NeighbourAvailable;
+
+typedef struct PredictionUnit { // prediction unit fields; embedded in HEVCLocalContext as .pu
+    uint8_t merge_flag;
+    int mpm_idx;
+    int rem_intra_luma_pred_mode;
+    uint8_t intra_pred_mode[4];
+    uint8_t intra_pred_mode_c;
+    Mv mvd;
+} PredictionUnit;
+
+typedef struct TransformTree { // embedded in HEVCLocalContext as .tt; the two cbf arrays make this struct large
+    uint8_t cbf_cb[MAX_TRANSFORM_DEPTH][MAX_CU_SIZE * MAX_CU_SIZE]; ///< 5 * 128 * 128 bytes with the current limits
+    uint8_t cbf_cr[MAX_TRANSFORM_DEPTH][MAX_CU_SIZE * MAX_CU_SIZE]; ///< same size as cbf_cb
+    uint8_t cbf_luma;
+
+    // Inferred parameters
+    uint8_t inter_split_flag;
+} TransformTree;
+
+typedef struct TransformUnit { // transform unit fields; embedded in HEVCLocalContext as .tu
+    int cu_qp_delta;
+
+    // Inferred parameters
+    uint8_t is_cu_qp_delta_coded;
+    int cur_intra_pred_mode;
+} TransformUnit;
+
+typedef struct ResidualCoding { // embedded in HEVCLocalContext as .rc
+    uint8_t significant_coeff_group_flag[8][8];
+} ResidualCoding;
+
+typedef struct SAOParams { // SAO parameters; HEVCContext.sao points to these. The [3] dimension is presumably the colour component — confirm
+    uint8_t type_idx[3]; ///< sao_type_idx
+
+    int offset_abs[3][4]; ///< sao_offset_abs
+    int offset_sign[3][4]; ///< sao_offset_sign
+
+    int band_position[3]; ///< sao_band_position
+
+    int eo_class[3]; ///< sao_eo_class
+
+    // Inferred parameters
+    int offset_val[3][5]; ///< SaoOffsetVal
+} SAOParams;
+
+typedef struct DBParams { // deblocking parameters; HEVCContext.deblock points to these
+    uint8_t disable;
+    int beta_offset;
+    int tc_offset;
+} DBParams;
+
+#define HEVC_FRAME_FLAG_OUTPUT    (1 << 0) // bit values combined in HEVCFrame.flags
+#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
+#define HEVC_FRAME_FLAG_LONG_REF  (1 << 2)
+
+typedef struct HEVCFrame { // one picture slot; HEVCContext.DPB[] is an array of these and HEVCContext.ref points to one
+    AVFrame *frame;
+    ThreadFrame tf;
+    int poc;
+    MvField *tab_mvf;
+    RefPicList *refPicList;
+    RefPicListTab **rpl_tab;
+    int ctb_count;
+    struct HEVCFrame *collocated_ref;
+
+    /**
+     * A combination of HEVC_FRAME_FLAG_*
+     */
+    uint8_t flags;
+
+    /**
+     * A sequence counter, so that old frames are output first
+     * after a POC reset
+     */
+    uint16_t sequence;
+
+    HEVCWindow window;
+
+    AVBufferRef *tab_mvf_buf; // presumably the buffers backing tab_mvf, rpl_tab and refPicList respectively — confirm
+    AVBufferRef *rpl_tab_buf;
+    AVBufferRef *rpl_buf;
+} HEVCFrame;
+
+typedef struct FilterData { // element type of HEVCLocalContext.save_boundary_strengths
+        int x;
+        int y;
+        int size;
+    int slice_or_tiles_left_boundary;
+    int slice_or_tiles_up_boundary;
+} FilterData;
+
+typedef struct HEVCNAL { // element type of HEVCContext.nals (see nb_nals / nals_allocated there)
+    uint8_t *rbsp_buffer;
+    int rbsp_buffer_size;
+    const uint8_t *data; // presumably size bytes long — confirm
+    int size;
+} HEVCNAL;
+
+typedef struct HEVCLocalContext { // bundles the bit reader, CABAC state and current CT/CU/PU/TT/TU structs; embedded in HEVCContext as .HEVClc
+    uint8_t cabac_state[HEVC_CONTEXTS];
+    int ctx_set;
+    int greater1_ctx;
+    int last_coeff_abs_level_greater1_flag;
+    int c_rice_param;
+    int last_coeff_abs_level_remaining;
+    GetBitContext gb;
+    CABACContext cc;
+    TransformTree tt;
+    TransformUnit tu;
+    ResidualCoding rc;
+    uint8_t first_qp_group;
+    int8_t qp_y;
+    int8_t curr_qp_y;
+    uint8_t slice_or_tiles_left_boundary;
+    uint8_t slice_or_tiles_up_boundary;
+    uint8_t ctb_left_flag;
+    uint8_t ctb_up_flag;
+    uint8_t ctb_up_right_flag;
+    uint8_t ctb_up_left_flag;
+    int     start_of_tiles_x;
+    int     end_of_tiles_x;
+    int     end_of_tiles_y;
+    uint8_t *edge_emu_buffer;
+    int      edge_emu_buffer_size;
+    CodingTree ct;
+    CodingUnit cu;
+    PredictionUnit pu;
+    NeighbourAvailable na;
+    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 7) * MAX_PB_SIZE]); // (64 + 7) * 64 = 4544 entries with the current MAX_PB_SIZE
+    FilterData *save_boundary_strengths;
+    int nb_saved; // presumably the number of entries used in save_boundary_strengths — confirm
+} HEVCLocalContext;
+
+typedef struct HEVCContext { // decoder private context: this is what avctx->priv_data holds (priv_data_size = sizeof(HEVCContext))
+    const AVClass *c;  // needed by private avoptions
+    AVCodecContext      *avctx;
+
+    HEVCLocalContext     HEVClc;
+
+    int                 disable_au; // set through the "disable-au" AVOption
+    int                 width;
+    int                 height;
+
+    uint8_t cabac_state[HEVC_CONTEXTS];
+
+    AVFrame *frame;
+    AVFrame *sao_frame;
+    AVFrame *tmp_frame;
+    AVFrame *output_frame;
+    VPS *vps;
+    const HEVCSPS *sps;
+    HEVCPPS *pps;
+    VPS *vps_list[MAX_VPS_COUNT];
+    AVBufferRef *sps_list[MAX_SPS_COUNT];
+    AVBufferRef *pps_list[MAX_PPS_COUNT];
+
+    /// candidate references for the current frame
+    RefPicList rps[5]; // 5 == NB_RPS_TYPE; presumably indexed by enum RPSType — confirm
+
+    SliceHeader sh;
+    SAOParams *sao;
+    DBParams *deblock;
+    enum NALUnitType nal_unit_type;
+    int temporal_id;  ///< temporal_id_plus1 - 1
+    HEVCFrame *ref;
+    HEVCFrame DPB[32]; // NOTE(review): literal 32, larger than MAX_DPB_SIZE (16) — confirm this is intended
+    int poc;
+    int pocTid0;
+    int slice_idx; ///< number of the slice being currently decoded
+    int eos;       ///< current packet contains an EOS/EOB NAL
+    int max_ra; // set to INT_MAX by hevc_decode_flush()
+    int bs_width;
+    int bs_height;
+
+    int is_decoded;
+
+    HEVCPredContext hpc;
+    HEVCDSPContext hevcdsp;
+    VideoDSPContext vdsp;
+    DSPContext dsp;
+    int8_t *qp_y_tab;
+    uint8_t *split_cu_flag;
+    uint8_t *horizontal_bs;
+    uint8_t *vertical_bs;
+
+    int32_t *tab_slice_address;
+
+    //  CU
+    uint8_t *skip_flag;
+    uint8_t *tab_ct_depth;
+    // PU
+    uint8_t *tab_ipm;
+
+
+    uint8_t *cbf_luma; // cbf_luma of colocated TU
+    uint8_t *is_pcm;
+
+    // CTB-level flags affecting loop filter operation
+    uint8_t *filter_slice_edges;
+
+    /** used on BE to byteswap the lines for checksumming */
+    uint8_t *checksum_buf;
+    int      checksum_buf_size;
+
+    /**
+     * Sequence counters for decoded and output frames, so that old
+     * frames are output first after a POC reset
+     */
+    uint16_t seq_decode;
+    uint16_t seq_output;
+
+    HEVCNAL *nals;
+    int nb_nals;
+    int nals_allocated;
+
+    int nuh_layer_id;
+
+    // for checking the frame checksums
+    struct AVMD5 *md5_ctx;
+    uint8_t       md5[3][16];
+    uint8_t is_md5;
+
+    /** 1 if the independent slice segment header was successfully parsed */
+    uint8_t slice_initialized;
+    int strict_def_disp_win; // set through the "strict-displaywin" AVOption
+
+    int context_initialized;
+    int is_nalff;         ///< this flag is != 0 if bitstream is encapsulated
+                          ///< as a format defined in 14496-15
+    int nal_length_size;  ///< Number of bytes used for nal length (1, 2 or 4)
+
+
+    AVBufferPool *tab_mvf_pool;
+    AVBufferPool *rpl_tab_pool;
+} HEVCContext;
+
+int ff_hevc_decode_short_term_rps(HEVCContext *s, ShortTermRPS *rps,
+                                  const HEVCSPS *sps, int is_slice_header);
+int ff_hevc_decode_nal_vps(HEVCContext *s);
+int ff_hevc_decode_nal_sps(HEVCContext *s);
+int ff_hevc_decode_nal_pps(HEVCContext *s);
+int ff_hevc_decode_nal_sei(HEVCContext *s);
+
+/**
+ * Mark all frames in DPB as unused for reference.
+ */
+void ff_hevc_clear_refs(HEVCContext *s);
+
+/**
+ * Drop all frames currently in DPB.
+ */
+void ff_hevc_flush_dpb(HEVCContext *s);
+
+/**
+ * Compute POC of the current frame and return it.
+ */
+int ff_hevc_compute_poc(HEVCContext *s, int poc_lsb);
+
+RefPicList* ff_hevc_get_ref_list(HEVCContext *s, HEVCFrame *frame, int x0, int y0);
+
+/**
+ * Construct the reference picture sets for the current frame.
+ */
+int ff_hevc_frame_rps(HEVCContext *s);
+
+/**
+ * Construct the reference picture list(s) for the current slice.
+ */
+int ff_hevc_slice_rpl(HEVCContext *s);
+
+void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts);
+void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts);
+int ff_hevc_sao_merge_flag_decode(HEVCContext *s);
+int ff_hevc_sao_type_idx_decode(HEVCContext *s);
+int ff_hevc_sao_band_position_decode(HEVCContext *s);
+int ff_hevc_sao_offset_abs_decode(HEVCContext *s);
+int ff_hevc_sao_offset_sign_decode(HEVCContext *s);
+int ff_hevc_sao_eo_class_decode(HEVCContext *s);
+int ff_hevc_end_of_slice_flag_decode(HEVCContext *s);
+int ff_hevc_cu_transquant_bypass_flag_decode(HEVCContext *s);
+int ff_hevc_skip_flag_decode(HEVCContext *s, int x0, int y0, int x_cb, int y_cb);
+int ff_hevc_pred_mode_decode(HEVCContext *s);
+int ff_hevc_split_coding_unit_flag_decode(HEVCContext *s, int ct_depth, int x0, int y0);
+int ff_hevc_part_mode_decode(HEVCContext *s, int log2_cb_size);
+int ff_hevc_pcm_flag_decode(HEVCContext *s);
+int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCContext *s);
+int ff_hevc_mpm_idx_decode(HEVCContext *s);
+int ff_hevc_rem_intra_luma_pred_mode_decode(HEVCContext *s);
+int ff_hevc_intra_chroma_pred_mode_decode(HEVCContext *s);
+int ff_hevc_merge_idx_decode(HEVCContext *s);
+int ff_hevc_merge_flag_decode(HEVCContext *s);
+int ff_hevc_inter_pred_idc_decode(HEVCContext *s, int nPbW, int nPbH);
+int ff_hevc_ref_idx_lx_decode(HEVCContext *s, int num_ref_idx_lx);
+int ff_hevc_mvp_lx_flag_decode(HEVCContext *s);
+int ff_hevc_no_residual_syntax_flag_decode(HEVCContext *s);
+int ff_hevc_abs_mvd_greater0_flag_decode(HEVCContext *s);
+int ff_hevc_abs_mvd_greater1_flag_decode(HEVCContext *s);
+int ff_hevc_mvd_decode(HEVCContext *s);
+int ff_hevc_mvd_sign_flag_decode(HEVCContext *s);
+int ff_hevc_split_transform_flag_decode(HEVCContext *s, int log2_trafo_size);
+int ff_hevc_cbf_cb_cr_decode(HEVCContext *s, int trafo_depth);
+int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth);
+int ff_hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx);
+int ff_hevc_last_significant_coeff_x_prefix_decode(HEVCContext *s, int c_idx,
+                                                   int log2_size);
+int ff_hevc_last_significant_coeff_y_prefix_decode(HEVCContext *s, int c_idx,
+                                                   int log2_size);
+int ff_hevc_last_significant_coeff_suffix_decode(HEVCContext *s,
+                                                 int last_significant_coeff_prefix);
+int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int x_cg,
+                                                int y_cg, int log2_trafo_size);
+int ff_hevc_significant_coeff_flag_decode(HEVCContext *s, int c_idx, int x_c, int y_c,
+                                          int log2_trafo_size, int scan_idx);
+int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx,
+                                                 int i, int n,
+                                                 int first_greater1_coeff_idx,
+                                                 int first_subset);
+int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx,
+                                                 int i, int n);
+int ff_hevc_coeff_abs_level_remaining(HEVCContext *s, int n, int base_level);
+int ff_hevc_coeff_sign_flag(HEVCContext *s, uint8_t nb);
+
+/**
+ * Get the number of candidate references for the current frame.
+ */
+int ff_hevc_frame_nb_refs(HEVCContext *s);
+
+int ff_hevc_set_new_ref(HEVCContext *s, AVFrame **frame, int poc);
+
+/**
+ * Find next frame in output order and put a reference to it in frame.
+ * @return 1 if a frame was output, 0 otherwise
+ */
+int ff_hevc_output_frame(HEVCContext *s, AVFrame *frame, int flush);
+
+void ff_hevc_unref_frame(HEVCContext *s, HEVCFrame *frame, int flags);
+
+void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0, int nPbW, int nPbH);
+void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW, int nPbH, int log2_cb_size, int part_idx, int merge_idx, MvField *mv);
+void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, int nPbH, int log2_cb_size, int part_idx, int merge_idx, MvField *mv , int mvp_lx_flag, int LX);
+void ff_hevc_set_qPy(HEVCContext *s, int xC, int yC, int xBase, int yBase, int log2_cb_size);
+void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size,
+                                           int slice_or_tiles_up_boundary, int slice_or_tiles_left_boundary);
+int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s);
+int ff_hevc_cu_qp_delta_abs(HEVCContext *s);
+void ff_hevc_hls_filter(HEVCContext *s, int x, int y);
+void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size);
+
+void ff_hevc_pps_free(HEVCPPS **ppps);
+
+extern const uint8_t ff_hevc_qpel_extra_before[4];
+extern const uint8_t ff_hevc_qpel_extra_after[4];
+extern const uint8_t ff_hevc_qpel_extra[4];
+
+extern const uint8_t ff_hevc_diag_scan4x4_x[16];
+extern const uint8_t ff_hevc_diag_scan4x4_y[16];
+extern const uint8_t ff_hevc_diag_scan8x8_x[64];
+extern const uint8_t ff_hevc_diag_scan8x8_y[64];
+
+#endif // AVCODEC_HEVC_H
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
new file mode 100644
index 0000000000..8468d560b9
--- /dev/null
+++ b/libavcodec/hevc_cabac.c
@@ -0,0 +1,928 @@
+/*
+ * HEVC CABAC decoding
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+
+#include "cabac_functions.h"
+#include "hevc.h"
+
+#define CABAC_MAX_BIN 100
+
+/**
+ * Number of context-coded bins per SyntaxElement (0 if it uses none).
+ */
+static const int8_t num_bins_in_se[] = {
+     1,  // sao_merge_flag
+     1,  // sao_type_idx
+     0,  // sao_eo_class
+     0,  // sao_band_position
+     0,  // sao_offset_abs
+     0,  // sao_offset_sign
+     0,  // end_of_slice_flag
+     3,  // split_coding_unit_flag
+     1,  // cu_transquant_bypass_flag
+     3,  // skip_flag
+     3,  // cu_qp_delta
+     1,  // pred_mode
+     4,  // part_mode
+     0,  // pcm_flag
+     1,  // prev_intra_luma_pred_mode
+     0,  // mpm_idx
+     0,  // rem_intra_luma_pred_mode
+     2,  // intra_chroma_pred_mode
+     1,  // merge_flag
+     1,  // merge_idx
+     5,  // inter_pred_idc
+     2,  // ref_idx_l0
+     2,  // ref_idx_l1
+     2,  // abs_mvd_greater0_flag
+     2,  // abs_mvd_greater1_flag
+     0,  // abs_mvd_minus2
+     0,  // mvd_sign_flag
+     1,  // mvp_lx_flag
+     1,  // no_residual_data_flag
+     3,  // split_transform_flag
+     2,  // cbf_luma
+     4,  // cbf_cb, cbf_cr
+     2,  // transform_skip_flag[][]
+    18,  // last_significant_coeff_x_prefix
+    18,  // last_significant_coeff_y_prefix
+     0,  // last_significant_coeff_x_suffix
+     0,  // last_significant_coeff_y_suffix
+     4,  // significant_coeff_group_flag
+    42,  // significant_coeff_flag
+    24,  // coeff_abs_level_greater1_flag
+     6,  // coeff_abs_level_greater2_flag
+     0,  // coeff_abs_level_remaining
+     0,  // coeff_sign_flag
+};
+
+/**
+ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
+ */
+static const int elem_offset[FF_ARRAY_ELEMS(num_bins_in_se)] = { // running sum of num_bins_in_se
+    0,   // sao_merge_flag
+    1,   // sao_type_idx
+    2,   // sao_eo_class
+    2,   // sao_band_position
+    2,   // sao_offset_abs
+    2,   // sao_offset_sign
+    2,   // end_of_slice_flag
+    2,   // split_coding_unit_flag
+    5,   // cu_transquant_bypass_flag
+    6,   // skip_flag
+    9,   // cu_qp_delta
+    12,  // pred_mode
+    13,  // part_mode
+    17,  // pcm_flag
+    17,  // prev_intra_luma_pred_mode
+    18,  // mpm_idx
+    18,  // rem_intra_luma_pred_mode
+    18,  // intra_chroma_pred_mode
+    20,  // merge_flag
+    21,  // merge_idx
+    22,  // inter_pred_idc
+    27,  // ref_idx_l0
+    29,  // ref_idx_l1
+    31,  // abs_mvd_greater0_flag
+    33,  // abs_mvd_greater1_flag
+    35,  // abs_mvd_minus2
+    35,  // mvd_sign_flag
+    35,  // mvp_lx_flag
+    36,  // no_residual_data_flag
+    37,  // split_transform_flag
+    40,  // cbf_luma
+    42,  // cbf_cb, cbf_cr
+    46,  // transform_skip_flag[][]
+    48,  // last_significant_coeff_x_prefix
+    66,  // last_significant_coeff_y_prefix
+    84,  // last_significant_coeff_x_suffix
+    84,  // last_significant_coeff_y_suffix
+    84,  // significant_coeff_group_flag
+    88,  // significant_coeff_flag
+    130, // coeff_abs_level_greater1_flag
+    154, // coeff_abs_level_greater2_flag
+    160, // coeff_abs_level_remaining
+    160, // coeff_sign_flag
+};
+
+#define CNU 154
+/**
+ * Indexed by init_type
+ */
+static const uint8_t init_values[3][HEVC_CONTEXTS] = {
+    {
+        // sao_merge_flag
+        153,
+        // sao_type_idx
+        200,
+        // split_coding_unit_flag
+        139, 141, 157,
+        // cu_transquant_bypass_flag
+        154,
+        // skip_flag
+        CNU, CNU, CNU,
+        // cu_qp_delta
+        154, 154, 154,
+        // pred_mode
+        CNU,
+        // part_mode
+        184, CNU, CNU, CNU,
+        // prev_intra_luma_pred_mode
+        184,
+        // intra_chroma_pred_mode
+        63, 139,
+        // merge_flag
+        CNU,
+        // merge_idx
+        CNU,
+        // inter_pred_idc
+        CNU, CNU, CNU, CNU, CNU,
+        // ref_idx_l0
+        CNU, CNU,
+        // ref_idx_l1
+        CNU, CNU,
+        // abs_mvd_greater0_flag
+        CNU, CNU,
+        // abs_mvd_greater1_flag
+        CNU, CNU,
+        // mvp_lx_flag
+        CNU,
+        // no_residual_data_flag
+        CNU,
+        // split_transform_flag
+        153, 138, 138,
+        // cbf_luma
+        111, 141,
+        // cbf_cb, cbf_cr
+        94, 138, 182, 154,
+        // transform_skip_flag
+        139, 139,
+        // last_significant_coeff_x_prefix
+        110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
+         79, 108, 123,  63,
+        // last_significant_coeff_y_prefix
+        110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
+         79, 108, 123,  63,
+        // significant_coeff_group_flag
+        91, 171, 134, 141,
+        // significant_coeff_flag
+        111, 111, 125, 110, 110,  94, 124, 108, 124, 107, 125, 141, 179, 153,
+        125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
+        139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
+        // coeff_abs_level_greater1_flag
+        140,  92, 137, 138, 140, 152, 138, 139, 153,  74, 149,  92, 139, 107,
+        122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
+        // coeff_abs_level_greater2_flag
+        138, 153, 136, 167, 152, 152,
+    },
+    {
+        // sao_merge_flag
+        153,
+        // sao_type_idx
+        185,
+        // split_coding_unit_flag
+        107, 139, 126,
+        // cu_transquant_bypass_flag
+        154,
+        // skip_flag
+        197, 185, 201,
+        // cu_qp_delta
+        154, 154, 154,
+        // pred_mode
+        149,
+        // part_mode
+        154, 139, 154, 154,
+        // prev_intra_luma_pred_mode
+        154,
+        // intra_chroma_pred_mode
+        152, 139,
+        // merge_flag
+        110,
+        // merge_idx
+        122,
+        // inter_pred_idc
+        95, 79, 63, 31, 31,
+        // ref_idx_l0
+        153, 153,
+        // ref_idx_l1
+        153, 153,
+        // abs_mvd_greater0_flag
+        140, 198,
+        // abs_mvd_greater1_flag
+        140, 198,
+        // mvp_lx_flag
+        168,
+        // no_residual_data_flag
+        79,
+        // split_transform_flag
+        124, 138, 94,
+        // cbf_luma
+        153, 111,
+        // cbf_cb, cbf_cr
+        149, 107, 167, 154,
+        // transform_skip_flag
+        139, 139,
+        // last_significant_coeff_x_prefix
+        125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
+         94, 108, 123, 108,
+        // last_significant_coeff_y_prefix
+        125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
+         94, 108, 123, 108,
+        // significant_coeff_group_flag
+        121, 140, 61, 154,
+        // significant_coeff_flag
+        155, 154, 139, 153, 139, 123, 123,  63, 153, 166, 183, 140, 136, 153,
+        154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
+        153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
+        // coeff_abs_level_greater1_flag
+        154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
+        136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
+        // coeff_abs_level_greater2_flag
+        107, 167, 91, 122, 107, 167,
+    },
+    {
+        // sao_merge_flag
+        153,
+        // sao_type_idx
+        160,
+        // split_coding_unit_flag
+        107, 139, 126,
+        // cu_transquant_bypass_flag
+        154,
+        // skip_flag
+        197, 185, 201,
+        // cu_qp_delta
+        154, 154, 154,
+        // pred_mode
+        134,
+        // part_mode
+        154, 139, 154, 154,
+        // prev_intra_luma_pred_mode
+        183,
+        // intra_chroma_pred_mode
+        152, 139,
+        // merge_flag
+        154,
+        // merge_idx
+        137,
+        // inter_pred_idc
+        95, 79, 63, 31, 31,
+        // ref_idx_l0
+        153, 153,
+        // ref_idx_l1
+        153, 153,
+        // abs_mvd_greater0_flag
+        169, 198,
+        // abs_mvd_greater1_flag
+        169, 198,
+        // mvp_lx_flag
+        168,
+        // no_residual_data_flag
+        79,
+        // split_transform_flag
+        224, 167, 122,
+        // cbf_luma
+        153, 111,
+        // cbf_cb, cbf_cr
+        149, 92, 167, 154,
+        // transform_skip_flag
+        139, 139,
+        // last_significant_coeff_x_prefix
+        125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
+         79, 108, 123,  93,
+        // last_significant_coeff_y_prefix
+        125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
+         79, 108, 123,  93,
+        // significant_coeff_group_flag
+        121, 140, 61, 154,
+        // significant_coeff_flag
+        170, 154, 139, 153, 139, 123, 123,  63, 124, 166, 183, 140, 136, 153,
+        154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
+        153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
+        // coeff_abs_level_greater1_flag
+        154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
+        136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
+        // coeff_abs_level_greater2_flag
+        107, 167, 91, 107, 107, 167,
+    },
+};
+
+/* Copy the local CABAC states to s->cabac_state at the sync point. */
+void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
+{
+    if (!s->pps->entropy_coding_sync_enabled_flag)
+        return;
+    if (ctb_addr_ts % s->sps->ctb_width == 2 ||
+        (s->sps->ctb_width == 2 && ctb_addr_ts % s->sps->ctb_width == 0))
+        memcpy(s->cabac_state, s->HEVClc.cabac_state, HEVC_CONTEXTS);
+}
+
+static void load_states(HEVCContext *s)
+{
+    memcpy(s->HEVClc.cabac_state, s->cabac_state, HEVC_CONTEXTS); // restore what ff_hevc_save_states() stored
+}
+
+static void cabac_reinit(HEVCLocalContext *lc)
+{
+    skip_bytes(&lc->cc, 0); // NOTE(review): skip_bytes() is defined elsewhere; presumably restarts the CABAC reader in place - confirm
+}
+
+/* Skip one bit, byte-align, and start the CABAC decoder on the remaining bytes. */
+static void cabac_init_decoder(HEVCContext *s)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    skip_bits(&lc->gb, 1);
+    align_get_bits(&lc->gb);
+    ff_init_cabac_decoder(&lc->cc, lc->gb.buffer + get_bits_count(&lc->gb) / 8,
+                          (get_bits_left(&lc->gb) + 7) / 8);
+}
+
+/* Derive every CABAC context state from init_values and the clipped slice QP. */
+static void cabac_init_state(HEVCContext *s)
+{
+    const int qp  = av_clip_c(s->sh.slice_qp, 0, 51);
+    int init_type = 2 - s->sh.slice_type;
+    int ctx;
+
+    // for init_type 1 or 2, "^= 3" selects the other of the two tables
+    if (s->sh.cabac_init_flag && s->sh.slice_type != I_SLICE)
+        init_type ^= 3;
+    for (ctx = 0; ctx < HEVC_CONTEXTS; ctx++) {
+        const int v     = init_values[init_type][ctx];
+        const int slope = (v >> 4) * 5 - 45;    // from the high nibble
+        const int offs  = ((v & 15) << 3) - 16; // from the low nibble
+        int state = 2 * (((slope * qp) >> 4) + offs) - 127;
+
+        state ^= state >> 31; // flips all bits of a negative value
+        s->HEVClc.cabac_state[ctx] = state > 124 ? 124 + (state & 1) : state;
+    }
+}
+
+void ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts)
+{
+    if (ctb_addr_ts == s->pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]) {
+        cabac_init_decoder(s); // ctb_addr_ts is the slice segment's start address: start the decoder
+        if ((s->sh.dependent_slice_segment_flag == 0) ||
+            (s->pps->tiles_enabled_flag &&
+             (s->pps->tile_id[ctb_addr_ts] != s->pps->tile_id[ctb_addr_ts - 1])))
+            cabac_init_state(s);
+        // with entropy coding sync, the first CTB of a row may replace the states chosen above
+        if (!s->sh.first_slice_in_pic_flag && s->pps->entropy_coding_sync_enabled_flag) {
+            if ((ctb_addr_ts % s->sps->ctb_width) == 0) {
+                if (s->sps->ctb_width == 1)
+                    cabac_init_state(s);
+                else if (s->sh.dependent_slice_segment_flag == 1)
+                    load_states(s);
+            }
+        }
+    } else {
+        if (s->pps->tiles_enabled_flag &&
+            (s->pps->tile_id[ctb_addr_ts] != s->pps->tile_id[ctb_addr_ts - 1])) { // tile_id changed
+            cabac_reinit(&s->HEVClc);
+            cabac_init_state(s);
+        }
+        if (s->pps->entropy_coding_sync_enabled_flag) {
+            if ((ctb_addr_ts % s->sps->ctb_width) == 0) {
+                get_cabac_terminate(&s->HEVClc.cc);
+                cabac_reinit(&s->HEVClc);
+                // ff_hevc_save_states() never stores anything when ctb_width is 1
+                if (s->sps->ctb_width == 1)
+                    cabac_init_state(s);
+                else
+                    load_states(s);
+            }
+        }
+    }
+}
+
+#define GET_CABAC(ctx) get_cabac(&s->HEVClc.cc, &s->HEVClc.cabac_state[ctx])
+
+int ff_hevc_sao_merge_flag_decode(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[SAO_MERGE_FLAG]); // single context-coded bin
+}
+
+/* Decode sao_type_idx: returns 0, SAO_BAND or SAO_EDGE. */
+int ff_hevc_sao_type_idx_decode(HEVCContext *s)
+{
+    // first bin is context-coded; a zero ends the element
+    if (!GET_CABAC(elem_offset[SAO_TYPE_IDX]))
+        return 0;
+    // second bin is bypass-coded and picks the type
+    return get_cabac_bypass(&s->HEVClc.cc) ? SAO_EDGE : SAO_BAND;
+}
+
+/* Read a 5-bit value, MSB first, from bypass bins. */
+int ff_hevc_sao_band_position_decode(HEVCContext *s)
+{
+    int value = 0, bits_left = 5;
+
+    while (bits_left--)
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+    return value;
+}
+
+/* Truncated-unary bypass code, capped at (1 << (min(bit_depth, 10) - 5)) - 1. */
+int ff_hevc_sao_offset_abs_decode(HEVCContext *s)
+{
+    const int max_val = (1 << (FFMIN(s->sps->bit_depth, 10) - 5)) - 1;
+    int ones = 0;
+    while (ones < max_val && get_cabac_bypass(&s->HEVClc.cc))
+        ones++;
+    return ones;
+}
+
+int ff_hevc_sao_offset_sign_decode(HEVCContext *s)
+{
+    return get_cabac_bypass(&s->HEVClc.cc); // single bypass bin
+}
+
+int ff_hevc_sao_eo_class_decode(HEVCContext *s)
+{
+    int hi = get_cabac_bypass(&s->HEVClc.cc); // 2-bit bypass value, MSB first
+    int lo = get_cabac_bypass(&s->HEVClc.cc);
+    return (hi << 1) | lo;
+}
+
+int ff_hevc_end_of_slice_flag_decode(HEVCContext *s)
+{
+    return get_cabac_terminate(&s->HEVClc.cc); // read through the CABAC terminate path, no context
+}
+
+int ff_hevc_cu_transquant_bypass_flag_decode(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[CU_TRANSQUANT_BYPASS_FLAG]); // single context-coded bin
+}
+
+int ff_hevc_skip_flag_decode(HEVCContext *s, int x0, int y0, int x_cb, int y_cb)
+{
+    int pic_width_in_ctb = s->sps->width >> s->sps->log2_min_coding_block_size; // width in min coding blocks, despite the name
+    int inc = 0;
+    int x0b = x0 & ((1 << s->sps->log2_ctb_size) - 1); // x0 offset inside its CTB
+    int y0b = y0 & ((1 << s->sps->log2_ctb_size) - 1); // y0 offset inside its CTB
+    // inc sums the skip_flag entries of the left and up neighbours that may be read
+    if (s->HEVClc.ctb_left_flag || x0b)
+        inc = SAMPLE_CTB(s->skip_flag, x_cb-1, y_cb);
+    if (s->HEVClc.ctb_up_flag || y0b)
+        inc += SAMPLE_CTB(s->skip_flag, x_cb, y_cb-1);
+    // NOTE(review): pic_width_in_ctb is presumably consumed inside SAMPLE_CTB - confirm in hevc.h
+    return GET_CABAC(elem_offset[SKIP_FLAG] + inc);
+}
+
+/* Decode cu_qp_delta_abs: up to 5 context-coded prefix bins (context 0 for
+ * the first, 1 afterwards), then an order-0 Exp-Golomb bypass suffix. */
+int ff_hevc_cu_qp_delta_abs(HEVCContext *s)
+{
+    const int max_k = 29; // keeps 1 << k and the returned sum inside a 32-bit int
+    int prefix_val = 0, suffix_val = 0, inc = 0;
+
+    while (prefix_val < 5 && GET_CABAC(elem_offset[CU_QP_DELTA] + inc)) {
+        prefix_val++;
+        inc = 1;
+    }
+    if (prefix_val >= 5) {
+        int k = 0;
+        while (k < max_k && get_cabac_bypass(&s->HEVClc.cc)) {
+            suffix_val += 1 << k;
+            k++;
+        }
+        if (k == max_k)
+            av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
+        while (k--)
+            suffix_val += get_cabac_bypass(&s->HEVClc.cc) << k;
+    }
+    return prefix_val + suffix_val;
+}
+
+int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s)
+{
+    return get_cabac_bypass(&s->HEVClc.cc); // single bypass bin
+}
+
+int ff_hevc_pred_mode_decode(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[PRED_MODE_FLAG]); // single context-coded bin
+}
+
+/* Context = number of readable left/up neighbours whose depth exceeds ct_depth. */
+int ff_hevc_split_coding_unit_flag_decode(HEVCContext *s, int ct_depth, int x0, int y0)
+{
+    const int ctb_mask = (1 << s->sps->log2_ctb_size) - 1;
+    const int stride   = s->sps->min_cb_width;
+    const int x_cb     = x0 >> s->sps->log2_min_coding_block_size;
+    const int y_cb     = y0 >> s->sps->log2_min_coding_block_size;
+    const int cur      = y_cb * stride + x_cb; // index of this block in tab_ct_depth
+    int depth_l = 0, depth_u = 0;
+
+    if (s->HEVClc.ctb_left_flag || (x0 & ctb_mask))
+        depth_l = s->tab_ct_depth[cur - 1];
+    if (s->HEVClc.ctb_up_flag || (y0 & ctb_mask))
+        depth_u = s->tab_ct_depth[cur - stride];
+    return GET_CABAC(elem_offset[SPLIT_CODING_UNIT_FLAG] +
+                     (depth_l > ct_depth) + (depth_u > ct_depth));
+}
+
+int ff_hevc_part_mode_decode(HEVCContext *s, int log2_cb_size)
+{
+    if (GET_CABAC(elem_offset[PART_MODE])) // 1
+        return PART_2Nx2N;
+    if (log2_cb_size == s->sps->log2_min_coding_block_size) {
+        if (s->HEVClc.cu.pred_mode == MODE_INTRA) // 0
+            return PART_NxN;
+        if (GET_CABAC(elem_offset[PART_MODE] + 1)) // 01
+            return PART_2NxN;
+        if (log2_cb_size == 3) // 00
+            return PART_Nx2N;
+        if (GET_CABAC(elem_offset[PART_MODE] + 2)) // 001
+            return PART_Nx2N;
+        return PART_NxN; // 000
+    }
+    // CU is not of the minimum size and amp_enabled_flag is off: one more bin decides
+    if (!s->sps->amp_enabled_flag) {
+        if (GET_CABAC(elem_offset[PART_MODE] + 1)) // 01
+            return PART_2NxN;
+        return PART_Nx2N; // 00
+    }
+    // amp_enabled_flag set: a bin on context 3, then possibly one bypass bin
+    if (GET_CABAC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
+        if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 011
+            return PART_2NxN;
+        if (get_cabac_bypass(&s->HEVClc.cc)) // 0101
+            return PART_2NxnD;
+        return PART_2NxnU; // 0100
+    }
+
+    if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 001
+        return PART_Nx2N;
+    if (get_cabac_bypass(&s->HEVClc.cc)) // 0001
+        return PART_nRx2N;
+    return  PART_nLx2N; // 0000
+}
+
+int ff_hevc_pcm_flag_decode(HEVCContext *s)
+{
+    return get_cabac_terminate(&s->HEVClc.cc); // read through the CABAC terminate path, no context
+}
+
+int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[PREV_INTRA_LUMA_PRED_FLAG]); // single context-coded bin
+}
+
+int ff_hevc_mpm_idx_decode(HEVCContext *s)
+{
+    // truncated unary on bypass bins: "0" -> 0, "10" -> 1, "11" -> 2
+    if (!get_cabac_bypass(&s->HEVClc.cc))
+        return 0;
+    return get_cabac_bypass(&s->HEVClc.cc) ? 2 : 1;
+}
+
+/* Read a 5-bit value, MSB first, from bypass bins. */
+int ff_hevc_rem_intra_luma_pred_mode_decode(HEVCContext *s)
+{
+    int value = 0, bits_left = 5;
+
+    while (bits_left--)
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+    return value;
+}
+
+/* Returns 4 when the context-coded bin is 0, otherwise a 2-bit bypass value. */
+int ff_hevc_intra_chroma_pred_mode_decode(HEVCContext *s)
+{
+    int hi, lo;
+    if (!GET_CABAC(elem_offset[INTRA_CHROMA_PRED_MODE]))
+        return 4;
+    hi = get_cabac_bypass(&s->HEVClc.cc); // MSB is read first
+    lo = get_cabac_bypass(&s->HEVClc.cc);
+    return (hi << 1) | lo;
+}
+
+/* Truncated unary: first bin context-coded, the following ones bypass. */
+int ff_hevc_merge_idx_decode(HEVCContext *s)
+{
+    const int last = s->sh.max_num_merge_cand - 1;
+    int idx = GET_CABAC(elem_offset[MERGE_IDX]);
+
+    while (idx && idx < last && get_cabac_bypass(&s->HEVClc.cc))
+        idx++;
+    return idx;
+}
+
+int ff_hevc_merge_flag_decode(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[MERGE_FLAG]); // single context-coded bin
+}
+
+int ff_hevc_inter_pred_idc_decode(HEVCContext *s, int nPbW, int nPbH)
+{
+    const int base = elem_offset[INTER_PRED_IDC];
+
+    // when nPbW + nPbH == 12 the depth-indexed PRED_BI bin is not read
+    if (nPbW + nPbH != 12 && GET_CABAC(base + s->HEVClc.ct.depth))
+        return PRED_BI;
+    return GET_CABAC(base + 4); // final bin always uses context 4
+}
+
+/* Truncated unary up to num_ref_idx_lx - 1: two context bins, then bypass. */
+int ff_hevc_ref_idx_lx_decode(HEVCContext *s, int num_ref_idx_lx)
+{
+    const int last     = num_ref_idx_lx - 1;
+    const int ctx_bins = FFMIN(last, 2);
+    int idx;
+
+    for (idx = 0; idx < ctx_bins; idx++)
+        if (!GET_CABAC(elem_offset[REF_IDX_L0] + idx))
+            return idx;
+    // bypass bins are only reached after both context bins were 1
+    while (idx >= 2 && idx < last && get_cabac_bypass(&s->HEVClc.cc))
+        idx++;
+    return idx;
+}
+
+int ff_hevc_mvp_lx_flag_decode(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[MVP_LX_FLAG]); // single context-coded bin
+}
+
+int ff_hevc_no_residual_syntax_flag_decode(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[NO_RESIDUAL_DATA_FLAG]); // single context-coded bin
+}
+
+int ff_hevc_abs_mvd_greater0_flag_decode(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[ABS_MVD_GREATER0_FLAG]); // first of the two contexts reserved for this element
+}
+
+int ff_hevc_abs_mvd_greater1_flag_decode(HEVCContext *s)
+{
+    return GET_CABAC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); // second of the two contexts reserved for this element
+}
+
+/* Decode an order-1 Exp-Golomb bypass code on top of 2; sign via get_cabac_bypass_sign(). */
+int ff_hevc_mvd_decode(HEVCContext *s)
+{
+    const int max_k = 30; // largest cap for which ret cannot exceed a 32-bit INT_MAX
+    int ret = 2, k = 1;
+    while (k < max_k && get_cabac_bypass(&s->HEVClc.cc)) {
+        ret += 1 << k;
+        k++;
+    }
+    if (k == max_k)
+        av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
+    while (k--)
+        ret += get_cabac_bypass(&s->HEVClc.cc) << k;
+    return get_cabac_bypass_sign(&s->HEVClc.cc, -ret);
+}
+
+int ff_hevc_mvd_sign_flag_decode(HEVCContext *s)
+{
+    return get_cabac_bypass_sign(&s->HEVClc.cc, -1); // NOTE(review): presumably +1 or -1 from one bypass bin - confirm in cabac_functions.h
+}
+
+int ff_hevc_split_transform_flag_decode(HEVCContext *s, int log2_trafo_size)
+{
+    return GET_CABAC(elem_offset[SPLIT_TRANSFORM_FLAG] + 5 - log2_trafo_size); // 3 contexts reserved: index 0..2 for log2_trafo_size 5..3
+}
+
+int ff_hevc_cbf_cb_cr_decode(HEVCContext *s, int trafo_depth)
+{
+    return GET_CABAC(elem_offset[CBF_CB_CR] + trafo_depth); // context index is trafo_depth (4 contexts reserved)
+}
+
+int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
+{
+    return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); // context 1 at depth 0, context 0 otherwise
+}
+
+int ff_hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+{
+    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx); // context 0 for c_idx 0, context 1 for any other c_idx
+}
+
+/* Truncated-unary prefix shared by last_significant_coeff_{x,y}_prefix. */
+static av_always_inline int last_sig_coeff_prefix(HEVCContext *s, int elem,
+                                                  int c_idx, int log2_size)
+{
+    const int max = (log2_size << 1) - 1;
+    int ctx_offset, ctx_shift, i;
+
+    if (c_idx == 0) {
+        ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
+        ctx_shift  = (log2_size + 1) >> 2;
+    } else {
+        ctx_offset = 15;
+        ctx_shift  = log2_size - 2;
+    }
+    for (i = 0; i < max; i++)
+        if (!GET_CABAC(elem_offset[elem] + (i >> ctx_shift) + ctx_offset))
+            break;
+    return i;
+}
+
+int ff_hevc_last_significant_coeff_x_prefix_decode(HEVCContext *s, int c_idx, int log2_size)
+{
+    return last_sig_coeff_prefix(s, LAST_SIGNIFICANT_COEFF_X_PREFIX, c_idx, log2_size);
+}
+
+int ff_hevc_last_significant_coeff_y_prefix_decode(HEVCContext *s, int c_idx, int log2_size)
+{
+    return last_sig_coeff_prefix(s, LAST_SIGNIFICANT_COEFF_Y_PREFIX, c_idx, log2_size);
+}
+
+/* Fixed-length bypass suffix, MSB first; one bin is always consumed. */
+int ff_hevc_last_significant_coeff_suffix_decode(HEVCContext *s,
+                                                 int last_significant_coeff_prefix)
+{
+    int remaining = (last_significant_coeff_prefix >> 1) - 2; // bins after the first
+    int value     = get_cabac_bypass(&s->HEVClc.cc);
+
+    while (remaining-- > 0)
+        value = (value << 1) | get_cabac_bypass(&s->HEVClc.cc);
+    return value;
+}
+
+/* Context depends on the flags already stored for the right and lower groups. */
+int ff_hevc_significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int x_cg,
+                                                int y_cg, int log2_trafo_size)
+{
+    const int last_cg = (1 << (log2_trafo_size - 2)) - 1;
+    int neighbours = 0;
+    int inc = c_idx > 0 ? 2 : 0; // contexts 2..3 serve c_idx > 0
+
+    if (x_cg < last_cg)
+        neighbours += s->HEVClc.rc.significant_coeff_group_flag[x_cg + 1][y_cg];
+    if (y_cg < last_cg)
+        neighbours += s->HEVClc.rc.significant_coeff_group_flag[x_cg][y_cg + 1];
+    inc += FFMIN(neighbours, 1);
+    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
+}
+
+int ff_hevc_significant_coeff_flag_decode(HEVCContext *s, int c_idx, int x_c, int y_c,
+                                          int log2_trafo_size, int scan_idx)
+{
+    static const uint8_t ctx_idx_map[] = { // used for 4x4 blocks, indexed by (y_c << 2) + x_c
+        0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8
+    };
+    int x_cg = x_c >> 2; // coefficient-group coordinates
+    int y_cg = y_c >> 2;
+    int sig_ctx;
+    int inc;
+    // sig_ctx is derived from the position; a c_idx-dependent base (0 or 27) is added at the end
+    if (x_c + y_c == 0) {
+        sig_ctx = 0;
+    } else if (log2_trafo_size == 2) {
+        sig_ctx = ctx_idx_map[(y_c << 2) + x_c];
+    } else {
+        int prev_sig = 0;
+        // bit 0: flag of the group to the right, bit 1: flag of the group below
+        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
+            prev_sig += s->HEVClc.rc.significant_coeff_group_flag[x_cg + 1][y_cg];
+        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
+            prev_sig += (s->HEVClc.rc.significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
+
+        switch (prev_sig) {
+        case 0: {
+                int x_off = x_c & 3;
+                int y_off = y_c & 3;
+                sig_ctx   = ((x_off + y_off) == 0) ? 2 : ((x_off + y_off) <= 2) ? 1 : 0;
+            }
+            break;
+        case 1:
+            sig_ctx = 2 - FFMIN(y_c & 3, 2);
+            break;
+        case 2:
+            sig_ctx = 2 - FFMIN(x_c & 3, 2);
+            break;
+        default:
+            sig_ctx = 2;
+        }
+        // for c_idx 0, groups other than the first use a separate set of three contexts
+        if (c_idx == 0 && (x_cg > 0 || y_cg > 0))
+            sig_ctx += 3;
+
+        if (log2_trafo_size == 3) {
+            sig_ctx += (scan_idx == SCAN_DIAG) ? 9 : 15;
+        } else {
+            sig_ctx += c_idx ? 12 : 21;
+        }
+    }
+
+    if (c_idx == 0) {
+        inc = sig_ctx;
+    } else {
+        inc = sig_ctx + 27;
+    }
+
+    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
+}
+
+int ff_hevc_coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx,
+                                                 int i, int n,
+                                                 int first_elem,
+                                                 int first_subset)
+{
+    // ctx_set and greater1_ctx live in s->HEVClc and carry over between calls; n is not used
+    int inc;
+
+    if (first_elem) {
+        s->HEVClc.ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+        // greater1_ctx still holds the value left by the previous call when it is tested here
+        if (!first_subset && s->HEVClc.greater1_ctx == 0)
+            s->HEVClc.ctx_set++;
+        s->HEVClc.greater1_ctx = 1;
+    }
+
+    inc = (s->HEVClc.ctx_set << 2) + s->HEVClc.greater1_ctx;
+    if (c_idx > 0)
+        inc += 16;
+
+    s->HEVClc.last_coeff_abs_level_greater1_flag =
+        GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + inc);
+    // a decoded 1 resets greater1_ctx to 0; otherwise it counts up from 1 and saturates at 3
+    if (s->HEVClc.last_coeff_abs_level_greater1_flag) {
+        s->HEVClc.greater1_ctx = 0;
+    } else if (s->HEVClc.greater1_ctx > 0 && s->HEVClc.greater1_ctx < 3) {
+        s->HEVClc.greater1_ctx++;
+    }
+
+    return s->HEVClc.last_coeff_abs_level_greater1_flag;
+}
+
+/* Decode coeff_abs_level_greater2_flag.
+ * Context = ctx_set as left by the greater1 decoder, +4 when c_idx > 0;
+ * i and n are not used. */
+int ff_hevc_coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx,
+                                                 int i, int n)
+{
+    const int c_bias = c_idx > 0 ? 4 : 0;
+    const int inc    = s->HEVClc.ctx_set + c_bias;
+
+    return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
+}
+
+/* Decode coeff_abs_level_remaining from bypass bins (unary prefix plus a
+ * suffix sized by c_rice_param) and update c_rice_param for the next call. */
+int ff_hevc_coeff_abs_level_remaining(HEVCContext *s, int first_elem, int base_level)
+{
+    /* Assuming c_rice_param <= 4 (enforced by the update below), a prefix of
+     * at most 28 keeps every shift and the decoded level inside a 32-bit int. */
+    const int max_prefix = 28;
+    HEVCLocalContext *lc = &s->HEVClc;
+    int prefix = 0, suffix = 0, i;
+
+    if (first_elem) {
+        lc->c_rice_param = 0;
+        lc->last_coeff_abs_level_remaining = 0;
+    }
+    while (prefix < max_prefix && get_cabac_bypass(&s->HEVClc.cc))
+        prefix++;
+    if (prefix == max_prefix)
+        av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
+    if (prefix < 3) {
+        for (i = 0; i < lc->c_rice_param; i++)
+            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        lc->last_coeff_abs_level_remaining = (prefix << lc->c_rice_param) + suffix;
+    } else {
+        for (i = 0; i < prefix - 3 + lc->c_rice_param; i++)
+            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc.cc);
+        lc->last_coeff_abs_level_remaining = (((1 << (prefix - 3)) + 3 - 1)
+                                              << lc->c_rice_param) + suffix;
+    }
+    lc->c_rice_param = FFMIN(lc->c_rice_param +
+                             ((base_level + lc->last_coeff_abs_level_remaining) >
+                              (3 * (1 << lc->c_rice_param))), 4);
+    return lc->last_coeff_abs_level_remaining;
+}
+
+/* Read nb bypass bins and pack them MSB first. */
+int ff_hevc_coeff_sign_flag(HEVCContext *s, uint8_t nb)
+{
+    int flags = 0;
+
+    while (nb--)
+        flags = (flags << 1) | get_cabac_bypass(&s->HEVClc.cc);
+    return flags;
+}
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
new file mode 100644
index 0000000000..85189a87f6
--- /dev/null
+++ b/libavcodec/hevc_filter.c
@@ -0,0 +1,698 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 Seppo Tomperi
+ * Copyright (C) 2013 Wassim Hamidouche
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+
+#include "cabac_functions.h"
+#include "golomb.h"
+#include "hevc.h"
+#include "bit_depth_template.c"
+
+#define LUMA 0
+#define CB 1
+#define CR 2
+
+// Deblocking tc lookup table. In this file it is indexed with a QP-derived
+// value that has been clipped to 0..53 (see chroma_tc() and TC_CALC).
+static const uint8_t tctable[54] = {
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP  0...18
+     1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37
+     5, 5, 6, 6, 7, 8, 9,10,11,13,14,16,18,20,22,24           // QP 38...53
+};
+
+// Deblocking beta lookup table. In this file it is indexed with a QP-derived
+// value that has been clipped to 0..MAX_QP.
+static const uint8_t betatable[52] = {
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18
+     9,10,11,12,13,14,15,16,17,18,20,22,24,26,28,30,32,34,36, // QP 19...37
+    38,40,42,44,46,48,50,52,54,56,58,60,62,64                 // QP 38...51
+};
+
+/**
+ * Look up the deblocking tc value for a chroma edge.
+ *
+ * The luma QP is shifted by the PPS Cb offset (c_idx == 1) or the PPS Cr
+ * offset (any other c_idx), clipped to 0..57 and mapped to a chroma QP:
+ * unchanged below 30, through the qp_c table for 30..43, and minus 6 above
+ * 43. The table index is that QP plus DEFAULT_INTRA_TC_OFFSET plus
+ * tc_offset, clipped to 0..53.
+ */
+static int chroma_tc(HEVCContext *s, int qp_y, int c_idx, int tc_offset)
+{
+    static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+    // slice qp offset is not used for deblocking
+    const int pps_offset = (c_idx == 1) ? s->pps->cb_qp_offset
+                                        : s->pps->cr_qp_offset;
+    const int qp_i       = av_clip_c(qp_y + pps_offset, 0, 57);
+    int qp_chroma;
+
+    if (qp_i > 43)
+        qp_chroma = qp_i - 6;
+    else if (qp_i >= 30)
+        qp_chroma = qp_c[qp_i - 30];
+    else
+        qp_chroma = qp_i;
+
+    return tctable[av_clip_c(qp_chroma + DEFAULT_INTRA_TC_OFFSET + tc_offset, 0, 53)];
+}
+
+/**
+ * Compute the predicted luma QP for the quantization group containing
+ * (xBase, yBase).
+ *
+ * qPy_pred starts as the slice QP while lc->first_qp_group is set, and as
+ * the previously stored lc->qp_y otherwise; in the latter case, when the
+ * coding block is smaller than the quantization group, it is replaced by a
+ * qp_y_tab entry chosen through the offsetX/offsetY tables. The return
+ * value is the rounded average of the left (A) and above (B) neighbour QPs,
+ * each of which falls back to qPy_pred when that neighbour is not used.
+ */
+static int get_qPy_pred(HEVCContext *s, int xC, int yC, int xBase, int yBase, int log2_cb_size)
+{
+    HEVCLocalContext *lc     = &s->HEVClc;
+    int ctb_size_mask        = (1 << s->sps->log2_ctb_size) - 1;
+    int MinCuQpDeltaSizeMask = (1 << (s->sps->log2_ctb_size - s->pps->diff_cu_qp_delta_depth)) - 1;
+    // top-left corner of the quantization group, in luma samples
+    int xQgBase              = xBase - ( xBase & MinCuQpDeltaSizeMask );
+    int yQgBase              = yBase - ( yBase & MinCuQpDeltaSizeMask );
+    // picture size and group position in units of minimum coding blocks
+    int pic_width            = s->sps->width  >> s->sps->log2_min_coding_block_size;
+    int pic_height           = s->sps->height >> s->sps->log2_min_coding_block_size;
+    int x_cb                 = xQgBase >> s->sps->log2_min_coding_block_size;
+    int y_cb                 = yQgBase >> s->sps->log2_min_coding_block_size;
+    // a neighbour is only used when neither the block nor its quantization
+    // group starts on the corresponding CTB edge
+    int availableA           = (xBase & ctb_size_mask) && (xQgBase & ctb_size_mask);
+    int availableB           = (yBase & ctb_size_mask) && (yQgBase & ctb_size_mask);
+    int qPy_pred;
+    int qPy_a;
+    int qPy_b;
+
+    // qPy_pred
+    if (lc->first_qp_group) {
+        // stay in "first group" mode until a cu_qp_delta has been coded
+        lc->first_qp_group = !lc->tu.is_cu_qp_delta_coded;
+        qPy_pred = s->sh.slice_qp;
+    } else {
+        qPy_pred = lc->qp_y;
+        if (log2_cb_size < s->sps->log2_ctb_size - s->pps->diff_cu_qp_delta_depth) {
+            // Offsets (in minimum coding blocks, relative to the CTB origin)
+            // of the table entry to read, indexed by the quantization
+            // group's position inside the CTB.
+            // NOTE(review): presumably these encode the previous group in
+            // decoding order - confirm against the specification.
+            static const int offsetX[8][8] = {
+                    {-1, 1, 3, 1, 7, 1, 3, 1},
+                    { 0, 0, 0, 0, 0, 0, 0, 0},
+                    { 1, 3, 1, 3, 1, 3, 1, 3},
+                    { 2, 2, 2, 2, 2, 2, 2, 2},
+                    { 3, 5, 7, 5, 3, 5, 7, 5},
+                    { 4, 4, 4, 4, 4, 4, 4, 4},
+                    { 5, 7, 5, 7, 5, 7, 5, 7},
+                    { 6, 6, 6, 6, 6, 6, 6, 6}
+            };
+            static const int offsetY[8][8] = {
+                    { 7, 0, 1, 2, 3, 4, 5, 6},
+                    { 0, 1, 2, 3, 4, 5, 6, 7},
+                    { 1, 0, 3, 2, 5, 4, 7, 6},
+                    { 0, 1, 2, 3, 4, 5, 6, 7},
+                    { 3, 0, 1, 2, 7, 4, 5, 6},
+                    { 0, 1, 2, 3, 4, 5, 6, 7},
+                    { 1, 0, 3, 2, 5, 4, 7, 6},
+                    { 0, 1, 2, 3, 4, 5, 6, 7}
+            };
+            int xC0b = (xC - (xC & ctb_size_mask)) >> s->sps->log2_min_coding_block_size;
+            int yC0b = (yC - (yC & ctb_size_mask)) >> s->sps->log2_min_coding_block_size;
+            int idxX = (xQgBase & ctb_size_mask)   >> s->sps->log2_min_coding_block_size;
+            int idxY = (yQgBase & ctb_size_mask)   >> s->sps->log2_min_coding_block_size;
+            int idx_mask = ctb_size_mask >> s->sps->log2_min_coding_block_size;
+            int x, y;
+
+            // clamp the lookup position to the picture
+            x = FFMIN(xC0b + offsetX[idxX][idxY],              pic_width  - 1);
+            y = FFMIN(yC0b + (offsetY[idxX][idxY] & idx_mask), pic_height - 1);
+
+            // an offset of -1 at the first CTB column of the tile refers to
+            // the last column of the tile, one row up
+            if (xC0b == (lc->start_of_tiles_x >> s->sps->log2_min_coding_block_size) &&
+                offsetX[idxX][idxY] == -1) {
+                x = (lc->end_of_tiles_x >> s->sps->log2_min_coding_block_size) - 1;
+                y = yC0b - 1;
+            }
+            qPy_pred = s->qp_y_tab[y * pic_width + x];
+        }
+    }
+
+    // qPy_a
+    if (availableA == 0)
+        qPy_a = qPy_pred;
+    else
+        qPy_a = s->qp_y_tab[(x_cb - 1) + y_cb * pic_width];
+
+    // qPy_b
+    if (availableB == 0)
+        qPy_b = qPy_pred;
+    else
+        qPy_b = s->qp_y_tab[x_cb + (y_cb - 1) * pic_width];
+
+    return (qPy_a + qPy_b + 1) >> 1;
+}
+
+/**
+ * Store the luma QP of the current block in the local context.
+ *
+ * The predicted QP is used as is when cu_qp_delta is zero; otherwise the
+ * delta is added and the sum is wrapped into the range
+ * [-qp_bd_offset, 51] with a modulo of (52 + qp_bd_offset).
+ */
+void ff_hevc_set_qPy(HEVCContext *s, int xC, int yC, int xBase, int yBase, int log2_cb_size)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    const int predicted  = get_qPy_pred(s, xC, yC, xBase, yBase, log2_cb_size);
+    const int delta      = lc->tu.cu_qp_delta;
+    const int bd_off     = s->sps->qp_bd_offset;
+
+    lc->qp_y = delta ? ((predicted + delta + 52 + 2 * bd_off) % (52 + bd_off)) - bd_off
+                     : predicted;
+}
+
+/**
+ * Return the stored luma QP of the minimum coding block that contains the
+ * luma sample (xC, yC).
+ */
+static int get_qPy(HEVCContext *s, int xC, int yC)
+{
+    const int shift      = s->sps->log2_min_coding_block_size;
+    const int row_stride = s->sps->width >> shift;
+    const int col        = xC >> shift;
+    const int row        = yC >> shift;
+
+    return s->qp_y_tab[col + row * row_stride];
+}
+
+/**
+ * Copy a rectangle of height rows, width bytes each, between two buffers
+ * that share the same row stride (in bytes).
+ */
+static void copy_CTB(uint8_t *dst, uint8_t *src, int width, int height, int stride)
+{
+    uint8_t *out = dst;
+    uint8_t *in  = src;
+    int rows_left;
+
+    for (rows_left = height; rows_left > 0; rows_left--) {
+        memcpy(out, in, width);
+        out += stride;
+        in  += stride;
+    }
+}
+
+// Element of a per-CTB table at CTB column x, row y (row stride is
+// s->sps->ctb_width). Requires a variable named s in scope at the use site.
+#define CTB(tab, x, y) ((tab)[(y) * s->sps->ctb_width + (x)])
+
+/**
+ * Apply sample adaptive offset around the CTB whose top-left luma sample is
+ * (x, y).
+ *
+ * For every plane the deblocked samples are first copied from s->frame to
+ * s->sao_frame; the SAO DSP functions are then run, reading from s->frame
+ * and writing to s->sao_frame. Besides the CTB itself, the left, above and
+ * above-left neighbours are visited when they exist, each with its own SAO
+ * parameters (classes 2, 1 and 3 respectively; class 0 is the current CTB).
+ *
+ * @param c_idx_min first plane index on which SAO is applied
+ * @param c_idx_max one past the last plane index on which SAO is applied
+ */
+static void sao_filter_CTB(HEVCContext *s, int x, int y, int c_idx_min, int c_idx_max)
+{
+    //  TODO: This should be easily parallelizable
+    //  TODO: skip CBs when (cu_transquant_bypass_flag || (pcm_loop_filter_disable_flag && pcm_flag))
+    int c_idx = 0;
+    int class = 1, class_index;
+    int  edges[4]; // 0 left 1 top 2 right 3 bottom
+    SAOParams *sao[4];
+    int classes[4];
+    int x_shift = 0, y_shift = 0;
+    int x_ctb = x>>s->sps->log2_ctb_size;
+    int y_ctb = y>>s->sps->log2_ctb_size;
+    int ctb_addr_rs = y_ctb * s->sps->ctb_width + x_ctb;
+    int ctb_addr_ts = s->pps->ctb_addr_rs_to_ts[ctb_addr_rs];
+
+    // flags indicating unfilterable edges
+    uint8_t vert_edge[] = {0,0,0,0};
+    uint8_t horiz_edge[] = {0,0,0,0};
+    uint8_t diag_edge[] = {0,0,0,0};
+    uint8_t lfase[3]; // current, above, left
+    uint8_t no_tile_filter = s->pps->tiles_enabled_flag && !s->pps->loop_filter_across_tiles_enabled_flag;
+    uint8_t left_tile_edge = 0;
+    uint8_t up_tile_edge = 0;
+
+    // slot 0 always describes the current CTB
+    sao[0]     = &CTB(s->sao, x_ctb, y_ctb);
+    edges[0]   = x_ctb == 0;
+    edges[1]   = y_ctb == 0;
+    edges[2]   = x_ctb == (s->sps->ctb_width - 1);
+    edges[3]   = y_ctb == (s->sps->ctb_height - 1);
+    lfase[0]   = CTB(s->filter_slice_edges, x_ctb, y_ctb);
+    classes[0] = 0;
+
+    // left neighbour exists: register it and extend the processed area
+    // 8 luma samples to the left
+    if (!edges[0]) {
+        left_tile_edge = no_tile_filter && s->pps->tile_id[ctb_addr_ts] != s->pps->tile_id[s->pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+        sao[class] = &CTB(s->sao, x_ctb - 1, y_ctb);
+        vert_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
+        vert_edge[2] = vert_edge[0];
+        lfase[2]     = CTB(s->filter_slice_edges, x_ctb - 1, y_ctb);
+        classes[class] = 2;
+        class++;
+        x_shift = 8;
+    }
+
+    // above neighbour exists: register it and extend the processed area
+    // 4 luma rows upwards
+    if (!edges[1]) {
+        up_tile_edge = no_tile_filter && s->pps->tile_id[ctb_addr_ts] != s->pps->tile_id[s->pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->sps->ctb_width]];
+        sao[class] = &CTB(s->sao, x_ctb, y_ctb - 1);
+        horiz_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
+        horiz_edge[1] = horiz_edge[0];
+        lfase[1] = CTB(s->filter_slice_edges, x_ctb, y_ctb - 1);
+        classes[class] = 1;
+        class++;
+        y_shift = 4;
+
+        // both neighbours exist, so the above-left CTB exists as well
+        if (!edges[0]) {
+            classes[class] = 3;
+            sao[class] = &CTB(s->sao, x_ctb - 1, y_ctb - 1);
+            class++;
+
+            // The tile check here uses the flags computed for the current CTB
+            // row/column rather than for the above/left CTBs; this works
+            // because a tile boundary always extends through the whole picture.
+            vert_edge[1] = (!lfase[1] && CTB(s->tab_slice_address, x_ctb, y_ctb - 1) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge;
+            vert_edge[3] = vert_edge[1];
+            horiz_edge[2] = (!lfase[2] && CTB(s->tab_slice_address, x_ctb - 1, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || up_tile_edge;
+            horiz_edge[3] = horiz_edge[2];
+            diag_edge[0] = (!lfase[0] && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
+            diag_edge[3] = diag_edge[0];
+
+            // Does the left CTB come after the above CTB (by slice address)?
+            if(CTB(s->tab_slice_address, x_ctb - 1, y_ctb) > CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) {
+                diag_edge[2] = !lfase[2] || left_tile_edge || up_tile_edge;
+                diag_edge[1] = diag_edge[2];
+            } else if(CTB(s->tab_slice_address, x_ctb - 1, y_ctb) < CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) {
+                diag_edge[1] = !lfase[1] || left_tile_edge || up_tile_edge;
+                diag_edge[2] = diag_edge[1];
+            } else {
+                // Same slice, only consider tiles
+                diag_edge[2] = left_tile_edge || up_tile_edge;
+                diag_edge[1] = diag_edge[2];
+            }
+        }
+    }
+
+    for (c_idx = 0; c_idx < 3; c_idx++) {
+        // NOTE(review): plane coordinates are halved for c_idx != 0 here,
+        // while the sizes below use hshift/vshift - confirm this is only
+        // ever used with 4:2:0 content.
+        int chroma = c_idx ? 1 : 0;
+        int x0 = x >> chroma;
+        int y0 = y >> chroma;
+        int stride = s->frame->linesize[c_idx];
+        int ctb_size = (1 << (s->sps->log2_ctb_size)) >> s->sps->hshift[c_idx];
+        // clip the CTB to the picture
+        int width = FFMIN(ctb_size,
+                          (s->sps->width >> s->sps->hshift[c_idx]) - x0);
+        int height = FFMIN(ctb_size,
+                           (s->sps->height >> s->sps->vshift[c_idx]) - y0);
+
+        uint8_t *src = &s->frame->data[c_idx][y0 * stride + (x0 << s->sps->pixel_shift)];
+        uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride + (x0 << s->sps->pixel_shift)];
+        int offset = (y_shift >> chroma) * stride + ((x_shift >> chroma) << s->sps->pixel_shift);
+
+        // Copy the area shifted up/left by (y_shift, x_shift); at the right
+        // and bottom picture edges the copy is enlarged by the shift so the
+        // whole CTB is covered.
+        copy_CTB(dst - offset, src - offset,
+                 (edges[2] ? width  + (x_shift >> chroma) : width)  << s->sps->pixel_shift,
+                 (edges[3] ? height + (y_shift >> chroma) : height), stride);
+
+        // SAO proper is skipped for planes outside [c_idx_min, c_idx_max)
+        for (class_index = 0; class_index < class && c_idx >= c_idx_min &&
+                              c_idx < c_idx_max; class_index++) {
+            switch (sao[class_index]->type_idx[c_idx]) {
+            case SAO_BAND:
+                s->hevcdsp.sao_band_filter[classes[class_index]](dst, src, stride, sao[class_index], edges, width, height, c_idx);
+                    break;
+            case SAO_EDGE:
+                s->hevcdsp.sao_edge_filter[classes[class_index]](dst, src, stride, sao[class_index],  edges, width, height, c_idx, vert_edge[classes[class_index]], horiz_edge[classes[class_index]], diag_edge[classes[class_index]]);
+                break;
+            }
+        }
+    }
+}
+
+/**
+ * Return the is_pcm table entry of the minimum prediction unit containing
+ * luma sample (x, y), or 2 when the position lies outside the picture.
+ */
+static int get_pcm(HEVCContext *s, int x, int y)
+{
+    const int shift  = s->sps->log2_min_pu_size;
+    const int cols   = s->sps->width  >> shift;
+    const int rows   = s->sps->height >> shift;
+    const int col    = x >> shift;
+    const int row    = y >> shift;
+    const int inside = x >= 0 && col < cols && y >= 0 && row < rows;
+
+    return inside ? s->is_pcm[row * cols + col] : 2;
+}
+
+// Luma tc lookup: index is qp, plus DEFAULT_INTRA_TC_OFFSET for each step of
+// bs above 1, plus tc_offset rounded down to an even value, clipped to
+// [0, MAX_QP + DEFAULT_INTRA_TC_OFFSET]. Uses the variable tc_offset from the
+// scope in which the macro is expanded.
+#define TC_CALC(qp, bs) tctable[av_clip((qp) + DEFAULT_INTRA_TC_OFFSET * ((bs) - 1) + ((tc_offset >> 1) << 1), 0, MAX_QP + DEFAULT_INTRA_TC_OFFSET)]
+
+/**
+ * Run the deblocking filter for the CTB whose top-left luma sample is
+ * (x0, y0), using the boundary strengths stored in s->vertical_bs and
+ * s->horizontal_bs.
+ *
+ * Four passes are made, in this order: vertical luma edges, vertical chroma
+ * edges, horizontal luma edges, horizontal chroma edges. Luma edges sit on
+ * an 8-sample grid; chroma edges are visited every 16 luma samples and only
+ * filtered where the boundary strength is 2. The horizontal passes are
+ * shifted 8 luma samples to the left, so they also touch the last columns
+ * of the left neighbour CTB and use that CTB's tc/beta offsets there.
+ * Nothing is done when deblocking is disabled for this CTB.
+ */
+static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+{
+    uint8_t *src;
+    int x, y;
+    int chroma;
+    int c_tc[2];
+    int beta[2];
+    int tc[2];
+    uint8_t no_p[2] = {0};
+    uint8_t no_q[2] = {0};
+
+    int log2_ctb_size =  s->sps->log2_ctb_size;
+    int x_end, y_end;
+    int ctb_size    = 1<<log2_ctb_size;
+    int ctb         = (x0 >> log2_ctb_size) + (y0 >> log2_ctb_size) * s->sps->ctb_width;
+    int cur_tc_offset   = s->deblock[ctb].tc_offset;
+    int cur_beta_offset = s->deblock[ctb].beta_offset;
+    // Initialized because the horizontal chroma pass selects the "left"
+    // offset for x < x0 even in the first CTB column, where there is no
+    // left neighbour to take it from.
+    int left_tc_offset   = 0;
+    int left_beta_offset = 0;
+    int tc_offset, beta_offset;
+    // when set, the *_c filter variants are called with per-side bypass flags
+    int pcmf        = (s->sps->pcm_enabled_flag && s->sps->pcm.loop_filter_disable_flag) ||
+                      s->pps->transquant_bypass_enable_flag;
+
+    if (s->deblock[ctb].disable)
+        return;
+
+    if (x0) {
+        left_tc_offset   = s->deblock[ctb-1].tc_offset;
+        left_beta_offset = s->deblock[ctb-1].beta_offset;
+    }
+
+    // clip the CTB to the picture
+    x_end = x0+ctb_size;
+    if (x_end > s->sps->width)
+        x_end = s->sps->width;
+    y_end = y0+ctb_size;
+    if (y_end > s->sps->height)
+        y_end = s->sps->height;
+
+    tc_offset = cur_tc_offset;
+    beta_offset = cur_beta_offset;
+
+    // vertical filtering luma
+    for (y = y0; y < y_end; y += 8) {
+        // the picture's left border (x == 0) is never filtered
+        for (x = x0 ? x0 : 8; x < x_end; x += 8) {
+            const int bs0 = s->vertical_bs[(x >> 3) + (y       >> 2) * s->bs_width];
+            const int bs1 = s->vertical_bs[(x >> 3) + ((y + 4) >> 2) * s->bs_width];
+            if (bs0 || bs1) {
+                // average QP of the two blocks on either side of the edge
+                const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1;
+                const int qp1 = (get_qPy(s, x - 1, y + 4) + get_qPy(s, x, y + 4) + 1) >> 1;
+
+                beta[0] = betatable[av_clip(qp0 + ((beta_offset >> 1) << 1), 0, MAX_QP)];
+                beta[1] = betatable[av_clip(qp1 + ((beta_offset >> 1) << 1), 0, MAX_QP)];
+                tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0;
+                tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0;
+                src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)];
+                if (pcmf) {
+                    no_p[0] = get_pcm(s, x - 1, y);
+                    no_p[1] = get_pcm(s, x - 1, y + 4);
+                    no_q[0] = get_pcm(s, x, y);
+                    no_q[1] = get_pcm(s, x, y + 4);
+                    s->hevcdsp.hevc_v_loop_filter_luma_c(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q);
+                } else
+                    s->hevcdsp.hevc_v_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q);
+            }
+        }
+    }
+
+    // vertical filtering chroma
+    for (chroma = 1; chroma <= 2; chroma++) {
+        for (y = y0; y < y_end; y += 16) {
+            for (x = x0 ? x0:16; x < x_end; x += 16) {
+                const int bs0 = s->vertical_bs[(x >> 3) + (y >> 2) * s->bs_width];
+                const int bs1 = s->vertical_bs[(x >> 3) + ((y + 8) >> 2) * s->bs_width];
+                if ((bs0 == 2) || (bs1 == 2)) {
+                    const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1;
+                    const int qp1 = (get_qPy(s, x - 1, y + 8) + get_qPy(s, x, y + 8) + 1) >> 1;
+
+                    c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
+                    c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
+                    // chroma planes are addressed at half the luma coordinates
+                    src = &s->frame->data[chroma][(y / 2) * s->frame->linesize[chroma] + ((x / 2) << s->sps->pixel_shift)];
+                    if (pcmf) {
+                        no_p[0] = get_pcm(s, x - 1, y);
+                        no_p[1] = get_pcm(s, x - 1, y + 8);
+                        no_q[0] = get_pcm(s, x, y);
+                        no_q[1] = get_pcm(s, x, y + 8);
+                        s->hevcdsp.hevc_v_loop_filter_chroma_c(src, s->frame->linesize[chroma], c_tc, no_p, no_q);
+                    } else
+                        s->hevcdsp.hevc_v_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q);
+                }
+            }
+        }
+    }
+
+    // horizontal filtering luma
+    // Stop 8 samples short of the CTB's right edge unless it is also the
+    // picture's right edge; those columns are covered from the next CTB.
+    if (x_end != s->sps->width)
+        x_end -= 8;
+    for (y = y0 ? y0 : 8; y < y_end; y += 8) {
+        for (x = x0 ? x0 - 8 : 0; x < x_end; x += 8) {
+            const int bs0 = s->horizontal_bs[(x +     y * s->bs_width) >> 2];
+            const int bs1 = s->horizontal_bs[(x + 4 + y * s->bs_width) >> 2];
+            if (bs0 || bs1) {
+                const int qp0 = (get_qPy(s, x, y - 1)     + get_qPy(s, x, y)     + 1) >> 1;
+                const int qp1 = (get_qPy(s, x + 4, y - 1) + get_qPy(s, x + 4, y) + 1) >> 1;
+
+                // columns left of x0 belong to the left neighbour CTB
+                tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset;
+                beta_offset = x >= x0 ? cur_beta_offset : left_beta_offset;
+
+                beta[0]  = betatable[av_clip(qp0 + ((beta_offset >> 1) << 1), 0, MAX_QP)];
+                beta[1]  = betatable[av_clip(qp1 + ((beta_offset >> 1) << 1), 0, MAX_QP)];
+                tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0;
+                tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0;
+                src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)];
+                if (pcmf) {
+                    no_p[0] = get_pcm(s, x, y - 1);
+                    no_p[1] = get_pcm(s, x + 4, y - 1);
+                    no_q[0] = get_pcm(s, x, y);
+                    no_q[1] = get_pcm(s, x + 4, y);
+                    s->hevcdsp.hevc_h_loop_filter_luma_c(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q);
+                } else
+                    s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q);
+            }
+        }
+    }
+
+    // horizontal filtering chroma
+    for (chroma = 1; chroma <= 2; chroma++) {
+        for (y = y0 ? y0 : 16; y < y_end; y += 16) {
+            for (x = x0 - 8; x < x_end; x += 16) {
+                int bs0, bs1;
+                // to make sure no memory access over boundary when x = -8
+                // TODO: simplify with row based deblocking
+                if (x < 0) {
+                    bs0 = 0;
+                    bs1 = s->horizontal_bs[(x + 8 + y * s->bs_width) >> 2];
+                } else if (x >= x_end - 8) {
+                    bs0 = s->horizontal_bs[(x + y * s->bs_width) >> 2];
+                    bs1 = 0;
+                } else {
+                    bs0 = s->horizontal_bs[(x + y * s->bs_width) >> 2];
+                    bs1 = s->horizontal_bs[(x + 8 + y * s->bs_width) >> 2];
+                }
+
+                if ((bs0 == 2) || (bs1 == 2)) {
+                    const int qp0 = (bs0 == 2) ? ((get_qPy(s, x, y - 1)     + get_qPy(s, x, y)     + 1) >> 1) : 0;
+                    const int qp1 = (bs1 == 2) ? ((get_qPy(s, x + 8, y - 1) + get_qPy(s, x + 8, y) + 1) >> 1) : 0;
+
+                    // the first half may lie in the left CTB, the second
+                    // half (x + 8) always lies in the current one
+                    tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset;
+                    c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
+                    c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
+                    src = &s->frame->data[chroma][(y / 2) * s->frame->linesize[chroma] + ((x / 2) << s->sps->pixel_shift)];
+                    if (pcmf) {
+                        no_p[0] = get_pcm(s, x, y - 1);
+                        no_p[1] = get_pcm(s, x + 8, y - 1);
+                        no_q[0] = get_pcm(s, x, y);
+                        no_q[1] = get_pcm(s, x + 8, y);
+                        s->hevcdsp.hevc_h_loop_filter_chroma_c(src, s->frame->linesize[chroma], c_tc, no_p, no_q);
+                    } else
+                        s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Compute the deblocking boundary strength between the current block and
+ * one neighbour.
+ *
+ * @param curr             motion data of the current block
+ * @param curr_cbf_luma    luma coded-block flag of the current block
+ * @param neigh            motion data of the neighbouring block
+ * @param neigh_cbf_luma   luma coded-block flag of the neighbouring block
+ * @param neigh_refPicList reference picture lists used to resolve the
+ *                         neighbour's reference indices; the current
+ *                         block's are resolved through s->ref->refPicList
+ * @param tu_border        non-zero when the edge is a transform block edge,
+ *                         which enables the intra and cbf checks
+ * @return 2 for an intra edge, 1 for a coded-residual or motion
+ *         discontinuity, 0 when no filtering is needed
+ */
+static int boundary_strength(HEVCContext *s, MvField *curr,
+                             uint8_t curr_cbf_luma, MvField *neigh,
+                             uint8_t neigh_cbf_luma, RefPicList *neigh_refPicList,
+                             int tu_border)
+{
+    // number of motion vectors used by the current block
+    int mvs = curr->pred_flag[0] + curr->pred_flag[1];
+
+    if (tu_border) {
+        if (curr->is_intra || neigh->is_intra)
+            return 2;
+        if (curr_cbf_luma || neigh_cbf_luma)
+            return 1;
+    }
+
+    // Blocks using a different number of motion vectors fall through to
+    // the final return 1. Vector differences are compared against 4.
+    if (mvs == neigh->pred_flag[0] + neigh->pred_flag[1]) {
+        if (mvs == 2) {
+            // same L0 and L1
+            // (all four references resolve to the same list entry, so the
+            // vectors may match either straight or crossed)
+            if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]   &&
+                s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
+                neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
+                if ((abs(neigh->mv[0].x - curr->mv[0].x) >= 4 || abs(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                     abs(neigh->mv[1].x - curr->mv[1].x) >= 4 || abs(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+                    (abs(neigh->mv[1].x - curr->mv[0].x) >= 4 || abs(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                     abs(neigh->mv[0].x - curr->mv[1].x) >= 4 || abs(neigh->mv[0].y - curr->mv[1].y) >= 4))
+                    return 1;
+                else
+                    return 0;
+            // references match list for list: compare L0 with L0, L1 with L1
+            } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+                       neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+                if (abs(neigh->mv[0].x - curr->mv[0].x) >= 4 || abs(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+                    abs(neigh->mv[1].x - curr->mv[1].x) >= 4 || abs(neigh->mv[1].y - curr->mv[1].y) >= 4)
+                    return 1;
+                else
+                    return 0;
+            // references match crosswise: compare L0 with L1 and L1 with L0
+            } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+                       neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+                if (abs(neigh->mv[1].x - curr->mv[0].x) >= 4 || abs(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+                    abs(neigh->mv[0].x - curr->mv[1].x) >= 4 || abs(neigh->mv[0].y - curr->mv[1].y) >= 4)
+                    return 1;
+                else
+                    return 0;
+            } else {
+                // different reference pictures
+                return 1;
+            }
+        } else { // 1 MV
+            Mv A, B;
+            int ref_A;
+            int ref_B;
+
+            // pick whichever list each block actually uses
+            if (curr->pred_flag[0]) {
+                A = curr->mv[0];
+                ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
+            } else {
+                A = curr->mv[1];
+                ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
+            }
+
+            if (neigh->pred_flag[0]) {
+                B = neigh->mv[0];
+                ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
+            } else {
+                B = neigh->mv[1];
+                ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
+            }
+
+            if (ref_A == ref_B) {
+                if (abs(A.x - B.x) >= 4 || abs(A.y - B.y) >= 4)
+                    return 1;
+                else
+                    return 0;
+            } else
+                return 1;
+        }
+    }
+
+    return 1;
+}
+
+/**
+ * Compute and store the deblocking boundary strengths for the transform
+ * block of size (1 << log2_trafo_size) at luma position (x0, y0).
+ *
+ * Four groups of edges are handled: the block's top edge, horizontal edges
+ * inside the block, the block's left edge, and vertical edges inside the
+ * block. Outer edges are only considered on the 8-sample grid; inner edges
+ * only for non-intra blocks larger than the minimum PU size. Only non-zero
+ * strengths are written to s->horizontal_bs / s->vertical_bs.
+ *
+ * @param slice_or_tiles_up_boundary   bit 0: top edge is a slice boundary,
+ *                                     bit 1: top edge is a tile boundary
+ * @param slice_or_tiles_left_boundary same bits for the left edge
+ */
+void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size,
+                                           int slice_or_tiles_up_boundary, int slice_or_tiles_left_boundary)
+{
+    MvField *tab_mvf      = s->ref->tab_mvf;
+    int log2_min_pu_size  = s->sps->log2_min_pu_size;
+    int log2_min_tu_size  = s->sps->log2_min_transform_block_size;
+    int pic_width_in_min_pu = s->sps->width >> log2_min_pu_size;
+    int pic_width_in_min_tu = s->sps->width >> log2_min_tu_size;
+    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * pic_width_in_min_pu + (x0 >> log2_min_pu_size)].is_intra;
+
+    int i, j;
+    int bs;
+
+    // bs for the horizontal TU boundary at the top of the block
+    if (y0 > 0 && (y0 & 7) == 0) {
+        // p is the row just above the edge, q the row just below it
+        int yp_pu = (y0 - 1) >> log2_min_pu_size;
+        int yq_pu = y0 >> log2_min_pu_size;
+        int yp_tu = (y0 - 1) >> log2_min_tu_size;
+        int yq_tu = y0 >> log2_min_tu_size;
+
+        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+            int x_pu = (x0 + i) >> log2_min_pu_size;
+            int x_tu = (x0 + i) >> log2_min_tu_size;
+            MvField *top  = &tab_mvf[yp_pu * pic_width_in_min_pu + x_pu];
+            MvField *curr = &tab_mvf[yq_pu * pic_width_in_min_pu + x_pu];
+            uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * pic_width_in_min_tu + x_tu];
+            uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * pic_width_in_min_tu + x_tu];
+            RefPicList* top_refPicList = ff_hevc_get_ref_list(s, s->ref, x0 + i, y0 - 1);
+
+            bs = boundary_strength(s, curr, curr_cbf_luma, top, top_cbf_luma, top_refPicList, 1);
+            // no filtering across slice/tile boundaries on a CTB edge when
+            // the corresponding "filter across" flag is off
+            if (!s->sh.slice_loop_filter_across_slices_enabled_flag && (slice_or_tiles_up_boundary & 1) && (y0 % (1 << s->sps->log2_ctb_size)) == 0)
+                bs = 0;
+            else if (!s->pps->loop_filter_across_tiles_enabled_flag && (slice_or_tiles_up_boundary & 2)  && (y0 % (1 << s->sps->log2_ctb_size)) == 0)
+                bs = 0;
+            if (y0 == 0 || s->sh.disable_deblocking_filter_flag == 1)
+                bs = 0;
+            if (bs)
+                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+        }
+    }
+
+    // bs for TU internal horizontal PU boundaries
+    if (log2_trafo_size > s->sps->log2_min_pu_size && !is_intra)
+        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+            int yp_tu = (y0 + j - 1) >> log2_min_tu_size;
+            int yq_tu = (y0 + j)     >> log2_min_tu_size;
+
+
+            for (i = 0; i < (1<<log2_trafo_size); i += 4) {
+                int x_pu = (x0 + i) >> log2_min_pu_size;
+                int x_tu = (x0 + i) >> log2_min_tu_size;
+                MvField *top  = &tab_mvf[yp_pu * pic_width_in_min_pu + x_pu];
+                MvField *curr = &tab_mvf[yq_pu * pic_width_in_min_pu + x_pu];
+                uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * pic_width_in_min_tu + x_tu];
+                uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * pic_width_in_min_tu + x_tu];
+                RefPicList* top_refPicList = ff_hevc_get_ref_list(s, s->ref, x0 + i, y0 + j - 1);
+
+                // tu_border = 0: only the motion data is compared
+                bs = boundary_strength(s, curr, curr_cbf_luma, top, top_cbf_luma, top_refPicList, 0);
+                if (s->sh.disable_deblocking_filter_flag == 1)
+                    bs = 0;
+                if (bs)
+                    s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+            }
+        }
+
+    // bs for vertical TU boundaries
+    if (x0 > 0 && (x0 & 7) == 0) {
+        // p is the column just left of the edge, q the column just right of it
+        int xp_pu = (x0 - 1) >> log2_min_pu_size;
+        int xq_pu =  x0      >> log2_min_pu_size;
+        int xp_tu = (x0 - 1) >> log2_min_tu_size;
+        int xq_tu =  x0      >> log2_min_tu_size;
+
+        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+            int y_pu = (y0 + i) >> log2_min_pu_size;
+            int y_tu = (y0 + i) >> log2_min_tu_size;
+            MvField *left = &tab_mvf[y_pu * pic_width_in_min_pu + xp_pu];
+            MvField *curr = &tab_mvf[y_pu * pic_width_in_min_pu + xq_pu];
+
+            uint8_t left_cbf_luma = s->cbf_luma[y_tu * pic_width_in_min_tu + xp_tu];
+            uint8_t curr_cbf_luma = s->cbf_luma[y_tu * pic_width_in_min_tu + xq_tu];
+            RefPicList* left_refPicList = ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0 + i);
+
+            bs = boundary_strength(s, curr, curr_cbf_luma, left, left_cbf_luma, left_refPicList, 1);
+            // same slice/tile boundary rules as for the top edge
+            if (!s->sh.slice_loop_filter_across_slices_enabled_flag && (slice_or_tiles_left_boundary & 1) && (x0 % (1 << s->sps->log2_ctb_size)) == 0)
+                bs = 0;
+            else if (!s->pps->loop_filter_across_tiles_enabled_flag && (slice_or_tiles_left_boundary & 2) && (x0 % (1 << s->sps->log2_ctb_size)) == 0)
+                bs = 0;
+            if (x0 == 0 || s->sh.disable_deblocking_filter_flag == 1)
+                bs = 0;
+            if (bs)
+                s->vertical_bs[(x0 >> 3) + ((y0 + i) >> 2) * s->bs_width] = bs;
+        }
+    }
+
+    // bs for TU internal vertical PU boundaries
+    if (log2_trafo_size > s->sps->log2_min_pu_size && !is_intra)
+        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+            int y_pu = (y0 + j) >> log2_min_pu_size;
+            int y_tu = (y0 + j) >> log2_min_tu_size;
+
+            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
+                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
+                int xq_pu = (x0 + i) >> log2_min_pu_size;
+                int xp_tu = (x0 + i - 1) >> log2_min_tu_size;
+                int xq_tu = (x0 + i) >> log2_min_tu_size;
+                MvField *left = &tab_mvf[y_pu * pic_width_in_min_pu + xp_pu];
+                MvField *curr = &tab_mvf[y_pu * pic_width_in_min_pu + xq_pu];
+                uint8_t left_cbf_luma = s->cbf_luma[y_tu * pic_width_in_min_tu + xp_tu];
+                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * pic_width_in_min_tu + xq_tu];
+                RefPicList* left_refPicList = ff_hevc_get_ref_list(s, s->ref, x0 + i - 1, y0 + j);
+
+
+                // tu_border = 0: only the motion data is compared
+                bs = boundary_strength(s, curr, curr_cbf_luma, left, left_cbf_luma, left_refPicList, 0);
+                if (s->sh.disable_deblocking_filter_flag == 1)
+                    bs = 0;
+                if (bs)
+                    s->vertical_bs[((x0 + i) >> 3) + ((y0 + j) >> 2) * s->bs_width] = bs;
+            }
+        }
+}
+#undef LUMA
+#undef CB
+#undef CR
+
+/**
+ * Run the in-loop filters for the CTB at luma position (x, y): deblocking
+ * first, then SAO when it is enabled in the SPS.
+ *
+ * The plane range handed to SAO is derived from the slice header: flag [0]
+ * selects plane 0, flag [1] selects planes 1 and 2.
+ */
+void ff_hevc_hls_filter(HEVCContext *s, int x, int y)
+{
+    const int sao_first_plane = s->sh.slice_sample_adaptive_offset_flag[0] != 0;
+    const int sao_other_planes = s->sh.slice_sample_adaptive_offset_flag[1] != 0;
+
+    deblocking_filter_CTB(s, x, y);
+
+    if (!s->sps->sao_enabled)
+        return;
+
+    sao_filter_CTB(s, x, y,
+                   sao_first_plane  ? 0 : 1,
+                   sao_other_planes ? 3 : 1);
+}
+
+/**
+ * Filter the CTBs that can be processed given the CTB at (x_ctb, y_ctb):
+ * the above-left neighbour, the above neighbour when the current CTB is in
+ * the last column, and the left neighbour when it is in the last row.
+ * The CTB at (x_ctb, y_ctb) itself is not filtered by this function.
+ */
+void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
+{
+    const int left_x  = x_ctb - ctb_size;
+    const int above_y = y_ctb - ctb_size;
+
+    if (y_ctb) {
+        if (x_ctb)
+            ff_hevc_hls_filter(s, left_x, above_y);
+        if (x_ctb >= s->sps->width - ctb_size)
+            ff_hevc_hls_filter(s, x_ctb, above_y);
+    }
+
+    if (x_ctb && y_ctb >= s->sps->height - ctb_size)
+        ff_hevc_hls_filter(s, left_x, y_ctb);
+}
diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c
new file mode 100644
index 0000000000..f23a0d9e12
--- /dev/null
+++ b/libavcodec/hevc_mvs.c
@@ -0,0 +1,1011 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 Anand Meher Kotra
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "hevc.h"
+
+static const uint8_t l0_l1_cand_idx[12][2] = { /* rows indexed by combIdx: { l0CandIdx, l1CandIdx } into the merge candidate list */
+    { 0, 1, },
+    { 1, 0, },
+    { 0, 2, },
+    { 2, 0, },
+    { 1, 2, },
+    { 2, 1, },
+    { 0, 3, },
+    { 3, 0, },
+    { 1, 3, },
+    { 3, 1, },
+    { 2, 3, },
+    { 3, 2, },
+};
+
+/**
+ * Fill lc->na with the availability of the neighbours of the block at
+ * (x0, y0) of size nPbW x nPbH.
+ *
+ * The result depends on the position of the block inside its CTB, on the
+ * per-CTB neighbour flags (ctb_up_flag, ctb_left_flag, ...) and on the
+ * end_of_tiles_x / end_of_tiles_y limits stored in the local context.
+ */
+void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0, int nPbW, int nPbH)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    const int ctb_size = 1 << s->sps->log2_ctb_size;
+    const int x0b      = x0 & (ctb_size - 1);
+    const int y0b      = y0 & (ctb_size - 1);
+    int up_right;
+
+    lc->na.cand_up   = lc->ctb_up_flag   || y0b;
+    lc->na.cand_left = lc->ctb_left_flag || x0b;
+
+    if (x0b || y0b)
+        lc->na.cand_up_left = lc->na.cand_left && lc->na.cand_up;
+    else
+        lc->na.cand_up_left = lc->ctb_up_left_flag;
+
+    /* a block ending exactly on the right CTB edge looks into the up-right CTB */
+    if (x0b + nPbW == ctb_size)
+        up_right = lc->ctb_up_right_flag && !y0b;
+    else
+        up_right = lc->na.cand_up;
+
+    lc->na.cand_up_right_sap = up_right;
+    lc->na.cand_up_right     = up_right && (x0 + nPbW) < lc->end_of_tiles_x;
+
+    if (y0 + nPbH >= lc->end_of_tiles_y)
+        lc->na.cand_bottom_left = 0;
+    else
+        lc->na.cand_bottom_left = lc->na.cand_left;
+}
+
+/*
+ * 6.4.1 Derivation process for z-scan order block availability
+ *
+ * Returns nonzero when (xN, yN) lies inside the picture and its entry in
+ * pps->min_tb_addr_zs is not greater than the entry of (xCurr, yCurr).
+ */
+static int z_scan_block_avail(HEVCContext *s, int xCurr, int yCurr,
+                              int xN, int yN)
+{
+#define MIN_TB_ADDR_ZS(x, y)                                            \
+    s->pps->min_tb_addr_zs[(y) * s->sps->min_tb_width + (x)]
+    int addr_curr = MIN_TB_ADDR_ZS(xCurr >> s->sps->log2_min_transform_block_size,
+                                   yCurr >> s->sps->log2_min_transform_block_size);
+    int addr_n;
+
+    /* anything outside of the picture is never available */
+    if (xN < 0 || yN < 0)
+        return 0;
+    if (xN >= s->sps->width || yN >= s->sps->height)
+        return 0;
+
+    addr_n = MIN_TB_ADDR_ZS(xN >> s->sps->log2_min_transform_block_size,
+                            yN >> s->sps->log2_min_transform_block_size);
+
+    return addr_n <= addr_curr;
+}
+
+
+/*
+ * Returns 0 only when all of the following hold: the prediction block is a
+ * quarter of the coding block (both dimensions half of the CB size), partIdx
+ * is 1, xA1 is left of lc->cu.x + nPbW and yA1 is at or below
+ * lc->cu.y + nPbH. Returns 1 in every other case.
+ * log2_cb_size aside, x0 and y0 are not used.
+ */
+static int same_prediction_block(HEVCLocalContext *lc, int log2_cb_size,
+                                 int x0, int y0, int nPbW, int nPbH,
+                                 int xA1, int yA1, int partIdx)
+{
+    if (nPbW << 1 != 1 << log2_cb_size)
+        return 1;
+    if (nPbH << 1 != 1 << log2_cb_size)
+        return 1;
+    if (partIdx != 1)
+        return 1;
+
+    return lc->cu.x + nPbW <= xA1 ||
+           lc->cu.y + nPbH >  yA1;
+}
+
+/*
+ * 6.4.2 Derivation process for prediction block availability
+ *
+ * A location (xA1, yA1) strictly inside the current coding unit is judged
+ * by same_prediction_block(); any other location goes through the z-scan
+ * order availability check.
+ */
+static int check_prediction_block_available(HEVCContext *s, int log2_cb_size,
+                                            int x0, int y0, int nPbW, int nPbH,
+                                            int xA1, int yA1, int partIdx)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    int inside_cu = lc->cu.x < xA1 && lc->cu.y < yA1 &&
+                    (lc->cu.x + (1 << log2_cb_size)) > xA1 &&
+                    (lc->cu.y + (1 << log2_cb_size)) > yA1;
+
+    if (!inside_cu)
+        return z_scan_block_avail(s, x0, y0, xA1, yA1);
+
+    return same_prediction_block(lc, log2_cb_size, x0, y0,
+                                 nPbW, nPbH, xA1, yA1, partIdx);
+}
+
+/*
+ * Check if the two luma locations (xN, yN) and (xP, yP) belong to the same
+ * motion estimation region, i.e. if they are equal after being shifted right
+ * by pps->log2_parallel_merge_level.
+ *
+ * NOTE: despite its name, this returns nonzero when the locations are in the
+ * SAME region.
+ */
+static int isDiffMER(HEVCContext *s, int xN, int yN, int xP, int yP)
+{
+    uint8_t plevel = s->pps->log2_parallel_merge_level;
+    int same_column = (xN >> plevel) == (xP >> plevel);
+    int same_row    = (yN >> plevel) == (yP >> plevel);
+
+    return same_column && same_row;
+}
+
+#define MATCH(x) (A.x == B.x)
+
+/*
+ * Check if the MVs and reference indices are the same between A and B.
+ *
+ * Both fields must use exactly the same set of lists (L0 only, L1 only or
+ * both); for every list in use the reference index and both MV components
+ * have to match. Returns 0 when neither list is used.
+ */
+static int compareMVrefidx(struct MvField A, struct MvField B)
+{
+    int list;
+    int any_list_used = 0;
+
+    for (list = 0; list < 2; list++) {
+        /* a list used by only one of the two fields can never match */
+        if (!A.pred_flag[list] != !B.pred_flag[list])
+            return 0;
+        if (!A.pred_flag[list])
+            continue;
+
+        any_list_used = 1;
+        if (!MATCH(ref_idx[list]) || !MATCH(mv[list].x) || !MATCH(mv[list].y))
+            return 0;
+    }
+
+    return any_list_used;
+}
+
+/*
+ * Scale the motion vector *src by the ratio of the two distances tb / td and
+ * store the result in *dst (dst may point to the same Mv as src).
+ *
+ * td and tb are clipped to the int8 range first. td is used as a divisor,
+ * so the caller has to pass a nonzero value.
+ */
+static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
+{
+    int tx, scale_factor;
+    int scaled_x, scaled_y;
+
+    td = av_clip_int8_c(td);
+    tb = av_clip_int8_c(tb);
+    tx = (0x4000 + abs(td / 2)) / td;
+    scale_factor = av_clip_c((tb * tx + 32) >> 6, -4096, 4095);
+
+    scaled_x = scale_factor * src->x;
+    scaled_y = scale_factor * src->y;
+
+    /* round to nearest, with the extra 1 added for negative products */
+    dst->x = av_clip_int16_c((scaled_x + 127 + (scaled_x < 0)) >> 8);
+    dst->y = av_clip_int16_c((scaled_y + 127 + (scaled_y < 0)) >> 8);
+}
+
+/*
+ * Turn the collocated motion vector *mvCol into a predictor *mvLXCol for
+ * reference refIdxLx of list X.
+ *
+ * If exactly one of the two references involved is a long-term one, the
+ * predictor is zeroed and 0 is returned. Otherwise 1 is returned and the
+ * predictor is either a plain copy of *mvCol (long-term references or equal
+ * POC distances) or *mvCol scaled by the ratio of the POC distances.
+ */
+static int check_mvset(Mv *mvLXCol, Mv *mvCol,
+                       int colPic, int poc,
+                       RefPicList *refPicList, int X, int refIdxLx,
+                       RefPicList *refPicList_col, int listCol, int refidxCol)
+{
+    RefPicList *cur_list = &refPicList[X];
+    RefPicList *col_list = &refPicList_col[listCol];
+    int cur_lt = cur_list->isLongTerm[refIdxLx];
+    int col_lt = col_list->isLongTerm[refidxCol];
+    int col_poc_diff, cur_poc_diff;
+
+    if (cur_lt != col_lt) {
+        mvLXCol->x = 0;
+        mvLXCol->y = 0;
+        return 0;
+    }
+
+    col_poc_diff = colPic - col_list->list[refidxCol];
+    cur_poc_diff = poc    - cur_list->list[refIdxLx];
+
+    if (!col_poc_diff)
+        col_poc_diff = 1; // error resilience
+
+    if (!cur_lt && col_poc_diff != cur_poc_diff) {
+        mv_scale(mvLXCol, mvCol, col_poc_diff, cur_poc_diff);
+    } else {
+        mvLXCol->x = mvCol->x;
+        mvLXCol->y = mvCol->y;
+    }
+    return 1;
+}
+
+/* Run check_mvset() on the list-l motion data of temp_col (l is 0 or 1). */
+#define CHECK_MVSET(l) \
+    check_mvset(mvLXCol, temp_col.mv + l, \
+                colPic, s->poc, \
+                refPicList, X, refIdxLx, \
+                refPicList_col, L##l, temp_col.ref_idx[l])
+
+/*
+ * derive the motion vectors section 8.5.3.1.8
+ *
+ * Select which list of the collocated field temp_col provides the motion
+ * vector and convert it with check_mvset(). Returns the availability of
+ * the resulting predictor *mvLXCol; an intra collocated block yields a zero
+ * vector and 0.
+ */
+static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
+                                         int refIdxLx, Mv* mvLXCol, int X,
+                                         int colPic, RefPicList* refPicList_col)
+{
+    RefPicList *refPicList = s->ref->refPicList;
+    int has_later_ref = 0;
+    int list, i;
+
+    if (temp_col.is_intra) {
+        mvLXCol->x = 0;
+        mvLXCol->y = 0;
+        return 0;
+    }
+
+    /* uni-predicted collocated block: only one list to pick from */
+    if (temp_col.pred_flag[0] == 0)
+        return CHECK_MVSET(1);
+    if (temp_col.pred_flag[0] != 1)
+        return 0;
+    if (temp_col.pred_flag[1] == 0)
+        return CHECK_MVSET(0);
+    if (temp_col.pred_flag[1] != 1)
+        return 0;
+
+    /* bi-predicted: does any reference of the current picture have a
+     * larger POC than the current picture? */
+    for (list = 0; list < 2 && !has_later_ref; list++) {
+        for (i = 0; i < refPicList[list].nb_refs; i++) {
+            if (refPicList[list].list[i] > s->poc) {
+                has_later_ref = 1;
+                break;
+            }
+        }
+    }
+
+    if (!has_later_ref && X == 0)
+        return CHECK_MVSET(0);
+    if (!has_later_ref && X == 1)
+        return CHECK_MVSET(1);
+
+    if (s->sh.collocated_list == L1)
+        return CHECK_MVSET(0);
+    return CHECK_MVSET(1);
+}
+
+/*
+ * Entry of the motion vector field at min-PU coordinates (x, y).
+ * Expects `tab_mvf` and `pic_width_in_min_pu` to be in scope where it is used.
+ * Both arguments are parenthesized so that expression arguments index
+ * the intended element.
+ */
+#define TAB_MVF(x, y) \
+    tab_mvf[(y) * pic_width_in_min_pu + (x)]
+
+/* Same, for the location named v, through the x<v>_pu / y<v>_pu locals. */
+#define TAB_MVF_PU(v) \
+    TAB_MVF(x##v##_pu, y##v##_pu)
+
+/*
+ * Derive the temporal predictor from `temp_col` for the location named v,
+ * using the reference picture list ff_hevc_get_ref_list() returns for
+ * (x<v>, y<v>) in `ref`.
+ */
+#define DERIVE_TEMPORAL_COLOCATED_MVS(v) \
+    derive_temporal_colocated_mvs(s, temp_col, \
+                                  refIdxLx, mvLXCol, X, colPic, \
+                                  ff_hevc_get_ref_list(s, ref, \
+                                                       x##v, y##v))
+
+/*
+ * 8.5.3.1.7  temporal luma motion vector prediction
+ *
+ * Try the bottom right collocated candidate first and fall back to the
+ * center one. Both locations are rounded down to a multiple of 16 before
+ * the motion field of the collocated picture is read.
+ * Returns the availability flag of *mvLXCol; 0 when there is no collocated
+ * reference picture.
+ */
+static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
+                                       int nPbW, int nPbH, int refIdxLx,
+                                       Mv* mvLXCol, int X)
+{
+    HEVCFrame *ref = s->ref->collocated_ref;
+    int pic_width_in_min_pu = s->sps->width >> s->sps->log2_min_pu_size;
+    MvField *tab_mvf;
+    MvField temp_col;
+    int colPic;
+    int x_col, y_col;
+    int available = 0;
+
+    if (!ref)
+        return 0;
+
+    tab_mvf = ref->tab_mvf;
+    colPic  = ref->poc;
+
+    ff_thread_await_progress(&ref->tf, INT_MAX, 0);
+
+    // bottom right collocated motion vector, only usable when it stays in
+    // the same CTB row and inside of the picture
+    x_col = x0 + nPbW;
+    y_col = y0 + nPbH;
+    if (tab_mvf &&
+        y0 >> s->sps->log2_ctb_size == y_col >> s->sps->log2_ctb_size &&
+        y_col < s->sps->height &&
+        x_col < s->sps->width) {
+        x_col = ((x_col >> 4) << 4);
+        y_col = ((y_col >> 4) << 4);
+        temp_col = tab_mvf[(y_col >> s->sps->log2_min_pu_size) * pic_width_in_min_pu +
+                           (x_col >> s->sps->log2_min_pu_size)];
+        available = derive_temporal_colocated_mvs(s, temp_col,
+                                                  refIdxLx, mvLXCol, X, colPic,
+                                                  ff_hevc_get_ref_list(s, ref,
+                                                                       x_col, y_col));
+    } else {
+        mvLXCol->x = 0;
+        mvLXCol->y = 0;
+    }
+
+    // center collocated motion vector
+    if (tab_mvf && !available) {
+        x_col = x0 + (nPbW >> 1);
+        y_col = y0 + (nPbH >> 1);
+        x_col = ((x_col >> 4) << 4);
+        y_col = ((y_col >> 4) << 4);
+        temp_col = tab_mvf[(y_col >> s->sps->log2_min_pu_size) * pic_width_in_min_pu +
+                           (x_col >> s->sps->log2_min_pu_size)];
+        available = derive_temporal_colocated_mvs(s, temp_col,
+                                                  refIdxLx, mvLXCol, X, colPic,
+                                                  ff_hevc_get_ref_list(s, ref,
+                                                                       x_col, y_col));
+    }
+
+    return available;
+}
+
+/*
+ * Nonzero when the neighbour flag `cand` is set and the motion field entry
+ * of the location named v is not intra. The entry is only read when `cand`
+ * is nonzero. `cand` is parenthesized so that expression arguments keep
+ * their meaning.
+ */
+#define AVAILABLE(cand, v) \
+    ((cand) && !TAB_MVF_PU(v).is_intra)
+
+/* Prediction block availability (6.4.2) of the location named v. */
+#define PRED_BLOCK_AVAILABLE(v) \
+    check_prediction_block_available(s, log2_cb_size, \
+                                     x0, y0, nPbW, nPbH, \
+                                     x##v, y##v, part_idx)
+
+/* Nonzero when the locations named a and b carry identical motion data. */
+#define COMPARE_MV_REFIDX(a, b) \
+    compareMVrefidx(TAB_MVF_PU(a), TAB_MVF_PU(b))
+
+/*
+ * 8.5.3.1.2  Derivation process for spatial merging candidates
+ *
+ * Fills mergecandlist[] with, in this order:
+ *  - the spatial candidates A1, B1, B0, A0 and B2 that are available and
+ *    not pruned as duplicates of an earlier neighbour,
+ *  - the temporal candidate,
+ *  - combined bi-predictive candidates (B slices only),
+ *  - zero motion vector candidates,
+ * the last three only while the list holds fewer than
+ * s->sh.max_num_merge_cand entries.
+ *
+ * The neighbour flags in lc->na are expected to have been set for this
+ * block by ff_hevc_set_neighbour_available().
+ */
+static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0,
+                                            int nPbW, int nPbH, int log2_cb_size,
+                                            int singleMCLFlag, int part_idx,
+                                            struct MvField mergecandlist[])
+{
+    HEVCLocalContext *lc   = &s->HEVClc;
+    RefPicList *refPicList = s->ref->refPicList;
+    MvField *tab_mvf       = s->ref->tab_mvf;
+    const int pic_width_in_min_pu = s->sps->width >> s->sps->log2_min_pu_size;
+
+    const int cand_bottom_left = lc->na.cand_bottom_left;
+    const int cand_left        = lc->na.cand_left;
+    const int cand_up_left     = lc->na.cand_up_left;
+    const int cand_up          = lc->na.cand_up;
+    const int cand_up_right    = lc->na.cand_up_right_sap;
+
+    /* luma locations of the five neighbours ... */
+    const int xA1 = x0 - 1;
+    const int yA1 = y0 + nPbH - 1;
+    const int xB1 = x0 + nPbW - 1;
+    const int yB1 = y0 - 1;
+    const int xB0 = x0 + nPbW;
+    const int yB0 = y0 - 1;
+    const int xA0 = x0 - 1;
+    const int yA0 = y0 + nPbH;
+    const int xB2 = x0 - 1;
+    const int yB2 = y0 - 1;
+
+    /* ... and the same in units of minimum prediction blocks */
+    const int xA1_pu = xA1 >> s->sps->log2_min_pu_size;
+    const int yA1_pu = yA1 >> s->sps->log2_min_pu_size;
+    const int xB1_pu = xB1 >> s->sps->log2_min_pu_size;
+    const int yB1_pu = yB1 >> s->sps->log2_min_pu_size;
+    const int xB0_pu = xB0 >> s->sps->log2_min_pu_size;
+    const int yB0_pu = yB0 >> s->sps->log2_min_pu_size;
+    const int xA0_pu = xA0 >> s->sps->log2_min_pu_size;
+    const int yA0_pu = yA0 >> s->sps->log2_min_pu_size;
+    const int xB2_pu = xB2 >> s->sps->log2_min_pu_size;
+    const int yB2_pu = yB2 >> s->sps->log2_min_pu_size;
+
+    /* second partition of a CU that does not share a single merge list */
+    const int second_part = !singleMCLFlag && part_idx == 1;
+
+    int is_available_a1, is_available_b1, is_available_b0;
+    int is_available_a0, is_available_b2;
+
+    struct MvField combCand = { { { 0 } } };
+    struct MvField TMVPCand = { { { 0 } } };
+    struct MvField zerovector;
+    struct Mv mvL0Col = { 0 };
+    struct Mv mvL1Col = { 0 };
+    int refIdxL0Col = 0;
+    int refIdxL1Col = 0;
+    int availableFlagL0Col = 0;
+    int availableFlagL1Col = 0;
+
+    int numMergeCand = 0;
+    int numOrigMergeCand;
+    int numRefIdx = 0;
+    int zeroIdx;
+    int combIdx;
+    int is_p_slice;
+
+    // left spatial merge candidate
+    is_available_a1 = AVAILABLE(cand_left, A1);
+    if ((second_part &&
+         (lc->cu.part_mode == PART_Nx2N ||
+          lc->cu.part_mode == PART_nLx2N ||
+          lc->cu.part_mode == PART_nRx2N)) ||
+        isDiffMER(s, xA1, yA1, x0, y0))
+        is_available_a1 = 0;
+
+    if (is_available_a1)
+        mergecandlist[numMergeCand++] = TAB_MVF_PU(A1);
+
+    // above spatial merge candidate, dropped when identical to A1
+    is_available_b1 = AVAILABLE(cand_up, B1);
+    if ((second_part &&
+         (lc->cu.part_mode == PART_2NxN ||
+          lc->cu.part_mode == PART_2NxnU ||
+          lc->cu.part_mode == PART_2NxnD)) ||
+        isDiffMER(s, xB1, yB1, x0, y0))
+        is_available_b1 = 0;
+
+    if (is_available_b1 &&
+        !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1)))
+        mergecandlist[numMergeCand++] = TAB_MVF_PU(B1);
+
+    // above right spatial merge candidate, dropped when identical to B1
+    is_available_b0 = PRED_BLOCK_AVAILABLE(B0) &&
+                      AVAILABLE(cand_up_right, B0);
+    if (isDiffMER(s, xB0, yB0, x0, y0))
+        is_available_b0 = 0;
+
+    if (is_available_b0 &&
+        !(is_available_b1 && COMPARE_MV_REFIDX(B0, B1)))
+        mergecandlist[numMergeCand++] = TAB_MVF_PU(B0);
+
+    // left bottom spatial merge candidate, dropped when identical to A1
+    is_available_a0 = PRED_BLOCK_AVAILABLE(A0) &&
+                      AVAILABLE(cand_bottom_left, A0);
+    if (isDiffMER(s, xA0, yA0, x0, y0))
+        is_available_a0 = 0;
+
+    if (is_available_a0 &&
+        !(is_available_a1 && COMPARE_MV_REFIDX(A0, A1)))
+        mergecandlist[numMergeCand++] = TAB_MVF_PU(A0);
+
+    // above left spatial merge candidate: dropped when identical to A1 or
+    // B1, and only used if the four candidates above were not all taken
+    is_available_b2 = AVAILABLE(cand_up_left, B2);
+    if (isDiffMER(s, xB2, yB2, x0, y0))
+        is_available_b2 = 0;
+
+    if (is_available_b2 &&
+        !(is_available_a1 && COMPARE_MV_REFIDX(B2, A1)) &&
+        !(is_available_b1 && COMPARE_MV_REFIDX(B2, B1)) &&
+        numMergeCand != 4)
+        mergecandlist[numMergeCand++] = TAB_MVF_PU(B2);
+
+    // temporal motion vector candidate
+    // one optimization is that do temporal checking only if the number of
+    // available candidates < MRG_MAX_NUM_CANDS
+    if (s->sh.slice_temporal_mvp_enabled_flag) {
+        availableFlagL0Col = temporal_luma_motion_vector(s, x0, y0, nPbW, nPbH,
+                                                         refIdxL0Col, &mvL0Col, 0);
+        // one optimization is that l1 check can be done only when the current slice type is B_SLICE
+        if (s->sh.slice_type == B_SLICE)
+            availableFlagL1Col = temporal_luma_motion_vector(s, x0, y0, nPbW,
+                                                             nPbH, refIdxL1Col, &mvL1Col, 1);
+
+        if (availableFlagL0Col || availableFlagL1Col) {
+            TMVPCand.is_intra     = 0;
+            TMVPCand.pred_flag[0] = availableFlagL0Col;
+            TMVPCand.pred_flag[1] = availableFlagL1Col;
+            if (TMVPCand.pred_flag[0]) {
+                TMVPCand.mv[0]      = mvL0Col;
+                TMVPCand.ref_idx[0] = refIdxL0Col;
+            }
+            if (TMVPCand.pred_flag[1]) {
+                TMVPCand.mv[1]      = mvL1Col;
+                TMVPCand.ref_idx[1] = refIdxL1Col;
+            }
+            if (numMergeCand < s->sh.max_num_merge_cand)
+                mergecandlist[numMergeCand++] = TMVPCand;
+        }
+    }
+
+    numOrigMergeCand = numMergeCand;
+
+    // combined bi-predictive merge candidates  (applies for B slices)
+    if (s->sh.slice_type == B_SLICE &&
+        numOrigMergeCand > 1 &&
+        numOrigMergeCand < s->sh.max_num_merge_cand) {
+        combIdx = 0;
+        do {
+            MvField l0Cand = mergecandlist[l0_l1_cand_idx[combIdx][0]];
+            MvField l1Cand = mergecandlist[l0_l1_cand_idx[combIdx][1]];
+
+            /* pair an L0 and an L1 motion unless both describe the very
+             * same prediction */
+            if (l0Cand.pred_flag[0] == 1 &&
+                l1Cand.pred_flag[1] == 1 &&
+                (refPicList[0].list[l0Cand.ref_idx[0]] !=
+                 refPicList[1].list[l1Cand.ref_idx[1]] ||
+                 l0Cand.mv[0].x != l1Cand.mv[1].x ||
+                 l0Cand.mv[0].y != l1Cand.mv[1].y)) {
+                combCand.ref_idx[0]   = l0Cand.ref_idx[0];
+                combCand.ref_idx[1]   = l1Cand.ref_idx[1];
+                combCand.pred_flag[0] = 1;
+                combCand.pred_flag[1] = 1;
+                combCand.mv[0].x      = l0Cand.mv[0].x;
+                combCand.mv[0].y      = l0Cand.mv[0].y;
+                combCand.mv[1].x      = l1Cand.mv[1].x;
+                combCand.mv[1].y      = l1Cand.mv[1].y;
+                combCand.is_intra     = 0;
+                mergecandlist[numMergeCand++] = combCand;
+            }
+            combIdx++;
+        } while (combIdx != numOrigMergeCand * (numOrigMergeCand - 1) &&
+                 numMergeCand != s->sh.max_num_merge_cand);
+    }
+
+    /*
+     * append Zero motion vector candidates
+     */
+    is_p_slice = s->sh.slice_type == P_SLICE;
+    if (is_p_slice)
+        numRefIdx = s->sh.nb_refs[0];
+    else if (s->sh.slice_type == B_SLICE)
+        numRefIdx = FFMIN(s->sh.nb_refs[0],
+                          s->sh.nb_refs[1]);
+
+    for (zeroIdx = 0; numMergeCand < s->sh.max_num_merge_cand; zeroIdx++) {
+        /* walk through the reference indices, then stay on index 0 */
+        int zero_ref = (zeroIdx < numRefIdx) ? zeroIdx : 0;
+
+        zerovector.ref_idx[0]   = zero_ref;
+        zerovector.ref_idx[1]   = is_p_slice ? -1 : zero_ref;
+        zerovector.pred_flag[0] = 1;
+        zerovector.pred_flag[1] = !is_p_slice;
+        zerovector.mv[0].x      = 0;
+        zerovector.mv[0].y      = 0;
+        zerovector.mv[1].x      = 0;
+        zerovector.mv[1].y      = 0;
+        zerovector.is_intra     = 0;
+
+        mergecandlist[numMergeCand++] = zerovector;
+    }
+}
+
+/*
+ * 8.5.3.1.1 Derivation process of luma Mvs for merge mode
+ *
+ * Builds the merge candidate list for the prediction block and copies the
+ * candidate selected by merge_idx to *mv.
+ *
+ * With a parallel merge level above 2 and an 8x8 coding block, all
+ * partitions share the candidate list derived for the whole coding unit.
+ * A bi-predicted candidate is reduced to L0 only when the width and height
+ * of the (original) prediction block add up to 12.
+ */
+void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW,
+                                int nPbH, int log2_cb_size, int part_idx,
+                                int merge_idx, MvField *mv)
+{
+    struct MvField mergecand_list[MRG_MAX_NUM_CANDS] = { { { { 0 } } } };
+    HEVCLocalContext *lc = &s->HEVClc;
+    const int cb_size = 1 << log2_cb_size;
+    const int pb_dims = nPbW + nPbH; /* taken before nPbW/nPbH may be replaced */
+    int singleMCLFlag = 0;
+    MvField *selected;
+
+    if (s->pps->log2_parallel_merge_level > 2 && cb_size == 8) {
+        singleMCLFlag = 1;
+        x0       = lc->cu.x;
+        y0       = lc->cu.y;
+        nPbW     = cb_size;
+        nPbH     = cb_size;
+        part_idx = 0;
+    }
+
+    ff_hevc_set_neighbour_available(s, x0, y0, nPbW, nPbH);
+    derive_spatial_merge_candidates(s, x0, y0, nPbW, nPbH, log2_cb_size,
+                                    singleMCLFlag, part_idx, mergecand_list);
+
+    selected = &mergecand_list[merge_idx];
+    if (selected->pred_flag[0] == 1 &&
+        selected->pred_flag[1] == 1 &&
+        pb_dims == 12) {
+        selected->ref_idx[1]   = -1;
+        selected->pred_flag[1] = 0;
+    }
+
+    *mv = *selected;
+}
+
+/*
+ * Scale *mv in place when the picture referenced by the neighbour at min-PU
+ * coordinates (x, y) through list elist differs from the picture referenced
+ * by entry ref_idx of list ref_idx_curr.
+ *
+ * The scaling ratio is built from the POC distances between the current
+ * picture and the two reference pictures.
+ */
+static av_always_inline void dist_scale(HEVCContext *s, Mv * mv,
+                                        int pic_width_in_min_pu, int x, int y,
+                                        int elist, int ref_idx_curr, int ref_idx)
+{
+    RefPicList *refPicList = s->ref->refPicList;
+    MvField *tab_mvf = s->ref->tab_mvf;
+    int ref_pic_elist = refPicList[elist].list[TAB_MVF(x, y).ref_idx[elist]];
+    int ref_pic_curr  = refPicList[ref_idx_curr].list[ref_idx];
+
+    if (ref_pic_elist != ref_pic_curr) {
+        int poc_diff = s->poc - ref_pic_elist;
+
+        /* mv_scale() divides by this distance; a reference carrying the POC
+         * of the current picture would otherwise cause a division by zero */
+        if (!poc_diff)
+            poc_diff = 1; // error resilience
+
+        mv_scale(mv, mv, poc_diff, s->poc - ref_pic_curr);
+    }
+}
+
+/*
+ * Use the neighbour at min-PU coordinates (x, y) as motion vector predictor
+ * without scaling: its list pred_flag_index must be in use and refer to a
+ * picture with the same value in the reference list as entry ref_idx of
+ * list ref_idx_curr.
+ *
+ * Returns 1 and writes *mv on success, returns 0 and leaves *mv untouched
+ * otherwise.
+ */
+static int mv_mp_mode_mx(HEVCContext *s, int x, int y, int pred_flag_index,
+                         Mv *mv, int ref_idx_curr, int ref_idx)
+{
+    MvField *tab_mvf = s->ref->tab_mvf;
+    int pic_width_in_min_pu = s->sps->width >> s->sps->log2_min_pu_size;
+    RefPicList *refPicList = s->ref->refPicList;
+    MvField *neighbour = &tab_mvf[y * pic_width_in_min_pu + x];
+
+    if (neighbour->pred_flag[pred_flag_index] != 1)
+        return 0;
+    if (refPicList[pred_flag_index].list[neighbour->ref_idx[pred_flag_index]] !=
+        refPicList[ref_idx_curr].list[ref_idx])
+        return 0;
+
+    *mv = neighbour->mv[pred_flag_index];
+    return 1;
+}
+
+
+/*
+ * Use the neighbour at min-PU coordinates (x, y) as motion vector predictor,
+ * scaling its motion vector if needed: its list pred_flag_index must be in
+ * use and its reference must have the same long-term status as entry
+ * ref_idx of list ref_idx_curr. Vectors of short-term references go through
+ * dist_scale().
+ *
+ * Returns 1 and writes *mv on success, returns 0 and leaves *mv untouched
+ * otherwise.
+ */
+static int mv_mp_mode_mx_lt(HEVCContext *s, int x, int y, int pred_flag_index,
+                            Mv *mv, int ref_idx_curr, int ref_idx)
+{
+    MvField *tab_mvf = s->ref->tab_mvf;
+    int pic_width_in_min_pu = s->sps->width >> s->sps->log2_min_pu_size;
+
+    RefPicList *refPicList = s->ref->refPicList;
+    MvField *neighbour = &tab_mvf[y * pic_width_in_min_pu + x];
+    int currIsLongTerm;
+    int colIsLongTerm;
+
+    /* The reference index of an unused list may be -1, so it must not be
+     * used to index isLongTerm[] before the prediction flag was checked. */
+    if (!neighbour->pred_flag[pred_flag_index])
+        return 0;
+
+    currIsLongTerm = refPicList[ref_idx_curr].isLongTerm[ref_idx];
+    colIsLongTerm  =
+        refPicList[pred_flag_index].isLongTerm[neighbour->ref_idx[pred_flag_index]];
+
+    if (colIsLongTerm != currIsLongTerm)
+        return 0;
+
+    *mv = neighbour->mv[pred_flag_index];
+    if (!currIsLongTerm)
+        dist_scale(s, mv, pic_width_in_min_pu, x, y, pred_flag_index, ref_idx_curr, ref_idx);
+    return 1;
+}
+
+/* Unscaled predictor from list `pred` of the neighbour named v, stored in mx. */
+#define MP_MX(v, pred, mx) \
+    mv_mp_mode_mx(s, x##v##_pu, y##v##_pu, pred, &mx, ref_idx_curr, ref_idx)
+
+/* Same, but accepting a different reference picture and scaling the vector. */
+#define MP_MX_LT(v, pred, mx) \
+    mv_mp_mode_mx_lt(s, x##v##_pu, y##v##_pu, pred, &mx, ref_idx_curr, ref_idx)
+
+/*
+ * Derive the luma motion vector predictor for list LX of the prediction
+ * block at (x0, y0) and store it in mv->mv[LX].
+ *
+ * Up to two candidates are collected: one from the left neighbours (A0, A1),
+ * one from the upper neighbours (B0, B1, B2) and, if fewer than two distinct
+ * spatial candidates were found, the temporal one; the remaining entries are
+ * zero vectors. mvp_lx_flag selects the entry that is returned.
+ * The reference index is read from mv->ref_idx[LX]; merge_idx is not used.
+ */
+void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
+                              int nPbH, int log2_cb_size, int part_idx,
+                              int merge_idx, MvField *mv,
+                              int mvp_lx_flag, int LX)
+{
+    HEVCLocalContext *lc = &s->HEVClc;
+    MvField *tab_mvf     = s->ref->tab_mvf;
+    const int pic_width_in_min_pu = s->sps->width >> s->sps->log2_min_pu_size;
+    const int ctb_size            = 1 << s->sps->log2_ctb_size;
+    const int x0b                 = x0 & (ctb_size - 1);
+    const int y0b                 = y0 & (ctb_size - 1);
+
+    /* availability of the neighbouring areas */
+    const int cand_up   = (lc->ctb_up_flag   || y0b);
+    const int cand_left = (lc->ctb_left_flag || x0b);
+    const int cand_up_left =
+            (x0b || y0b) ? (cand_left && cand_up) : lc->ctb_up_left_flag;
+    const int cand_up_right =
+            (x0b + nPbW == ctb_size ||
+             x0  + nPbW >= lc->end_of_tiles_x) ? (lc->ctb_up_right_flag && !y0b)
+                                               : cand_up;
+    const int cand_bottom_left = (y0 + nPbH >= lc->end_of_tiles_y) ? 0 : cand_left;
+
+    const int ref_idx_curr       = LX;
+    const int ref_idx            = mv->ref_idx[LX];
+    const int pred_flag_index_l0 = LX;
+    const int pred_flag_index_l1 = !LX;
+
+    /* luma locations of the neighbours ... */
+    const int xA0 = x0 - 1;
+    const int yA0 = y0 + nPbH;
+    const int xA1 = x0 - 1;
+    const int yA1 = y0 + nPbH - 1;
+    const int xB0 = x0 + nPbW;
+    const int yB0 = y0 - 1;
+    const int xB1 = x0 + nPbW - 1;
+    const int yB1 = y0 - 1;
+    const int xB2 = x0 - 1;
+    const int yB2 = y0 - 1;
+
+    /* ... and the same in units of minimum prediction blocks */
+    const int xA0_pu = xA0 >> s->sps->log2_min_pu_size;
+    const int yA0_pu = yA0 >> s->sps->log2_min_pu_size;
+    const int xA1_pu = xA1 >> s->sps->log2_min_pu_size;
+    const int yA1_pu = yA1 >> s->sps->log2_min_pu_size;
+    const int xB0_pu = xB0 >> s->sps->log2_min_pu_size;
+    const int yB0_pu = yB0 >> s->sps->log2_min_pu_size;
+    const int xB1_pu = xB1 >> s->sps->log2_min_pu_size;
+    const int yB1_pu = yB1 >> s->sps->log2_min_pu_size;
+    const int xB2_pu = xB2 >> s->sps->log2_min_pu_size;
+    const int yB2_pu = yB2 >> s->sps->log2_min_pu_size;
+
+    int is_available_a0, is_available_a1, is_available_b0;
+    int is_available_b1 = 0;
+    int is_available_b2 = 0;
+    int isScaledFlag_L0;
+    int availableFlagLXA0  = 0;
+    int availableFlagLXB0  = 0;
+    int availableFlagLXCol = 0;
+    int numMVPCandLX       = 0;
+
+    Mv mvpcand_list[2] = { { 0 } };
+    Mv mxA     = { 0 };
+    Mv mxB     = { 0 };
+    Mv mvLXCol = { 0 };
+
+    // left bottom spatial candidate
+    is_available_a0 = AVAILABLE(cand_bottom_left, A0);
+    if (is_available_a0)
+        is_available_a0 = PRED_BLOCK_AVAILABLE(A0);
+
+    // left spatial candidate
+    is_available_a1 = AVAILABLE(cand_left, A1);
+
+    isScaledFlag_L0 = is_available_a0 || is_available_a1;
+
+    // A candidates: first without scaling (A0, then A1) ...
+    if (is_available_a0)
+        availableFlagLXA0 = MP_MX(A0, pred_flag_index_l0, mxA) ||
+                            MP_MX(A0, pred_flag_index_l1, mxA);
+
+    if (!availableFlagLXA0 && is_available_a1)
+        availableFlagLXA0 = MP_MX(A1, pred_flag_index_l0, mxA) ||
+                            MP_MX(A1, pred_flag_index_l1, mxA);
+
+    // ... then with scaling
+    if (!availableFlagLXA0 && is_available_a0)
+        availableFlagLXA0 = MP_MX_LT(A0, pred_flag_index_l0, mxA) ||
+                            MP_MX_LT(A0, pred_flag_index_l1, mxA);
+
+    if (!availableFlagLXA0 && is_available_a1)
+        availableFlagLXA0 = MP_MX_LT(A1, pred_flag_index_l0, mxA) ||
+                            MP_MX_LT(A1, pred_flag_index_l1, mxA);
+
+    // B candidates
+    // above right spatial candidate
+    is_available_b0 = AVAILABLE(cand_up_right, B0);
+    if (is_available_b0)
+        is_available_b0 = PRED_BLOCK_AVAILABLE(B0);
+
+    if (is_available_b0)
+        availableFlagLXB0 = MP_MX(B0, pred_flag_index_l0, mxB) ||
+                            MP_MX(B0, pred_flag_index_l1, mxB);
+
+    if (!availableFlagLXB0) {
+        // above spatial candidate
+        is_available_b1 = AVAILABLE(cand_up, B1);
+
+        if (is_available_b1)
+            availableFlagLXB0 = MP_MX(B1, pred_flag_index_l0, mxB) ||
+                                MP_MX(B1, pred_flag_index_l1, mxB);
+    }
+
+    if (!availableFlagLXB0) {
+        // above left spatial candidate
+        is_available_b2 = AVAILABLE(cand_up_left, B2);
+
+        if (is_available_b2)
+            availableFlagLXB0 = MP_MX(B2, pred_flag_index_l0, mxB) ||
+                                MP_MX(B2, pred_flag_index_l1, mxB);
+    }
+
+    // neither A0 nor A1 available: the unscaled B candidate takes the place
+    // of the A candidate and a scaled B candidate is searched for
+    if (!isScaledFlag_L0) {
+        if (availableFlagLXB0) {
+            availableFlagLXA0 = 1;
+            mxA = mxB;
+        }
+        availableFlagLXB0 = 0;
+
+        if (is_available_b0)
+            availableFlagLXB0 = MP_MX_LT(B0, pred_flag_index_l0, mxB) ||
+                                MP_MX_LT(B0, pred_flag_index_l1, mxB);
+
+        if (!availableFlagLXB0 && is_available_b1)
+            availableFlagLXB0 = MP_MX_LT(B1, pred_flag_index_l0, mxB) ||
+                                MP_MX_LT(B1, pred_flag_index_l1, mxB);
+
+        if (!availableFlagLXB0 && is_available_b2)
+            availableFlagLXB0 = MP_MX_LT(B2, pred_flag_index_l0, mxB) ||
+                                MP_MX_LT(B2, pred_flag_index_l1, mxB);
+    }
+
+    // temporal motion vector prediction candidate, not needed when there
+    // are already two different spatial candidates
+    if (!(availableFlagLXA0 && availableFlagLXB0 &&
+          (mxA.x != mxB.x || mxA.y != mxB.y)) &&
+        s->sh.slice_temporal_mvp_enabled_flag) {
+        availableFlagLXCol = temporal_luma_motion_vector(s, x0, y0, nPbW,
+                                                         nPbH, ref_idx, &mvLXCol, LX);
+    }
+
+    if (availableFlagLXA0)
+        mvpcand_list[numMVPCandLX++] = mxA;
+    if (availableFlagLXB0)
+        mvpcand_list[numMVPCandLX++] = mxB;
+
+    // identical spatial candidates only count once
+    if (availableFlagLXA0 && availableFlagLXB0 &&
+        mxA.x == mxB.x && mxA.y == mxB.y)
+        numMVPCandLX--;
+
+    if (availableFlagLXCol && numMVPCandLX < 2)
+        mvpcand_list[numMVPCandLX++] = mvLXCol;
+
+    // insert zero motion vectors when the number of available candidates are less than 2
+    for (; numMVPCandLX < 2; numMVPCandLX++) {
+        mvpcand_list[numMVPCandLX].x = 0;
+        mvpcand_list[numMVPCandLX].y = 0;
+    }
+
+    mv->mv[LX].x = mvpcand_list[mvp_lx_flag].x;
+    mv->mv[LX].y = mvpcand_list[mvp_lx_flag].y;
+}
diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c
new file mode 100644
index 0000000000..7e038bea0c
--- /dev/null
+++ b/libavcodec/hevc_parser.c
@@ -0,0 +1,125 @@
+/*
+ * HEVC Annex B format parser
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "parser.h"
+#include "hevc.h"
+
+#define START_CODE 0x000001 ///< start_code_prefix_one_3bytes
+
+/**
+ * Find the end of the current frame; scan state persists in s->priv_data across calls.
+ * @return the position of the first byte of the next frame, or END_NOT_FOUND
+ */
+static int hevc_find_frame_end(AVCodecParserContext *s, const uint8_t *buf, int buf_size)
+{
+    int i;
+    ParseContext *pc = s->priv_data;
+
+    for (i = 0; i < buf_size; i++) {
+        int nut;
+
+        pc->state64 = (pc->state64 << 8) | buf[i]; // rolling window of the latest bytes
+        // act only once a start code is followed by two more bytes and then buf[i]
+        if (((pc->state64 >> 3 * 8) & 0xFFFFFF) != START_CODE)
+            continue;
+        // type field: bits 1..6 of the first byte after the start code
+        nut = (pc->state64 >> 2 * 8 + 1) & 0x3F;
+        // Beginning of access unit: these types close a frame already in progress
+        if ((nut >= NAL_VPS && nut <= NAL_AUD) || nut == NAL_SEI_PREFIX ||
+            (nut >= 41 && nut <= 44) || (nut >= 48 && nut <= 55)) {
+            if (pc->frame_start_found) {
+                pc->frame_start_found = 0;
+                return i - 5; // first start-code byte; negative if it began in an earlier buffer
+            }
+        } else if (nut <= NAL_RASL_R ||
+                   (nut >= NAL_BLA_W_LP && nut <= NAL_CRA_NUT)) {
+            int first_slice_segment_in_pic_flag = buf[i] >> 7; // top bit of the third byte after the start code
+            if (first_slice_segment_in_pic_flag) {
+                if (!pc->frame_start_found) {
+                    pc->frame_start_found = 1;
+                    s->key_frame = nut >= NAL_BLA_W_LP && nut <= NAL_CRA_NUT;
+                } else { // First slice of next frame found
+                    pc->frame_start_found = 0;
+                    return i - 5;
+                }
+            }
+        }
+    }
+
+    return END_NOT_FOUND;
+}
+
+static int hevc_parse(AVCodecParserContext *s,
+                      AVCodecContext *avctx,
+                      const uint8_t **poutbuf, int *poutbuf_size,
+                      const uint8_t *buf, int buf_size)
+{
+    // Pass complete frames straight through; otherwise accumulate input up to the frame end.
+    int frame_end = buf_size;
+
+    if (!(s->flags & PARSER_FLAG_COMPLETE_FRAMES)) {
+        ParseContext *combine = s->priv_data;
+
+        frame_end = hevc_find_frame_end(s, buf, buf_size);
+        if (ff_combine_frame(combine, frame_end, &buf, &buf_size) < 0) {
+            *poutbuf      = NULL; // nothing to output for this call
+            *poutbuf_size = 0;
+            return buf_size;
+        }
+    }
+
+    *poutbuf      = buf;
+    *poutbuf_size = buf_size;
+    return frame_end;
+}
+
+// Return the offset of the first NAL unit after the leading parameter sets, or 0.
+static int hevc_split(AVCodecContext *avctx, const uint8_t *buf, int buf_size)
+{
+    uint32_t window = -1; // trailing four bytes of input
+    int seen_param_set = 0;
+    int pos;
+
+    for (pos = 0; pos < buf_size; pos++) {
+        int nal_type;
+
+        window = (window << 8) | buf[pos];
+        if (((window >> 8) & 0xFFFFFF) != START_CODE)
+            continue;
+        // buf[pos] directly follows a start code; the type is in its bits 1..6
+        nal_type = (window >> 1) & 0x3F;
+        if (nal_type >= NAL_VPS && nal_type <= NAL_PPS)
+            seen_param_set = 1;
+        else // first non-parameter-set NAL: split at its start code, if any sets preceded it
+            return seen_param_set ? pos - 3 : 0;
+    }
+    return 0;
+}
+
+AVCodecParser ff_hevc_parser = { // parser descriptor for AV_CODEC_ID_HEVC streams
+    .codec_ids      = { AV_CODEC_ID_HEVC },
+    .priv_data_size = sizeof(ParseContext), // the callbacks use priv_data as a ParseContext
+    .parser_parse   = hevc_parse,
+    .parser_close   = ff_parse_close,
+    .split          = hevc_split,
+};
diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
new file mode 100644
index 0000000000..5f0b52de65
--- /dev/null
+++ b/libavcodec/hevc_ps.c
@@ -0,0 +1,1242 @@
+/*
+ * HEVC Parameter Set Decoding
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Mickael Raulet
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ * Copyright (C) 2013 Vittorio Giovara
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "golomb.h"
+#include "libavutil/imgutils.h"
+#include "hevc.h"
+
+static const uint8_t default_scaling_list_intra[] = { // 64 defaults (8 rows of 8), copied into sl[1..2][0..2] and sl[3][0]
+    16, 16, 16, 16, 17, 18, 21, 24,
+    16, 16, 16, 16, 17, 19, 22, 25,
+    16, 16, 17, 18, 20, 22, 25, 29,
+    16, 16, 18, 21, 24, 27, 31, 36,
+    17, 17, 20, 24, 30, 35, 41, 47,
+    18, 19, 22, 27, 35, 44, 54, 65,
+    21, 22, 25, 31, 41, 54, 70, 88,
+    24, 25, 29,36, 47, 65, 88, 115
+};
+
+static const uint8_t default_scaling_list_inter[] = { // 64 defaults (8 rows of 8), copied into sl[1..2][3..5] and sl[3][1]
+    16, 16, 16, 16, 17, 18, 20, 24,
+    16, 16, 16, 17, 18, 20, 24, 25,
+    16, 16, 17, 18, 20, 24, 25, 28,
+    16, 17, 18, 20, 24, 25, 28, 33,
+    17, 18, 20, 24, 25, 28, 33, 41,
+    18, 20, 24, 25, 28, 33, 41, 54,
+    20, 24, 25, 28, 33, 41, 54, 71,
+    24, 25, 28, 33, 41, 54, 71, 91
+};
+
+static const AVRational vui_sar[] = { // sample aspect ratios, indexed by the 8-bit SAR index read in decode_vui()
+    { 0,   1  },
+    { 1,   1  },
+    { 12,  11 },
+    { 10,  11 },
+    { 16,  11 },
+    { 40,  33 },
+    { 24,  11 },
+    { 20,  11 },
+    { 32,  11 },
+    { 80,  33 },
+    { 18,  11 },
+    { 15,  11 },
+    { 64,  33 },
+    { 160, 99 },
+    { 4,   3  },
+    { 3,   2  },
+    { 2,   1  },
+};
+
+/**
+ * Parse one short-term reference picture set into rps (explicit or predicted from sps).
+ * @return 0 on success, AVERROR_INVALIDDATA if a coded value is out of range
+ */
+int ff_hevc_decode_short_term_rps(HEVCContext *s, ShortTermRPS *rps,
+                                  const HEVCSPS *sps, int is_slice_header)
+{
+    GetBitContext *gb   = &s->HEVClc.gb;
+    uint8_t rps_predict = 0;
+    int delta_poc;
+    int k0 = 0; // kept entries with a negative delta
+    int k  = 0; // kept entries in total
+    int i;
+
+    // no prediction flag for the first set of the SPS or when the SPS has no sets
+    if (rps != sps->st_rps && sps->nb_st_rps)
+        rps_predict  = get_bits1(gb);
+
+    if (rps_predict) {
+        const ShortTermRPS *rps_ridx;
+        int delta_rps, abs_delta_rps;
+        uint8_t use_delta_flag = 0;
+        uint8_t delta_rps_sign;
+
+        if (is_slice_header) {
+            int delta_idx = get_ue_golomb(gb) + 1;
+            if (delta_idx > sps->nb_st_rps) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid value of delta_idx "
+                       "in slice header RPS: %d > %d.\n", delta_idx,
+                       sps->nb_st_rps);
+                return AVERROR_INVALIDDATA;
+            }
+            rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx];
+        } else
+            rps_ridx = &sps->st_rps[rps - sps->st_rps - 1];
+
+        // the loop below stores at index k <= i <= num_delta_pocs, which must stay in bounds
+        if (rps_ridx->num_delta_pocs >= FF_ARRAY_ELEMS(rps->used)) {
+            av_log(s->avctx, AV_LOG_ERROR, "Invalid num_delta_pocs in the "
+                   "reference RPS: %d.\n", rps_ridx->num_delta_pocs);
+            return AVERROR_INVALIDDATA;
+        }
+
+        delta_rps_sign = get_bits1(gb);
+        abs_delta_rps  = get_ue_golomb(gb) + 1;
+        delta_rps      = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
+        for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
+            int used = rps->used[k] = get_bits1(gb);
+            if (!used)
+                use_delta_flag = get_bits1(gb);
+            if (used || use_delta_flag) {
+                delta_poc = delta_rps;
+                if (i < rps_ridx->num_delta_pocs)
+                    delta_poc += rps_ridx->delta_poc[i];
+                rps->delta_poc[k] = delta_poc;
+                k0 += delta_poc < 0;
+                k++;
+            }
+        }
+        rps->num_delta_pocs    = k;
+        rps->num_negative_pics = k0;
+        // sort in increasing order (smallest first)
+        if (rps->num_delta_pocs != 0) {
+            int used, tmp;
+            for (i = 1; i < rps->num_delta_pocs; i++) {
+                delta_poc = rps->delta_poc[i];
+                used      = rps->used[i];
+                for (k = i-1 ; k >= 0;  k--) {
+                    tmp = rps->delta_poc[k];
+                    if (delta_poc < tmp ) {
+                        rps->delta_poc[k+1] = tmp;
+                        rps->used[k+1]      = rps->used[k];
+                        rps->delta_poc[k]   = delta_poc;
+                        rps->used[k]        = used;
+                    }
+                }
+            }
+        }
+        if ((rps->num_negative_pics >> 1) != 0) {
+            int used;
+            k = rps->num_negative_pics - 1;
+            // flip the negative values to largest first
+            for (i = 0; i < rps->num_negative_pics>>1; i++) {
+                delta_poc          = rps->delta_poc[i];
+                used               = rps->used[i];
+                rps->delta_poc[i]  = rps->delta_poc[k];
+                rps->used[i]       = rps->used[k];
+                rps->delta_poc[k]  = delta_poc;
+                rps->used[k]       = used;
+                k--;
+            }
+        }
+    } else {
+        unsigned int prev, nb_positive_pics;
+        rps->num_negative_pics = get_ue_golomb(gb);
+        nb_positive_pics       = get_ue_golomb(gb);
+        if (rps->num_negative_pics >= MAX_REFS ||
+            nb_positive_pics >= MAX_REFS) {
+            av_log(s->avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n");
+            return AVERROR_INVALIDDATA;
+        }
+        rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics;
+        if (rps->num_delta_pocs) {
+            prev = 0;
+            for (i = 0; i < rps->num_negative_pics; i++) {
+                delta_poc = get_ue_golomb(gb) + 1;
+                prev -= delta_poc;
+                rps->delta_poc[i] = prev;
+                rps->used[i] = get_bits1(gb);
+            }
+            prev = 0;
+            for (i = 0; i < nb_positive_pics; i++) {
+                delta_poc = get_ue_golomb(gb) + 1;
+                prev += delta_poc;
+                rps->delta_poc[rps->num_negative_pics + i] = prev;
+                rps->used[rps->num_negative_pics + i] = get_bits1(gb);
+            }
+        }
+    }
+    return 0;
+}
+
+static int decode_profile_tier_level(HEVCLocalContext *lc, PTL *ptl, int max_num_sub_layers)
+{ // Reads the general fields, then the per-sub-layer ones, into ptl; returns 0, or -1 if reserved bits are non-zero.
+    int i, j;
+    GetBitContext *gb = &lc->gb;
+
+    ptl->general_profile_space = get_bits(gb, 2);
+    ptl->general_tier_flag = get_bits1(gb);
+    ptl->general_profile_idc = get_bits(gb, 5);
+    for (i = 0; i < 32; i++)
+        ptl->general_profile_compatibility_flag[i] = get_bits1(gb);
+    skip_bits1(gb);// general_progressive_source_flag
+    skip_bits1(gb);// general_interlaced_source_flag
+    skip_bits1(gb);// general_non_packed_constraint_flag
+    skip_bits1(gb);// general_frame_only_constraint_flag
+    if (get_bits(gb, 16) != 0) // XXX_reserved_zero_44bits[0..15]
+        return -1;
+    if (get_bits(gb, 16) != 0) // XXX_reserved_zero_44bits[16..31]
+        return -1;
+    if (get_bits(gb, 12) != 0) // XXX_reserved_zero_44bits[32..43]
+        return -1;
+
+    ptl->general_level_idc = get_bits(gb, 8);
+    for (i = 0; i < max_num_sub_layers - 1; i++) { // all presence flags come first, payloads follow below
+        ptl->sub_layer_profile_present_flag[i] = get_bits1(gb);
+        ptl->sub_layer_level_present_flag[i] = get_bits1(gb);
+    }
+    if (max_num_sub_layers - 1 > 0) // unused 2-bit flag slots up to index 7 are skipped
+        for (i = max_num_sub_layers - 1; i < 8; i++)
+            skip_bits(gb, 2); // reserved_zero_2bits[i]
+    for (i = 0; i < max_num_sub_layers - 1; i++) {
+        if (ptl->sub_layer_profile_present_flag[i]) {
+            ptl->sub_layer_profile_space[i] = get_bits(gb, 2);
+            ptl->sub_layer_tier_flag[i] = get_bits(gb, 1);
+            ptl->sub_layer_profile_idc[i] = get_bits(gb, 5);
+            for (j = 0; j < 32; j++)
+                ptl->sub_layer_profile_compatibility_flags[i][j] = get_bits1(gb);
+            skip_bits1(gb);// sub_layer_progressive_source_flag
+            skip_bits1(gb);// sub_layer_interlaced_source_flag
+            skip_bits1(gb);// sub_layer_non_packed_constraint_flag
+            skip_bits1(gb);// sub_layer_frame_only_constraint_flag
+
+            if (get_bits(gb, 16) != 0) // sub_layer_reserved_zero_44bits[0..15]
+                return -1;
+            if (get_bits(gb, 16) != 0) // sub_layer_reserved_zero_44bits[16..31]
+                return -1;
+            if (get_bits(gb, 12) != 0) // sub_layer_reserved_zero_44bits[32..43]
+                return -1;
+        }
+        if (ptl->sub_layer_level_present_flag[i])
+            ptl->sub_layer_level_idc[i] = get_bits(gb, 8);
+    }
+    return 0;
+}
+
+static void decode_hrd(HEVCContext *s)
+{ // Stub: only logs and reads no bits. NOTE(review): decode_vui() keeps parsing afterwards, presumably misaligned.
+    av_log(s->avctx, AV_LOG_ERROR, "HRD parsing not yet implemented\n");
+}
+
+int ff_hevc_decode_nal_vps(HEVCContext *s)
+{
+    int i,j;
+    GetBitContext *gb = &s->HEVClc.gb;
+    int vps_id = 0;
+    VPS *vps;
+    // Parse a VPS into a fresh allocation; s->vps_list[] is only updated once parsing succeeds.
+    av_log(s->avctx, AV_LOG_DEBUG, "Decoding VPS\n");
+
+    vps = av_mallocz(sizeof(*vps));
+    if (!vps)
+        return AVERROR(ENOMEM);
+
+    vps_id = get_bits(gb, 4);
+    if (vps_id >= MAX_VPS_COUNT) {
+        av_log(s->avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id);
+        goto err;
+    }
+
+    if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits
+        av_log(s->avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n");
+        goto err;
+    }
+
+    vps->vps_max_layers               = get_bits(gb, 6) + 1;
+    vps->vps_max_sub_layers           = get_bits(gb, 3) + 1;
+    vps->vps_temporal_id_nesting_flag = get_bits1(gb);
+
+    if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits
+        av_log(s->avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n");
+        goto err;
+    }
+
+    if (vps->vps_max_sub_layers > MAX_SUB_LAYERS) {
+        av_log(s->avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n",
+               vps->vps_max_sub_layers);
+        goto err;
+    }
+
+    if (decode_profile_tier_level(&s->HEVClc, &vps->ptl, vps->vps_max_sub_layers) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Error decoding profile tier level.\n");
+        goto err;
+    }
+    vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
+    // without the flag only the entry of the highest sub-layer is coded
+    i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_sub_layers - 1;
+    for (; i < vps->vps_max_sub_layers; i++) {
+        vps->vps_max_dec_pic_buffering[i] = get_ue_golomb(gb);
+        vps->vps_num_reorder_pics[i]      = get_ue_golomb(gb);
+        vps->vps_max_latency_increase[i]  = get_ue_golomb(gb);
+
+        if (vps->vps_max_dec_pic_buffering[i] >= MAX_DPB_SIZE) {
+            av_log(s->avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
+                   vps->vps_max_dec_pic_buffering[i] - 1);
+            goto err;
+        }
+        if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i]) {
+            av_log(s->avctx, AV_LOG_ERROR, "vps_max_num_reorder_pics out of range: %d\n",
+                   vps->vps_num_reorder_pics[i]);
+            goto err;
+        }
+    }
+    // the per-layer-set inclusion flags below are skipped, not stored
+    vps->vps_max_layer_id   = get_bits(gb, 6);
+    vps->vps_num_layer_sets = get_ue_golomb(gb) + 1;
+    for (i = 1; i < vps->vps_num_layer_sets; i++)
+        for (j = 0; j <= vps->vps_max_layer_id; j++)
+            skip_bits(gb, 1); // layer_id_included_flag[i][j]
+
+    vps->vps_timing_info_present_flag = get_bits1(gb);
+    if (vps->vps_timing_info_present_flag) {
+        vps->vps_num_units_in_tick               = get_bits_long(gb, 32);
+        vps->vps_time_scale                      = get_bits_long(gb, 32);
+        vps->vps_poc_proportional_to_timing_flag = get_bits1(gb);
+        if (vps->vps_poc_proportional_to_timing_flag)
+            vps->vps_num_ticks_poc_diff_one = get_ue_golomb(gb) + 1;
+        vps->vps_num_hrd_parameters = get_ue_golomb(gb);
+        if (vps->vps_num_hrd_parameters != 0) {
+            avpriv_report_missing_feature(s->avctx, "support for vps_num_hrd_parameters != 0");
+            av_free(vps);
+            return AVERROR_PATCHWELCOME;
+        }
+    }
+    get_bits1(gb); /* vps_extension_flag */
+    // success: the new VPS replaces any previous one stored under the same id
+    av_free(s->vps_list[vps_id]);
+    s->vps_list[vps_id] = vps;
+    return 0;
+
+err:
+    av_free(vps);
+    return AVERROR_INVALIDDATA;
+}
+
+static void decode_vui(HEVCContext *s, HEVCSPS *sps)
+{
+    VUI *vui = &sps->vui;
+    GetBitContext *gb = &s->HEVClc.gb;
+    int sar_present;
+    // Parse the VUI fields from the bitstream into sps->vui, in coded order.
+    av_log(s->avctx, AV_LOG_DEBUG, "Decoding VUI\n");
+
+    sar_present = get_bits1(gb);
+    if (sar_present) {
+        uint8_t sar_idx = get_bits(gb, 8);
+        if (sar_idx < FF_ARRAY_ELEMS(vui_sar))
+            vui->sar = vui_sar[sar_idx];
+        else if (sar_idx == 255) { // explicit numerator/denominator follow
+            vui->sar.num = get_bits(gb, 16);
+            vui->sar.den = get_bits(gb, 16);
+        } else
+            av_log(s->avctx, AV_LOG_WARNING, "Unknown SAR index: %u.\n",
+                   sar_idx);
+    }
+
+    vui->overscan_info_present_flag = get_bits1(gb);
+    if (vui->overscan_info_present_flag)
+        vui->overscan_appropriate_flag = get_bits1(gb);
+
+    vui->video_signal_type_present_flag = get_bits1(gb);
+    if (vui->video_signal_type_present_flag) {
+        vui->video_format                    = get_bits(gb, 3);
+        vui->video_full_range_flag           = get_bits1(gb);
+        vui->colour_description_present_flag = get_bits1(gb);
+        if (vui->colour_description_present_flag) {
+            vui->colour_primaries        = get_bits(gb, 8);
+            vui->transfer_characteristic = get_bits(gb, 8);
+            vui->matrix_coeffs           = get_bits(gb, 8);
+        }
+    }
+
+    vui->chroma_loc_info_present_flag = get_bits1(gb);
+    if (vui->chroma_loc_info_present_flag) {
+        vui->chroma_sample_loc_type_top_field    = get_ue_golomb(gb);
+        vui->chroma_sample_loc_type_bottom_field = get_ue_golomb(gb);
+    }
+
+    vui->neutra_chroma_indication_flag = get_bits1(gb);
+    vui->field_seq_flag                = get_bits1(gb);
+    vui->frame_field_info_present_flag = get_bits1(gb);
+
+    vui->default_display_window_flag = get_bits1(gb);
+    if (vui->default_display_window_flag) {
+        //TODO: * 2 is only valid for 420
+        vui->def_disp_win.left_offset   = get_ue_golomb(gb) * 2;
+        vui->def_disp_win.right_offset  = get_ue_golomb(gb) * 2;
+        vui->def_disp_win.top_offset    = get_ue_golomb(gb) * 2;
+        vui->def_disp_win.bottom_offset = get_ue_golomb(gb) * 2;
+
+        if (s->strict_def_disp_win &&
+            s->avctx->flags2 & CODEC_FLAG2_IGNORE_CROP) {
+            av_log(s->avctx, AV_LOG_DEBUG,
+                   "discarding vui default display window, "
+                   "original values are l:%u r:%u t:%u b:%u\n",
+                   vui->def_disp_win.left_offset,
+                   vui->def_disp_win.right_offset,
+                   vui->def_disp_win.top_offset,
+                   vui->def_disp_win.bottom_offset);
+
+            vui->def_disp_win.left_offset   =
+            vui->def_disp_win.right_offset  =
+            vui->def_disp_win.top_offset    =
+            vui->def_disp_win.bottom_offset = 0;
+        }
+    }
+    // the two 32-bit fields need get_bits_long(): get_bits() handles at most 25 bits
+    vui->vui_timing_info_present_flag = get_bits1(gb);
+    if (vui->vui_timing_info_present_flag) {
+        vui->vui_num_units_in_tick               = get_bits_long(gb, 32);
+        vui->vui_time_scale                      = get_bits_long(gb, 32);
+        vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
+        if (vui->vui_poc_proportional_to_timing_flag)
+            vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb(gb);
+        vui->vui_hrd_parameters_present_flag = get_bits1(gb);
+        if (vui->vui_hrd_parameters_present_flag)
+            decode_hrd(s); // currently a stub that only logs
+    }
+
+    vui->bitstream_restriction_flag = get_bits1(gb);
+    if (vui->bitstream_restriction_flag) {
+        vui->tiles_fixed_structure_flag              = get_bits1(gb);
+        vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb);
+        vui->restricted_ref_pic_lists_flag           = get_bits1(gb);
+        vui->min_spatial_segmentation_idc            = get_ue_golomb(gb);
+        vui->max_bytes_per_pic_denom                 = get_ue_golomb(gb);
+        vui->max_bits_per_min_cu_denom               = get_ue_golomb(gb);
+        vui->log2_max_mv_length_horizontal           = get_ue_golomb(gb);
+        vui->log2_max_mv_length_vertical             = get_ue_golomb(gb);
+    }
+}
+
+// Load the built-in default scaling lists into sl.
+static void set_default_scaling_list_data(ScalingList *sl)
+{
+    int size_id, matrix_id;
+
+    // 4x4 lists are a flat 16, as are the DC terms (16x16 and 32x32)
+    for (matrix_id = 0; matrix_id < 6; matrix_id++) {
+        memset(sl->sl[0][matrix_id], 16, 16);
+        sl->sl_dc[0][matrix_id] = 16;
+        sl->sl_dc[1][matrix_id] = 16;
+    }
+
+    // size ids 1 and 2: matrices 0..2 use the intra table, 3..5 the inter one
+    for (size_id = 1; size_id <= 2; size_id++) {
+        for (matrix_id = 0; matrix_id < 6; matrix_id++) {
+            const uint8_t *defaults = matrix_id < 3 ? default_scaling_list_intra
+                                                    : default_scaling_list_inter;
+            memcpy(sl->sl[size_id][matrix_id], defaults, 64);
+        }
+    }
+
+    // size id 3 holds just two matrices: one intra, one inter
+    memcpy(sl->sl[3][0], default_scaling_list_intra, 64);
+    memcpy(sl->sl[3][1], default_scaling_list_inter, 64);
+}
+
+static int scaling_list_data(HEVCContext *s, ScalingList *sl)
+{
+    GetBitContext *gb = &s->HEVClc.gb;
+    uint8_t scaling_list_pred_mode_flag[4][6];
+    int32_t scaling_list_dc_coef[2][6];
+    // Parse coded scaling lists into sl; lists that are not re-coded keep what sl already holds.
+    int size_id, matrix_id, i, pos, delta;
+    for (size_id = 0; size_id < 4; size_id++)
+        for (matrix_id = 0; matrix_id < ((size_id == 3) ? 2 : 6); matrix_id++) {
+            scaling_list_pred_mode_flag[size_id][matrix_id] = get_bits1(gb);
+            if (!scaling_list_pred_mode_flag[size_id][matrix_id]) {
+                delta = get_ue_golomb(gb);
+                // Only need to handle non-zero delta. Zero means default, which should already be in the arrays.
+                if (delta) {
+                    // Copy from previous array.
+                    if (matrix_id - delta < 0) {
+                        av_log(s->avctx, AV_LOG_ERROR,
+                               "Invalid delta in scaling list data: %d.\n", delta);
+                        return AVERROR_INVALIDDATA;
+                    }
+
+                    memcpy(sl->sl[size_id][matrix_id],
+                           sl->sl[size_id][matrix_id - delta],
+                           size_id > 0 ? 64 : 16);
+                    if (size_id > 1) // DC terms are kept only for size ids 2 and 3
+                        sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta];
+                }
+            } else {
+                int next_coef;
+                int coef_num;
+                int32_t scaling_list_delta_coef;
+
+                next_coef = 8;
+                coef_num = FFMIN(64, (1  <<  (4 + (size_id  <<  1)))); // 16 for size_id 0, otherwise 64
+                if (size_id > 1) {
+                    scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8;
+                    next_coef = scaling_list_dc_coef[size_id - 2][matrix_id];
+                    sl->sl_dc[size_id - 2][matrix_id] = next_coef;
+                }
+                for (i = 0; i < coef_num; i++) {
+                    if (size_id == 0) // map scan index i to a row-major position via the diag scan tables
+                        pos = 4 * ff_hevc_diag_scan4x4_y[i] + ff_hevc_diag_scan4x4_x[i];
+                    else
+                        pos = 8 * ff_hevc_diag_scan8x8_y[i] + ff_hevc_diag_scan8x8_x[i];
+
+                    scaling_list_delta_coef = get_se_golomb(gb);
+                    next_coef = (next_coef + scaling_list_delta_coef + 256 ) % 256; // deltas accumulate, wrapped modulo 256
+                    sl->sl[size_id][matrix_id][pos] = next_coef;
+                }
+            }
+        }
+
+    return 0;
+}
+
+int ff_hevc_decode_nal_sps(HEVCContext *s)
+{
+    const AVPixFmtDescriptor *desc;
+    GetBitContext *gb = &s->HEVClc.gb;
+    int ret    = 0;
+    int sps_id = 0;
+    int log2_diff_max_min_transform_block_size;
+    int bit_depth_chroma, start, vui_present, sublayer_ordering_info;
+    int i;
+
+    HEVCSPS *sps;
+    AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps));
+
+    if (!sps_buf)
+        return AVERROR(ENOMEM);
+    sps = (HEVCSPS*)sps_buf->data;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "Decoding SPS\n");
+
+    // Coded parameters
+
+    sps->vps_id = get_bits(gb, 4);
+    if (sps->vps_id >= MAX_VPS_COUNT) {
+        av_log(s->avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    sps->max_sub_layers = get_bits(gb, 3) + 1;
+    if (sps->max_sub_layers > MAX_SUB_LAYERS) {
+        av_log(s->avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n",
+               sps->max_sub_layers);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    skip_bits1(gb); // temporal_id_nesting_flag
+    if (decode_profile_tier_level(&s->HEVClc, &sps->ptl, sps->max_sub_layers) < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "error decoding profile tier level\n");
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+    sps_id = get_ue_golomb(gb);
+    if (sps_id >= MAX_SPS_COUNT) {
+        av_log(s->avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", sps_id);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    sps->chroma_format_idc = get_ue_golomb(gb);
+    if (sps->chroma_format_idc != 1) {
+        avpriv_report_missing_feature(s->avctx, "chroma_format_idc != 1\n");
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    if (sps->chroma_format_idc == 3)
+        sps->separate_colour_plane_flag = get_bits1(gb);
+
+    sps->width  = get_ue_golomb(gb);
+    sps->height = get_ue_golomb(gb);
+    if ((ret = av_image_check_size(sps->width,
+                                   sps->height, 0, s->avctx)) < 0)
+        goto err;
+
+    if (get_bits1(gb)) { // pic_conformance_flag
+        //TODO: * 2 is only valid for 420
+        sps->pic_conf_win.left_offset   = get_ue_golomb(gb) * 2;
+        sps->pic_conf_win.right_offset  = get_ue_golomb(gb) * 2;
+        sps->pic_conf_win.top_offset    = get_ue_golomb(gb) * 2;
+        sps->pic_conf_win.bottom_offset = get_ue_golomb(gb) * 2;
+
+        if (s->avctx->flags2 & CODEC_FLAG2_IGNORE_CROP) {
+            av_log(s->avctx, AV_LOG_DEBUG,
+                   "discarding sps conformance window, "
+                   "original values are l:%u r:%u t:%u b:%u\n",
+                   sps->pic_conf_win.left_offset,
+                   sps->pic_conf_win.right_offset,
+                   sps->pic_conf_win.top_offset,
+                   sps->pic_conf_win.bottom_offset);
+
+            sps->pic_conf_win.left_offset   =
+            sps->pic_conf_win.right_offset  =
+            sps->pic_conf_win.top_offset    =
+            sps->pic_conf_win.bottom_offset = 0;
+        }
+        sps->output_window = sps->pic_conf_win;
+    }
+
+    sps->bit_depth   = get_ue_golomb(gb) + 8;
+    bit_depth_chroma = get_ue_golomb(gb) + 8;
+    if (bit_depth_chroma != sps->bit_depth) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "Luma bit depth (%d) is different from chroma bit depth (%d), this is unsupported.\n",
+               sps->bit_depth, bit_depth_chroma);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    if (sps->chroma_format_idc == 1) {
+        switch (sps->bit_depth) {
+        case 8:  sps->pix_fmt = PIX_FMT_YUV420P;   break;
+        case 9:  sps->pix_fmt = PIX_FMT_YUV420P9;  break;
+        case 10: sps->pix_fmt = PIX_FMT_YUV420P10; break;
+        default:
+            av_log(s->avctx, AV_LOG_ERROR, "Unsupported bit depth: %d\n",
+                   sps->bit_depth);
+            ret = AVERROR_PATCHWELCOME;
+            goto err;
+        }
+    } else {
+        av_log(s->avctx, AV_LOG_ERROR, "non-4:2:0 support is currently unspecified.\n");
+        return AVERROR_PATCHWELCOME;
+    }
+
+    desc = av_pix_fmt_desc_get(sps->pix_fmt);
+    if (!desc) {
+        ret = AVERROR(EINVAL);
+        goto err;
+    }
+
+    sps->hshift[0] = sps->vshift[0] = 0;
+    sps->hshift[2] = sps->hshift[1] = desc->log2_chroma_w;
+    sps->vshift[2] = sps->vshift[1] = desc->log2_chroma_h;
+
+    sps->pixel_shift = sps->bit_depth > 8;
+
+    sps->log2_max_poc_lsb = get_ue_golomb(gb) + 4;
+    if (sps->log2_max_poc_lsb > 16) {
+        av_log(s->avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
+               sps->log2_max_poc_lsb - 4);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    sublayer_ordering_info = get_bits1(gb);
+    start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1;
+    for (i = start; i < sps->max_sub_layers; i++) {
+        sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb(gb);
+        sps->temporal_layer[i].num_reorder_pics      = get_ue_golomb(gb);
+        sps->temporal_layer[i].max_latency_increase  = get_ue_golomb(gb);
+        if (sps->temporal_layer[i].max_dec_pic_buffering >= MAX_DPB_SIZE) {
+            av_log(s->avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
+                   sps->temporal_layer[i].max_dec_pic_buffering - 1);
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
+        if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering) {
+            av_log(s->avctx, AV_LOG_ERROR, "sps_max_num_reorder_pics out of range: %d\n",
+                   sps->temporal_layer[i].num_reorder_pics);
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
+    }
+
+    if (!sublayer_ordering_info) {
+        for (i = 0; i < start; i++){
+            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering;
+            sps->temporal_layer[i].num_reorder_pics      = sps->temporal_layer[start].num_reorder_pics;
+            sps->temporal_layer[i].max_latency_increase  = sps->temporal_layer[start].max_latency_increase;
+        }
+    }
+
+    sps->log2_min_coding_block_size             = get_ue_golomb(gb) + 3;
+    sps->log2_diff_max_min_coding_block_size    = get_ue_golomb(gb);
+    sps->log2_min_transform_block_size          = get_ue_golomb(gb) + 2;
+    log2_diff_max_min_transform_block_size      = get_ue_golomb(gb);
+    sps->log2_max_trafo_size                    = log2_diff_max_min_transform_block_size + sps->log2_min_transform_block_size;
+
+    if (sps->log2_min_transform_block_size >= sps->log2_min_coding_block_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid value for log2_min_transform_block_size");
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+    sps->max_transform_hierarchy_depth_inter = get_ue_golomb(gb);
+    sps->max_transform_hierarchy_depth_intra = get_ue_golomb(gb);
+
+    sps->scaling_list_enable_flag = get_bits1(gb);
+    if (sps->scaling_list_enable_flag) {
+        set_default_scaling_list_data(&sps->scaling_list);
+
+        if (get_bits1(gb)) {
+            ret = scaling_list_data(s, &sps->scaling_list);
+            if (ret < 0)
+                goto err;
+        }
+    }
+
+    sps->amp_enabled_flag = get_bits1(gb);
+    sps->sao_enabled      = get_bits1(gb);
+
+    sps->pcm_enabled_flag = get_bits1(gb);
+    if (sps->pcm_enabled_flag) {
+        int pcm_bit_depth_chroma;
+        sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
+        pcm_bit_depth_chroma = get_bits(gb, 4) + 1;
+        if (pcm_bit_depth_chroma != sps->pcm.bit_depth) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "PCM Luma bit depth (%d) is different from PCM chroma"\
+                   "bit depth (%d), this is unsupported.\n",
+                   sps->pcm.bit_depth, pcm_bit_depth_chroma);
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
+
+        sps->pcm.log2_min_pcm_cb_size = get_ue_golomb(gb) + 3;
+        sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size +
+                                        get_ue_golomb(gb);
+        if (sps->pcm.bit_depth > sps->bit_depth) {
+            av_log(s->avctx, AV_LOG_ERROR,
+                   "PCM bit depth (%d) is greater than normal bit depth (%d)\n",
+                   sps->pcm.bit_depth, sps->bit_depth);
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
+
+        sps->pcm.loop_filter_disable_flag = get_bits1(gb);
+    }
+
+    sps->nb_st_rps = get_ue_golomb(gb);
+    if (sps->nb_st_rps > MAX_SHORT_TERM_RPS_COUNT) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
+               sps->nb_st_rps);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+    for (i = 0; i < sps->nb_st_rps; i++) {
+        if ((ret = ff_hevc_decode_short_term_rps(s, &sps->st_rps[i],
+                                                 sps, 0)) < 0)
+            goto err;
+    }
+
+    sps->long_term_ref_pics_present_flag = get_bits1(gb);
+    if (sps->long_term_ref_pics_present_flag) {
+        sps->num_long_term_ref_pics_sps = get_ue_golomb(gb);
+        for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
+            sps->lt_ref_pic_poc_lsb_sps[i]       = get_bits(gb, sps->log2_max_poc_lsb);
+            sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
+        }
+    }
+
+    sps->sps_temporal_mvp_enabled_flag          = get_bits1(gb);
+    sps->sps_strong_intra_smoothing_enable_flag = get_bits1(gb);
+    vui_present = get_bits1(gb);
+    if (vui_present)
+        decode_vui(s, sps);
+    skip_bits1(gb); // sps_extension_flag
+
+    if (s->strict_def_disp_win) {
+        sps->output_window.left_offset   += sps->vui.def_disp_win.left_offset;
+        sps->output_window.right_offset  += sps->vui.def_disp_win.right_offset;
+        sps->output_window.top_offset    += sps->vui.def_disp_win.top_offset;
+        sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset;
+    }
+    if (sps->output_window.left_offset & (0x1F >> (sps->pixel_shift)) &&
+        !(s->avctx->flags & CODEC_FLAG_UNALIGNED)) {
+        sps->output_window.left_offset &= ~(0x1F >> (sps->pixel_shift));
+        av_log(s->avctx, AV_LOG_WARNING, "Reducing left output window to %d "
+               "chroma samples to preserve alignment.\n",
+               sps->output_window.left_offset);
+    }
+    sps->output_width  = sps->width -
+                         (sps->output_window.left_offset + sps->output_window.right_offset);
+    sps->output_height = sps->height -
+                         (sps->output_window.top_offset + sps->output_window.bottom_offset);
+    if (sps->output_width <= 0 || sps->output_height <= 0) {
+        av_log(s->avctx, AV_LOG_WARNING, "Invalid visible frame dimensions: %dx%d.\n",
+               sps->output_width, sps->output_height);
+        if (s->avctx->err_recognition & AV_EF_EXPLODE) {
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
+        av_log(s->avctx, AV_LOG_WARNING, "Displaying the whole video surface.\n");
+        sps->pic_conf_win.left_offset   =
+        sps->pic_conf_win.right_offset  =
+        sps->pic_conf_win.top_offset    =
+        sps->pic_conf_win.bottom_offset = 0;
+        sps->output_width  = sps->width;
+        sps->output_height = sps->height;
+    }
+
+    // Inferred parameters
+    sps->log2_ctb_size     = sps->log2_min_coding_block_size
+                             + sps->log2_diff_max_min_coding_block_size;
+
+    sps->ctb_width         = (sps->width  + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
+    sps->ctb_height        = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
+    sps->ctb_size          = sps->ctb_width * sps->ctb_height;
+
+    sps->min_cb_width      = sps->width  >> sps->log2_min_coding_block_size;
+    sps->min_cb_height     = sps->height >> sps->log2_min_coding_block_size;
+    sps->min_tb_width      = sps->width  >> sps->log2_min_transform_block_size;
+    sps->min_tb_height     = sps->height >> sps->log2_min_transform_block_size;
+    sps->log2_min_pu_size  = sps->log2_min_coding_block_size - 1;
+
+    sps->qp_bd_offset      = 6 * (sps->bit_depth - 8);
+
+    if (sps->width  & ((1 << sps->log2_min_coding_block_size) - 1) ||
+        sps->height & ((1 << sps->log2_min_coding_block_size) - 1)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
+        goto err;
+    }
+
+    if (sps->log2_ctb_size > MAX_LOG2_CTB_SIZE) {
+        av_log(s->avctx, AV_LOG_ERROR, "CTB size out of range: 2^%d\n", sps->log2_ctb_size);
+        goto err;
+    }
+    if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_transform_block_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
+               sps->max_transform_hierarchy_depth_inter);
+        goto err;
+    }
+    if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_transform_block_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
+               sps->max_transform_hierarchy_depth_intra);
+        goto err;
+    }
+    if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
+        av_log(s->avctx, AV_LOG_ERROR, "max transform block size out of range: %d\n",
+               sps->log2_max_trafo_size);
+        goto err;
+    }
+
+    /* if an SPS with this id but different dimensions already exists, remove
+     * all PPSes that depend on it */
+#define DIFF(x) (sps->x != ((HEVCSPS*)s->sps_list[sps_id]->data)->x)
+    if (s->sps_list[sps_id] &&
+        (DIFF(width) || DIFF(height) || DIFF(chroma_format_idc) ||
+         DIFF(bit_depth) || DIFF(ctb_width) || DIFF(ctb_height))) {
+        for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++) {
+            if (s->pps_list[i] && ((HEVCPPS*)s->pps_list[i]->data)->sps_id == sps_id)
+                av_buffer_unref(&s->pps_list[i]);
+        }
+    }
+#undef DIFF
+
+    av_buffer_unref(&s->sps_list[sps_id]);
+    s->sps_list[sps_id] = sps_buf;
+
+    if (s->avctx->debug & FF_DEBUG_BITSTREAM) {
+        av_log(s->avctx, AV_LOG_DEBUG, "Parsed SPS: id %d; coded wxh: %dx%d; "
+               "cropped wxh: %dx%d; pix_fmt: %s.\n",
+               sps_id, sps->width, sps->height,
+               sps->output_width, sps->output_height,
+               av_get_pix_fmt_name(sps->pix_fmt));
+    }
+
+    return 0;
+err:
+
+    av_buffer_unref(&sps_buf);
+    return ret;
+}
+
+static void hevc_pps_free(void *opaque, uint8_t *data)
+{
+    /* free callback handed to av_buffer_create(); data is the HEVCPPS, opaque is unused */
+    HEVCPPS *p = (HEVCPPS*)data;
+    /* the tables owned by the PPS go first, the struct that holds them last */
+    av_freep(&p->min_tb_addr_zs);
+    av_freep(&p->min_cb_addr_zs);
+    av_freep(&p->tile_id);
+    av_freep(&p->tile_pos_rs);
+    av_freep(&p->ctb_addr_ts_to_rs);
+    av_freep(&p->ctb_addr_rs_to_ts);
+    av_freep(&p->col_idxX);
+    av_freep(&p->row_bd);
+    av_freep(&p->col_bd);
+    av_freep(&p->row_height);
+    av_freep(&p->column_width);
+    av_freep(&p);
+}
+
+int ff_hevc_decode_nal_pps(HEVCContext *s)
+{
+    GetBitContext *gb = &s->HEVClc.gb;
+    HEVCSPS      *sps = NULL;
+    int pic_area_in_ctbs, pic_area_in_min_cbs, pic_area_in_min_tbs;
+    int log2_diff_ctb_min_tb_size;
+    int i, j, x, y, ctb_addr_rs, tile_id;
+    int ret    = 0;
+    int pps_id = 0;
+    /* Parse a PPS from s->HEVClc.gb and, on success, store it as s->pps_list[pps_id]. */
+    AVBufferRef *pps_buf;
+    HEVCPPS *pps = av_mallocz(sizeof(*pps));
+    /* Returns 0 on success and a negative error code on failure. */
+    if (!pps)
+        return AVERROR(ENOMEM);
+    /* pps_buf owns pps from here on; hevc_pps_free() is its free callback */
+    pps_buf = av_buffer_create((uint8_t*)pps, sizeof(*pps), hevc_pps_free, NULL, 0);
+    if (!pps_buf) {
+        av_freep(&pps);
+        return AVERROR(ENOMEM);
+    }
+
+    av_log(s->avctx, AV_LOG_DEBUG, "Decoding PPS\n");
+
+    // Default values, kept unless the corresponding syntax elements are coded
+    pps->loop_filter_across_tiles_enabled_flag = 1;
+    pps->num_tile_columns                      = 1;
+    pps->num_tile_rows                         = 1;
+    pps->uniform_spacing_flag                  = 1;
+    pps->pps_disable_deblocking_filter_flag    = 0;
+    pps->beta_offset                           = 0;
+    pps->tc_offset                             = 0;
+
+    // Coded parameters
+    pps_id = get_ue_golomb(gb);
+    if (pps_id >= MAX_PPS_COUNT) {
+        av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+    pps->sps_id = get_ue_golomb(gb);
+    if (pps->sps_id >= MAX_SPS_COUNT) {
+        av_log(s->avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+    if (!s->sps_list[pps->sps_id]) {
+        av_log(s->avctx, AV_LOG_ERROR, "SPS does not exist \n");
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+    sps = (HEVCSPS*)s->sps_list[pps->sps_id]->data;
+
+    pps->dependent_slice_segments_enabled_flag = get_bits1(gb);
+    pps->output_flag_present_flag              = get_bits1(gb);
+    pps->num_extra_slice_header_bits           = get_bits(gb, 3);
+
+    pps->sign_data_hiding_flag = get_bits1(gb);
+
+    pps->cabac_init_present_flag = get_bits1(gb);
+
+    pps->num_ref_idx_l0_default_active = get_ue_golomb(gb) + 1;
+    pps->num_ref_idx_l1_default_active = get_ue_golomb(gb) + 1;
+
+    pps->pic_init_qp_minus26 = get_se_golomb(gb);
+
+    pps->constrained_intra_pred_flag = get_bits1(gb);
+    pps->transform_skip_enabled_flag = get_bits1(gb);
+
+    pps->cu_qp_delta_enabled_flag = get_bits1(gb);
+    pps->diff_cu_qp_delta_depth   = 0;
+    if (pps->cu_qp_delta_enabled_flag)
+        pps->diff_cu_qp_delta_depth = get_ue_golomb(gb);
+
+    pps->cb_qp_offset = get_se_golomb(gb);
+    if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
+        av_log(s->avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
+               pps->cb_qp_offset);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+    pps->cr_qp_offset = get_se_golomb(gb);
+    if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) {
+        av_log(s->avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n",
+               pps->cr_qp_offset);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+    pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb);
+
+    pps->weighted_pred_flag   = get_bits1(gb);
+    pps->weighted_bipred_flag = get_bits1(gb);
+
+    pps->transquant_bypass_enable_flag    = get_bits1(gb);
+    pps->tiles_enabled_flag               = get_bits1(gb);
+    pps->entropy_coding_sync_enabled_flag = get_bits1(gb);
+
+    if (pps->tiles_enabled_flag) {
+        pps->num_tile_columns     = get_ue_golomb(gb) + 1;
+        pps->num_tile_rows        = get_ue_golomb(gb) + 1;
+        if (pps->num_tile_columns == 0 ||
+            pps->num_tile_columns >= sps->width) {
+            av_log(s->avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
+                   pps->num_tile_columns - 1);
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
+        if (pps->num_tile_rows == 0 ||
+            pps->num_tile_rows >= sps->height) {
+            av_log(s->avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
+                   pps->num_tile_rows - 1);
+            ret = AVERROR_INVALIDDATA;
+            goto err;
+        }
+
+        pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
+        pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
+        if (!pps->column_width || !pps->row_height) {
+            ret = AVERROR(ENOMEM);
+            goto err;
+        }
+
+        pps->uniform_spacing_flag = get_bits1(gb);
+        if (!pps->uniform_spacing_flag) {
+            int sum = 0;
+            for (i = 0; i < pps->num_tile_columns - 1; i++) {
+                pps->column_width[i] = get_ue_golomb(gb) + 1;
+                sum += pps->column_width[i];
+            }
+            if (sum >= sps->ctb_width) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
+            pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum;
+            /* same scheme for the rows: all but the last height are coded, the last is the remainder */
+            sum = 0;
+            for (i = 0; i < pps->num_tile_rows - 1; i++) {
+                pps->row_height[i] = get_ue_golomb(gb) + 1;
+                sum += pps->row_height[i];
+            }
+            if (sum >= sps->ctb_height) {
+                av_log(s->avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
+            pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum;
+        }
+        pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb);
+    }
+
+    pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb);
+
+    pps->deblocking_filter_control_present_flag = get_bits1(gb);
+    if (pps->deblocking_filter_control_present_flag) {
+        pps->deblocking_filter_override_enabled_flag = get_bits1(gb);
+        pps->pps_disable_deblocking_filter_flag = get_bits1(gb);
+        if (!pps->pps_disable_deblocking_filter_flag) {
+            pps->beta_offset = get_se_golomb(gb) * 2;
+            pps->tc_offset = get_se_golomb(gb) * 2;
+            if (pps->beta_offset/2 < -6 || pps->beta_offset/2 > 6) {
+                av_log(s->avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n",
+                       pps->beta_offset/2);
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
+            if (pps->tc_offset/2 < -6 || pps->tc_offset/2 > 6) {
+                av_log(s->avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n",
+                       pps->tc_offset/2);
+                ret = AVERROR_INVALIDDATA;
+                goto err;
+            }
+        }
+    }
+
+    pps->pps_scaling_list_data_present_flag = get_bits1(gb);
+    if (pps->pps_scaling_list_data_present_flag) {
+        set_default_scaling_list_data(&pps->scaling_list);
+        ret = scaling_list_data(s, &pps->scaling_list);
+        if (ret < 0)
+            goto err;
+    }
+    pps->lists_modification_present_flag = get_bits1(gb);
+    pps->log2_parallel_merge_level       = get_ue_golomb(gb) + 2;
+    if (pps->log2_parallel_merge_level > sps->log2_ctb_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n",
+               pps->log2_parallel_merge_level - 2);
+        ret = AVERROR_INVALIDDATA;
+        goto err;
+    }
+
+    pps->slice_header_extension_present_flag = get_bits1(gb);
+    pps->pps_extension_flag                  = get_bits1(gb);
+
+    // Inferred parameters: tile boundaries and address conversion tables
+    pps->col_bd   = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd));
+    pps->row_bd   = av_malloc_array(pps->num_tile_rows + 1,    sizeof(*pps->row_bd));
+    pps->col_idxX = av_malloc_array(sps->ctb_width,    sizeof(*pps->col_idxX));
+    if (!pps->col_bd || !pps->row_bd || !pps->col_idxX) {
+        ret = AVERROR(ENOMEM);
+        goto err;
+    }
+    /* uniform spacing: derive the tile sizes by integer division of the CTB counts */
+    if (pps->uniform_spacing_flag) {
+        if (!pps->column_width) {
+            pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
+            pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
+        }
+        if (!pps->column_width || !pps->row_height) {
+            ret = AVERROR(ENOMEM);
+            goto err;
+        }
+
+        for (i = 0; i < pps->num_tile_columns; i++) {
+            pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns -
+                                   (i * sps->ctb_width) / pps->num_tile_columns;
+        }
+
+        for (i = 0; i < pps->num_tile_rows; i++) {
+            pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows -
+                                 (i * sps->ctb_height) / pps->num_tile_rows;
+        }
+    }
+    /* tile boundaries in CTBs: running sums of the column widths and row heights */
+    pps->col_bd[0] = 0;
+    for (i = 0; i < pps->num_tile_columns; i++)
+        pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i];
+
+    pps->row_bd[0] = 0;
+    for (i = 0; i < pps->num_tile_rows; i++)
+        pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i];
+
+    for (i = 0, j = 0; i < sps->ctb_width; i++) {
+         if (i > pps->col_bd[j])
+             j++;
+         pps->col_idxX[i] = j;
+    }
+
+    /**
+     * 6.5: raster scan <-> tile scan CTB address tables, tile ids, z-scan order tables
+     */
+    pic_area_in_ctbs     = sps->ctb_width    * sps->ctb_height;
+    pic_area_in_min_cbs  = sps->min_cb_width * sps->min_cb_height;
+    pic_area_in_min_tbs  = sps->min_tb_width * sps->min_tb_height;
+
+    pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
+    pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
+    pps->tile_id           = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_id));
+    pps->min_cb_addr_zs    = av_malloc_array(pic_area_in_min_cbs, sizeof(*pps->min_cb_addr_zs));
+    pps->min_tb_addr_zs    = av_malloc_array(pic_area_in_min_tbs, sizeof(*pps->min_tb_addr_zs));
+    if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
+        !pps->tile_id || !pps->min_cb_addr_zs || !pps->min_tb_addr_zs) {
+        ret = AVERROR(ENOMEM);
+        goto err;
+    }
+    /* raster scan -> tile scan: count the CTBs that precede this one in tile order */
+    for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) {
+        int tb_x   = ctb_addr_rs % sps->ctb_width;
+        int tb_y   = ctb_addr_rs / sps->ctb_width;
+        int tile_x = 0;
+        int tile_y = 0;
+        int val    = 0;
+
+        for (i = 0; i < pps->num_tile_columns; i++) {
+            if (tb_x < pps->col_bd[i + 1]) {
+                tile_x = i;
+                break;
+            }
+        }
+
+        for (i = 0; i < pps->num_tile_rows; i++) {
+            if (tb_y < pps->row_bd[i + 1]) {
+                tile_y = i;
+                break;
+            }
+        }
+        /* CTBs of the tiles to the left in this tile row, then of all tile rows above */
+        for (i = 0; i < tile_x; i++ )
+            val += pps->row_height[tile_y] * pps->column_width[i];
+        for (i = 0; i < tile_y; i++ )
+            val += sps->ctb_width * pps->row_height[i];
+        /* offset of the CTB inside its own tile */
+        val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] +
+               tb_x - pps->col_bd[tile_x];
+
+        pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val;
+        pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs;
+    }
+    /* tile_id[] is indexed by tile-scan address */
+    for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
+        for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
+            for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
+                for (x = pps->col_bd[i]; x < pps->col_bd[i + 1]; x++)
+                    pps->tile_id[pps->ctb_addr_rs_to_ts[y * sps->ctb_width + x]] = tile_id;
+
+    pps->tile_pos_rs = av_malloc_array(tile_id, sizeof(*pps->tile_pos_rs));
+    if (!pps->tile_pos_rs) {
+        ret = AVERROR(ENOMEM);
+        goto err;
+    }
+    /* raster-scan address of the top-left CTB of each tile */
+    for (j = 0; j < pps->num_tile_rows; j++)
+        for (i = 0; i < pps->num_tile_columns; i++)
+            pps->tile_pos_rs[j * pps->num_tile_columns + i] = pps->row_bd[j] * sps->ctb_width + pps->col_bd[i];
+    /* min CB z-scan address: CTB tile-scan address on top, x/y bits interleaved below */
+    for (y = 0; y < sps->min_cb_height; y++) {
+        for (x = 0; x < sps->min_cb_width; x++) {
+            int tb_x = x >> sps->log2_diff_max_min_coding_block_size;
+            int tb_y = y >> sps->log2_diff_max_min_coding_block_size;
+            int ctb_addr_rs = sps->ctb_width * tb_y + tb_x;
+            int val = pps->ctb_addr_rs_to_ts[ctb_addr_rs] <<
+                      (sps->log2_diff_max_min_coding_block_size * 2);
+            for (i = 0; i < sps->log2_diff_max_min_coding_block_size; i++) {
+                int m = 1 << i;
+                val += (m & x ? m * m : 0) + (m & y ? 2 * m * m : 0);
+            }
+            pps->min_cb_addr_zs[y * sps->min_cb_width + x] = val;
+        }
+    }
+    /* same construction at minimum transform block granularity */
+    log2_diff_ctb_min_tb_size = sps->log2_ctb_size - sps->log2_min_transform_block_size;
+    for (y = 0; y < sps->min_tb_height; y++) {
+        for (x = 0; x < sps->min_tb_width; x++) {
+            int tb_x = x >> log2_diff_ctb_min_tb_size;
+            int tb_y = y >> log2_diff_ctb_min_tb_size;
+            int ctb_addr_rs = sps->ctb_width * tb_y + tb_x;
+            int val = pps->ctb_addr_rs_to_ts[ctb_addr_rs] <<
+                      (log2_diff_ctb_min_tb_size * 2);
+            for (i = 0; i < log2_diff_ctb_min_tb_size; i++) {
+                int m = 1 << i;
+                val += (m & x ? m * m : 0) + (m & y ? 2 * m * m : 0);
+            }
+            pps->min_tb_addr_zs[y * sps->min_tb_width + x] = val;
+        }
+    }
+    /* replace any previous PPS with this id; the list takes over pps_buf */
+    av_buffer_unref(&s->pps_list[pps_id]);
+    s->pps_list[pps_id] = pps_buf;
+
+    return 0;
+
+err:
+    av_buffer_unref(&pps_buf);
+    return ret;
+}
diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
new file mode 100644
index 0000000000..9de13359d8
--- /dev/null
+++ b/libavcodec/hevc_refs.c
@@ -0,0 +1,481 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+
+#include "hevc.h"
+#include "internal.h"
+#include "thread.h"
+
+void ff_hevc_unref_frame(HEVCContext *s, HEVCFrame *frame, int flags)
+{
+    /* Clear the given flag bits of a DPB entry and, once no flag is left,
+     * release its picture buffer and per-frame tables. */
+    /* frame->frame can be NULL if context init failed */
+    if (!frame->frame || !frame->frame->buf[0])
+        return;
+
+    frame->flags &= ~flags;
+    if (frame->flags)
+        return; /* some other flag still keeps the frame alive */
+
+    ff_thread_release_buffer(s->avctx, &frame->tf);
+    av_buffer_unref(&frame->tab_mvf_buf);
+    av_buffer_unref(&frame->rpl_buf);
+    av_buffer_unref(&frame->rpl_tab_buf);
+    frame->tab_mvf        = NULL;
+    frame->rpl_tab        = NULL;
+    frame->refPicList     = NULL;
+    frame->collocated_ref = NULL;
+}
+
+RefPicList* ff_hevc_get_ref_list(HEVCContext *s, HEVCFrame *ref, int x0, int y0)
+{
+    /* Reference picture lists in effect at sample position (x0, y0) of ref;
+     * a negative coordinate selects the lists of the current frame (s->ref). */
+    int log2_ctb, ctb_stride, ctb_rs;
+    if (x0 < 0 || y0 < 0)
+        return s->ref->refPicList;
+    log2_ctb   = s->sps->log2_ctb_size;
+    ctb_stride = (s->sps->width + (1 << log2_ctb) - 1) >> log2_ctb;
+    ctb_rs     = (y0 >> log2_ctb) * ctb_stride + (x0 >> log2_ctb);
+    return (RefPicList*) ref->rpl_tab[s->pps->ctb_addr_rs_to_ts[ctb_rs]];
+}
+
+void ff_hevc_clear_refs(HEVCContext *s)
+{
+    int n, ref_mask = HEVC_FRAME_FLAG_SHORT_REF | HEVC_FRAME_FLAG_LONG_REF;
+    /* drop only the reference marks; a frame with another flag set stays allocated */
+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++)
+        ff_hevc_unref_frame(s, s->DPB + n, ref_mask);
+}
+
+void ff_hevc_flush_dpb(HEVCContext *s)
+{
+    HEVCFrame *f = s->DPB, *end = s->DPB + FF_ARRAY_ELEMS(s->DPB);
+    for (; f < end; f++) /* ~0 clears every flag, so each held frame is released */
+        ff_hevc_unref_frame(s, f, ~0);
+}
+
+static HEVCFrame *alloc_frame(HEVCContext *s)
+{
+    /* Claim the first DPB entry without a picture buffer and attach a frame
+     * buffer plus per-frame tables to it; returns NULL on any failure. */
+    HEVCFrame *slot = NULL;
+    int n;
+
+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB) && !slot; n++)
+        if (!s->DPB[n].frame->buf[0])
+            slot = &s->DPB[n];
+    if (!slot) {
+        av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n");
+        return NULL;
+    }
+
+    if (ff_thread_get_buffer(s->avctx, &slot->tf, AV_GET_BUFFER_FLAG_REF) < 0)
+        return NULL;
+
+    slot->rpl_buf = av_buffer_allocz(s->nb_nals * sizeof(RefPicListTab));
+    if (!slot->rpl_buf)
+        goto fail;
+    slot->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
+    if (!slot->tab_mvf_buf)
+        goto fail;
+    slot->tab_mvf     = (MvField*)slot->tab_mvf_buf->data;
+    slot->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
+    if (!slot->rpl_tab_buf)
+        goto fail;
+    slot->rpl_tab   = (RefPicListTab**)slot->rpl_tab_buf->data;
+    slot->ctb_count = s->sps->ctb_width * s->sps->ctb_height;
+    for (n = 0; n < slot->ctb_count; n++) /* every CTB starts on the first table */
+        slot->rpl_tab[n] = (RefPicListTab*)slot->rpl_buf->data;
+    return slot;
+fail:
+    ff_hevc_unref_frame(s, slot, ~0);
+    return NULL;
+}
+
+int ff_hevc_set_new_ref(HEVCContext *s, AVFrame **frame, int poc)
+{
+    /* Start a new picture with the given POC: allocate a DPB entry for it,
+     * make it the current frame (s->ref) and hand its AVFrame back in *frame.
+     * Returns 0, AVERROR_INVALIDDATA for a duplicate POC, or AVERROR(ENOMEM). */
+    HEVCFrame *cur;
+    int n;
+
+    /* a POC may occur only once among the frames of the sequence being decoded */
+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
+        const HEVCFrame *old = &s->DPB[n];
+        if (!old->frame->buf[0] || old->sequence != s->seq_decode || old->poc != poc)
+            continue;
+        av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n",
+               poc);
+        return AVERROR_INVALIDDATA;
+    }
+
+    cur = alloc_frame(s);
+    if (!cur)
+        return AVERROR(ENOMEM);
+
+    /* a new picture is flagged both as pending output and as short-term reference */
+    cur->poc      = poc;
+    cur->sequence = s->seq_decode;
+    cur->flags    = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF;
+    cur->window   = s->sps->output_window;
+    s->ref        = cur;
+    *frame        = cur->frame;
+    return 0;
+}
+
+int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
+{
+    /* Output the pending frame with the lowest POC of sequence s->seq_output.
+     * Returns 1 if a frame was referenced into out, 0 if none, <0 on error. */
+    int nb_output = 0;
+    int min_poc   = INT_MAX;
+    int i, j, min_idx = 0, ret;
+    do {
+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
+            HEVCFrame *frame = &s->DPB[i];
+            if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
+                frame->sequence == s->seq_output) {
+                nb_output++;
+                /* the first candidate is always taken, so min_idx is valid for any POC */
+                if (nb_output == 1 || frame->poc < min_poc) {
+                    min_poc = frame->poc;
+                    min_idx = i;
+                }
+            }
+        }
+
+        /* wait for more frames before output */
+        if (!flush && s->seq_output == s->seq_decode && s->sps &&
+            nb_output <= s->sps->temporal_layer[s->temporal_id].num_reorder_pics)
+            return 0;
+        if (nb_output) {
+            HEVCFrame *frame = &s->DPB[min_idx];
+            AVFrame *dst = out;
+            AVFrame *src = frame->frame;
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format);
+            int pixel_shift = !!(desc->comp[0].depth_minus1 > 7);
+            /* the output flag is dropped even when taking the reference fails */
+            ret = av_frame_ref(out, src);
+            ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
+            if (ret < 0)
+                return ret;
+            /* advance each plane to the window origin; only planes j > 0 are subsampled */
+            for (j = 0; j < 3; j++) {
+                int hshift = (j > 0) ? desc->log2_chroma_w : 0;
+                int vshift = (j > 0) ? desc->log2_chroma_h : 0;
+                int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
+                          (frame->window.top_offset   >> vshift) * dst->linesize[j];
+                dst->data[j] += off;
+            }
+            av_log(s->avctx, AV_LOG_DEBUG, "Output frame with POC %d.\n", frame->poc);
+            return 1;
+        }
+        /* nothing to output in this sequence: move on to the next one, if any */
+        if (s->seq_output != s->seq_decode)
+            s->seq_output = (s->seq_output + 1) & 0xff;
+        else
+            break;
+    } while (1);
+    return 0;
+}
+
+static int init_slice_rpl(HEVCContext *s)
+{
+    /* Point every CTB from the start of this slice segment to the end of the
+     * picture at the RefPicListTab with index s->slice_idx. */
+    HEVCFrame *cur = s->ref;
+    int first_ts   = s->pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+    int end_ts     = cur->ctb_count, ts;
+    RefPicListTab *tab;
+    /* rpl_buf is sized in units of RefPicListTab; refuse an index beyond it */
+    if (s->slice_idx >= cur->rpl_buf->size / sizeof(RefPicListTab))
+        return AVERROR_INVALIDDATA;
+    tab = (RefPicListTab*)cur->rpl_buf->data + s->slice_idx;
+    for (ts = first_ts; ts < end_ts; ts++)
+        cur->rpl_tab[ts] = tab;
+    cur->refPicList = (RefPicList*)cur->rpl_tab[first_ts];
+    return 0;
+}
+
+int ff_hevc_slice_rpl(HEVCContext *s)
+{
+    /* Build the reference picture list(s) of the current slice from the frame
+     * RPS: L0 only, or L0 and L1 for B slices. Returns 0 or a negative error. */
+    SliceHeader *sh = &s->sh;
+    uint8_t nb_list = sh->slice_type == B_SLICE ? 2 : 1;
+    uint8_t list_idx;
+    int i, j, ret;
+
+    ret = init_slice_rpl(s);
+    if (ret < 0)
+        return ret;
+
+    if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
+          s->rps[LT_CURR].nb_refs)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    for (list_idx = 0; list_idx < nb_list; list_idx++) {
+        RefPicList  rpl_tmp = { { 0 } };
+        RefPicList *rpl     = &s->ref->refPicList[list_idx];
+        /* list capacity; assumes ref[] and isLongTerm[] are as long as list[] -- confirm */
+        const int max_refs  = FF_ARRAY_ELEMS(rpl_tmp.list);
+        /* candidate order: ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
+         *                  ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
+        int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF,
+                              list_idx ? ST_CURR_BEF : ST_CURR_AFT,
+                              LT_CURR };
+        /* a count beyond the capacity cannot be honoured without overflowing the arrays */
+        if (sh->nb_refs[list_idx] > max_refs) {
+            av_log(s->avctx, AV_LOG_ERROR, "Too many references in list %d.\n", list_idx);
+            return AVERROR_INVALIDDATA;
+        }
+        /* concatenate the candidate lists, repeating them until enough entries exist */
+        while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) {
+            for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
+                RefPicList *rps = &s->rps[cand_lists[i]];
+                for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < max_refs; j++) {
+                    rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j];
+                    rpl_tmp.ref[rpl_tmp.nb_refs]  = rps->ref[j];
+                    rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = (i == 2);
+                    rpl_tmp.nb_refs++;
+                }
+            }
+        }
+        /* reorder the references if necessary */
+        if (sh->rpl_modification_flag[list_idx]) {
+            for (i = 0; i < sh->nb_refs[list_idx]; i++) {
+                int idx = sh->list_entry_lx[list_idx][i];
+                if (idx >= rpl_tmp.nb_refs) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
+                    return AVERROR_INVALIDDATA;
+                }
+                rpl->list[i]       = rpl_tmp.list[idx];
+                rpl->ref[i]        = rpl_tmp.ref[idx];
+                rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
+                rpl->nb_refs++;
+            }
+        } else {
+            memcpy(rpl, &rpl_tmp, sizeof(*rpl));
+            rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]);
+        }
+        if (sh->collocated_list == list_idx &&
+            sh->collocated_ref_idx < rpl->nb_refs)
+            s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
+    }
+
+    return 0;
+}
+
+static HEVCFrame *find_ref_idx(HEVCContext *s, int poc)
+{
+    /* Look up the DPB frame of the sequence being decoded that matches poc.
+     * A match on the POC LSBs anywhere in the DPB takes precedence over a
+     * match on the full POC value. Logs an error and returns NULL if none. */
+
+    const int lsb_mask = (1 << s->sps->log2_max_poc_lsb) - 1;
+    int pass, n;
+
+    for (pass = 0; pass < 2; pass++) {
+        for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
+            HEVCFrame *cand = &s->DPB[n];
+            if (!cand->frame->buf[0] || cand->sequence != s->seq_decode)
+                continue;
+            /* pass 0 compares the LSBs only, pass 1 also accepts the full POC */
+            if ((cand->poc & lsb_mask) == poc ||
+                (pass && cand->poc == poc))
+                return cand;
+        }
+    }
+
+    av_log(s->avctx, AV_LOG_ERROR,
+           "Could not find ref with POC %d\n", poc);
+    return NULL;
+}
+
+/*
+ * Replace the short-term/long-term reference marking of a frame with `flag`.
+ * All other bits of frame->flags are left untouched; passing 0 removes the
+ * reference marking altogether.
+ */
+static void mark_ref(HEVCFrame *frame, int flag)
+{
+    const int ref_bits = HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF;
+
+    frame->flags = (frame->flags & ~ref_bits) | flag;
+}
+
+/*
+ * Create a stand-in for a reference picture that is not present in the DPB.
+ *
+ * A new frame is obtained from alloc_frame() and filled with the mid-range
+ * sample value 1 << (bit_depth - 1). It is tagged with the requested POC and
+ * the current decode sequence, its flags are cleared, and its decoding
+ * progress is reported as INT_MAX.
+ *
+ * Returns the new frame, or NULL when alloc_frame() fails.
+ */
+static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc)
+{
+    HEVCFrame *missing = alloc_frame(s);
+    int plane, row, col;
+
+    if (!missing)
+        return NULL;
+
+    if (s->sps->pixel_shift) {
+        /* two bytes per sample: write the mid value sample by sample */
+        for (plane = 0; missing->frame->data[plane]; plane++) {
+            for (row = 0; row < (s->height >> s->sps->vshift[plane]); row++) {
+                for (col = 0; col < (s->width >> s->sps->hshift[plane]); col++) {
+                    AV_WN16(missing->frame->data[plane] +
+                            row * missing->frame->linesize[plane] + 2 * col,
+                            1 << (s->sps->bit_depth - 1));
+                }
+            }
+        }
+    } else {
+        /* one byte per sample: fill every buffer in one go */
+        for (plane = 0; missing->frame->buf[plane]; plane++)
+            memset(missing->frame->buf[plane]->data,
+                   1 << (s->sps->bit_depth - 1),
+                   missing->frame->buf[plane]->size);
+    }
+
+    missing->poc      = poc;
+    missing->sequence = s->seq_decode;
+    missing->flags    = 0;
+
+    ff_thread_report_progress(&missing->tf, INT_MAX, 0);
+
+    return missing;
+}
+
+/*
+ * Add a reference with the given POC to the list and mark it as used in the
+ * DPB.
+ *
+ * The frame is looked up with find_ref_idx(); when it is missing a grey
+ * stand-in is produced by generate_missing_ref().
+ *
+ * Returns 0 on success, AVERROR_INVALIDDATA when the POC resolves to the
+ * frame currently being decoded or when the list has no free entry left,
+ * and AVERROR(ENOMEM) when a stand-in frame cannot be allocated.
+ */
+static int add_candidate_ref(HEVCContext *s, RefPicList *list,
+                             int poc, int ref_flag)
+{
+    HEVCFrame *ref = find_ref_idx(s, poc);
+
+    if (ref == s->ref)
+        return AVERROR_INVALIDDATA;
+
+    /* The number of entries is driven by the bitstream: refuse to write past
+     * the end of the per-list arrays. Checked before generating a stand-in
+     * so that no frame is allocated for an entry that cannot be stored. */
+    if (list->nb_refs < 0 || list->nb_refs >= FF_ARRAY_ELEMS(list->list)) {
+        av_log(s->avctx, AV_LOG_ERROR, "Too many reference frames.\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (!ref) {
+        ref = generate_missing_ref(s, poc);
+        if (!ref)
+            return AVERROR(ENOMEM);
+    }
+
+    list->list[list->nb_refs] = ref->poc;
+    list->ref[list->nb_refs]  = ref;
+    list->nb_refs++;
+
+    mark_ref(ref, ref_flag);
+    return 0;
+}
+
+/*
+ * Build the reference picture set of the current frame from the slice header.
+ *
+ * Nothing is done when the slice header carries no short-term RPS. Otherwise
+ * the reference marking of every DPB frame except the current one is cleared,
+ * all RPS subsets are emptied and then refilled from the short-term and
+ * long-term entries, and finally every DPB slot is passed to
+ * ff_hevc_unref_frame() with a flag argument of 0.
+ *
+ * Returns 0 on success or the negative error code of add_candidate_ref(); on
+ * error the function returns immediately, without the final unref pass.
+ */
+int ff_hevc_frame_rps(HEVCContext *s)
+{
+    const ShortTermRPS *st = s->sh.short_term_rps;
+    const LongTermRPS  *lt = &s->sh.long_term_rps;
+    RefPicList       *sets = s->rps;
+    int k, err;
+
+    if (!st)
+        return 0;
+
+    /* drop the reference marking everywhere but on the current frame */
+    for (k = 0; k < FF_ARRAY_ELEMS(s->DPB); k++) {
+        if (&s->DPB[k] != s->ref)
+            mark_ref(&s->DPB[k], 0);
+    }
+
+    for (k = 0; k < NB_RPS_TYPE; k++)
+        sets[k].nb_refs = 0;
+
+    /* short-term entries: unused ones go to ST_FOLL, used ones are split
+     * into "before" and "after" by their position in the delta list */
+    for (k = 0; k < st->num_delta_pocs; k++) {
+        const int set = !st->used[k]              ? ST_FOLL     :
+                        k < st->num_negative_pics ? ST_CURR_BEF :
+                                                    ST_CURR_AFT;
+
+        err = add_candidate_ref(s, &sets[set], s->poc + st->delta_poc[k],
+                                HEVC_FRAME_FLAG_SHORT_REF);
+        if (err < 0)
+            return err;
+    }
+
+    /* long-term entries */
+    for (k = 0; k < lt->nb_refs; k++) {
+        const int set = lt->used[k] ? LT_CURR : LT_FOLL;
+
+        err = add_candidate_ref(s, &sets[set], lt->poc[k],
+                                HEVC_FRAME_FLAG_LONG_REF);
+        if (err < 0)
+            return err;
+    }
+
+    /* release any frames that are now unused */
+    for (k = 0; k < FF_ARRAY_ELEMS(s->DPB); k++)
+        ff_hevc_unref_frame(s, &s->DPB[k], 0);
+
+    return 0;
+}
+
+/*
+ * Derive the full picture order count from the POC LSBs of a slice header.
+ *
+ * s->pocTid0 is split into an LSB and an MSB part; the MSB part is moved up
+ * or down by one LSB period when the new LSBs are at least (respectively
+ * more than) half a period away from the previous ones. For the three BLA
+ * NAL unit types the MSB part is zero, so the result is poc_lsb itself.
+ */
+int ff_hevc_compute_poc(HEVCContext *s, int poc_lsb)
+{
+    const int period   = 1 << s->sps->log2_max_poc_lsb;
+    const int half     = period / 2;
+    const int base_lsb = s->pocTid0 % period;
+    const int is_bla   = s->nal_unit_type == NAL_BLA_W_LP   ||
+                         s->nal_unit_type == NAL_BLA_W_RADL ||
+                         s->nal_unit_type == NAL_BLA_N_LP;
+    int msb            = s->pocTid0 - base_lsb;
+
+    /* For BLA picture types, POCmsb is set to 0. */
+    if (is_bla)
+        return poc_lsb;
+
+    if (poc_lsb < base_lsb && base_lsb - poc_lsb >= half)
+        msb += period;      /* LSBs wrapped around upwards */
+    else if (poc_lsb > base_lsb && poc_lsb - base_lsb > half)
+        msb -= period;      /* LSBs wrapped around downwards */
+
+    return msb + poc_lsb;
+}
+
+/*
+ * Count the reference pictures flagged as used by the current slice header:
+ * the used short-term entries (when a short-term RPS is present) plus the
+ * used long-term entries.
+ */
+int ff_hevc_frame_nb_refs(HEVCContext *s)
+{
+    const ShortTermRPS *st = s->sh.short_term_rps;
+    const LongTermRPS  *lt = &s->sh.long_term_rps;
+    int count = 0;
+    int k     = 0;
+
+    if (st) {
+        /* negative entries first, then whatever remains of the delta list */
+        for (; k < st->num_negative_pics; k++) {
+            if (st->used[k])
+                count++;
+        }
+        for (; k < st->num_delta_pocs; k++) {
+            if (st->used[k])
+                count++;
+        }
+    }
+
+    for (k = 0; k < lt->nb_refs; k++) {
+        if (lt->used[k])
+            count++;
+    }
+
+    return count;
+}
diff --git a/libavcodec/hevc_sei.c b/libavcodec/hevc_sei.c
new file mode 100644
index 0000000000..abdbc001e1
--- /dev/null
+++ b/libavcodec/hevc_sei.c
@@ -0,0 +1,129 @@
+/*
+ * HEVC Supplementary Enhancement Information messages
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2012 - 2013 Gildas Cocherel
+ * Copyright (C) 2013 Vittorio Giovara
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "hevc.h"
+#include "golomb.h"
+
+/*
+ * Parse a decoded picture hash SEI payload.
+ *
+ * One byte selects the hash type, then one hash per plane follows; three
+ * planes are always read. For type 0 the 16 hash bytes of each plane are
+ * stored in s->md5 and s->is_md5 is set. For types 1 and 2 the 16-bit CRC /
+ * 32-bit checksum is skipped. Any other type consumes nothing further.
+ * payload_size is not used.
+ */
+static void decode_nal_sei_decoded_picture_hash(HEVCContext *s, int payload_size)
+{
+    GetBitContext *gb       = &s->HEVClc.gb;
+    const uint8_t hash_type = get_bits(gb, 8);
+    int plane, n;
+
+    for (plane = 0; plane < 3; plane++) {
+        switch (hash_type) {
+        case 0:                     /* MD5 */
+            s->is_md5 = 1;
+            for (n = 0; n < 16; n++)
+                s->md5[plane][n] = get_bits(gb, 8);
+            break;
+        case 1:                     /* picture_crc */
+            skip_bits(gb, 16);
+            break;
+        case 2:                     /* picture_checksum */
+            skip_bits(gb, 32);
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+/*
+ * Step over a frame packing arrangement SEI payload. Every field is read
+ * only to advance the bit reader; no value is stored.
+ */
+static void decode_nal_sei_frame_packing_arrangement(HEVCLocalContext *lc)
+{
+    GetBitContext *gb = &lc->gb;
+
+    get_ue_golomb(gb);                      /* frame_packing_arrangement_id */
+
+    if (!get_bits1(gb)) {                   /* frame_packing_cancel_flag */
+        const int arrangement = get_bits(gb, 7); /* frame_packing_arrangement_type */
+        const int quincunx    = get_bits1(gb);   /* quincunx_sampling_flag */
+
+        skip_bits(gb, 6);                   /* content_interpretation_type */
+
+        /* spatial_flipping_flag, frame0_flipped_flag, field_views_flag,
+         * current_frame_is_frame0_flag, frame0_self_contained_flag,
+         * frame1_self_contained_flag */
+        skip_bits(gb, 6);
+
+        if (!quincunx && arrangement != 5)
+            skip_bits(gb, 16);              /* frame[01]_grid_position_[xy] */
+
+        skip_bits(gb, 8);                   /* frame_packing_arrangement_reserved_byte */
+        skip_bits1(gb);                     /* frame_packing_arrangement_persistence_flag */
+    }
+
+    skip_bits1(gb);                         /* upsampled_aspect_ratio_flag */
+}
+
+/*
+ * Parse one SEI message from the NAL unit being decoded.
+ *
+ * Payload type and payload size are each coded as a run of bytes that are
+ * summed up; the run ends with the first byte different from 0xFF. Prefix
+ * SEI types 256 (picture hash) and 45 (frame packing) and suffix SEI type
+ * 132 (picture hash) are handed to their parsers; every other payload is
+ * skipped using the signalled size.
+ *
+ * Always returns 0.
+ */
+static int decode_nal_sei_message(HEVCContext *s)
+{
+    GetBitContext *gb = &s->HEVClc.gb;
+    int type = 0;
+    int size = 0;
+    int chunk;
+
+    av_log(s->avctx, AV_LOG_DEBUG, "Decoding SEI\n");
+
+    do {
+        chunk = get_bits(gb, 8);
+        type += chunk;
+    } while (chunk == 0xFF);
+
+    do {
+        chunk = get_bits(gb, 8);
+        size += chunk;
+    } while (chunk == 0xFF);
+
+    if (s->nal_unit_type != NAL_SEI_PREFIX) {
+        /* nal_unit_type == NAL_SEI_SUFFIX */
+        if (type == 132) {
+            decode_nal_sei_decoded_picture_hash(s, size);
+        } else {
+            av_log(s->avctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type);
+            skip_bits(gb, 8*size);
+        }
+        return 0;
+    }
+
+    switch (type) {
+    case 256:
+        decode_nal_sei_decoded_picture_hash(s, size);
+        break;
+    case 45:
+        decode_nal_sei_frame_packing_arrangement(&s->HEVClc);
+        break;
+    default:
+        av_log(s->avctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
+        skip_bits(gb, 8*size);
+        break;
+    }
+
+    return 0;
+}
+
+/*
+ * Tell whether more payload follows in the bit reader: there must be bits
+ * left and the next eight bits must not be 0x80.
+ */
+static int more_rbsp_data(GetBitContext *gb)
+{
+    if (get_bits_left(gb) <= 0)
+        return 0;
+
+    return show_bits(gb, 8) != 0x80;
+}
+
+/*
+ * Decode every SEI message of the current NAL unit. At least one message is
+ * always parsed; parsing continues for as long as more_rbsp_data() reports
+ * further data. Always returns 0.
+ */
+int ff_hevc_decode_nal_sei(HEVCContext *s)
+{
+    GetBitContext *gb = &s->HEVClc.gb;
+
+    for (;;) {
+        decode_nal_sei_message(s);
+        if (!more_rbsp_data(gb))
+            break;
+    }
+
+    return 0;
+}
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
new file mode 100644
index 0000000000..5ea774ed69
--- /dev/null
+++ b/libavcodec/hevcdsp.c
@@ -0,0 +1,192 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "hevc.h"
+#include "hevcdsp.h"
+
+/*
+ * 32x32 table of signed 8-bit transform coefficients. Row 0 is the constant
+ * (DC) row; the first half of each row is mirrored in its second half, with
+ * the same sign on even rows and the opposite sign on odd rows.
+ * NOTE(review): presumably the HEVC core inverse-transform matrix consumed by
+ * the functions in hevcdsp_template.c -- confirm against the template.
+ */
+static const int8_t transform[32][32] = {
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+     64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+     -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
+    { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90,
+     -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 },
+    { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+     13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 },
+    { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89,
+     89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
+    { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+     -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 },
+    { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87,
+     -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 },
+    { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+     31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 },
+    { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83,
+     83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
+    { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+     -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 },
+    { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80,
+     -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 },
+    { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+     46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 },
+    { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75,
+     75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
+    { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+     -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 },
+    { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70,
+     -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 },
+    { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+     61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 },
+    { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64,
+     64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
+    { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+     -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 },
+    { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57,
+     -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 },
+    { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+     73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 },
+    { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50,
+     50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
+    { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+     -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 },
+    { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43,
+     -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 },
+    { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+     82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 },
+    { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36,
+     36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
+    { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+     -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 },
+    { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25,
+     -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 },
+    { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+     88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 },
+    { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18,
+     18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
+    { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+     -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 },
+    { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9,
+     -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 },
+    { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90,
+     90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 },
+};
+
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
+    { -2,  58,  10,  -2,-2,  58,  10,  -2,-2,  58,  10,  -2,-2,  58,  10,  -2 },
+    { -4,  54,  16,  -2,-4,  54,  16,  -2,-4,  54,  16,  -2,-4,  54,  16,  -2 },
+    { -6,  46,  28,  -4,-6,  46,  28,  -4,-6,  46,  28,  -4,-6,  46,  28,  -4 },
+    { -4,  36,  36,  -4,-4,  36,  36,  -4,-4,  36,  36,  -4,-4,  36,  36,  -4 },
+    { -4,  28,  46,  -6,-4,  28,  46,  -6,-4,  28,  46,  -6,-4,  28,  46,  -6 },
+    { -2,  16,  54,  -4,-2,  16,  54,  -4,-2,  16,  54,  -4,-2,  16,  54,  -4 },
+    { -2,  10,  58,  -2,-2,  10,  58,  -2,-2,  10,  58,  -2,-2,  10,  58,  -2 },
+};
+
+/*
+ * Instantiate the DSP template once per supported bit depth (8, 9 and 10).
+ * BIT_DEPTH is redefined before each inclusion and undefined afterwards.
+ * NOTE(review): the per-depth function name suffix is presumably produced by
+ * the FUNC() macro from bit_depth_template.c -- confirm there.
+ */
+#define BIT_DEPTH 8
+#include "hevcdsp_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 9
+#include "hevcdsp_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 10
+#include "hevcdsp_template.c"
+#undef BIT_DEPTH
+
+/*
+ * Fill a HEVCDSPContext with the function pointers for the given bit depth.
+ *
+ * hevcdsp:   context to initialise; every function pointer member is set.
+ * bit_depth: 9 and 10 select the 9-bit and 10-bit functions; any other
+ *            value falls back to the 8-bit functions.
+ *
+ * The *_c loop filter members are set to the same functions as their
+ * counterparts without the suffix.
+ */
+void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
+{
+/* FUNC(a, depth) names the function "a" suffixed with "_<depth>" */
+#undef FUNC
+#define FUNC(a, depth) a ## _ ## depth
+
+/* HEVC_DSP(depth) assigns every member of *hevcdsp for one bit depth */
+#define HEVC_DSP(depth)                                                            \
+    hevcdsp->put_pcm = FUNC(put_pcm, depth);                                       \
+    hevcdsp->transquant_bypass[0] = FUNC(transquant_bypass4x4, depth);             \
+    hevcdsp->transquant_bypass[1] = FUNC(transquant_bypass8x8, depth);             \
+    hevcdsp->transquant_bypass[2] = FUNC(transquant_bypass16x16, depth);           \
+    hevcdsp->transquant_bypass[3] = FUNC(transquant_bypass32x32, depth);           \
+    hevcdsp->transform_skip = FUNC(transform_skip, depth);                         \
+    hevcdsp->transform_4x4_luma_add = FUNC(transform_4x4_luma_add, depth);         \
+    hevcdsp->transform_add[0] = FUNC(transform_4x4_add, depth);                    \
+    hevcdsp->transform_add[1] = FUNC(transform_8x8_add, depth);                    \
+    hevcdsp->transform_add[2] = FUNC(transform_16x16_add, depth);                  \
+    hevcdsp->transform_add[3] = FUNC(transform_32x32_add, depth);                  \
+                                                                                   \
+    hevcdsp->sao_band_filter[0]  = FUNC(sao_band_filter_0, depth);                 \
+    hevcdsp->sao_band_filter[1]  = FUNC(sao_band_filter_1, depth);                 \
+    hevcdsp->sao_band_filter[2]  = FUNC(sao_band_filter_2, depth);                 \
+    hevcdsp->sao_band_filter[3]  = FUNC(sao_band_filter_3, depth);                 \
+                                                                                   \
+    hevcdsp->sao_edge_filter[0]  = FUNC(sao_edge_filter_0, depth);                 \
+    hevcdsp->sao_edge_filter[1]  = FUNC(sao_edge_filter_1, depth);                 \
+    hevcdsp->sao_edge_filter[2]  = FUNC(sao_edge_filter_2, depth);                 \
+    hevcdsp->sao_edge_filter[3]  = FUNC(sao_edge_filter_3, depth);                 \
+                                                                                   \
+    hevcdsp->put_hevc_qpel[0][0] = FUNC(put_hevc_qpel_pixels, depth);              \
+    hevcdsp->put_hevc_qpel[0][1] = FUNC(put_hevc_qpel_h1, depth);                  \
+    hevcdsp->put_hevc_qpel[0][2] = FUNC(put_hevc_qpel_h2, depth);                  \
+    hevcdsp->put_hevc_qpel[0][3] = FUNC(put_hevc_qpel_h3, depth);                  \
+    hevcdsp->put_hevc_qpel[1][0] = FUNC(put_hevc_qpel_v1, depth);                  \
+    hevcdsp->put_hevc_qpel[1][1] = FUNC(put_hevc_qpel_h1v1, depth);                \
+    hevcdsp->put_hevc_qpel[1][2] = FUNC(put_hevc_qpel_h2v1, depth);                \
+    hevcdsp->put_hevc_qpel[1][3] = FUNC(put_hevc_qpel_h3v1, depth);                \
+    hevcdsp->put_hevc_qpel[2][0] = FUNC(put_hevc_qpel_v2, depth);                  \
+    hevcdsp->put_hevc_qpel[2][1] = FUNC(put_hevc_qpel_h1v2, depth);                \
+    hevcdsp->put_hevc_qpel[2][2] = FUNC(put_hevc_qpel_h2v2, depth);                \
+    hevcdsp->put_hevc_qpel[2][3] = FUNC(put_hevc_qpel_h3v2, depth);                \
+    hevcdsp->put_hevc_qpel[3][0] = FUNC(put_hevc_qpel_v3, depth);                  \
+    hevcdsp->put_hevc_qpel[3][1] = FUNC(put_hevc_qpel_h1v3, depth);                \
+    hevcdsp->put_hevc_qpel[3][2] = FUNC(put_hevc_qpel_h2v3, depth);                \
+    hevcdsp->put_hevc_qpel[3][3] = FUNC(put_hevc_qpel_h3v3, depth);                \
+                                                                                   \
+    hevcdsp->put_hevc_epel[0][0] = FUNC(put_hevc_epel_pixels, depth);              \
+    hevcdsp->put_hevc_epel[0][1] = FUNC(put_hevc_epel_h, depth);                   \
+    hevcdsp->put_hevc_epel[1][0] = FUNC(put_hevc_epel_v, depth);                   \
+    hevcdsp->put_hevc_epel[1][1] = FUNC(put_hevc_epel_hv, depth);                  \
+                                                                                   \
+                                                                                   \
+    hevcdsp->put_unweighted_pred = FUNC(put_unweighted_pred, depth);               \
+    hevcdsp->put_weighted_pred_avg = FUNC(put_weighted_pred_avg, depth);           \
+                                                                                   \
+    hevcdsp->weighted_pred = FUNC(weighted_pred, depth);                           \
+    hevcdsp->weighted_pred_avg = FUNC(weighted_pred_avg, depth);                   \
+    hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth);       \
+    hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth);       \
+    hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth);   \
+    hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth);   \
+    hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth);     \
+    hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth);     \
+    hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
+    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth);
+
+
+    /* any depth other than 9 or 10 selects the 8-bit functions */
+    switch (bit_depth) {
+    case 9:
+        HEVC_DSP(9);
+        break;
+    case 10:
+        HEVC_DSP(10);
+        break;
+    default:
+        HEVC_DSP(8);
+        break;
+    }
+}
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
new file mode 100644
index 0000000000..c6511b01ac
--- /dev/null
+++ b/libavcodec/hevcdsp.h
@@ -0,0 +1,77 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HEVCDSP_H
+#define AVCODEC_HEVCDSP_H
+
+#include "get_bits.h"
+
+struct SAOParams;
+
+/*
+ * Table of DSP function pointers used by the HEVC decoder; filled for one
+ * bit depth by ff_hevc_dsp_init(). Destination and source pictures are
+ * passed as uint8_t pointers whatever the bit depth.
+ */
+typedef struct HEVCDSPContext {
+    /* read a size x size block of pcm_bit_depth-bit samples from gb into _dst */
+    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int size,
+                    GetBitContext *gb, int pcm_bit_depth);
+
+    /* index 0..3 selects the 4x4, 8x8, 16x16 and 32x32 variants */
+    void (*transquant_bypass[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+
+    void (*transform_skip)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+    void (*transform_4x4_luma_add)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+    /* index 0..3 selects the 4x4, 8x8, 16x16 and 32x32 variants */
+    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t _stride);
+
+    /* NOTE(review): meaning of the [4] index of both SAO tables is not visible here -- confirm */
+    void (*sao_band_filter[4])( uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride, struct SAOParams *sao, int *borders, int width, int height, int c_idx);
+
+    void (*sao_edge_filter[4])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride,  struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, uint8_t vert_edge, uint8_t horiz_edge, uint8_t diag_edge);
+
+
+    /* [v][h]: v selects the vertical (v1..v3) and h the horizontal (h1..h3)
+     * variant; [0][0] is the plain pixel copy variant */
+    void (*put_hevc_qpel[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                int width, int height, int16_t* mcbuffer);
+
+    /* [v][h]: [0][0] pixels, [0][1] horizontal, [1][0] vertical, [1][1] both */
+    void (*put_hevc_epel[2][2])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                             int width, int height, int mx, int my, int16_t* mcbuffer);
+
+    void (*put_unweighted_pred)(uint8_t *dst, ptrdiff_t dststride, int16_t *src, ptrdiff_t srcstride,
+                                int width, int height);
+
+    void (*put_weighted_pred_avg)(uint8_t *dst, ptrdiff_t dststride, int16_t *src1, int16_t *src2,
+                                  ptrdiff_t srcstride, int width, int height);
+    void (*weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag, uint8_t *dst, ptrdiff_t dststride, int16_t *src,
+                                  ptrdiff_t srcstride, int width, int height);
+    void (*weighted_pred_avg)(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag,
+                                   uint8_t *dst, ptrdiff_t dststride, int16_t *src1, int16_t *src2,
+                                   ptrdiff_t srcstride, int width, int height);
+    /* loop filters; ff_hevc_dsp_init() sets each *_c member to the same
+     * function as the member without the suffix */
+    void (*hevc_h_loop_filter_luma)(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+    void (*hevc_v_loop_filter_luma)(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+    void (*hevc_h_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+    void (*hevc_v_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+    void (*hevc_h_loop_filter_luma_c)(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+    void (*hevc_v_loop_filter_luma_c)(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+    void (*hevc_h_loop_filter_chroma_c)(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+    void (*hevc_v_loop_filter_chroma_c)(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+} HEVCDSPContext;
+
+/* Fill *hpc with the DSP functions for the given bit depth. */
+void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
+
+/* Seven rows of 16 signed 8-bit filter coefficients, defined in hevcdsp.c. */
+extern const int8_t ff_hevc_epel_filters[7][16];
+
+#endif /* AVCODEC_HEVCDSP_H */
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
new file mode 100644
index 0000000000..6c4331a050
--- /dev/null
+++ b/libavcodec/hevcdsp_template.c
@@ -0,0 +1,1366 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "get_bits.h"
+#include "bit_depth_template.c"
+#include "hevcdsp.h"
+#include "hevc.h"
+
+/*
+ * Assignment helpers passed to the TR_* transform macros as their "assign"
+ * argument. SCALE and ADD_AND_SCALE read the variables `add` and `shift`
+ * from whatever scope they are expanded in.
+ */
+/* Store x unchanged. */
+#define SET(dst, x) (dst) = (x)
+/* Store (x + add) >> shift, clipped with av_clip_int16(). */
+#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
+/* Add the av_clip_int16()-clipped (x + add) >> shift to dst, then clip the sum with av_clip_pixel(). */
+#define ADD_AND_SCALE(dst, x) (dst) = av_clip_pixel((dst) + av_clip_int16(((x) + add) >> shift))
+
+/**
+ * Read a size x size block of PCM samples from the bitstream into dst.
+ *
+ * Each sample is pcm_bit_depth bits wide and is left-shifted by
+ * (BIT_DEPTH - pcm_bit_depth) before being stored.
+ *
+ * @param _dst          top-left destination pixel
+ * @param _stride       destination line size in bytes
+ * @param size          width and height of the block, in samples
+ * @param gb            bit reader the samples are taken from
+ * @param pcm_bit_depth number of bits read per sample
+ */
+static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int size,
+                          GetBitContext *gb, int pcm_bit_depth)
+{
+    pixel *row = (pixel *)_dst;
+    const ptrdiff_t pitch = _stride / sizeof(pixel);
+    const int upshift = BIT_DEPTH - pcm_bit_depth;
+    int line, col;
+
+    for (line = 0; line < size; line++, row += pitch)
+        for (col = 0; col < size; col++)
+            row[col] = get_bits(gb, pcm_bit_depth) << upshift;
+}
+
+/**
+ * Add a 4x4 block of residuals to the destination pixels.
+ *
+ * The sums are stored back without clipping.
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  16 residual values in raster order
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    pixel *row = (pixel *)_dst;
+    const ptrdiff_t pitch = _stride / sizeof(pixel);
+    int line, col;
+
+    for (line = 0; line < 4; line++, row += pitch, coeffs += 4)
+        for (col = 0; col < 4; col++)
+            row[col] += coeffs[col];
+}
+
+/**
+ * Add an 8x8 block of residuals to the destination pixels.
+ *
+ * The sums are stored back without clipping.
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  64 residual values in raster order
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    pixel *row = (pixel *)_dst;
+    const ptrdiff_t pitch = _stride / sizeof(pixel);
+    int line, col;
+
+    for (line = 0; line < 8; line++, row += pitch, coeffs += 8)
+        for (col = 0; col < 8; col++)
+            row[col] += coeffs[col];
+}
+
+/**
+ * Add a 16x16 block of residuals to the destination pixels.
+ *
+ * The sums are stored back without clipping.
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  256 residual values in raster order
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transquant_bypass16x16)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    pixel *row = (pixel *)_dst;
+    const ptrdiff_t pitch = _stride / sizeof(pixel);
+    int line, col;
+
+    for (line = 0; line < 16; line++, row += pitch, coeffs += 16)
+        for (col = 0; col < 16; col++)
+            row[col] += coeffs[col];
+}
+
+/**
+ * Add a 32x32 block of residuals to the destination pixels.
+ *
+ * The sums are stored back without clipping.
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  1024 residual values in raster order
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transquant_bypass32x32)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    pixel *row = (pixel *)_dst;
+    const ptrdiff_t pitch = _stride / sizeof(pixel);
+    int line, col;
+
+    for (line = 0; line < 32; line++, row += pitch, coeffs += 32)
+        for (col = 0; col < 32; col++)
+            row[col] += coeffs[col];
+}
+
+/**
+ * Add a 4x4 transform-skipped residual to the destination block.
+ *
+ * Every coefficient is rounded and right-shifted by (13 - BIT_DEPTH), added
+ * to the corresponding pixel, and the sum is clipped with av_clip_pixel().
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  16 residual values in raster order
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    pixel *dst = (pixel*)_dst;
+    ptrdiff_t stride = _stride / sizeof(pixel);
+    int shift = 13 - BIT_DEPTH;
+    /* The rounding term needs shift >= 1: at BIT_DEPTH == 13 the expression
+     * 1 << (shift - 1) would use a negative shift count. */
+#if BIT_DEPTH < 13
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+    int x, y;
+
+    /* Only the 4x4 size is handled; the block size is fixed here. */
+    for (y = 0; y < 4; y++) {
+        for (x = 0; x < 4; x++)
+            dst[x] = av_clip_pixel(dst[x] + ((coeffs[x] + offset) >> shift));
+        coeffs += 4;
+        dst    += stride;
+    }
+}
+
+/**
+ * Two-pass 4x4 inverse transform using the 29/55/74 coefficient set, with
+ * the result added to the destination block.
+ * NOTE(review): presumably the HEVC 4x4 luma DST -- confirm against the spec.
+ *
+ * The first pass runs down each column of coeffs in place (shift 7); the
+ * second pass runs along each row (shift 20 - BIT_DEPTH) and accumulates
+ * into dst through ADD_AND_SCALE.
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  16 coefficients in raster order; overwritten by the first pass
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    /*
+     * One 4-point pass. c0..c3 and the dst[2*step] expression read every
+     * input before the first store, and the remaining stores only use
+     * c0..c3, so dst may be the same array as src.
+     */
+#define TR_4x4_LUMA(dst, src, step, assign)                                     \
+    do {                                                                        \
+        int c0 = src[0*step] + src[2*step];                                     \
+        int c1 = src[2*step] + src[3*step];                                     \
+        int c2 = src[0*step] - src[3*step];                                     \
+        int c3 = 74 * src[1*step];                                              \
+                                                                                \
+        assign(dst[2*step], 74 * (src[0*step] - src[2*step] + src[3*step]));    \
+        assign(dst[0*step], 29 * c0 + 55 * c1 + c3);                            \
+        assign(dst[1*step], 55 * c2 - 29 * c1 + c3);                            \
+        assign(dst[3*step], 55 * c0 + 29 * c2 - c3);                            \
+    } while (0)
+
+    int i;
+    pixel *dst = (pixel*)_dst;
+    ptrdiff_t stride = _stride / sizeof(pixel);
+    /* shift and add are read by the SCALE / ADD_AND_SCALE macros */
+    int shift = 7;
+    int add = 1 << (shift - 1);
+    int16_t *src = coeffs;
+
+    /* first pass: columns, element step 4, in place */
+    for (i = 0; i < 4; i++) {
+        TR_4x4_LUMA(src, src, 4, SCALE);
+        src++;
+    }
+
+    /* second pass: rows, result added to the destination pixels */
+    shift = 20 - BIT_DEPTH;
+    add = 1 << (shift - 1);
+    for (i = 0; i < 4; i++) {
+        TR_4x4_LUMA(dst, coeffs, 1, ADD_AND_SCALE);
+        coeffs += 4;
+        dst += stride;
+    }
+
+#undef TR_4x4_LUMA
+}
+
+/*
+ * 4-point butterfly pass over src (element step sstep) into dst (element
+ * step dstep). The even part uses rows 0 and 16 of the `transform` table,
+ * the odd part rows 8 and 24; only columns 0 and 1 are read. All inputs are
+ * read before the first store. `assign` is one of SET / SCALE /
+ * ADD_AND_SCALE.
+ */
+#define TR_4(dst, src, dstep, sstep, assign)                                    \
+    do {                                                                        \
+        const int e0 = transform[8*0][0] * src[0*sstep] +                       \
+                       transform[8*2][0] * src[2*sstep];                        \
+        const int e1 = transform[8*0][1] * src[0*sstep] +                       \
+                       transform[8*2][1] * src[2*sstep];                        \
+        const int o0 = transform[8*1][0] * src[1*sstep] +                       \
+                       transform[8*3][0] * src[3*sstep];                        \
+        const int o1 = transform[8*1][1] * src[1*sstep] +                       \
+                       transform[8*3][1] * src[3*sstep];                        \
+                                                                                \
+        assign(dst[0*dstep], e0 + o0);                                          \
+        assign(dst[1*dstep], e1 + o1);                                          \
+        assign(dst[2*dstep], e1 - o1);                                          \
+        assign(dst[3*dstep], e0 - o0);                                          \
+    } while (0)
+/* Column pass of a 4x4 block (step 4), stored through SCALE. */
+#define TR_4_1(dst, src) TR_4(dst, src, 4, 4, SCALE)
+/* Row pass (step 1), accumulated into dst through ADD_AND_SCALE. */
+#define TR_4_2(dst, src) TR_4(dst, src, 1, 1, ADD_AND_SCALE)
+
+/**
+ * Two-pass 4x4 inverse transform whose result is added to the destination.
+ *
+ * The column pass rewrites coeffs in place with a shift of 7; the row pass
+ * uses a shift of (20 - BIT_DEPTH) and accumulates into the pixels.
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  16 coefficients in raster order; overwritten
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    pixel *out = (pixel *)_dst;
+    const ptrdiff_t pitch = _stride / sizeof(pixel);
+    int16_t *col, *row;
+    int shift, add; /* these exact names are read by SCALE / ADD_AND_SCALE */
+
+    /* vertical pass, one column at a time */
+    shift = 7;
+    add   = 1 << (shift - 1);
+    for (col = coeffs; col < coeffs + 4; col++)
+        TR_4_1(col, col);
+
+    /* horizontal pass, one row at a time */
+    shift = 20 - BIT_DEPTH;
+    add   = 1 << (shift - 1);
+    for (row = coeffs; row < coeffs + 4 * 4; row += 4) {
+        TR_4_2(out, row);
+        out += pitch;
+    }
+}
+
+/*
+ * 8-, 16- and 32-point passes built recursively: the odd half is
+ * accumulated from the odd-indexed inputs, the even half comes from the
+ * next smaller macro applied to every second input (step 2*sstep) into a
+ * temporary array, and the two halves are combined as sum / difference.
+ *
+ * Each macro declares its own i, j and temporary arrays, so the dst / src
+ * arguments must be plain identifiers that do not use those names.
+ */
+/* 8-point pass; odd part reads rows 4*j of `transform`. */
+#define TR_8(dst, src, dstep, sstep, assign)                \
+    do {                                                    \
+        int i, j;                                           \
+        int e_8[4];                                         \
+        int o_8[4] = { 0 };                                 \
+        for (i = 0; i < 4; i++)                             \
+            for (j = 1; j < 8; j += 2)                      \
+                o_8[i] += transform[4*j][i] * src[j*sstep]; \
+        TR_4(e_8, src, 1, 2*sstep, SET);                    \
+                                                            \
+        for (i = 0; i < 4; i++) {                           \
+            assign(dst[i*dstep], e_8[i] + o_8[i]);          \
+            assign(dst[(7-i)*dstep], e_8[i] - o_8[i]);      \
+        }                                                   \
+    } while (0)
+/* 16-point pass; odd part reads rows 2*j of `transform`. */
+#define TR_16(dst, src, dstep, sstep, assign)                   \
+    do {                                                        \
+        int i, j;                                               \
+        int e_16[8];                                            \
+        int o_16[8] = { 0 };                                    \
+        for (i = 0; i < 8; i++)                                 \
+            for (j = 1; j < 16; j += 2)                         \
+                o_16[i] += transform[2*j][i] * src[j*sstep];    \
+        TR_8(e_16, src, 1, 2*sstep, SET);                       \
+                                                                \
+        for (i = 0; i < 8; i++) {                               \
+            assign(dst[i*dstep], e_16[i] + o_16[i]);            \
+            assign(dst[(15-i)*dstep], e_16[i] - o_16[i]);       \
+        }                                                       \
+    } while (0)
+/* 32-point pass; odd part reads rows j of `transform`. */
+#define TR_32(dst, src, dstep, sstep, assign)               \
+    do {                                                    \
+        int i, j;                                           \
+        int e_32[16];                                       \
+        int o_32[16] = { 0 };                               \
+        for (i = 0; i < 16; i++)                            \
+            for (j = 1; j < 32; j += 2)                     \
+                o_32[i] += transform[j][i] * src[j*sstep];  \
+        TR_16(e_32, src, 1, 2*sstep, SET);                \
+                                                            \
+        for (i = 0; i < 16; i++) {                          \
+            assign(dst[i*dstep], e_32[i] + o_32[i]);        \
+            assign(dst[(31-i)*dstep], e_32[i] - o_32[i]);   \
+        }                                                   \
+    } while (0)
+
+/* Column passes: element step equals the block width, stored through SCALE. */
+#define TR_8_1(dst, src) TR_8(dst, src, 8, 8, SCALE)
+#define TR_16_1(dst, src) TR_16(dst, src, 16, 16, SCALE)
+#define TR_32_1(dst, src) TR_32(dst, src, 32, 32, SCALE)
+
+/* Row passes: element step 1, accumulated into dst through ADD_AND_SCALE. */
+#define TR_8_2(dst, src) TR_8(dst, src, 1, 1, ADD_AND_SCALE)
+#define TR_16_2(dst, src) TR_16(dst, src, 1, 1, ADD_AND_SCALE)
+#define TR_32_2(dst, src) TR_32(dst, src, 1, 1, ADD_AND_SCALE)
+
+/**
+ * Two-pass 8x8 inverse transform whose result is added to the destination.
+ *
+ * The column pass rewrites coeffs in place with a shift of 7; the row pass
+ * uses a shift of (20 - BIT_DEPTH) and accumulates into the pixels.
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  64 coefficients in raster order; overwritten
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transform_8x8_add)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    pixel *out = (pixel *)_dst;
+    const ptrdiff_t pitch = _stride / sizeof(pixel);
+    int16_t *col, *row;
+    int shift, add; /* these exact names are read by SCALE / ADD_AND_SCALE */
+
+    /* vertical pass, one column at a time */
+    shift = 7;
+    add   = 1 << (shift - 1);
+    for (col = coeffs; col < coeffs + 8; col++)
+        TR_8_1(col, col);
+
+    /* horizontal pass, one row at a time */
+    shift = 20 - BIT_DEPTH;
+    add   = 1 << (shift - 1);
+    for (row = coeffs; row < coeffs + 8 * 8; row += 8) {
+        TR_8_2(out, row);
+        out += pitch;
+    }
+}
+
+/**
+ * Two-pass 16x16 inverse transform whose result is added to the destination.
+ *
+ * The column pass rewrites coeffs in place with a shift of 7; the row pass
+ * uses a shift of (20 - BIT_DEPTH) and accumulates into the pixels.
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  256 coefficients in raster order; overwritten
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transform_16x16_add)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    pixel *out = (pixel *)_dst;
+    const ptrdiff_t pitch = _stride / sizeof(pixel);
+    int16_t *col, *row;
+    int shift, add; /* these exact names are read by SCALE / ADD_AND_SCALE */
+
+    /* vertical pass, one column at a time */
+    shift = 7;
+    add   = 1 << (shift - 1);
+    for (col = coeffs; col < coeffs + 16; col++)
+        TR_16_1(col, col);
+
+    /* horizontal pass, one row at a time */
+    shift = 20 - BIT_DEPTH;
+    add   = 1 << (shift - 1);
+    for (row = coeffs; row < coeffs + 16 * 16; row += 16) {
+        TR_16_2(out, row);
+        out += pitch;
+    }
+}
+
+/**
+ * Two-pass 32x32 inverse transform whose result is added to the destination.
+ *
+ * The column pass rewrites coeffs in place with a shift of 7; the row pass
+ * uses a shift of (20 - BIT_DEPTH) and accumulates into the pixels.
+ *
+ * @param _dst    top-left destination pixel
+ * @param coeffs  1024 coefficients in raster order; overwritten
+ * @param _stride destination line size in bytes
+ */
+static void FUNC(transform_32x32_add)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+{
+    int i;
+    pixel *dst = (pixel*)_dst;
+    ptrdiff_t stride = _stride / sizeof(pixel);
+    /* shift and add are read by the SCALE / ADD_AND_SCALE macros */
+    int shift = 7;
+    int add = 1 << (shift - 1);
+    int16_t *src = coeffs;
+
+    /* vertical pass, one column at a time, in place */
+    for (i = 0; i < 32; i++) {
+        TR_32_1(src, src);
+        src++;
+    }
+
+    /* horizontal pass, one row at a time, added to the pixels */
+    shift = 20 - BIT_DEPTH;
+    add   = 1 << (shift - 1);
+    for (i = 0; i < 32; i++) {
+        TR_32_2(dst, coeffs);
+        coeffs += 32;
+        dst += stride;
+    }
+}
+
+/**
+ * Apply SAO band offsets to one region of a block.
+ *
+ * A pixel's band is src >> (BIT_DEPTH - 5). The four consecutive bands
+ * starting at sao->band_position[c_idx] (wrapping modulo 32) receive
+ * sao->offset_val[c_idx][1..4]; every other band gets offset 0.
+ *
+ * With M_w = (8 >> chroma) + 2 and M_h = (4 >> chroma) + 2, `class` selects
+ * the region:
+ *   0: starts at dst; M_w columns / M_h rows are dropped from the right /
+ *      bottom unless borders[2] / borders[3] is set
+ *   1: the M_h rows above dst, right side trimmed as in class 0
+ *   2: the M_w columns left of dst, bottom trimmed as in class 0
+ *   3: the M_w x M_h corner above and to the left of dst
+ *
+ * @param _dst    destination, addressed relative to the block origin
+ * @param _src    source, same layout as the destination
+ * @param _stride line size in bytes, shared by source and destination
+ * @param sao     SAO parameters (offset_val, band_position)
+ * @param borders flags; only entries 2 and 3 are read here
+ * @param width   block width in pixels
+ * @param height  block height in pixels
+ * @param c_idx   plane index; non-zero is treated as chroma
+ * @param class   region selector, see above
+ */
+static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
+                                  ptrdiff_t _stride, SAOParams *sao,
+                                  int *borders, int width, int height,
+                                  int c_idx, int class)
+{
+    const ptrdiff_t pitch  = _stride / sizeof(pixel);
+    const int chroma       = !!c_idx;
+    const int margin_w     = (8 >> chroma) + 2;
+    const int margin_h     = (4 >> chroma) + 2;
+    const int band_shift   = BIT_DEPTH - 5;
+    int *offsets           = sao->offset_val[c_idx];
+    int first_band         = sao->band_position[c_idx];
+    int band_offset[32]    = { 0 };
+    int start_x = 0, start_y = 0;
+    pixel *out, *in;
+    int band, row, col;
+
+    switch (class) {
+    case 0:
+        if (!borders[2])
+            width -= margin_w;
+        if (!borders[3])
+            height -= margin_h;
+        break;
+    case 1:
+        start_y = -margin_h;
+        height  = margin_h;
+        if (!borders[2])
+            width -= margin_w;
+        break;
+    case 2:
+        start_x = -margin_w;
+        width   = margin_w;
+        if (!borders[3])
+            height -= margin_h;
+        break;
+    case 3:
+        start_x = -margin_w;
+        start_y = -margin_h;
+        width   = margin_w;
+        height  = margin_h;
+        break;
+    }
+
+    out = (pixel *)_dst + (start_y * pitch + start_x);
+    in  = (pixel *)_src + (start_y * pitch + start_x);
+
+    for (band = 0; band < 4; band++)
+        band_offset[(band + first_band) & 31] = offsets[band + 1];
+
+    for (row = 0; row < height; row++, out += pitch, in += pitch)
+        for (col = 0; col < width; col++)
+            out[col] = av_clip_pixel(in[col] +
+                                     band_offset[av_clip_pixel(in[col] >> band_shift)]);
+}
+
+/* Forwards to sao_band_filter() with class 0 (region starting at dst). */
+static void FUNC(sao_band_filter_0)(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, SAOParams *sao,
+                                    int *borders, int width, int height,
+                                    int c_idx)
+{
+    FUNC(sao_band_filter)(dst, src, stride, sao, borders, width, height, c_idx, 0);
+}
+
+/* Forwards to sao_band_filter() with class 1 (rows above dst). */
+static void FUNC(sao_band_filter_1)(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, SAOParams *sao,
+                                    int *borders, int width, int height,
+                                    int c_idx)
+{
+    FUNC(sao_band_filter)(dst, src, stride, sao, borders, width, height, c_idx, 1);
+}
+
+/* Forwards to sao_band_filter() with class 2 (columns left of dst). */
+static void FUNC(sao_band_filter_2)(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, SAOParams *sao,
+                                    int *borders, int width, int height,
+                                    int c_idx)
+{
+    FUNC(sao_band_filter)(dst, src, stride, sao, borders, width, height, c_idx, 2);
+}
+
+/* Forwards to sao_band_filter() with class 3 (corner above and left of dst). */
+static void FUNC(sao_band_filter_3)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t _stride, SAOParams *sao,
+                                    int *borders, int width, int height,
+                                    int c_idx)
+{
+    FUNC(sao_band_filter)(_dst, _src, _stride, sao, borders, width, height, c_idx, 3);
+}
+
+/**
+ * Apply SAO edge offsets to the region that starts at the block origin.
+ *
+ * The region is _width x _height, minus (8 >> chroma) + 2 columns on the
+ * right unless borders[2] is set and minus (4 >> chroma) + 2 rows at the
+ * bottom unless borders[3] is set. Each pixel is compared with the two
+ * neighbours selected by sao->eo_class[c_idx]; the two comparison signs pick
+ * an entry of sao->offset_val[c_idx] through edge_idx.
+ *
+ * @param _dst       destination block origin
+ * @param _src       source block origin, same layout as the destination
+ * @param _stride    line size in bytes, shared by source and destination
+ * @param sao        SAO parameters (offset_val, eo_class)
+ * @param borders    flags for the left [0], top [1], right [2] and bottom [3]
+ *                   line of the region: a flagged line gets offset_val[0]
+ *                   instead of being classified
+ *                   NOTE(review): presumably set at picture boundaries --
+ *                   confirm against the caller
+ * @param _width     block width in pixels
+ * @param _height    block height in pixels
+ * @param c_idx      plane index; non-zero is treated as chroma
+ * @param vert_edge  if set, the first column is copied back from src
+ * @param horiz_edge if set, the first row is copied back from src
+ * @param diag_edge  if set (135 degree class only), the top-left pixel is
+ *                   copied back from src
+ */
+static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t _stride, SAOParams *sao,
+                                    int *borders, int _width, int _height,
+                                    int c_idx,
+                                    uint8_t vert_edge, uint8_t horiz_edge, uint8_t diag_edge)
+{
+    int x, y;
+    pixel *dst = (pixel*)_dst;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t stride = _stride / sizeof(pixel);
+    int chroma = !!c_idx;
+    int *sao_offset_val = sao->offset_val[c_idx];
+    int sao_eo_class = sao->eo_class[c_idx];
+
+    /* (dx, dy) of the two neighbours, indexed by edge class */
+    static const int8_t pos[4][2][2] = {
+        {{ -1,  0}, { 1, 0}}, // horizontal
+        {{  0, -1}, { 0, 1}}, // vertical
+        {{ -1, -1}, { 1, 1}}, // 45 degree
+        {{  1, -1}, {-1, 1}}, // 135 degree
+    };
+    /* maps 2 + diff0 + diff1 (0..4) to an index into sao_offset_val */
+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+
+    int init_x = 0, init_y = 0, width = _width, height = _height;
+
+/* three-way comparison: 1, 0 or -1 */
+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+
+    if (!borders[2])
+        width -= (8 >> chroma) + 2;
+    if (!borders[3])
+        height -= (4 >> chroma) + 2;
+
+    /* init_x and init_y are both 0 here, so the pointers do not move */
+    dst = dst + (init_y * stride + init_x);
+    src = src + (init_y * stride + init_x);
+    init_y = init_x = 0;
+    if (sao_eo_class != SAO_EO_VERT) {
+        /* flagged left / right columns: plain offset_val[0], then excluded
+         * from the classified area */
+        if (borders[0]) {
+            int offset_val = sao_offset_val[0];
+            int y_stride   = 0;
+            for (y = 0; y < height; y++) {
+                dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
+                y_stride += stride;
+            }
+            init_x = 1;
+        }
+        if (borders[2]) {
+            int offset_val = sao_offset_val[0];
+            int x_stride   = width - 1;
+            /* x counts rows in this loop */
+            for (x = 0; x < height; x++) {
+                dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
+                x_stride += stride;
+            }
+            width --;
+        }
+
+    }
+    if (sao_eo_class != SAO_EO_HORIZ ) {
+        /* flagged top / bottom rows: plain offset_val[0], then excluded
+         * from the classified area */
+        if (borders[1]){
+            int offset_val = sao_offset_val[0];
+            for (x = init_x; x < width; x++) {
+                dst[x] = av_clip_pixel(src[x] + offset_val);
+            }
+            init_y = 1;
+        }
+        if (borders[3]){
+            int offset_val = sao_offset_val[0];
+            int y_stride   = stride * (height - 1);
+            for (x = init_x; x < width; x++) {
+                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
+            }
+            height--;
+        }
+    }
+    {
+        /* classify every remaining pixel against its two neighbours */
+        int y_stride     = init_y * stride;
+        int pos_0_0      = pos[sao_eo_class][0][0];
+        int pos_0_1      = pos[sao_eo_class][0][1];
+        int pos_1_0      = pos[sao_eo_class][1][0];
+        int pos_1_1      = pos[sao_eo_class][1][1];
+
+        int y_stride_0_1 = (init_y + pos_0_1) * stride;
+        int y_stride_1_1 = (init_y + pos_1_1) * stride;
+        for (y = init_y; y < height; y++) {
+            for (x = init_x; x < width; x++) {
+                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
+                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
+                int offset_val    = edge_idx[2 + diff0 + diff1];
+                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
+            }
+            y_stride     += stride;
+            y_stride_0_1 += stride;
+            y_stride_1_1 += stride;
+        }
+    }
+
+    {
+        // Restore pixels that can't be modified
+        /* save_upper_left makes both restore loops skip the top-left pixel */
+        int save_upper_left = !diag_edge && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
+        if (vert_edge && sao_eo_class != SAO_EO_VERT)
+            for (y = init_y+save_upper_left; y< height; y++)
+                dst[y*stride] = src[y*stride];
+        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
+            for(x = init_x+save_upper_left; x<width; x++)
+                dst[x] = src[x];
+        if(diag_edge && sao_eo_class == SAO_EO_135D)
+            dst[0] = src[0];
+    }
+
+#undef CMP
+}
+
+/**
+ * Apply SAO edge offsets to the strip of rows directly above the block
+ * origin.
+ *
+ * The strip is (4 >> chroma) + 2 rows high and _width wide, minus
+ * (8 >> chroma) + 2 columns on the right unless borders[2] is set. _height
+ * is overwritten before use. Classification works as in the other edge
+ * filters: two neighbour comparisons select an entry of
+ * sao->offset_val[c_idx] through edge_idx.
+ *
+ * @param _dst       destination block origin (the strip lies above it)
+ * @param _src       source block origin, same layout as the destination
+ * @param _stride    line size in bytes, shared by source and destination
+ * @param sao        SAO parameters (offset_val, eo_class)
+ * @param borders    flags; [0] / [2] make the left / right column of the
+ *                   strip take offset_val[0] instead of being classified
+ *                   NOTE(review): presumably set at picture boundaries --
+ *                   confirm against the caller
+ * @param _width     block width in pixels
+ * @param _height    not used for the region size
+ * @param c_idx      plane index; non-zero is treated as chroma
+ * @param vert_edge  if set, the first column of the strip is copied back
+ * @param horiz_edge if set, the last row of the strip is copied back
+ * @param diag_edge  if set (45 degree class only), the bottom-left pixel of
+ *                   the strip is copied back
+ */
+static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t _stride, SAOParams *sao,
+                                    int *borders, int _width, int _height,
+                                    int c_idx,
+                                    uint8_t vert_edge, uint8_t horiz_edge, uint8_t diag_edge)
+{
+    int x, y;
+    pixel *dst = (pixel*)_dst;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t stride = _stride / sizeof(pixel);
+    int chroma = !!c_idx;
+    int *sao_offset_val = sao->offset_val[c_idx];
+    int sao_eo_class = sao->eo_class[c_idx];
+
+    /* (dx, dy) of the two neighbours, indexed by edge class */
+    static const int8_t pos[4][2][2] = {
+        {{ -1,  0}, { 1, 0 }}, // horizontal
+        {{  0, -1}, { 0, 1 }}, // vertical
+        {{ -1, -1}, { 1, 1 }}, // 45 degree
+        {{  1, -1}, {-1, 1 }}, // 135 degree
+    };
+    /* maps 2 + diff0 + diff1 (0..4) to an index into sao_offset_val */
+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+
+    int init_x = 0, init_y = 0, width = _width, height = _height;
+
+/* three-way comparison: 1, 0 or -1 */
+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+
+    init_y = -(4 >> chroma) - 2;
+    if (!borders[2])
+        width -= (8 >> chroma) + 2;
+    height = (4 >> chroma) + 2;
+
+    /* move both pointers up to the first row of the strip */
+    dst = dst + (init_y * stride + init_x);
+    src = src + (init_y * stride + init_x);
+    init_y = init_x = 0;
+    if (sao_eo_class != SAO_EO_VERT) {
+        /* flagged left / right columns: plain offset_val[0], then excluded
+         * from the classified area */
+        if (borders[0]) {
+            int offset_val = sao_offset_val[0];
+            int y_stride   = 0;
+            for (y = 0; y < height; y++) {
+                dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
+                y_stride += stride;
+            }
+            init_x = 1;
+        }
+        if (borders[2]) {
+            int offset_val = sao_offset_val[0];
+            int x_stride   = width - 1;
+            /* x counts rows in this loop */
+            for (x = 0; x < height; x++) {
+                dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
+                x_stride += stride;
+            }
+            width--;
+        }
+
+    }
+    {
+        /* classify every remaining pixel against its two neighbours */
+        int y_stride     = init_y * stride;
+        int pos_0_0      = pos[sao_eo_class][0][0];
+        int pos_0_1      = pos[sao_eo_class][0][1];
+        int pos_1_0      = pos[sao_eo_class][1][0];
+        int pos_1_1      = pos[sao_eo_class][1][1];
+
+        int y_stride_0_1 = (init_y + pos_0_1) * stride;
+        int y_stride_1_1 = (init_y + pos_1_1) * stride;
+        for (y = init_y; y < height; y++) {
+            for (x = init_x; x < width; x++) {
+                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
+                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
+                int offset_val    = edge_idx[2 + diff0 + diff1];
+                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
+            }
+            y_stride     += stride;
+            y_stride_0_1 += stride;
+            y_stride_1_1 += stride;
+        }
+    }
+
+    {
+        // Restore pixels that can't be modified
+        /* save_lower_left makes both restore loops skip the bottom-left pixel */
+        int save_lower_left = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[0];
+        if(vert_edge && sao_eo_class != SAO_EO_VERT)
+            for(y = init_y; y< height-save_lower_left; y++)
+                dst[y*stride] = src[y*stride];
+        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
+            for(x = init_x+save_lower_left; x<width; x++)
+                dst[(height-1)*stride+x] = src[(height-1)*stride+x];
+        if(diag_edge && sao_eo_class == SAO_EO_45D)
+            dst[stride*(height-1)] = src[stride*(height-1)];
+    }
+
+#undef CMP
+}
+
+/**
+ * Apply SAO edge offsets to the strip of columns directly left of the block
+ * origin.
+ *
+ * The strip is (8 >> chroma) + 2 columns wide and _height high, minus
+ * (4 >> chroma) + 2 rows at the bottom unless borders[3] is set. _width is
+ * overwritten before use. Classification works as in the other edge
+ * filters: two neighbour comparisons select an entry of
+ * sao->offset_val[c_idx] through edge_idx.
+ *
+ * @param _dst       destination block origin (the strip lies left of it)
+ * @param _src       source block origin, same layout as the destination
+ * @param _stride    line size in bytes, shared by source and destination
+ * @param sao        SAO parameters (offset_val, eo_class)
+ * @param borders    flags; [1] / [3] make the top / bottom row of the strip
+ *                   take offset_val[0] instead of being classified
+ *                   NOTE(review): presumably set at picture boundaries --
+ *                   confirm against the caller
+ * @param _width     not used for the region size
+ * @param _height    block height in pixels
+ * @param c_idx      plane index; non-zero is treated as chroma
+ * @param vert_edge  if set, the last column of the strip is copied back
+ * @param horiz_edge if set, the first row of the strip is copied back
+ * @param diag_edge  if set (45 degree class only), the top-right pixel of
+ *                   the strip is copied back
+ */
+static void FUNC(sao_edge_filter_2)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t _stride, SAOParams *sao,
+                                    int *borders, int _width, int _height,
+                                    int c_idx,
+                                    uint8_t vert_edge, uint8_t horiz_edge, uint8_t diag_edge)
+{
+    int x, y;
+    pixel *dst = (pixel*)_dst;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t stride = _stride / sizeof(pixel);
+    int chroma = !!c_idx;
+    int *sao_offset_val = sao->offset_val[c_idx];
+    int sao_eo_class = sao->eo_class[c_idx];
+
+    /* (dx, dy) of the two neighbours, indexed by edge class */
+    static const int8_t pos[4][2][2] = {
+        {{ -1,  0}, { 1, 0}}, // horizontal
+        {{  0, -1}, { 0, 1}}, // vertical
+        {{ -1, -1}, { 1, 1}}, // 45 degree
+        {{  1, -1}, {-1, 1}}, // 135 degree
+    };
+    /* maps 2 + diff0 + diff1 (0..4) to an index into sao_offset_val */
+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+
+    int init_x = 0, init_y = 0, width = _width, height = _height;
+
+/* three-way comparison: 1, 0 or -1 */
+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+
+    init_x = -(8 >> chroma) - 2;
+    width  =  (8 >> chroma) + 2;
+    if (!borders[3])
+        height -= (4 >> chroma) + 2;
+
+    /* move both pointers left to the first column of the strip */
+    dst = dst + (init_y * stride + init_x);
+    src = src + (init_y * stride + init_x);
+    init_y = init_x = 0;
+    if (sao_eo_class != SAO_EO_HORIZ) {
+        /* flagged top / bottom rows: plain offset_val[0], then excluded
+         * from the classified area */
+        if (borders[1]){
+            int offset_val = sao_offset_val[0];
+            for (x = init_x; x < width; x++) {
+                dst[x] = av_clip_pixel(src[x] + offset_val);
+            }
+            init_y = 1;
+        }
+        if (borders[3]){
+            int offset_val = sao_offset_val[0];
+            int y_stride   = stride * (height - 1);
+            for (x = init_x; x < width; x++) {
+                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
+            }
+            height--;
+        }
+    }
+    {
+        /* classify every remaining pixel against its two neighbours */
+        int y_stride     = init_y * stride;
+        int pos_0_0      = pos[sao_eo_class][0][0];
+        int pos_0_1      = pos[sao_eo_class][0][1];
+        int pos_1_0      = pos[sao_eo_class][1][0];
+        int pos_1_1      = pos[sao_eo_class][1][1];
+
+        int y_stride_0_1 = (init_y + pos_0_1) * stride;
+        int y_stride_1_1 = (init_y + pos_1_1) * stride;
+        for (y = init_y; y < height; y++) {
+            for (x = init_x; x < width; x++) {
+                int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
+                int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
+                int offset_val    = edge_idx[2 + diff0 + diff1];
+                dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
+            }
+            y_stride     += stride;
+            y_stride_0_1 += stride;
+            y_stride_1_1 += stride;
+        }
+    }
+
+    {
+        // Restore pixels that can't be modified
+        /* save_upper_right makes both restore loops skip the top-right pixel */
+        int save_upper_right = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[1];
+        if(vert_edge && sao_eo_class != SAO_EO_VERT)
+            for(y = init_y+save_upper_right; y< height; y++)
+                dst[y*stride+width-1] = src[y*stride+width-1];
+        if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
+            for(x = init_x; x<width-save_upper_right; x++)
+                dst[x] = src[x];
+        if(diag_edge && sao_eo_class == SAO_EO_45D)
+            dst[width-1] = src[width-1];
+    }
+#undef CMP
+}
+
+/**
+ * Apply SAO edge offsets to the corner region above and to the left of the
+ * block origin.
+ *
+ * The region is (8 >> chroma) + 2 columns by (4 >> chroma) + 2 rows and
+ * ends just before the block origin. Each pixel is compared with the two
+ * neighbours selected by sao->eo_class[c_idx]; the two comparison signs pick
+ * an entry of sao->offset_val[c_idx].
+ *
+ * @param _dst       destination block origin (the region lies up-left of it)
+ * @param _src       source block origin, same layout as the destination
+ * @param _stride    line size in bytes, shared by source and destination
+ * @param sao        SAO parameters (offset_val, eo_class)
+ * @param borders    not read by this variant
+ * @param _width     not read by this variant
+ * @param _height    not read by this variant
+ * @param c_idx      plane index; non-zero is treated as chroma
+ * @param vert_edge  if set, the last column of the region is copied back
+ * @param horiz_edge if set, the last row of the region is copied back
+ * @param diag_edge  if set (135 degree class only), the bottom-right pixel
+ *                   of the region is copied back
+ */
+static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src,
+                                    ptrdiff_t _stride, SAOParams *sao,
+                                    int *borders, int _width, int _height,
+                                    int c_idx,
+                                    uint8_t vert_edge, uint8_t horiz_edge, uint8_t diag_edge)
+{
+    /* (dx, dy) of the two neighbours, indexed by edge class */
+    static const int8_t neighbour[4][2][2] = {
+        { { -1,  0 }, {  1, 0 } }, // horizontal
+        { {  0, -1 }, {  0, 1 } }, // vertical
+        { { -1, -1 }, {  1, 1 } }, // 45 degree
+        { {  1, -1 }, { -1, 1 } }, // 135 degree
+    };
+    /* maps 2 + sign0 + sign1 (0..4) to an index into the offset list */
+    static const uint8_t category[] = { 1, 2, 0, 3, 4 };
+
+/* three-way comparison: 1, 0 or -1 */
+#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
+
+    const ptrdiff_t stride = _stride / sizeof(pixel);
+    const int chroma       = !!c_idx;
+    const int width        = (8 >> chroma) + 2;
+    const int height       = (4 >> chroma) + 2;
+    int *sao_offset_val    = sao->offset_val[c_idx];
+    int sao_eo_class       = sao->eo_class[c_idx];
+    /* step back to the top-left pixel of the corner region */
+    pixel *dst = (pixel *)_dst - (height * stride + width);
+    pixel *src = (pixel *)_src - (height * stride + width);
+    int x, y;
+
+    {
+        const int8_t (*nb)[2] = neighbour[sao_eo_class];
+        pixel *cur   = src;
+        pixel *first = src + nb[0][1] * stride + nb[0][0];
+        pixel *other = src + nb[1][1] * stride + nb[1][0];
+        pixel *out   = dst;
+
+        for (y = 0; y < height; y++) {
+            for (x = 0; x < width; x++) {
+                int sign0 = CMP(cur[x], first[x]);
+                int sign1 = CMP(cur[x], other[x]);
+                out[x] = av_clip_pixel(cur[x] + sao_offset_val[category[2 + sign0 + sign1]]);
+            }
+            cur   += stride;
+            first += stride;
+            other += stride;
+            out   += stride;
+        }
+    }
+
+    {
+        // Restore pixels that can't be modified
+        /* keep_corner makes both restore loops skip the bottom-right pixel */
+        const int keep_corner    = !diag_edge && sao_eo_class == SAO_EO_135D;
+        const ptrdiff_t last_row = (height - 1) * stride;
+        const int last_col       = width - 1;
+
+        if (vert_edge && sao_eo_class != SAO_EO_VERT)
+            for (y = 0; y < height - keep_corner; y++)
+                dst[y * stride + last_col] = src[y * stride + last_col];
+        if (horiz_edge && sao_eo_class != SAO_EO_HORIZ)
+            for (x = 0; x < width - keep_corner; x++)
+                dst[last_row + x] = src[last_row + x];
+        if (diag_edge && sao_eo_class == SAO_EO_135D)
+            dst[last_row + last_col] = src[last_row + last_col];
+    }
+#undef CMP
+}
+
+#undef SET
+#undef SCALE
+#undef ADD_AND_SCALE
+#undef TR_4
+#undef TR_4_1
+#undef TR_4_2
+#undef TR_8
+#undef TR_8_1
+#undef TR_8_2
+#undef TR_16
+#undef TR_16_1
+#undef TR_16_2
+#undef TR_32
+#undef TR_32_1
+#undef TR_32_2
+
+/**
+ * Copy a width x height block of pixels into a 16-bit buffer, left-shifting
+ * every sample by (14 - BIT_DEPTH).
+ *
+ * @param dst        destination buffer
+ * @param dststride  destination line size in int16_t elements
+ * @param _src       top-left source pixel
+ * @param _srcstride source line size in bytes
+ * @param width      block width in pixels
+ * @param height     block height in pixels
+ * @param mcbuffer   not read by this variant
+ */
+static void FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
+                                       uint8_t *_src, ptrdiff_t _srcstride,
+                                       int width, int height, int16_t* mcbuffer)
+{
+    pixel *in = (pixel *)_src;
+    const ptrdiff_t in_pitch = _srcstride / sizeof(pixel);
+    const int upshift = 14 - BIT_DEPTH;
+    int row, col;
+
+    for (row = 0; row < height; row++, in += in_pitch, dst += dststride)
+        for (col = 0; col < width; col++)
+            dst[col] = in[col] << upshift;
+}
+
+/*
+ * Filter kernels QPEL_FILTER_n (n = 1..3). Each macro expands to the
+ * unnormalised weighted sum of the samples around src[x], spaced `stride`
+ * elements apart; a variable named x must be in scope where the macro is
+ * used. The coefficients of every kernel sum to 64.
+ * Sample reach relative to x: filter 1 uses -3..+3, filter 2 uses -3..+4,
+ * filter 3 uses -2..+4 (in units of stride).
+ * The macro arguments are parenthesised so that expression arguments
+ * expand safely.
+ */
+#define QPEL_FILTER_1(src, stride)                                              \
+    (-(src)[x - 3 * (stride)] + 4 * (src)[x - 2 * (stride)] -                   \
+     10 * (src)[x - (stride)] + 58 * (src)[x] + 17 * (src)[x + (stride)] -      \
+     5 * (src)[x + 2 * (stride)] + 1 * (src)[x + 3 * (stride)])
+#define QPEL_FILTER_2(src, stride)                                              \
+    (-(src)[x - 3 * (stride)] + 4 * (src)[x - 2 * (stride)] -                   \
+     11 * (src)[x - (stride)] + 40 * (src)[x] + 40 * (src)[x + (stride)] -      \
+     11 * (src)[x + 2 * (stride)] + 4 * (src)[x + 3 * (stride)] -               \
+     (src)[x + 4 * (stride)])
+#define QPEL_FILTER_3(src, stride)                                              \
+    ((src)[x - 2 * (stride)] - 5 * (src)[x - (stride)] + 17 * (src)[x] +        \
+     58 * (src)[x + (stride)] - 10 * (src)[x + 2 * (stride)] +                  \
+     4 * (src)[x + 3 * (stride)] - (src)[x + 4 * (stride)])
+
+
+/*
+ * Generates put_hevc_qpel_h<H>(): applies QPEL_FILTER_<H> along each row
+ * (sample spacing 1) for a width x height block and stores the sum, shifted
+ * right by (BIT_DEPTH - 8), into the int16_t destination.
+ * _srcstride is in bytes (it is divided by sizeof(pixel)); dststride is in
+ * int16_t elements. mcbuffer is not used.
+ * The kernel reads samples to the left and right of the block, so src must
+ * have that margin readable.
+ */
+#define PUT_HEVC_QPEL_H(H)                                                     \
+static void FUNC(put_hevc_qpel_h ## H)(int16_t *dst,  ptrdiff_t dststride,     \
+                                       uint8_t *_src, ptrdiff_t _srcstride,    \
+                                       int width, int height,                  \
+                                       int16_t* mcbuffer)                      \
+{                                                                              \
+    int x, y;                                                                  \
+    pixel *src = (pixel*)_src;                                                 \
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
+                                                                               \
+    for (y = 0; y < height; y++) {                                             \
+        for (x = 0; x < width; x++)                                            \
+            dst[x] = QPEL_FILTER_ ## H (src, 1) >> (BIT_DEPTH - 8);            \
+        src += srcstride;                                                      \
+        dst += dststride;                                                      \
+    }                                                                          \
+}
+
+/*
+ * Generates put_hevc_qpel_v<V>(): applies QPEL_FILTER_<V> down each column
+ * (sample spacing srcstride) for a width x height block and stores the sum,
+ * shifted right by (BIT_DEPTH - 8), into the int16_t destination.
+ * _srcstride is in bytes (it is divided by sizeof(pixel)); dststride is in
+ * int16_t elements. mcbuffer is not used.
+ * The kernel reads rows above and below the block, so src must have that
+ * margin readable.
+ */
+#define PUT_HEVC_QPEL_V(V)                                                     \
+static void FUNC(put_hevc_qpel_v ## V)(int16_t *dst,  ptrdiff_t dststride,     \
+                                       uint8_t *_src, ptrdiff_t _srcstride,    \
+                                       int width, int height,                  \
+                                       int16_t* mcbuffer)                      \
+{                                                                              \
+    int x, y;                                                                  \
+    pixel *src = (pixel*)_src;                                                 \
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
+                                                                               \
+    for (y = 0; y < height; y++)  {                                            \
+        for (x = 0; x < width; x++)                                            \
+            dst[x] = QPEL_FILTER_ ## V (src, srcstride) >> (BIT_DEPTH - 8);    \
+        src += srcstride;                                                      \
+        dst += dststride;                                                      \
+    }                                                                          \
+}
+
+/*
+ * Generates put_hevc_qpel_h<H>v<V>(): separable two-pass filter.
+ * Pass 1 applies QPEL_FILTER_<H> along rows, starting
+ * ff_hevc_qpel_extra_before[V] rows above src and covering
+ * height + ff_hevc_qpel_extra[V] rows; results (shifted right by
+ * BIT_DEPTH - 8) go to a stack buffer with a row pitch of MAX_PB_SIZE.
+ * Pass 2 applies QPEL_FILTER_<V> down the columns of that buffer and stores
+ * the sum shifted right by 6 into dst.
+ * mcbuffer is not used; the temporary lives on the stack.
+ * NOTE(review): assumes width <= MAX_PB_SIZE and
+ * height + ff_hevc_qpel_extra[V] <= MAX_PB_SIZE + 7 so that tmp_array is not
+ * overrun - confirm against the callers.
+ */
+#define PUT_HEVC_QPEL_HV(H, V)                                                       \
+static void FUNC(put_hevc_qpel_h ## H ## v ## V)(int16_t *dst,  ptrdiff_t dststride, \
+                                                 uint8_t *_src, ptrdiff_t _srcstride,\
+                                                 int width, int height,              \
+                                                 int16_t* mcbuffer)                  \
+{                                                                                    \
+    int x, y;                                                                        \
+    pixel *src = (pixel*)_src;                                                       \
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                                \
+                                                                                     \
+    int16_t tmp_array[(MAX_PB_SIZE + 7)*MAX_PB_SIZE];                                \
+    int16_t *tmp = tmp_array;                                                        \
+                                                                                     \
+    src -= ff_hevc_qpel_extra_before[V] * srcstride;                                 \
+                                                                                     \
+    for (y = 0; y < height + ff_hevc_qpel_extra[V]; y++) {                           \
+        for (x = 0; x < width; x++)                                                  \
+            tmp[x] = QPEL_FILTER_ ## H (src, 1) >> (BIT_DEPTH - 8);                  \
+        src += srcstride;                                                            \
+        tmp += MAX_PB_SIZE;                                                          \
+    }                                                                                \
+                                                                                     \
+    tmp = tmp_array + ff_hevc_qpel_extra_before[V] * MAX_PB_SIZE;                    \
+                                                                                     \
+    for (y = 0; y < height; y++) {                                                   \
+        for (x = 0; x < width; x++)                                                  \
+            dst[x] = QPEL_FILTER_ ## V (tmp, MAX_PB_SIZE) >> 6;                      \
+        tmp += MAX_PB_SIZE;                                                          \
+        dst += dststride;                                                            \
+    }                                                                                \
+}
+
+/*
+ * Instantiate the filtering functions for every combination of horizontal
+ * and vertical kernel index (1..3): three row-only, three column-only and
+ * nine two-pass variants.
+ */
+PUT_HEVC_QPEL_H(1)
+PUT_HEVC_QPEL_H(2)
+PUT_HEVC_QPEL_H(3)
+PUT_HEVC_QPEL_V(1)
+PUT_HEVC_QPEL_V(2)
+PUT_HEVC_QPEL_V(3)
+PUT_HEVC_QPEL_HV(1, 1)
+PUT_HEVC_QPEL_HV(1, 2)
+PUT_HEVC_QPEL_HV(1, 3)
+PUT_HEVC_QPEL_HV(2, 1)
+PUT_HEVC_QPEL_HV(2, 2)
+PUT_HEVC_QPEL_HV(2, 3)
+PUT_HEVC_QPEL_HV(3, 1)
+PUT_HEVC_QPEL_HV(3, 2)
+PUT_HEVC_QPEL_HV(3, 3)
+
+/**
+ * Copy a width x height block of pixels into the 16-bit intermediate buffer,
+ * scaling every sample up by (14 - BIT_DEPTH) bits. No filtering is applied.
+ *
+ * @param dst        output buffer of int16_t samples
+ * @param dststride  distance between output rows, in int16_t elements
+ * @param _src       input samples, accessed as pixel values
+ * @param _srcstride distance between input rows, in bytes
+ * @param width      number of samples converted per row
+ * @param height     number of rows converted
+ * @param mx         not used by this function
+ * @param my         not used by this function
+ * @param mcbuffer   not used by this function
+ */
+static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
+                                       uint8_t *_src, ptrdiff_t _srcstride,
+                                       int width, int height, int mx, int my,
+                                       int16_t* mcbuffer)
+{
+    const int       up_shift = 14 - BIT_DEPTH;
+    const ptrdiff_t src_step = _srcstride / sizeof(pixel);
+    const pixel    *src_row  = (pixel*)_src;
+    int row, col;
+
+    for (row = 0; row < height; row++, src_row += src_step, dst += dststride)
+        for (col = 0; col < width; col++)
+            dst[col] = src_row[col] << up_shift;
+}
+
+/*
+ * 4-tap filter: weighted sum of the samples at x - stride, x, x + stride and
+ * x + 2 * stride. Relies on the variables x and filter_0 .. filter_3 being
+ * in scope at the point of use. The macro arguments are parenthesised so
+ * that expression arguments expand safely.
+ */
+#define EPEL_FILTER(src, stride)                                               \
+    (filter_0 * (src)[x - (stride)] + filter_1 * (src)[x] +                    \
+     filter_2 * (src)[x + (stride)] + filter_3 * (src)[x + 2 * (stride)])
+
+/**
+ * Apply the 4-tap filter ff_hevc_epel_filters[mx - 1] along each row of a
+ * width x height block and store the sums, shifted right by
+ * (BIT_DEPTH - 8), into the int16_t destination.
+ *
+ * @param dst        output buffer of int16_t samples
+ * @param dststride  distance between output rows, in int16_t elements
+ * @param _src       input samples, accessed as pixel values
+ * @param _srcstride distance between input rows, in bytes
+ * @param width      number of samples produced per row
+ * @param height     number of rows produced
+ * @param mx         selects the filter (index mx - 1 into the filter table)
+ * @param my         not used by this function
+ * @param mcbuffer   not used by this function
+ */
+static void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int width, int height, int mx, int my,
+                                  int16_t* mcbuffer)
+{
+    const int8_t   *taps      = ff_hevc_epel_filters[mx - 1];
+    /* EPEL_FILTER picks these four up by name. */
+    const int8_t    filter_0  = taps[0];
+    const int8_t    filter_1  = taps[1];
+    const int8_t    filter_2  = taps[2];
+    const int8_t    filter_3  = taps[3];
+    const ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel          *src       = (pixel*)_src;
+    int x, y;
+
+    for (y = 0; y < height; y++, src += srcstride, dst += dststride)
+        for (x = 0; x < width; x++)
+            dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+}
+
+/**
+ * Apply the 4-tap filter ff_hevc_epel_filters[my - 1] down each column of a
+ * width x height block and store the sums, shifted right by
+ * (BIT_DEPTH - 8), into the int16_t destination.
+ *
+ * @param dst        output buffer of int16_t samples
+ * @param dststride  distance between output rows, in int16_t elements
+ * @param _src       input samples, accessed as pixel values
+ * @param _srcstride distance between input rows, in bytes
+ * @param width      number of samples produced per row
+ * @param height     number of rows produced
+ * @param mx         not used by this function
+ * @param my         selects the filter (index my - 1 into the filter table)
+ * @param mcbuffer   not used by this function
+ */
+static void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int width, int height, int mx, int my,
+                                  int16_t* mcbuffer)
+{
+    const int8_t   *taps      = ff_hevc_epel_filters[my-1];
+    /* EPEL_FILTER picks these four up by name. */
+    const int8_t    filter_0  = taps[0];
+    const int8_t    filter_1  = taps[1];
+    const int8_t    filter_2  = taps[2];
+    const int8_t    filter_3  = taps[3];
+    const ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel          *src       = (pixel*)_src;
+    int x, y;
+
+    for (y = 0; y < height; y++, src += srcstride, dst += dststride)
+        for (x = 0; x < width; x++)
+            dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
+}
+
+/**
+ * Separable two-pass 4-tap filter.
+ * Pass 1 filters rows with ff_hevc_epel_filters[mx - 1], starting
+ * EPEL_EXTRA_BEFORE rows above the block and covering height + EPEL_EXTRA
+ * rows, into a stack buffer with a row pitch of MAX_PB_SIZE.
+ * Pass 2 filters the columns of that buffer with
+ * ff_hevc_epel_filters[my - 1] and writes the sums, shifted right by 6,
+ * to dst.
+ *
+ * @param dst        output buffer of int16_t samples
+ * @param dststride  distance between output rows, in int16_t elements
+ * @param _src       input samples, accessed as pixel values
+ * @param _srcstride distance between input rows, in bytes
+ * @param width      number of samples produced per row
+ * @param height     number of rows produced
+ * @param mx         selects the row filter (table index mx - 1)
+ * @param my         selects the column filter (table index my - 1)
+ * @param mcbuffer   not used by this function
+ */
+static void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
+                                   uint8_t *_src, ptrdiff_t _srcstride,
+                                   int width, int height, int mx, int my,
+                                   int16_t* mcbuffer)
+{
+    int16_t tmp_array[(MAX_PB_SIZE + 3)*MAX_PB_SIZE];
+    int16_t        *tmp       = tmp_array;
+    const int8_t   *taps_h    = ff_hevc_epel_filters[mx-1];
+    const int8_t   *taps_v    = ff_hevc_epel_filters[my-1];
+    const ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel          *src       = (pixel*)_src;
+    /* EPEL_FILTER picks these four up by name; reloaded for each pass. */
+    int8_t filter_0, filter_1, filter_2, filter_3;
+    int x, y;
+
+    /* Pass 1: rows, including the extra rows the column filter needs. */
+    filter_0 = taps_h[0];
+    filter_1 = taps_h[1];
+    filter_2 = taps_h[2];
+    filter_3 = taps_h[3];
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+    for (y = 0; y < height + EPEL_EXTRA; y++, src += srcstride, tmp += MAX_PB_SIZE)
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+
+    /* Pass 2: columns of the intermediate buffer. */
+    filter_0 = taps_v[0];
+    filter_1 = taps_v[1];
+    filter_2 = taps_v[2];
+    filter_3 = taps_v[3];
+    tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    for (y = 0; y < height; y++, tmp += MAX_PB_SIZE, dst += dststride)
+        for (x = 0; x < width; x++)
+            dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
+}
+
+/**
+ * Convert one block of 16-bit intermediate samples back to pixels:
+ * add a rounding term, shift right by (14 - BIT_DEPTH) and clip to the
+ * pixel range.
+ *
+ * @param _dst       output picture data, written as pixel values
+ * @param _dststride distance between output rows, in bytes
+ * @param src        intermediate int16_t samples
+ * @param srcstride  distance between input rows, in int16_t elements
+ * @param width      number of samples per row
+ * @param height     number of rows
+ */
+static void FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
+                                      int16_t *src, ptrdiff_t srcstride,
+                                      int width, int height)
+{
+    const int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    const int rounding = 1 << (shift - 1);
+#else
+    const int rounding = 0;
+#endif
+    const ptrdiff_t dst_step = _dststride / sizeof(pixel);
+    pixel *dst_row = (pixel*)_dst;
+    int row, col;
+
+    for (row = 0; row < height; row++, dst_row += dst_step, src += srcstride)
+        for (col = 0; col < width; col++)
+            dst_row[col] = av_clip_pixel((src[col] + rounding) >> shift);
+}
+
+/**
+ * Average two blocks of 16-bit intermediate samples into pixels:
+ * sum both inputs, add a rounding term, shift right by
+ * (14 + 1 - BIT_DEPTH) and clip to the pixel range.
+ *
+ * @param _dst       output picture data, written as pixel values
+ * @param _dststride distance between output rows, in bytes
+ * @param src1       first block of intermediate int16_t samples
+ * @param src2       second block of intermediate int16_t samples
+ * @param srcstride  row distance of both inputs, in int16_t elements
+ * @param width      number of samples per row
+ * @param height     number of rows
+ */
+static void FUNC(put_weighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
+                                        int16_t *src1, int16_t *src2,
+                                        ptrdiff_t srcstride,
+                                        int width, int height)
+{
+    const int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    const int rounding = 1 << (shift - 1);
+#else
+    const int rounding = 0;
+#endif
+    const ptrdiff_t dst_step = _dststride / sizeof(pixel);
+    pixel *dst_row = (pixel*)_dst;
+    int row, col;
+
+    for (row = 0; row < height; row++) {
+        for (col = 0; col < width; col++)
+            dst_row[col] = av_clip_pixel((src1[col] + src2[col] + rounding) >> shift);
+        dst_row += dst_step;
+        src1    += srcstride;
+        src2    += srcstride;
+    }
+}
+
+/**
+ * Explicit weighted prediction from a single block of intermediate samples.
+ *
+ * With log2Wd = denom + (14 - BIT_DEPTH) each output sample is
+ *   clip(((src * w + (1 << (log2Wd - 1))) >> log2Wd) + o)   if log2Wd >= 1
+ *   clip(src * w + o)                                       otherwise
+ * where o is olxFlag scaled by 1 << (BIT_DEPTH - 8).
+ *
+ * @param denom      log2 of the weight denominator
+ * @param wlxFlag    weight applied to every sample
+ * @param olxFlag    offset, given in 8-bit units
+ * @param _dst       output picture data, written as pixel values
+ * @param _dststride distance between output rows, in bytes
+ * @param src        intermediate int16_t samples
+ * @param srcstride  distance between input rows, in int16_t elements
+ * @param width      number of samples per row
+ * @param height     number of rows
+ */
+static void FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
+                                uint8_t *_dst, ptrdiff_t _dststride,
+                                int16_t *src, ptrdiff_t srcstride,
+                                int width, int height)
+{
+    const int shift  = 14 - BIT_DEPTH;
+    const int log2Wd = denom + shift;
+    const int wx     = wlxFlag;
+    const int ox     = olxFlag * (1 << (BIT_DEPTH - 8));
+    pixel *dst = (pixel*)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int x, y;
+
+    /* The choice of formula does not depend on the sample, so it is made
+     * once, outside the loops. */
+    if (log2Wd >= 1) {
+        /* Only computed here: with log2Wd == 0 the shift count would be
+         * negative, which is undefined behaviour. */
+        const int offset = 1 << (log2Wd - 1);
+
+        for (y = 0; y < height; y++) {
+            for (x = 0; x < width; x++)
+                dst[x] = av_clip_pixel(((src[x] * wx + offset) >> log2Wd) + ox);
+            dst += dststride;
+            src += srcstride;
+        }
+    } else {
+        for (y = 0; y < height; y++) {
+            for (x = 0; x < width; x++)
+                dst[x] = av_clip_pixel(src[x] * wx + ox);
+            dst += dststride;
+            src += srcstride;
+        }
+    }
+}
+
+/**
+ * Explicit weighted bi-prediction from two blocks of intermediate samples.
+ *
+ * With log2Wd = denom + (14 - BIT_DEPTH) each output sample is
+ *   clip((src1 * w0 + src2 * w1 + (o0 + o1 + 1) * 2^log2Wd) >> (log2Wd + 1))
+ * where o0 / o1 are ol0Flag / ol1Flag scaled by 1 << (BIT_DEPTH - 8).
+ *
+ * @param denom      log2 of the weight denominator
+ * @param wl0Flag    weight applied to src1
+ * @param wl1Flag    weight applied to src2
+ * @param ol0Flag    offset for src1, given in 8-bit units
+ * @param ol1Flag    offset for src2, given in 8-bit units
+ * @param _dst       output picture data, written as pixel values
+ * @param _dststride distance between output rows, in bytes
+ * @param src1       first block of intermediate int16_t samples
+ * @param src2       second block of intermediate int16_t samples
+ * @param srcstride  row distance of both inputs, in int16_t elements
+ * @param width      number of samples per row
+ * @param height     number of rows
+ */
+static void FUNC(weighted_pred_avg)(uint8_t denom, int16_t wl0Flag, int16_t wl1Flag,
+                                    int16_t ol0Flag, int16_t ol1Flag,
+                                    uint8_t *_dst, ptrdiff_t _dststride,
+                                    int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
+                                    int width, int height)
+{
+    const int shift  = 14 - BIT_DEPTH;
+    const int log2Wd = denom + shift;
+    const int w0     = wl0Flag;
+    const int w1     = wl1Flag;
+    const int o0     = ol0Flag * (1 << (BIT_DEPTH - 8));
+    const int o1     = ol1Flag * (1 << (BIT_DEPTH - 8));
+    /* The offsets are signed, so the sum may be negative. Scale it by
+     * multiplication: left-shifting a negative value is undefined
+     * behaviour in C. The term is loop-invariant, so compute it once. */
+    const int offset = (o0 + o1 + 1) * (1 << log2Wd);
+    pixel *dst = (pixel*)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int x, y;
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel((src1[x] * w0 + src2[x] * w1 + offset) >>
+                                   (log2Wd + 1));
+        }
+        dst  += dststride;
+        src1 += srcstride;
+        src2 += srcstride;
+    }
+}
+
+// Sample accessors for the deblocking filters below. They rely on the local
+// variables pix, xstride and ystride (strides in pixel units).
+// pix points at Q0; xstride steps across the edge, ystride steps to the
+// next line along the edge.
+//
+// Line zero: P3..P0 are the four samples before pix (P0 nearest),
+// Q0..Q3 the four samples starting at pix.
+#define P3 pix[-4*xstride]
+#define P2 pix[-3*xstride]
+#define P1 pix[-2*xstride]
+#define P0 pix[-xstride]
+#define Q0 pix[0]
+#define Q1 pix[xstride]
+#define Q2 pix[2*xstride]
+#define Q3 pix[3*xstride]
+
+// Line three: the same samples, three lines (3 * ystride) further along the
+// edge. Used only for the deblocking decision, never written.
+#define TP3 pix[-4*xstride+3*ystride]
+#define TP2 pix[-3*xstride+3*ystride]
+#define TP1 pix[-2*xstride+3*ystride]
+#define TP0 pix[-xstride+3*ystride]
+#define TQ0 pix[3*ystride]
+#define TQ1 pix[xstride+3*ystride]
+#define TQ2 pix[2*xstride+3*ystride]
+#define TQ3 pix[3*xstride+3*ystride]
+
+/**
+ * Direction-independent deblocking filter for an edge of 8 lines, handled
+ * as two groups of 4 lines. Each group j uses its own _beta[j], _tc[j],
+ * _no_p[j] and _no_q[j].
+ *
+ * For every group the filter decision is taken from lines 0 and 3 only;
+ * the group is then either left untouched, strongly filtered (up to three
+ * samples per side) or normally filtered (one or two samples per side).
+ *
+ * @param _pix     pointer to the first sample on the Q side of the edge
+ * @param _xstride distance between samples across the edge, in bytes
+ * @param _ystride distance between consecutive lines along the edge, in bytes
+ * @param _beta    two beta thresholds (8-bit scale, one per group)
+ * @param _tc      two tc clipping values (8-bit scale, one per group)
+ * @param _no_p    two flags; non-zero leaves the P side of the group unmodified
+ * @param _no_q    two flags; non-zero leaves the Q side of the group unmodified
+ */
+static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, ptrdiff_t _xstride,
+                                        ptrdiff_t _ystride, int *_beta, int *_tc,
+                                        uint8_t *_no_p, uint8_t *_no_q)
+{
+    int d, j;
+    pixel *pix = (pixel*)_pix;
+    ptrdiff_t xstride = _xstride / sizeof(pixel);
+    ptrdiff_t ystride = _ystride / sizeof(pixel);
+
+    for (j = 0; j < 2; j++) {
+        // second-derivative activity on each side, for lines 0 and 3
+        const int dp0  = abs(P2 - 2 * P1 +  P0);
+        const int dq0  = abs(Q2 - 2 * Q1 +  Q0);
+        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
+        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
+        const int d0   = dp0 + dq0;
+        const int d3   = dp3 + dq3;
+        // thresholds are supplied on an 8-bit scale; adapt to the bit depth
+        int       beta = _beta[j] << (BIT_DEPTH - 8);
+        const int tc   = _tc[j] << (BIT_DEPTH - 8);
+        const int no_p = _no_p[j];
+        const int no_q = _no_q[j];
+
+        if (d0 + d3 >= beta /*|| tc <= 0*/) {
+            // too much activity: skip this group of four lines
+            pix += 4 * ystride;
+            continue;
+        } else {
+            const int beta_3 = beta >> 3;
+            const int beta_2 = beta >> 2;
+            const int tc25 = ((tc * 5 + 1) >> 1);
+
+            // strong filter only if lines 0 and 3 are both flat on each side
+            // and the step across the edge is small
+            if (abs( P3 -  P0) + abs( Q3 -  Q0) < beta_3 && abs( P0 -  Q0) < tc25 &&
+                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
+                // strong filtering
+                const int tc2 = tc << 1;
+                for (d = 0; d < 4; d++) {
+                    // read all inputs first: the stores below overwrite them
+                    const int p3 = P3;
+                    const int p2 = P2;
+                    const int p1 = P1;
+                    const int p0 = P0;
+                    const int q0 = Q0;
+                    const int q1 = Q1;
+                    const int q2 = Q2;
+                    const int q3 = Q3;
+                    // each change is limited to +-2*tc
+                    if (!no_p) {
+                        P0 = p0 + av_clip((( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3) - p0, -tc2, tc2);
+                        P1 = p1 + av_clip((( p2 + p1 + p0 + q0 + 2 ) >> 2) - p1, -tc2, tc2);
+                        P2 = p2 + av_clip((( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3) - p2, -tc2, tc2);
+                    }
+                    if (!no_q) {
+                        Q0 = q0 + av_clip((( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3) - q0, -tc2, tc2);
+                        Q1 = q1 + av_clip((( p0 + q0 + q1 + q2 + 2 ) >> 2) - q1, -tc2, tc2);
+                        Q2 = q2 + av_clip((( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3) - q2, -tc2, tc2);
+                    }
+                    pix += ystride;
+                }
+            } else { // normal filtering
+                // nd_p / nd_q: number of samples modified on each side (1 or 2)
+                int nd_p = 1;
+                int nd_q = 1;
+                const int tc_2 = tc >> 1;
+                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+                    nd_p = 2;
+                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+                    nd_q = 2;
+
+                for (d = 0; d < 4; d++) {
+                    const int p2 = P2;
+                    const int p1 = P1;
+                    const int p0 = P0;
+                    const int q0 = Q0;
+                    const int q1 = Q1;
+                    const int q2 = Q2;
+                    int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+                    // a large delta leaves this line unmodified
+                    if (abs(delta0) < 10 * tc) {
+                        delta0 = av_clip(delta0, -tc, tc);
+                        if (!no_p)
+                            P0 = av_clip_pixel(p0 + delta0);
+                        if (!no_q)
+                            Q0 = av_clip_pixel(q0 - delta0);
+                        if (!no_p && nd_p > 1) {
+                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
+                            P1 = av_clip_pixel(p1 + deltap1);
+                        }
+                        if (!no_q && nd_q > 1) {
+                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+                            Q1 = av_clip_pixel(q1 + deltaq1);
+                        }
+                    }
+                    pix += ystride;
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Direction-independent deblocking filter for an edge of 8 lines, handled
+ * as two groups of 4 lines. Only the sample nearest to the edge on each
+ * side (P0 / Q0) is modified. A group whose tc is not positive is skipped.
+ *
+ * @param _pix     pointer to the first sample on the Q side of the edge
+ * @param _xstride distance between samples across the edge, in bytes
+ * @param _ystride distance between consecutive lines along the edge, in bytes
+ * @param _tc      two tc clipping values (8-bit scale, one per group)
+ * @param _no_p    two flags; non-zero leaves the P side of the group unmodified
+ * @param _no_q    two flags; non-zero leaves the Q side of the group unmodified
+ */
+static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
+                                          ptrdiff_t _ystride, int *_tc,
+                                          uint8_t *_no_p, uint8_t *_no_q)
+{
+    int d, j;
+    pixel *pix = (pixel*)_pix;
+    ptrdiff_t xstride = _xstride / sizeof(pixel);
+    ptrdiff_t ystride = _ystride / sizeof(pixel);
+
+    for (j = 0; j < 2; j++) {
+        // tc is supplied on an 8-bit scale; adapt it to the bit depth
+        const int tc = _tc[j] << (BIT_DEPTH - 8);
+        int no_p, no_q;
+
+        if (tc <= 0) {
+            pix += 4 * ystride;
+            continue;
+        }
+        no_p = _no_p[j];
+        no_q = _no_q[j];
+
+        for (d = 0; d < 4; d++) {
+            const int p1 = P1;
+            const int p0 = P0;
+            const int q0 = Q0;
+            const int q1 = Q1;
+            // q0 - p0 may be negative: multiply instead of left-shifting,
+            // since shifting a negative value left is undefined behaviour
+            const int delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+
+            if (!no_p)
+                P0 = av_clip_pixel(p0 + delta0);
+            if (!no_q)
+                Q0 = av_clip_pixel(q0 - delta0);
+            pix += ystride;
+        }
+    }
+}
+
+/**
+ * Chroma deblocking wrapper: calls the generic filter with `stride` as the
+ * distance across the edge and one pixel as the distance between
+ * consecutive filtered lines.
+ */
+static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+                                            int *tc, uint8_t *no_p, uint8_t *no_q)
+{
+    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
+}
+
+/**
+ * Chroma deblocking wrapper: calls the generic filter with one pixel as the
+ * distance across the edge and `stride` as the distance between
+ * consecutive filtered lines.
+ */
+static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+                                            int *tc, uint8_t *no_p, uint8_t *no_q)
+{
+    FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
+}
+
+/**
+ * Luma deblocking wrapper: calls the generic filter with `stride` as the
+ * distance across the edge and one pixel as the distance between
+ * consecutive filtered lines.
+ */
+static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+                                          int *beta, int *tc, uint8_t *no_p,
+                                          uint8_t *no_q)
+{
+    FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), beta, tc, no_p, no_q);
+}
+
+/**
+ * Luma deblocking wrapper: calls the generic filter with one pixel as the
+ * distance across the edge and `stride` as the distance between
+ * consecutive filtered lines.
+ */
+static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+                                          int *beta, int *tc, uint8_t *no_p,
+                                          uint8_t *no_q)
+{
+    FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, beta, tc, no_p, no_q);
+}
+
+#undef P3
+#undef P2
+#undef P1
+#undef P0
+#undef Q0
+#undef Q1
+#undef Q2
+#undef Q3
+
+#undef TP3
+#undef TP2
+#undef TP1
+#undef TP0
+#undef TQ0
+#undef TQ1
+#undef TQ2
+#undef TQ3
diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c
new file mode 100644
index 0000000000..b6b3906406
--- /dev/null
+++ b/libavcodec/hevcpred.c
@@ -0,0 +1,66 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "hevc.h"
+#include "hevcpred.h"
+
+#define BIT_DEPTH 8
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 9
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 10
+#include "hevcpred_template.c"
+#undef BIT_DEPTH
+
+/**
+ * Fill an HEVCPredContext with the intra prediction functions generated
+ * from the template for the requested bit depth.
+ *
+ * @param hpc       context whose function pointers are set
+ * @param bit_depth 9 or 10 select their dedicated functions; any other
+ *                  value selects the 8-bit functions
+ */
+void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
+{
+#undef FUNC
+#define FUNC(a, depth) a ## _ ## depth
+
+/* Assigns every function pointer of *hpc to its <name>_<depth> variant. */
+#define HEVC_PRED(depth)                                    \
+    hpc->intra_pred      = FUNC(intra_pred, depth);         \
+    hpc->pred_dc         = FUNC(pred_dc, depth);            \
+    hpc->pred_planar[0]  = FUNC(pred_planar_0, depth);      \
+    hpc->pred_planar[1]  = FUNC(pred_planar_1, depth);      \
+    hpc->pred_planar[2]  = FUNC(pred_planar_2, depth);      \
+    hpc->pred_planar[3]  = FUNC(pred_planar_3, depth);      \
+    hpc->pred_angular[0] = FUNC(pred_angular_0, depth);     \
+    hpc->pred_angular[1] = FUNC(pred_angular_1, depth);     \
+    hpc->pred_angular[2] = FUNC(pred_angular_2, depth);     \
+    hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
+
+    switch (bit_depth) {
+    case 10:
+        HEVC_PRED(10);
+        break;
+    case 9:
+        HEVC_PRED(9);
+        break;
+    default:
+        /* 8 bit, and the fallback for every unlisted depth */
+        HEVC_PRED(8);
+        break;
+    }
+}
diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
new file mode 100644
index 0000000000..fe7cbb8aaf
--- /dev/null
+++ b/libavcodec/hevcpred.h
@@ -0,0 +1,40 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HEVCPRED_H
+#define AVCODEC_HEVCPRED_H
+
+/* ptrdiff_t and uint8_t are used below; include their headers so that this
+ * header can be included on its own. */
+#include <stddef.h>
+#include <stdint.h>
+
+struct HEVCContext;
+
+/**
+ * Table of intra prediction functions, filled by ff_hevc_pred_init() for
+ * one bit depth.
+ */
+typedef struct HEVCPredContext {
+    /** Intra prediction entry point for the block at (x0, y0) in plane c_idx. */
+    void (*intra_pred)(struct HEVCContext *s, int x0, int y0, int log2_size, int c_idx);
+
+    /* NOTE(review): the four planar / angular entries presumably correspond
+     * to the four supported block sizes - confirm against the template. */
+    void(*pred_planar[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
+    void(*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride,
+                   int log2_size, int c_idx);
+    void(*pred_angular[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride,
+                         int c_idx, int mode);
+} HEVCPredContext;
+
+/**
+ * Set all function pointers of hpc to the implementations for bit_depth.
+ */
+void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
+
+#endif /* AVCODEC_HEVCPRED_H */
diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
new file mode 100644
index 0000000000..4b677b7ced
--- /dev/null
+++ b/libavcodec/hevcpred_template.c
@@ -0,0 +1,533 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/pixdesc.h"
+#include "bit_depth_template.c"
+#include "hevcpred.h"
+
+/* Sample at column x, row y of the block being written. Expands to an
+ * lvalue and relies on locals named `src` (pointer to the block's top-left
+ * sample) and `stride` (row pitch, counted in pixels) at the point of use. */
+#define POS(x, y) src[(x) + stride * (y)]
+
+/**
+ * Intra-predict one square block of plane c_idx directly into s->frame.
+ *
+ * The function gathers the reference samples to the left of and above the
+ * block into the local arrays left[] / top[] (index -1 is the top-left
+ * corner, indices 0 .. 2 * size - 1 run down / to the right), substitutes
+ * samples that are not available, optionally smooths them for luma, and
+ * finally dispatches to the planar, DC or angular routine in s->hpc.
+ *
+ * @param s         decoder context
+ * @param x0        horizontal block position in luma sample units
+ * @param y0        vertical block position in luma sample units
+ * @param log2_size log2 of the block size, in samples of plane c_idx
+ * @param c_idx     plane index; 0 selects the luma mode and enables the
+ *                  reference sample filter
+ */
+static void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, int log2_size, int c_idx)
+{
+    /* Helper macros local to this function (all of them are #undef'd once
+     * the reference arrays are built). They rely on the locals s, x0, y0,
+     * hshift, vshift and pic_width_in_min_pu, and the EXTEND_* family uses
+     * the local loop counter i. */
+    // Convert a luma coordinate to minimum-PU units.
+#define PU(x) \
+    ((x) >> s->sps->log2_min_pu_size)
+    // Motion vector field entry of the PU at PU coordinates (x, y).
+#define MVF(x, y) \
+    (s->ref->tab_mvf[(x) + (y) * pic_width_in_min_pu])
+    // Same, but (x, y) is a sample offset relative to the block's top-left
+    // sample in plane c_idx, scaled to luma by the chroma shifts.
+    // NOTE(review): callers pass x or y == -1, so this left-shifts a
+    // negative value, which ISO C leaves undefined.
+#define MVF_PU(x, y) \
+    MVF(PU(x0 + ((x) << hshift)), PU(y0 + ((y) << vshift)))
+    // True when the neighbour at (x, y) may be used as a reference: always
+    // when constrained intra prediction is off, otherwise only if its PU
+    // is intra coded.
+#define IS_INTRA(x, y) \
+    (MVF_PU(x, y).is_intra || !s->pps->constrained_intra_pred_flag)
+    // Z-scan order address of the minimum transform block at (x, y).
+#define MIN_TB_ADDR_ZS(x, y) \
+    s->pps->min_tb_addr_zs[(y) * s->sps->min_tb_width + (x)]
+    // Propagate ptr[start] towards lower indices over `length` entries.
+#define EXTEND_LEFT(ptr, start, length) \
+        for (i = (start); i > (start) - (length); i--) \
+            ptr[i - 1] = ptr[i]
+    // Propagate ptr[start - 1] towards higher indices over `length` entries.
+#define EXTEND_RIGHT(ptr, start, length) \
+        for (i = (start); i < (start) + (length); i++) \
+            ptr[i] = ptr[i - 1]
+#define EXTEND_UP(ptr, start, length)   EXTEND_LEFT(ptr, start, length)
+#define EXTEND_DOWN(ptr, start, length) EXTEND_RIGHT(ptr, start, length)
+    // _CIP variants: same propagation, but an entry is only overwritten
+    // when IS_INTRA() is false for the neighbour it corresponds to.
+#define EXTEND_LEFT_CIP(ptr, start, length) \
+        for (i = (start); i > (start) - (length); i--) \
+            if (!IS_INTRA(i - 1, -1)) \
+                ptr[i - 1] = ptr[i]
+#define EXTEND_RIGHT_CIP(ptr, start, length) \
+        for (i = (start); i < (start) + (length); i++) \
+            if (!IS_INTRA(i, -1)) \
+                ptr[i] = ptr[i - 1]
+#define EXTEND_UP_CIP(ptr, start, length) \
+        for (i = (start); i > (start) - (length); i--) \
+            if (!IS_INTRA(-1, i - 1)) \
+                ptr[i - 1] = ptr[i]
+#define EXTEND_DOWN_CIP(ptr, start, length) \
+        for (i = (start); i < (start) + (length); i++) \
+            if (!IS_INTRA(-1, i)) \
+            ptr[i] = ptr[i - 1]
+    HEVCLocalContext *lc = &s->HEVClc;
+    int i;
+    int hshift = s->sps->hshift[c_idx];
+    int vshift = s->sps->vshift[c_idx];
+    int size = (1 << log2_size);
+    // Block size expressed in luma samples and in minimum transform blocks.
+    int size_in_luma = size << hshift;
+    int size_in_tbs = size_in_luma >> s->sps->log2_min_transform_block_size;
+    // Block position inside plane c_idx.
+    int x = x0 >> hshift;
+    int y = y0 >> vshift;
+    int x_tb = x0 >> s->sps->log2_min_transform_block_size;
+    int y_tb = y0 >> s->sps->log2_min_transform_block_size;
+    int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
+
+    // stride is counted in pixels, not bytes; src is the block's top-left sample.
+    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+    pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride;
+
+    int pic_width_in_min_pu = PU(s->sps->width);
+
+    enum IntraPredMode mode = c_idx ? lc->pu.intra_pred_mode_c :
+                              lc->tu.cur_intra_pred_mode;
+
+    // Each array holds 2 * MAX_TB_SIZE edge samples plus the corner sample,
+    // which is addressed as index -1 through the pointers below.
+    pixel left_array[2 * MAX_TB_SIZE + 1];
+    pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
+    pixel top_array[2 * MAX_TB_SIZE + 1];
+    pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
+
+    pixel *left          = left_array + 1;
+    pixel *top           = top_array + 1;
+    pixel *filtered_left = filtered_left_array + 1;
+    pixel *filtered_top  = filtered_top_array + 1;
+
+    // Availability of the five neighbour groups. Bottom-left and up-right
+    // additionally require the neighbouring transform block to have a lower
+    // z-scan address than the current one.
+    int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS(x_tb - 1, y_tb + size_in_tbs);
+    int cand_left        = lc->na.cand_left;
+    int cand_up_left     = lc->na.cand_up_left;
+    int cand_up          = lc->na.cand_up;
+    int cand_up_right    = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS(x_tb + size_in_tbs, y_tb - 1);
+
+    // Number of bottom-left / top-right samples (in plane units) that lie
+    // inside the picture; the extent is clipped to sps->height / sps->width.
+    int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma, s->sps->height) -
+                            (y0 + size_in_luma)) >> vshift;
+    int top_right_size   = (FFMIN(x0 + 2 * size_in_luma, s->sps->width) -
+                            (x0 + size_in_luma)) >> hshift;
+
+    // With constrained intra prediction a neighbour group only counts as
+    // available if at least one of the PUs covering it is intra coded.
+    if (s->pps->constrained_intra_pred_flag == 1) {
+        int size_in_luma_pu = PU(size_in_luma);
+        int on_pu_edge_x = !(x0 & ((1 << s->sps->log2_min_pu_size) - 1));
+        int on_pu_edge_y = !(y0 & ((1 << s->sps->log2_min_pu_size) - 1));
+        // Blocks smaller than a minimum PU still need one PU to be inspected.
+        if(!size_in_luma_pu)
+            size_in_luma_pu++;
+        if (cand_bottom_left == 1 && on_pu_edge_x) {
+            int x_left_pu   = PU(x0 - 1);
+            int y_bottom_pu = PU(y0 + size_in_luma);
+            cand_bottom_left = 0;
+            for(i = 0; i < size_in_luma_pu; i++)
+                cand_bottom_left |= MVF(x_left_pu, y_bottom_pu + i).is_intra;
+        }
+        if (cand_left == 1 && on_pu_edge_x) {
+            int x_left_pu   = PU(x0 - 1);
+            int y_left_pu   = PU(y0);
+            cand_left = 0;
+            for(i = 0; i < size_in_luma_pu; i++)
+                cand_left |= MVF(x_left_pu, y_left_pu + i).is_intra;
+        }
+        if (cand_up_left == 1) {
+            int x_left_pu   = PU(x0 - 1);
+            int y_top_pu    = PU(y0 - 1);
+            cand_up_left = MVF(x_left_pu, y_top_pu).is_intra;
+        }
+        if (cand_up == 1 && on_pu_edge_y) {
+            int x_top_pu    = PU(x0);
+            int y_top_pu    = PU(y0 - 1);
+            cand_up = 0;
+            for(i = 0; i < size_in_luma_pu; i++)
+                cand_up |= MVF(x_top_pu + i, y_top_pu).is_intra;
+        }
+        if (cand_up_right == 1 && on_pu_edge_y) {
+            int y_top_pu    = PU(y0 - 1);
+            int x_right_pu  = PU(x0 + size_in_luma);
+            cand_up_right = 0;
+            for(i = 0; i < size_in_luma_pu; i++)
+                cand_up_right |= MVF(x_right_pu + i, y_top_pu).is_intra;
+        }
+        // Pre-fill both edges so that entries skipped by the IS_INTRA()
+        // checks below hold a defined value.
+        // NOTE(review): the corner entries left[-1] / top[-1] are not
+        // covered by this loop, and the constant 128 does not depend on
+        // BIT_DEPTH -- confirm both are intended.
+        for (i = 0; i < 2 * MAX_TB_SIZE; i++) {
+            left[i] = 128;
+            top[i]  = 128;
+        }
+    }
+    // Copy the available neighbouring samples out of the frame. For the
+    // bottom-left and top-right groups the part beyond the picture border is
+    // filled with the last sample that is still inside the picture.
+    if (cand_bottom_left) {
+        for (i = size + bottom_left_size; i < (size << 1); i++)
+            if (IS_INTRA(-1, size + bottom_left_size - 1))
+                left[i] = POS(-1, size + bottom_left_size - 1);
+        for (i = size + bottom_left_size - 1; i >= size; i--)
+            if (IS_INTRA(-1, i))
+                left[i] = POS(-1, i);
+    }
+    if (cand_left)
+        for (i = size - 1; i >= 0; i--)
+            if (IS_INTRA(-1, i))
+                left[i] = POS(-1, i);
+    if (cand_up_left)
+        if (IS_INTRA(-1, -1)) {
+            left[-1] = POS(-1, -1);
+            top[-1]  = left[-1];
+        }
+    if (cand_up)
+        for (i = size - 1; i >= 0; i--)
+            if (IS_INTRA(i, -1))
+                top[i] = POS(i, -1);
+    if (cand_up_right) {
+        for (i = size + top_right_size; i < (size << 1); i++)
+            if (IS_INTRA(size + top_right_size - 1, -1))
+                top[i] = POS(size + top_right_size - 1, -1);
+        for (i = size + top_right_size - 1; i >= size; i--)
+            if (IS_INTRA(i, -1))
+                top[i] = POS(i, -1);
+    }
+
+    // Constrained intra prediction: entries belonging to non-intra PUs were
+    // skipped above; fill them by propagating neighbouring intra samples.
+    // The statements below depend on each other through j and the contents
+    // of left[] / top[], so their order matters.
+    if (s->pps->constrained_intra_pred_flag == 1) {
+        if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) {
+            // Number of top / left entries that lie inside the picture.
+            int size_max_x = x0 + ((2 * size) << hshift) < s->sps->width ?
+                                    2 * size : (s->sps->width - x0) >> hshift;
+            int size_max_y = y0 + ((2 * size) << vshift) < s->sps->height ?
+                                    2 * size : (s->sps->height - y0) >> vshift;
+            int j = size + (cand_bottom_left? bottom_left_size: 0) -1;
+            if (cand_bottom_left || cand_left || cand_up_left) {
+                // Search upwards from the bottom of the left edge for the
+                // first intra sample; j == -1 addresses the corner.
+                while (j>-1 && !IS_INTRA(-1, j)) j--;
+                if (!IS_INTRA(-1, j)) {
+                    // Nothing usable on the left: take the first intra
+                    // sample of the top edge and propagate it to the corner.
+                    j = 0;
+                    while(j < size_max_x && !IS_INTRA(j, -1)) j++;
+                    EXTEND_LEFT_CIP(top, j, j+1);
+                    left[-1] = top[-1];
+                    j = 0;
+                }
+            } else {
+                j = 0;
+                while (j < size_max_x && !IS_INTRA(j, -1)) j++;
+                if (j > 0)
+                    EXTEND_LEFT_CIP(top, j, j+1);
+                left[-1] = top[-1];
+                j = 0;
+            }
+            if (cand_bottom_left || cand_left) {
+                EXTEND_DOWN_CIP(left, j, size_max_y-j);
+            }
+            if (!cand_left) {
+                EXTEND_DOWN(left, 0, size);
+            }
+            if (!cand_bottom_left) {
+                EXTEND_DOWN(left, size, size);
+            }
+            if (x0 != 0 && y0 != 0) {
+                EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
+            } else {
+                EXTEND_UP_CIP(left, size_max_y - 1, size_max_y-1);
+            }
+            top[-1] = left[-1];
+            if (y0 != 0) {
+                EXTEND_RIGHT_CIP(top, 0, size_max_x);
+            }
+        }
+    }
+    // Infer the unavailable samples
+    // The bottom-left group is handled first: it is filled from the nearest
+    // available group, walking left -> up-left -> up -> up-right.
+    if (!cand_bottom_left) {
+        if (cand_left) {
+            EXTEND_DOWN(left, size, size);
+        } else if (cand_up_left) {
+            EXTEND_DOWN(left, 0, 2 * size);
+            cand_left = 1;
+        } else if (cand_up) {
+            left[-1] = top[0];
+            EXTEND_DOWN(left, 0, 2 * size);
+            cand_up_left = 1;
+            cand_left = 1;
+        } else if (cand_up_right) {
+            EXTEND_LEFT(top, size, size);
+            left[-1] = top[0];
+            EXTEND_DOWN(left ,0 , 2 * size);
+            cand_up = 1;
+            cand_up_left = 1;
+            cand_left = 1;
+        } else { // No samples available
+            // Everything is set to the mid-range value for this bit depth.
+            top[0] = left[-1] = (1 << (BIT_DEPTH - 1));
+            EXTEND_RIGHT(top, 1, 2 * size - 1);
+            EXTEND_DOWN(left, 0, 2 * size);
+        }
+    }
+
+    // Remaining gaps are filled from the preceding group, going from the
+    // bottom of the left edge up to the corner and then along the top edge.
+    if (!cand_left) {
+        EXTEND_UP(left, size, size);
+    }
+    if (!cand_up_left) {
+        left[-1] = left[0];
+    }
+    if (!cand_up) {
+        top[0] = left[-1];
+        EXTEND_RIGHT(top, 1, size-1);
+    }
+    if (!cand_up_right) {
+        EXTEND_RIGHT(top, size, size);
+    }
+
+    top[-1] = left[-1];
+
+#undef EXTEND_LEFT_CIP
+#undef EXTEND_RIGHT_CIP
+#undef EXTEND_UP_CIP
+#undef EXTEND_DOWN_CIP
+#undef IS_INTRA
+#undef MVF_PU
+#undef MVF
+#undef PU
+#undef EXTEND_LEFT
+#undef EXTEND_RIGHT
+#undef EXTEND_UP
+#undef EXTEND_DOWN
+#undef MIN_TB_ADDR_ZS
+
+    // Filtering process
+    // Only luma blocks larger than 4x4 with a non-DC mode are filtered, and
+    // only if the mode is far enough from pure horizontal (10) and pure
+    // vertical (26).
+    if (c_idx == 0 && mode != INTRA_DC && size != 4) {
+        // Distance thresholds for 8x8, 16x16 and 32x32 (index log2_size - 3).
+        int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
+        int min_dist_vert_hor           = FFMIN(FFABS((int)mode - 26),
+                                                FFABS((int)mode - 10));
+        if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
+            int threshold = 1 << (BIT_DEPTH - 5);
+            // Strong smoothing: for 32x32 blocks whose edges are nearly
+            // linear, replace each edge by a linear interpolation between
+            // the corner sample and the last sample (index 63).
+            if (s->sps->sps_strong_intra_smoothing_enable_flag &&
+                log2_size == 5 &&
+                FFABS(top[-1] + top[63] - 2 * top[31]) < threshold &&
+                FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
+                // We can't just overwrite values in top because it could be
+                // a pointer into src
+                // NOTE(review): in this function top only ever points into
+                // top_array or filtered_top_array; the remark above may be
+                // left over from an earlier version. left is rewritten in
+                // place, which only reads left[-1] and left[63].
+                filtered_top[-1] = top[-1];
+                filtered_top[63] = top[63];
+                for (i = 0; i < 63; i++)
+                    filtered_top[i] = ((64 - (i + 1)) * top[-1] +
+                                             (i + 1)  * top[63] + 32) >> 6;
+                for (i = 0; i < 63; i++)
+                    left[i] = ((64 - (i + 1)) * left[-1] +
+                                     (i + 1)  * left[63] + 32) >> 6;
+                top = filtered_top;
+            } else {
+                // Regular smoothing: [1 2 1] / 4 filter along both edges;
+                // the two end samples are copied unfiltered and the corner
+                // is filtered with left[0] and top[0] as its neighbours.
+                filtered_left[2 * size - 1] = left[2 * size - 1];
+                filtered_top[2 * size - 1]  = top[2 * size - 1];
+                for (i = 2 * size - 2; i >= 0; i--)
+                    filtered_left[i] = (left[i + 1] + 2 * left[i] +
+                                        left[i - 1] + 2) >> 2;
+                filtered_top[-1] =
+                filtered_left[-1] = (left[0] + 2 * left[-1] +
+                                     top[0]  + 2) >> 2;
+                for (i = 2 * size - 2; i >= 0; i--)
+                    filtered_top[i] = (top[i + 1] + 2 * top[i] +
+                                       top[i - 1] + 2) >> 2;
+                left = filtered_left;
+                top  = filtered_top;
+            }
+        }
+    }
+
+    // Dispatch to the prediction routine; the per-size tables are indexed
+    // by log2_size - 2.
+    switch (mode) {
+    case INTRA_PLANAR:
+        s->hpc.pred_planar[log2_size - 2]((uint8_t*)src, (uint8_t*)top,
+                                          (uint8_t*)left, stride);
+        break;
+    case INTRA_DC:
+        s->hpc.pred_dc((uint8_t*)src, (uint8_t*)top,
+                       (uint8_t*)left, stride, log2_size, c_idx);
+        break;
+    default:
+        s->hpc.pred_angular[log2_size - 2]((uint8_t*)src, (uint8_t*)top,
+                                           (uint8_t*)left, stride, c_idx, mode);
+        break;
+    }
+}
+
+/**
+ * Planar intra prediction for a square block of (1 << log2_size) samples
+ * per side.
+ *
+ * Every output sample is the rounded average of a horizontal interpolation
+ * between left[y] and the first top-right sample top[size], and a vertical
+ * interpolation between top[x] and the first bottom-left sample left[size].
+ *
+ * This is a template: it is always inlined into the fixed-size entry points
+ * below, so log2_size is a compile-time constant in each of them (the same
+ * scheme pred_angular uses).
+ *
+ * @param _src      top-left sample of the destination block
+ * @param _top      reference samples above the block; entries 0 .. size are read
+ * @param _left     reference samples left of the block; entries 0 .. size are read
+ * @param stride    destination row pitch, counted in pixels
+ * @param log2_size log2 of the block size
+ */
+static av_always_inline void FUNC(pred_planar)(uint8_t *_src,
+                                               const uint8_t *_top,
+                                               const uint8_t *_left,
+                                               ptrdiff_t stride, int log2_size)
+{
+    int x, y;
+    pixel *src        = (pixel*)_src;
+    const pixel *top  = (const pixel*)_top;
+    const pixel *left = (const pixel*)_left;
+    const int size    = 1 << log2_size;
+
+    for (y = 0; y < size; y++)
+        for (x = 0; x < size; x++)
+            POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size]  +
+                         (size - 1 - y) * top[x]  + (y + 1) * left[size] +
+                         size) >> (log2_size + 1);
+}
+
+/* Planar prediction, 4x4 blocks. */
+static void FUNC(pred_planar_0)(uint8_t *_src, const uint8_t *_top,
+                                const uint8_t *_left,
+                                ptrdiff_t stride)
+{
+    FUNC(pred_planar)(_src, _top, _left, stride, 2);
+}
+
+/* Planar prediction, 8x8 blocks. */
+static void FUNC(pred_planar_1)(uint8_t *_src, const uint8_t *_top,
+                                const uint8_t *_left, ptrdiff_t stride)
+{
+    FUNC(pred_planar)(_src, _top, _left, stride, 3);
+}
+
+/* Planar prediction, 16x16 blocks. */
+static void FUNC(pred_planar_2)(uint8_t *_src, const uint8_t *_top,
+                                const uint8_t *_left, ptrdiff_t stride)
+{
+    FUNC(pred_planar)(_src, _top, _left, stride, 4);
+}
+
+/* Planar prediction, 32x32 blocks. */
+static void FUNC(pred_planar_3)(uint8_t *_src, const uint8_t *_top,
+                                const uint8_t *_left, ptrdiff_t stride)
+{
+    FUNC(pred_planar)(_src, _top, _left, stride, 5);
+}
+
+/**
+ * DC intra prediction.
+ *
+ * The block is filled with the rounded mean of the `size` samples above and
+ * the `size` samples to the left of it. For luma blocks smaller than 32x32
+ * the first row and first column are afterwards blended with the adjacent
+ * reference samples.
+ *
+ * @param _src      top-left sample of the destination block
+ * @param _top      reference samples above the block
+ * @param _left     reference samples left of the block
+ * @param stride    destination row pitch, counted in pixels
+ * @param log2_size log2 of the block size
+ * @param c_idx     plane index; edge blending is only done for plane 0
+ */
+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+                          const uint8_t *_left,
+                          ptrdiff_t stride, int log2_size, int c_idx)
+{
+    const int size    = 1 << log2_size;
+    pixel *src        = (pixel*)_src;
+    const pixel *top  = (const pixel*)_top;
+    const pixel *left = (const pixel*)_left;
+    int sum           = size; /* rounding offset for the division by 2 * size */
+    int row, group, k;
+    int dc;
+    pixel4 fill;
+
+    for (k = 0; k < size; k++)
+        sum += left[k] + top[k];
+    dc   = sum >> (log2_size + 1);
+    fill = PIXEL_SPLAT_X4(dc);
+
+    /* Flat fill, four pixels per store. */
+    for (row = 0; row < size; row++) {
+        pixel *line = src + stride * row;
+
+        for (group = 0; group < size / 4; group++)
+            AV_WN4PA(line + group * 4, fill);
+    }
+
+    /* Chroma and 32x32 blocks keep the flat fill. */
+    if (c_idx != 0 || size >= 32)
+        return;
+
+    /* The corner gets a 1:2:1 blend, the remaining edge samples 1:3. */
+    POS(0, 0) = (left[0] + 2 * dc  + top[0] + 2) >> 2;
+    for (k = 1; k < size; k++)
+        POS(k, 0) = (top[k] + 3 * dc + 2) >> 2;
+    for (k = 1; k < size; k++)
+        POS(0, k) = (left[k] + 3 * dc + 2) >> 2;
+}
+
+/**
+ * Generic angular intra prediction for a size x size block; always inlined
+ * into the fixed-size entry points.
+ *
+ * Modes 18 and above predict from the top edge row by row, lower modes
+ * predict from the left edge column by column. When the angle is negative
+ * the reference line is extended to negative indices with samples projected
+ * from the other edge.
+ *
+ * @param _src   top-left sample of the destination block
+ * @param _top   reference samples above the block (index -1 is the corner)
+ * @param _left  reference samples left of the block (index -1 is the corner)
+ * @param stride destination row pitch, counted in pixels
+ * @param c_idx  plane index; edge filtering is only done for plane 0
+ * @param mode   intra prediction mode, used as index mode - 2 into a
+ *               33-entry angle table
+ * @param size   block size in samples
+ */
+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+                                                const uint8_t *_top,
+                                                const uint8_t *_left,
+                                                ptrdiff_t stride, int c_idx,
+                                                int mode, int size)
+{
+    /* Displacement per row/column in 1/32 sample units, indexed by mode - 2. */
+    static const int intra_pred_angle[] = {
+        32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
+        -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
+    };
+    /* Projection factors for the negative angles, indexed by mode - 11. */
+    static const int inv_angle[] = {
+        -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
+        -630, -910, -1638, -4096
+    };
+
+    pixel *src        = (pixel*)_src;
+    const pixel *top  = (const pixel*)_top;
+    const pixel *left = (const pixel*)_left;
+
+    /* The edge the prediction runs along, and the one it is extended from. */
+    const int is_vertical  = mode >= 18;
+    const pixel *main_edge = is_vertical ? top  : left;
+    const pixel *side_edge = is_vertical ? left : top;
+
+    const int angle = intra_pred_angle[mode - 2];
+    const int last  = (size * angle) >> 5;
+    pixel ref_array[3 * MAX_TB_SIZE + 1];
+    pixel *ref_tmp   = ref_array + size;
+    const pixel *ref = main_edge - 1;
+    int x, y, k;
+
+    if (angle < 0 && last < -1) {
+        const int inv = inv_angle[mode - 11];
+
+        /* Copy the corner and the main edge, then prepend the projected
+         * side-edge samples at the negative indices. */
+        for (k = 0; k <= size; k++)
+            ref_tmp[k] = main_edge[k - 1];
+        for (k = last; k <= -1; k++)
+            ref_tmp[k] = side_edge[-1 + ((k * inv + 128) >> 8)];
+        ref = ref_tmp;
+    }
+
+    if (is_vertical) {
+        for (y = 0; y < size; y++) {
+            const int disp    = (y + 1) * angle;
+            const int frac    = disp & 31;
+            const pixel *taps = ref + ((disp >> 5) + 1);
+
+            if (!frac) {
+                for (x = 0; x < size; x++)
+                    POS(x, y) = taps[x];
+            } else {
+                for (x = 0; x < size; x++)
+                    POS(x, y) = ((32 - frac) * taps[x] +
+                                       frac  * taps[x + 1] + 16) >> 5;
+            }
+        }
+        /* Pure vertical luma: adjust the first column by the left gradient. */
+        if (mode == 26 && c_idx == 0 && size < 32) {
+            for (y = 0; y < size; y++)
+                POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
+        }
+    } else {
+        for (x = 0; x < size; x++) {
+            const int disp    = (x + 1) * angle;
+            const int frac    = disp & 31;
+            const pixel *taps = ref + ((disp >> 5) + 1);
+
+            if (!frac) {
+                for (y = 0; y < size; y++)
+                    POS(x, y) = taps[y];
+            } else {
+                for (y = 0; y < size; y++)
+                    POS(x, y) = ((32 - frac) * taps[y] +
+                                       frac  * taps[y + 1] + 16) >> 5;
+            }
+        }
+        /* Pure horizontal luma: adjust the first row by the top gradient. */
+        if (mode == 10 && c_idx == 0 && size < 32) {
+            for (x = 0; x < size; x++)
+                POS(x, 0) = av_clip_pixel(left[0] + ((top[x] - top[-1]) >> 1));
+        }
+    }
+}
+
+/* Angular prediction specialised for 4x4 blocks. */
+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
+                                 const uint8_t *left,
+                                 ptrdiff_t stride, int c_idx, int mode)
+{
+    enum { BLOCK_SIZE = 4 };
+
+    FUNC(pred_angular)(src, top, left, stride, c_idx, mode, BLOCK_SIZE);
+}
+
+/* Angular prediction specialised for 8x8 blocks. */
+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
+                                 const uint8_t *left,
+                                 ptrdiff_t stride, int c_idx, int mode)
+{
+    enum { BLOCK_SIZE = 8 };
+
+    FUNC(pred_angular)(src, top, left, stride, c_idx, mode, BLOCK_SIZE);
+}
+
+/* Angular prediction specialised for 16x16 blocks. */
+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
+                                 const uint8_t *left,
+                                 ptrdiff_t stride, int c_idx, int mode)
+{
+    enum { BLOCK_SIZE = 16 };
+
+    FUNC(pred_angular)(src, top, left, stride, c_idx, mode, BLOCK_SIZE);
+}
+
+/* Angular prediction specialised for 32x32 blocks. */
+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
+                                 const uint8_t *left,
+                                 ptrdiff_t stride, int c_idx, int mode)
+{
+    enum { BLOCK_SIZE = 32 };
+
+    FUNC(pred_angular)(src, top, left, stride, c_idx, mode, BLOCK_SIZE);
+}
+#undef POS
diff --git a/libavcodec/version.h b/libavcodec/version.h
index c540c27bfa..ac8dea8573 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -29,7 +29,7 @@
 #include "libavutil/avutil.h"
 
 #define LIBAVCODEC_VERSION_MAJOR 55
-#define LIBAVCODEC_VERSION_MINOR  36
+#define LIBAVCODEC_VERSION_MINOR  37
 #define LIBAVCODEC_VERSION_MICRO 100
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
author	Guillaume Martres <[email protected]>	2013-10-12 11:55:48 +0200
committer	Michael Niedermayer <[email protected]>	2013-10-15 22:13:02 +0200
commit	c8dd048ab8cff815c9f4b16a62db0b74df011f0a (patch)
tree	e9167d50e3b802a195b6fcfb4c042332f0d2b469
parent	2a19fcc12311f71f55eab7129b764d4cb800c934 (diff)