about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Kostya Shishkov <kostya.shishkov@gmail.com> 2023-08-07 19:01:42 +0200
committer: Kostya Shishkov <kostya.shishkov@gmail.com> 2023-08-07 19:01:42 +0200
commit: 754ab49a62c862e8c6e66ec88bb7ad626247140e (patch)
tree: 41e92d1b790a37574166bcdc74c22c98e3a9d958
parent: ef19a9351d2cae4bfedcf2acb7b0edb06ee131d5 (diff)
download: nihav-754ab49a62c862e8c6e66ec88bb7ad626247140e.tar.gz
h264: miscellaneous micro-optimisations
-rw-r--r-- nihav-itu/src/codecs/h264/cabac_coder.rs 4
-rw-r--r-- nihav-itu/src/codecs/h264/decoder_st.rs 34
-rw-r--r-- nihav-itu/src/codecs/h264/dsp/mc/mod.rs 4
-rw-r--r-- nihav-itu/src/codecs/h264/dsp/mod.rs 10
-rw-r--r-- nihav-itu/src/codecs/h264/mb_recon.rs 6
-rw-r--r-- nihav-itu/src/codecs/h264/types.rs 30
6 files changed, 57 insertions, 31 deletions
diff --git a/nihav-itu/src/codecs/h264/cabac_coder.rs b/nihav-itu/src/codecs/h264/cabac_coder.rs
index 3e9278e..82c20b0 100644
--- a/nihav-itu/src/codecs/h264/cabac_coder.rs
+++ b/nihav-itu/src/codecs/h264/cabac_coder.rs
@@ -235,8 +235,10 @@ impl<'a> CABAC<'a> {
pub fn decode_012(&mut self, start: usize) -> u8 {
if !self.decode_bit(start) {
0
+ } else if !self.decode_bit(start + 1) {
+ 1
} else {
- self.decode_bit(start + 1) as u8 + 1
+ 2
}
}
fn refill(&mut self) {
diff --git a/nihav-itu/src/codecs/h264/decoder_st.rs b/nihav-itu/src/codecs/h264/decoder_st.rs
index 13fe1bd..7e8b83a 100644
--- a/nihav-itu/src/codecs/h264/decoder_st.rs
+++ b/nihav-itu/src/codecs/h264/decoder_st.rs
@@ -362,25 +362,29 @@ println!("PAFF?");
mb_info.coeffs[i][0] = mb_info.coeffs[24][i];
}
}
- if !mb_info.transform_size_8x8 {
- let quant_dc = !mb_info.mb_type.is_intra16x16();
- for i in 0..16 {
- if mb_info.coded[i] {
- if !tx_bypass {
- idct(&mut mb_info.coeffs[i], qp_y, quant_dc);
+ if !tx_bypass {
+ if !mb_info.transform_size_8x8 {
+ let quant_dc = !mb_info.mb_type.is_intra16x16();
+ for (coded, coeffs) in mb_info.coded[..16].iter_mut().zip(mb_info.coeffs[..16].iter_mut()) {
+ if *coded {
+ idct(coeffs, qp_y, quant_dc);
+ } else if has_dc {
+ idct_dc(coeffs, qp_y, quant_dc);
+ *coded = true;
}
- } else if has_dc {
- if !tx_bypass {
- idct_dc(&mut mb_info.coeffs[i], qp_y, quant_dc);
+ }
+ } else {
+ for i in 0..4 {
+ if mb_info.coded[(i & 1) * 2 + (i & 2) * 4] {
+ dequant8x8(&mut mb_info.coeffs8x8[i].coeffs, &pps.scaling_list_8x8[!mb_info.mb_type.is_intra() as usize]);
+ idct8x8(&mut mb_info.coeffs8x8[i].coeffs, qp_y);
}
- mb_info.coded[i] = true;
}
}
- } else {
- for i in 0..4 {
- if mb_info.coded[(i & 1) * 2 + (i & 2) * 4] && !tx_bypass {
- dequant8x8(&mut mb_info.coeffs8x8[i].coeffs, &pps.scaling_list_8x8[!mb_info.mb_type.is_intra() as usize]);
- idct8x8(&mut mb_info.coeffs8x8[i].coeffs, qp_y);
+ } else if !mb_info.transform_size_8x8 {
+ for i in 0..16 {
+ if !mb_info.coded[i] && has_dc {
+ mb_info.coded[i] = true;
}
}
}
diff --git a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs
index 19f2f28..f558441 100644
--- a/nihav-itu/src/codecs/h264/dsp/mc/mod.rs
+++ b/nihav-itu/src/codecs/h264/dsp/mc/mod.rs
@@ -273,7 +273,7 @@ fn put_block_weighted(dst: &mut [u8], stride: usize, src: &[u8], w: usize, h: us
let wshift = wparams[2] as u8;
let bias = (1 << wshift) >> 1;
- for (drow, srow) in dst.chunks_mut(stride).zip(src.chunks(16)).take(h) {
+ for (drow, srow) in dst.chunks_mut(stride).zip(src.chunks_exact(16)).take(h) {
for (dst, &src) in drow[..w].iter_mut().zip(srow.iter()) {
*dst = clip_u8(((i16::from(src) * weight + bias) >> wshift) + offset);
}
@@ -302,7 +302,7 @@ fn put_block_weighted2(dst: &mut [u8], stride: usize, src0: &[u8], src1: &[u8],
let offset = (offset0 + offset1 + 1) >> 1;
let bias = (1 << wshift) >> 1;
- for (drow, (srow0, srow1)) in dst.chunks_mut(stride).zip(src0.chunks(16).zip(src1.chunks(16))).take(h) {
+ for (drow, (srow0, srow1)) in dst.chunks_mut(stride).zip(src0.chunks_exact(16).zip(src1.chunks_exact(16))).take(h) {
for (dst, (&src0, &src1)) in drow[..w].iter_mut().zip(srow0.iter().zip(srow1.iter())) {
*dst = clip_u8(((i16::from(src0) * weight0 + i16::from(src1) * weight1 + bias) >> wshift) + offset);
}
diff --git a/nihav-itu/src/codecs/h264/dsp/mod.rs b/nihav-itu/src/codecs/h264/dsp/mod.rs
index b07ffe8..76936ad 100644
--- a/nihav-itu/src/codecs/h264/dsp/mod.rs
+++ b/nihav-itu/src/codecs/h264/dsp/mod.rs
@@ -130,7 +130,7 @@ pub fn idct_luma_dc(blk: &mut [i16; 16], qp: u8) {
for i in 0..4 {
transform!(luma_dc; blk[i], blk[i + 4], blk[i + 8], blk[i + 12]);
}
- for row in blk.chunks_mut(4) {
+ for row in blk.chunks_exact_mut(4) {
transform!(luma_dc; row[0], row[1], row[2], row[3]);
}
}
@@ -148,7 +148,7 @@ pub fn idct(blk: &mut [i16; 16], qp: u8, quant_dc: bool) {
for (el, &idx) in blk.iter_mut().zip(BLK_INDEX.iter()).skip(start) {
*el = (*el * LEVEL_SCALE[idx][qidx]) << shift;
}
- for row in blk.chunks_mut(4) {
+ for row in blk.chunks_exact_mut(4) {
transform!(row[0], row[1], row[2], row[3], 0);
}
for i in 0..4 {
@@ -228,7 +228,7 @@ pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
*dst = i32::from(src).wrapping_mul(i32::from(qmat[idx])).wrapping_add(bias) >> shift;
}
}
- for row in tmp.chunks_mut(8) {
+ for row in tmp.chunks_exact_mut(8) {
transform!(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);
}
for col in 0..8 {
@@ -242,7 +242,7 @@ pub fn idct8x8(blk: &mut [i16; 64], qp: u8) {
pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16]) {
let out = &mut dst[offset..][..stride * 3 + 4];
- for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks(4)) {
+ for (line, src) in out.chunks_mut(stride).take(4).zip(coeffs.chunks_exact(4)) {
for (dst, src) in line.iter_mut().take(4).zip(src.iter()) {
*dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
}
@@ -251,7 +251,7 @@ pub fn add_coeffs(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16])
pub fn add_coeffs8(dst: &mut [u8], offset: usize, stride: usize, coeffs: &[i16; 64]) {
let out = &mut dst[offset..];
- for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks(8)) {
+ for (line, src) in out.chunks_mut(stride).take(8).zip(coeffs.chunks_exact(8)) {
for (dst, src) in line.iter_mut().take(8).zip(src.iter()) {
*dst = (i32::from(*dst) + i32::from(*src)).max(0).min(255) as u8;
}
diff --git a/nihav-itu/src/codecs/h264/mb_recon.rs b/nihav-itu/src/codecs/h264/mb_recon.rs
index 5d82503..5a204f3 100644
--- a/nihav-itu/src/codecs/h264/mb_recon.rs
+++ b/nihav-itu/src/codecs/h264/mb_recon.rs
@@ -364,7 +364,7 @@ fn do_b_mc(frm: &mut NASimpleVideoFrame<u8>, mode: BMode, xpos: usize, ypos: usi
}
}
-fn do_b_mc_4x4bi(frm: &mut NASimpleVideoFrame<u8>, xpos: usize, ypos: usize, mv: [MV; 2], ref_pic0: Option<NAVideoBufferRef<u8>>, weight0: &WeightInfo, ref_pic1: Option<NAVideoBufferRef<u8>>, weight1: &WeightInfo, mc_dsp: &mut H264MC) {
+fn do_b_mc_4x4bi(frm: &mut NASimpleVideoFrame<u8>, xpos: usize, ypos: usize, mv: &[MV; 2], ref_pic0: Option<NAVideoBufferRef<u8>>, weight0: &WeightInfo, ref_pic1: Option<NAVideoBufferRef<u8>>, weight1: &WeightInfo, mc_dsp: &mut H264MC) {
if !weight0.is_weighted() || !weight1.is_weighted() {
match (ref_pic0, ref_pic1) {
(Some(buf0), Some(buf1)) => {
@@ -585,11 +585,11 @@ pub fn recon_mb(frm: &mut NASimpleVideoFrame<u8>, slice_hdr: &SliceHeader, mb_in
do_b_mc(frm, BMode::Bi, xpos, ypos, 16, 16, mv[0], rpic0, &weight0, mv[1], rpic1, &weight1, mc_dsp);
} else {
for blk4 in 0..16 {
- let mv = sstate.get_cur_blk4(blk4).mv;
let ref_idx = sstate.get_cur_blk8(blk4_to_blk8(blk4)).ref_idx;
let rpic0 = frame_refs.select_ref_pic(0, ref_idx[0].index());
let rpic1 = frame_refs.select_ref_pic(1, ref_idx[1].index());
let (weight0, weight1) = get_weights(slice_hdr, frame_refs, BMode::Bi, weight_mode, ref_idx[0], ref_idx[1]);
+ let mv = &sstate.get_cur_blk4(blk4).mv;
do_b_mc_4x4bi(frm, xpos + (blk4 & 3) * 4, ypos + (blk4 >> 2) * 4, mv, rpic0, &weight0, rpic1, &weight1, mc_dsp);
}
}
@@ -607,11 +607,11 @@ pub fn recon_mb(frm: &mut NASimpleVideoFrame<u8>, slice_hdr: &SliceHeader, mb_in
match subtype {
SubMBType::Direct8x8 => {
for blk in 0..4 {
- let mv = sstate.get_cur_blk4(bx / 4 + (by / 4) * 4).mv;
let ref_idx = sstate.get_cur_blk8(bx / 8 + (by / 8) * 2).ref_idx;
let rpic0 = frame_refs.select_ref_pic(0, ref_idx[0].index());
let rpic1 = frame_refs.select_ref_pic(1, ref_idx[1].index());
let (weight0, weight1) = get_weights(slice_hdr, frame_refs, BMode::Bi, weight_mode, ref_idx[0], ref_idx[1]);
+ let mv = &sstate.get_cur_blk4(bx / 4 + (by / 4) * 4).mv;
do_b_mc_4x4bi(frm, xpos + bx, ypos + by, mv, rpic0, &weight0, rpic1, &weight1, mc_dsp);
bx += 4;
if blk == 1 {
diff --git a/nihav-itu/src/codecs/h264/types.rs b/nihav-itu/src/codecs/h264/types.rs
index 00aa72e..4cc1fca 100644
--- a/nihav-itu/src/codecs/h264/types.rs
+++ b/nihav-itu/src/codecs/h264/types.rs
@@ -551,7 +551,7 @@ impl SliceState {
if cur_cc || top_cc {
self.deblock[y * 4 + x] |= 0x20;
} else {
- if mvdiff4(cur_mv[0], top_mv[0]) || mvdiff4(cur_mv[1], top_mv[1]) || !frefs.cmp_refs(cur_ref, top_ref) {
+ if mvdiff4(cur_mv, top_mv) || !frefs.cmp_refs(cur_ref, top_ref) {
self.deblock[y * 4 + x] |= 0x10;
}
}
@@ -574,7 +574,7 @@ impl SliceState {
} else if cur_cc || left_cc {
self.deblock[y * 4 + x] |= 2;
} else {
- if mvdiff4(cur_mv[0], left_mv[0]) || mvdiff4(cur_mv[1], left_mv[1]) || !frefs.cmp_refs(cur_ref, left_ref) {
+ if mvdiff4(cur_mv, left_mv) || !frefs.cmp_refs(cur_ref, left_ref) {
self.deblock[y * 4 + x] |= 1;
}
}
@@ -917,7 +917,27 @@ impl SliceState {
}
}
-fn mvdiff4(mv1: MV, mv2: MV) -> bool {
- let mv = mv1 - mv2;
- (mv.x.abs() >= 4) || (mv.y.abs() >= 4)
+#[cfg(not(target_arch="x86_64"))]
+fn mvdiff4(mv1: &[MV; 2], mv2: &[MV; 2]) -> bool {
+    // Both deltas are formed up front; any x/y component of magnitude 4 or more yields true.
+    let deltas = [mv1[0] - mv2[0], mv1[1] - mv2[1]];
+    deltas.iter().any(|d| (d.x.abs() >= 4) || (d.y.abs() >= 4))
+}
+
+/// True when any x/y component of either MV pair differs by 4 or more.
+#[cfg(target_arch="x86_64")]
+fn mvdiff4(mv1: &[MV; 2], mv2: &[MV; 2]) -> bool {
+    // SAFETY: assumes [MV; 2] is 8 bytes of packed 16-bit fields (TODO confirm);
+    // MV need not be 8-byte aligned, hence read_unaligned rather than a deref.
+    let mut m0 = unsafe { mv1.as_ptr().cast::<u64>().read_unaligned() };
+    let mut m1 = unsafe { mv2.as_ptr().cast::<u64>().read_unaligned() };
+    let mut flag = false;
+    for _ in 0..4 {
+        // Low 16 bits = one component delta d; d + 3 > 6 (unsigned) means |d| >= 4.
+        let tmp = m0.wrapping_sub(m1) as u16;
+        flag |= tmp.wrapping_add(3) > 6;
+        m0 >>= 16;
+        m1 >>= 16;
+    }
+    flag
}