aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Reimar Döffinger <Reimar.Doeffinger@gmx.de> 2012-08-11 14:15:09 +0200
committer Reimar Döffinger <Reimar.Doeffinger@gmx.de> 2012-08-12 23:23:19 +0200
commit 118bd609f048f457cf42d358a07510b87626f316 (patch)
tree 752a186823b7483c0eb9ed6117367a40a30f3cbb
parent bb7073921c75b1b4f793fca5123f42ac0da0b9b6 (diff)
download ffmpeg-118bd609f048f457cf42d358a07510b87626f316.tar.gz
Optimized unscaled yuvp9/yuvp10 -> yuvp16 conversion.
About 30% faster on a 32-bit Atom and 120% faster on a 64-bit Phenom II. This is interesting because supporting P16 is easier in e.g. OpenGL (one can misuse support for any 2-component 8-bit format), whereas supporting p9/p10 without conversion needs a texture format with at least 14 bits of actual precision. The shiftonly == 0 case is not optimized, since the code is more complex and the speed gain is less obvious.
Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
-rw-r--r-- libswscale/swscale_unscaled.c 27
1 files changed, 27 insertions, 0 deletions
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index c391a07d51..9180f2eb5c 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -830,7 +830,34 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
srcPtr += srcStride[plane];
}
} else if (src_depth <= dst_depth) {
+ int orig_length = length;
for (i = 0; i < height; i++) {
+ if(isBE(c->srcFormat) == HAVE_BIGENDIAN &&
+ isBE(c->dstFormat) == HAVE_BIGENDIAN &&
+ shiftonly) {
+ unsigned shift = dst_depth - src_depth;
+ length = orig_length;
+#if HAVE_FAST_64BIT
+#define FAST_COPY_UP(shift) \
+ for (j = 0; j < length - 3; j += 4) { \
+ uint64_t v = AV_RN64A(srcPtr2 + j); \
+ AV_WN64A(dstPtr2 + j, v << shift); \
+ } \
+ length &= 3;
+#else
+#define FAST_COPY_UP(shift) \
+ for (j = 0; j < length - 1; j += 2) { \
+ uint32_t v = AV_RN32A(srcPtr2 + j); \
+ AV_WN32A(dstPtr2 + j, v << shift); \
+ } \
+ length &= 1;
+#endif
+ switch (shift)
+ {
+ case 6: FAST_COPY_UP(6); break;
+ case 7: FAST_COPY_UP(7); break;
+ }
+ }
#define COPY_UP(r,w) \
if(shiftonly){\
for (j = 0; j < length; j++){ \