aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/alpha
diff options
context:
space:
mode:
author Nick Kurshev <nickols_k@mail.ru> 2002-01-20 14:48:02 +0000
committer Nick Kurshev <nickols_k@mail.ru> 2002-01-20 14:48:02 +0000
commit 1e98dffb7aa4b4681ecc7949e7ad58acc80ad86a (patch)
tree eda5315707572d48e2f75e55cd210254068fea18 /libavcodec/alpha
parent 4bdd9157cc0b06c7001cb93e5cdd6304306253c4 (diff)
download ffmpeg-1e98dffb7aa4b4681ecc7949e7ad58acc80ad86a.tar.gz
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
Originally committed as revision 274 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/alpha')
-rw-r--r-- libavcodec/alpha/asm.h | 141
-rw-r--r-- libavcodec/alpha/dsputil_alpha.c | 223
-rw-r--r-- libavcodec/alpha/mpegvideo_alpha.c | 88
-rw-r--r-- libavcodec/alpha/pixops.h | 135
4 files changed, 587 insertions, 0 deletions
diff --git a/libavcodec/alpha/asm.h b/libavcodec/alpha/asm.h
new file mode 100644
index 0000000000..088497c24c
--- /dev/null
+++ b/libavcodec/alpha/asm.h
@@ -0,0 +1,141 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef LIBAVCODEC_ALPHA_ASM_H
+#define LIBAVCODEC_ALPHA_ASM_H
+
+#include <stdint.h>
+
+#define AMASK_BWX (1 << 0)
+#define AMASK_FIX (1 << 1)
+#define AMASK_MVI (1 << 8)
+
+static inline uint64_t BYTE_VEC(uint64_t x) /* Replicate x into all eight byte lanes of a 64-bit word by OR-ing shifted copies. */
+{
+ x |= x << 8; /* lanes 0-1 filled */
+ x |= x << 16; /* lanes 0-3 filled */
+ x |= x << 32; /* lanes 0-7 filled */
+ return x; /* a clean broadcast only when x fits in 8 bits, as at every call site visible in this patch */
+}
+static inline uint64_t WORD_VEC(uint64_t x) /* Replicate x into all four 16-bit lanes of a 64-bit word. */
+{
+ x |= x << 16; /* lanes 0-1 filled */
+ x |= x << 32; /* lanes 0-3 filled */
+ return x; /* lanes overlap (OR together) if x has bits above bit 15, so callers are expected to pass a 16-bit value */
+}
+
+static inline int32_t ldl(const void* p) /* Load a signed 32-bit value from p. */
+{
+ return *(const int32_t*) p; /* plain dereference: presumably requires p to be 4-byte aligned -- TODO confirm at call sites */
+}
+static inline uint64_t ldq(const void* p) /* Load an unsigned 64-bit value from p. */
+{
+ return *(const uint64_t*) p; /* plain dereference: callers in pixops.h only use it when the address is 8-byte aligned */
+}
+/* Load the aligned 64-bit word that contains p (low three address bits ignored). FIXME ccc doesn't seem to get it? Use inline asm? */
+static inline uint64_t ldq_u(const void* p)
+{
+ return *(const uint64_t*) ((uintptr_t) p & ~7ul); /* round the address down to a multiple of 8 before dereferencing */
+}
+static inline void stl(uint32_t l, void* p) /* Store the 32-bit value l at p; note the value comes first, the address second. */
+{
+ *(uint32_t*) p = l; /* plain store: presumably requires p to be 4-byte aligned -- TODO confirm at call sites */
+}
+static inline void stq(uint64_t l, void* p) /* Store the 64-bit value l at p; note the value comes first, the address second. */
+{
+ *(uint64_t*) p = l; /* plain store: presumably requires p to be 8-byte aligned -- TODO confirm at call sites */
+}
+
+#ifdef __GNUC__
+#define OPCODE1(name) /* Define name(l): a wrapper that emits the one-operand instruction "name l, result". */ \
+static inline uint64_t name(uint64_t l) \
+{ \
+ uint64_t r; \
+ asm (#name " %1, %0" : "=r" (r) : "r" (l)); /* #name turns the macro argument into the mnemonic string */ \
+ return r; \
+}
+
+#define OPCODE2(name) /* Define name(l1, l2): a wrapper that emits the two-operand instruction "name l1, l2, result". */ \
+static inline uint64_t name(uint64_t l1, uint64_t l2) \
+{ \
+ uint64_t r; \
+ asm (#name " %1, %2, %0" : "=r" (r) : "r" (l1), "rI" (l2)); /* "rI": l2 may be a register or a constant accepted by GCC's Alpha 'I' constraint */ \
+ return r; \
+}
+
+/* Return the value produced by the rpcc instruction. The asm is volatile so
+ that gcc neither moves it around nor combines it with another rpcc. */
+static inline uint64_t rpcc(void)
+{
+ uint64_t r;
+ asm volatile ("rpcc %0" : "=r" (r)); /* no inputs; the result is written to r */
+ return r;
+}
+
+static inline uint64_t uldq(const void* v) /* Load 64 bits from v, which need not be 8-byte aligned. */
+{
+ struct foo { /* packed wrapper type: lets GCC assume no alignment for the member access below */
+ unsigned long l; /* assumes unsigned long is 64 bits wide, matching the uint64_t return type -- TODO confirm for the target ABI */
+ } __attribute__((packed));
+
+ return ((const struct foo*) v)->l;
+}
+
+#elif defined(__DECC) /* Compaq "ccc" compiler */
+
+#include <c_asm.h>
+#define OPCODE1(name) \
+static inline uint64_t name(uint64_t l) \
+{ \
+ return asm (#name " %a0, %v0", l); \
+}
+
+#define OPCODE2(name) \
+static inline uint64_t name(uint64_t l1, uint64_t l2) \
+{ \
+ return asm (#name " %a0, %a1, %v0", l1, l2); \
+}
+
+static inline uint64_t rpcc(void)
+{
+ return asm ("rpcc %v0");
+}
+
+static inline uint64_t uldq(const void* v)
+{
+ return *(const __unaligned uint64_t *) v;
+}
+
+#endif
+
+OPCODE1(amask); /* architecture mask: returns its operand with the bits of implemented features cleared */
+OPCODE1(unpkbw); /* unpack the low four bytes into four 16-bit words */
+OPCODE1(pkwb); /* pack the low byte of each of four words into the low four bytes */
+OPCODE2(extql); /* extract quadword low (unaligned-access helper) */
+OPCODE2(extqh); /* extract quadword high (unaligned-access helper) */
+OPCODE2(zap); /* zero the bytes of l1 selected by the low eight bits of l2 */
+OPCODE2(cmpbge); /* per-byte unsigned l1 >= l2; returns one result bit per byte */
+OPCODE2(minsw4); /* per-word signed minimum, four 16-bit lanes */
+OPCODE2(minuw4); /* per-word unsigned minimum, four 16-bit lanes */
+OPCODE2(minub8); /* per-byte unsigned minimum, eight lanes */
+OPCODE2(maxsw4); /* per-word signed maximum, four 16-bit lanes */
+OPCODE2(maxuw4); /* per-word unsigned maximum, four 16-bit lanes */
+OPCODE2(perr); /* pixel error: sum of absolute differences of the eight byte lanes */
+
+#endif /* LIBAVCODEC_ALPHA_ASM_H */
diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/dsputil_alpha.c
new file mode 100644
index 0000000000..7212a659bd
--- /dev/null
+++ b/libavcodec/alpha/dsputil_alpha.c
@@ -0,0 +1,223 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "asm.h"
+#include "../dsputil.h"
+
+void simple_idct_axp(DCTELEM *block);
+
+static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, /* Clamp 8 rows of 8 coefficients to 0..255 and store them as bytes. */
+ int line_size) /* line_size: distance in bytes between consecutive output rows */
+{
+ int i = 8; /* row counter */
+ do {
+ UINT64 shorts;
+
+ shorts = ldq(block); /* four coefficients at once (assumes DCTELEM is 16 bits wide -- TODO confirm) */
+ shorts = maxsw4(shorts, 0); /* negative words -> 0 */
+ shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* words above 255 -> 255 */
+ stl(pkwb(shorts), pixels); /* pack to four bytes and store */
+
+ shorts = ldq(block + 4); /* second half of the row */
+ shorts = maxsw4(shorts, 0);
+ shorts = minsw4(shorts, WORD_VEC(0x00ff));
+ stl(pkwb(shorts), pixels + 4);
+
+ pixels += line_size;
+ block += 8; /* coefficient rows are contiguous, 8 per row */
+ } while (--i);
+}
+
+static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, /* Add 8 rows of 8 coefficients to the pixels in place, clamping to 0..255. */
+ int line_size) /* line_size: distance in bytes between consecutive pixel rows */
+{
+ int i = 8; /* row counter */
+ do {
+ UINT64 shorts;
+
+ shorts = ldq(block); /* four coefficients at once (assumes DCTELEM is 16 bits wide -- TODO confirm) */
+ shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
+ shorts += unpkbw(ldl(pixels)); /* add the four current pixels, widened to words */
+ shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
+ shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
+ shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
+ shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
+ stl(pkwb(shorts), pixels); /* pack to four bytes and write back */
+
+ /* next 4 */
+ shorts = ldq(block + 4);
+ shorts &= ~WORD_VEC(0x8000);
+ shorts += unpkbw(ldl(pixels + 4));
+ shorts &= ~WORD_VEC(0x8000);
+ shorts = minuw4(shorts, WORD_VEC(0x4000));
+ shorts &= ~WORD_VEC(0x4000);
+ shorts = minsw4(shorts, WORD_VEC(0x00ff));
+ stl(pkwb(shorts), pixels + 4);
+
+ pixels += line_size;
+ block += 8; /* coefficient rows are contiguous, 8 per row */
+ } while (--i);
+}
+
+/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
+ Since the intermediate result could be greater than 255, we do the
+ shift first. The result is too low by one if the bytes were both
+ odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
+static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
+{
+ UINT64 correction = (l1 & l2) & BYTE_VEC(0x01); /* 1 in every lane where both inputs are odd */
+ l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; /* clear each low bit first so the shift cannot leak it into the neighbouring byte */
+ l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
+ return l1 + l2 + correction; /* each lane is at most 127 + 127 + 1, so no carry crosses lanes */
+}
+
+/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
+ The '1' only has an effect when one byte is even and the other odd,
+ i. e. on top of the no-rounding correction we add (l1 ^ l2) & BYTE_VEC(0x01).
+ Together the two corrections are equivalent to (l1 | l2) & BYTE_VEC(0x01). */
+static inline UINT64 avg2(UINT64 l1, UINT64 l2)
+{
+ UINT64 correction = (l1 | l2) & BYTE_VEC(0x01); /* 1 in every lane where at least one input is odd */
+ l1 = (l1 & ~BYTE_VEC(0x01)) >> 1; /* clear each low bit first so the shift cannot leak it into the neighbouring byte */
+ l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
+ return l1 + l2 + correction; /* each lane is at most 127 + 127 + 1, so no carry crosses lanes */
+}
+
+static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) /* Per byte lane: (b1 + b2 + b3 + b4 + 2) >> 2. */
+{
+ UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) /* r1: sum of the four inputs' upper six bits, pre-shifted so lanes cannot overflow */
+ + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+ UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) /* r2: sum of the low two bits plus the rounding constant, then divided by 4 */
+ + (l2 & BYTE_VEC(0x03))
+ + (l3 & BYTE_VEC(0x03))
+ + (l4 & BYTE_VEC(0x03))
+ + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); /* final mask drops bits shifted in from the neighbouring lane */
+ return r1 + r2;
+}
+
+static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) /* Per byte lane: (b1 + b2 + b3 + b4 + 1) >> 2. */
+{
+ UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) /* r1: sum of the four inputs' upper six bits, pre-shifted so lanes cannot overflow */
+ + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+ UINT64 r2 = (( (l1 & BYTE_VEC(0x03)) /* r2: sum of the low two bits plus 1 (instead of avg4's 2), then divided by 4 */
+ + (l2 & BYTE_VEC(0x03))
+ + (l3 & BYTE_VEC(0x03))
+ + (l4 & BYTE_VEC(0x03))
+ + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03); /* final mask drops bits shifted in from the neighbouring lane */
+ return r1 + r2;
+}
+
+#define PIXOPNAME(suffix) put ## suffix /* generates put_pixels_axp, put_pixels_x2_axp, put_pixels_y2_axp, put_pixels_xy2_axp */
+#define BTYPE UINT8 /* destination element type */
+#define AVG2 avg2 /* rounding two-way average */
+#define AVG4 avg4 /* rounding four-way average */
+#define STORE(l, b) stq(l, b) /* plain overwrite of the destination quadword */
+#include "pixops.h"
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+#define PIXOPNAME(suffix) put_no_rnd ## suffix /* generates the put_no_rnd_pixels_*_axp family */
+#define BTYPE UINT8 /* destination element type */
+#define AVG2 avg2_no_rnd /* truncating two-way average */
+#define AVG4 avg4_no_rnd /* four-way average with the smaller rounding constant */
+#define STORE(l, b) stq(l, b) /* plain overwrite of the destination quadword */
+#include "pixops.h"
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+/* The following functions are untested. */
+#if 0
+
+#define PIXOPNAME(suffix) avg ## suffix /* generates the avg_pixels_*_axp family */
+#define BTYPE UINT8
+#define AVG2 avg2
+#define AVG4 avg4
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b) /* blend with the existing destination; no trailing ';', matching the put variants */
+#include "pixops.h"
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+#define PIXOPNAME(suffix) avg_no_rnd ## suffix /* generates the avg_no_rnd_pixels_*_axp family */
+#define BTYPE UINT8
+#define AVG2 avg2_no_rnd
+#define AVG4 avg4_no_rnd
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b) /* blend with the existing destination; no trailing ';', matching the put variants */
+#include "pixops.h"
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+#define PIXOPNAME(suffix) sub ## suffix /* generates the sub_pixels_*_axp family (disabled by the enclosing #if 0) */
+#define BTYPE DCTELEM /* NOTE(review): pixops.h advances block by line_size elements per row; confirm that is intended for a DCTELEM destination */
+#define AVG2 avg2
+#define AVG4 avg4
+#define STORE(l, block) do { /* subtract each of the 8 bytes in l from the matching coefficient */ \
+ UINT64 xxx = l; /* evaluate l exactly once */ \
+ (block)[0] -= (xxx >> 0) & 0xff; \
+ (block)[1] -= (xxx >> 8) & 0xff; \
+ (block)[2] -= (xxx >> 16) & 0xff; \
+ (block)[3] -= (xxx >> 24) & 0xff; \
+ (block)[4] -= (xxx >> 32) & 0xff; \
+ (block)[5] -= (xxx >> 40) & 0xff; \
+ (block)[6] -= (xxx >> 48) & 0xff; \
+ (block)[7] -= (xxx >> 56) & 0xff; \
+} while (0)
+#include "pixops.h"
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+#endif
+
+void dsputil_init_alpha(void) /* Install the Alpha routines into the dsputil function-pointer tables declared outside this file. */
+{
+ put_pixels_tab[0] = put_pixels_axp; /* slots 0..3: plain copy, x2, y2, xy2 variants */
+ put_pixels_tab[1] = put_pixels_x2_axp;
+ put_pixels_tab[2] = put_pixels_y2_axp;
+ put_pixels_tab[3] = put_pixels_xy2_axp;
+
+ put_no_rnd_pixels_tab[0] = put_pixels_axp; /* the plain copy does no averaging, so the rounding flavour is reused here */
+ put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
+ put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
+ put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
+
+ /* amask clears all bits that correspond to present features. */
+ if (amask(AMASK_MVI) == 0) { /* MVI bit cleared: the clamped routines' minsw4/maxsw4/pkwb/unpkbw can be used */
+ fprintf(stderr, "MVI extension detected\n"); /* NOTE(review): relies on <stdio.h> arriving via ../dsputil.h -- confirm */
+ put_pixels_clamped = put_pixels_clamped_axp;
+ add_pixels_clamped = add_pixels_clamped_axp;
+ }
+}
diff --git a/libavcodec/alpha/mpegvideo_alpha.c b/libavcodec/alpha/mpegvideo_alpha.c
new file mode 100644
index 0000000000..db4fd3a9db
--- /dev/null
+++ b/libavcodec/alpha/mpegvideo_alpha.c
@@ -0,0 +1,88 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "asm.h"
+#include "../dsputil.h"
+#include "../mpegvideo.h"
+
+extern UINT8 zigzag_end[64];
+
+/* In-place H.263 dequantization of one block: every nonzero AC level becomes
+ * level * 2*qscale +/- ((qscale - 1) | 1), sign following level; an intra DC term is scaled by
+ * s->y_dc_scale (n < 4) or s->c_dc_scale.  Four 16-bit levels are handled per 64-bit word. */
+static void dct_unquantize_h263_axp(MpegEncContext *s,
+                                    DCTELEM *block, int n, int qscale)
+{
+    int i, level;
+    UINT64 qmul = s->qscale << 1;   /* one quantizer source (s->qscale) for every path; the qscale argument is expected to equal it */
+    UINT64 qadd;
+    if (s->mb_intra) {
+        if (n < 4)
+            block[0] = block[0] * s->y_dc_scale;
+        else
+            block[0] = block[0] * s->c_dc_scale;
+        /* Levels 1..3 share a quadword with the DC term, so handle them in scalar code. */
+        qadd = (s->qscale - 1) | 1;
+        for (i = 1; i < 4; ++i) {
+            level = block[i];
+            if (level) {
+                if (level < 0) {
+                    level = level * qmul - qadd;
+                } else {
+                    level = level * qmul + qadd;
+                }
+                block[i] = level;
+            }
+        }
+        block += 4;
+        i = 60 / 4;                 /* remaining 60 levels, four per iteration */
+    } else {
+        i = (zigzag_end[s->block_last_index[n]] + 3) / 4;   /* round up so a partial last group is not dropped; assumes zigzag_end[] holds a count <= 64 -- TODO confirm */
+    }
+    qadd = WORD_VEC((s->qscale - 1) | 1);   /* same value as the scalar path, replicated into all four lanes */
+    for (; i > 0; block += 4, --i) {        /* pre-tested loop: a zero group count can no longer wrap the counter */
+        UINT64 levels, negmask, zeromask, corr;
+        levels = ldq(block);
+        if (levels == 0)
+            continue;
+        zeromask = cmpbge(0, levels);       /* one bit per zero byte */
+        zeromask &= zeromask >> 1;          /* even bit k set when both bytes of a word are zero */
+        /* Negate all negative words. */
+        negmask = maxsw4(levels, WORD_VEC(0xffff)); /* negative -> ffff (-1) */
+        negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
+        corr = negmask & WORD_VEC(0x0001); /* twos-complement correction */
+        levels ^= negmask;
+        levels += corr;
+
+        levels = levels * qmul;             /* lanes are non-negative here; products are assumed to fit in 16 bits */
+        levels += zap(qadd, zeromask);      /* add qadd only to nonzero lanes */
+        /* Re-negate negative words. */
+        levels -= corr;
+        levels ^= negmask;
+        stq(levels, block);
+    }
+}
+
+void MPV_common_init_axp(MpegEncContext *s) /* Hook the Alpha dequantizer into s when the CPU supports it. */
+{
+ if (amask(AMASK_MVI) == 0) { /* amask returns 0 here when the MVI feature bit is implemented */
+ if (s->out_format == FMT_H263) /* only an H.263 dequantizer is provided in this file */
+ s->dct_unquantize = dct_unquantize_h263_axp;
+ }
+}
diff --git a/libavcodec/alpha/pixops.h b/libavcodec/alpha/pixops.h
new file mode 100644
index 0000000000..7ae72ed779
--- /dev/null
+++ b/libavcodec/alpha/pixops.h
@@ -0,0 +1,135 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* This file is intended to be #included with proper definitions of
+ * PIXOPNAME, BTYPE, AVG2, AVG4 and STORE. */
+
+static void PIXOPNAME(_pixels_axp)(BTYPE *block, const UINT8 *pixels, /* Pass h rows of 8 source bytes to STORE, unmodified. */
+ int line_size, int h) /* line_size: per-row step applied to both pointers; h: row count, must be > 0 (do/while) */
+{
+ if ((size_t) pixels & 0x7) { /* source not 8-byte aligned: use the unaligned load */
+ do {
+ STORE(uldq(pixels), block);
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+ } else { /* aligned source: plain quadword load */
+ do {
+ STORE(ldq(pixels), block);
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+ }
+}
+
+static void PIXOPNAME(_pixels_x2_axp)(BTYPE *block, const UINT8 *pixels, /* Per row: AVG2 of each source byte with its right-hand neighbour, passed to STORE. */
+ int line_size, int h) /* line_size: per-row step applied to both pointers; h: row count, must be > 0 (do/while) */
+{
+ if ((size_t) pixels & 0x7) { /* source not 8-byte aligned: use the unaligned load */
+ do {
+ UINT64 pix1, pix2;
+
+ pix1 = uldq(pixels); /* bytes 0..7 */
+ pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); /* bytes 1..8: shift down one lane and bring in the ninth byte on top */
+ STORE(AVG2(pix1, pix2), block);
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+ } else { /* aligned source: plain quadword load */
+ do {
+ UINT64 pix1, pix2;
+
+ pix1 = ldq(pixels); /* bytes 0..7 */
+ pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); /* bytes 1..8 */
+ STORE(AVG2(pix1, pix2), block);
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+ }
+}
+
+static void PIXOPNAME(_pixels_y2_axp)(BTYPE *block, const UINT8 *pixels, /* Per row: AVG2 of each source byte with the byte one row below, passed to STORE. */
+ int line_size, int h) /* reads h + 1 source rows; h must be > 0 (do/while) */
+{
+ if ((size_t) pixels & 0x7) { /* source not 8-byte aligned: use the unaligned load */
+ UINT64 pix = uldq(pixels); /* current row, carried across iterations */
+ do {
+ UINT64 next_pix;
+
+ pixels += line_size;
+ next_pix = uldq(pixels); /* row below */
+ STORE(AVG2(pix, next_pix), block);
+ block += line_size;
+ pix = next_pix; /* reuse the loaded row as the next iteration's upper row */
+ } while (--h);
+ } else { /* aligned source: plain quadword load */
+ UINT64 pix = ldq(pixels); /* current row, carried across iterations */
+ do {
+ UINT64 next_pix;
+
+ pixels += line_size;
+ next_pix = ldq(pixels); /* row below */
+ STORE(AVG2(pix, next_pix), block);
+ block += line_size;
+ pix = next_pix; /* reuse the loaded row as the next iteration's upper row */
+ } while (--h);
+ }
+}
+
+/* Per row: AVG4 of each source byte with its right, lower and lower-right neighbours, passed to STORE.
+ This could be further sped up by recycling AVG4 intermediate results from the previous loop pass. */
+static void PIXOPNAME(_pixels_xy2_axp)(BTYPE *block, const UINT8 *pixels,
+ int line_size, int h) /* reads h + 1 source rows of 9 bytes; h must be > 0 (do/while) */
+{
+ if ((size_t) pixels & 0x7) { /* source not 8-byte aligned: use the unaligned load */
+ UINT64 pix1 = uldq(pixels); /* upper row, bytes 0..7 */
+ UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); /* upper row, bytes 1..8 */
+
+ do {
+ UINT64 next_pix1, next_pix2;
+
+ pixels += line_size;
+ next_pix1 = uldq(pixels); /* lower row, bytes 0..7 */
+ next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56); /* lower row, bytes 1..8 */
+
+ STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);
+
+ block += line_size;
+ pix1 = next_pix1; /* the lower row becomes the next iteration's upper row */
+ pix2 = next_pix2;
+ } while (--h);
+ } else { /* aligned source: plain quadword load */
+ UINT64 pix1 = ldq(pixels); /* upper row, bytes 0..7 */
+ UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56); /* upper row, bytes 1..8 */
+
+ do {
+ UINT64 next_pix1, next_pix2;
+
+ pixels += line_size;
+ next_pix1 = ldq(pixels); /* lower row, bytes 0..7 */
+ next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56); /* lower row, bytes 1..8 */
+
+ STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);
+
+ block += line_size;
+ pix1 = next_pix1; /* the lower row becomes the next iteration's upper row */
+ pix2 = next_pix2;
+ } while (--h);
+ }
+}