aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec
diff options
context:
space:
mode:
authorJames Almer <jamrial@gmail.com>2014-09-17 21:45:38 -0300
committerJames Almer <jamrial@gmail.com>2014-09-17 23:52:36 -0300
commit77f9a81ccaadb34f309dc8922e8939442e4e81aa (patch)
tree8a0536c0c24e1bfac987a2cf8c19ddcf90bdc334 /libavcodec
parent986f63d17fa14e757fd11b4b50acac563bd1bb9d (diff)
downloadffmpeg-77f9a81ccaadb34f309dc8922e8939442e4e81aa.tar.gz
x86/me_cmp: combine sad functions into a single macro
No point in having the sad8 functions separate now that the loop is no longer unrolled. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec')
-rw-r--r--libavcodec/x86/me_cmp.asm221
1 files changed, 93 insertions, 128 deletions
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index ef591f54b5..b657642c41 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -473,43 +473,35 @@ HF_NOISE 16
;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
;---------------------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal sad8, 5, 5, 0, v, pix1, pix2, stride, h
+;%1 = 8/16
+%macro SAD 1
+cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
movu m2, [pix2q]
movu m1, [pix2q+strideq]
psadbw m2, [pix1q]
psadbw m1, [pix1q+strideq]
paddw m2, m1
- sub hd, 2
-
-.loop:
- lea pix1q, [pix1q+strideq*2]
- lea pix2q, [pix2q+strideq*2]
- movu m0, [pix2q]
- movu m1, [pix2q+strideq]
- psadbw m0, [pix1q]
- psadbw m1, [pix1q+strideq]
+%if %1 != mmsize
+ movu m0, [pix2q+8]
+ movu m1, [pix2q+strideq+8]
+ psadbw m0, [pix1q+8]
+ psadbw m1, [pix1q+strideq+8]
paddw m2, m0
paddw m2, m1
+%endif
sub hd, 2
- jne .loop
-
- movd eax, m2
- RET
-
-%macro SAD16 0
-cglobal sad16, 5, 5, 3, v, pix1, pix2, stride, h
- pxor m2, m2
align 16
-.loop
+.loop:
+ lea pix1q, [pix1q+strideq*2]
+ lea pix2q, [pix2q+strideq*2]
movu m0, [pix2q]
movu m1, [pix2q+strideq]
psadbw m0, [pix1q]
psadbw m1, [pix1q+strideq]
paddw m2, m0
paddw m2, m1
-%if mmsize == 8
+%if %1 != mmsize
movu m0, [pix2q+8]
movu m1, [pix2q+strideq+8]
psadbw m0, [pix1q+8]
@@ -517,8 +509,6 @@ align 16
paddw m2, m0
paddw m2, m1
%endif
- lea pix1q, [pix1q+strideq*2]
- lea pix2q, [pix2q+strideq*2]
sub hd, 2
jg .loop
%if mmsize == 16
@@ -530,47 +520,47 @@ align 16
%endmacro
INIT_MMX mmxext
-SAD16
+SAD 8
+SAD 16
INIT_XMM sse2
-SAD16
+SAD 16
;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
;------------------------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal sad8_x2, 5, 5, 0, v, pix1, pix2, stride, h
+;%1 = 8/16
+%macro SAD_X2 1
+cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
movu m0, [pix2q]
movu m2, [pix2q+strideq]
+%if mmsize == 16
+ movu m3, [pix2q+1]
+ movu m4, [pix2q+strideq+1]
+ pavgb m0, m3
+ pavgb m2, m4
+%else
pavgb m0, [pix2q+1]
pavgb m2, [pix2q+strideq+1]
+%endif
psadbw m0, [pix1q]
psadbw m2, [pix1q+strideq]
paddw m0, m2
- sub hd, 2
-
-.loop:
- lea pix1q, [pix1q+2*strideq]
- lea pix2q, [pix2q+2*strideq]
- movu m1, [pix2q]
- movu m2, [pix2q+strideq]
- pavgb m1, [pix2q+1]
- pavgb m2, [pix2q+strideq+1]
- psadbw m1, [pix1q]
- psadbw m2, [pix1q+strideq]
+%if %1 != mmsize
+ movu m1, [pix2q+8]
+ movu m2, [pix2q+strideq+8]
+ pavgb m1, [pix2q+9]
+ pavgb m2, [pix2q+strideq+9]
+ psadbw m1, [pix1q+8]
+ psadbw m2, [pix1q+strideq+8]
paddw m0, m1
paddw m0, m2
+%endif
sub hd, 2
- jne .loop
-
- movd eax, m0
- RET
-
-%macro SAD16_X2 0
-cglobal sad16_x2, 5, 5, 5, v, pix1, pix2, stride, h
- pxor m0, m0
align 16
.loop:
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
movu m1, [pix2q]
movu m2, [pix2q+strideq]
%if mmsize == 16
@@ -586,7 +576,7 @@ align 16
psadbw m2, [pix1q+strideq]
paddw m0, m1
paddw m0, m2
-%if mmsize == 8
+%if %1 != mmsize
movu m1, [pix2q+8]
movu m2, [pix2q+strideq+8]
pavgb m1, [pix2q+9]
@@ -596,8 +586,6 @@ align 16
paddw m0, m1
paddw m0, m2
%endif
- lea pix1q, [pix1q+2*strideq]
- lea pix2q, [pix2q+2*strideq]
sub hd, 2
jg .loop
%if mmsize == 16
@@ -609,56 +597,45 @@ align 16
%endmacro
INIT_MMX mmxext
-SAD16_X2
+SAD_X2 8
+SAD_X2 16
INIT_XMM sse2
-SAD16_X2
+SAD_X2 16
;------------------------------------------------------------------------------------------
;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
;------------------------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal sad8_y2, 5, 5, 0, v, pix1, pix2, stride, h
+;%1 = 8/16
+%macro SAD_Y2 1
+cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
movu m1, [pix2q]
movu m0, [pix2q+strideq]
movu m3, [pix2q+2*strideq]
pavgb m1, m0
pavgb m0, m3
- add pix2q, strideq
psadbw m1, [pix1q]
psadbw m0, [pix1q+strideq]
paddw m0, m1
mova m1, m3
- sub hd, 2
-
-.loop:
- lea pix1q, [pix1q+2*strideq]
- lea pix2q, [pix2q+2*strideq]
- movu m2, [pix2q]
- movu m3, [pix2q+strideq]
- pavgb m1, m2
- pavgb m2, m3
- psadbw m1, [pix1q]
- psadbw m2, [pix1q+strideq]
- paddw m0, m1
- paddw m0, m2
- mova m1, m3
- sub hd, 2
- jne .loop
-
- movd eax, m0
- RET
-
-%macro SAD16_Y2 0
-cglobal sad16_y2, 5, 5, 4, v, pix1, pix2, stride, h
- movu m1, [pix2q]
-%if mmsize == 8
+%if %1 != mmsize
movu m4, [pix2q+8]
+ movu m5, [pix2q+strideq+8]
+ movu m6, [pix2q+2*strideq+8]
+ pavgb m4, m5
+ pavgb m5, m6
+ psadbw m4, [pix1q+8]
+ psadbw m5, [pix1q+strideq+8]
+ paddw m0, m4
+ paddw m0, m5
+ mova m4, m6
%endif
- pxor m0, m0
add pix2q, strideq
+ sub hd, 2
align 16
.loop:
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
movu m2, [pix2q]
movu m3, [pix2q+strideq]
pavgb m1, m2
@@ -668,7 +645,7 @@ align 16
paddw m0, m1
paddw m0, m2
mova m1, m3
-%if mmsize == 8
+%if %1 != mmsize
movu m5, [pix2q+8]
movu m6, [pix2q+strideq+8]
pavgb m4, m5
@@ -679,8 +656,6 @@ align 16
paddw m0, m5
mova m4, m6
%endif
- lea pix1q, [pix1q+2*strideq]
- lea pix2q, [pix2q+2*strideq]
sub hd, 2
jg .loop
%if mmsize == 16
@@ -692,72 +667,63 @@ align 16
%endmacro
INIT_MMX mmxext
-SAD16_Y2
+SAD_Y2 8
+SAD_Y2 16
INIT_XMM sse2
-SAD16_Y2
+SAD_Y2 16
;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
;-------------------------------------------------------------------------------------------
-INIT_MMX mmxext
-cglobal sad8_approx_xy2, 5, 5, 0, v, pix1, pix2, stride, h
- pxor m0, m0
+;%1 = 8/16
+%macro SAD_APPROX_XY2 1
+cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
mova m4, [pb_1]
movu m1, [pix2q]
movu m0, [pix2q+strideq]
movu m3, [pix2q+2*strideq]
+%if mmsize == 16
+ movu m5, [pix2q+1]
+ movu m6, [pix2q+strideq+1]
+ movu m2, [pix2q+2*strideq+1]
+ pavgb m1, m5
+ pavgb m0, m6
+ pavgb m3, m2
+%else
pavgb m1, [pix2q+1]
pavgb m0, [pix2q+strideq+1]
pavgb m3, [pix2q+2*strideq+1]
+%endif
psubusb m0, m4
pavgb m1, m0
pavgb m0, m3
- add pix2q, strideq
psadbw m1, [pix1q]
psadbw m0, [pix1q+strideq]
paddw m0, m1
mova m1, m3
- sub hd, 2
-
-.loop:
- lea pix1q, [pix1q+2*strideq]
- lea pix2q, [pix2q+2*strideq]
- movu m2, [pix2q]
- movu m3, [pix2q+strideq]
- pavgb m2, [pix2q+1]
- pavgb m3, [pix2q+strideq+1]
- psubusb m2, m4
- pavgb m1, m2
- pavgb m2, m3
- psadbw m1, [pix1q]
- psadbw m2, [pix1q+strideq]
- paddw m0, m1
- paddw m0, m2
- mova m1, m3
- sub hd, 2
- jne .loop
-
- movd eax, m0
- RET
-
-%macro SAD16_APPROX_XY2 0
-cglobal sad16_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
- pxor m0, m0
- mova m4, [pb_1]
-
- movu m1, [pix2q]
-%if mmsize == 16
- movu m2, [pix2q+1]
- pavgb m1, m2
-%else
+%if %1 != mmsize
movu m5, [pix2q+8]
- pavgb m1, [pix2q+1]
- pavgb m5, [pix2q+8+1]
+ movu m6, [pix2q+strideq+8]
+ movu m7, [pix2q+2*strideq+8]
+ pavgb m5, [pix2q+1+8]
+ pavgb m6, [pix2q+strideq+1+8]
+ pavgb m7, [pix2q+2*strideq+1+8]
+ psubusb m6, m4
+ pavgb m5, m6
+ pavgb m6, m7
+ psadbw m5, [pix1q+8]
+ psadbw m6, [pix1q+strideq+8]
+ paddw m0, m5
+ paddw m0, m6
+ mova m5, m7
%endif
add pix2q, strideq
+ sub hd, 2
align 16
.loop:
+ lea pix1q, [pix1q+2*strideq]
+ lea pix2q, [pix2q+2*strideq]
movu m2, [pix2q]
movu m3, [pix2q+strideq]
%if mmsize == 16
@@ -777,7 +743,7 @@ align 16
paddw m0, m1
paddw m0, m2
mova m1, m3
-%if mmsize == 8
+%if %1 != mmsize
movu m6, [pix2q+8]
movu m7, [pix2q+strideq+8]
pavgb m6, [pix2q+8+1]
@@ -791,8 +757,6 @@ align 16
paddw m0, m6
mova m5, m7
%endif
- lea pix1q, [pix1q+2*strideq]
- lea pix2q, [pix2q+2*strideq]
sub hd, 2
jg .loop
%if mmsize == 16
@@ -804,6 +768,7 @@ align 16
%endmacro
INIT_MMX mmxext
-SAD16_APPROX_XY2
+SAD_APPROX_XY2 8
+SAD_APPROX_XY2 16
INIT_XMM sse2
-SAD16_APPROX_XY2
+SAD_APPROX_XY2 16