[x264-devel] commit: Dramatically reduce size of pixel_ssd_* asm functions (Jason Garrett-Glaser)
git version control
git at videolan.org
Mon Nov 9 05:22:03 CET 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Fri Oct 30 18:58:03 2009 -0700 | [96bad1a71c656f4f05632d685d4b91bd54d5ce58] | committer: Jason Garrett-Glaser
Dramatically reduce size of pixel_ssd_* asm functions
~10k of code size eliminated.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=96bad1a71c656f4f05632d685d4b91bd54d5ce58
---
common/x86/pixel-a.asm | 114 ++++++++++++++++++++++++++++--------------------
common/x86/x86inc.asm | 14 +++---
2 files changed, 74 insertions(+), 54 deletions(-)
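
For context before the diff: the old functions fully unrolled their row loops with %rep, so every pixel_ssd_WxH variant carried its own copy of the SSD core. The new code keeps one run-time loop per square size (the iteration count goes in al) and turns each non-square size into a two-instruction stub that jumps into the square variant's loop. A minimal standalone sketch of the pattern (the function names and the hardcoded 16-byte vector width are illustrative, not the committed code):

    SECTION .text

    %macro SSD_STUB 2                 ; args: width, height
    ssd_%1x%2:
        mov  al, %1*%2/16/2           ; iterations = pixels / (16 per load * 2 rows)
    %if %1 != %2
        jmp  ssd_%1x%1.startloop      ; non-square size: borrow the square loop
    %else
    .startloop:
    .loop:
        ; the real body loads two rows of both blocks, pmaddwd's the
        ; differences, and accumulates into m0 here
        dec  al
        jg   .loop
        ret
    %endif
    %endmacro

    SSD_STUB 8, 8                     ; emits the shared loop once
    SSD_STUB 8, 4                     ; emits only a mov and a jmp

Repeated across six block sizes and four instruction sets, dropping the unrolled bodies is where the ~10k of code size goes.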
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index e691c25..15e39b6 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -77,13 +77,16 @@ SECTION .text
;=============================================================================
%macro SSD_LOAD_FULL 5
- mova m1, [r0+%1]
- mova m2, [r2+%2]
- mova m3, [r0+%3]
- mova m4, [r2+%4]
-%if %5
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+ mova m1, [t0+%1]
+ mova m2, [t2+%2]
+ mova m3, [t0+%3]
+ mova m4, [t2+%4]
+%if %5==1
+ add t0, t1
+ add t2, t3
+%elif %5==2
+ lea t0, [t0+2*t1]
+ lea t2, [t2+2*t3]
%endif
%endmacro
@@ -91,7 +94,7 @@ SECTION .text
movh m%1, %3
movh m%2, %4
%if %5
- lea r0, [r0+2*r1]
+ lea t0, [t0+2*t1]
%endif
%endmacro
@@ -99,7 +102,7 @@ SECTION .text
movh m%3, %5
movh m%4, %6
%if %7
- lea r2, [r2+2*r3]
+ lea t2, [t2+2*t3]
%endif
punpcklbw m%1, m7
punpcklbw m%3, m7
@@ -113,7 +116,7 @@ SECTION .text
movh m%3, %5
movh m%4, %6
%if %7
- lea r2, [r2+2*r3]
+ lea t2, [t2+2*t3]
%endif
punpcklqdq m%1, m%2
punpcklqdq m%3, m%4
@@ -126,17 +129,17 @@ SECTION .text
movh m%3, %5
movh m%4, %6
%if %7
- lea r2, [r2+2*r3]
+ lea t2, [t2+2*t3]
%endif
punpcklbw m%1, m%3
punpcklbw m%2, m%4
%endmacro
%macro SSD_LOAD_HALF 5
- LOAD 1, 2, [r0+%1], [r0+%3], 1
- JOIN 1, 2, 3, 4, [r2+%2], [r2+%4], 1
- LOAD 3, 4, [r0+%1], [r0+%3], %5
- JOIN 3, 4, 5, 6, [r2+%2], [r2+%4], %5
+ LOAD 1, 2, [t0+%1], [t0+%3], 1
+ JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
+ LOAD 3, 4, [t0+%1], [t0+%3], %5
+ JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
%endmacro
%macro SSD_CORE 7-8
@@ -152,8 +155,8 @@ SECTION .text
mova m%2, m%1
mova m%4, m%3
punpckhbw m%1, m%5
- punpckhbw m%3, m%5
punpcklbw m%2, m%5
+ punpckhbw m%3, m%5
punpcklbw m%4, m%5
%endif
pmaddwd m%1, m%1
@@ -167,11 +170,11 @@ SECTION .text
DEINTB %6, %1, %7, %2, %5
psubw m%6, m%7
psubw m%1, m%2
- SWAP %2, %6
+ SWAP %6, %2, %1
DEINTB %6, %3, %7, %4, %5
psubw m%6, m%7
psubw m%3, m%4
- SWAP %4, %6
+ SWAP %6, %4, %3
%endif
pmaddwd m%1, m%1
pmaddwd m%2, m%2
@@ -187,7 +190,7 @@ SECTION .text
punpcklbw m%3, m%4
punpckhbw m%6, m%2
punpckhbw m%7, m%4
- SWAP %6, %2
+ SWAP %6, %2, %3
SWAP %7, %4
%endif
pmaddubsw m%1, m%5
@@ -200,28 +203,43 @@ SECTION .text
pmaddwd m%4, m%4
%endmacro
-%macro SSD_END 1
+%macro SSD_ITER 6
+ SSD_LOAD_%1 %2,%3,%4,%5,%6
+ SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
paddd m1, m2
paddd m3, m4
-%if %1
paddd m0, m1
-%else
- SWAP 0, 1
-%endif
paddd m0, m3
%endmacro
-%macro SSD_ITER 7
- SSD_LOAD_%1 %2,%3,%4,%5,%7
- SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
- SSD_END %6
-%endmacro
-
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 3-4 0
-cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
+%if %1 != %2
+ %assign function_align 8
+%else
+ %assign function_align 16
+%endif
+cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
+ mov al, %1*%2/mmsize/2
+
+%if %1 != %2
+ jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop)
+%else
+
+.startloop:
+%ifdef ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3
+%else
+ PROLOGUE 0,5
+ DECLARE_REG_TMP 1,2,3,4
+ mov t0, r0m
+ mov t1, r1m
+ mov t2, r2m
+ mov t3, r3m
+%endif
+
%ifidn %3, ssse3
mova m7, [hsub_mul GLOBAL]
%elifidn %3, sse2
@@ -229,57 +247,57 @@ cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
%elif %1 >= mmsize
pxor m7, m7
%endif
-%assign i 0
-%rep %2/4
+ pxor m0, m0
+
+ALIGN 16
+.loop:
%if %1 > mmsize
- SSD_ITER FULL, 0, 0, mmsize, mmsize, i, 0
- SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, 1
- SSD_ITER FULL, 0, 0, mmsize, mmsize, 1, 0
- SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/4-1
+ SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
%elif %1 == mmsize
- SSD_ITER FULL, 0, 0, r1, r3, i, 1
- SSD_ITER FULL, 0, 0, r1, r3, 1, i<%2/4-1
+ SSD_ITER FULL, 0, 0, t1, t3, 2
%else
- SSD_ITER HALF, 0, 0, r1, r3, i, i<%2/4-1
+ SSD_ITER HALF, 0, 0, t1, t3, 2
%endif
-%assign i i+1
-%endrep
+ dec al
+ jg .loop
HADDD m0, m1
movd eax, m0
RET
+%endif
%endmacro
INIT_MMX
SSD 16, 16, mmx
SSD 16, 8, mmx
-SSD 8, 16, mmx
SSD 8, 8, mmx
+SSD 8, 16, mmx
+SSD 4, 4, mmx
SSD 8, 4, mmx
SSD 4, 8, mmx
-SSD 4, 4, mmx
INIT_XMM
SSD 16, 16, sse2slow, 8
+SSD 8, 8, sse2slow, 8
SSD 16, 8, sse2slow, 8
SSD 8, 16, sse2slow, 8
-SSD 8, 8, sse2slow, 8
SSD 8, 4, sse2slow, 8
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
SSD 16, 16, sse2, 8
+SSD 8, 8, sse2, 8
SSD 16, 8, sse2, 8
SSD 8, 16, sse2, 8
-SSD 8, 8, sse2, 8
SSD 8, 4, sse2, 8
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
SSD 16, 16, ssse3, 8
+SSD 8, 8, ssse3, 8
SSD 16, 8, ssse3, 8
SSD 8, 16, ssse3, 8
-SSD 8, 8, ssse3, 8
SSD 8, 4, ssse3, 8
INIT_MMX
-SSD 4, 8, ssse3
SSD 4, 4, ssse3
+SSD 4, 8, ssse3
+%assign function_align 16
;=============================================================================
; variance
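
The r0..r3 to t0..t3 rename above is what lets a single loop body serve both ABIs: on x86_64 the four arguments already arrive in registers, so DECLARE_REG_TMP 0,1,2,3 simply aliases tN onto them, while the x86_32 path runs a PROLOGUE and copies the stack arguments (r0m..r3m) into scratch registers once, before entering the shared loop. A simplified stand-in for that indirection (the real definitions live in x86inc.asm; the concrete registers below are illustrative):

    %ifdef ARCH_X86_64
        %define t0 rdi                ; arguments already live in registers,
        %define t1 rsi                ; so the temps alias them for free
        %define t2 rdx
        %define t3 rcx
    %else
        %define t0 ecx                ; scratch registers; the 32-bit
        %define t1 edx                ; prologue fills them from the stack
        %define t2 esi                ; once, outside the loop
        %define t3 edi
    %endif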
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index d42f4f3..753fa96 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -40,6 +40,12 @@
%endif
%endif
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
@@ -446,9 +452,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
; Symbol prefix for C linkage
%macro cglobal 1-2+
- %ifdef PREFIX
- %xdefine %1 _ %+ %1
- %endif
+ %xdefine %1 mangle(%1)
%xdefine %1.skip_prologue %1 %+ .skip_prologue
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
@@ -465,9 +469,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endmacro
%macro cextern 1
- %ifdef PREFIX
- %xdefine %1 _%1
- %endif
+ %xdefine %1 mangle(%1)
extern %1
%endmacro
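
The x86inc.asm half of the change exists to support the new cross-function jmp: cglobal previously applied the underscore prefix only to the symbol it was itself defining, so there was no way to spell another function's mangled name. Factoring the logic into mangle() lets jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop) resolve correctly on PREFIX builds (targets whose C symbols get a leading underscore). A standalone illustration of the same mechanism, with made-up function names:

    %ifdef PREFIX
        %define mangle(x) _ %+ x
    %else
        %define mangle(x) x
    %endif

    global mangle(my_func)
    mangle(my_func):                  ; assembles as _my_func when PREFIX is set
        ret

    global mangle(my_caller)
    mangle(my_caller):
        jmp mangle(my_func)           ; cross-function jump, same mangling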