[x264-devel] commit: Shrink a few x86 asm functions (Jason Garrett-Glaser)
git at videolan.org
git at videolan.org
Thu May 6 07:49:20 CEST 2010
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Fri Apr 30 09:57:55 2010 -0700| [9ec07af8692c9e10f048f0b41897e66ac3a062cb] | committer: Jason Garrett-Glaser
Shrink a few x86 asm functions
Add a few more instructions to cut down on the use of the 4-byte addressing mode.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9ec07af8692c9e10f048f0b41897e66ac3a062cb
---
common/x86/predict-a.asm | 24 ++++++++++++++----------
common/x86/sad-a.asm | 6 +++++-
2 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 4d03f8f..e1378ae 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -506,10 +506,10 @@ cglobal predict_8x8_ddl_mmxext, 2,2
movq mm4, [r1+25]
movq mm1, mm5
psllq mm1, 8
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6
-
-%assign Y 7
+%assign Y 3
%rep 6
movq [r0+Y*FDEC_STRIDE], mm1
movq mm2, mm0
@@ -535,10 +535,10 @@ cglobal predict_8x8_ddr_mmxext, 2,2
movq mm2, [r1+9]
movq mm3, [r1+15]
movq mm4, [r1+17]
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7
PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6
-
-%assign Y 7
+%assign Y 3
%rep 6
movq [r0+Y*FDEC_STRIDE], mm0
movq mm2, mm1
@@ -622,9 +622,10 @@ cglobal predict_8x8_vr_core_mmxext, 2,2
movq mm1, [r1+14]
movq mm4, mm3
pavgb mm3, mm2
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
-%assign Y 0
+%assign Y -4
%rep 3
movq [r0+ Y *FDEC_STRIDE], mm3
movq [r0+(Y+1)*FDEC_STRIDE], mm0
@@ -717,9 +718,10 @@ cglobal predict_8x8_ddl_sse2, 2,2
movdqu xmm2, [r1+17]
movdqa xmm1, xmm3
pslldq xmm1, 1
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
-%assign Y 0
+%assign Y -4
%rep 8
psrldq xmm0, 1
movq [r0+Y*FDEC_STRIDE], xmm0
@@ -735,11 +737,12 @@ cglobal predict_8x8_ddr_sse2, 2,2
movdqu xmm1, [r1+7]
movdqa xmm2, xmm3
psrldq xmm2, 1
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
movdqa xmm1, xmm0
psrldq xmm1, 1
-%assign Y 7
+%assign Y 3
%rep 3
movq [r0+Y*FDEC_STRIDE], xmm0
movq [r0+(Y-1)*FDEC_STRIDE], xmm1
@@ -747,8 +750,8 @@ cglobal predict_8x8_ddr_sse2, 2,2
psrldq xmm1, 2
%assign Y (Y-2)
%endrep
- movq [r0+1*FDEC_STRIDE], xmm0
- movq [r0+0*FDEC_STRIDE], xmm1
+ movq [r0-3*FDEC_STRIDE], xmm0
+ movq [r0-4*FDEC_STRIDE], xmm1
RET
@@ -763,11 +766,12 @@ cglobal predict_8x8_vl_sse2, 2,2
psrldq xmm2, 1
pslldq xmm1, 1
pavgb xmm3, xmm2
+ add r0, FDEC_STRIDE*4
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
; xmm3: (t0 + t1 + 1) >> 1
-%assign Y 0
+%assign Y -4
%rep 3
psrldq xmm0, 1
movq [r0+ Y *FDEC_STRIDE], xmm3
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 6db8abf..0a5ceb1 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -526,10 +526,14 @@ cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
%endif
%assign x 0
%rep 16
- movzx r4d, byte [r1-1+FDEC_STRIDE*x]
+ movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
+%if (x&3)==3 && x!=15
+ add r1, FDEC_STRIDE*4
+%endif
add r3d, r4d
%assign x x+1
%endrep
+ sub r1, FDEC_STRIDE*12
add r3d, 16
shr r3d, 5
imul r3d, 0x01010101
More information about the x264-devel
mailing list