[x264-devel] commit: Faster SSSE3 hpel_filter_v (Holger Lubitz)

git version control git at videolan.org
Tue Mar 10 12:02:54 CET 2009


x264 | branch: master | Holger Lubitz <holger at lubitz.org> | Mon Mar  9 14:05:16 2009 -0700| [96733ab692b4a268685d65070d6977964a466c91] | committer: Jason Garrett-Glaser 

Faster SSSE3 hpel_filter_v
~10% faster hpel_filter on 64-bit Penryn.
32-bit version by Jason Garrett-Glaser.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=96733ab692b4a268685d65070d6977964a466c91
---

 common/x86/mc-a2.asm |   64 +++++++++++++++++++++++++++++++++----------------
 1 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index d0ca1ef..775a84e 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -28,7 +28,9 @@
 
 SECTION_RODATA
 
-pb_1:  times 16 db 1
+filt_mul20: times 16 db 20
+filt_mul51: times 8 db 1, -5
+
 pw_1:  times 8 dw 1
 pw_16: times 8 dw 16
 pw_32: times 8 dw 32
@@ -122,13 +124,13 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
 %ifnidn %1, ssse3
     pxor m0, m0
 %else
-    mova m0, [pb_1 GLOBAL]
+    mova m0, [filt_mul51 GLOBAL]
 %endif
 .loop:
 %ifidn %1, ssse3
     mova m1, [r1]
-    mova m4, [r5+r3*2]
-    mova m2, [r1+r3]
+    mova m4, [r1+r3]
+    mova m2, [r5+r3*2]
     mova m5, [r5+r3]
     mova m3, [r1+r3*2]
     mova m6, [r5]
@@ -139,15 +141,19 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
     pmaddubsw m4, m0
     pmaddubsw m2, m0
     pmaddubsw m5, m0
-    pmaddubsw m3, m0
-    pmaddubsw m6, m0
+    pmaddubsw m3, [filt_mul20 GLOBAL]
+    pmaddubsw m6, [filt_mul20 GLOBAL]
+    paddw  m1, m2
+    paddw  m4, m5
+    paddw  m1, m3
+    paddw  m4, m6
 %else
     LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7            ; a0 / a1
     LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7            ; b0 / b1
     LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7                ; c0
     LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
-%endif
     FILT_V2
+%endif
     mova      m7, [pw_16 GLOBAL]
     mova      [r2+r4*2], m1
     mova      [r2+r4*2+mmsize], m4
@@ -424,27 +430,41 @@ HPEL_V ssse3
 %macro DO_FILT_V 6
 %ifidn %6, ssse3
     mova m1, [r3]
-    mova m5, [r1+r2*2]
     mova m2, [r3+r2]
-    mova m6, [r1+r2]
-    mova m3, [pb_1 GLOBAL]
+    mova %3, [r3+r2*2]
+    mova m3, [r1]
+    mova %4, [r1+r2]
+    mova m0, [r1+r2*2]
+    mova %2, [filt_mul51 GLOBAL]
     mova m4, m1
-    punpcklbw m1, m5
-    punpckhbw m4, m5
-    mova m5, m2
-    punpcklbw m2, m6
-    punpckhbw m5, m6
-    pmaddubsw m1, m3
-    pmaddubsw m4, m3
-    pmaddubsw m2, m3
-    pmaddubsw m5, m3
+    punpcklbw m1, m2
+    punpckhbw m4, m2
+    mova m2, m0
+    punpcklbw m0, %4
+    punpckhbw m2, %4
+    mova %1, m3
+    punpcklbw m3, %3
+    punpckhbw %1, %3
+    mova %3, m3
+    mova %4, %1
+    pmaddubsw m1, %2
+    pmaddubsw m4, %2
+    pmaddubsw m0, %2
+    pmaddubsw m2, %2
+    pmaddubsw m3, [filt_mul20 GLOBAL]
+    pmaddubsw %1, [filt_mul20 GLOBAL]
+    psrlw     %3, 8
+    psrlw     %4, 8
+    paddw m1, m0
+    paddw m4, m2
+    paddw m1, m3
+    paddw m4, %1
 %else
     LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5            ; a0 / a1
     LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6            ; b0 / b1
-%endif
-    ; H filter depends on LOAD_ADD writing unpacked words from [r3+r2*2] to %3 %4
     LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4            ; c0 / c1
     FILT_V2
+%endif
     mova      %1, m1
     mova      %2, m4
     paddw     m1, m15
@@ -522,7 +542,9 @@ cglobal x264_hpel_filter_%1, 7,7,16
     sub       r3, r2
     sub       r3, r2
     mov       r4, r10
+%ifidn %1, sse2
     pxor      m0, m0
+%endif
     pcmpeqw  m15, m15
     psrlw    m15, 15 ; pw_1
     psllw    m15, 4



More information about the x264-devel mailing list