[x265] [PATCH Review Only] optimization of interp_4tap_vert_pp_12x16 asm code

praveen at multicorewareinc.com praveen at multicorewareinc.com
Mon Oct 28 12:18:04 CET 2013


# HG changeset patch
# User Praveen Tiwari
# Date 1382959071 -19800
# Node ID 14ae0b70567b0d78738f2cb064b25c46e437d950
# Parent  1438266dbde617130d04ae13bd0d6e9c0993581c
optimization of interp_4tap_vert_pp_12x16 asm code

diff -r 1438266dbde6 -r 14ae0b70567b source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Oct 28 13:54:08 2013 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon Oct 28 16:47:51 2013 +0530
@@ -630,7 +630,7 @@
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
-%macro FILTER_V8_W12_H2 2
+%macro FILTER_V4_W12_H2 2
 INIT_XMM sse4
 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
 
@@ -647,70 +647,69 @@
 pshufb      m1,        m0,       [tab_Vm]
 pshufb      m0,        [tab_Vm + 16]
 
-xor         r4,        r4
-add         r4d,       %2
+mov         r4d,       %2
 
 .loop
 movu        m2,        [r0]
 movu        m3,        [r0 + r1]
 
 punpcklbw   m4,        m2,        m3,
-punpckhbw   m5,        m2,        m3,
+punpckhbw   m2,        m3,
 
 pmaddubsw   m4,        m1
-pmaddubsw   m5,        m1
+pmaddubsw   m2,        m1
 
-movu        m2,        [r0 + 2 * r1]
+movu        m5,        [r0 + 2 * r1]
 lea         r5,        [r0 + 2 * r1]
 movu        m3,        [r5 + r1]
 
-punpcklbw   m6,        m2,        m3,
-punpckhbw   m7,        m2,        m3,
+punpcklbw   m6,        m5,        m3,
+punpckhbw   m5,        m3,
 
 pmaddubsw   m6,        m0
-pmaddubsw   m7,        m0
+pmaddubsw   m5,        m0
 
-paddw       m4,        m6;
-paddw       m5,        m7;
+paddw       m4,        m6
+paddw       m2,        m5
 
 mova        m6,        [tab_c_512]
 
 pmulhrsw    m4,        m6
-pmulhrsw    m5,        m6
+pmulhrsw    m2,        m6
 
-packuswb    m4,        m5
+packuswb    m4,        m2
 
-movh         [r2],      m4
-pextrd       [r2 + 8],  m4,  2
+movh        [r2],      m4
+pextrd      [r2 + 8],  m4,  2
 
 movu        m2,        [r0 + r1]
 movu        m3,        [r0 + 2 * r1]
 
 punpcklbw   m4,        m2,        m3,
-punpckhbw   m5,        m2,        m3,
+punpckhbw   m2,        m3,
 
 pmaddubsw   m4,        m1
-pmaddubsw   m5,        m1
+pmaddubsw   m2,        m1
 
 lea         r5,        [r0 + 2 * r1]
-movu        m2,        [r5 + r1]
+movu        m5,        [r5 + r1]
 movu        m3,        [r5 + 2 * r1]
 
-punpcklbw   m6,        m2,        m3,
-punpckhbw   m7,        m2,        m3,
+punpcklbw   m6,        m5,        m3,
+punpckhbw   m5,        m3,
 
 pmaddubsw   m6,        m0
-pmaddubsw   m7,        m0
+pmaddubsw   m5,        m0
 
 paddw       m4,        m6
-paddw       m5,        m7
+paddw       m2,        m5
 
 mova        m6,        [tab_c_512]
 
 pmulhrsw    m4,        m6
-pmulhrsw    m5,        m6
+pmulhrsw    m2,        m6
 
-packuswb    m4,        m5
+packuswb    m4,        m2
 
 movh        [r2 + r3],      m4
 pextrd      [r2 + r3 + 8],  m4,  2
@@ -723,4 +722,4 @@
 RET
 %endmacro
 
-FILTER_V8_W12_H2 12, 16
+FILTER_V4_W12_H2 12, 16


More information about the x265-devel mailing list