[x265] [PATCH Review only] asm: code for sse_pp_12x16 routine

murugan at multicorewareinc.com murugan at multicorewareinc.com
Thu Nov 21 15:48:49 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385045293 -19800
#      Thu Nov 21 20:18:13 2013 +0530
# Node ID 289b23f3ec9e955bfdfd8a1d4ff1910a4f8a7e95
# Parent  44d15f8ce9403cc8c8d97bffee355c1e24ad1271
asm: code for sse_pp_12x16 routine

diff -r 44d15f8ce940 -r 289b23f3ec9e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 21 14:44:06 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 21 20:18:13 2013 +0530
@@ -564,6 +564,7 @@
         PIXEL_AVG_W4(ssse3);
 
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+        p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_ssse3;
 
         p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
         p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
diff -r 44d15f8ce940 -r 289b23f3ec9e source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Thu Nov 21 14:44:06 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Thu Nov 21 20:18:13 2013 +0530
@@ -537,6 +537,86 @@
 %endif ; !HIGH_BIT_DEPTH
 
 ;-----------------------------------------------------------------------------
+; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+; Sum of squared differences over 12 bytes x 16 rows; avoids SSE4.1 pmovzxbw.
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal pixel_ssd_12x16, 4, 5, 8, src1, stride1, src2, stride2
+    pxor        m6,    m6               ; m6 = dword accumulators
+    pxor        m7,    m7               ; m7 = 0, used to zero-extend bytes
+    mov         r4d,   4                ; 4 passes, 4 rows per pass
+
+.loop:
+    movu        m0,    [r0]             ; 16-byte loads; bytes 12..15 are unused
+    movu        m1,    [r2]
+    movu        m2,    [r0 + r1]
+    movu        m3,    [r2 + r3]
+    ; m4/m5 low qword = bytes 8..11 of the first row, then of the second row
+    mova        m4,    m0
+    mova        m5,    m1
+    punpckhdq   m4,    m2
+    punpckhdq   m5,    m3
+    ; punpcklbw with zero widens the low 8 bytes to words (pmovzxbw is SSE4.1)
+    punpcklbw   m0,    m7
+    punpcklbw   m1,    m7
+    punpcklbw   m2,    m7
+    punpcklbw   m3,    m7
+    punpcklbw   m4,    m7
+    punpcklbw   m5,    m7
+
+    psubw       m0,    m1               ; per-pixel differences as signed words
+    psubw       m2,    m3
+    psubw       m4,    m5
+
+    pmaddwd     m0,    m0               ; square each word, add adjacent pairs
+    pmaddwd     m2,    m2
+    pmaddwd     m4,    m4
+
+    paddd       m6,    m0
+    paddd       m6,    m2
+    paddd       m6,    m4
+
+    lea         r0,    [r0 + 2 * r1]    ; step both pointers to the next row pair
+    lea         r2,    [r2 + 2 * r3]
+    movu        m0,    [r0]
+    movu        m1,    [r2]
+    movu        m2,    [r0 + r1]
+    movu        m3,    [r2 + r3]
+
+    mova        m4,    m0
+    mova        m5,    m1
+    punpckhdq   m4,    m2
+    punpckhdq   m5,    m3
+
+    punpcklbw   m0,    m7
+    punpcklbw   m1,    m7
+    punpcklbw   m2,    m7
+    punpcklbw   m3,    m7
+    punpcklbw   m4,    m7
+    punpcklbw   m5,    m7
+
+    psubw       m0,    m1
+    psubw       m2,    m3
+    psubw       m4,    m5
+
+    pmaddwd     m0,    m0
+    pmaddwd     m2,    m2
+    pmaddwd     m4,    m4
+
+    paddd       m6,    m0
+    paddd       m6,    m2
+    paddd       m6,    m4
+
+    lea         r0,    [r0 + 2 * r1]
+    lea         r2,    [r2 + 2 * r3]
+    dec         r4d
+    jnz         .loop
+
+    HADDD       m6,    m1               ; presumably folds m6's dwords (m1 scratch) - macro not shown here
+    movd        eax,   m6               ; return value
+    RET
+
+;-----------------------------------------------------------------------------
 ; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
 ;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
 ;
diff -r 44d15f8ce940 -r 289b23f3ec9e source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Thu Nov 21 14:44:06 2013 +0530
+++ b/source/common/x86/pixel.h	Thu Nov 21 20:18:13 2013 +0530
@@ -372,5 +372,6 @@
 uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
 void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
 void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
+int x265_pixel_ssd_12x16_ssse3(pixel *, intptr_t, pixel *, intptr_t);
 
 #endif // ifndef X265_I386_PIXEL_H


More information about the x265-devel mailing list