[x265] [PATCH Review only] asm: code for scale2D_64to32 routine

murugan at multicorewareinc.com murugan at multicorewareinc.com
Thu Nov 14 15:52:08 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1384440689 -19800
#      Thu Nov 14 20:21:29 2013 +0530
# Node ID 939b58fa36f56506f32ad761f6c3df72e20e0f2b
# Parent  4526a727f0b4975eeaa1094e0ced0a3b3b5c5a7d
asm: code for scale2D_64to32 routine

diff -r 4526a727f0b4 -r 939b58fa36f5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 14 16:57:19 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 14 20:21:29 2013 +0530
@@ -464,6 +464,7 @@
         PIXEL_AVG_W4(ssse3);
 
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+        p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
 
         p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
         p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
diff -r 4526a727f0b4 -r 939b58fa36f5 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Thu Nov 14 16:57:19 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Thu Nov 14 20:21:29 2013 +0530
@@ -153,6 +153,7 @@
 cextern pb_0
 cextern pb_1
 cextern pw_1
+cextern pw_2
 cextern pw_8
 cextern pw_16
 cextern pw_32
@@ -6846,3 +6847,102 @@
     movu          [r0 + 48],    m4
 
 RET
+
+;-----------------------------------------------------------------
+; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+;
+; Shrinks a 64x64 block to 32x32: every output pixel is the rounded
+; mean of one 2x2 input cell, (a + b + c + d + 2) >> 2.
+;   src rows are 'stride' bytes apart; dst rows are packed 32 bytes apart.
+;   Pixels are handled as single bytes.
+;   The loop runs 32 times and consumes two src rows per pass.
+;
+; Register roles inside the loop:
+;   r0 = dst row, r1 = top src row of the current pair, r2 = stride,
+;   r3d = output rows left, m5 = 0x00ff per-word mask, m6 = rounding (2)
+;-----------------------------------------------------------------
+INIT_XMM ssse3
+cglobal scale2D_64to32, 3, 4, 7, dest, src, stride
+
+    mova        m5,      [pw_00ff]
+    mova        m6,      [pw_2]
+    mov         r3d,     32
+
+.loop:
+    ; Each 16-byte load holds 8 horizontal pixel pairs.  'pand' keeps the
+    ; even pixel of every pair and 'psrlw 8' the odd one, both as words.
+    ; A cell sum is at most 4 * 255 + 2, so plain word adds cannot wrap.
+
+    ; source columns 0..15 -> 8 averaged words in m0
+    movu        m0,      [r1]
+    movu        m1,      [r1 + r2]
+    mova        m2,      m0
+    mova        m3,      m1
+    psrlw       m2,      8
+    psrlw       m3,      8
+    pand        m0,      m5
+    pand        m1,      m5
+    paddw       m0,      m2
+    paddw       m1,      m3
+    paddw       m0,      m1
+    paddw       m0,      m6
+    psrlw       m0,      2
+
+    ; source columns 16..31 -> 8 averaged words in m4
+    movu        m4,      [r1 + 16]
+    movu        m1,      [r1 + r2 + 16]
+    mova        m2,      m4
+    mova        m3,      m1
+    psrlw       m2,      8
+    psrlw       m3,      8
+    pand        m4,      m5
+    pand        m1,      m5
+    paddw       m4,      m2
+    paddw       m1,      m3
+    paddw       m4,      m1
+    paddw       m4,      m6
+    psrlw       m4,      2
+
+    packuswb    m0,      m4             ; output pixels 0..15
+    movu        [r0],    m0
+
+    ; source columns 32..47 -> m0
+    movu        m0,      [r1 + 32]
+    movu        m1,      [r1 + r2 + 32]
+    mova        m2,      m0
+    mova        m3,      m1
+    psrlw       m2,      8
+    psrlw       m3,      8
+    pand        m0,      m5
+    pand        m1,      m5
+    paddw       m0,      m2
+    paddw       m1,      m3
+    paddw       m0,      m1
+    paddw       m0,      m6
+    psrlw       m0,      2
+
+    ; source columns 48..63 -> m4
+    movu        m4,      [r1 + 48]
+    movu        m1,      [r1 + r2 + 48]
+    mova        m2,      m4
+    mova        m3,      m1
+    psrlw       m2,      8
+    psrlw       m3,      8
+    pand        m4,      m5
+    pand        m1,      m5
+    paddw       m4,      m2
+    paddw       m1,      m3
+    paddw       m4,      m1
+    paddw       m4,      m6
+    psrlw       m4,      2
+
+    packuswb    m0,      m4             ; output pixels 16..31
+    movu        [r0 + 16], m0
+
+    ; step to the next output row / next pair of source rows
+    lea         r1,      [r1 + r2 * 2]
+    add         r0,      32
+    dec         r3d
+    jnz         .loop
+
+RET
diff -r 4526a727f0b4 -r 939b58fa36f5 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Thu Nov 14 16:57:19 2013 +0530
+++ b/source/common/x86/pixel.h	Thu Nov 14 20:21:29 2013 +0530
@@ -117,6 +117,7 @@
 int x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
 int x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t);
 void x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t);
+void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t);
 
 DECL_PIXELS(uint64_t, var, mmx2, (pixel * pix, intptr_t i_stride))
 DECL_PIXELS(uint64_t, var, sse2, (pixel * pix, intptr_t i_stride))


More information about the x265-devel mailing list