[x265] [PATCH] assembly code for pixel_sad_x3_12x16

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Thu Oct 31 12:21:11 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1383218452 -19800
#      Thu Oct 31 16:50:52 2013 +0530
# Node ID 1c0f0aa845b1c8a520aa91c9fffc68144effd75a
# Parent  f6e35bfe1fd67668cc3c18bc41260a3f1d71dffc
assembly code for pixel_sad_x3_12x16

diff -r f6e35bfe1fd6 -r 1c0f0aa845b1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Oct 31 12:58:25 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 31 16:50:52 2013 +0530
@@ -296,6 +296,7 @@
         p.sad_x3[LUMA_8x32]  = x265_pixel_sad_x3_8x32_ssse3;
         p.sad_x4[LUMA_8x32]  = x265_pixel_sad_x4_8x32_ssse3;
 
+        p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
         p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ssse3;
         p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
         p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ssse3;
@@ -340,6 +341,7 @@
         SA8D_INTER_FROM_BLOCK(avx);
         ASSGN_SSE(avx);
 
+        p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
         p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
         p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
         p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_avx;
diff -r f6e35bfe1fd6 -r 1c0f0aa845b1 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Thu Oct 31 12:58:25 2013 +0530
+++ b/source/common/x86/sad-a.asm	Thu Oct 31 16:50:52 2013 +0530
@@ -1842,6 +1842,72 @@
     RET
 %endmacro
 
+%macro SAD_X3_12x4 0                             ; add 4 rows of SAD(r0 block vs. blocks at r1, r2, r3) into m0, m1, m2; expects mask in m4, clobbers m3/m5, advances r0-r3 by 4 rows
+    mova    m3,  [r0]                            ; row 0 of the r0 block, stride FENC_STRIDE (presumably the encode block); NOTE(review): aligned load - confirm r0 is always 16-byte aligned for 12-wide partitions
+    movu    m5,  [r1]                            ; row 0 of the r1 block (unaligned 16-byte load; the r1/r2/r3 blocks share stride r4)
+    pand    m3,  m4                              ; mask the r0 row once; m3 is reused for all three comparisons of this row
+    pand    m5,  m4                              ; same mask on the other row so masked-out bytes contribute 0 to the SAD
+    psadbw  m5,  m3                              ; m5 = sums of absolute byte differences
+    paddd   m0,  m5                              ; m0 accumulates the r1 result
+    movu    m5,  [r2]
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m1,  m5                              ; m1 accumulates the r2 result
+    movu    m5,  [r3]
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m2,  m5                              ; m2 accumulates the r3 result
+    mova    m3,  [r0 + FENC_STRIDE]              ; row 1
+    movu    m5,  [r1 + r4]
+    pand    m3,  m4
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m0,  m5
+    movu    m5,  [r2 + r4]
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m1,  m5
+    movu    m5,  [r3 + r4]
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m2,  m5
+    mova    m3,  [r0 + FENC_STRIDE * 2]          ; row 2
+    movu    m5,  [r1 + r4 * 2]
+    pand    m3,  m4
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m0,  m5
+    movu    m5,  [r2 + r4 * 2]
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m1,  m5
+    movu    m5,  [r3 + r4 * 2]
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m2,  m5
+    lea     r1, [r1 + r4 * 2]                    ; step r1-r3 down two rows so that row 3 can be addressed as [rN + r4]
+    lea     r2, [r2 + r4 * 2]
+    lea     r3, [r3 + r4 * 2]
+    mova    m3,  [r0 + FENC_STRIDE + FENC_STRIDE * 2] ; row 3 (offset = 3 * FENC_STRIDE; r0 has not been advanced yet)
+    movu    m5,  [r1 + r4]                       ; row 3 relative to the original r1 (already advanced by 2 rows)
+    pand    m3,  m4
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m0,  m5
+    movu    m5,  [r2 + r4]
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m1,  m5
+    movu    m5,  [r3 + r4]
+    pand    m5,  m4
+    psadbw  m5,  m3
+    paddd   m2,  m5
+    lea     r0,  [r0 + FENC_STRIDE * 4]          ; leave all four pointers at the start of the next 4-row group
+    lea     r1,  [r1 + r4 * 2]                   ; second +2 rows, for a net advance of 4 rows
+    lea     r2,  [r2 + r4 * 2]
+    lea     r3,  [r3 + r4 * 2]
+%endmacro
+
 %macro SAD_X3_24x4 0
     mova    m3,  [r0]
     mova    m4,  [r0 + 16]
@@ -2865,6 +2931,20 @@
 %endif
 %endmacro
 
+%macro SAD_X3_W12 0                   ; defines pixel_sad_x3_12x16; instantiated once per INIT_XMM target (ssse3 and avx)
+cglobal pixel_sad_x3_12x16, 5, 7, 8   ; NOTE(review): visible code touches only m0-m5; confirm SAD_X3_END_SSE2 needs the remaining declared XMM registers
+    mova  m4,  [MSK]                  ; m4 = byte mask applied to every row by SAD_X3_12x4 (MSK is defined elsewhere; presumably keeps the low 12 bytes - confirm)
+    pxor  m0,  m0                     ; clear the three SAD accumulators
+    pxor  m1,  m1
+    pxor  m2,  m2
+
+    SAD_X3_12x4                       ; rows 0-3
+    SAD_X3_12x4                       ; rows 4-7
+    SAD_X3_12x4                       ; rows 8-11
+    SAD_X3_12x4                       ; rows 12-15
+    SAD_X3_END_SSE2 1                 ; defined elsewhere; presumably reduces m0-m2 and stores the three results - confirm
+%endmacro
+
 %macro SAD_X3_W24 0
 cglobal pixel_sad_x3_24x32, 5, 7, 8
     pxor  m0, m0
@@ -3096,6 +3176,7 @@
 %endmacro
 
 INIT_XMM ssse3
+SAD_X3_W12
 SAD_X3_W32
 SAD_X3_W24
 SAD_X_SSE2  3, 16, 64, 7
@@ -3118,6 +3199,7 @@
 SAD_X_SSSE3 4,  8,  4
 
 INIT_XMM avx
+SAD_X3_W12
 SAD_X3_W32
 SAD_X3_W24
 SAD_X_SSE2 3, 16, 64, 7


More information about the x265-devel mailing list