[x265] [PATCH] assembly code for pixel_sad_x4_12x16

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Thu Oct 31 12:40:06 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1383219583 -19800
#      Thu Oct 31 17:09:43 2013 +0530
# Node ID 56368c1e4df4d043eadc1352d75542f77c405077
# Parent  1c0f0aa845b1c8a520aa91c9fffc68144effd75a
assembly code for pixel_sad_x4_12x16

diff -r 1c0f0aa845b1 -r 56368c1e4df4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Oct 31 16:50:52 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 31 17:09:43 2013 +0530
@@ -297,6 +297,7 @@
         p.sad_x4[LUMA_8x32]  = x265_pixel_sad_x4_8x32_ssse3;
 
         p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
+        p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
         p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ssse3;
         p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
         p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ssse3;
@@ -342,6 +343,7 @@
         ASSGN_SSE(avx);
 
         p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
+        p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
         p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
         p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
         p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_avx;
diff -r 1c0f0aa845b1 -r 56368c1e4df4 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Thu Oct 31 16:50:52 2013 +0530
+++ b/source/common/x86/sad-a.asm	Thu Oct 31 17:09:43 2013 +0530
@@ -1908,6 +1908,90 @@
     lea     r3,  [r3 + r4 * 2]
 %endmacro
 
+%macro SAD_X4_12x4 0 ; 4 rows: accumulate masked SAD of [r0] vs [r1],[r2],[r3],[r4] into m0,m1,m2,m3; advances r0-r4
+    mova    m4,  [r0]                        ; row 0 of the r0 block (mova: presumably needs 16-byte alignment - confirm)
+    movu    m5,  [r1]
+    pand    m4,  m6                          ; m6 = byte mask, must be set before expansion (never written here); m4 masked once, reused 4x
+    pand    m5,  m6                          ; mask the other operand too, so bytes cleared by the mask contribute 0
+    psadbw  m5,  m4                          ; one partial sum per 64-bit half
+    paddd   m0,  m5                          ; m0 += SAD vs [r1]
+    movu    m5,  [r2]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m1,  m5                          ; m1 += SAD vs [r2]
+    movu    m5,  [r3]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m2,  m5                          ; m2 += SAD vs [r3]
+    movu    m5,  [r4]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m3,  m5                          ; m3 += SAD vs [r4]
+    mova    m4,  [r0 + FENC_STRIDE]          ; row 1: r0 rows are FENC_STRIDE apart, r1-r4 rows are r5 apart
+    movu    m5,  [r1 + r5]
+    pand    m4,  m6
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m0,  m5
+    movu    m5,  [r2 + r5]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m1,  m5
+    movu    m5,  [r3 + r5]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m2,  m5
+    movu    m5,  [r4 + r5]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m3,  m5
+    mova    m4,  [r0 + FENC_STRIDE * 2]      ; row 2
+    movu    m5,  [r1 + r5 * 2]
+    pand    m4,  m6
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m0,  m5
+    movu    m5,  [r2 + r5 * 2]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m1,  m5
+    movu    m5,  [r3 + r5 * 2]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m2,  m5
+    movu    m5,  [r4 + r5 * 2]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m3,  m5
+    lea     r1, [r1 + r5 * 2]                ; step r1-r4 down two rows: there is no *3 index scale, so row 3 is read as [rN + r5]
+    lea     r2, [r2 + r5 * 2]
+    lea     r3, [r3 + r5 * 2]
+    lea     r4, [r4 + r5 * 2]
+    mova    m4,  [r0 + FENC_STRIDE + FENC_STRIDE * 2] ; row 3 (r0 itself has not moved yet)
+    movu    m5,  [r1 + r5]                   ; = original r1 + 3 * r5
+    pand    m4,  m6
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m0,  m5
+    movu    m5,  [r2 + r5]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m1,  m5
+    movu    m5,  [r3 + r5]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m2,  m5
+    movu    m5,  [r4 + r5]
+    pand    m5,  m6
+    psadbw  m5,  m4
+    paddd   m3,  m5
+    lea     r0,  [r0 + FENC_STRIDE * 4]      ; leave every pointer 4 rows further on, ready for the next expansion
+    lea     r1,  [r1 + r5 * 2]               ; 2 rows here + 2 rows above = 4 rows total
+    lea     r2,  [r2 + r5 * 2]
+    lea     r3,  [r3 + r5 * 2]
+    lea     r4,  [r4 + r5 * 2]
+%endmacro
+
 %macro SAD_X3_24x4 0
     mova    m3,  [r0]
     mova    m4,  [r0 + 16]
@@ -2945,6 +3029,21 @@
     SAD_X3_END_SSE2 1
 %endmacro
 
+%macro SAD_X4_W12 0 ; defines the pixel_sad_x4_12x16 entry point; this macro is expanded twice further down the file
+cglobal pixel_sad_x4_12x16, 6, 8, 8 ; x86inc cglobal operands: name, #args, #regs, #xmm regs
+    mova  m6,  [MSK]        ; byte mask consumed by SAD_X4_12x4; MSK is defined outside this hunk (presumably keeps the low 12 bytes - confirm)
+    pxor  m0,  m0           ; clear the four SAD accumulators
+    pxor  m1,  m1
+    pxor  m2,  m2
+    pxor  m3,  m3
+
+    SAD_X4_12x4             ; rows  0-3  (each expansion handles 4 rows and advances r0-r4)
+    SAD_X4_12x4             ; rows  4-7
+    SAD_X4_12x4             ; rows  8-11
+    SAD_X4_12x4             ; rows 12-15
+    SAD_X4_END_SSE2 1       ; defined elsewhere in the file; presumably reduces/stores m0-m3 and returns - confirm
+%endmacro
+
 %macro SAD_X3_W24 0
 cglobal pixel_sad_x3_24x32, 5, 7, 8
     pxor  m0, m0
@@ -3186,6 +3285,7 @@
 SAD_X_SSE2  3, 16,  8, 7
 SAD_X_SSE2  3,  8, 32, 7
 SAD_X_SSE2  3,  8, 16, 7
+SAD_X4_W12
 SAD_X4_W24
 SAD_X4_W32
 SAD_X_SSE2  4, 16, 64, 7
@@ -3208,6 +3308,7 @@
 SAD_X_SSE2 3, 16, 12, 6
 SAD_X_SSE2 3, 16,  8, 6
 SAD_X_SSE2 3, 16,  4, 6
+SAD_X4_W12
 SAD_X4_W24
 SAD_X4_W32
 SAD_X_SSE2 4, 16, 64, 7


More information about the x265-devel mailing list