[x265] [PATCH] assembly code for pixel_sad_x3_12x16
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Thu Oct 31 12:21:11 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1383218452 -19800
# Thu Oct 31 16:50:52 2013 +0530
# Node ID 1c0f0aa845b1c8a520aa91c9fffc68144effd75a
# Parent f6e35bfe1fd67668cc3c18bc41260a3f1d71dffc
assembly code for pixel_sad_x3_12x16
diff -r f6e35bfe1fd6 -r 1c0f0aa845b1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Oct 31 12:58:25 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 31 16:50:52 2013 +0530
@@ -296,6 +296,7 @@
p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_ssse3;
p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_ssse3;
+ p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ssse3;
p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ssse3;
@@ -340,6 +341,7 @@
SA8D_INTER_FROM_BLOCK(avx);
ASSGN_SSE(avx);
+ p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_avx;
diff -r f6e35bfe1fd6 -r 1c0f0aa845b1 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Thu Oct 31 12:58:25 2013 +0530
+++ b/source/common/x86/sad-a.asm Thu Oct 31 16:50:52 2013 +0530
@@ -1842,6 +1842,72 @@
RET
%endmacro
+%macro SAD_X3_12x4 0 ; 4 rows of a 3-way SAD: block at r0 vs. blocks at r1/r2/r3, summed into m0/m1/m2; clobbers m3, m5; advances r0-r3 by 4 rows
+ mova m3, [r0] ; row 0 of the r0 block (aligned load)
+ movu m5, [r1] ; row 0 of the r1 block (unaligned load)
+ pand m3, m4 ; apply byte mask m4 (loaded by the caller; presumably keeps only the 12 valid columns - confirm against MSK)
+ pand m5, m4 ; same mask, so discarded bytes contribute 0 to the SAD
+ psadbw m5, m3 ; m5 = sums of absolute byte differences, r1 row vs. r0 row
+ paddd m0, m5 ; m0 accumulates the r1 result
+ movu m5, [r2] ; row 0 of the r2 block
+ pand m5, m4
+ psadbw m5, m3 ; m3 (masked r0 row) is reused for all three comparisons
+ paddd m1, m5 ; m1 accumulates the r2 result
+ movu m5, [r3] ; row 0 of the r3 block
+ pand m5, m4
+ psadbw m5, m3
+ paddd m2, m5 ; m2 accumulates the r3 result
+ mova m3, [r0 + FENC_STRIDE] ; row 1: r0 block is stepped by the constant FENC_STRIDE
+ movu m5, [r1 + r4] ; row 1: r1/r2/r3 blocks are stepped by the stride held in r4
+ pand m3, m4
+ pand m5, m4
+ psadbw m5, m3
+ paddd m0, m5
+ movu m5, [r2 + r4]
+ pand m5, m4
+ psadbw m5, m3
+ paddd m1, m5
+ movu m5, [r3 + r4]
+ pand m5, m4
+ psadbw m5, m3
+ paddd m2, m5
+ mova m3, [r0 + FENC_STRIDE * 2] ; row 2
+ movu m5, [r1 + r4 * 2]
+ pand m3, m4
+ pand m5, m4
+ psadbw m5, m3
+ paddd m0, m5
+ movu m5, [r2 + r4 * 2]
+ pand m5, m4
+ psadbw m5, m3
+ paddd m1, m5
+ movu m5, [r3 + r4 * 2]
+ pand m5, m4
+ psadbw m5, m3
+ paddd m2, m5
+ lea r1, [r1 + r4 * 2] ; step r1-r3 down two rows: r4 * 3 is not an encodable scale, so row 3 is reached as [rN + r4] below
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r4 * 2]
+ mova m3, [r0 + FENC_STRIDE + FENC_STRIDE * 2] ; row 3 of the r0 block (offset = 3 * FENC_STRIDE, r0 itself not yet moved)
+ movu m5, [r1 + r4] ; row 3: r1 was already advanced by 2 * r4 above
+ pand m3, m4
+ pand m5, m4
+ psadbw m5, m3
+ paddd m0, m5
+ movu m5, [r2 + r4]
+ pand m5, m4
+ psadbw m5, m3
+ paddd m1, m5
+ movu m5, [r3 + r4]
+ pand m5, m4
+ psadbw m5, m3
+ paddd m2, m5
+ lea r0, [r0 + FENC_STRIDE * 4] ; leave all four pointers at row 4 so the macro can be invoked back to back
+ lea r1, [r1 + r4 * 2] ; second 2 * r4 step: r1-r3 have now moved 4 rows in total
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r4 * 2]
+%endmacro
+
%macro SAD_X3_24x4 0
mova m3, [r0]
mova m4, [r0 + 16]
@@ -2865,6 +2931,20 @@
%endif
%endmacro
+%macro SAD_X3_W12 0 ; emits pixel_sad_x3_12x16; instantiated once after INIT_XMM ssse3 and once after INIT_XMM avx
+cglobal pixel_sad_x3_12x16, 5, 7, 8 ; x86inc cglobal: 5 args, 7 GPRs, 8 vector regs (NOTE(review): only r0-r4 and m0-m5 are used in the code visible here; the rest are presumably for the epilogue - confirm)
+ mova m4, [MSK] ; byte mask that SAD_X3_12x4 ANDs onto every row (MSK is defined outside this hunk)
+ pxor m0, m0 ; zero the three SAD accumulators (one per r1/r2/r3 block)
+ pxor m1, m1
+ pxor m2, m2
+
+ SAD_X3_12x4 ; rows 0-3 (each invocation also advances r0-r3 by 4 rows)
+ SAD_X3_12x4 ; rows 4-7
+ SAD_X3_12x4 ; rows 8-11
+ SAD_X3_12x4 ; rows 12-15
+ SAD_X3_END_SSE2 1 ; epilogue macro defined elsewhere in sad-a.asm; presumably reduces m0-m2 and stores the three results - confirm
+%endmacro
+
%macro SAD_X3_W24 0
cglobal pixel_sad_x3_24x32, 5, 7, 8
pxor m0, m0
@@ -3096,6 +3176,7 @@
%endmacro
INIT_XMM ssse3
+SAD_X3_W12
SAD_X3_W32
SAD_X3_W24
SAD_X_SSE2 3, 16, 64, 7
@@ -3118,6 +3199,7 @@
SAD_X_SSSE3 4, 8, 4
INIT_XMM avx
+SAD_X3_W12
SAD_X3_W32
SAD_X3_W24
SAD_X_SSE2 3, 16, 64, 7
More information about the x265-devel
mailing list