[x265] [PATCH] assembly code for pixel_sad_x4_12x16
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Thu Oct 31 12:40:06 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1383219583 -19800
# Thu Oct 31 17:09:43 2013 +0530
# Node ID 56368c1e4df4d043eadc1352d75542f77c405077
# Parent 1c0f0aa845b1c8a520aa91c9fffc68144effd75a
assembly code for pixel_sad_x4_12x16
diff -r 1c0f0aa845b1 -r 56368c1e4df4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Oct 31 16:50:52 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 31 17:09:43 2013 +0530
@@ -297,6 +297,7 @@
p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_ssse3;
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
+ p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ssse3;
p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ssse3;
@@ -342,6 +343,7 @@
ASSGN_SSE(avx);
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
+ p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_avx;
diff -r 1c0f0aa845b1 -r 56368c1e4df4 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Thu Oct 31 16:50:52 2013 +0530
+++ b/source/common/x86/sad-a.asm Thu Oct 31 17:09:43 2013 +0530
@@ -1908,6 +1908,90 @@
lea r3, [r3 + r4 * 2]
%endmacro
+%macro SAD_X4_12x4 0 ; 4 rows of masked SAD: block at r0 against the blocks at r1, r2, r3, r4; sums accumulate in m0, m1, m2, m3 (uses m4/m5 as scratch, m6 as mask)
+ mova m4, [r0] ; row 0 of the r0 block: aligned 16-byte load (r0 rows are FENC_STRIDE bytes apart)
+ movu m5, [r1] ; row 0 of the r1 block: unaligned load (r1-r4 rows are r5 bytes apart)
+ pand m4, m6 ; m6 is never written in this macro, the invoker must preload it; presumably it keeps the low 12 bytes -- confirm against MSK
+ pand m5, m6 ; same mask on both operands, so bytes cleared by the mask contribute 0 to psadbw
+ psadbw m5, m4 ; m5 = sums of absolute byte differences, one per 8-byte half
+ paddd m0, m5 ; m0 += partial sums for the r1 block
+ movu m5, [r2] ; row 0, r2 block (m4 still holds the masked r0 row)
+ pand m5, m6
+ psadbw m5, m4
+ paddd m1, m5 ; m1 += partial sums for the r2 block
+ movu m5, [r3] ; row 0, r3 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m2, m5 ; m2 += partial sums for the r3 block
+ movu m5, [r4] ; row 0, r4 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m3, m5 ; m3 += partial sums for the r4 block
+ mova m4, [r0 + FENC_STRIDE] ; row 1 of the r0 block
+ movu m5, [r1 + r5] ; row 1, r1 block
+ pand m4, m6
+ pand m5, m6
+ psadbw m5, m4
+ paddd m0, m5
+ movu m5, [r2 + r5] ; row 1, r2 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m1, m5
+ movu m5, [r3 + r5] ; row 1, r3 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m2, m5
+ movu m5, [r4 + r5] ; row 1, r4 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m3, m5
+ mova m4, [r0 + FENC_STRIDE * 2] ; row 2 of the r0 block
+ movu m5, [r1 + r5 * 2] ; row 2, r1 block
+ pand m4, m6
+ pand m5, m6
+ psadbw m5, m4
+ paddd m0, m5
+ movu m5, [r2 + r5 * 2] ; row 2, r2 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m1, m5
+ movu m5, [r3 + r5 * 2] ; row 2, r3 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m2, m5
+ movu m5, [r4 + r5 * 2] ; row 2, r4 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m3, m5
+ lea r1, [r1 + r5 * 2] ; step r1-r4 down two rows so row 3 is reachable as [rN + r5] (no *3 scale in x86 addressing)
+ lea r2, [r2 + r5 * 2]
+ lea r3, [r3 + r5 * 2]
+ lea r4, [r4 + r5 * 2]
+ mova m4, [r0 + FENC_STRIDE + FENC_STRIDE * 2] ; row 3 of the r0 block (offset 3 * FENC_STRIDE; r0 has not moved yet)
+ movu m5, [r1 + r5] ; row 3, r1 block (r1 already points at row 2)
+ pand m4, m6
+ pand m5, m6
+ psadbw m5, m4
+ paddd m0, m5
+ movu m5, [r2 + r5] ; row 3, r2 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m1, m5
+ movu m5, [r3 + r5] ; row 3, r3 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m2, m5
+ movu m5, [r4 + r5] ; row 3, r4 block
+ pand m5, m6
+ psadbw m5, m4
+ paddd m3, m5
+ lea r0, [r0 + FENC_STRIDE * 4] ; leave every pointer 4 rows below its entry value, ready for the next invocation
+ lea r1, [r1 + r5 * 2] ; second two-row step: r1-r4 are now 4 rows past entry
+ lea r2, [r2 + r5 * 2]
+ lea r3, [r3 + r5 * 2]
+ lea r4, [r4 + r5 * 2]
+%endmacro
+
%macro SAD_X3_24x4 0
mova m3, [r0]
mova m4, [r0 + 16]
@@ -2945,6 +3029,21 @@
SAD_X3_END_SSE2 1
%endmacro
+%macro SAD_X4_W12 0 ; defines the pixel_sad_x4_12x16 entry point: 16 rows handled as four SAD_X4_12x4 steps
+cglobal pixel_sad_x4_12x16, 6, 8, 8 ; x86inc cglobal: 6 arguments in registers, 8 GPRs, 8 XMM registers (only r0-r5 and m0-m6 are used in this macro and SAD_X4_12x4)
+ mova m6, [MSK] ; byte mask consumed by SAD_X4_12x4; MSK is defined outside this hunk (presumably selects the 12 valid bytes -- confirm)
+ pxor m0, m0 ; clear the four SAD accumulators
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+
+ SAD_X4_12x4 ; rows 0-3 (each invocation also advances r0-r4 by four rows)
+ SAD_X4_12x4 ; rows 4-7
+ SAD_X4_12x4 ; rows 8-11
+ SAD_X4_12x4 ; rows 12-15
+ SAD_X4_END_SSE2 1 ; defined elsewhere in the file; presumably reduces m0-m3 and stores the four results -- confirm
+%endmacro
+
%macro SAD_X3_W24 0
cglobal pixel_sad_x3_24x32, 5, 7, 8
pxor m0, m0
@@ -3186,6 +3285,7 @@
SAD_X_SSE2 3, 16, 8, 7
SAD_X_SSE2 3, 8, 32, 7
SAD_X_SSE2 3, 8, 16, 7
+SAD_X4_W12
SAD_X4_W24
SAD_X4_W32
SAD_X_SSE2 4, 16, 64, 7
@@ -3208,6 +3308,7 @@
SAD_X_SSE2 3, 16, 12, 6
SAD_X_SSE2 3, 16, 8, 6
SAD_X_SSE2 3, 16, 4, 6
+SAD_X4_W12
SAD_X4_W24
SAD_X4_W32
SAD_X_SSE2 4, 16, 64, 7
More information about the x265-devel mailing list