[x265] [PATCH Review Only] assembly code for pixel_sad_x3_32xN
yuvaraj at multicorewareinc.com
Tue Oct 29 12:15:57 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1383044811 -19800
# Tue Oct 29 16:36:51 2013 +0530
# Node ID fc35a117efd17270eb15aa56aad7cc90bb7bdd35
# Parent e2f512dbd2424d099d9984c72bfc7d0729be25fe
assembly code for pixel_sad_x3_32xN
diff -r e2f512dbd242 -r fc35a117efd1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Oct 28 16:13:05 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Oct 29 16:36:51 2013 +0530
@@ -280,6 +280,11 @@
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ssse3;
p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
+ p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ssse3;
+ p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ssse3;
+ p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ssse3;
+ p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ssse3;
+ p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -310,6 +315,11 @@
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_avx;
p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_avx;
+ p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_avx;
+ p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_avx;
+ p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_avx;
+ p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_avx;
+ p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_avx;
}
if (cpuMask & X265_CPU_XOP)
{
diff -r e2f512dbd242 -r fc35a117efd1 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Oct 28 16:13:05 2013 +0530
+++ b/source/common/x86/pixel.h Tue Oct 29 16:36:51 2013 +0530
@@ -29,6 +29,11 @@
#define X265_I386_PIXEL_H
#define DECL_PIXELS(ret, name, suffix, args) \
+ ret x265_pixel_ ## name ## _32x64_ ## suffix args; \
+ ret x265_pixel_ ## name ## _32x32_ ## suffix args; \
+ ret x265_pixel_ ## name ## _32x24_ ## suffix args; \
+ ret x265_pixel_ ## name ## _32x16_ ## suffix args; \
+ ret x265_pixel_ ## name ## _32x8_ ## suffix args; \
ret x265_pixel_ ## name ## _16x64_ ## suffix args; \
ret x265_pixel_ ## name ## _16x32_ ## suffix args; \
ret x265_pixel_ ## name ## _16x16_ ## suffix args; \
diff -r e2f512dbd242 -r fc35a117efd1 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Mon Oct 28 16:13:05 2013 +0530
+++ b/source/common/x86/sad-a.asm Tue Oct 29 16:36:51 2013 +0530
@@ -1007,19 +1007,30 @@
; SAD x3/x4 XMM
;=============================================================================
-%macro SAD_X3_START_1x16P_SSE2 0
- mova m2, [r0]
+%macro SAD_X3_START_1x16P_SSE2 1
+ mova m3, [r0 + %1]
+%if %1 == 0
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+%endif
%if cpuflag(avx)
- psadbw m0, m2, [r1]
- psadbw m1, m2, [r2]
- psadbw m2, [r3]
+ psadbw m4, m3, [r1 + %1]
+ psadbw m5, m3, [r2 + %1]
+ psadbw m3, [r3 + %1]
+ paddd m0, m4
+ paddd m1, m5
+ paddd m2, m3
%else
- movu m0, [r1]
- movu m1, [r2]
- movu m3, [r3]
- psadbw m0, m2
- psadbw m1, m2
- psadbw m2, m3
+ movu m4, [r1 + %1]
+ movu m5, [r2 + %1]
+ movu m6, [r3 + %1]
+ psadbw m4, m3
+ psadbw m5, m3
+ psadbw m6, m3
+ paddd m0, m4
+ paddd m1, m5
+ paddd m2, m6
%endif
%endmacro
@@ -1051,7 +1062,7 @@
%macro SAD_X3_4x16P_SSE2 2
%if %1==0
lea t0, [r4*3]
- SAD_X3_START_1x16P_SSE2
+ SAD_X3_START_1x16P_SSE2 0
%else
SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
%endif
@@ -1068,6 +1079,30 @@
%endif
%endmacro
+%macro SAD_X3_4x32P_SSE2 2
+%assign y 0
+%rep 2
+%if %1==0
+ lea t0, [r4+r4*2]
+ SAD_X3_START_1x16P_SSE2 y
+%else
+ SAD_X3_1x16P_SSE2 (FENC_STRIDE*(0+(%1&1)*4) + y), (r4*0 + y)
+%endif
+ SAD_X3_1x16P_SSE2 (FENC_STRIDE*(1+(%1&1)*4) + y), (r4*1 + y)
+ SAD_X3_1x16P_SSE2 (FENC_STRIDE*(2+(%1&1)*4) + y), (r4*2 + y)
+ SAD_X3_1x16P_SSE2 (FENC_STRIDE*(3+(%1&1)*4) + y), (t0 + y)
+%assign y y+16
+%endrep
+%if %1 != %2-1
+%if (%1&1) != 0
+ add r0, 8*FENC_STRIDE
+%endif
+ lea r1, [r1+4*r4]
+ lea r2, [r2+4*r4]
+ lea r3, [r3+4*r4]
+%endif
+%endmacro
+
%macro SAD_X3_START_2x8P_SSE2 0
movq m3, [r0]
movq m0, [r1]
@@ -1506,7 +1541,7 @@
SAD_X%1_4x%2P_SSE2 x, %3/4
%assign x x+1
%endrep
-%if %3 == 64
+%if %3 >= 24
SAD_X%1_END_SSE2 1
%else
SAD_X%1_END_SSE2 0
@@ -1544,6 +1579,11 @@
%endmacro
INIT_XMM ssse3
+SAD_X_SSE2 3, 32, 64, 7
+SAD_X_SSE2 3, 32, 32, 7
+SAD_X_SSE2 3, 32, 24, 7
+SAD_X_SSE2 3, 32, 16, 7
+SAD_X_SSE2 3, 32, 8, 7
SAD_X_SSE2 3, 16, 64, 7
SAD_X_SSE2 3, 16, 32, 7
SAD_X_SSE2 3, 16, 16, 7
@@ -1562,6 +1602,11 @@
SAD_X_SSSE3 4, 8, 4
INIT_XMM avx
+SAD_X_SSE2 3, 32, 64, 7
+SAD_X_SSE2 3, 32, 32, 7
+SAD_X_SSE2 3, 32, 24, 7
+SAD_X_SSE2 3, 32, 16, 7
+SAD_X_SSE2 3, 32, 8, 7
SAD_X_SSE2 3, 16, 64, 7
SAD_X_SSE2 3, 16, 32, 6
SAD_X_SSE2 3, 16, 16, 6
More information about the x265-devel
mailing list