[x265] [PATCH] asm: 10bpp code of sse_ss for 12x16, 24x32, 48x64 and 64xN blocks
murugan at multicorewareinc.com
Mon Dec 2 14:28:01 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385990870 -19800
# Mon Dec 02 18:57:50 2013 +0530
# Node ID b2f3d3a91ad1ad1ca0a468e727b647bc0bbd9c46
# Parent 507f68d447760113bad13eaaacd91ce08f1a5cd2
asm: 10bpp code of sse_ss for 12x16, 24x32, 48x64 and 64xN blocks
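
For reviewers unfamiliar with the primitive: sse_ss is the sum of squared differences between two int16_t blocks. Below is a minimal scalar sketch of what the assembly computes (my illustration, not x265 source; the explicit width/height parameters stand in for the per-block-size specialization done through the LUMA_* table, and the strides are in elements before FIX_STRIDES scales them to byte offsets for the 2-byte samples):

#include <stdint.h>

/* Illustrative scalar reference for sse_ss (not x265 source). The real
 * primitives bake the block size into the function name, e.g.
 * x265_pixel_ssd_ss_48x64_sse2. Strides are in int16_t elements. */
static int ssd_ss_ref(const int16_t *a, intptr_t stride_a,
                      const int16_t *b, intptr_t stride_b,
                      int width, int height)
{
    int sum = 0; /* 32-bit accumulation, mirroring the asm's pmaddwd/paddd chain */
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int d = a[x] - b[x];
            sum += d * d;
        }
        a += stride_a;
        b += stride_b;
    }
    return sum;
}
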
diff -r 507f68d44776 -r b2f3d3a91ad1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Dec 02 16:10:17 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Dec 02 18:57:50 2013 +0530
@@ -506,17 +506,24 @@
p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_sse2;
p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_sse2;
p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_sse2;
+ p.sse_ss[LUMA_12x16] = x265_pixel_ssd_ss_12x16_sse2;
p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_sse2;
p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_sse2;
p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_sse2;
p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_sse2;
p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_sse2;
p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_sse2;
+ p.sse_ss[LUMA_24x32] = x265_pixel_ssd_ss_24x32_sse2;
p.sse_ss[LUMA_32x8] = x265_pixel_ssd_ss_32x8_sse2;
p.sse_ss[LUMA_32x16] = x265_pixel_ssd_ss_32x16_sse2;
p.sse_ss[LUMA_32x24] = x265_pixel_ssd_ss_32x24_sse2;
p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
+ p.sse_ss[LUMA_48x64] = x265_pixel_ssd_ss_48x64_sse2;
+ p.sse_ss[LUMA_64x16] = x265_pixel_ssd_ss_64x16_sse2;
+ p.sse_ss[LUMA_64x32] = x265_pixel_ssd_ss_64x32_sse2;
+ p.sse_ss[LUMA_64x48] = x265_pixel_ssd_ss_64x48_sse2;
+ p.sse_ss[LUMA_64x64] = x265_pixel_ssd_ss_64x64_sse2;
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;

PIXEL_AVG_W4(sse2);
diff -r 507f68d44776 -r b2f3d3a91ad1 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Dec 02 16:10:17 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Dec 02 18:57:50 2013 +0530
@@ -174,7 +174,179 @@
%endif
RET
%endmacro
-
+%macro SSD_TWO 2
+cglobal pixel_ssd_ss_%1x%2, 4,7,6
+ FIX_STRIDES r1, r3
+ pxor m0, m0
+ mov r4d, %2/2
+ lea r5, [r1 * 2]
+ lea r6, [r3 * 2]
+.loop:
+ mova m1, [r0]
+ mova m2, [r0 + 16]
+ mova m3, [r0 + 32]
+ mova m4, [r0 + 48]
+ psubw m1, [r2]
+ psubw m2, [r2 + 16]
+ psubw m3, [r2 + 32]
+ psubw m4, [r2 + 48]
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ mova m1, [r0 + 64]
+ mova m2, [r0 + 80]
+ psubw m1, [r2 + 64]
+ psubw m2, [r2 + 80]
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ paddd m1, m2
+ paddd m0, m1
+%if %1 == 64
+ mova m3, [r0 + 96]
+ mova m4, [r0 + 112]
+ psubw m3, [r2 + 96]
+ psubw m4, [r2 + 112]
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m3, m4
+ paddd m0, m3
+%endif
+ mova m1, [r0 + r1]
+ mova m2, [r0 + r1 + 16]
+ mova m3, [r0 + r1 + 32]
+ mova m4, [r0 + r1 + 48]
+ psubw m1, [r2 + r3]
+ psubw m2, [r2 + r3 + 16]
+ psubw m3, [r2 + r3 + 32]
+ psubw m4, [r2 + r3 + 48]
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ mova m1, [r0 + r1 + 64]
+ mova m2, [r0 + r1 + 80]
+ psubw m1, [r2 + r3 + 64]
+ psubw m2, [r2 + r3 + 80]
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ paddd m1, m2
+ paddd m0, m1
+%if %1 == 64
+ mova m3, [r0 + r1 + 96]
+ mova m4, [r0 + r1 + 112]
+ psubw m3, [r2 + r3 + 96]
+ psubw m4, [r2 + r3 + 112]
+ pmaddwd m3, m3
+ pmaddwd m4, m4
+ paddd m3, m4
+ paddd m0, m3
+%endif
+ lea r0, [r0 + r5]
+ lea r2, [r2 + r6]
+ dec r4d
+ jnz .loop
+ HADDD m0, m5
+ movd eax, xm0
+ RET
+%endmacro
+%macro SSD_24 2
+cglobal pixel_ssd_ss_%1x%2, 4,7,6
+ FIX_STRIDES r1, r3
+ pxor m0, m0
+ mov r4d, %2/2
+ lea r5, [r1 * 2]
+ lea r6, [r3 * 2]
+.loop:
+ mova m1, [r0]
+ mova m2, [r0 + 16]
+ mova m3, [r0 + 32]
+ psubw m1, [r2]
+ psubw m2, [r2 + 16]
+ psubw m3, [r2 + 32]
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m1, m2
+ paddd m0, m1
+ mova m1, [r0 + r1]
+ mova m2, [r0 + r1 + 16]
+ mova m4, [r0 + r1 + 32]
+ psubw m1, [r2 + r3]
+ psubw m2, [r2 + r3 + 16]
+ psubw m4, [r2 + r3 + 32]
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m4, m4
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ lea r0, [r0 + r5]
+ lea r2, [r2 + r6]
+ dec r4d
+ jnz .loop
+ HADDD m0, m5
+ movd eax, xm0
+ RET
+%endmacro
+%macro SSD_12 2
+cglobal pixel_ssd_ss_%1x%2, 4,7,7
+ FIX_STRIDES r1, r3
+ pxor m0, m0
+ mov r4d, %2/4
+ lea r5, [r1 * 2]
+ lea r6, [r3 * 2]
+.loop:
+ mova m1, [r0]
+ movh m2, [r0 + 16]
+ mova m3, [r0 + r1]
+ punpcklqdq m2, [r0 + r1 + 16]
+ psubw m1, [r2]
+ movh m4, [r2 + 16]
+ psubw m3, [r2 + r3]
+ punpcklqdq m4, [r2 + r3 + 16]
+ psubw m2, m4
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m1, m2
+ paddd m0, m1
+
+ mova m1, [r0 + r5]
+ movh m2, [r0 + r5 + 16]
+ lea r0, [r0 + r5]
+ mova m6, [r0 + r1]
+ punpcklqdq m2, [r0 + r1 + 16]
+ psubw m1, [r2 + r6]
+ movh m4, [r2 + r6 + 16]
+ lea r2, [r2 + r6]
+ psubw m6, [r2 + r3]
+ punpcklqdq m4, [r2 + r3 + 16]
+ psubw m2, m4
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m6, m6
+ paddd m1, m2
+ paddd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ lea r0, [r0 + r5]
+ lea r2, [r2 + r6]
+ dec r4d
+ jnz .loop
+ HADDD m0, m5
+ movd eax, xm0
+ RET
+%endmacro
INIT_MMX mmx2
SSD_ONE 4, 4
SSD_ONE 4, 8
@@ -189,17 +361,24 @@
SSD_ONE 8, 8
SSD_ONE 8, 16
SSD_ONE 8, 32
+SSD_12 12, 16
SSD_ONE 16, 4
SSD_ONE 16, 8
SSD_ONE 16, 12
SSD_ONE 16, 16
SSD_ONE 16, 32
SSD_ONE 16, 64
+SSD_24 24, 32
SSD_ONE 32, 8
SSD_ONE 32, 16
SSD_ONE 32, 24
SSD_ONE 32, 32
SSD_ONE 32, 64
+SSD_TWO 48, 64
+SSD_TWO 64, 16
+SSD_TWO 64, 32
+SSD_TWO 64, 48
+SSD_TWO 64, 64
INIT_YMM avx2
SSD_ONE 16, 8
SSD_ONE 16, 16
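
A note on the unrolling, as I read the macros (assuming 16-byte xmm registers and 2 bytes per sample):

  width 48: 48 samples * 2 bytes = 96 bytes per row, i.e. six 16-byte mova loads ([r0] .. [r0 + 80])
  width 64: 128 bytes per row, i.e. eight loads; the %if %1 == 64 branch adds the [r0 + 96] / [r0 + 112] pair
  rows:     SSD_TWO and SSD_24 consume two rows per iteration (r4d = %2/2, with r5/r6 holding the doubled strides), while SSD_12 unrolls four rows per iteration (r4d = %2/4) and packs the 4-sample tails of two 12-wide rows into one register with movh + punpcklqdq before the subtract.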