[x265] [PATCH] asm: 16bpp support for sad_x3 - all block sizes
yuvaraj at multicorewareinc.com
Thu Dec 5 12:26:29 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386242735 -19800
# Thu Dec 05 16:55:35 2013 +0530
# Node ID 9de0a1e5cf6a710aac8f28be46cca49a9acce85f
# Parent 79d649d551f04121950a7de1bf5f48e52f625f6a
asm: 16bpp support for sad_x3 - all block sizes
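For context: sad_x3 computes the SAD of one source block against
three candidate reference blocks in a single call and returns the
three costs through res[], letting the motion search amortize loads
of the source block. A minimal scalar sketch of those semantics at
16bpp follows (illustrative only; the signature is simplified
relative to x265's, where the encoder-side block uses a fixed
stride):

    #include <cstdint>
    #include <cstdlib>

    /* Scalar reference for the 16bpp sad_x3 primitive (a sketch, not
     * the shipped code). Sums are kept in 32 bits: at 10-bit depth
     * each |a - b| can be as large as 1023, so a 16-bit accumulator
     * can overflow on large blocks, which is what the pmaddwd/paddd
     * changes in sad16-a.asm below guard against. */
    static void sad_x3_ref(const uint16_t *fenc, const uint16_t *fref0,
                           const uint16_t *fref1, const uint16_t *fref2,
                           intptr_t frefStride, int lx, int ly,
                           int32_t res[3])
    {
        res[0] = res[1] = res[2] = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < lx; x++)
            {
                res[0] += std::abs((int)fenc[x] - (int)fref0[x]);
                res[1] += std::abs((int)fenc[x] - (int)fref1[x]);
                res[2] += std::abs((int)fenc[x] - (int)fref2[x]);
            }
            fenc  += lx;        /* simplified: source stride == width */
            fref0 += frefStride;
            fref1 += frefStride;
            fref2 += frefStride;
        }
    }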
diff -r 79d649d551f0 -r 9de0a1e5cf6a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 04 20:43:39 2013 +0550
+++ b/source/common/x86/asm-primitives.cpp Thu Dec 05 16:55:35 2013 +0530
@@ -73,6 +73,24 @@
p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu
+#define SAD_X3(cpu) \
+ p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
+ p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ ## cpu; \
+ p.sad_x3[LUMA_16x16] = x265_pixel_sad_x3_16x16_ ## cpu; \
+ p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ ## cpu; \
+ p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ ## cpu; \
+ p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ ## cpu; \
+ p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ ## cpu; \
+ p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ ## cpu; \
+ p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ ## cpu; \
+ p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ ## cpu; \
+ p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ ## cpu; \
+ p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ ## cpu; \
+ p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ ## cpu; \
+ p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ ## cpu; \
+ p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \
+ p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu
+
#define ASSGN_SSE(cpu) \
p.sse_pp[LUMA_8x8] = x265_pixel_ssd_8x8_ ## cpu; \
p.sse_pp[LUMA_8x4] = x265_pixel_ssd_8x4_ ## cpu; \
@@ -564,6 +582,17 @@
p.sad[LUMA_24x32] = x265_pixel_sad_24x32_sse2;
p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
+ SAD_X3(sse2);
+ p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2;
+ p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2;
+ p.sad_x3[LUMA_4x16] = x265_pixel_sad_x3_4x16_mmx2;
+ p.sad_x3[LUMA_8x4] = x265_pixel_sad_x3_8x4_sse2;
+ p.sad_x3[LUMA_8x8] = x265_pixel_sad_x3_8x8_sse2;
+ p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_sse2;
+ p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_sse2;
+ p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2;
+ p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2;
+
p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
p.cvt16to32_shl = x265_cvt16to32_shl_sse2;
}
@@ -737,6 +766,7 @@
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
+ SAD_X3(ssse3);
p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
@@ -746,30 +776,16 @@
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
- p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ssse3;
p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
- p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ssse3;
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
- p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ssse3;
p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
- p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ssse3;
p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ssse3;
- p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ssse3;
- p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ssse3;
- p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ssse3;
- p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ssse3;
- p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ssse3;
p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ssse3;
p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ssse3;
p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ssse3;
p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ssse3;
p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ssse3;
- p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ssse3;
p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ssse3;
- p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ssse3;
- p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ssse3;
- p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ssse3;
- p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ssse3;
p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ssse3;
p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ssse3;
p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ssse3;
@@ -846,36 +862,22 @@
ASSGN_SSE(avx);
HEVC_SATD(avx);
ASSGN_SSE_SS(avx);
-
+ SAD_X3(avx);
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
- p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_avx;
p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx;
- p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_avx;
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
- p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_avx;
p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_avx;
- p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_avx;
p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_avx;
- p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_avx;
- p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_avx;
- p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_avx;
- p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_avx;
- p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_avx;
p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_avx;
p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_avx;
p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_avx;
p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_avx;
p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_avx;
- p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_avx;
p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_avx;
- p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_avx;
- p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_avx;
- p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_avx;
- p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_avx;
p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_avx;
p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_avx;
p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_avx;
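The asm-primitives.cpp half of the change is mechanical: the new
SAD_X3(cpu) macro registers every block size for a given ISA level in
one statement, and the per-size sad_x3 assignments it supersedes are
dropped from the ssse3 and avx blocks. A cut-down illustration of the
'##' token-pasting pattern it relies on (hypothetical names, not the
real primitives table):

    #include <cstdio>

    /* Two hypothetical flavors of one primitive. */
    static void sad_x3_16x8_c(void)    { std::puts("c");    }
    static void sad_x3_16x8_sse2(void) { std::puts("sse2"); }

    typedef void (*sad_x3_t)(void);
    static sad_x3_t sad_x3_16x8;

    /* '##' pastes the cpu suffix onto the function name, exactly as
     * SAD_X3(cpu) does for all sixteen block sizes above. */
    #define SETUP_SAD_X3(cpu) (sad_x3_16x8 = sad_x3_16x8_ ## cpu)

    int main()
    {
        SETUP_SAD_X3(c);    /* baseline */
        SETUP_SAD_X3(sse2); /* a higher ISA level overwrites the slot */
        sad_x3_16x8();      /* prints "sse2" */
        return 0;
    }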
diff -r 79d649d551f0 -r 9de0a1e5cf6a source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Wed Dec 04 20:43:39 2013 +0550
+++ b/source/common/x86/sad16-a.asm Thu Dec 05 16:55:35 2013 +0530
@@ -470,6 +470,9 @@
psubw m2, m3
ABSW2 m0, m1, m0, m1, m4, m5
ABSW m2, m2, m6
+ pmaddwd m0, [pw_1]
+ pmaddwd m1, [pw_1]
+ pmaddwd m2, [pw_1]
%endmacro
%macro SAD_X3_ONE 2
@@ -482,9 +485,12 @@
psubw m5, m6
ABSW2 m3, m4, m3, m4, m7, m6
ABSW m5, m5, m6
- paddw m0, m3
- paddw m1, m4
- paddw m2, m5
+ pmaddwd m3, [pw_1]
+ pmaddwd m4, [pw_1]
+ pmaddwd m5, [pw_1]
+ paddd m0, m3
+ paddd m1, m4
+ paddd m2, m5
%endmacro
%macro SAD_X3_END 2
@@ -493,9 +499,9 @@
HADDUW m1, m4
HADDUW m2, m5
%else
- HADDW m0, m3
- HADDW m1, m4
- HADDW m2, m5
+ HADDD m0, m3
+ HADDD m1, m4
+ HADDD m2, m5
%endif
%if UNIX64
movd [r5+0], xm0
@@ -719,9 +725,11 @@
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16, 8
+SAD_X 3, 12, 16
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
+SAD_X 3, 4, 16
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 16, 16
@@ -751,8 +759,24 @@
SAD_X 4, 8, 4
INIT_XMM sse2
%define XMM_REGS 8
+SAD_X 3, 64, 64
+SAD_X 3, 64, 48
+SAD_X 3, 64, 32
+SAD_X 3, 64, 16
+SAD_X 3, 48, 64
+SAD_X 3, 32, 64
+SAD_X 3, 32, 32
+SAD_X 3, 32, 24
+SAD_X 3, 32, 16
+SAD_X 3, 32, 8
+SAD_X 3, 24, 32
+SAD_X 3, 16, 64
+SAD_X 3, 16, 32
SAD_X 3, 16, 16
+SAD_X 3, 16, 12
SAD_X 3, 16, 8
+SAD_X 3, 16, 4
+SAD_X 3, 8, 32
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
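The substantive change in sad16-a.asm is the accumulator widening.
The existing macros summed absolute differences with paddw and
reduced with HADDW, but with 16-bit pixels those word accumulators
can overflow on the larger block sizes, so each ABSW/ABSW2 result is
now folded through pmaddwd against pw_1 (a widening horizontal
pairwise add of words into dwords) and accumulated with paddd, with
the final reduction switched to HADDD. The same idiom expressed with
SSE2 intrinsics, for illustration only:

    #include <emmintrin.h> /* SSE2 */

    /* Fold eight 16-bit absolute differences into four 32-bit lanes
     * and add them to a dword accumulator. pmaddwd against a vector
     * of ones computes a[2i]*1 + a[2i+1]*1 per dword lane, i.e. a
     * widening horizontal pairwise add -- the trick the patch applies
     * after ABSW/ABSW2 in place of the old paddw accumulation. */
    static inline __m128i sad_accum_widen(__m128i acc, __m128i absdiff)
    {
        const __m128i pw_1 = _mm_set1_epi16(1);
        return _mm_add_epi32(acc, _mm_madd_epi16(absdiff, pw_1));
    }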