[x265] [PATCH] asm: 16bpp support for sad_x4 - all block sizes
yuvaraj at multicorewareinc.com
Thu Dec 5 14:02:18 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386248495 -19800
# Thu Dec 05 18:31:35 2013 +0530
# Node ID ca02a77ece624815aaf7b52fffb6f7710e13bc1e
# Parent 9de0a1e5cf6a710aac8f28be46cca49a9acce85f
asm: 16bpp support for sad_x4 - all block sizes
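
At 16bpp the word-wide (paddw/HADDW) accumulation carried over from the 8bpp path does not have enough headroom: with 10/12-bit pixels the per-lane running sums for the larger partitions can exceed 16 bits. The SAD_X4 macros therefore widen each batch of absolute differences to dwords (pmaddwd against pw_1), accumulate with paddd and reduce with HADDD. As a rough scalar sketch of what each registered sad_x4 kernel computes (parameter names and the explicit fenc stride below are illustrative, not the exact primitive signature):

#include <cstdint>
#include <cstdlib>

// Scalar sketch of sad_x4 at 16bpp: one source block is scored against four
// reference candidates and all four SAD costs are returned together. The
// 32-bit accumulator matters: a 16-bit sum can overflow, e.g. a 64x64 block
// of 10-bit pixels can reach 64 * 64 * 1023, roughly 4.2 million.
static void sad_x4_ref(const uint16_t* fenc, intptr_t fencStride,
                       const uint16_t* fref[4], intptr_t frefStride,
                       int width, int height, int32_t res[4])
{
    for (int k = 0; k < 4; k++)
    {
        uint32_t sum = 0;
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++)
                sum += (uint32_t)std::abs((int)fenc[y * fencStride + x] -
                                          (int)fref[k][y * frefStride + x]);
        res[k] = (int32_t)sum;
    }
}
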
diff -r 9de0a1e5cf6a -r ca02a77ece62 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Dec 05 16:55:35 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Dec 05 18:31:35 2013 +0530
@@ -91,6 +91,24 @@
p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \
p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu
+#define SAD_X4(cpu) \
+ p.sad_x4[LUMA_16x8] = x265_pixel_sad_x4_16x8_ ## cpu; \
+ p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ ## cpu; \
+ p.sad_x4[LUMA_16x16] = x265_pixel_sad_x4_16x16_ ## cpu; \
+ p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ ## cpu; \
+ p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ ## cpu; \
+ p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ ## cpu; \
+ p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ ## cpu; \
+ p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ ## cpu; \
+ p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ ## cpu; \
+ p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ ## cpu; \
+ p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ ## cpu; \
+ p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ ## cpu; \
+ p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ ## cpu; \
+ p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ ## cpu; \
+ p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ ## cpu; \
+ p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ ## cpu
+
#define ASSGN_SSE(cpu) \
p.sse_pp[LUMA_8x8] = x265_pixel_ssd_8x8_ ## cpu; \
p.sse_pp[LUMA_8x4] = x265_pixel_ssd_8x4_ ## cpu; \
@@ -593,6 +611,17 @@
p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2;
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2;
+ SAD_X4(sse2);
+ p.sad_x4[LUMA_4x4] = x265_pixel_sad_x4_4x4_mmx2;
+ p.sad_x4[LUMA_4x8] = x265_pixel_sad_x4_4x8_mmx2;
+ p.sad_x4[LUMA_4x16] = x265_pixel_sad_x4_4x16_mmx2;
+ p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_sse2;
+ p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_sse2;
+ p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_sse2;
+ p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_sse2;
+ p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
+ p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
+
p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
p.cvt16to32_shl = x265_cvt16to32_shl_sse2;
}
@@ -767,6 +796,7 @@
p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
SAD_X3(ssse3);
+ SAD_X4(ssse3);
p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
@@ -776,20 +806,6 @@
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
- p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
- p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
- p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
- p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ssse3;
- p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ssse3;
- p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ssse3;
- p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ssse3;
- p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ssse3;
- p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ssse3;
- p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ssse3;
- p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ssse3;
- p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ssse3;
- p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ssse3;
- p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ssse3;
p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
p.luma_p2s = x265_luma_p2s_ssse3;
@@ -863,25 +879,12 @@
HEVC_SATD(avx);
ASSGN_SSE_SS(avx);
SAD_X3(avx);
+ SAD_X4(avx);
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
- p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx;
- p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
- p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_avx;
- p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_avx;
- p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_avx;
- p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_avx;
- p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_avx;
- p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_avx;
- p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_avx;
- p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_avx;
- p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_avx;
- p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_avx;
- p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_avx;
- p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_avx;
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
p.ssim_end_4 = x265_pixel_ssim_end4_avx;
}
diff -r 9de0a1e5cf6a -r ca02a77ece62 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Thu Dec 05 16:55:35 2013 +0530
+++ b/source/common/x86/sad16-a.asm Thu Dec 05 18:31:35 2013 +0530
@@ -536,6 +536,10 @@
psubw m3, m4
ABSW2 m0, m1, m0, m1, m5, m6
ABSW2 m2, m3, m2, m3, m4, m7
+ pmaddwd m0, [pw_1]
+ pmaddwd m1, [pw_1]
+ pmaddwd m2, [pw_1]
+ pmaddwd m3, [pw_1]
%endmacro
%macro SAD_X4_ONE 2
@@ -551,10 +555,14 @@
psubw m8, m4
ABSW2 m5, m6, m5, m6, m9, m10
ABSW2 m7, m8, m7, m8, m9, m10
- paddw m0, m5
- paddw m1, m6
- paddw m2, m7
- paddw m3, m8
+ pmaddwd m5, [pw_1]
+ pmaddwd m6, [pw_1]
+ pmaddwd m7, [pw_1]
+ pmaddwd m8, [pw_1]
+ paddd m0, m5
+ paddd m1, m6
+ paddd m2, m7
+ paddd m3, m8
%elif cpuflag(ssse3)
movu m7, [r3+%2]
psubw m5, m4
@@ -566,24 +574,32 @@
pabsw m6, m6
pabsw m7, m7
pabsw m4, m4
- paddw m0, m5
- paddw m1, m6
- paddw m2, m7
- paddw m3, m4
+ pmaddwd m5, [pw_1]
+ pmaddwd m6, [pw_1]
+ pmaddwd m7, [pw_1]
+ pmaddwd m4, [pw_1]
+ paddd m0, m5
+ paddd m1, m6
+ paddd m2, m7
+ paddd m3, m4
%else ; num_mmregs == 8 && !ssse3
psubw m5, m4
psubw m6, m4
ABSW m5, m5, m7
ABSW m6, m6, m7
- paddw m0, m5
- paddw m1, m6
+ pmaddwd m5, [pw_1]
+ pmaddwd m6, [pw_1]
+ paddd m0, m5
+ paddd m1, m6
movu m5, [r3+%2]
movu m6, [r4+%2]
psubw m5, m4
psubw m6, m4
ABSW2 m5, m6, m5, m6, m7, m4
- paddw m2, m5
- paddw m3, m6
+ pmaddwd m5, [pw_1]
+ pmaddwd m6, [pw_1]
+ paddd m2, m5
+ paddd m3, m6
%endif
%endmacro
@@ -594,10 +610,10 @@
HADDUW m2, m6
HADDUW m3, m7
%else
- HADDW m0, m4
- HADDW m1, m5
- HADDW m2, m6
- HADDW m3, m7
+ HADDD m0, m4
+ HADDD m1, m5
+ HADDD m2, m6
+ HADDD m3, m7
%endif
mov r0, r6mp
movd [r0+ 0], xm0
@@ -734,9 +750,11 @@
SAD_X 3, 4, 4
SAD_X 4, 16, 16
SAD_X 4, 16, 8
+SAD_X 4, 12, 16
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
+SAD_X 4, 4, 16
SAD_X 4, 4, 8
SAD_X 4, 4, 4
INIT_MMX ssse3
@@ -781,8 +799,24 @@
SAD_X 3, 8, 8
SAD_X 3, 8, 4
%define XMM_REGS 11
+SAD_X 4, 64, 64
+SAD_X 4, 64, 48
+SAD_X 4, 64, 32
+SAD_X 4, 64, 16
+SAD_X 4, 48, 64
+SAD_X 4, 32, 64
+SAD_X 4, 32, 32
+SAD_X 4, 32, 24
+SAD_X 4, 32, 16
+SAD_X 4, 32, 8
+SAD_X 4, 24, 32
+SAD_X 4, 16, 64
+SAD_X 4, 16, 32
SAD_X 4, 16, 16
+SAD_X 4, 16, 12
SAD_X 4, 16, 8
+SAD_X 4, 16, 4
+SAD_X 4, 8, 32
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
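
The pmaddwd/pw_1 step used above is simply a pairwise word-to-dword horizontal add, which is what keeps the running paddd accumulators and the final HADDD reduction within 32 bits. A scalar model of that step (a hypothetical helper, not code from the patch):

#include <cstdint>

// Model of "pmaddwd x, [pw_1]; paddd acc, x": adjacent 16-bit absolute
// differences are summed in pairs into 32-bit lanes before accumulation.
static inline void accumulate_widened(const uint16_t absDiff[8], uint32_t acc[4])
{
    for (int i = 0; i < 4; i++)
        acc[i] += (uint32_t)absDiff[2 * i] + absDiff[2 * i + 1];
}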