[x265] [PATCH] asm: assembly code for satd_16x32, satd_16x64, satd_8x32
Yuvaraj Venkatesh
yuvaraj at multicorewareinc.com
Tue Nov 12 12:05:27 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1384254277 -19800
# Tue Nov 12 16:34:37 2013 +0530
# Node ID 76120c6aa908b35734f8e09541090e1834584070
# Parent 428b58f09945a4eacdaca057a68ecb570342f5c6
asm: assembly code for satd_16x32, satd_16x64, satd_8x32
diff -r 428b58f09945 -r 76120c6aa908 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Nov 12 13:16:19 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Nov 12 16:34:37 2013 +0530
@@ -58,16 +58,7 @@
#define INIT7(name, cpu) INIT7_NAME(name, name, cpu)
#define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
-#if X86_64
-#define HEVC_X64_SATD(cpu)
-#else
-#define HEVC_X64_SATD(cpu) \
- p.satd[LUMA_8x32] = cmp<8, 32, 8, 16, x265_pixel_satd_8x16_ ## cpu>; \
- p.satd[LUMA_16x32] = cmp<16, 32, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
- p.satd[LUMA_16x64] = cmp<16, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>;
-#endif
#define HEVC_SATD(cpu) \
- HEVC_X64_SATD(cpu) \
p.satd[LUMA_32x32] = cmp<32, 32, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
p.satd[LUMA_24x32] = cmp<24, 32, 8, 16, x265_pixel_satd_8x16_ ## cpu>; \
p.satd[LUMA_64x64] = cmp<64, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
@@ -327,9 +318,12 @@
INIT8(sad_x4, _mmx2);
INIT8(satd, _mmx2);
HEVC_SATD(mmx2);
+ p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
p.satd[LUMA_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_mmx2>;
p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
+ p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
+ p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse2;
p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
@@ -407,11 +401,6 @@
p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
-#if X86_64
- p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
- p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
- p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
-#endif
p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
p.sa8d[BLOCK_8x8] = x265_pixel_sa8d_8x8_sse2;
diff -r 428b58f09945 -r 76120c6aa908 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Nov 12 13:16:19 2013 +0530
+++ b/source/common/x86/pixel-a.asm Tue Nov 12 16:34:37 2013 +0530
@@ -1654,17 +1654,6 @@
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
%if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)
-cglobal pixel_satd_8x32, 4,6,8
- SATD_START_SSE2 m6, m7
-%if vertical
- mova m7, [pw_00ff]
-%endif
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
-
cglobal pixel_satd_16x4_internal
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
@@ -1964,7 +1953,56 @@
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
SATD_END_SSE2 m6, m7
-%endif
+
+cglobal pixel_satd_16x32, 4,6,8
+ SATD_START_SSE2 m6, m7, 1
+ BACKUP_POINTERS
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ RESTORE_AND_INC_POINTERS
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
+
+cglobal pixel_satd_16x64, 4,6,8
+ SATD_START_SSE2 m6, m7, 1
+ BACKUP_POINTERS
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ RESTORE_AND_INC_POINTERS
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
+%endif
+
+cglobal pixel_satd_8x32, 4,6,8
+ SATD_START_SSE2 m6, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6
cglobal pixel_satd_8x16, 4,6,8
SATD_START_SSE2 m6, m7
More information about the x265-devel mailing list