[x265] [PATCH] asm: assembly code for satd_16x32, satd_16x64, satd_8x32

Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
Tue Nov 12 12:05:27 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1384254277 -19800
#      Tue Nov 12 16:34:37 2013 +0530
# Node ID 76120c6aa908b35734f8e09541090e1834584070
# Parent  428b58f09945a4eacdaca057a68ecb570342f5c6
asm: assembly code for satd_16x32, satd_16x64, satd_8x32

diff -r 428b58f09945 -r 76120c6aa908 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Nov 12 13:16:19 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Nov 12 16:34:37 2013 +0530
@@ -58,16 +58,7 @@
 #define INIT7(name, cpu) INIT7_NAME(name, name, cpu)
 #define INIT8(name, cpu) INIT8_NAME(name, name, cpu)
 
-#if X86_64
-#define HEVC_X64_SATD(cpu)
-#else
-#define HEVC_X64_SATD(cpu) \
-    p.satd[LUMA_8x32] = cmp<8, 32, 8, 16, x265_pixel_satd_8x16_ ## cpu>; \
-    p.satd[LUMA_16x32] = cmp<16, 32, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
-    p.satd[LUMA_16x64] = cmp<16, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>;
-#endif
 #define HEVC_SATD(cpu) \
-    HEVC_X64_SATD(cpu) \
     p.satd[LUMA_32x32] = cmp<32, 32, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
     p.satd[LUMA_24x32] = cmp<24, 32, 8, 16, x265_pixel_satd_8x16_ ## cpu>; \
     p.satd[LUMA_64x64] = cmp<64, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
@@ -327,9 +318,12 @@
         INIT8(sad_x4, _mmx2);
         INIT8(satd, _mmx2);
         HEVC_SATD(mmx2);
+        p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
         p.satd[LUMA_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_mmx2>;
         p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
         p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
+        p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
+        p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
         p.satd[LUMA_32x8]  = x265_pixel_satd_32x8_sse2;
         p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
         p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
@@ -407,11 +401,6 @@
         p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
         p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
         p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
-#if X86_64
-        p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
-        p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
-        p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
-#endif
 
         p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
         p.sa8d[BLOCK_8x8]   = x265_pixel_sa8d_8x8_sse2;
diff -r 428b58f09945 -r 76120c6aa908 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Nov 12 13:16:19 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Tue Nov 12 16:34:37 2013 +0530
@@ -1654,17 +1654,6 @@
 ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
 %if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)
 
-cglobal pixel_satd_8x32, 4,6,8
-    SATD_START_SSE2 m6, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
-
 cglobal pixel_satd_16x4_internal
     LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
     lea  r2, [r2+4*r3]
@@ -1964,7 +1953,56 @@
     call pixel_satd_8x8_internal
     call pixel_satd_8x8_internal
     SATD_END_SSE2 m6, m7
-%endif
+
+cglobal pixel_satd_16x32, 4,6,8
+    SATD_START_SSE2 m6, m7, 1
+    BACKUP_POINTERS
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_ACCUM m6, m0, m7
+    RESTORE_AND_INC_POINTERS
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_END_SSE2 m6, m7
+
+cglobal pixel_satd_16x64, 4,6,8
+    SATD_START_SSE2 m6, m7, 1
+    BACKUP_POINTERS
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_ACCUM m6, m0, m7
+    RESTORE_AND_INC_POINTERS
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_END_SSE2 m6, m7
+%endif
+
+cglobal pixel_satd_8x32, 4,6,8
+    SATD_START_SSE2 m6, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_END_SSE2 m6
 
 cglobal pixel_satd_8x16, 4,6,8
     SATD_START_SSE2 m6, m7


More information about the x265-devel mailing list