[x265] [PATCH] primitives: asm: update: implementation of satd(sse2)
mandar at multicorewareinc.com
Thu Jun 20 02:11:02 CEST 2013
# HG changeset patch
# User Mandar Gurav
# Date 1371686950 25200
# Node ID 71f39d462ae8d8bc8f76a16550d1b10a93981b91
# Parent 64d2861dc6a82d8663bf482040170308e2f75096
primitives: asm: update: implementation of satd(sse2)
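
For context: in the mappings below, cmp<W, H, bW, bH, func> is a C++ template
wrapper that tiles a WxH partition with repeated calls to a smaller bWxbH SSE2
SATD kernel and sums the partial costs. A minimal sketch of that wrapper,
reconstructed from the call sites in this diff (the body is an assumption, not
the tree's exact code; the pixel type and function-pointer signature follow
pixel.h):

    typedef int (*pixelcmp_t)(pixel *fenc, intptr_t fstride,
                              pixel *fref, intptr_t rstride);

    template<int lx, int ly, int dx, int dy, pixelcmp_t compare>
    int cmp(pixel *fenc, intptr_t fstride, pixel *fref, intptr_t rstride)
    {
        int sum = 0;
        // Walk the lx-by-ly partition in dx-by-dy tiles, calling the SSE2
        // primitive on each tile and accumulating the SATD costs.
        for (int y = 0; y < ly; y += dy)
            for (int x = 0; x < lx; x += dx)
                sum += compare(fenc + y * fstride + x, fstride,
                               fref + y * rstride + x, rstride);
        return sum;
    }

The patch replaces several of these tiled wrappers with dedicated assembly
routines (8x12 through 8x64, 16x4, and 16x24 through 16x64) and retargets the
remaining wide partitions onto larger tiles.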
diff -r 64d2861dc6a8 -r 71f39d462ae8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 19 04:31:08 2013 -0700
+++ b/source/common/x86/asm-primitives.cpp Wed Jun 19 17:09:10 2013 -0700
@@ -181,11 +181,11 @@
p.satd[PARTITION_4x48] = cmp<4, 48, 4, 16, x265_pixel_satd_4x16_sse2>;
p.satd[PARTITION_4x64] = cmp<4, 64, 4, 16, x265_pixel_satd_4x16_sse2>;
- p.satd[PARTITION_8x12] = cmp<8, 12, 8, 4, x265_pixel_satd_8x4_sse2>;
- p.satd[PARTITION_8x24] = cmp<8, 24, 8, 8, x265_pixel_satd_8x8_sse2>;
- p.satd[PARTITION_8x32] = cmp<8, 32, 8, 16, x265_pixel_satd_8x16_sse2>;
- p.satd[PARTITION_8x48] = cmp<8, 48, 8, 16, x265_pixel_satd_8x16_sse2>;
- p.satd[PARTITION_8x64] = cmp<8, 64, 8, 16, x265_pixel_satd_8x16_sse2>;
+ p.satd[PARTITION_8x12] = x265_pixel_satd_8x12_sse2;
+ p.satd[PARTITION_8x24] = x265_pixel_satd_8x24_sse2;
+ p.satd[PARTITION_8x32] = x265_pixel_satd_8x32_sse2;
+ p.satd[PARTITION_8x48] = x265_pixel_satd_8x48_sse2;
+ p.satd[PARTITION_8x64] = x265_pixel_satd_8x64_sse2;
p.satd[PARTITION_12x8] = cmp<12, 8, 4, 8, x265_pixel_satd_4x8_sse2>;
p.satd[PARTITION_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_sse2>;
@@ -194,12 +194,12 @@
p.satd[PARTITION_12x48] = cmp<12, 48, 4, 16, x265_pixel_satd_4x16_sse2>;
p.satd[PARTITION_12x64] = cmp<12, 64, 4, 16, x265_pixel_satd_4x16_sse2>;
- p.satd[PARTITION_16x4] = cmp<16, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
+ p.satd[PARTITION_16x4] = x265_pixel_satd_16x4_sse2;
p.satd[PARTITION_16x12] = x265_pixel_satd_16x12_sse2;
- p.satd[PARTITION_16x24] = cmp<16, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
- p.satd[PARTITION_16x32] = cmp<16, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
- p.satd[PARTITION_16x48] = cmp<16, 48, 16, 16, x265_pixel_satd_16x16_sse2>;
- p.satd[PARTITION_16x64] = cmp<16, 64, 16, 16, x265_pixel_satd_16x16_sse2>;
+ p.satd[PARTITION_16x24] = x265_pixel_satd_16x24_sse2;
+ p.satd[PARTITION_16x32] = x265_pixel_satd_16x32_sse2;
+ p.satd[PARTITION_16x48] = x265_pixel_satd_16x48_sse2;
+ p.satd[PARTITION_16x64] = x265_pixel_satd_16x64_sse2;
p.satd[PARTITION_24x4] = cmp<24, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
p.satd[PARTITION_24x8] = cmp<24, 8, 8, 8, x265_pixel_satd_8x8_sse2>;
@@ -210,29 +210,29 @@
p.satd[PARTITION_24x48] = cmp<24, 48, 8, 16, x265_pixel_satd_8x16_sse2>;
p.satd[PARTITION_24x64] = cmp<24, 64, 8, 16, x265_pixel_satd_8x16_sse2>;
- p.satd[PARTITION_32x4] = cmp<32, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
+ p.satd[PARTITION_32x4] = cmp<32, 4, 16, 4, x265_pixel_satd_16x4_sse2>;
p.satd[PARTITION_32x8] = cmp<32, 8, 16, 8, x265_pixel_satd_16x8_sse2>;
p.satd[PARTITION_32x12] = cmp<32, 12, 16, 12, x265_pixel_satd_16x12_sse2>;
p.satd[PARTITION_32x16] = cmp<32, 16, 16, 16, x265_pixel_satd_16x16_sse2>;
- p.satd[PARTITION_32x24] = cmp<32, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
+ p.satd[PARTITION_32x24] = cmp<32, 24, 16, 12, x265_pixel_satd_16x12_sse2>;
p.satd[PARTITION_32x32] = cmp<32, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_32x48] = cmp<32, 48, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_32x64] = cmp<32, 64, 16, 16, x265_pixel_satd_16x16_sse2>;
- p.satd[PARTITION_48x4] = cmp<48, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
+ p.satd[PARTITION_48x4] = cmp<48, 4, 16, 4, x265_pixel_satd_16x4_sse2>;
p.satd[PARTITION_48x8] = cmp<48, 8, 16, 8, x265_pixel_satd_16x8_sse2>;
p.satd[PARTITION_48x12] = cmp<48, 12, 16, 12, x265_pixel_satd_16x12_sse2>;
p.satd[PARTITION_48x16] = cmp<48, 16, 16, 16, x265_pixel_satd_16x16_sse2>;
- p.satd[PARTITION_48x24] = cmp<48, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
+ p.satd[PARTITION_48x24] = cmp<48, 24, 16, 12, x265_pixel_satd_16x12_sse2>;
p.satd[PARTITION_48x32] = cmp<48, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_48x48] = cmp<48, 48, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_48x64] = cmp<48, 64, 16, 16, x265_pixel_satd_16x16_sse2>;
- p.satd[PARTITION_64x4] = cmp<64, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
+ p.satd[PARTITION_64x4] = cmp<64, 4, 16, 4, x265_pixel_satd_16x4_sse2>;
p.satd[PARTITION_64x8] = cmp<64, 8, 16, 8, x265_pixel_satd_16x8_sse2>;
p.satd[PARTITION_64x12] = cmp<64, 12, 16, 12, x265_pixel_satd_16x12_sse2>;
p.satd[PARTITION_64x16] = cmp<64, 16, 16, 16, x265_pixel_satd_16x16_sse2>;
- p.satd[PARTITION_64x24] = cmp<64, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
+ p.satd[PARTITION_64x24] = cmp<64, 24, 16, 12, x265_pixel_satd_16x12_sse2>;
p.satd[PARTITION_64x32] = cmp<64, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_64x48] = cmp<64, 48, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_64x64] = cmp<64, 64, 16, 16, x265_pixel_satd_16x16_sse2>;
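
A quick call-count check on the retargeted wrappers above: 32x24 was tiled as
(32/16) x (24/8) = 6 calls of satd_16x8 and becomes (32/16) x (24/12) = 4 calls
of satd_16x12; likewise 48x24 drops from 9 to 6 calls and 64x24 from 12 to 8.
The Nx4 rows move from 8x4 tiles to 16x4 tiles, halving the call count
(e.g. 64x4: 8 calls down to 4).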
diff -r 64d2861dc6a8 -r 71f39d462ae8 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Jun 19 04:31:08 2013 -0700
+++ b/source/common/x86/pixel-a.asm Wed Jun 19 17:09:10 2013 -0700
@@ -1650,6 +1650,65 @@
; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx)
+
+cglobal pixel_satd_8x12, 4,6,8
+ SATD_START_SSE2 m6, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_8x8_internal
+ call %%pixel_satd_8x4_internal
+ SATD_END_SSE2 m6
+
+cglobal pixel_satd_8x24, 4,6,8
+ SATD_START_SSE2 m6, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6
+
+cglobal pixel_satd_8x32, 4,6,8
+ SATD_START_SSE2 m6, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6
+
+cglobal pixel_satd_8x48, 4,6,8
+ SATD_START_SSE2 m6, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6
+
+cglobal pixel_satd_8x64, 4,6,8
+ SATD_START_SSE2 m6, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6
+
cglobal pixel_satd_16x4_internal
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
@@ -1659,6 +1718,14 @@
SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
ret
+cglobal pixel_satd_16x4, 4,6,12
+ SATD_START_SSE2 m10, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_16x4_internal
+ SATD_END_SSE2 m10
+
cglobal pixel_satd_16x8, 4,6,12
SATD_START_SSE2 m10, m7
%if vertical
@@ -1675,6 +1742,72 @@
jmp %%pixel_satd_16x8_internal
SATD_END_SSE2 m10
+cglobal pixel_satd_16x24, 4,6,12
+ SATD_START_SSE2 m10, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ jmp %%pixel_satd_16x8_internal
+ SATD_END_SSE2 m10
+
+cglobal pixel_satd_16x32, 4,6,12
+ SATD_START_SSE2 m10, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ jmp %%pixel_satd_16x8_internal
+ SATD_END_SSE2 m10
+
+cglobal pixel_satd_16x48, 4,6,12
+ SATD_START_SSE2 m10, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ jmp %%pixel_satd_16x8_internal
+ SATD_END_SSE2 m10
+
+cglobal pixel_satd_16x64, 4,6,12
+ SATD_START_SSE2 m10, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ call pixel_satd_16x4_internal
+ jmp %%pixel_satd_16x8_internal
+ SATD_END_SSE2 m10
+
cglobal pixel_satd_16x16, 4,6,12
SATD_START_SSE2 m10, m7
%if vertical
diff -r 64d2861dc6a8 -r 71f39d462ae8 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Wed Jun 19 04:31:08 2013 -0700
+++ b/source/common/x86/pixel.h Wed Jun 19 17:09:10 2013 -0700
@@ -92,9 +92,19 @@
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-#if !HIGH_BIT_DEPTH
+int x265_pixel_satd_8x12_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+int x265_pixel_satd_8x24_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+int x265_pixel_satd_8x32_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+int x265_pixel_satd_8x48_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+int x265_pixel_satd_8x64_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+
+int x265_pixel_satd_16x4_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
int x265_pixel_satd_16x12_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
-#endif
+int x265_pixel_satd_16x24_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+int x265_pixel_satd_16x32_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+int x265_pixel_satd_16x48_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+int x265_pixel_satd_16x64_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+
DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
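
As a usage sketch of the new entries (a hypothetical call site: the setup entry
point Setup_Assembly_Primitives, the cpuid argument, and the buffers are
assumptions drawn from the surrounding tree, not part of this patch):

    int satd16x64(pixel *fenc, intptr_t fencStride,
                  pixel *fref, intptr_t frefStride, int cpuid)
    {
        EncoderPrimitives p;
        Setup_Assembly_Primitives(p, cpuid);  // install the SSE2 entries
        // With this patch the 16x64 entry is a single call into
        // x265_pixel_satd_16x64_sse2 instead of four 16x16 template calls.
        return p.satd[PARTITION_16x64](fenc, fencStride, fref, frefStride);
    }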