[x265] [PATCH] primitives: asm: implementation of satd_16x12(sse2)
mandar at multicorewareinc.com
mandar at multicorewareinc.com
Wed Jun 19 17:45:27 CEST 2013
# HG changeset patch
# User Mandar Gurav
# Date 1371608201 25200
# Node ID 621d4fc3875b55b9b7033a8415691e24c256e900
# Parent 41855cf217d08a47163474842e06a02a2a446363
primitives: asm: implementation of satd_16x12(sse2)
diff -r 41855cf217d0 -r 621d4fc3875b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jun 18 14:28:07 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jun 18 19:16:41 2013 -0700
@@ -198,7 +198,7 @@
p.satd[PARTITION_12x64] = cmp<12, 64, 4, 16, x265_pixel_satd_4x16_sse2>;
p.satd[PARTITION_16x4] = cmp<16, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
- p.satd[PARTITION_16x12] = cmp<16, 12, 8, 4, x265_pixel_satd_8x4_sse2>;
+ p.satd[PARTITION_16x12] = x265_pixel_satd_16x12_sse2;
p.satd[PARTITION_16x24] = cmp<16, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
p.satd[PARTITION_16x32] = cmp<16, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_16x48] = cmp<16, 48, 16, 16, x265_pixel_satd_16x16_sse2>;
@@ -215,7 +215,7 @@
p.satd[PARTITION_32x4] = cmp<32, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
p.satd[PARTITION_32x8] = cmp<32, 8, 16, 8, x265_pixel_satd_16x8_sse2>;
- p.satd[PARTITION_32x12] = cmp<32, 12, 8, 4, x265_pixel_satd_8x4_sse2>;
+ p.satd[PARTITION_32x12] = cmp<32, 12, 16, 12, x265_pixel_satd_16x12_sse2>;
p.satd[PARTITION_32x16] = cmp<32, 16, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_32x24] = cmp<32, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
p.satd[PARTITION_32x32] = cmp<32, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
@@ -224,7 +224,7 @@
p.satd[PARTITION_48x4] = cmp<48, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
p.satd[PARTITION_48x8] = cmp<48, 8, 16, 8, x265_pixel_satd_16x8_sse2>;
- p.satd[PARTITION_48x12] = cmp<48, 12, 8, 4, x265_pixel_satd_8x4_sse2>;
+ p.satd[PARTITION_48x12] = cmp<48, 12, 16, 12, x265_pixel_satd_16x12_sse2>;
p.satd[PARTITION_48x16] = cmp<48, 16, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_48x24] = cmp<48, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
p.satd[PARTITION_48x32] = cmp<48, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
@@ -233,7 +233,7 @@
p.satd[PARTITION_64x4] = cmp<64, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
p.satd[PARTITION_64x8] = cmp<64, 8, 16, 8, x265_pixel_satd_16x8_sse2>;
- p.satd[PARTITION_64x12] = cmp<64, 12, 8, 4, x265_pixel_satd_8x4_sse2>;
+ p.satd[PARTITION_64x12] = cmp<64, 12, 16, 12, x265_pixel_satd_16x12_sse2>;
p.satd[PARTITION_64x16] = cmp<64, 16, 16, 16, x265_pixel_satd_16x16_sse2>;
p.satd[PARTITION_64x24] = cmp<64, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
p.satd[PARTITION_64x32] = cmp<64, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
diff -r 41855cf217d0 -r 621d4fc3875b source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Jun 18 14:28:07 2013 +0530
+++ b/source/common/x86/pixel-a.asm Tue Jun 18 19:16:41 2013 -0700
@@ -1649,7 +1649,7 @@
; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
-%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
+%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx)
cglobal pixel_satd_16x4_internal
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
@@ -1666,6 +1666,15 @@
%endif
jmp %%pixel_satd_16x8_internal
+cglobal pixel_satd_16x12, 4,6,12
+ SATD_START_SSE2 m10, m7
+%if vertical
+ mova m7, [pw_00ff]
+%endif
+ call pixel_satd_16x4_internal
+; tail-jump: %%pixel_satd_16x8_internal runs SATD_END_SSE2 m10 and returns
+ jmp %%pixel_satd_16x8_internal
+
cglobal pixel_satd_16x16, 4,6,12
SATD_START_SSE2 m10, m7
%if vertical
diff -r 41855cf217d0 -r 621d4fc3875b source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Tue Jun 18 14:28:07 2013 +0530
+++ b/source/common/x86/pixel.h Tue Jun 18 19:16:41 2013 -0700
@@ -92,6 +92,10 @@
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
+#if !HIGH_BIT_DEPTH
+int x265_pixel_satd_16x12_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+#endif
+
DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
More information about the x265-devel
mailing list