[x265] [PATCH] primitives: asm: implementation of satd_16x12(sse2)

mandar at multicorewareinc.com mandar at multicorewareinc.com
Wed Jun 19 17:45:27 CEST 2013


# HG changeset patch
# User Mandar Gurav
# Date 1371608201 25200
# Node ID 621d4fc3875b55b9b7033a8415691e24c256e900
# Parent  41855cf217d08a47163474842e06a02a2a446363
primitives: asm: implementation of satd_16x12(sse2)

diff -r 41855cf217d0 -r 621d4fc3875b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jun 18 14:28:07 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jun 18 19:16:41 2013 -0700
@@ -198,7 +198,7 @@
         p.satd[PARTITION_12x64] = cmp<12, 64, 4, 16, x265_pixel_satd_4x16_sse2>;
 
         p.satd[PARTITION_16x4] = cmp<16, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
-        p.satd[PARTITION_16x12] = cmp<16, 12, 8, 4, x265_pixel_satd_8x4_sse2>;
+        p.satd[PARTITION_16x12] = x265_pixel_satd_16x12_sse2;
         p.satd[PARTITION_16x24] = cmp<16, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
         p.satd[PARTITION_16x32] = cmp<16, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
         p.satd[PARTITION_16x48] = cmp<16, 48, 16, 16, x265_pixel_satd_16x16_sse2>;
@@ -215,7 +215,7 @@
 
         p.satd[PARTITION_32x4] = cmp<32, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
         p.satd[PARTITION_32x8] = cmp<32, 8, 16, 8, x265_pixel_satd_16x8_sse2>;
-        p.satd[PARTITION_32x12] = cmp<32, 12, 8, 4, x265_pixel_satd_8x4_sse2>;
+        p.satd[PARTITION_32x12] = cmp<32, 12, 16, 12, x265_pixel_satd_16x12_sse2>;
         p.satd[PARTITION_32x16] = cmp<32, 16, 16, 16, x265_pixel_satd_16x16_sse2>;
         p.satd[PARTITION_32x24] = cmp<32, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
         p.satd[PARTITION_32x32] = cmp<32, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
@@ -224,7 +224,7 @@
 
         p.satd[PARTITION_48x4] = cmp<48, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
         p.satd[PARTITION_48x8] = cmp<48, 8, 16, 8, x265_pixel_satd_16x8_sse2>;
-        p.satd[PARTITION_48x12] = cmp<48, 12, 8, 4, x265_pixel_satd_8x4_sse2>;
+        p.satd[PARTITION_48x12] = cmp<48, 12, 16, 12, x265_pixel_satd_16x12_sse2>;
         p.satd[PARTITION_48x16] = cmp<48, 16, 16, 16, x265_pixel_satd_16x16_sse2>;
         p.satd[PARTITION_48x24] = cmp<48, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
         p.satd[PARTITION_48x32] = cmp<48, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
@@ -233,7 +233,7 @@
 
         p.satd[PARTITION_64x4] = cmp<64, 4, 8, 4, x265_pixel_satd_8x4_sse2>;
         p.satd[PARTITION_64x8] = cmp<64, 8, 16, 8, x265_pixel_satd_16x8_sse2>;
-        p.satd[PARTITION_64x12] = cmp<64, 12, 8, 4, x265_pixel_satd_8x4_sse2>;
+        p.satd[PARTITION_64x12] = cmp<64, 12, 16, 12, x265_pixel_satd_16x12_sse2>;
         p.satd[PARTITION_64x16] = cmp<64, 16, 16, 16, x265_pixel_satd_16x16_sse2>;
         p.satd[PARTITION_64x24] = cmp<64, 24, 16, 8, x265_pixel_satd_16x8_sse2>;
         p.satd[PARTITION_64x32] = cmp<64, 32, 16, 16, x265_pixel_satd_16x16_sse2>;
diff -r 41855cf217d0 -r 621d4fc3875b source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Jun 18 14:28:07 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Tue Jun 18 19:16:41 2013 -0700
@@ -1649,7 +1649,7 @@
 
 ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
 ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
-%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
+%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx)
 cglobal pixel_satd_16x4_internal
     LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
     lea  r2, [r2+4*r3]
@@ -1666,6 +1666,15 @@
 %endif
     jmp %%pixel_satd_16x8_internal
 
+cglobal pixel_satd_16x12, 4,6,12 ; 16x12 entry point: one 16x4 pass here plus the two in the shared 16x8 tail
+    SATD_START_SSE2 m10, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal ; first of three 16x4 passes
+    jmp %%pixel_satd_16x8_internal ; tail-jump into the 16x16 body's label: two more 16x4 calls, then SATD_END_SSE2 m10
+    SATD_END_SSE2 m10  ; NOTE(review): follows an unconditional jmp, so it looks unreachable (the 16x8 tail already ends with SATD_END_SSE2 m10) - confirm and drop
+
 cglobal pixel_satd_16x16, 4,6,12
     SATD_START_SSE2 m10, m7
 %if vertical
@@ -1676,7 +1685,7 @@
 %%pixel_satd_16x8_internal:
     call pixel_satd_16x4_internal
     call pixel_satd_16x4_internal
-    SATD_END_SSE2 m10
+    SATD_END_SSE2 m10  
 %else
 cglobal pixel_satd_16x8, 4,6,8
     SATD_START_SSE2 m6, m7
diff -r 41855cf217d0 -r 621d4fc3875b source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Tue Jun 18 14:28:07 2013 +0530
+++ b/source/common/x86/pixel.h	Tue Jun 18 19:16:41 2013 -0700
@@ -92,6 +92,10 @@
 DECL_X4( sad, cache64_sse2 );
 DECL_X4( sad, cache64_ssse3 );
 
+#if !HIGH_BIT_DEPTH
+int x265_pixel_satd_16x12_sse2 ( pixel *, intptr_t, pixel *, intptr_t );
+#endif
+
 DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, var, avx,  ( pixel *pix, intptr_t i_stride ))


More information about the x265-devel mailing list