[x265] [PATCH] asm: instantiate some sad_x3 and sad_x4 functions for HEVC partitions

Steve Borho steve at borho.org
Thu Oct 24 09:26:19 CEST 2013


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1382599561 18000
#      Thu Oct 24 02:26:01 2013 -0500
# Node ID 3b8fa23f68ececed69fe59f3772dc5adda1b31c9
# Parent  e8f05b1c543a66734e53ab8fb95674cc81e7ac7c
asm: instantiate some sad_x3 and sad_x4 functions for HEVC partitions

diff -r e8f05b1c543a -r 3b8fa23f68ec source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Oct 23 23:59:51 2013 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 24 02:26:01 2013 -0500
@@ -269,7 +269,15 @@
 
         p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
         p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
+        p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
         p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_ssse3;
+        p.sad_x3[LUMA_8x32]  = x265_pixel_sad_x3_8x32_ssse3;
+        p.sad_x4[LUMA_8x32]  = x265_pixel_sad_x4_8x32_ssse3;
+
+        p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ssse3;
+        p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
+        p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ssse3;
+        p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
@@ -291,6 +299,13 @@
         p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_avx;
         SA8D_INTER_FROM_BLOCK(avx);
         ASSGN_SSE(avx);
+
+        p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_avx;
+        p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_avx;
+        p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_avx;
+        p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx;
+        p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_avx;
+        p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -310,6 +325,11 @@
         INIT2_NAME(sse_pp, ssd, _avx2);
         p.sa8d[BLOCK_8x8] = x265_pixel_sa8d_8x8_avx2;
         SA8D_INTER_FROM_BLOCK8(avx2);
+
+        p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_avx2;
+        p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
+        p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_avx2;
+        p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r e8f05b1c543a -r 3b8fa23f68ec source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Wed Oct 23 23:59:51 2013 -0500
+++ b/source/common/x86/pixel.h	Thu Oct 24 02:26:01 2013 -0500
@@ -29,8 +29,13 @@
 #define X265_I386_PIXEL_H
 
 #define DECL_PIXELS(ret, name, suffix, args) \
+    ret x265_pixel_ ## name ## _16x64_ ## suffix args; \
+    ret x265_pixel_ ## name ## _16x32_ ## suffix args; \
     ret x265_pixel_ ## name ## _16x16_ ## suffix args; \
+    ret x265_pixel_ ## name ## _16x12_ ## suffix args; \
     ret x265_pixel_ ## name ## _16x8_ ## suffix args; \
+    ret x265_pixel_ ## name ## _16x4_ ## suffix args; \
+    ret x265_pixel_ ## name ## _8x32_ ## suffix args; \
     ret x265_pixel_ ## name ## _8x16_ ## suffix args; \
     ret x265_pixel_ ## name ## _8x8_ ## suffix args; \
     ret x265_pixel_ ## name ## _8x4_ ## suffix args; \
@@ -58,6 +63,7 @@
 DECL_X4(sad, sse2)
 DECL_X4(sad, sse3)
 DECL_X4(sad, ssse3)
+DECL_X4(sad, avx)
 DECL_X4(sad, avx2)
 DECL_X1(ssd, mmx)
 DECL_X1(ssd, mmx2)
diff -r e8f05b1c543a -r 3b8fa23f68ec source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Wed Oct 23 23:59:51 2013 -0500
+++ b/source/common/x86/sad-a.asm	Thu Oct 24 02:26:01 2013 -0500
@@ -1504,8 +1504,10 @@
 INIT_XMM sse3
 SAD_X_SSE2 3, 16, 16, 7
 SAD_X_SSE2 3, 16,  8, 7
+SAD_X_SSE2 3, 16,  4, 7    ; new x3 16x4 instantiation; NOTE(review): no sse3 16x4 assignment is visible in this patch's asm-primitives.cpp hunks - confirm it is wired up
 SAD_X_SSE2 4, 16, 16, 7
 SAD_X_SSE2 4, 16,  8, 7
+SAD_X_SSE2 4, 16,  4, 7    ; new x4 16x4 instantiation; NOTE(review): same as above - only the avx 16x4 variants are assigned in the visible hunks
 
 %macro SAD_X_SSSE3 3
 cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
@@ -1518,19 +1520,32 @@
 %endmacro
 
 INIT_XMM ssse3
+SAD_X_SSE2  3, 16, 32, 7    ; x3 16x32; presumably x265_pixel_sad_x3_16x32_ssse3 (SAD_X_SSE2 body is not shown in this patch)
 SAD_X_SSE2  3, 16, 16, 7
+SAD_X_SSE2  3, 16, 12, 7    ; x3 16x12; presumably x265_pixel_sad_x3_16x12_ssse3
 SAD_X_SSE2  3, 16,  8, 7
+SAD_X_SSE2  3,  8, 32, 7    ; x3 8x32; presumably x265_pixel_sad_x3_8x32_ssse3
+SAD_X_SSE2  3,  8, 16, 7    ; x3 8x16; presumably x265_pixel_sad_x3_8x16_ssse3
+SAD_X_SSE2  4, 16, 32, 7    ; x4 16x32; presumably x265_pixel_sad_x4_16x32_ssse3
 SAD_X_SSE2  4, 16, 16, 7
+SAD_X_SSE2  4, 16, 12, 7    ; x4 16x12; presumably x265_pixel_sad_x4_16x12_ssse3
 SAD_X_SSE2  4, 16,  8, 7
+SAD_X_SSSE3 4,  8, 32       ; cglobal pixel_sad_x4_8x32; presumably the x265_pixel_sad_x4_8x32_ssse3 assigned in asm-primitives.cpp
 SAD_X_SSSE3 4,  8, 16
 SAD_X_SSSE3 4,  8,  8
 SAD_X_SSSE3 4,  8,  4
 
 INIT_XMM avx
+SAD_X_SSE2 3, 16, 32, 6    ; x3 16x32; presumably x265_pixel_sad_x3_16x32_avx (SAD_X_SSE2 body is not shown in this patch)
 SAD_X_SSE2 3, 16, 16, 6
+SAD_X_SSE2 3, 16, 12, 6    ; x3 16x12; presumably x265_pixel_sad_x3_16x12_avx
 SAD_X_SSE2 3, 16,  8, 6
+SAD_X_SSE2 3, 16,  4, 6    ; x3 16x4; presumably x265_pixel_sad_x3_16x4_avx
+SAD_X_SSE2 4, 16, 32, 7    ; x4 16x32; presumably x265_pixel_sad_x4_16x32_avx
 SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 12, 7    ; x4 16x12; presumably x265_pixel_sad_x4_16x12_avx
 SAD_X_SSE2 4, 16,  8, 7
+SAD_X_SSE2 4, 16,  4, 7    ; x4 16x4; presumably x265_pixel_sad_x4_16x4_avx
 
 %macro SAD_X_AVX2 4
 cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
@@ -1543,9 +1558,13 @@
 %endmacro
 
 INIT_YMM avx2
+SAD_X_AVX2 3, 16, 32, 7    ; cglobal pixel_sad_x3_16x32; presumably the x265_pixel_sad_x3_16x32_avx2 assigned in asm-primitives.cpp
 SAD_X_AVX2 3, 16, 16, 7
+SAD_X_AVX2 3, 16, 12, 7    ; cglobal pixel_sad_x3_16x12; NOTE(review): macro body not shown - confirm its row loop handles a height of 12
 SAD_X_AVX2 3, 16,  8, 7
+SAD_X_AVX2 4, 16, 32, 8    ; cglobal pixel_sad_x4_16x32; presumably the x265_pixel_sad_x4_16x32_avx2 assigned in asm-primitives.cpp
 SAD_X_AVX2 4, 16, 16, 8
+SAD_X_AVX2 4, 16, 12, 8    ; cglobal pixel_sad_x4_16x12; NOTE(review): macro body not shown - confirm its row loop handles a height of 12
 SAD_X_AVX2 4, 16,  8, 8
 
 ;=============================================================================


More information about the x265-devel mailing list