[x265] [PATCH] asm: 16bpp support for sad_x3 - all block sizes

yuvaraj at multicorewareinc.com
Thu Dec 5 12:26:29 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386242735 -19800
#      Thu Dec 05 16:55:35 2013 +0530
# Node ID 9de0a1e5cf6a710aac8f28be46cca49a9acce85f
# Parent  79d649d551f04121950a7de1bf5f48e52f625f6a
asm: 16bpp support for sad_x3 - all block sizes

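The SAD_X3 macros in sad16-a.asm previously accumulated absolute differences in 16-bit words (paddw, reduced with HADDW). For 16bpp builds that is presumably too narrow for the larger partitions: a 16-bit lane holds at most 64 worst-case 10-bit differences (64 * 1023 = 65,472), so the patch multiplies each vector of word differences by pw_1 with pmaddwd (a widening horizontal pair-add into dwords), accumulates with paddd, and reduces with HADDD. New SAD_X 3 instantiations are added (4x16 and 12x16 under mmx2, the wider sizes under sse2) so that every luma partition is covered, and asm-primitives.cpp picks the entry points up through a SAD_X3() macro, dropping the per-size ssse3/avx assignments the macro now replaces.

For reference, a minimal scalar sketch of what a sad_x3 primitive is expected to compute at 16bpp follows. The pixelcmp_x3-style argument order, the FENC_STRIDE value, and the explicit width/height parameters are illustrative assumptions, not code from this patch:

    #include <cstdint>
    #include <cstdlib>

    typedef uint16_t pixel;                  // HIGH_BIT_DEPTH: samples stored in 16 bits
    static const intptr_t FENC_STRIDE = 64;  // assumed encode-block stride

    // Compare one encode block against three reference candidates and return
    // the three sums of absolute differences in res[0..2].
    static void sad_x3_c(const pixel *fenc,
                         const pixel *fref0, const pixel *fref1, const pixel *fref2,
                         intptr_t frefstride, int32_t *res,
                         int width, int height)
    {
        res[0] = res[1] = res[2] = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                // 32-bit sums are required: a 64x64 block of 10-bit samples
                // can reach 4096 * 1023 = 4,190,208.
                res[0] += abs(fenc[x] - fref0[x]);
                res[1] += abs(fenc[x] - fref1[x]);
                res[2] += abs(fenc[x] - fref2[x]);
            }
            fenc  += FENC_STRIDE;
            fref0 += frefstride;
            fref1 += frefstride;
            fref2 += frefstride;
        }
    }
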
diff -r 79d649d551f0 -r 9de0a1e5cf6a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 04 20:43:39 2013 +0550
+++ b/source/common/x86/asm-primitives.cpp	Thu Dec 05 16:55:35 2013 +0530
@@ -73,6 +73,24 @@
     p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
     p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu
 
+#define SAD_X3(cpu) \
+    p.sad_x3[LUMA_16x8] = x265_pixel_sad_x3_16x8_ ## cpu; \
+    p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ ## cpu; \
+    p.sad_x3[LUMA_16x16] = x265_pixel_sad_x3_16x16_ ## cpu; \
+    p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ ## cpu; \
+    p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ ## cpu; \
+    p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ ## cpu; \
+    p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ ## cpu; \
+    p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ ## cpu; \
+    p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ ## cpu; \
+    p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ ## cpu; \
+    p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ ## cpu; \
+    p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ ## cpu; \
+    p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ ## cpu; \
+    p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ ## cpu; \
+    p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \
+    p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu
+
 #define ASSGN_SSE(cpu) \
     p.sse_pp[LUMA_8x8]   = x265_pixel_ssd_8x8_ ## cpu; \
     p.sse_pp[LUMA_8x4]   = x265_pixel_ssd_8x4_ ## cpu; \
@@ -564,6 +582,17 @@
         p.sad[LUMA_24x32] = x265_pixel_sad_24x32_sse2;
         p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
 
+        SAD_X3(sse2);
+        p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2;
+        p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2;
+        p.sad_x3[LUMA_4x16] = x265_pixel_sad_x3_4x16_mmx2;
+        p.sad_x3[LUMA_8x4] = x265_pixel_sad_x3_8x4_sse2;
+        p.sad_x3[LUMA_8x8] = x265_pixel_sad_x3_8x8_sse2;
+        p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_sse2;
+        p.sad_x3[LUMA_8x32] = x265_pixel_sad_x3_8x32_sse2;
+        p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2;
+        p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2;
+
         p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
         p.cvt16to32_shl = x265_cvt16to32_shl_sse2;
     }
@@ -737,6 +766,7 @@
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
         p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
 
+        SAD_X3(ssse3);
         p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
         p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
         p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
@@ -746,30 +776,16 @@
 
         p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
         p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
-        p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_ssse3;
         p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
-        p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ssse3;
         p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
-        p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ssse3;
         p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
-        p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ssse3;
         p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ssse3;
-        p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ssse3;
-        p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ssse3;
-        p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ssse3;
-        p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ssse3;
-        p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ssse3;
         p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ssse3;
         p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ssse3;
         p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ssse3;
         p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ssse3;
         p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ssse3;
-        p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_ssse3;
         p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ssse3;
-        p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_ssse3;
-        p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_ssse3;
-        p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ssse3;
-        p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ssse3;
         p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ssse3;
         p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ssse3;
         p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ssse3;
@@ -846,36 +862,22 @@
         ASSGN_SSE(avx);
         HEVC_SATD(avx);
         ASSGN_SSE_SS(avx);
-
+        SAD_X3(avx);
         p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
         p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
         p.sad_x3[LUMA_16x4]  = x265_pixel_sad_x3_16x4_avx;
         p.sad_x4[LUMA_16x4]  = x265_pixel_sad_x4_16x4_avx;
-        p.sad_x3[LUMA_16x12] = x265_pixel_sad_x3_16x12_avx;
         p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx;
-        p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_avx;
         p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
-        p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_avx;
         p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_avx;
-        p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_avx;
         p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_avx;
 
-        p.sad_x3[LUMA_32x8]  = x265_pixel_sad_x3_32x8_avx;
-        p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_avx;
-        p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_avx;
-        p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_avx;
-        p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_avx;
         p.sad_x4[LUMA_32x8]  = x265_pixel_sad_x4_32x8_avx;
         p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_avx;
         p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_avx;
         p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_avx;
         p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_avx;
-        p.sad_x3[LUMA_48x64] = x265_pixel_sad_x3_48x64_avx;
         p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_avx;
-        p.sad_x3[LUMA_64x16] = x265_pixel_sad_x3_64x16_avx;
-        p.sad_x3[LUMA_64x32] = x265_pixel_sad_x3_64x32_avx;
-        p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_avx;
-        p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_avx;
         p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_avx;
         p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_avx;
         p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_avx;
diff -r 79d649d551f0 -r 9de0a1e5cf6a source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Wed Dec 04 20:43:39 2013 +0550
+++ b/source/common/x86/sad16-a.asm	Thu Dec 05 16:55:35 2013 +0530
@@ -470,6 +470,9 @@
     psubw   m2, m3
     ABSW2   m0, m1, m0, m1, m4, m5
     ABSW    m2, m2, m6
+    pmaddwd m0, [pw_1]
+    pmaddwd m1, [pw_1]
+    pmaddwd m2, [pw_1]
 %endmacro
 
 %macro SAD_X3_ONE 2
@@ -482,9 +485,12 @@
     psubw   m5, m6
     ABSW2   m3, m4, m3, m4, m7, m6
     ABSW    m5, m5, m6
-    paddw   m0, m3
-    paddw   m1, m4
-    paddw   m2, m5
+    pmaddwd m3, [pw_1]
+    pmaddwd m4, [pw_1]
+    pmaddwd m5, [pw_1]
+    paddd   m0, m3
+    paddd   m1, m4
+    paddd   m2, m5
 %endmacro
 
 %macro SAD_X3_END 2
@@ -493,9 +499,9 @@
     HADDUW   m1, m4
     HADDUW   m2, m5
 %else
-    HADDW    m0, m3
-    HADDW    m1, m4
-    HADDW    m2, m5
+    HADDD    m0, m3
+    HADDD    m1, m4
+    HADDD    m2, m5
 %endif
 %if UNIX64
     movd [r5+0], xm0
@@ -719,9 +725,11 @@
 %define XMM_REGS 0
 SAD_X 3, 16, 16
 SAD_X 3, 16,  8
+SAD_X 3, 12, 16
 SAD_X 3,  8, 16
 SAD_X 3,  8,  8
 SAD_X 3,  8,  4
+SAD_X 3,  4, 16
 SAD_X 3,  4,  8
 SAD_X 3,  4,  4
 SAD_X 4, 16, 16
@@ -751,8 +759,24 @@
 SAD_X 4,  8,  4
 INIT_XMM sse2
 %define XMM_REGS 8
+SAD_X 3, 64, 64
+SAD_X 3, 64, 48
+SAD_X 3, 64, 32
+SAD_X 3, 64, 16
+SAD_X 3, 48, 64
+SAD_X 3, 32, 64
+SAD_X 3, 32, 32
+SAD_X 3, 32, 24
+SAD_X 3, 32, 16
+SAD_X 3, 32,  8
+SAD_X 3, 24, 32
+SAD_X 3, 16, 64
+SAD_X 3, 16, 32
 SAD_X 3, 16, 16
+SAD_X 3, 16, 12
 SAD_X 3, 16,  8
+SAD_X 3, 16,  4
+SAD_X 3,  8, 32
 SAD_X 3,  8, 16
 SAD_X 3,  8,  8
 SAD_X 3,  8,  4

