[x265] [PATCH] asm: 16bpp support for sad_x4 - all block sizes

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Thu Dec 5 14:02:18 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386248495 -19800
#      Thu Dec 05 18:31:35 2013 +0530
# Node ID ca02a77ece624815aaf7b52fffb6f7710e13bc1e
# Parent  9de0a1e5cf6a710aac8f28be46cca49a9acce85f
asm: 16bpp support for sad_x4 - all block sizes

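For context, sad_x4 compares one encode block against four reference blocks at once and returns the four sums of absolute differences. Below is a minimal C++ sketch of what the primitive computes for 16-bit pixels; the signature is assumed to follow x265's pixelcmp_x4_t layout (fixed FENC_STRIDE for the encode block, one shared stride for the references), and the sketch is illustrative only, not the code in the tree.

// Illustrative reference only -- not part of the patch. Assumes the x265
// pixelcmp_x4_t layout: fenc uses a fixed FENC_STRIDE, the four reference
// blocks share frefstride, and the four SADs are written to res[0..3].
#include <cstdint>
#include <cstdlib>

typedef uint16_t pixel;             // 16bpp (HIGH_BIT_DEPTH) build
static const int FENC_STRIDE = 64;  // assumed, as in x265

template<int W, int H>
void sad_x4_c(const pixel* fenc,
              const pixel* fref0, const pixel* fref1,
              const pixel* fref2, const pixel* fref3,
              intptr_t frefstride, int32_t* res)
{
    const pixel* fref[4] = { fref0, fref1, fref2, fref3 };
    for (int i = 0; i < 4; i++)
    {
        int32_t sum = 0;            // 32-bit accumulation, matching the asm change below
        const pixel* enc = fenc;
        const pixel* ref = fref[i];
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                sum += abs((int)enc[x] - (int)ref[x]);
            enc += FENC_STRIDE;
            ref += frefstride;
        }
        res[i] = sum;
    }
}

For example, sad_x4_c<64, 64> corresponds to the LUMA_64x64 entry wired up in the table below.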
diff -r 9de0a1e5cf6a -r ca02a77ece62 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Dec 05 16:55:35 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Dec 05 18:31:35 2013 +0530
@@ -91,6 +91,24 @@
     p.sad_x3[LUMA_64x48] = x265_pixel_sad_x3_64x48_ ## cpu; \
     p.sad_x3[LUMA_64x64] = x265_pixel_sad_x3_64x64_ ## cpu
 
+#define SAD_X4(cpu) \
+    p.sad_x4[LUMA_16x8] = x265_pixel_sad_x4_16x8_ ## cpu; \
+    p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ ## cpu; \
+    p.sad_x4[LUMA_16x16] = x265_pixel_sad_x4_16x16_ ## cpu; \
+    p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ ## cpu; \
+    p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ ## cpu; \
+    p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ ## cpu; \
+    p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ ## cpu; \
+    p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ ## cpu; \
+    p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ ## cpu; \
+    p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ ## cpu; \
+    p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ ## cpu; \
+    p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ ## cpu; \
+    p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ ## cpu; \
+    p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ ## cpu; \
+    p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ ## cpu; \
+    p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ ## cpu
+
 #define ASSGN_SSE(cpu) \
     p.sse_pp[LUMA_8x8]   = x265_pixel_ssd_8x8_ ## cpu; \
     p.sse_pp[LUMA_8x4]   = x265_pixel_ssd_8x4_ ## cpu; \
@@ -593,6 +611,17 @@
         p.sad_x3[LUMA_16x4] = x265_pixel_sad_x3_16x4_sse2;
         p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_mmx2;
 
+        SAD_X4(sse2);
+        p.sad_x4[LUMA_4x4] = x265_pixel_sad_x4_4x4_mmx2;
+        p.sad_x4[LUMA_4x8] = x265_pixel_sad_x4_4x8_mmx2;
+        p.sad_x4[LUMA_4x16] = x265_pixel_sad_x4_4x16_mmx2;
+        p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_sse2;
+        p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_sse2;
+        p.sad_x4[LUMA_8x16] = x265_pixel_sad_x4_8x16_sse2;
+        p.sad_x4[LUMA_8x32] = x265_pixel_sad_x4_8x32_sse2;
+        p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
+        p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
+
         p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
         p.cvt16to32_shl = x265_cvt16to32_shl_sse2;
     }
@@ -767,6 +796,7 @@
         p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
 
         SAD_X3(ssse3);
+        SAD_X4(ssse3);
         p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
         p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
         p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
@@ -776,20 +806,6 @@
 
         p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_ssse3;
         p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_ssse3;
-        p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
-        p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
-        p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
-        p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ssse3;
-        p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ssse3;
-        p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ssse3;
-        p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ssse3;
-        p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ssse3;
-        p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ssse3;
-        p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_ssse3;
-        p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_ssse3;
-        p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_ssse3;
-        p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_ssse3;
-        p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_ssse3;
 
         p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
         p.luma_p2s = x265_luma_p2s_ssse3;
@@ -863,25 +879,12 @@
         HEVC_SATD(avx);
         ASSGN_SSE_SS(avx);
         SAD_X3(avx);
+        SAD_X4(avx);
         p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
         p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
         p.sad_x3[LUMA_16x4]  = x265_pixel_sad_x3_16x4_avx;
         p.sad_x4[LUMA_16x4]  = x265_pixel_sad_x4_16x4_avx;
-        p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx;
-        p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
-        p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_avx;
-        p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_avx;
 
-        p.sad_x4[LUMA_32x8]  = x265_pixel_sad_x4_32x8_avx;
-        p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_avx;
-        p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_avx;
-        p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_avx;
-        p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_avx;
-        p.sad_x4[LUMA_48x64] = x265_pixel_sad_x4_48x64_avx;
-        p.sad_x4[LUMA_64x16] = x265_pixel_sad_x4_64x16_avx;
-        p.sad_x4[LUMA_64x32] = x265_pixel_sad_x4_64x32_avx;
-        p.sad_x4[LUMA_64x48] = x265_pixel_sad_x4_64x48_avx;
-        p.sad_x4[LUMA_64x64] = x265_pixel_sad_x4_64x64_avx;
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
         p.ssim_end_4 = x265_pixel_ssim_end4_avx;
     }
diff -r 9de0a1e5cf6a -r ca02a77ece62 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Thu Dec 05 16:55:35 2013 +0530
+++ b/source/common/x86/sad16-a.asm	Thu Dec 05 18:31:35 2013 +0530
@@ -536,6 +536,10 @@
     psubw   m3, m4
     ABSW2   m0, m1, m0, m1, m5, m6
     ABSW2   m2, m3, m2, m3, m4, m7
+    pmaddwd m0, [pw_1]
+    pmaddwd m1, [pw_1]
+    pmaddwd m2, [pw_1]
+    pmaddwd m3, [pw_1]
 %endmacro
 
 %macro SAD_X4_ONE 2
@@ -551,10 +555,14 @@
     psubw   m8, m4
     ABSW2   m5, m6, m5, m6, m9, m10
     ABSW2   m7, m8, m7, m8, m9, m10
-    paddw   m0, m5
-    paddw   m1, m6
-    paddw   m2, m7
-    paddw   m3, m8
+    pmaddwd m5, [pw_1]
+    pmaddwd m6, [pw_1]
+    pmaddwd m7, [pw_1]
+    pmaddwd m8, [pw_1]
+    paddd   m0, m5
+    paddd   m1, m6
+    paddd   m2, m7
+    paddd   m3, m8
 %elif cpuflag(ssse3)
     movu    m7, [r3+%2]
     psubw   m5, m4
@@ -566,24 +574,32 @@
     pabsw   m6, m6
     pabsw   m7, m7
     pabsw   m4, m4
-    paddw   m0, m5
-    paddw   m1, m6
-    paddw   m2, m7
-    paddw   m3, m4
+    pmaddwd m5, [pw_1]
+    pmaddwd m6, [pw_1]
+    pmaddwd m7, [pw_1]
+    pmaddwd m4, [pw_1]
+    paddd   m0, m5
+    paddd   m1, m6
+    paddd   m2, m7
+    paddd   m3, m4
 %else ; num_mmregs == 8 && !ssse3
     psubw   m5, m4
     psubw   m6, m4
     ABSW    m5, m5, m7
     ABSW    m6, m6, m7
-    paddw   m0, m5
-    paddw   m1, m6
+    pmaddwd m5, [pw_1]
+    pmaddwd m6, [pw_1]
+    paddd   m0, m5
+    paddd   m1, m6
     movu    m5, [r3+%2]
     movu    m6, [r4+%2]
     psubw   m5, m4
     psubw   m6, m4
     ABSW2   m5, m6, m5, m6, m7, m4
-    paddw   m2, m5
-    paddw   m3, m6
+    pmaddwd m5, [pw_1]
+    pmaddwd m6, [pw_1]
+    paddd   m2, m5
+    paddd   m3, m6
 %endif
 %endmacro
 
@@ -594,10 +610,10 @@
     HADDUW    m2, m6
     HADDUW    m3, m7
 %else
-    HADDW     m0, m4
-    HADDW     m1, m5
-    HADDW     m2, m6
-    HADDW     m3, m7
+    HADDD     m0, m4
+    HADDD     m1, m5
+    HADDD     m2, m6
+    HADDD     m3, m7
 %endif
     mov       r0, r6mp
     movd [r0+ 0], xm0
@@ -734,9 +750,11 @@
 SAD_X 3,  4,  4
 SAD_X 4, 16, 16
 SAD_X 4, 16,  8
+SAD_X 4, 12, 16
 SAD_X 4,  8, 16
 SAD_X 4,  8,  8
 SAD_X 4,  8,  4
+SAD_X 4,  4, 16
 SAD_X 4,  4,  8
 SAD_X 4,  4,  4
 INIT_MMX ssse3
@@ -781,8 +799,24 @@
 SAD_X 3,  8,  8
 SAD_X 3,  8,  4
 %define XMM_REGS 11
+SAD_X 4, 64, 64
+SAD_X 4, 64, 48
+SAD_X 4, 64, 32
+SAD_X 4, 64, 16
+SAD_X 4, 48, 64
+SAD_X 4, 32, 64
+SAD_X 4, 32, 32
+SAD_X 4, 32, 24
+SAD_X 4, 32, 16
+SAD_X 4, 32,  8
+SAD_X 4, 24, 32
+SAD_X 4, 16, 64
+SAD_X 4, 16, 32
 SAD_X 4, 16, 16
+SAD_X 4, 16, 12
 SAD_X 4, 16,  8
+SAD_X 4, 16,  4
+SAD_X 4,  8, 32
 SAD_X 4,  8, 16
 SAD_X 4,  8,  8
 SAD_X 4,  8,  4


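Note on the SAD_X4_ONE / SAD_X4_END changes above: with 16-bit pixels a single absolute difference can already approach 2^bitDepth - 1, so for the larger LUMA partitions the running totals no longer fit in the 16-bit lanes that the old paddw/HADDW path accumulated into. The patch therefore multiplies the word differences by a vector of ones with pmaddwd, which pairwise-sums them into 32-bit lanes, and switches accumulation and the final horizontal reduction to paddd/HADDD. A hedged SSE2-intrinsics sketch of that widening step (names are mine, not from the patch):

// Illustrative only: the intrinsics equivalent of the pmaddwd/paddd step
// introduced in SAD_X4_ONE. absdiff_w holds eight word (16-bit) absolute
// differences; acc_d is one of the dword accumulators (the asm's m0..m3).
#include <emmintrin.h>   // SSE2

static inline __m128i widen_and_accumulate(__m128i acc_d, __m128i absdiff_w)
{
    const __m128i pw_1 = _mm_set1_epi16(1);            // the [pw_1] constant
    __m128i pair_d = _mm_madd_epi16(absdiff_w, pw_1);  // pmaddwd: (d0+d1), (d2+d3), ... as dwords
    return _mm_add_epi32(acc_d, pair_d);               // paddd into the 32-bit accumulator
}

SAD_X4_END then reduces each dword accumulator horizontally (HADDD instead of HADDW) before storing the four results.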