[x265] [PATCH 273 of 307] x86: AVX512 intra_pred_ang32x32 mode 10 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:31 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1514366500 -19800
#      Wed Dec 27 14:51:40 2017 +0530
# Node ID 8036bbce3d26fbebd3408a7e17a76206275fbde9
# Parent  ca3c04bd0a71bb263b8084283acce012f0cc397c
x86: AVX512 intra_pred_ang32x32 mode 10 for high bit depth

Primitive           | AVX2 performance | AVX512 performance
-------------------------------------------------------------
intra_ang_32x32[10] |     18.99x       |      29.11x

diff -r ca3c04bd0a71 -r 8036bbce3d26 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Dec 29 09:30:36 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Dec 27 14:51:40 2017 +0530
@@ -3091,6 +3091,8 @@
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
         p.cu[BLOCK_32x32].intra_pred[2]      = PFX(intra_pred_ang32_2_avx512);
         p.cu[BLOCK_32x32].intra_pred[34]     = PFX(intra_pred_ang32_2_avx512);
+        p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx512);
+
         p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
         p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
         p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x32>;
diff -r ca3c04bd0a71 -r 8036bbce3d26 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Fri Dec 29 09:30:36 2017 +0530
+++ b/source/common/x86/intrapred16.asm	Wed Dec 27 14:51:40 2017 +0530
@@ -18389,6 +18389,100 @@
     palignr     m2,                 m0, m1, 14
     movu        [r0 + r3],          m2
     RET
+
+cglobal intra_pred_ang32_10, 3,4,2     ; 3 args, 4 GPRs, 2 vector regs; as used below: r0 = dst, r1 = dst stride, r2 = src; fills 32 rows, each with one broadcast source word
+    add             r2, mmsize*2       ; skip the first mmsize*2 bytes of src (presumably the top-left + above samples; mmsize is set outside this view -- confirm)
+    add             r1d, r1d           ; double the stride (presumably samples -> bytes for 16-bit pixels; confirm against caller)
+    lea             r3, [r1 * 3]       ; r3 = 3 * stride: offset of the 4th row within each 4-row group
+    ; rows 0-3: source words 1..4 relative to the adjusted r2 (word 0 at [r2] is never read)
+    vpbroadcastw    m0, [r2 + 2]       ; [1...]
+    vpbroadcastw    m1, [r2 + 2 + 2]   ; [2...]
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+
+    vpbroadcastw    m0, [r2 + 2 + 4]   ; [3...]
+    vpbroadcastw    m1, [r2 + 2 + 6]   ; [4...]
+    movu            [r0 + r1 * 2], m0
+    movu            [r0 + r3], m1
+    lea             r0, [r0 + r1 * 4]  ; advance dst by 4 rows
+    ; rows 4-7: source words 5..8
+    vpbroadcastw    m0, [r2 + 2 + 8]   ; [5...]
+    vpbroadcastw    m1, [r2 + 2 + 10]  ; [6...]
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+
+    vpbroadcastw    m0, [r2 + 2 + 12]  ; [7...]
+    vpbroadcastw    m1, [r2 + 2 + 14]  ; [8...]
+    movu            [r0 + r1 * 2], m0
+    movu            [r0 + r3], m1
+    lea             r0, [r0 + r1 *4]   ; advance dst by 4 rows
+    ; rows 8-11: source words 9..12
+    vpbroadcastw    m0, [r2 + 2 + 16]  ; [9...]
+    vpbroadcastw    m1, [r2 + 2 + 18]  ; [10...]
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+
+    vpbroadcastw    m0, [r2 + 2 + 20]  ; [11...]
+    vpbroadcastw    m1, [r2 + 2 + 22]  ; [12...]
+    movu            [r0 + r1 * 2], m0
+    movu            [r0 + r3], m1
+    lea             r0, [r0 + r1 *4]   ; advance dst by 4 rows
+    ; rows 12-15: source words 13..16
+    vpbroadcastw    m0, [r2 + 2 + 24]  ; [13...]
+    vpbroadcastw    m1, [r2 + 2 + 26]  ; [14...]
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+
+    vpbroadcastw    m0, [r2 + 2 + 28]  ; [15...]
+    vpbroadcastw    m1, [r2 + 2 + 30]  ; [16...]
+    movu            [r0 + r1 * 2], m0
+    movu            [r0 + r3], m1
+    lea             r0, [r0 + r1 *4]   ; advance dst by 4 rows
+    ; rows 16-19: source words 17..20
+    vpbroadcastw    m0, [r2 + 2 + 32]  ; [17...]
+    vpbroadcastw    m1, [r2 + 2 + 34]  ; [18...]
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+
+    vpbroadcastw    m0, [r2 + 2 + 36]  ; [19...]
+    vpbroadcastw    m1, [r2 + 2 + 38]  ; [20...]
+    movu            [r0 + r1 * 2], m0
+    movu            [r0 + r3], m1
+    lea             r0, [r0 + r1 *4]   ; advance dst by 4 rows
+    ; rows 20-23: source words 21..24
+    vpbroadcastw    m0, [r2 + 2 + 40]  ; [21...]
+    vpbroadcastw    m1, [r2 + 2 + 42]  ; [22...]
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+
+    vpbroadcastw    m0, [r2 + 2 + 44]  ; [23...]
+    vpbroadcastw    m1, [r2 + 2 + 46]  ; [24...]
+    movu            [r0 + r1 * 2], m0
+    movu            [r0 + r3], m1
+    lea             r0, [r0 + r1 *4]   ; advance dst by 4 rows
+    ; rows 24-27: source words 25..28
+    vpbroadcastw    m0, [r2 + 2 + 48]  ; [25...]
+    vpbroadcastw    m1, [r2 + 2 + 50]  ; [26...]
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+
+    vpbroadcastw    m0, [r2 + 2 + 52]  ; [27...]
+    vpbroadcastw    m1, [r2 + 2 + 54]  ; [28...]
+    movu            [r0 + r1 * 2], m0
+    movu            [r0 + r3], m1
+    lea             r0, [r0 + r1 *4]   ; advance dst by 4 rows
+    ; rows 28-31: source words 29..32 (last group; r0 is not advanced afterwards)
+    vpbroadcastw    m0, [r2 + 2 + 56]  ; [29...]
+    vpbroadcastw    m1, [r2 + 2 + 58]  ; [30...]
+    movu            [r0], m0
+    movu            [r0 + r1], m1
+
+    vpbroadcastw    m0, [r2 + 2 + 60]  ; [31...]
+    vpbroadcastw    m1, [r2 + 2 + 62]  ; [32...]
+    movu            [r0 + r1 * 2], m0
+    movu            [r0 + r3], m1
+
+    RET
 ;-------------------------------------------------------------------------------------------------------
 ; avx512 code for intra_pred_ang32 mode 2 to 34 end
 ;-------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list