[x265] [PATCH 260 of 307] x86: AVX512 optimise intra_pred_dc_32 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:18 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1513077484 -19800
#      Tue Dec 12 16:48:04 2017 +0530
# Node ID 42fe321e5cdf9ad260e4e5c7a64137a8b7601915
# Parent  d6873e0a0786cd732304a94812a28914978113e3
x86: AVX512 optimise intra_pred_dc_32 for high bit depth

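Remove the use of the phaddd instruction: the final horizontal add is now done with vpsrldq + paddd. The sum is also accumulated in m0 instead of m16, so the function declares 2 vector registers instead of 17.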

diff -r d6873e0a0786 -r 42fe321e5cdf source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Mon Dec 11 17:13:36 2017 +0530
+++ b/source/common/x86/intrapred16.asm	Tue Dec 12 16:48:04 2017 +0530
@@ -688,26 +688,25 @@
     movu            [r0 + r2 * 1 +  0], m0
     movu            [r0 + r2 * 1 + mmsize], m0
     RET
-
 INIT_ZMM avx512
-cglobal intra_pred_dc32, 3,3,17
+cglobal intra_pred_dc32, 3,3,2
     add              r2, 2
     add             r1d, r1d
-    movu             m16, [r2]
+    movu             m0, [r2]
     movu             m1, [r2 + 2 * mmsize]
-    paddw            m16, m1
-    vextracti32x8   ym1, m16, 1
-    paddw           ym16, ym1
-    vextracti32x4   xm1, m16, 1
-    paddw           xm16, xm1
-    pmaddwd         xm16, [pw_1]
-    movhlps         xm1, xm16
-    paddd           xm16, xm1
-    phaddd          xm16, xm16
-    paddd           xm16, [pd_32]                        ; sum = sum + 32
-    psrld           xm16, 6                              ; sum = sum / 64
-    vpbroadcastw     m0, xm16
-
+    paddw            m0, m1
+    vextracti32x8   ym1, m0, 1
+    paddw           ym0, ym1
+    vextracti32x4   xm1, m0, 1
+    paddw           xm0, xm1
+    pmaddwd         xm0, [pw_1]
+    movhlps         xm1, xm0
+    paddd           xm0, xm1
+    vpsrldq         xm1, xm0, 4
+    paddd           xm0, xm1
+    paddd           xm0, [pd_32]                        ; sum = sum + 32
+    psrld           xm0, 6                              ; sum = sum / 64
+    vpbroadcastw     m0, xm0
     lea              r2, [r1 * 3]
     ; store DC 32x32
     movu            [r0 + r1 * 0 +  0], m0


More information about the x265-devel mailing list