[x265] [PATCH 260 of 307] x86: AVX512 optimise intra_pred_dc_32 for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:18 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1513077484 -19800
# Tue Dec 12 16:48:04 2017 +0530
# Node ID 42fe321e5cdf9ad260e4e5c7a64137a8b7601915
# Parent d6873e0a0786cd732304a94812a28914978113e3
x86: AVX512 optimise intra_pred_dc_32 for high bit depth
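Replace the phaddd instruction with a vpsrldq + paddd pair, and accumulate in m0 instead of m16 so that only 2 vector registers are declared.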
diff -r d6873e0a0786 -r 42fe321e5cdf source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Dec 11 17:13:36 2017 +0530
+++ b/source/common/x86/intrapred16.asm Tue Dec 12 16:48:04 2017 +0530
@@ -688,26 +688,25 @@
movu [r0 + r2 * 1 + 0], m0
movu [r0 + r2 * 1 + mmsize], m0
RET
-
INIT_ZMM avx512
-cglobal intra_pred_dc32, 3,3,17
+cglobal intra_pred_dc32, 3,3,2
add r2, 2
add r1d, r1d
- movu m16, [r2]
+ movu m0, [r2]
movu m1, [r2 + 2 * mmsize]
- paddw m16, m1
- vextracti32x8 ym1, m16, 1
- paddw ym16, ym1
- vextracti32x4 xm1, m16, 1
- paddw xm16, xm1
- pmaddwd xm16, [pw_1]
- movhlps xm1, xm16
- paddd xm16, xm1
- phaddd xm16, xm16
- paddd xm16, [pd_32] ; sum = sum + 32
- psrld xm16, 6 ; sum = sum / 64
- vpbroadcastw m0, xm16
-
+ paddw m0, m1
+ vextracti32x8 ym1, m0, 1
+ paddw ym0, ym1
+ vextracti32x4 xm1, m0, 1
+ paddw xm0, xm1
+ pmaddwd xm0, [pw_1]
+ movhlps xm1, xm0
+ paddd xm0, xm1
+ vpsrldq xm1, xm0, 4
+ paddd xm0, xm1
+ paddd xm0, [pd_32] ; sum = sum + 32
+ psrld xm0, 6 ; sum = sum / 64
+ vpbroadcastw m0, xm0
lea r2, [r1 * 3]
; store DC 32x32
movu [r0 + r1 * 0 + 0], m0
More information about the x265-devel
mailing list