[x265] [PATCH 083 of 307] [x265-avx512]x86: AVX512 weight_pp
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:21 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1502442378 -19800
# Fri Aug 11 14:36:18 2017 +0530
# Node ID 3d8c45642752803c560891fdfbe0a8b5c03ca76a
# Parent b30539ebe5c9b2d9412d3a39458a90a7574ac744
[x265-avx512]x86: AVX512 weight_pp
BitDepth | AVX2 performance | AVX512 performance
------------------------------------------------
8 | 6.23x | 10.60x
10 | 9.43x | 14.59x
diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Aug 14 17:19:48 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Aug 11 14:36:18 2017 +0530
@@ -2322,6 +2322,7 @@
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+ p.weight_pp = PFX(weight_pp_avx512);
}
}
@@ -4026,6 +4027,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
+ p.weight_pp = PFX(weight_pp_avx512);
}
#endif
diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Aug 14 17:19:48 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Aug 11 14:36:18 2017 +0530
@@ -1662,6 +1662,116 @@
jnz .loopH
RET
%endif
+
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+cglobal weight_pp, 6, 7, 7 ; HIGH_BIT_DEPTH AVX-512 weight_pp: per 16-bit sample, ((src * w0 + round) >> shift) + offset, clamped to [0, pw_pixel_max]; 64 bytes per iteration
+%define correction (14 - BIT_DEPTH)
+ mov r6d, r6m ; r6d = 7th argument (round, which the C callers pass as round << correction)
+ shl r6d, 16 - correction ; place round in the high word; shifting by 16 - correction cancels the caller's << correction
+ or r6d, r5d ; low word = r5d (6th argument, w0); assumes w0 fits in the low 16 bits
+
+ movd xm0, r6d
+ vpbroadcastd m0, xm0 ; m0 = (w0, round) word pair replicated in every dword, the pmaddwd multiplier
+ mov r5d, r7m ; r5d = 8th argument (shift, which the C callers pass as shift + correction)
+ sub r5d, correction ; remove the caller's + correction from the shift
+ movd xm1, r5d ; xm1 = shift count used by psrad
+
+ vpbroadcastd m2, r8m ; m2 = 9th argument (offset) replicated in every dword
+ vbroadcasti32x8 m5, [pw_1] ; m5 = pw_1 constant (presumably packed 16-bit ones - confirm in the constants table)
+ vbroadcasti32x8 m6, [pw_pixel_max] ; m6 = pw_pixel_max constant, the upper clamp applied by pminuw below
+
+ add r2d, r2d ; r2d = stride * 2: samples are handled as 16-bit words in this branch
+ add r3d, r3d ; r3d = width * 2 (row width in bytes)
+ sub r2d, r3d ; r2 = byte gap from the end of one processed row to the start of the next
+ shr r3d, 6 ; r3d = number of 64-byte blocks per row; must be non-zero, the dec/jnz loop would wrap on 0
+
+.loopH: ; outer loop: one iteration per row, counted down in r4d (height)
+ mov r5d, r3d ; r5d = blocks left in this row
+
+.loopW: ; inner loop: one 64-byte block (32 words) per iteration
+ movu m4, [r0] ; unaligned load of 32 source words
+ punpcklwd m3, m4, m5 ; interleave the low 4 words of each 128-bit lane with m5 -> (src, pw_1) word pairs
+ pmaddwd m3, m0 ; per dword: src * w0 + pw_1 * round (signed 16-bit multiplies)
+ psrad m3, xm1 ; arithmetic shift right by the corrected shift
+ paddd m3, m2 ; add offset
+
+ punpckhwd m4, m5 ; same computation for the high 4 words of each 128-bit lane
+ pmaddwd m4, m0
+ psrad m4, xm1
+ paddd m4, m2
+
+ packusdw m3, m4 ; pack dwords to words with unsigned saturation; lane-wise, so it restores the original word order
+ pminuw m3, m6 ; clamp to pw_pixel_max
+ movu [r1], m3 ; unaligned store of 32 result words
+
+ add r0, 64 ; advance src by one block
+ add r1, 64 ; advance dst by one block
+
+ dec r5d
+ jnz .loopW
+
+ lea r0, [r0 + r2] ; skip the rest of the row; the same gap is used for src and dst, so both share one stride
+ lea r1, [r1 + r2]
+
+ dec r4d ; one row done
+ jnz .loopH
+%undef correction
+ RET
+%else
+INIT_ZMM avx512
+cglobal weight_pp, 6, 7, 6 ; 8-bit AVX-512 weight_pp: per byte sample, (((src * (w0 << 6)) + round) >> shift) + offset, saturated to [0, 255]; 32 bytes per iteration
+
+ shl r5d, 6 ; scale w0 (6th argument) by 64 - NOTE(review): presumably matches the correction of 6 the callers fold into round and shift for 8-bit; confirm
+ mov r6d, r6m ; r6d = 7th argument (round)
+ shl r6d, 16 ; place round in the high word
+ or r6d, r5d ; low word = scaled w0; assumes w0 << 6 fits in the low 16 bits
+
+ movd xm0, r6d
+ vpbroadcastd m0, xm0 ; m0 = (w0 << 6, round) word pair replicated in every dword, the pmaddwd multiplier
+ movd xm1, r7m ; xm1 = 8th argument (shift), used as the psrad count
+ vpbroadcastd m2, r8m ; m2 = 9th argument (offset) replicated in every dword
+
+ vbroadcasti32x8 m5, [pw_1] ; m5 = pw_1 constant (presumably packed 16-bit ones - confirm in the constants table)
+
+ sub r2d, r3d ; r2 = stride - width: byte gap from the end of one processed row to the start of the next
+ shr r3d, 5 ; r3d = number of 32-byte blocks per row; must be non-zero, the dec/jnz loop would wrap on 0
+
+.loopH: ; outer loop: one iteration per row, counted down in r4d (height)
+ mov r5d, r3d ; r5d = blocks left in this row
+
+.loopW: ; inner loop: one 32-byte block per iteration
+ pmovzxbw m4, [r0] ; load 32 source bytes and zero-extend them to 32 words
+ punpcklwd m3, m4, m5 ; interleave the low 4 words of each 128-bit lane with m5 -> (src, pw_1) word pairs
+ pmaddwd m3, m0 ; per dword: src * (w0 << 6) + pw_1 * round
+ psrad m3, xm1 ; arithmetic shift right by shift
+ paddd m3, m2 ; add offset
+
+ punpckhwd m4, m5 ; same computation for the high 4 words of each 128-bit lane
+ pmaddwd m4, m0
+ psrad m4, xm1
+ paddd m4, m2
+
+ packssdw m3, m4 ; pack dwords to words with signed saturation; lane-wise, so it restores the original word order
+ vextracti64x4 ym4, m3, 1 ; ym4 = upper 256 bits of m3 (words 16..31)
+ packuswb ym3, ym4 ; pack words to bytes with unsigned saturation; lane-wise, so the qwords come out as 0-7, 16-23, 8-15, 24-31
+ vpermq ym3, ym3, q3120 ; swap the two middle qwords to put the 32 bytes back in linear order
+ movu [r1], ym3 ; unaligned store of 32 result bytes
+
+ add r0, 32 ; advance src by one block
+ add r1, 32 ; advance dst by one block
+
+ dec r5d
+ jnz .loopW
+
+ lea r0, [r0 + r2] ; skip the rest of the row; the same gap is used for src and dst, so both share one stride
+ lea r1, [r1 + r2]
+
+ dec r4d ; one row done
+ jnz .loopH
+ RET
+%endif
+
;-------------------------------------------------------------------------------------------------------------------------------------------------
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
diff -r b30539ebe5c9 -r 3d8c45642752 source/encoder/reference.cpp
--- a/source/encoder/reference.cpp Mon Aug 14 17:19:48 2017 +0530
+++ b/source/encoder/reference.cpp Fri Aug 11 14:36:18 2017 +0530
@@ -155,12 +155,10 @@
const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
-
// Computing weighted CU rows
int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
- int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths
+ int padwidth = (width + 31) & ~31; // weightp assembly needs even 32 byte widths
primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
-
// Extending Left & Right
primitives.extendRowBorder(dst, stride, width, height, marginX);
diff -r b30539ebe5c9 -r 3d8c45642752 source/encoder/weightPrediction.cpp
--- a/source/encoder/weightPrediction.cpp Mon Aug 14 17:19:48 2017 +0530
+++ b/source/encoder/weightPrediction.cpp Fri Aug 11 14:36:18 2017 +0530
@@ -184,8 +184,7 @@
int denom = w->log2WeightDenom;
int round = denom ? 1 << (denom - 1) : 0;
int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
- int pwidth = ((width + 15) >> 4) << 4;
-
+ int pwidth = ((width + 31) >> 5) << 5;
primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
weight, round << correction, denom + correction, offset);
ref = weightTemp;
diff -r b30539ebe5c9 -r 3d8c45642752 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Aug 14 17:19:48 2017 +0530
+++ b/source/test/pixelharness.cpp Fri Aug 11 14:36:18 2017 +0530
@@ -291,6 +291,9 @@
memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
int j = 0;
int width = 16 * (rand() % 4 + 1);
+ int cpuid = X265_NS::cpu_detect();
+ if (cpuid & X265_CPU_AVX512)
+ width = 32 * (rand() % 2 + 1);
int height = 8;
int w0 = rand() % 128;
int shift = rand() % 8; // maximum is 7, see setFromWeightAndOffset()
More information about the x265-devel
mailing list