[x265] [PATCH 01 of 13] x86:AVX2 nonpsyRdoQuant primitive for all sizes
vignesh at multicorewareinc.com
vignesh at multicorewareinc.com
Fri Jul 6 11:18:01 CEST 2018
# HG changeset patch
# User Jayashree <jayashree.c at multicorewareinc.com>
# Date 1520395477 -19800
# Wed Mar 07 09:34:37 2018 +0530
# Node ID c9f622347ce51cf90b593e8500ee5a40888c6f29
# Parent f377b028f4a91715372a6241fc80e78a672dbd06
x86:AVX2 nonpsyRdoQuant primitive for all sizes
diff -r f377b028f4a9 -r c9f622347ce5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 05 17:58:06 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Mar 07 09:34:37 2018 +0530
@@ -2310,6 +2310,10 @@
p.integral_inith[INTEGRAL_8] = PFX(integral8h_avx2);
p.integral_inith[INTEGRAL_12] = PFX(integral12h_avx2);
p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2);
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx2);
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
/* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only
p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
@@ -4689,6 +4693,10 @@
p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2);
p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2);
p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2);
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx2);
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
}
if (cpuMask & X265_CPU_AVX512)
diff -r f377b028f4a9 -r c9f622347ce5 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Jul 05 17:58:06 2018 +0530
+++ b/source/common/x86/dct8.asm Wed Mar 07 09:34:37 2018 +0530
@@ -7058,4 +7058,299 @@
movq [r3], xm0
movq [r4], xm1
RET
+
+INIT_YMM avx2 ; args as declared in dct8.h: (int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+cglobal nonPsyRdoQuant4, 5, 9, 16 ; r0 = m_resiDctCoeff, r1 = costUncoded, r2 = totalUncodedCost, r3 = totalRdCost, r4 = blkPos
+ mov r4d, r4m ; blkPos as a 32-bit value; the 32-bit write clears the upper half of r4
+ lea r0, [r0 + 2 * r4] ; r0 = &m_resiDctCoeff[blkPos] (2-byte elements)
+ lea r4, [4 * r4]
+ lea r1, [r1 + 2 * r4] ; r1 = &costUncoded[blkPos] (8-byte elements: 2 * 4 * blkPos)
+ movq xm0, [r2] ; xm0 = current *totalUncodedCost
+ movq xm1, [r3] ; xm1 = current *totalRdCost
+; r5 = shift count: first qword of the table matching the build's BIT_DEPTH
+%if BIT_DEPTH == 12
+ mov r5, [tab_nonpsyRdo12] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5, [tab_nonpsyRdo10]
+%elif BIT_DEPTH == 8
+ mov r5, [tab_nonpsyRdo8]
+%else
+ %error Unsupported BIT_DEPTH!
%endif
+ movq xm2, r5 ; xm2 = shift count operand for vpsllq
+ vpxor m4, m4 ; m4 = running sum of the four rows (4 x int64)
+ vpxor m3, m3 ; m3 = 0, used as the FMA addend and as a zero source below
+ vpxor m13, m13 ; not strictly needed: m13 is fully overwritten by vpmovsxdq before its first use
+; row 0: load exactly 4 words (8 bytes); the xmm form avoids reading past the 4 coefficients that are used
+ vpmovsxwd xm6, [r0] ; 4 x int16 -> 4 x int32
+ vcvtdq2pd m9, xm6 ; 4 x int32 -> 4 x double
+ vfmadd213pd m9, m9, m3 ; m9 = m9 * m9 + 0 (square; exact, since |coef| <= 2^15)
+ vcvtpd2dq xm8, m9 ; 4 x double -> 4 x int32
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13 ; accumulate row cost
+ movu [r1], m13 ; costUncoded row 0 (4 x int64)
+; row 1: coefficients 8 bytes further, costs 32 bytes further
+ vpmovsxwd xm6, [r0 + 8] ; 4 x int16 -> 4 x int32
+ vcvtdq2pd m9, xm6 ; 4 x int32 -> 4 x double
+ vfmadd213pd m9, m9, m3 ; m9 = m9 * m9 + 0
+ vcvtpd2dq xm8, m9 ; 4 x double -> 4 x int32
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13 ; accumulate row cost
+ movu [r1 + 32], m13 ; costUncoded row 1
+; row 2
+ vpmovsxwd xm6, [r0 + 16] ; 4 x int16 -> 4 x int32
+ vcvtdq2pd m9, xm6 ; 4 x int32 -> 4 x double
+ vfmadd213pd m9, m9, m3 ; m9 = m9 * m9 + 0
+ vcvtpd2dq xm8, m9 ; 4 x double -> 4 x int32
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13 ; accumulate row cost
+ movu [r1 + 64], m13 ; costUncoded row 2
+; row 3: the 8-byte load ends exactly at byte 32 of the 4x4 group (the former 16-byte load ran 8 bytes past it)
+ vpmovsxwd xm6, [r0 +24] ; 4 x int16 -> 4 x int32
+ vcvtdq2pd m9, xm6 ; 4 x int32 -> 4 x double
+ vfmadd213pd m9, m9, m3 ; m9 = m9 * m9 + 0
+ vcvtpd2dq xm8, m9 ; 4 x double -> 4 x int32
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13 ; accumulate row cost
+ movu [r1 + 96], m13 ; costUncoded row 3
+
+; horizontal reduction of the 4 qword lanes of m4; xm2 is free now that the last shift is done
+ vextracti128 xm2, m4, 1 ; xm2 = upper two qwords of m4
+ paddq xm4, xm2 ; two partial sums
+ punpckhqdq xm2, xm4, xm3 ; xm2 low qword = high qword of xm4 (xm3 is zero)
+ paddq xm4, xm2 ; low qword of xm4 = sum of all 16 costs
+; add the group total to both running totals
+ paddq xm0, xm4
+ paddq xm1, xm4
+; write back the low qword of each updated total
+ movq [r2], xm0 ; *totalUncodedCost
+ movq [r3], xm1 ; *totalRdCost
+ RET
+
+
+
+INIT_YMM avx2 ; args as declared in dct8.h: (int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+cglobal nonPsyRdoQuant8, 5, 5, 8 ; r0 = m_resiDctCoeff, r1 = costUncoded, r2 = totalUncodedCost, r3 = totalRdCost, r4 = blkPos
+ mov r4d, r4m ; blkPos as a 32-bit value; the 32-bit write clears the upper half of r4
+ lea r0, [r0 + 2 * r4] ; r0 = &m_resiDctCoeff[blkPos] (2-byte elements)
+ lea r4, [4 * r4]
+ lea r1, [r1 + 2 * r4] ; r1 = &costUncoded[blkPos] (8-byte elements: 2 * 4 * blkPos)
+%if BIT_DEPTH == 12
+ mov r4, [tab_nonpsyRdo12 + 8] ; r4 is free again: reuse it for the shift count (qword at byte offset 8 of the table)
+%elif BIT_DEPTH == 10
+ mov r4, [tab_nonpsyRdo10 + 8]
+%elif BIT_DEPTH == 8
+ mov r4, [tab_nonpsyRdo8 + 8]
+%else
+ %error Unsupported BIT_DEPTH!
+ %endif
+ movq xm3, r4 ; xm3 = shift count operand for vpsllq
+ movq xm6, [r2] ; xm6 = current *totalUncodedCost
+ movq xm7, [r3] ; xm7 = current *totalRdCost
+ vpxor m4, m4 ; m4 = running sum of the four rows (4 x int64)
+ vpxor m5, m5 ; m5 = 0, used as the FMA addend and as a zero source below
+ movq xm0, [r0] ; row 0: 4 coefficients (8 bytes)
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1], ym0 ; costUncoded row 0 (4 x int64)
+ vpxor m0, m0 ; clear m0 before the next 8-byte load
+ movq xm0, [r0 +mmsize/2] ; row 1: mmsize/2 = 16 bytes further (mmsize is 32 under INIT_YMM)
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1 +2*mmsize], m0 ; costUncoded row 1: 2*mmsize = 64 bytes further
+ vpxor m0, m0 ; clear m0 before the next 8-byte load
+ movq xm0, [r0 +mmsize] ; row 2 coefficients
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1 +4*mmsize], m0 ; costUncoded row 2
+ vpxor m0, m0 ; clear m0 before the next 8-byte load
+ movq xm0, [r0 +3*mmsize/2] ; row 3 coefficients
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1 +6*mmsize], m0 ; costUncoded row 3
+; horizontal reduction of the 4 qword lanes of m4
+ vextracti128 xm2, m4, 1 ; xm2 = upper two qwords of m4
+ paddq xm4, xm2 ; two partial sums
+ punpckhqdq xm2, xm4, xm5 ; xm2 low qword = high qword of xm4 (xm5 is zero)
+ paddq xm4, xm2 ; low qword of xm4 = sum of all 16 costs
+; add the group total to both running totals
+ paddq xm6, xm4
+ paddq xm7, xm4
+; write back the low qword of each updated total
+ movq [r2], xm6 ; *totalUncodedCost
+ movq [r3], xm7 ; *totalRdCost
+ RET
+INIT_YMM avx2 ; args as declared in dct8.h: (int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+cglobal nonPsyRdoQuant16, 5, 5, 8 ; r0 = m_resiDctCoeff, r1 = costUncoded, r2 = totalUncodedCost, r3 = totalRdCost, r4 = blkPos
+ mov r4d, r4m ; blkPos as a 32-bit value; the 32-bit write clears the upper half of r4
+ lea r0, [r0 + 2 * r4] ; r0 = &m_resiDctCoeff[blkPos] (2-byte elements)
+ lea r4, [4 * r4]
+ lea r1, [r1 + 2 * r4] ; r1 = &costUncoded[blkPos] (8-byte elements: 2 * 4 * blkPos)
+%if BIT_DEPTH == 12
+ mov r4, [tab_nonpsyRdo12 + 16] ; r4 is free again: reuse it for the shift count (qword at byte offset 16 of the table)
+%elif BIT_DEPTH == 10
+ mov r4, [tab_nonpsyRdo10 + 16]
+%elif BIT_DEPTH == 8
+ mov r4, [tab_nonpsyRdo8 + 16]
+%else
+ %error Unsupported BIT_DEPTH!
+ %endif
+ movq xm3, r4 ; xm3 = shift count operand for vpsllq
+ movq xm6, [r2] ; xm6 = current *totalUncodedCost
+ movq xm7, [r3] ; xm7 = current *totalRdCost
+ vpxor m4, m4 ; m4 = running sum of the four rows (4 x int64)
+ vpxor m5, m5 ; m5 = 0, used as the FMA addend and as a zero source below
+; four rows of 4 coefficients; rows are mmsize (32) bytes apart in the input and 4*mmsize (128) bytes apart in costUncoded
+;Row 1, 2
+ movq xm0, [r0] ; row 0: 4 coefficients (8 bytes)
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1], ym0 ; costUncoded row 0 (4 x int64)
+; row 1
+ movq xm0, [r0 +mmsize] ; 4 coefficients, mmsize bytes further
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1+4*mmsize], ym0 ; costUncoded row 1
+; row 2
+ movq xm0, [r0 + 2*mmsize] ; 4 coefficients
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1+8*mmsize], ym0 ; costUncoded row 2
+; row 3
+ movq xm0, [r0 + 3*mmsize] ; 4 coefficients
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1+12*mmsize], ym0 ; costUncoded row 3
+
+; horizontal reduction of the 4 qword lanes of m4
+ vextracti128 xm2, m4, 1 ; xm2 = upper two qwords of m4
+ paddq xm4, xm2 ; two partial sums
+ punpckhqdq xm2, xm4, xm5 ; xm2 low qword = high qword of xm4 (xm5 is zero)
+ paddq xm4, xm2 ; low qword of xm4 = sum of all 16 costs
+; add the group total to both running totals
+ paddq xm6, xm4
+ paddq xm7, xm4
+; write back the low qword of each updated total
+ movq [r2], xm6 ; *totalUncodedCost
+ movq [r3], xm7 ; *totalRdCost
+ RET
+INIT_YMM avx2 ; args as declared in dct8.h: (int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+cglobal nonPsyRdoQuant32, 5, 5, 8 ; r0 = m_resiDctCoeff, r1 = costUncoded, r2 = totalUncodedCost, r3 = totalRdCost, r4 = blkPos
+ mov r4d, r4m ; blkPos as a 32-bit value; the 32-bit write clears the upper half of r4
+ lea r0, [r0 + 2 * r4] ; r0 = &m_resiDctCoeff[blkPos] (2-byte elements)
+ lea r4, [4 * r4]
+ lea r1, [r1 + 2 * r4] ; r1 = &costUncoded[blkPos] (8-byte elements: 2 * 4 * blkPos)
+%if BIT_DEPTH == 12
+ mov r4, [tab_nonpsyRdo12 + 24] ; r4 is free again: reuse it for the shift count (qword at byte offset 24 of the table)
+%elif BIT_DEPTH == 10
+ mov r4, [tab_nonpsyRdo10 + 24]
+%elif BIT_DEPTH == 8
+ mov r4, [tab_nonpsyRdo8 + 24]
+%else
+ %error Unsupported BIT_DEPTH!
+ %endif
+ movq xm3, r4 ; xm3 = shift count operand for vpsllq
+ movq xm6, [r2] ; xm6 = current *totalUncodedCost
+ movq xm7, [r3] ; xm7 = current *totalRdCost
+ vpxor m4, m4 ; m4 = running sum of the four rows (4 x int64)
+ vpxor m5, m5 ; m5 = 0, used as the FMA addend and as a zero source below
+; four rows of 4 coefficients; rows are 2*mmsize (64) bytes apart in the input and 8*mmsize (256) bytes apart in costUncoded
+ movq xm0, [r0] ; row 0: 4 coefficients (8 bytes)
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1], m0 ; costUncoded row 0 (4 x int64)
+ vpxor m0, m0 ; clear m0 before the next 8-byte load
+; row 1
+ movq xm0, [r0 +2*mmsize] ; 4 coefficients, 2*mmsize bytes further
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1 + 8*mmsize], m0 ; costUncoded row 1
+ vpxor m0, m0 ; clear m0 before the next 8-byte load
+; row 2
+ movq xm0, [r0 +4*mmsize] ; 4 coefficients
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1 +16*mmsize], m0 ; costUncoded row 2
+ vpxor m0, m0 ; clear m0 before the next 8-byte load
+; row 3
+ movq xm0, [r0 +6*mmsize] ; 4 coefficients
+ vpmovsxwd m1, xm0 ; int16 -> int32; only the low 4 dwords are used
+ vcvtdq2pd m2, xm1 ; Convert 4 packed 32-bit integers to packed double-precision floating-point elements
+ vfmadd213pd m2, m2, m5 ; Fused multiply-add: m2 = m2 * m2 + m5 (m5 is zero), i.e. the square of each coefficient
+ vcvtpd2dq xm1, m2 ; 4 x double -> 4 x int32
+ vpmovsxdq m0 , xm1 ; int32 -> int64
+ vpsllq m0, xm3 ; costUncoded
+ paddq m4, m0 ; accumulate row cost
+ movu [r1 +24*mmsize], m0 ; costUncoded row 3
+; horizontal reduction of the 4 qword lanes of m4
+ vextracti128 xm2, m4, 1 ; xm2 = upper two qwords of m4
+ paddq xm4, xm2 ; two partial sums
+ punpckhqdq xm2, xm4, xm5 ; xm2 low qword = high qword of xm4 (xm5 is zero)
+ paddq xm4, xm2 ; low qword of xm4 = sum of all 16 costs
+; add the group total to both running totals
+ paddq xm6, xm4
+ paddq xm7, xm4
+; write back the low qword of each updated total
+ movq [r2], xm6 ; *totalUncodedCost
+ movq [r3], xm7 ; *totalRdCost
+ RET
+%endif
diff -r f377b028f4a9 -r c9f622347ce5 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Thu Jul 05 17:58:06 2018 +0530
+++ b/source/common/x86/dct8.h Wed Mar 07 09:34:37 2018 +0530
@@ -36,6 +36,7 @@
FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
+FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx2, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265-01.patch
Type: text/x-patch
Size: 16662 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20180706/e59eb52b/attachment-0001.bin>
More information about the x265-devel
mailing list