[x265] [PATCH x265] Add AVX2 assembly code for normFactor primitive.
Akil
akil at multicorewareinc.com
Tue Mar 5 05:34:22 CET 2019
# HG changeset patch
# User Akil Ayyappan<akil at multicorewareinc.com>
# Date 1551693998 -19800
# Mon Mar 04 15:36:38 2019 +0530
# Node ID 19f27e0c8a6f8250af5e2f7c7984dc3b57bea7e4
# Parent d12a4caf7963fd47d646040689ad5f02754ad879
x86: normFactor primitive
This patch adds AVX2 assembly for this primitive.
|---------|-----------|-----------------|-----------------|
| Size |Performance|AVX2 clock cycles|CPP clock cycles |
|---------|-----------|-----------------|-----------------|
| [8x8] | 7.65x | 312.90 | 2394.83 |
| [16x16] | 8.42x | 1157.14 | 9741.56 |
| [32x32] | 9.56x | 3942.18 | 37692.20 |
| [64x64] | 8.96x | 15388.24 | 137889.28 |
|---------|-----------|-----------------|-----------------|
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/pixel.cpp
--- a/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/pixel.cpp Mon Mar 04 15:36:38 2019 +0530
@@ -959,6 +959,19 @@
}
}
+/* C reference for the normFact primitive: writes to *z_k the sum, over a
+ * blockSize x blockSize block whose rows are blockSize pixels apart, of
+ * (src[i] >> shift) squared. */
+static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+{
+    *z_k = 0;
+    for (uint32_t row = 0; row < blockSize; ++row)
+    {
+        const uint32_t rowStart = row * blockSize;
+        for (uint32_t col = 0; col < blockSize; ++col)
+        {
+            const uint32_t sample = src[rowStart + col] >> shift;
+            const uint32_t squared = sample * sample; /* product is formed in 32 bits, then widened */
+            *z_k += squared;
+        }
+    }
+}
+
#if HIGH_BIT_DEPTH
static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int
height, uint64_t *outsum,
const pixel minPix, const pixel maxPix)
@@ -1314,5 +1327,10 @@
p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>;
p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>;
p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>;
+
+ p.cu[BLOCK_8x8].normFact = normFact_c;
+ p.cu[BLOCK_16x16].normFact = normFact_c;
+ p.cu[BLOCK_32x32].normFact = normFact_c;
+ p.cu[BLOCK_64x64].normFact = normFact_c;
}
}
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/primitives.h
--- a/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/primitives.h Mon Mar 04 15:36:38 2019 +0530
@@ -228,6 +228,7 @@
typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t
*costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t
blkPos);
typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t
*m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
*totalRdCost, int64_t *psyScale, uint32_t blkPos);
typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const
pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t
*ac_k);
+typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int
shift, uint64_t *z_k);
/* Function pointers to optimized encoder primitives. Each pointer can
reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function
*/
struct EncoderPrimitives
@@ -305,6 +306,7 @@
psyRdoQuant_t1 psyRdoQuant_1p;
psyRdoQuant_t2 psyRdoQuant_2p;
ssimDistortion_t ssimDist;
+ normFactor_t normFact;
}
cu[NUM_CU_SIZES];
/* These remaining primitives work on either fixed block sizes or take
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Mar 04 15:36:38 2019 +0530
@@ -2325,6 +2325,11 @@
p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);
+ p.cu[BLOCK_8x8].normFact = PFX(normFact8_avx2);
+ p.cu[BLOCK_16x16].normFact = PFX(normFact16_avx2);
+ p.cu[BLOCK_32x32].normFact = PFX(normFact32_avx2);
+ p.cu[BLOCK_64x64].normFact = PFX(normFact64_avx2);
+
/* TODO: This kernel needs to be modified to work with
HIGH_BIT_DEPTH only
p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
@@ -4718,6 +4723,11 @@
p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);
+ p.cu[BLOCK_8x8].normFact = PFX(normFact8_avx2);
+ p.cu[BLOCK_16x16].normFact = PFX(normFact16_avx2);
+ p.cu[BLOCK_32x32].normFact = PFX(normFact32_avx2);
+ p.cu[BLOCK_64x64].normFact = PFX(normFact64_avx2);
+
}
if (cpuMask & X265_CPU_AVX512)
{
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/x86/pixel-a.asm Mon Mar 04 15:36:38 2019 +0530
@@ -388,6 +388,16 @@
vpaddq m7, m6
%endmacro
+; NORM_FACT_COL src
+;   Shifts every dword of <src> right by SSIMRD_SHIFT, squares it, and adds the
+;   squares into the qword lanes of m3. Clobbers m1 and m2; <src> is not modified.
+;   NOTE(review): the shift amount is the assemble-time constant SSIMRD_SHIFT, not
+;   the runtime 'shift' argument of the callers - confirm the two always agree.
+%macro NORM_FACT_COL 1
+ vpsrld m1, %1, SSIMRD_SHIFT ; m1 = src >> SSIMRD_SHIFT, per dword
+ vpmuldq m2, m1, m1 ; m2 = squares of the even-indexed dwords, as qwords
+ vpsrldq m1, m1, 4 ; byte-shift each 128-bit lane by 4: odd dwords move into even slots
+ vpmuldq m1, m1, m1 ; m1 = squares of the originally odd-indexed dwords
+
+ vpaddq m1, m2 ; per-qword sum of even and odd squares
+ vpaddq m3, m1 ; accumulate into the running total
+%endmacro
+
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
@@ -16303,3 +16313,266 @@
movq [r4], xm4
movq [r6], xm7
RET
+
+
+;static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+;{
+; *z_k = 0;
+; for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
+; {
+; for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
+; {
+; uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
+; *z_k += temp * temp;
+; }
+; }
+;}
+;--------------------------------------------------------------------------------------
+; void normFact{8,16,32,64}(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+;--------------------------------------------------------------------------------------
+; normFact8: stores to *z_k the sum, over 8 rows of 8 pixels, of the squared
+; right-shifted pixel values (shift applied inside NORM_FACT_COL).
+;   r0 = src, r1 = blockSize (used as the row pitch in pixels), r3 = z_k.
+;   NOTE(review): r2 ('shift') is never read; the shift comes from SSIMRD_SHIFT.
+;   NOTE(review): r1 comes from a uint32_t argument but is used at full register
+;   width in the address update below - confirm its upper bits are zero.
+INIT_YMM avx2
+cglobal normFact8, 4, 5, 6
+ mov r4d, 8 ; r4d = rows remaining
+ vpxor m3, m3 ;z_k accumulator (qword partial sums)
+ vpxor m5, m5 ; zero, used by the final horizontal add
+.row:
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0] ;src: 8 x 16-bit pixels zero-extended to dwords
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0] ; 8 x 8-bit pixels zero-extended to dwords
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+%if HIGH_BIT_DEPTH
+ lea r0, [r0 + 2 * r1] ; next row: r1 pixels, 2 bytes each
+%else
+ lea r0, [r0 + r1] ; next row: r1 pixels, 1 byte each
+%endif
+ dec r4d
+ jnz .row
+ vextracti128 xm4, m3, 1 ; xm4 = upper 128 bits of the accumulator
+ vpaddq xm3, xm4 ; fold upper half into lower half
+ punpckhqdq xm2, xm3, xm5 ; xm2 low qword = high qword of xm3 (rest zero from xm5)
+ paddq xm3, xm2 ; low qword of xm3 = grand total
+ movq [r3], xm3 ; *z_k = total
+ RET
+
+
+; normFact16: stores to *z_k the sum, over 16 rows of 16 pixels, of the squared
+; right-shifted pixel values (shift applied inside NORM_FACT_COL).
+;   r0 = src, r1 = blockSize (used as the row pitch in pixels), r3 = z_k.
+;   NOTE(review): r2 ('shift') is never read; the shift comes from SSIMRD_SHIFT.
+;   NOTE(review): r1 comes from a uint32_t argument but is used at full register
+;   width in the address update below - confirm its upper bits are zero.
+INIT_YMM avx2
+cglobal normFact16, 4, 5, 6
+ mov r4d, 16 ; r4d = rows remaining
+ vpxor m3, m3 ;z_k accumulator (qword partial sums)
+ vpxor m5, m5 ; zero, used by the final horizontal add
+.row:
+;Col 1-8
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0] ;src: 8 x 16-bit pixels zero-extended to dwords
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0] ; 8 x 8-bit pixels zero-extended to dwords
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 9-16
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 16] ;src: next 8 pixels (byte offset 8 * 2)
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 8] ; next 8 pixels (byte offset 8 * 1)
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+%if HIGH_BIT_DEPTH
+ lea r0, [r0 + 2 * r1] ; next row: r1 pixels, 2 bytes each
+%else
+ lea r0, [r0 + r1] ; next row: r1 pixels, 1 byte each
+%endif
+ dec r4d
+ jnz .row
+ vextracti128 xm4, m3, 1 ; xm4 = upper 128 bits of the accumulator
+ vpaddq xm3, xm4 ; fold upper half into lower half
+ punpckhqdq xm2, xm3, xm5 ; xm2 low qword = high qword of xm3 (rest zero from xm5)
+ paddq xm3, xm2 ; low qword of xm3 = grand total
+ movq [r3], xm3 ; *z_k = total
+ RET
+
+
+; normFact32: stores to *z_k the sum, over 32 rows of 32 pixels, of the squared
+; right-shifted pixel values (shift applied inside NORM_FACT_COL).
+;   r0 = src, r1 = blockSize (used as the row pitch in pixels), r3 = z_k.
+;   Each row is consumed as four groups of 8 pixels.
+;   NOTE(review): r2 ('shift') is never read; the shift comes from SSIMRD_SHIFT.
+;   NOTE(review): r1 comes from a uint32_t argument but is used at full register
+;   width in the address update below - confirm its upper bits are zero.
+INIT_YMM avx2
+cglobal normFact32, 4, 5, 6
+ mov r4d, 32 ; r4d = rows remaining
+ vpxor m3, m3 ;z_k accumulator (qword partial sums)
+ vpxor m5, m5 ; zero, used by the final horizontal add
+.row:
+;Col 1-8
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0] ;src: 8 x 16-bit pixels zero-extended to dwords
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0] ; 8 x 8-bit pixels zero-extended to dwords
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 9-16
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 16] ;src: pixels 8..15
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 8] ; pixels 8..15
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 17-24
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 32] ;src: pixels 16..23
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 16] ; pixels 16..23
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 25-32
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 48] ;src: pixels 24..31
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 24] ; pixels 24..31
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+%if HIGH_BIT_DEPTH
+ lea r0, [r0 + 2 * r1] ; next row: r1 pixels, 2 bytes each
+%else
+ lea r0, [r0 + r1] ; next row: r1 pixels, 1 byte each
+%endif
+ dec r4d
+ jnz .row
+ vextracti128 xm4, m3, 1 ; xm4 = upper 128 bits of the accumulator
+ vpaddq xm3, xm4 ; fold upper half into lower half
+ punpckhqdq xm2, xm3, xm5 ; xm2 low qword = high qword of xm3 (rest zero from xm5)
+ paddq xm3, xm2 ; low qword of xm3 = grand total
+ movq [r3], xm3 ; *z_k = total
+ RET
+
+
+; normFact64: stores to *z_k the sum, over 64 rows of 64 pixels, of the squared
+; right-shifted pixel values (shift applied inside NORM_FACT_COL).
+;   r0 = src, r1 = blockSize (used as the row pitch in pixels), r3 = z_k.
+;   Each row is consumed as eight groups of 8 pixels.
+;   NOTE(review): r2 ('shift') is never read; the shift comes from SSIMRD_SHIFT.
+;   NOTE(review): r1 comes from a uint32_t argument but is used at full register
+;   width in the address update below - confirm its upper bits are zero.
+INIT_YMM avx2
+cglobal normFact64, 4, 5, 6
+ mov r4d, 64 ; r4d = rows remaining
+ vpxor m3, m3 ;z_k accumulator (qword partial sums)
+ vpxor m5, m5 ; zero, used by the final horizontal add
+.row:
+;Col 1-8
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0] ;src: 8 x 16-bit pixels zero-extended to dwords
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0] ; 8 x 8-bit pixels zero-extended to dwords
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 9-16
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 16] ;src: pixels 8..15
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 8] ; pixels 8..15
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 17-24
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 32] ;src: pixels 16..23
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 16] ; pixels 16..23
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 25-32
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 48] ;src: pixels 24..31
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 24] ; pixels 24..31
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 33-40
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 64] ;src: pixels 32..39
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 32] ; pixels 32..39
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 41-48
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 80] ;src: pixels 40..47
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 40] ; pixels 40..47
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 49-56
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 96] ;src: pixels 48..55
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 48] ; pixels 48..55
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+;Col 57-64
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 112] ;src: pixels 56..63
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0 + 56] ; pixels 56..63
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ NORM_FACT_COL m0 ; m3 += squares of the shifted pixels
+
+%if HIGH_BIT_DEPTH
+ lea r0, [r0 + 2 * r1] ; next row: r1 pixels, 2 bytes each
+%else
+ lea r0, [r0 + r1] ; next row: r1 pixels, 1 byte each
+%endif
+ dec r4d
+ jnz .row
+ vextracti128 xm4, m3, 1 ; xm4 = upper 128 bits of the accumulator
+ vpaddq xm3, xm4 ; fold upper half into lower half
+ punpckhqdq xm2, xm3, xm5 ; xm2 low qword = high qword of xm3 (rest zero from xm5)
+ paddq xm3, xm2 ; low qword of xm3 = grand total
+ movq [r3], xm3 ; *z_k = total
+ RET
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/x86/pixel.h Mon Mar 04 15:36:38 2019 +0530
@@ -61,7 +61,8 @@
FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t
sstride, const pixel* recon, intptr_t rstride); \
FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t
sstride, const int16_t* recon, intptr_t rstride); \
- FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t
fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int
shift, uint64_t *ac_k)
+ FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t
fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int
shift, uint64_t *ac_k); \
+ FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t
blockSize, int shift, uint64_t *z_k)
DECL_PIXELS(mmx);
DECL_PIXELS(mmx2);
diff -r d12a4caf7963 -r 19f27e0c8a6f source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp Wed Feb 27 12:35:02 2019 +0530
+++ b/source/encoder/analysis.cpp Mon Mar 04 15:36:38 2019 +0530
@@ -3696,14 +3696,8 @@
// 2. Calculate ac component
uint64_t z_k = 0;
- for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
- {
- for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
- {
- uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
- z_k += temp * temp;
- }
- }
+ int block = (int)((log(blockSize) / log(2)) - 2);
+ primitives.cu[block].normFact(src, blockSize, shift, &z_k);
// Remove the DC part
z_k -= z_o;
diff -r d12a4caf7963 -r 19f27e0c8a6f source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Feb 27 12:35:02 2019 +0530
+++ b/source/test/pixelharness.cpp Mon Mar 04 15:36:38 2019 +0530
@@ -2296,6 +2296,30 @@
return true;
}
+// Checks the optimized normFact primitive against the reference for the CU size
+// index 'block' (block edge is 4 << block). Both are run ITERS times on
+// pixel_test_buff at an offset that grows by INCR per iteration; returns false on
+// the first iteration whose 64-bit results differ, true otherwise.
+bool PixelHarness::check_normFact(normFactor_t ref, normFactor_t opt, int
block)
+{
+ int shift = X265_DEPTH - 8; // same shift value is handed to ref and opt
+ uint64_t opt_dest = 0, ref_dest = 0;
+ int j = 0; // running pixel offset into the selected test buffer
+ int blockSize = 4 << block;
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int index = i % TEST_CASES; // cycle through the test buffers
+ ref(pixel_test_buff[index] + j, blockSize, shift, &ref_dest);
+ opt(pixel_test_buff[index] + j, blockSize, shift, &opt_dest);
+
+ if (opt_dest != ref_dest)
+ {
+ return false;
+ }
+
+ reportfail() // harness macro defined outside this view
+ j += INCR;
+ }
+ return true;
+}
+
bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const
EncoderPrimitives& opt)
{
if (opt.pu[part].satd)
@@ -3129,6 +3153,18 @@
}
}
+ for (int i = BLOCK_8x8; i < NUM_CU_SIZES; i++)
+ {
+ if (opt.cu[i].normFact)
+ {
+ if (!check_normFact(ref.cu[i].normFact, opt.cu[i].normFact, i))
+ {
+ printf("\nnormFact[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -3769,4 +3805,16 @@
REPORT_SPEEDUP(opt.integral_inith[k], ref.integral_inith[k],
dst_buf, pbuf1, STRIDE);
}
}
+
+ for (int i = BLOCK_8x8; i < NUM_CU_SIZES; i++)
+ {
+ if (opt.cu[i].normFact)
+ {
+ uint64_t dst = 0;
+ int blockSize = 4 << i;
+ int shift = X265_DEPTH - 8;
+ printf("normFact[%dx%d]", blockSize, blockSize);
+ REPORT_SPEEDUP(opt.cu[i].normFact, ref.cu[i].normFact,
pixel_test_buff[0], blockSize, shift, &dst);
+ }
+ }
}
diff -r d12a4caf7963 -r 19f27e0c8a6f source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Feb 27 12:35:02 2019 +0530
+++ b/source/test/pixelharness.h Mon Mar 04 15:36:38 2019 +0530
@@ -137,6 +137,7 @@
bool check_integral_initv(integralv_t ref, integralv_t opt);
bool check_integral_inith(integralh_t ref, integralh_t opt);
bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
+ bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
public:
--
*Regards,*
*Akil R*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190305/add805be/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: normFactor_avx2.patch
Type: application/octet-stream
Size: 16207 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190305/add805be/attachment-0001.obj>
More information about the x265-devel
mailing list