[x265] [PATCH x265] Add AVX2 assembly code for normFactor primitive.
chen
chenm003 at 163.com
Thu Mar 7 11:10:11 CET 2019
Just say it works.
First of all,
the expected algorithm is the square of (x >> shift).
That is an 8-bit value (I assume we are talking about 8bpp; 16bpp is similar) multiplied by an 8-bit value, so the result is 16 bits.
The function works at the CU level, so blockSize is at most 64, i.e. 6 bits.
So the maximum dynamic range is 16+6+6 = 28 bits.
Therefore, a uint64_t output is unnecessary in 8bpp mode.
Moreover, PMOVZXBD+VPMULDQ can be replaced by PMOVZXBW+PMADDWD (please remember that PMADDUBSW treats only one of its inputs as unsigned);
this may increase processing throughput by 3~4 times.
I don't know why VPMULLD was not used; it would almost double the performance.
Further, the extra VPSRLDQ is needed only because VPMULDQ was chosen:
+ vpmuldq m2, m1, m1
+ vpsrldq m1, m1, 4
+ vpmuldq m1, m1, m1
Regards,
Min
At 2019-03-07 17:36:19, "Dinesh Kumar Reddy" <dinesh at multicorewareinc.com> wrote:
+static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+{
+ *z_k = 0;
+ for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
+ {
+ for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
+ {
+ uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
+ *z_k += temp * temp;
+ }
+ }
+}
+
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/x86/pixel-a.asm	Mon Mar 04 15:36:38 2019 +0530
@@ -388,6 +388,16 @@
vpaddq m7, m6
%endmacro
+%macro NORM_FACT_COL 1
+ vpsrld m1, m0, SSIMRD_SHIFT
+ vpmuldq m2, m1, m1
+ vpsrldq m1, m1, 4
+ vpmuldq m1, m1, m1
+
+ vpaddq m1, m2
+ vpaddq m3, m1
+%endmacro
+
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
@@ -16303,3 +16313,266 @@
movq [r4], xm4
movq [r6], xm7
RET
+
+
+;static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+;{
+; *z_k = 0;
+; for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
+; {
+; for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
+; {
+; uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
+; *z_k += temp * temp;
+; }
+; }
+;}
+;--------------------------------------------------------------------------------------
+; void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal normFact8, 4, 5, 6
+ mov r4d, 8
+ vpxor m3, m3 ;z_k
+ vpxor m5, m5
+.row:
+%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0] ;src
+%elif BIT_DEPTH == 8
+ vpmovzxbd m0, [r0]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190307/1b7e8c17/attachment.html>
More information about the x265-devel
mailing list