[x265] [PATCH x265] Add AVX2 assembly code for normFactor primitive.

chen chenm003 at 163.com
Thu Mar 7 11:10:11 CET 2019


Just say it works.


First of all,
The expected algorithm is the square of (x >> shift).
It is an 8-bit by 8-bit multiplication (I assume we are talking about 8bpp; 16bpp is similar), so each product fits in 16 bits.
The function works at the CU level, and the blockSize is at most 64 — in other words, 6 bits.
So we can decide the maximum dynamic range is 16+6+6 = 28 bits.


In this way, the output uint64_t is unnecessary on 8bpp mode.


Moreover, PMOVZXBD+VPMULDQ can be replaced by PMOVZXBW+PMADDWD (please remember that PMADDUBSW only works when one of the inputs is unsigned);
this way may accelerate processing throughput by 3~4 times.
I don't know why VPMULLD was not used; it would almost double the performance.


Further, the VPSRLDQ would also become unnecessary; it is only needed because VPMULDQ was chosen.


+    vpmuldq        m2,          m1,        m1
+    vpsrldq        m1,          m1,        4
+    vpmuldq        m1,          m1,        m1




Regards,
Min


At 2019-03-07 17:36:19, "Dinesh Kumar Reddy" <dinesh at multicorewareinc.com> wrote:

+/* Reference C implementation of the normFactor primitive: accumulates the
+ * squares of the right-shifted pixel values of a blockSize x blockSize CU.
+ *   src       - pixel block, row-major with stride == blockSize (per the
+ *               indexing below)
+ *   blockSize - CU width/height in pixels
+ *   shift     - right shift applied to each pixel before squaring
+ *   z_k       - output accumulator; reset to zero on entry
+ */
+static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+{
+    *z_k = 0;
+    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
+    {
+        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
+        {
+            /* temp*temp is evaluated in 32 bits, then widened into the
+             * 64-bit accumulator. */
+            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
+            *z_k += temp * temp;
+        }
+    }
+}
+
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asmWed Feb 27 12:35:02 2019 +0530
+++ b/source/common/x86/pixel-a.asmMon Mar 04 15:36:38 2019 +0530
@@ -388,6 +388,16 @@
     vpaddq         m7,         m6
 %endmacro
 
+; Square every 32-bit lane of (m0 >> SSIMRD_SHIFT) and accumulate the
+; widened 64-bit results into m3 (running sum of squares).
+; NOTE(review): the macro is declared with 1 parameter but the body never
+; references %1 — confirm whether the parameter is intended.
+%macro NORM_FACT_COL 1
+    vpsrld         m1,          m0,        SSIMRD_SHIFT ; m1 = m0 >> shift, per dword
+    vpmuldq        m2,          m1,        m1           ; square even dword lanes -> qwords
+    vpsrldq        m1,          m1,        4            ; bring odd dwords into even positions (per 128-bit lane)
+    vpmuldq        m1,          m1,        m1           ; square (former) odd dword lanes -> qwords
+
+    vpaddq         m1,          m2                      ; combine even + odd squares
+    vpaddq         m3,          m1                      ; m3 += per-lane sum of squares
+%endmacro
+
 ; FIXME avoid the spilling of regs to hold 3*stride.
 ; for small blocks on x86_32, modify pixel pointer instead.
 
@@ -16303,3 +16313,266 @@
     movq           [r4],         xm4
     movq           [r6],         xm7
     RET
+
+
+;static void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+;{
+;    *z_k = 0;
+;    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
+;    {
+;        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
+;        {
+;            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
+;            *z_k += temp * temp;
+;        }
+;    }
+;}
+;--------------------------------------------------------------------------------------
+; void normFact_c(const pixel* src, uint32_t blockSize, int shift, uint64_t *z_k)
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal normFact8, 4, 5, 6
+    mov            r4d,       8
+    vpxor          m3,        m3                               ;z_k
+    vpxor          m5,        m5
+.row:
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,        [r0]                             ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,        [r0]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190307/1b7e8c17/attachment.html>


More information about the x265-devel mailing list