[x265] [PATCH x265] SSIM-RD: 8-bit AVX2 performance improvement

Akil akil at multicorewareinc.com
Fri Apr 5 12:03:22 CEST 2019


# HG changeset patch
# User Akil Ayyappan<akil at multicorewareinc.com>
# Date 1554365158 -19800
#      Thu Apr 04 13:35:58 2019 +0530
# Node ID e7a726d1ca84d59f85cfafb428b8ffc4b9eb7000
# Parent  b36242b9f354b8773e38674b876b0ca5dfc35ad2
SSIM-RD : 8-bit AVX2 performance improvement

ssimDistortion
[16x16]   5.44x   =>     13.52x
[32x32]   6.01x   =>     18.99x
[64x64]   6.70x   =>     20.78x

normFactor
[16x16]   8.42x   =>     17.96x
[32x32]   9.56x   =>     29.12x
[64x64]   8.96x   =>     25.29x

diff -r b36242b9f354 -r e7a726d1ca84 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Apr 02 15:01:12 2019 +0530
+++ b/source/common/x86/pixel-a.asm Thu Apr 04 13:35:58 2019 +0530
@@ -370,7 +370,7 @@
     RET
 %endmacro

-%macro SSIM_RD_COL 2
+%macro SSIM_DIST_HIGH 2
     vpsrld         m6,         m0,        SSIMRD_SHIFT
     vpsubd         m0,         m1

@@ -388,7 +388,7 @@
     vpaddq         m7,         m6
 %endmacro

-%macro NORM_FACT_COL 1
+%macro NORM_FACT_HIGH 1
     vpsrld         m1,          m0,        SSIMRD_SHIFT
     vpmuldq        m2,          m1,        m1
     vpsrldq        m1,          m1,        4
@@ -398,6 +398,23 @@
     vpaddq         m3,          m1
 %endmacro

+%macro SSIM_DIST_LOW 2                                        ; the 2 macro args are never referenced: body works on m0/m1 directly, uses m6 as scratch, accumulates into m4 and m7
+    vpsrlw         m6,         m0,        SSIMRD_SHIFT       ; m6 = m0 >> SSIMRD_SHIFT (logical) in each 16-bit lane
+    vpsubw         m0,         m1                            ; m0 -= m1 per 16-bit lane (callers load fenc into m0 and recon into m1)
+
+    vpmaddwd       m0,         m0,        m0                 ; square each difference; adjacent pairs are summed into 32-bit lanes
+    vpmaddwd       m6,         m6,        m6                 ; same square-and-pair-sum for the shifted m0 values
+
+    vpaddd         m4,         m0                            ; accumulate squared differences into m4 (32-bit lanes)
+    vpaddd         m7,         m6                            ; accumulate squared shifted values into m7 (callers label m7 "ac_k")
+%endmacro
+
+%macro NORM_FACT_LOW 1                                         ; the macro arg is never referenced: body reads m0, overwrites m1, accumulates into m3
+    vpsrlw         m1,          m0,        SSIMRD_SHIFT       ; m1 = m0 >> SSIMRD_SHIFT (logical) in each 16-bit lane
+    vpmaddwd       m1,          m1,        m1                 ; square each shifted value; adjacent pairs are summed into 32-bit lanes
+    vpaddd         m3,          m1                            ; accumulate into m3 (callers label m3 "z_k")
+%endmacro
+
 ; FIXME avoid the spilling of regs to hold 3*stride.
 ; for small blocks on x86_32, modify pixel pointer instead.

@@ -16014,7 +16031,7 @@
     %error Unsupported BIT_DEPTH!
 %endif

-    SSIM_RD_COL    m0,          m1
+    SSIM_DIST_HIGH m0,          m1

 %if HIGH_BIT_DEPTH
     lea            r0,         [r0 + 2 * r1]
@@ -16047,41 +16064,37 @@
     vpxor          m3,          m3
     vpxor          m7,          m7                                ;ac_k
 .row:
+%if HIGH_BIT_DEPTH
 ;Col 1-8
-%if HIGH_BIT_DEPTH
     vpmovzxwd      m0,          [r0]                              ;fenc
     vpmovzxwd      m1,          [r2]                              ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0]
-    vpmovzxbd      m1,          [r2]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 9-16
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 16]                         ;fenc
-    vpmovzxwd      m1,          [r2 + 16]                         ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 8]
-    vpmovzxbd      m1,          [r2 + 8]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
-
-%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 16]
+    vpmovzxwd      m1,          [r2 + 16]
+
+    SSIM_DIST_HIGH m0,          m1
+
     lea            r0,         [r0 + 2 * r1]
     lea            r2,         [r2 + 2 * r3]
-%else
+%elif BIT_DEPTH == 8
+;col 1-16
+    vpmovzxbw      m0,         [r0]                             ;fenc
+    vpmovzxbw      m1,         [r2]                             ;recon
+
+    SSIM_DIST_LOW  m0,         m1
+
     lea            r0,         [r0 + r1]
     lea            r2,         [r2 + r3]
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
     dec            r5d
     jnz           .row
+
+%if HIGH_BIT_DEPTH
     vextracti128   xm5,        m4,        1
     vpaddq         xm4,        xm5
     punpckhqdq     xm2,        xm4,       xm3
@@ -16091,7 +16104,23 @@
     vpaddq         xm7,        xm5
     punpckhqdq     xm2,        xm7,       xm3
     paddq          xm7,        xm2
-
+%else
+    vextracti128   xm5,        m4,        1
+    vpaddd         xm4,        xm5
+    punpckhqdq     xm2,        xm4,       xm3
+    paddd          xm4,        xm2
+    punpckldq      xm4,        xm4,       xm3
+    punpckhqdq     xm2,        xm4,       xm3
+    paddd          xm4,        xm2
+
+    vextracti128   xm5,        m7,        1
+    vpaddd         xm7,        xm5
+    punpckhqdq     xm2,        xm7,       xm3
+    paddd          xm7,        xm2
+    punpckldq      xm7,        xm7,       xm3
+    punpckhqdq     xm2,        xm7,       xm3
+    paddd          xm7,        xm2
+%endif
     movq           [r4],       xm4
     movq           [r6],       xm7
     RET
@@ -16104,67 +16133,55 @@
     vpxor          m3,         m3
     vpxor          m7,         m7                              ;ac_k
 .row:
+%if HIGH_BIT_DEPTH
 ;Col 1-8
-%if HIGH_BIT_DEPTH
     vpmovzxwd      m0,         [r0]                            ;fenc
     vpmovzxwd      m1,         [r2]                            ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,         [r0]
-    vpmovzxbd      m1,         [r2]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 9-16
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 16]                      ;fenc
-    vpmovzxwd      m1,          [r2 + 16]                      ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 8]
-    vpmovzxbd      m1,          [r2 + 8]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+    vpmovzxwd      m0,          [r0 + 16]
+    vpmovzxwd      m1,          [r2 + 16]
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 17-24
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 32]                      ;fenc
-    vpmovzxwd      m1,          [r2 + 32]                      ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 16]
-    vpmovzxbd      m1,          [r2 + 16]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+    vpmovzxwd      m0,          [r0 + 32]
+    vpmovzxwd      m1,          [r2 + 32]
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 25-32
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 48]                      ;fenc
-    vpmovzxwd      m1,          [r2 + 48]                      ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 24]
-    vpmovzxbd      m1,          [r2 + 24]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
-
-%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 48]
+    vpmovzxwd      m1,          [r2 + 48]
+
+    SSIM_DIST_HIGH m0,          m1
+
     lea            r0,          [r0 + 2 * r1]
     lea            r2,          [r2 + 2 * r3]
-%else
+%elif BIT_DEPTH == 8
+;col 1-16
+    vpmovzxbw      m0,         [r0]                             ;fenc
+    vpmovzxbw      m1,         [r2]                             ;recon
+
+    SSIM_DIST_LOW  m0,         m1
+
+;col 17-32
+    vpmovzxbw      m0,         [r0 + 16]
+    vpmovzxbw      m1,         [r2 + 16]
+
+    SSIM_DIST_LOW  m0,         m1
+
     lea            r0,          [r0 + r1]
     lea            r2,          [r2 + r3]
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
     dec            r5d
     jnz           .row
+
+%if HIGH_BIT_DEPTH
     vextracti128   xm5,         m4,        1
     vpaddq         xm4,         xm5
     punpckhqdq     xm2,         xm4,       xm3
@@ -16174,7 +16191,23 @@
     vpaddq         xm7,         xm5
     punpckhqdq     xm2,         xm7,       xm3
     paddq          xm7,         xm2
-
+%else
+    vextracti128   xm5,        m4,        1
+    vpaddd         xm4,        xm5
+    punpckhqdq     xm2,        xm4,       xm3
+    paddd          xm4,        xm2
+    punpckldq      xm4,        xm4,       xm3
+    punpckhqdq     xm2,        xm4,       xm3
+    paddd          xm4,        xm2
+
+    vextracti128   xm5,        m7,        1
+    vpaddd         xm7,        xm5
+    punpckhqdq     xm2,        xm7,       xm3
+    paddd          xm7,        xm2
+    punpckldq      xm7,        xm7,       xm3
+    punpckhqdq     xm2,        xm7,       xm3
+    paddd          xm7,        xm2
+%endif
     movq           [r4],        xm4
     movq           [r6],        xm7
     RET
@@ -16187,119 +16220,89 @@
     vpxor          m3,          m3
     vpxor          m7,          m7                             ;ac_k
 .row:
+%if HIGH_BIT_DEPTH
 ;Col 1-8
-%if HIGH_BIT_DEPTH
     vpmovzxwd      m0,          [r0]                           ;fenc
     vpmovzxwd      m1,          [r2]                           ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0]
-    vpmovzxbd      m1,          [r2]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 9-16
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 16]                      ;fenc
-    vpmovzxwd      m1,          [r2 + 16]                      ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 8]
-    vpmovzxbd      m1,          [r2 + 8]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+    vpmovzxwd      m0,          [r0 + 16]
+    vpmovzxwd      m1,          [r2 + 16]
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 17-24
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 32]                      ;fenc
-    vpmovzxwd      m1,          [r2 + 32]                      ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 16]
-    vpmovzxbd      m1,          [r2 + 16]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+    vpmovzxwd      m0,          [r0 + 32]
+    vpmovzxwd      m1,          [r2 + 32]
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 25-32
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 48]                      ;fenc
-    vpmovzxwd      m1,          [r2 + 48]                      ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 24]
-    vpmovzxbd      m1,          [r2 + 24]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+    vpmovzxwd      m0,          [r0 + 48]
+    vpmovzxwd      m1,          [r2 + 48]
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 33-40
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 64]                      ;fenc
-    vpmovzxwd      m1,          [r2 + 64]                      ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 32]
-    vpmovzxbd      m1,          [r2 + 32]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+    vpmovzxwd      m0,          [r0 + 64]
+    vpmovzxwd      m1,          [r2 + 64]
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 41-48
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 80]                      ;fenc
-    vpmovzxwd      m1,          [r2 + 80]                      ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 40]
-    vpmovzxbd      m1,          [r2 + 40]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+    vpmovzxwd      m0,          [r0 + 80]
+    vpmovzxwd      m1,          [r2 + 80]
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 49-56
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 96]                      ;fenc
-    vpmovzxwd      m1,          [r2 + 96]                      ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 48]
-    vpmovzxbd      m1,          [r2 + 48]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
+    vpmovzxwd      m0,          [r0 + 96]
+    vpmovzxwd      m1,          [r2 + 96]
+
+    SSIM_DIST_HIGH m0,          m1

 ;Col 57-64
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 112]                     ;fenc
-    vpmovzxwd      m1,          [r2 + 112]                     ;recon
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 56]
-    vpmovzxbd      m1,          [r2 + 56]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    SSIM_RD_COL    m0,          m1
-
-%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 112]
+    vpmovzxwd      m1,          [r2 + 112]
+
+    SSIM_DIST_HIGH m0,          m1
+
     lea            r0,          [r0 + 2 * r1]
     lea            r2,          [r2 + 2 * r3]
-%else
+%elif BIT_DEPTH == 8
+;col 1-16
+    vpmovzxbw      m0,         [r0]                             ;fenc
+    vpmovzxbw      m1,         [r2]                             ;recon
+
+    SSIM_DIST_LOW  m0,         m1
+
+;col 17-32
+    vpmovzxbw      m0,         [r0 + 16]
+    vpmovzxbw      m1,         [r2 + 16]
+
+    SSIM_DIST_LOW  m0,         m1
+
+;col 33-48
+    vpmovzxbw      m0,         [r0 + 32]
+    vpmovzxbw      m1,         [r2 + 32]
+
+    SSIM_DIST_LOW  m0,         m1
+
+;col 49-64
+    vpmovzxbw      m0,         [r0 + 48]
+    vpmovzxbw      m1,         [r2 + 48]
+
+    SSIM_DIST_LOW  m0,         m1
+
     lea            r0,          [r0 + r1]
     lea            r2,          [r2 + r3]
 %endif
     dec            r5d
     jnz            .row
+
+%if HIGH_BIT_DEPTH
     vextracti128   xm5,          m4,        1
     vpaddq         xm4,          xm5
     punpckhqdq     xm2,          xm4,       xm3
@@ -16309,7 +16312,23 @@
     vpaddq         xm7,          xm5
     punpckhqdq     xm2,          xm7,       xm3
     paddq          xm7,          xm2
-
+%else
+    vextracti128   xm5,        m4,        1
+    vpaddd         xm4,        xm5
+    punpckhqdq     xm2,        xm4,       xm3
+    paddd          xm4,        xm2
+    punpckldq      xm4,        xm4,       xm3
+    punpckhqdq     xm2,        xm4,       xm3
+    paddd          xm4,        xm2
+
+    vextracti128   xm5,        m7,        1
+    vpaddd         xm7,        xm5
+    punpckhqdq     xm2,        xm7,       xm3
+    paddd          xm7,        xm2
+    punpckldq      xm7,        xm7,       xm3
+    punpckhqdq     xm2,        xm7,       xm3
+    paddd          xm7,        xm2
+%endif
     movq           [r4],         xm4
     movq           [r6],         xm7
     RET
@@ -16344,7 +16363,7 @@
     %error Unsupported BIT_DEPTH!
 %endif

-    NORM_FACT_COL  m0
+    NORM_FACT_HIGH m0

 %if HIGH_BIT_DEPTH
     lea            r0,         [r0 + 2 * r1]
@@ -16367,39 +16386,45 @@
     vpxor          m3,          m3                                ;z_k
     vpxor          m5,          m5
 .row:
+%if HIGH_BIT_DEPTH
 ;Col 1-8
-%if HIGH_BIT_DEPTH
     vpmovzxwd      m0,          [r0]                              ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+
+    NORM_FACT_HIGH  m0

 ;Col 9-16
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 16]                         ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 8]
-%else
+    vpmovzxwd      m0,          [r0 + 16]
+
+    NORM_FACT_HIGH m0
+
+    lea            r0,         [r0 + 2 * r1]
+%elif BIT_DEPTH == 8
+;col 1-16
+    vpmovzxbw      m0,         [r0]                             ;src
+
+    NORM_FACT_LOW  m0
+
+    lea            r0,         [r0 + r1]
+%else
     %error Unsupported BIT_DEPTH!
 %endif
-
-    NORM_FACT_COL  m0
-
-%if HIGH_BIT_DEPTH
-    lea            r0,         [r0 + 2 * r1]
-%else
-    lea            r0,         [r0 + r1]
-%endif
     dec            r4d
     jnz           .row
+
+%if HIGH_BIT_DEPTH
     vextracti128   xm4,         m3,        1
     vpaddq         xm3,         xm4
     punpckhqdq     xm2,         xm3,       xm5
     paddq          xm3,         xm2
+%else
+    vextracti128   xm4,        m3,        1
+    vpaddd         xm3,        xm4
+    punpckhqdq     xm2,        xm3,       xm5
+    paddd          xm3,        xm2
+    punpckldq      xm3,        xm3,       xm5
+    punpckhqdq     xm2,        xm3,       xm5
+    paddd          xm3,        xm2
+%endif
     movq           [r3],        xm3
     RET

@@ -16410,61 +16435,59 @@
     vpxor          m3,          m3                              ;z_k
     vpxor          m5,          m5
 .row:
+%if HIGH_BIT_DEPTH
 ;Col 1-8
-%if HIGH_BIT_DEPTH
     vpmovzxwd      m0,         [r0]                             ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,         [r0]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+
+    NORM_FACT_HIGH m0

 ;Col 9-16
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 16]                      ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 8]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+    vpmovzxwd      m0,          [r0 + 16]
+
+    NORM_FACT_HIGH m0

 ;Col 17-24
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 32]                      ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 16]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+    vpmovzxwd      m0,          [r0 + 32]
+
+    NORM_FACT_HIGH  m0

 ;Col 25-32
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 48]                      ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 24]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
-
-%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 48]
+
+    NORM_FACT_HIGH m0
+
     lea            r0,          [r0 + 2 * r1]
-%else
+%elif BIT_DEPTH == 8
+;col 1-16
+    vpmovzxbw      m0,         [r0]                             ;src
+
+    NORM_FACT_LOW  m0
+;col 17-32
+    vpmovzxbw      m0,         [r0 + 16]
+
+    NORM_FACT_LOW  m0
+
     lea            r0,          [r0 + r1]
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
     dec            r4d
     jnz           .row
+
+%if HIGH_BIT_DEPTH
     vextracti128   xm4,         m3,        1
     vpaddq         xm3,         xm4
     punpckhqdq     xm2,         xm3,       xm5
     paddq          xm3,         xm2
+%else
+    vextracti128   xm4,        m3,        1
+    vpaddd         xm3,        xm4
+    punpckhqdq     xm2,        xm3,       xm5
+    paddd          xm3,        xm2
+    punpckldq      xm3,        xm3,       xm5
+    punpckhqdq     xm2,        xm3,       xm5
+    paddd          xm3,        xm2
+%endif
     movq           [r3],        xm3
     RET

@@ -16475,104 +16498,86 @@
     vpxor          m3,          m3                             ;z_k
     vpxor          m5,          m5
 .row:
+%if HIGH_BIT_DEPTH
 ;Col 1-8
-%if HIGH_BIT_DEPTH
     vpmovzxwd      m0,          [r0]                           ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+
+    NORM_FACT_HIGH m0

 ;Col 9-16
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 16]                      ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 8]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+    vpmovzxwd      m0,          [r0 + 16]
+
+    NORM_FACT_HIGH m0

 ;Col 17-24
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 32]                      ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 16]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+    vpmovzxwd      m0,          [r0 + 32]
+
+    NORM_FACT_HIGH  m0

 ;Col 25-32
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 48]                      ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 24]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+    vpmovzxwd      m0,          [r0 + 48]
+
+    NORM_FACT_HIGH  m0

 ;Col 33-40
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 64]                      ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 32]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+    vpmovzxwd      m0,          [r0 + 64]
+
+    NORM_FACT_HIGH  m0

 ;Col 41-48
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 80]                      ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 40]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+    vpmovzxwd      m0,          [r0 + 80]
+
+    NORM_FACT_HIGH  m0

 ;Col 49-56
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 96]                      ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 48]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
+    vpmovzxwd      m0,          [r0 + 96]
+
+    NORM_FACT_HIGH  m0

 ;Col 57-64
-%if HIGH_BIT_DEPTH
-    vpmovzxwd      m0,          [r0 + 112]                     ;src
-%elif BIT_DEPTH == 8
-    vpmovzxbd      m0,          [r0 + 56]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
-
-    NORM_FACT_COL  m0
-
-%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 112]
+
+    NORM_FACT_HIGH m0
+
     lea            r0,          [r0 + 2 * r1]
-%else
+%elif BIT_DEPTH == 8
+;col 1-16
+    vpmovzxbw      m0,         [r0]                             ;src
+
+    NORM_FACT_LOW  m0
+;col 17-32
+    vpmovzxbw      m0,         [r0 + 16]
+
+    NORM_FACT_LOW  m0
+;col 33-48
+    vpmovzxbw      m0,         [r0 + 32]
+
+    NORM_FACT_LOW  m0
+;col 49-64
+    vpmovzxbw      m0,         [r0 + 48]
+
+    NORM_FACT_LOW  m0
+
     lea            r0,          [r0 + r1]
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
     dec            r4d
     jnz           .row
+
+%if HIGH_BIT_DEPTH
     vextracti128   xm4,         m3,        1
     vpaddq         xm3,         xm4
     punpckhqdq     xm2,         xm3,       xm5
     paddq          xm3,         xm2
+%else
+    vextracti128   xm4,        m3,        1
+    vpaddd         xm3,        xm4
+    punpckhqdq     xm2,        xm3,       xm5
+    paddd          xm3,        xm2
+    punpckldq      xm3,        xm3,       xm5
+    punpckhqdq     xm2,        xm3,       xm5
+    paddd          xm3,        xm2
+%endif
     movq           [r3],        xm3
     RET


-- 
*Regards,*
*Akil R*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190405/67dd3755/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ssim_rd_perf_improvement_final.patch
Type: application/x-patch
Size: 22248 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190405/67dd3755/attachment-0001.bin>


More information about the x265-devel mailing list