[x265] [PATCH x265] SSIM-RD: 8-bit AVX2 performance improvement

Dinesh Kumar Reddy dinesh at multicorewareinc.com
Mon Apr 8 10:43:07 CEST 2019


# HG changeset patch
# User Akil Ayyappan<akil at multicorewareinc.com>
# Date 1554365158 -19800
#      Thu Apr 04 13:35:58 2019 +0530
# Node ID e7a726d1ca84d59f85cfafb428b8ffc4b9eb7000
# Parent  b36242b9f354b8773e38674b876b0ca5dfc35ad2
SSIM-RD : 8-bit AVX2 performance improvement

Patch has been pushed to x265 public branch.


Thanks & Regards,
Dinesh

On Fri, Apr 5, 2019 at 3:33 PM Akil <akil at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Akil Ayyappan<akil at multicorewareinc.com>
> # Date 1554365158 -19800
> #      Thu Apr 04 13:35:58 2019 +0530
> # Node ID e7a726d1ca84d59f85cfafb428b8ffc4b9eb7000
> # Parent  b36242b9f354b8773e38674b876b0ca5dfc35ad2
> SSIM-RD : 8-bit AVX2 performance improvement
>
> ssimDistortion
> [16x16]   5.44x   =>     13.52x
> [32x32]   6.01x   =>     18.99x
> [64x64]   6.70x   =>     20.78x
>
> normFactor
> [16x16]   8.42x   =>     17.96x
> [32x32]   9.56x   =>     29.12x
> [64x64]   8.96x   =>     25.29x
>
> diff -r b36242b9f354 -r e7a726d1ca84 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Tue Apr 02 15:01:12 2019 +0530
> +++ b/source/common/x86/pixel-a.asm Thu Apr 04 13:35:58 2019 +0530
> @@ -370,7 +370,7 @@
>      RET
>  %endmacro
>
> -%macro SSIM_RD_COL 2
> +%macro SSIM_DIST_HIGH 2
>      vpsrld         m6,         m0,        SSIMRD_SHIFT
>      vpsubd         m0,         m1
>
> @@ -388,7 +388,7 @@
>      vpaddq         m7,         m6
>  %endmacro
>
> -%macro NORM_FACT_COL 1
> +%macro NORM_FACT_HIGH 1
>      vpsrld         m1,          m0,        SSIMRD_SHIFT
>      vpmuldq        m2,          m1,        m1
>      vpsrldq        m1,          m1,        4
> @@ -398,6 +398,23 @@
>      vpaddq         m3,          m1
>  %endmacro
>
> +%macro SSIM_DIST_LOW 2
> +    vpsrlw         m6,         m0,        SSIMRD_SHIFT
> +    vpsubw         m0,         m1
> +
> +    vpmaddwd       m0,         m0,        m0
> +    vpmaddwd       m6,         m6,        m6
> +
> +    vpaddd         m4,         m0
> +    vpaddd         m7,         m6
> +%endmacro
> +
> +%macro NORM_FACT_LOW 1
> +    vpsrlw         m1,          m0,        SSIMRD_SHIFT
> +    vpmaddwd       m1,          m1,        m1
> +    vpaddd         m3,          m1
> +%endmacro
> +
>  ; FIXME avoid the spilling of regs to hold 3*stride.
>  ; for small blocks on x86_32, modify pixel pointer instead.
>
> @@ -16014,7 +16031,7 @@
>      %error Unsupported BIT_DEPTH!
>  %endif
>
> -    SSIM_RD_COL    m0,          m1
> +    SSIM_DIST_HIGH m0,          m1
>
>  %if HIGH_BIT_DEPTH
>      lea            r0,         [r0 + 2 * r1]
> @@ -16047,41 +16064,37 @@
>      vpxor          m3,          m3
>      vpxor          m7,          m7                                ;ac_k
>  .row:
> +%if HIGH_BIT_DEPTH
>  ;Col 1-8
> -%if HIGH_BIT_DEPTH
>      vpmovzxwd      m0,          [r0]                              ;fenc
>      vpmovzxwd      m1,          [r2]                              ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0]
> -    vpmovzxbd      m1,          [r2]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 9-16
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 16]                         ;fenc
> -    vpmovzxwd      m1,          [r2 + 16]                         ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 8]
> -    vpmovzxbd      m1,          [r2 + 8]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> -
> -%if HIGH_BIT_DEPTH
> +    vpmovzxwd      m0,          [r0 + 16]
> +    vpmovzxwd      m1,          [r2 + 16]
> +
> +    SSIM_DIST_HIGH m0,          m1
> +
>      lea            r0,         [r0 + 2 * r1]
>      lea            r2,         [r2 + 2 * r3]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> +    vpmovzxbw      m0,         [r0]                             ;fenc
> +    vpmovzxbw      m1,         [r2]                             ;recon
> +
> +    SSIM_DIST_LOW  m0,         m1
> +
>      lea            r0,         [r0 + r1]
>      lea            r2,         [r2 + r3]
> +%else
> +    %error Unsupported BIT_DEPTH!
>  %endif
>      dec            r5d
>      jnz           .row
> +
> +%if HIGH_BIT_DEPTH
>      vextracti128   xm5,        m4,        1
>      vpaddq         xm4,        xm5
>      punpckhqdq     xm2,        xm4,       xm3
> @@ -16091,7 +16104,23 @@
>      vpaddq         xm7,        xm5
>      punpckhqdq     xm2,        xm7,       xm3
>      paddq          xm7,        xm2
> -
> +%else
> +    vextracti128   xm5,        m4,        1
> +    vpaddd         xm4,        xm5
> +    punpckhqdq     xm2,        xm4,       xm3
> +    paddd          xm4,        xm2
> +    punpckldq      xm4,        xm4,       xm3
> +    punpckhqdq     xm2,        xm4,       xm3
> +    paddd          xm4,        xm2
> +
> +    vextracti128   xm5,        m7,        1
> +    vpaddd         xm7,        xm5
> +    punpckhqdq     xm2,        xm7,       xm3
> +    paddd          xm7,        xm2
> +    punpckldq      xm7,        xm7,       xm3
> +    punpckhqdq     xm2,        xm7,       xm3
> +    paddd          xm7,        xm2
> +%endif
>      movq           [r4],       xm4
>      movq           [r6],       xm7
>      RET
> @@ -16104,67 +16133,55 @@
>      vpxor          m3,         m3
>      vpxor          m7,         m7                              ;ac_k
>  .row:
> +%if HIGH_BIT_DEPTH
>  ;Col 1-8
> -%if HIGH_BIT_DEPTH
>      vpmovzxwd      m0,         [r0]                            ;fenc
>      vpmovzxwd      m1,         [r2]                            ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,         [r0]
> -    vpmovzxbd      m1,         [r2]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 9-16
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 16]                      ;fenc
> -    vpmovzxwd      m1,          [r2 + 16]                      ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 8]
> -    vpmovzxbd      m1,          [r2 + 8]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +    vpmovzxwd      m0,          [r0 + 16]
> +    vpmovzxwd      m1,          [r2 + 16]
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 17-24
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 32]                      ;fenc
> -    vpmovzxwd      m1,          [r2 + 32]                      ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 16]
> -    vpmovzxbd      m1,          [r2 + 16]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +    vpmovzxwd      m0,          [r0 + 32]
> +    vpmovzxwd      m1,          [r2 + 32]
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 25-32
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 48]                      ;fenc
> -    vpmovzxwd      m1,          [r2 + 48]                      ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 24]
> -    vpmovzxbd      m1,          [r2 + 24]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> -
> -%if HIGH_BIT_DEPTH
> +    vpmovzxwd      m0,          [r0 + 48]
> +    vpmovzxwd      m1,          [r2 + 48]
> +
> +    SSIM_DIST_HIGH m0,          m1
> +
>      lea            r0,          [r0 + 2 * r1]
>      lea            r2,          [r2 + 2 * r3]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> +    vpmovzxbw      m0,         [r0]                             ;fenc
> +    vpmovzxbw      m1,         [r2]                             ;recon
> +
> +    SSIM_DIST_LOW  m0,         m1
> +
> +;col 17-32
> +    vpmovzxbw      m0,         [r0 + 16]
> +    vpmovzxbw      m1,         [r2 + 16]
> +
> +    SSIM_DIST_LOW  m0,         m1
> +
>      lea            r0,          [r0 + r1]
>      lea            r2,          [r2 + r3]
> +%else
> +    %error Unsupported BIT_DEPTH!
>  %endif
>      dec            r5d
>      jnz           .row
> +
> +%if HIGH_BIT_DEPTH
>      vextracti128   xm5,         m4,        1
>      vpaddq         xm4,         xm5
>      punpckhqdq     xm2,         xm4,       xm3
> @@ -16174,7 +16191,23 @@
>      vpaddq         xm7,         xm5
>      punpckhqdq     xm2,         xm7,       xm3
>      paddq          xm7,         xm2
> -
> +%else
> +    vextracti128   xm5,        m4,        1
> +    vpaddd         xm4,        xm5
> +    punpckhqdq     xm2,        xm4,       xm3
> +    paddd          xm4,        xm2
> +    punpckldq      xm4,        xm4,       xm3
> +    punpckhqdq     xm2,        xm4,       xm3
> +    paddd          xm4,        xm2
> +
> +    vextracti128   xm5,        m7,        1
> +    vpaddd         xm7,        xm5
> +    punpckhqdq     xm2,        xm7,       xm3
> +    paddd          xm7,        xm2
> +    punpckldq      xm7,        xm7,       xm3
> +    punpckhqdq     xm2,        xm7,       xm3
> +    paddd          xm7,        xm2
> +%endif
>      movq           [r4],        xm4
>      movq           [r6],        xm7
>      RET
> @@ -16187,119 +16220,89 @@
>      vpxor          m3,          m3
>      vpxor          m7,          m7                             ;ac_k
>  .row:
> +%if HIGH_BIT_DEPTH
>  ;Col 1-8
> -%if HIGH_BIT_DEPTH
>      vpmovzxwd      m0,          [r0]                           ;fenc
>      vpmovzxwd      m1,          [r2]                           ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0]
> -    vpmovzxbd      m1,          [r2]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 9-16
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 16]                      ;fenc
> -    vpmovzxwd      m1,          [r2 + 16]                      ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 8]
> -    vpmovzxbd      m1,          [r2 + 8]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +    vpmovzxwd      m0,          [r0 + 16]
> +    vpmovzxwd      m1,          [r2 + 16]
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 17-24
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 32]                      ;fenc
> -    vpmovzxwd      m1,          [r2 + 32]                      ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 16]
> -    vpmovzxbd      m1,          [r2 + 16]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +    vpmovzxwd      m0,          [r0 + 32]
> +    vpmovzxwd      m1,          [r2 + 32]
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 25-32
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 48]                      ;fenc
> -    vpmovzxwd      m1,          [r2 + 48]                      ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 24]
> -    vpmovzxbd      m1,          [r2 + 24]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +    vpmovzxwd      m0,          [r0 + 48]
> +    vpmovzxwd      m1,          [r2 + 48]
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 33-40
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 64]                      ;fenc
> -    vpmovzxwd      m1,          [r2 + 64]                      ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 32]
> -    vpmovzxbd      m1,          [r2 + 32]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +    vpmovzxwd      m0,          [r0 + 64]
> +    vpmovzxwd      m1,          [r2 + 64]
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 41-48
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 80]                      ;fenc
> -    vpmovzxwd      m1,          [r2 + 80]                      ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 40]
> -    vpmovzxbd      m1,          [r2 + 40]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +    vpmovzxwd      m0,          [r0 + 80]
> +    vpmovzxwd      m1,          [r2 + 80]
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 49-56
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 96]                      ;fenc
> -    vpmovzxwd      m1,          [r2 + 96]                      ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 48]
> -    vpmovzxbd      m1,          [r2 + 48]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> +    vpmovzxwd      m0,          [r0 + 96]
> +    vpmovzxwd      m1,          [r2 + 96]
> +
> +    SSIM_DIST_HIGH m0,          m1
>
>  ;Col 57-64
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 112]                     ;fenc
> -    vpmovzxwd      m1,          [r2 + 112]                     ;recon
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 56]
> -    vpmovzxbd      m1,          [r2 + 56]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    SSIM_RD_COL    m0,          m1
> -
> -%if HIGH_BIT_DEPTH
> +    vpmovzxwd      m0,          [r0 + 112]
> +    vpmovzxwd      m1,          [r2 + 112]
> +
> +    SSIM_DIST_HIGH m0,          m1
> +
>      lea            r0,          [r0 + 2 * r1]
>      lea            r2,          [r2 + 2 * r3]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> +    vpmovzxbw      m0,         [r0]                             ;fenc
> +    vpmovzxbw      m1,         [r2]                             ;recon
> +
> +    SSIM_DIST_LOW  m0,         m1
> +
> +;col 17-32
> +    vpmovzxbw      m0,         [r0 + 16]
> +    vpmovzxbw      m1,         [r2 + 16]
> +
> +    SSIM_DIST_LOW  m0,         m1
> +
> +;col 33-48
> +    vpmovzxbw      m0,         [r0 + 32]
> +    vpmovzxbw      m1,         [r2 + 32]
> +
> +    SSIM_DIST_LOW  m0,         m1
> +
> +;col 49-64
> +    vpmovzxbw      m0,         [r0 + 48]
> +    vpmovzxbw      m1,         [r2 + 48]
> +
> +    SSIM_DIST_LOW  m0,         m1
> +
>      lea            r0,          [r0 + r1]
>      lea            r2,          [r2 + r3]
>  %endif
>      dec            r5d
>      jnz            .row
> +
> +%if HIGH_BIT_DEPTH
>      vextracti128   xm5,          m4,        1
>      vpaddq         xm4,          xm5
>      punpckhqdq     xm2,          xm4,       xm3
> @@ -16309,7 +16312,23 @@
>      vpaddq         xm7,          xm5
>      punpckhqdq     xm2,          xm7,       xm3
>      paddq          xm7,          xm2
> -
> +%else
> +    vextracti128   xm5,        m4,        1
> +    vpaddd         xm4,        xm5
> +    punpckhqdq     xm2,        xm4,       xm3
> +    paddd          xm4,        xm2
> +    punpckldq      xm4,        xm4,       xm3
> +    punpckhqdq     xm2,        xm4,       xm3
> +    paddd          xm4,        xm2
> +
> +    vextracti128   xm5,        m7,        1
> +    vpaddd         xm7,        xm5
> +    punpckhqdq     xm2,        xm7,       xm3
> +    paddd          xm7,        xm2
> +    punpckldq      xm7,        xm7,       xm3
> +    punpckhqdq     xm2,        xm7,       xm3
> +    paddd          xm7,        xm2
> +%endif
>      movq           [r4],         xm4
>      movq           [r6],         xm7
>      RET
> @@ -16344,7 +16363,7 @@
>      %error Unsupported BIT_DEPTH!
>  %endif
>
> -    NORM_FACT_COL  m0
> +    NORM_FACT_HIGH m0
>
>  %if HIGH_BIT_DEPTH
>      lea            r0,         [r0 + 2 * r1]
> @@ -16367,39 +16386,45 @@
>      vpxor          m3,          m3                                ;z_k
>      vpxor          m5,          m5
>  .row:
> +%if HIGH_BIT_DEPTH
>  ;Col 1-8
> -%if HIGH_BIT_DEPTH
>      vpmovzxwd      m0,          [r0]                              ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +
> +    NORM_FACT_HIGH  m0
>
>  ;Col 9-16
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 16]                         ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 8]
> -%else
> +    vpmovzxwd      m0,          [r0 + 16]
> +
> +    NORM_FACT_HIGH m0
> +
> +    lea            r0,         [r0 + 2 * r1]
> +%elif BIT_DEPTH == 8
> +;col 1-16
> +    vpmovzxbw      m0,         [r0]                             ;src
> +
> +    NORM_FACT_LOW  m0
> +
> +    lea            r0,         [r0 + r1]
> +%else
>      %error Unsupported BIT_DEPTH!
>  %endif
> -
> -    NORM_FACT_COL  m0
> -
> -%if HIGH_BIT_DEPTH
> -    lea            r0,         [r0 + 2 * r1]
> -%else
> -    lea            r0,         [r0 + r1]
> -%endif
>      dec            r4d
>      jnz           .row
> +
> +%if HIGH_BIT_DEPTH
>      vextracti128   xm4,         m3,        1
>      vpaddq         xm3,         xm4
>      punpckhqdq     xm2,         xm3,       xm5
>      paddq          xm3,         xm2
> +%else
> +    vextracti128   xm4,        m3,        1
> +    vpaddd         xm3,        xm4
> +    punpckhqdq     xm2,        xm3,       xm5
> +    paddd          xm3,        xm2
> +    punpckldq      xm3,        xm3,       xm5
> +    punpckhqdq     xm2,        xm3,       xm5
> +    paddd          xm3,        xm2
> +%endif
>      movq           [r3],        xm3
>      RET
>
> @@ -16410,61 +16435,59 @@
>      vpxor          m3,          m3                              ;z_k
>      vpxor          m5,          m5
>  .row:
> +%if HIGH_BIT_DEPTH
>  ;Col 1-8
> -%if HIGH_BIT_DEPTH
>      vpmovzxwd      m0,         [r0]                             ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,         [r0]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +
> +    NORM_FACT_HIGH m0
>
>  ;Col 9-16
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 16]                      ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 8]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +    vpmovzxwd      m0,          [r0 + 16]
> +
> +    NORM_FACT_HIGH m0
>
>  ;Col 17-24
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 32]                      ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 16]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +    vpmovzxwd      m0,          [r0 + 32]
> +
> +    NORM_FACT_HIGH  m0
>
>  ;Col 25-32
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 48]                      ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 24]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> -
> -%if HIGH_BIT_DEPTH
> +    vpmovzxwd      m0,          [r0 + 48]
> +
> +    NORM_FACT_HIGH m0
> +
>      lea            r0,          [r0 + 2 * r1]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> +    vpmovzxbw      m0,         [r0]                             ;src
> +
> +    NORM_FACT_LOW  m0
> +;col 17-32
> +    vpmovzxbw      m0,         [r0 + 16]
> +
> +    NORM_FACT_LOW  m0
> +
>      lea            r0,          [r0 + r1]
> +%else
> +    %error Unsupported BIT_DEPTH!
>  %endif
>      dec            r4d
>      jnz           .row
> +
> +%if HIGH_BIT_DEPTH
>      vextracti128   xm4,         m3,        1
>      vpaddq         xm3,         xm4
>      punpckhqdq     xm2,         xm3,       xm5
>      paddq          xm3,         xm2
> +%else
> +    vextracti128   xm4,        m3,        1
> +    vpaddd         xm3,        xm4
> +    punpckhqdq     xm2,        xm3,       xm5
> +    paddd          xm3,        xm2
> +    punpckldq      xm3,        xm3,       xm5
> +    punpckhqdq     xm2,        xm3,       xm5
> +    paddd          xm3,        xm2
> +%endif
>      movq           [r3],        xm3
>      RET
>
> @@ -16475,104 +16498,86 @@
>      vpxor          m3,          m3                             ;z_k
>      vpxor          m5,          m5
>  .row:
> +%if HIGH_BIT_DEPTH
>  ;Col 1-8
> -%if HIGH_BIT_DEPTH
>      vpmovzxwd      m0,          [r0]                           ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +
> +    NORM_FACT_HIGH m0
>
>  ;Col 9-16
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 16]                      ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 8]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +    vpmovzxwd      m0,          [r0 + 16]
> +
> +    NORM_FACT_HIGH m0
>
>  ;Col 17-24
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 32]                      ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 16]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +    vpmovzxwd      m0,          [r0 + 32]
> +
> +    NORM_FACT_HIGH  m0
>
>  ;Col 25-32
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 48]                      ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 24]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +    vpmovzxwd      m0,          [r0 + 48]
> +
> +    NORM_FACT_HIGH  m0
>
>  ;Col 33-40
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 64]                      ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 32]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +    vpmovzxwd      m0,          [r0 + 64]
> +
> +    NORM_FACT_HIGH  m0
>
>  ;Col 41-48
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 80]                      ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 40]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +    vpmovzxwd      m0,          [r0 + 80]
> +
> +    NORM_FACT_HIGH  m0
>
>  ;Col 49-56
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 96]                      ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 48]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> +    vpmovzxwd      m0,          [r0 + 96]
> +
> +    NORM_FACT_HIGH  m0
>
>  ;Col 57-64
> -%if HIGH_BIT_DEPTH
> -    vpmovzxwd      m0,          [r0 + 112]                     ;src
> -%elif BIT_DEPTH == 8
> -    vpmovzxbd      m0,          [r0 + 56]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
> -    NORM_FACT_COL  m0
> -
> -%if HIGH_BIT_DEPTH
> +    vpmovzxwd      m0,          [r0 + 112]
> +
> +    NORM_FACT_HIGH m0
> +
>      lea            r0,          [r0 + 2 * r1]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> +    vpmovzxbw      m0,         [r0]                             ;src
> +
> +    NORM_FACT_LOW  m0
> +;col 17-32
> +    vpmovzxbw      m0,         [r0 + 16]
> +
> +    NORM_FACT_LOW  m0
> +;col 33-48
> +    vpmovzxbw      m0,         [r0 + 32]
> +
> +    NORM_FACT_LOW  m0
> +;col 49-64
> +    vpmovzxbw      m0,         [r0 + 48]
> +
> +    NORM_FACT_LOW  m0
> +
>      lea            r0,          [r0 + r1]
> +%else
> +    %error Unsupported BIT_DEPTH!
>  %endif
>      dec            r4d
>      jnz           .row
> +
> +%if HIGH_BIT_DEPTH
>      vextracti128   xm4,         m3,        1
>      vpaddq         xm3,         xm4
>      punpckhqdq     xm2,         xm3,       xm5
>      paddq          xm3,         xm2
> +%else
> +    vextracti128   xm4,        m3,        1
> +    vpaddd         xm3,        xm4
> +    punpckhqdq     xm2,        xm3,       xm5
> +    paddd          xm3,        xm2
> +    punpckldq      xm3,        xm3,       xm5
> +    punpckhqdq     xm2,        xm3,       xm5
> +    paddd          xm3,        xm2
> +%endif
>      movq           [r3],        xm3
>      RET
>
>
> --
> *Regards,*
> *Akil R*
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190408/b08836b7/attachment-0001.html>


More information about the x265-devel mailing list