[x265] [PATCH x265] SSIM-RD: 8-bit AVX2 performance improvement
Dinesh Kumar Reddy
dinesh at multicorewareinc.com
Mon Apr 8 10:43:07 CEST 2019
# HG changeset patch
# User Akil Ayyappan<akil at multicorewareinc.com>
# Date 1554365158 -19800
# Thu Apr 04 13:35:58 2019 +0530
# Node ID e7a726d1ca84d59f85cfafb428b8ffc4b9eb7000
# Parent b36242b9f354b8773e38674b876b0ca5dfc35ad2
SSIM-RD : 8-bit AVX2 performance improvement
The patch has been pushed to the x265 public branch.
Thanks & Regards,
Dinesh
On Fri, Apr 5, 2019 at 3:33 PM Akil <akil at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Akil Ayyappan<akil at multicorewareinc.com>
> # Date 1554365158 -19800
> # Thu Apr 04 13:35:58 2019 +0530
> # Node ID e7a726d1ca84d59f85cfafb428b8ffc4b9eb7000
> # Parent b36242b9f354b8773e38674b876b0ca5dfc35ad2
> SSIM-RD : 8-bit AVX2 performance improvement
>
> ssimDistortion
> [16x16] 5.44x => 13.52x
> [32x32] 6.01x => 18.99x
> [64x64] 6.70x => 20.78x
>
> normFactor
> [16x16] 8.42x => 17.96x
> [32x32] 9.56x => 29.12x
> [64x64] 8.96x => 25.29x
>
> diff -r b36242b9f354 -r e7a726d1ca84 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Tue Apr 02 15:01:12 2019 +0530
> +++ b/source/common/x86/pixel-a.asm Thu Apr 04 13:35:58 2019 +0530
> @@ -370,7 +370,7 @@
> RET
> %endmacro
>
> -%macro SSIM_RD_COL 2
> +%macro SSIM_DIST_HIGH 2
> vpsrld m6, m0, SSIMRD_SHIFT
> vpsubd m0, m1
>
> @@ -388,7 +388,7 @@
> vpaddq m7, m6
> %endmacro
>
> -%macro NORM_FACT_COL 1
> +%macro NORM_FACT_HIGH 1
> vpsrld m1, m0, SSIMRD_SHIFT
> vpmuldq m2, m1, m1
> vpsrldq m1, m1, 4
> @@ -398,6 +398,23 @@
> vpaddq m3, m1
> %endmacro
>
> +%macro SSIM_DIST_LOW 2
> + vpsrlw m6, m0, SSIMRD_SHIFT
> + vpsubw m0, m1
> +
> + vpmaddwd m0, m0, m0
> + vpmaddwd m6, m6, m6
> +
> + vpaddd m4, m0
> + vpaddd m7, m6
> +%endmacro
> +
> +%macro NORM_FACT_LOW 1
> + vpsrlw m1, m0, SSIMRD_SHIFT
> + vpmaddwd m1, m1, m1
> + vpaddd m3, m1
> +%endmacro
> +
> ; FIXME avoid the spilling of regs to hold 3*stride.
> ; for small blocks on x86_32, modify pixel pointer instead.
>
> @@ -16014,7 +16031,7 @@
> %error Unsupported BIT_DEPTH!
> %endif
>
> - SSIM_RD_COL m0, m1
> + SSIM_DIST_HIGH m0, m1
>
> %if HIGH_BIT_DEPTH
> lea r0, [r0 + 2 * r1]
> @@ -16047,41 +16064,37 @@
> vpxor m3, m3
> vpxor m7, m7 ;ac_k
> .row:
> +%if HIGH_BIT_DEPTH
> ;Col 1-8
> -%if HIGH_BIT_DEPTH
> vpmovzxwd m0, [r0] ;fenc
> vpmovzxwd m1, [r2] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0]
> - vpmovzxbd m1, [r2]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 9-16
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 16] ;fenc
> - vpmovzxwd m1, [r2 + 16] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 8]
> - vpmovzxbd m1, [r2 + 8]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> -
> -%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 16]
> + vpmovzxwd m1, [r2 + 16]
> +
> + SSIM_DIST_HIGH m0, m1
> +
> lea r0, [r0 + 2 * r1]
> lea r2, [r2 + 2 * r3]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> + vpmovzxbw m0, [r0] ;fenc
> + vpmovzxbw m1, [r2] ;recon
> +
> + SSIM_DIST_LOW m0, m1
> +
> lea r0, [r0 + r1]
> lea r2, [r2 + r3]
> +%else
> + %error Unsupported BIT_DEPTH!
> %endif
> dec r5d
> jnz .row
> +
> +%if HIGH_BIT_DEPTH
> vextracti128 xm5, m4, 1
> vpaddq xm4, xm5
> punpckhqdq xm2, xm4, xm3
> @@ -16091,7 +16104,23 @@
> vpaddq xm7, xm5
> punpckhqdq xm2, xm7, xm3
> paddq xm7, xm2
> -
> +%else
> + vextracti128 xm5, m4, 1
> + vpaddd xm4, xm5
> + punpckhqdq xm2, xm4, xm3
> + paddd xm4, xm2
> + punpckldq xm4, xm4, xm3
> + punpckhqdq xm2, xm4, xm3
> + paddd xm4, xm2
> +
> + vextracti128 xm5, m7, 1
> + vpaddd xm7, xm5
> + punpckhqdq xm2, xm7, xm3
> + paddd xm7, xm2
> + punpckldq xm7, xm7, xm3
> + punpckhqdq xm2, xm7, xm3
> + paddd xm7, xm2
> +%endif
> movq [r4], xm4
> movq [r6], xm7
> RET
> @@ -16104,67 +16133,55 @@
> vpxor m3, m3
> vpxor m7, m7 ;ac_k
> .row:
> +%if HIGH_BIT_DEPTH
> ;Col 1-8
> -%if HIGH_BIT_DEPTH
> vpmovzxwd m0, [r0] ;fenc
> vpmovzxwd m1, [r2] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0]
> - vpmovzxbd m1, [r2]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 9-16
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 16] ;fenc
> - vpmovzxwd m1, [r2 + 16] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 8]
> - vpmovzxbd m1, [r2 + 8]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> + vpmovzxwd m0, [r0 + 16]
> + vpmovzxwd m1, [r2 + 16]
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 17-24
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 32] ;fenc
> - vpmovzxwd m1, [r2 + 32] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 16]
> - vpmovzxbd m1, [r2 + 16]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> + vpmovzxwd m0, [r0 + 32]
> + vpmovzxwd m1, [r2 + 32]
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 25-32
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 48] ;fenc
> - vpmovzxwd m1, [r2 + 48] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 24]
> - vpmovzxbd m1, [r2 + 24]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> -
> -%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 48]
> + vpmovzxwd m1, [r2 + 48]
> +
> + SSIM_DIST_HIGH m0, m1
> +
> lea r0, [r0 + 2 * r1]
> lea r2, [r2 + 2 * r3]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> + vpmovzxbw m0, [r0] ;fenc
> + vpmovzxbw m1, [r2] ;recon
> +
> + SSIM_DIST_LOW m0, m1
> +
> +;col 17-32
> + vpmovzxbw m0, [r0 + 16]
> + vpmovzxbw m1, [r2 + 16]
> +
> + SSIM_DIST_LOW m0, m1
> +
> lea r0, [r0 + r1]
> lea r2, [r2 + r3]
> +%else
> + %error Unsupported BIT_DEPTH!
> %endif
> dec r5d
> jnz .row
> +
> +%if HIGH_BIT_DEPTH
> vextracti128 xm5, m4, 1
> vpaddq xm4, xm5
> punpckhqdq xm2, xm4, xm3
> @@ -16174,7 +16191,23 @@
> vpaddq xm7, xm5
> punpckhqdq xm2, xm7, xm3
> paddq xm7, xm2
> -
> +%else
> + vextracti128 xm5, m4, 1
> + vpaddd xm4, xm5
> + punpckhqdq xm2, xm4, xm3
> + paddd xm4, xm2
> + punpckldq xm4, xm4, xm3
> + punpckhqdq xm2, xm4, xm3
> + paddd xm4, xm2
> +
> + vextracti128 xm5, m7, 1
> + vpaddd xm7, xm5
> + punpckhqdq xm2, xm7, xm3
> + paddd xm7, xm2
> + punpckldq xm7, xm7, xm3
> + punpckhqdq xm2, xm7, xm3
> + paddd xm7, xm2
> +%endif
> movq [r4], xm4
> movq [r6], xm7
> RET
> @@ -16187,119 +16220,89 @@
> vpxor m3, m3
> vpxor m7, m7 ;ac_k
> .row:
> +%if HIGH_BIT_DEPTH
> ;Col 1-8
> -%if HIGH_BIT_DEPTH
> vpmovzxwd m0, [r0] ;fenc
> vpmovzxwd m1, [r2] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0]
> - vpmovzxbd m1, [r2]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 9-16
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 16] ;fenc
> - vpmovzxwd m1, [r2 + 16] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 8]
> - vpmovzxbd m1, [r2 + 8]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> + vpmovzxwd m0, [r0 + 16]
> + vpmovzxwd m1, [r2 + 16]
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 17-24
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 32] ;fenc
> - vpmovzxwd m1, [r2 + 32] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 16]
> - vpmovzxbd m1, [r2 + 16]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> + vpmovzxwd m0, [r0 + 32]
> + vpmovzxwd m1, [r2 + 32]
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 25-32
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 48] ;fenc
> - vpmovzxwd m1, [r2 + 48] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 24]
> - vpmovzxbd m1, [r2 + 24]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> + vpmovzxwd m0, [r0 + 48]
> + vpmovzxwd m1, [r2 + 48]
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 33-40
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 64] ;fenc
> - vpmovzxwd m1, [r2 + 64] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 32]
> - vpmovzxbd m1, [r2 + 32]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> + vpmovzxwd m0, [r0 + 64]
> + vpmovzxwd m1, [r2 + 64]
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 41-48
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 80] ;fenc
> - vpmovzxwd m1, [r2 + 80] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 40]
> - vpmovzxbd m1, [r2 + 40]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> + vpmovzxwd m0, [r0 + 80]
> + vpmovzxwd m1, [r2 + 80]
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 49-56
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 96] ;fenc
> - vpmovzxwd m1, [r2 + 96] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 48]
> - vpmovzxbd m1, [r2 + 48]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> + vpmovzxwd m0, [r0 + 96]
> + vpmovzxwd m1, [r2 + 96]
> +
> + SSIM_DIST_HIGH m0, m1
>
> ;Col 57-64
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 112] ;fenc
> - vpmovzxwd m1, [r2 + 112] ;recon
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 56]
> - vpmovzxbd m1, [r2 + 56]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - SSIM_RD_COL m0, m1
> -
> -%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 112]
> + vpmovzxwd m1, [r2 + 112]
> +
> + SSIM_DIST_HIGH m0, m1
> +
> lea r0, [r0 + 2 * r1]
> lea r2, [r2 + 2 * r3]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> + vpmovzxbw m0, [r0] ;fenc
> + vpmovzxbw m1, [r2] ;recon
> +
> + SSIM_DIST_LOW m0, m1
> +
> +;col 17-32
> + vpmovzxbw m0, [r0 + 16]
> + vpmovzxbw m1, [r2 + 16]
> +
> + SSIM_DIST_LOW m0, m1
> +
> +;col 33-48
> + vpmovzxbw m0, [r0 + 32]
> + vpmovzxbw m1, [r2 + 32]
> +
> + SSIM_DIST_LOW m0, m1
> +
> +;col 49-64
> + vpmovzxbw m0, [r0 + 48]
> + vpmovzxbw m1, [r2 + 48]
> +
> + SSIM_DIST_LOW m0, m1
> +
> lea r0, [r0 + r1]
> lea r2, [r2 + r3]
> %endif
> dec r5d
> jnz .row
> +
> +%if HIGH_BIT_DEPTH
> vextracti128 xm5, m4, 1
> vpaddq xm4, xm5
> punpckhqdq xm2, xm4, xm3
> @@ -16309,7 +16312,23 @@
> vpaddq xm7, xm5
> punpckhqdq xm2, xm7, xm3
> paddq xm7, xm2
> -
> +%else
> + vextracti128 xm5, m4, 1
> + vpaddd xm4, xm5
> + punpckhqdq xm2, xm4, xm3
> + paddd xm4, xm2
> + punpckldq xm4, xm4, xm3
> + punpckhqdq xm2, xm4, xm3
> + paddd xm4, xm2
> +
> + vextracti128 xm5, m7, 1
> + vpaddd xm7, xm5
> + punpckhqdq xm2, xm7, xm3
> + paddd xm7, xm2
> + punpckldq xm7, xm7, xm3
> + punpckhqdq xm2, xm7, xm3
> + paddd xm7, xm2
> +%endif
> movq [r4], xm4
> movq [r6], xm7
> RET
> @@ -16344,7 +16363,7 @@
> %error Unsupported BIT_DEPTH!
> %endif
>
> - NORM_FACT_COL m0
> + NORM_FACT_HIGH m0
>
> %if HIGH_BIT_DEPTH
> lea r0, [r0 + 2 * r1]
> @@ -16367,39 +16386,45 @@
> vpxor m3, m3 ;z_k
> vpxor m5, m5
> .row:
> +%if HIGH_BIT_DEPTH
> ;Col 1-8
> -%if HIGH_BIT_DEPTH
> vpmovzxwd m0, [r0] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> +
> + NORM_FACT_HIGH m0
>
> ;Col 9-16
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 16] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 8]
> -%else
> + vpmovzxwd m0, [r0 + 16]
> +
> + NORM_FACT_HIGH m0
> +
> + lea r0, [r0 + 2 * r1]
> +%elif BIT_DEPTH == 8
> +;col 1-16
> + vpmovzxbw m0, [r0] ;src
> +
> + NORM_FACT_LOW m0
> +
> + lea r0, [r0 + r1]
> +%else
> %error Unsupported BIT_DEPTH!
> %endif
> -
> - NORM_FACT_COL m0
> -
> -%if HIGH_BIT_DEPTH
> - lea r0, [r0 + 2 * r1]
> -%else
> - lea r0, [r0 + r1]
> -%endif
> dec r4d
> jnz .row
> +
> +%if HIGH_BIT_DEPTH
> vextracti128 xm4, m3, 1
> vpaddq xm3, xm4
> punpckhqdq xm2, xm3, xm5
> paddq xm3, xm2
> +%else
> + vextracti128 xm4, m3, 1
> + vpaddd xm3, xm4
> + punpckhqdq xm2, xm3, xm5
> + paddd xm3, xm2
> + punpckldq xm3, xm3, xm5
> + punpckhqdq xm2, xm3, xm5
> + paddd xm3, xm2
> +%endif
> movq [r3], xm3
> RET
>
> @@ -16410,61 +16435,59 @@
> vpxor m3, m3 ;z_k
> vpxor m5, m5
> .row:
> +%if HIGH_BIT_DEPTH
> ;Col 1-8
> -%if HIGH_BIT_DEPTH
> vpmovzxwd m0, [r0] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> +
> + NORM_FACT_HIGH m0
>
> ;Col 9-16
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 16] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 8]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> + vpmovzxwd m0, [r0 + 16]
> +
> + NORM_FACT_HIGH m0
>
> ;Col 17-24
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 32] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 16]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> + vpmovzxwd m0, [r0 + 32]
> +
> + NORM_FACT_HIGH m0
>
> ;Col 25-32
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 48] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 24]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> -
> -%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 48]
> +
> + NORM_FACT_HIGH m0
> +
> lea r0, [r0 + 2 * r1]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> + vpmovzxbw m0, [r0] ;src
> +
> + NORM_FACT_LOW m0
> +;col 17-32
> + vpmovzxbw m0, [r0 + 16]
> +
> + NORM_FACT_LOW m0
> +
> lea r0, [r0 + r1]
> +%else
> + %error Unsupported BIT_DEPTH!
> %endif
> dec r4d
> jnz .row
> +
> +%if HIGH_BIT_DEPTH
> vextracti128 xm4, m3, 1
> vpaddq xm3, xm4
> punpckhqdq xm2, xm3, xm5
> paddq xm3, xm2
> +%else
> + vextracti128 xm4, m3, 1
> + vpaddd xm3, xm4
> + punpckhqdq xm2, xm3, xm5
> + paddd xm3, xm2
> + punpckldq xm3, xm3, xm5
> + punpckhqdq xm2, xm3, xm5
> + paddd xm3, xm2
> +%endif
> movq [r3], xm3
> RET
>
> @@ -16475,104 +16498,86 @@
> vpxor m3, m3 ;z_k
> vpxor m5, m5
> .row:
> +%if HIGH_BIT_DEPTH
> ;Col 1-8
> -%if HIGH_BIT_DEPTH
> vpmovzxwd m0, [r0] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> +
> + NORM_FACT_HIGH m0
>
> ;Col 9-16
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 16] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 8]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> + vpmovzxwd m0, [r0 + 16]
> +
> + NORM_FACT_HIGH m0
>
> ;Col 17-24
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 32] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 16]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> + vpmovzxwd m0, [r0 + 32]
> +
> + NORM_FACT_HIGH m0
>
> ;Col 25-32
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 48] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 24]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> + vpmovzxwd m0, [r0 + 48]
> +
> + NORM_FACT_HIGH m0
>
> ;Col 33-40
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 64] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 32]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> + vpmovzxwd m0, [r0 + 64]
> +
> + NORM_FACT_HIGH m0
>
> ;Col 41-48
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 80] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 40]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> + vpmovzxwd m0, [r0 + 80]
> +
> + NORM_FACT_HIGH m0
>
> ;Col 49-56
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 96] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 48]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> + vpmovzxwd m0, [r0 + 96]
> +
> + NORM_FACT_HIGH m0
>
> ;Col 57-64
> -%if HIGH_BIT_DEPTH
> - vpmovzxwd m0, [r0 + 112] ;src
> -%elif BIT_DEPTH == 8
> - vpmovzxbd m0, [r0 + 56]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> - NORM_FACT_COL m0
> -
> -%if HIGH_BIT_DEPTH
> + vpmovzxwd m0, [r0 + 112]
> +
> + NORM_FACT_HIGH m0
> +
> lea r0, [r0 + 2 * r1]
> -%else
> +%elif BIT_DEPTH == 8
> +;col 1-16
> + vpmovzxbw m0, [r0] ;src
> +
> + NORM_FACT_LOW m0
> +;col 17-32
> + vpmovzxbw m0, [r0 + 16]
> +
> + NORM_FACT_LOW m0
> +;col 33-48
> + vpmovzxbw m0, [r0 + 32]
> +
> + NORM_FACT_LOW m0
> +;col 49-64
> + vpmovzxbw m0, [r0 + 48]
> +
> + NORM_FACT_LOW m0
> +
> lea r0, [r0 + r1]
> +%else
> + %error Unsupported BIT_DEPTH!
> %endif
> dec r4d
> jnz .row
> +
> +%if HIGH_BIT_DEPTH
> vextracti128 xm4, m3, 1
> vpaddq xm3, xm4
> punpckhqdq xm2, xm3, xm5
> paddq xm3, xm2
> +%else
> + vextracti128 xm4, m3, 1
> + vpaddd xm3, xm4
> + punpckhqdq xm2, xm3, xm5
> + paddd xm3, xm2
> + punpckldq xm3, xm3, xm5
> + punpckhqdq xm2, xm3, xm5
> + paddd xm3, xm2
> +%endif
> movq [r3], xm3
> RET
>
>
> --
> *Regards,*
> *Akil R*
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190408/b08836b7/attachment-0001.html>
More information about the x265-devel
mailing list