[x265] [PATCH x265] SSIM-RD: 8-bit AVX2 performance improvement
Akil
akil at multicorewareinc.com
Fri Apr 5 12:03:22 CEST 2019
# HG changeset patch
# User Akil Ayyappan<akil at multicorewareinc.com>
# Date 1554365158 -19800
# Thu Apr 04 13:35:58 2019 +0530
# Node ID e7a726d1ca84d59f85cfafb428b8ffc4b9eb7000
# Parent b36242b9f354b8773e38674b876b0ca5dfc35ad2
SSIM-RD : 8-bit AVX2 performance improvement
ssimDistortion
[16x16] 5.44x => 13.52x
[32x32] 6.01x => 18.99x
[64x64] 6.70x => 20.78x
normFactor
[16x16] 8.42x => 17.96x
[32x32] 9.56x => 29.12x
[64x64] 8.96x => 25.29x
diff -r b36242b9f354 -r e7a726d1ca84 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Apr 02 15:01:12 2019 +0530
+++ b/source/common/x86/pixel-a.asm Thu Apr 04 13:35:58 2019 +0530
@@ -370,7 +370,7 @@
RET
%endmacro
-%macro SSIM_RD_COL 2
+%macro SSIM_DIST_HIGH 2
vpsrld m6, m0, SSIMRD_SHIFT
vpsubd m0, m1
@@ -388,7 +388,7 @@
vpaddq m7, m6
%endmacro
-%macro NORM_FACT_COL 1
+%macro NORM_FACT_HIGH 1
vpsrld m1, m0, SSIMRD_SHIFT
vpmuldq m2, m1, m1
vpsrldq m1, m1, 4
@@ -398,6 +398,23 @@
vpaddq m3, m1
%endmacro
+%macro SSIM_DIST_LOW 2
+ vpsrlw m6, m0, SSIMRD_SHIFT
+ vpsubw m0, m1
+
+ vpmaddwd m0, m0, m0
+ vpmaddwd m6, m6, m6
+
+ vpaddd m4, m0
+ vpaddd m7, m6
+%endmacro
+
+%macro NORM_FACT_LOW 1
+ vpsrlw m1, m0, SSIMRD_SHIFT
+ vpmaddwd m1, m1, m1
+ vpaddd m3, m1
+%endmacro
+
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
@@ -16014,7 +16031,7 @@
%error Unsupported BIT_DEPTH!
%endif
- SSIM_RD_COL m0, m1
+ SSIM_DIST_HIGH m0, m1
%if HIGH_BIT_DEPTH
lea r0, [r0 + 2 * r1]
@@ -16047,41 +16064,37 @@
vpxor m3, m3
vpxor m7, m7 ;ac_k
.row:
+%if HIGH_BIT_DEPTH
;Col 1-8
-%if HIGH_BIT_DEPTH
vpmovzxwd m0, [r0] ;fenc
vpmovzxwd m1, [r2] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0]
- vpmovzxbd m1, [r2]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+
+ SSIM_DIST_HIGH m0, m1
;Col 9-16
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 16] ;fenc
- vpmovzxwd m1, [r2 + 16] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 8]
- vpmovzxbd m1, [r2 + 8]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
-
-%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 16]
+ vpmovzxwd m1, [r2 + 16]
+
+ SSIM_DIST_HIGH m0, m1
+
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
-%else
+%elif BIT_DEPTH == 8
+;col 1- 16
+ vpmovzxbw m0, [r0] ;fenc
+ vpmovzxbw m1, [r2] ;recon
+
+ SSIM_DIST_LOW m0, m1
+
lea r0, [r0 + r1]
lea r2, [r2 + r3]
+%else
+ %error Unsupported BIT_DEPTH!
%endif
dec r5d
jnz .row
+
+%if HIGH_BIT_DEPTH
vextracti128 xm5, m4, 1
vpaddq xm4, xm5
punpckhqdq xm2, xm4, xm3
@@ -16091,7 +16104,23 @@
vpaddq xm7, xm5
punpckhqdq xm2, xm7, xm3
paddq xm7, xm2
-
+%else
+ vextracti128 xm5, m4, 1
+ vpaddd xm4, xm5
+ punpckhqdq xm2, xm4, xm3
+ paddd xm4, xm2
+ punpckldq xm4, xm4, xm3
+ punpckhqdq xm2, xm4, xm3
+ paddd xm4, xm2
+
+ vextracti128 xm5, m7, 1
+ vpaddd xm7, xm5
+ punpckhqdq xm2, xm7, xm3
+ paddd xm7, xm2
+ punpckldq xm7, xm7, xm3
+ punpckhqdq xm2, xm7, xm3
+ paddd xm7, xm2
+%endif
movq [r4], xm4
movq [r6], xm7
RET
@@ -16104,67 +16133,55 @@
vpxor m3, m3
vpxor m7, m7 ;ac_k
.row:
+%if HIGH_BIT_DEPTH
;Col 1-8
-%if HIGH_BIT_DEPTH
vpmovzxwd m0, [r0] ;fenc
vpmovzxwd m1, [r2] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0]
- vpmovzxbd m1, [r2]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+
+ SSIM_DIST_HIGH m0, m1
;Col 9-16
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 16] ;fenc
- vpmovzxwd m1, [r2 + 16] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 8]
- vpmovzxbd m1, [r2 + 8]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+ vpmovzxwd m0, [r0 + 16]
+ vpmovzxwd m1, [r2 + 16]
+
+ SSIM_DIST_HIGH m0, m1
;Col 17-24
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 32] ;fenc
- vpmovzxwd m1, [r2 + 32] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 16]
- vpmovzxbd m1, [r2 + 16]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+ vpmovzxwd m0, [r0 + 32]
+ vpmovzxwd m1, [r2 + 32]
+
+ SSIM_DIST_HIGH m0, m1
;Col 25-32
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 48] ;fenc
- vpmovzxwd m1, [r2 + 48] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 24]
- vpmovzxbd m1, [r2 + 24]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
-
-%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 48]
+ vpmovzxwd m1, [r2 + 48]
+
+ SSIM_DIST_HIGH m0, m1
+
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
-%else
+%elif BIT_DEPTH == 8
+;col 1-16
+ vpmovzxbw m0, [r0] ;fenc
+ vpmovzxbw m1, [r2] ;recon
+
+ SSIM_DIST_LOW m0, m1
+
+;col 17-32
+ vpmovzxbw m0, [r0 + 16]
+ vpmovzxbw m1, [r2 + 16]
+
+ SSIM_DIST_LOW m0, m1
+
lea r0, [r0 + r1]
lea r2, [r2 + r3]
+%else
+ %error Unsupported BIT_DEPTH!
%endif
dec r5d
jnz .row
+
+%if HIGH_BIT_DEPTH
vextracti128 xm5, m4, 1
vpaddq xm4, xm5
punpckhqdq xm2, xm4, xm3
@@ -16174,7 +16191,23 @@
vpaddq xm7, xm5
punpckhqdq xm2, xm7, xm3
paddq xm7, xm2
-
+%else
+ vextracti128 xm5, m4, 1
+ vpaddd xm4, xm5
+ punpckhqdq xm2, xm4, xm3
+ paddd xm4, xm2
+ punpckldq xm4, xm4, xm3
+ punpckhqdq xm2, xm4, xm3
+ paddd xm4, xm2
+
+ vextracti128 xm5, m7, 1
+ vpaddd xm7, xm5
+ punpckhqdq xm2, xm7, xm3
+ paddd xm7, xm2
+ punpckldq xm7, xm7, xm3
+ punpckhqdq xm2, xm7, xm3
+ paddd xm7, xm2
+%endif
movq [r4], xm4
movq [r6], xm7
RET
@@ -16187,119 +16220,89 @@
vpxor m3, m3
vpxor m7, m7 ;ac_k
.row:
+%if HIGH_BIT_DEPTH
;Col 1-8
-%if HIGH_BIT_DEPTH
vpmovzxwd m0, [r0] ;fenc
vpmovzxwd m1, [r2] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0]
- vpmovzxbd m1, [r2]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+
+ SSIM_DIST_HIGH m0, m1
;Col 9-16
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 16] ;fenc
- vpmovzxwd m1, [r2 + 16] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 8]
- vpmovzxbd m1, [r2 + 8]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+ vpmovzxwd m0, [r0 + 16]
+ vpmovzxwd m1, [r2 + 16]
+
+ SSIM_DIST_HIGH m0, m1
;Col 17-24
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 32] ;fenc
- vpmovzxwd m1, [r2 + 32] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 16]
- vpmovzxbd m1, [r2 + 16]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+ vpmovzxwd m0, [r0 + 32]
+ vpmovzxwd m1, [r2 + 32]
+
+ SSIM_DIST_HIGH m0, m1
;Col 25-32
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 48] ;fenc
- vpmovzxwd m1, [r2 + 48] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 24]
- vpmovzxbd m1, [r2 + 24]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+ vpmovzxwd m0, [r0 + 48]
+ vpmovzxwd m1, [r2 + 48]
+
+ SSIM_DIST_HIGH m0, m1
;Col 33-40
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 64] ;fenc
- vpmovzxwd m1, [r2 + 64] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 32]
- vpmovzxbd m1, [r2 + 32]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+ vpmovzxwd m0, [r0 + 64]
+ vpmovzxwd m1, [r2 + 64]
+
+ SSIM_DIST_HIGH m0, m1
;Col 41-48
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 80] ;fenc
- vpmovzxwd m1, [r2 + 80] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 40]
- vpmovzxbd m1, [r2 + 40]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+ vpmovzxwd m0, [r0 + 80]
+ vpmovzxwd m1, [r2 + 80]
+
+ SSIM_DIST_HIGH m0, m1
;Col 49-56
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 96] ;fenc
- vpmovzxwd m1, [r2 + 96] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 48]
- vpmovzxbd m1, [r2 + 48]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
+ vpmovzxwd m0, [r0 + 96]
+ vpmovzxwd m1, [r2 + 96]
+
+ SSIM_DIST_HIGH m0, m1
;Col 57-64
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 112] ;fenc
- vpmovzxwd m1, [r2 + 112] ;recon
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 56]
- vpmovzxbd m1, [r2 + 56]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- SSIM_RD_COL m0, m1
-
-%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 112]
+ vpmovzxwd m1, [r2 + 112]
+
+ SSIM_DIST_HIGH m0, m1
+
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
-%else
+%elif BIT_DEPTH == 8
+;col 1-16
+ vpmovzxbw m0, [r0] ;fenc
+ vpmovzxbw m1, [r2] ;recon
+
+ SSIM_DIST_LOW m0, m1
+
+;col 17-32
+ vpmovzxbw m0, [r0 + 16]
+ vpmovzxbw m1, [r2 + 16]
+
+ SSIM_DIST_LOW m0, m1
+
+;col 33-48
+ vpmovzxbw m0, [r0 + 32]
+ vpmovzxbw m1, [r2 + 32]
+
+ SSIM_DIST_LOW m0, m1
+
+;col 49-64
+ vpmovzxbw m0, [r0 + 48]
+ vpmovzxbw m1, [r2 + 48]
+
+ SSIM_DIST_LOW m0, m1
+
lea r0, [r0 + r1]
lea r2, [r2 + r3]
%endif
dec r5d
jnz .row
+
+%if HIGH_BIT_DEPTH
vextracti128 xm5, m4, 1
vpaddq xm4, xm5
punpckhqdq xm2, xm4, xm3
@@ -16309,7 +16312,23 @@
vpaddq xm7, xm5
punpckhqdq xm2, xm7, xm3
paddq xm7, xm2
-
+%else
+ vextracti128 xm5, m4, 1
+ vpaddd xm4, xm5
+ punpckhqdq xm2, xm4, xm3
+ paddd xm4, xm2
+ punpckldq xm4, xm4, xm3
+ punpckhqdq xm2, xm4, xm3
+ paddd xm4, xm2
+
+ vextracti128 xm5, m7, 1
+ vpaddd xm7, xm5
+ punpckhqdq xm2, xm7, xm3
+ paddd xm7, xm2
+ punpckldq xm7, xm7, xm3
+ punpckhqdq xm2, xm7, xm3
+ paddd xm7, xm2
+%endif
movq [r4], xm4
movq [r6], xm7
RET
@@ -16344,7 +16363,7 @@
%error Unsupported BIT_DEPTH!
%endif
- NORM_FACT_COL m0
+ NORM_FACT_HIGH m0
%if HIGH_BIT_DEPTH
lea r0, [r0 + 2 * r1]
@@ -16367,39 +16386,45 @@
vpxor m3, m3 ;z_k
vpxor m5, m5
.row:
+%if HIGH_BIT_DEPTH
;Col 1-8
-%if HIGH_BIT_DEPTH
vpmovzxwd m0, [r0] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+
+ NORM_FACT_HIGH m0
;Col 9-16
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 16] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 8]
-%else
+ vpmovzxwd m0, [r0 + 16]
+
+ NORM_FACT_HIGH m0
+
+ lea r0, [r0 + 2 * r1]
+%elif BIT_DEPTH == 8
+;col 1-16
+ vpmovzxbw m0, [r0] ;src
+
+ NORM_FACT_LOW m0
+
+ lea r0, [r0 + r1]
+%else
%error Unsupported BIT_DEPTH!
%endif
-
- NORM_FACT_COL m0
-
-%if HIGH_BIT_DEPTH
- lea r0, [r0 + 2 * r1]
-%else
- lea r0, [r0 + r1]
-%endif
dec r4d
jnz .row
+
+%if HIGH_BIT_DEPTH
vextracti128 xm4, m3, 1
vpaddq xm3, xm4
punpckhqdq xm2, xm3, xm5
paddq xm3, xm2
+%else
+ vextracti128 xm4, m3, 1
+ vpaddd xm3, xm4
+ punpckhqdq xm2, xm3, xm5
+ paddd xm3, xm2
+ punpckldq xm3, xm3, xm5
+ punpckhqdq xm2, xm3, xm5
+ paddd xm3, xm2
+%endif
movq [r3], xm3
RET
@@ -16410,61 +16435,59 @@
vpxor m3, m3 ;z_k
vpxor m5, m5
.row:
+%if HIGH_BIT_DEPTH
;Col 1-8
-%if HIGH_BIT_DEPTH
vpmovzxwd m0, [r0] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+
+ NORM_FACT_HIGH m0
;Col 9-16
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 16] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 8]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+ vpmovzxwd m0, [r0 + 16]
+
+ NORM_FACT_HIGH m0
;Col 17-24
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 32] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 16]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+ vpmovzxwd m0, [r0 + 32]
+
+ NORM_FACT_HIGH m0
;Col 25-32
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 48] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 24]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
-
-%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 48]
+
+ NORM_FACT_HIGH m0
+
lea r0, [r0 + 2 * r1]
-%else
+%elif BIT_DEPTH == 8
+;col 1-16
+ vpmovzxbw m0, [r0] ;src
+
+ NORM_FACT_LOW m0
+;col 17-32
+ vpmovzxbw m0, [r0 + 16]
+
+ NORM_FACT_LOW m0
+
lea r0, [r0 + r1]
+%else
+ %error Unsupported BIT_DEPTH!
%endif
dec r4d
jnz .row
+
+%if HIGH_BIT_DEPTH
vextracti128 xm4, m3, 1
vpaddq xm3, xm4
punpckhqdq xm2, xm3, xm5
paddq xm3, xm2
+%else
+ vextracti128 xm4, m3, 1
+ vpaddd xm3, xm4
+ punpckhqdq xm2, xm3, xm5
+ paddd xm3, xm2
+ punpckldq xm3, xm3, xm5
+ punpckhqdq xm2, xm3, xm5
+ paddd xm3, xm2
+%endif
movq [r3], xm3
RET
@@ -16475,104 +16498,86 @@
vpxor m3, m3 ;z_k
vpxor m5, m5
.row:
+%if HIGH_BIT_DEPTH
;Col 1-8
-%if HIGH_BIT_DEPTH
vpmovzxwd m0, [r0] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+
+ NORM_FACT_HIGH m0
;Col 9-16
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 16] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 8]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+ vpmovzxwd m0, [r0 + 16]
+
+ NORM_FACT_HIGH m0
;Col 17-24
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 32] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 16]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+ vpmovzxwd m0, [r0 + 32]
+
+ NORM_FACT_HIGH m0
;Col 25-32
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 48] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 24]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+ vpmovzxwd m0, [r0 + 48]
+
+ NORM_FACT_HIGH m0
;Col 33-40
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 64] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 32]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+ vpmovzxwd m0, [r0 + 64]
+
+ NORM_FACT_HIGH m0
;Col 41-48
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 80] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 40]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+ vpmovzxwd m0, [r0 + 80]
+
+ NORM_FACT_HIGH m0
;Col 49-56
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 96] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 48]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
+ vpmovzxwd m0, [r0 + 96]
+
+ NORM_FACT_HIGH m0
;Col 57-64
-%if HIGH_BIT_DEPTH
- vpmovzxwd m0, [r0 + 112] ;src
-%elif BIT_DEPTH == 8
- vpmovzxbd m0, [r0 + 56]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
- NORM_FACT_COL m0
-
-%if HIGH_BIT_DEPTH
+ vpmovzxwd m0, [r0 + 112]
+
+ NORM_FACT_HIGH m0
+
lea r0, [r0 + 2 * r1]
-%else
+%elif BIT_DEPTH == 8
+;col 1-16
+ vpmovzxbw m0, [r0] ;src
+
+ NORM_FACT_LOW m0
+;col 17-32
+ vpmovzxbw m0, [r0 + 16]
+
+ NORM_FACT_LOW m0
+;col 33-48
+ vpmovzxbw m0, [r0 + 32]
+
+ NORM_FACT_LOW m0
+;col 49-56
+ vpmovzxbw m0, [r0 + 48]
+
+ NORM_FACT_LOW m0
+
lea r0, [r0 + r1]
+%else
+ %error Unsupported BIT_DEPTH!
%endif
dec r4d
jnz .row
+
+%if HIGH_BIT_DEPTH
vextracti128 xm4, m3, 1
vpaddq xm3, xm4
punpckhqdq xm2, xm3, xm5
paddq xm3, xm2
+%else
+ vextracti128 xm4, m3, 1
+ vpaddd xm3, xm4
+ punpckhqdq xm2, xm3, xm5
+ paddd xm3, xm2
+ punpckldq xm3, xm3, xm5
+ punpckhqdq xm2, xm3, xm5
+ paddd xm3, xm2
+%endif
movq [r3], xm3
RET
--
*Regards,*
*Akil R*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190405/67dd3755/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ssim_rd_perf_improvement_final.patch
Type: application/x-patch
Size: 22248 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190405/67dd3755/attachment-0001.bin>
More information about the x265-devel
mailing list