[x265] [PATCH 272 of 307] x86: AVX512 nonPsyRdoQuant optimise load and floating point multiplications

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:30 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1514520036 -19800
#      Fri Dec 29 09:30:36 2017 +0530
# Node ID ca3c04bd0a71bb263b8084283acce012f0cc397c
# Parent  fd28f49cb7b30aab97105a59ec841812af205cb9
x86: AVX512 nonPsyRdoQuant optimise load and floating point multiplications
Performance: 3.7-4.2x over C code

diff -r fd28f49cb7b3 -r ca3c04bd0a71 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Mon Dec 18 13:51:44 2017 +0530
+++ b/source/common/x86/dct8.asm	Fri Dec 29 09:30:36 2017 +0530
@@ -511,22 +511,11 @@
 tab_idct8_2:    times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
                 times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
 pb_idct8odd:    db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
-;Transform shift and scale bits table for rdoQuant
-tab_nonpsyRdo8 : dq 5, 5
-                 dq 4, 7
-                 dq 3, 9
-                 dq 2, 11
-
-tab_nonpsyRdo10: dq 3, 9
-                 dq 2, 11
-                 dq 1, 13
-                 dq 0, 15
-
-tab_nonpsyRdo12: dq 1, 13
-                 dq 0, 15
-                 dq -1, 17
-                 dq -2, 19
-
+
+;Scale bits table for rdoQuant: one qword per kernel (nonPsyRdoQuant4/8/16/32), used as the left-shift count for costUncoded
+tab_nonpsyRdo8 : dq 5, 7, 9, 11
+tab_nonpsyRdo10: dq 9, 11, 13, 15
+tab_nonpsyRdo12: dq 13, 15, 17, 19
 SECTION .text
 cextern pd_1
 cextern pd_2
@@ -6437,64 +6426,43 @@
 ; void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
 ;---------------------------------------------------------------------------------------------------------------------------------------------------------
 INIT_ZMM avx512
-cglobal nonPsyRdoQuant4, 5, 8, 8
-
+cglobal nonPsyRdoQuant4, 5, 5, 8
     mov            r4d,        r4m
     lea             r0,        [r0 + 2 * r4]
-    lea             r7,        [4 * r4]
-    lea             r1,        [r1 + 2 * r7]
-
+    lea             r4,        [4 * r4]
+    lea             r1,        [r1 + 2 * r4]
 %if BIT_DEPTH == 12
-    mov            r5q,        [tab_nonpsyRdo12]                 ; transformShift
-    mov            r6q,        [tab_nonpsyRdo12 + 8]             ; scaleBits
+    mov             r4,        [tab_nonpsyRdo12]
 %elif BIT_DEPTH == 10
-    mov            r5q,        [tab_nonpsyRdo10]
-    mov            r6q,        [tab_nonpsyRdo10 + 8]
+    mov             r4,        [tab_nonpsyRdo10]
 %elif BIT_DEPTH == 8
-    mov            r5q,        [tab_nonpsyRdo8]
-    mov            r6q,        [tab_nonpsyRdo8 + 8]
+    mov             r4,        [tab_nonpsyRdo8]
 %else
     %error Unsupported BIT_DEPTH!
  %endif
-
-    movq           xm3,        r6
+    movq           xm3,        r4
     movq           xm6,        [r2]
     movq           xm7,        [r3]
     vpxor           m4,        m4
     vpxor           m5,        m5
-
 ;Row 1, 2
-    movq           xm0,        [r0]
-    pinsrq         xm0,        [r0 + 8],       1
+    movu           xm0,        [r0]
     vpmovsxwq      m1,         xm0
     vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
-
-    vfmadd132pd    m2,         m2,             m5              ; Multiply packed double-precision (64-bit) floating-point elements
-    vfmadd213pd    m2,         m2,             m5
-    vfmadd231pd    m2,         m2,             m5
-
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 is zero): square the packed double-precision elements
     vcvtpd2qq      m1,         m2
     vpsllq         m1,         xm3                              ; costUncoded
     paddq          m4,         m1
-    movu           [r1],       ym1
-    vextracti32x8  [r1 + 32],  m1 ,     1
-
+    movu           [r1],       m1
     ;Row 3, 4
-    movq           xm0,        [r0 + 16]
-    pinsrq         xm0,        [r0 + 24],      1
+    movu           xm0,        [r0 + 16]
     vpmovsxwq      m1,         xm0
     vcvtqq2pd      m2,         m1
-
-    vfmadd132pd    m2,         m2,             m5
     vfmadd213pd    m2,         m2,             m5
-    vfmadd231pd    m2,         m2,             m5
-
     vcvtpd2qq      m1,         m2
     vpsllq         m1,         xm3                              ; costUncoded
     paddq          m4,         m1
-    movu           [r1 + 64],  ym1
-    vextracti32x8  [r1 + 96],  m1 ,            1
-
+    movu           [r1 + 64],  m1
     vextracti32x8  ym2,        m4,             1
     paddq          ym4,        ym2
     vextracti32x4  xm2,        m4,             1
@@ -6508,29 +6476,22 @@
     movq           [r2],       xm6
     movq           [r3],       xm7
     RET
-
 INIT_ZMM avx512
-cglobal nonPsyRdoQuant8, 5, 8, 8
-
+cglobal nonPsyRdoQuant8, 5, 5, 8
     mov            r4d,        r4m
     lea             r0,        [r0 + 2 * r4]
-    lea             r7,        [4 * r4]
-    lea             r1,        [r1 + 2 * r7]
-
+    lea             r4,        [4 * r4]
+    lea             r1,        [r1 + 2 * r4]
 %if BIT_DEPTH == 12
-    mov            r5q,        [tab_nonpsyRdo12 + 16]             ; transformShift
-    mov            r6q,        [tab_nonpsyRdo12 + 24]             ; scaleBits
+    mov             r4,        [tab_nonpsyRdo12 + 8]
 %elif BIT_DEPTH == 10
-    mov            r5q,        [tab_nonpsyRdo10 + 16]
-    mov            r6q,        [tab_nonpsyRdo10 + 24]
+    mov             r4,        [tab_nonpsyRdo10 + 8]
 %elif BIT_DEPTH == 8
-    mov            r5q,        [tab_nonpsyRdo8 + 16]
-    mov            r6q,        [tab_nonpsyRdo8 + 24]
+    mov             r4,        [tab_nonpsyRdo8 + 8]
 %else
     %error Unsupported BIT_DEPTH!
  %endif
-
-    movq           xm3,        r6
+    movq           xm3,        r4
     movq           xm6,        [r2]
     movq           xm7,        [r3]
     vpxor           m4,        m4
@@ -6541,11 +6502,7 @@
     pinsrq         xm0,        [r0 + mmsize/4], 1
     vpmovsxwq      m1,         xm0
     vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
-
-    vfmadd132pd    m2,         m2,             m5              ; Multiply packed double-precision (64-bit) floating-point elements
-    vfmadd213pd    m2,         m2,             m5
-    vfmadd231pd    m2,         m2,             m5
-
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5: fused multiply-add that squares the packed doubles when m5 is zero
     vcvtpd2qq      m1,         m2
     vpsllq         m1,         xm3                              ; costUncoded
     paddq          m4,         m1
@@ -6557,11 +6514,7 @@
     pinsrq         xm0,        [r0 + 3 * mmsize/4],      1
     vpmovsxwq      m1,         xm0
     vcvtqq2pd      m2,         m1
-
-    vfmadd132pd    m2,         m2,             m5
     vfmadd213pd    m2,         m2,             m5
-    vfmadd231pd    m2,         m2,             m5
-
     vcvtpd2qq      m1,         m2
     vpsllq         m1,         xm3                              ; costUncoded
     paddq          m4,         m1
@@ -6581,29 +6534,22 @@
     movq           [r2],       xm6
     movq           [r3],       xm7
     RET
-
 INIT_ZMM avx512
-cglobal nonPsyRdoQuant16, 5, 8, 8
-
+cglobal nonPsyRdoQuant16, 5, 5, 8
     mov            r4d,        r4m
     lea             r0,        [r0 + 2 * r4]
-    lea             r7,        [4 * r4]
-    lea             r1,        [r1 + 2 * r7]
-
+    lea             r4,        [4 * r4]
+    lea             r1,        [r1 + 2 * r4]
 %if BIT_DEPTH == 12
-    mov            r5q,        [tab_nonpsyRdo12 + 32]             ; transformShift
-    mov            r6q,        [tab_nonpsyRdo12 + 40]             ; scaleBits
+    mov             r4,        [tab_nonpsyRdo12 + 16]
 %elif BIT_DEPTH == 10
-    mov            r5q,        [tab_nonpsyRdo10 + 32]
-    mov            r6q,        [tab_nonpsyRdo10 + 40]
+    mov             r4,        [tab_nonpsyRdo10 + 16]
 %elif BIT_DEPTH == 8
-    mov            r5q,        [tab_nonpsyRdo8 + 32]
-    mov            r6q,        [tab_nonpsyRdo8 + 40]
+    mov             r4,        [tab_nonpsyRdo8 + 16]
 %else
     %error Unsupported BIT_DEPTH!
  %endif
-
-    movq           xm3,        r6
+    movq           xm3,        r4
     movq           xm6,        [r2]
     movq           xm7,        [r3]
     vpxor           m4,        m4
@@ -6614,11 +6560,7 @@
     pinsrq         xm0,        [r0 + mmsize/2],       1
     vpmovsxwq      m1,         xm0
     vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
-
-    vfmadd132pd    m2,         m2,             m5              ; Multiply packed double-precision (64-bit) floating-point elements
-    vfmadd213pd    m2,         m2,             m5
-    vfmadd231pd    m2,         m2,             m5
-
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5: fused multiply-add that squares the packed doubles when m5 is zero
     vcvtpd2qq      m1,         m2
     vpsllq         m1,         xm3                              ; costUncoded
     paddq          m4,         m1
@@ -6630,11 +6572,7 @@
     pinsrq         xm0,        [r0 + 3 * mmsize/2],      1
     vpmovsxwq      m1,         xm0
     vcvtqq2pd      m2,         m1
-
-    vfmadd132pd    m2,         m2,             m5
     vfmadd213pd    m2,         m2,             m5
-    vfmadd231pd    m2,         m2,             m5
-
     vcvtpd2qq      m1,         m2
     vpsllq         m1,         xm3                              ; costUncoded
     paddq          m4,         m1
@@ -6654,29 +6592,22 @@
     movq           [r2],       xm6
     movq           [r3],       xm7
     RET
-
 INIT_ZMM avx512
-cglobal nonPsyRdoQuant32, 5, 8, 8
-
+cglobal nonPsyRdoQuant32, 5, 5, 8
     mov            r4d,        r4m
     lea             r0,        [r0 + 2 * r4]
-    lea             r7,        [4 * r4]
-    lea             r1,        [r1 + 2 * r7]
-
+    lea             r4,        [4 * r4]
+    lea             r1,        [r1 + 2 * r4]
 %if BIT_DEPTH == 12
-    mov            r5q,        [tab_nonpsyRdo12 + 48]             ; transformShift
-    mov            r6q,        [tab_nonpsyRdo12 + 56]             ; scaleBits
+    mov             r4,        [tab_nonpsyRdo12 + 24]
 %elif BIT_DEPTH == 10
-    mov            r5q,        [tab_nonpsyRdo10 + 48]
-    mov            r6q,        [tab_nonpsyRdo10 + 56]
+    mov             r4,        [tab_nonpsyRdo10 + 24]
 %elif BIT_DEPTH == 8
-    mov            r5q,        [tab_nonpsyRdo8 + 48]
-    mov            r6q,        [tab_nonpsyRdo8 + 56]
+    mov             r4,        [tab_nonpsyRdo8 + 24]
 %else
     %error Unsupported BIT_DEPTH!
  %endif
-
-    movq           xm3,        r6
+    movq           xm3,        r4
     movq           xm6,        [r2]
     movq           xm7,        [r3]
     vpxor           m4,        m4
@@ -6687,11 +6618,7 @@
     pinsrq         xm0,        [r0 + mmsize],  1
     vpmovsxwq      m1,         xm0
     vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
-
-    vfmadd132pd    m2,         m2,             m5              ; Multiply packed double-precision (64-bit) floating-point elements
-    vfmadd213pd    m2,         m2,             m5
-    vfmadd231pd    m2,         m2,             m5
-
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5: fused multiply-add that squares the packed doubles when m5 is zero
     vcvtpd2qq      m1,         m2
     vpsllq         m1,         xm3                              ; costUncoded
     paddq          m4,         m1
@@ -6703,11 +6630,7 @@
     pinsrq         xm0,        [r0 + 3 * mmsize],      1
     vpmovsxwq      m1,         xm0
     vcvtqq2pd      m2,         m1
-
-    vfmadd132pd    m2,         m2,             m5
     vfmadd213pd    m2,         m2,             m5
-    vfmadd231pd    m2,         m2,             m5
-
     vcvtpd2qq      m1,         m2
     vpsllq         m1,         xm3                              ; costUncoded
     paddq          m4,         m1


More information about the x265-devel mailing list