[x265] [PATCH 007 of 307] x86: AVX-512 pixel_sa8d_8x8

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:05 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1498473664 -19800
#      Mon Jun 26 16:11:04 2017 +0530
# Node ID 03a532a9ab714b0081aede28e1773022d2be20b6
# Parent  c7b36dac20317b3819fb30cf437a029a2ce7ca99
x86: AVX-512 pixel_sa8d_8x8

diff -r c7b36dac2031 -r 03a532a9ab71 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jun 26 16:31:02 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jun 26 16:11:04 2017 +0530
@@ -3752,6 +3752,9 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512);
 
+        p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx512);
+
         p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
         p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
         p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
@@ -3759,6 +3762,7 @@
         p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx512);
         p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx512);
         p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512);
+
     }
 #endif
 }
diff -r c7b36dac2031 -r 03a532a9ab71 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Mon Jun 26 16:31:02 2017 +0530
+++ b/source/common/x86/pixel-a.asm	Mon Jun 26 16:11:04 2017 +0530
@@ -8187,7 +8187,7 @@
     HMAXABSW2         0, 1, 2, 3
 %endmacro
 
-%macro SATD_AVX512_END 0
+%macro SATD_AVX512_END 0-1 0 ; sa8d
     paddw          m0 {k1}{z}, m1 ; zero-extend to dwords
 %if ARCH_X86_64
 %if mmsize == 64
@@ -8202,10 +8202,19 @@
     paddd        xmm0, xmm1
     movq          rax, xmm0
     rorx          rdx, rax, 32
+%if %1
+    lea           eax, [rax+rdx+1]
+    shr           eax, 1
+%else
     add           eax, edx
+%endif
 %else
     HADDD          m0, m1
     movd          eax, xm0
+%if %1
+    inc           eax
+    shr           eax, 1
+%endif
 %endif
     RET
 %endmacro
@@ -8350,6 +8359,29 @@
     SWAP      0, 1
     SATD_AVX512_END
 
+INIT_ZMM avx512 ; ZMM register mode; the symbol is exported with the _avx512 suffix
+cglobal pixel_sa8d_8x8, 4,6 ; 8x8 SA8D cost. 4 args, 6 GPRs (r4/r5 are scratch); args presumably pix1, stride1, pix2, stride2 - confirm against the sa8d prototype
+    vbroadcasti64x4 m4, [hmul_16p] ; m4 = 256-bit hmul_16p constant copied to both halves; passed as the last operand of DIFF_SUMSUB_SSSE3 below
+    mov     r4d, 0x55555555 ; alternating-bit pattern; r4 is reused for 3*r1 once k1 is set
+    kmovd    k1, r4d   ; 01010101 in every byte; SATD_AVX512_END also uses k1 as its {k1}{z} word->dword zero-extend mask, so k1 must stay intact
+    kshiftlb k2, k1, 5 ; 10100000 (byte-wide shift: only the low 8 mask bits remain)
+    kshiftlb k3, k1, 4 ; 01010000 (byte-wide shift: only the low 8 mask bits remain)
+    lea      r4, [3*r1] ; r4 = 3*r1, presumably 3*stride of the first block for the load macro - confirm
+    lea      r5, [3*r3] ; r5 = 3*r3, presumably 3*stride of the second block for the load macro - confirm
+    SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4  (layout note; presumably row index per 64-bit lane - macro not shown here)
+    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4     ; 3 1 3 1 7 5 7 5  (layout note; presumably forms pixel differences using m4 - macro not shown here)
+    SUMSUB_BA      w, 0, 1, 2 ; word butterfly on m0/m1 with m2 as temp (presumably sum -> m0, difference -> m1)
+    SBUTTERFLY   qdq, 0, 1, 2 ; presumably interleaves 64-bit elements of m0/m1 so the next butterfly pairs new rows - confirm
+    SUMSUB_BA      w, 0, 1, 2 ; next word butterfly stage on m0/m1
+    shufps        m2, m0, m1, q2020 ; m2 = even-indexed dwords of m0 and m1 within each 128-bit lane
+    shufps        m1, m0, m1, q3131 ; m1 = odd-indexed dwords of m0 and m1 within each 128-bit lane
+    SUMSUB_BA      w, 2, 1, 0 ; word butterfly on m2/m1 with m0 as temp
+    vshufi32x4    m0, m2, m1, q1010 ; m0 = 128-bit lanes 0,1 of m2 followed by lanes 0,1 of m1
+    vshufi32x4    m1, m2, m1, q3232 ; m1 = 128-bit lanes 2,3 of m2 followed by lanes 2,3 of m1
+    SUMSUB_BA      w, 0, 1, 2 ; word butterfly across the regrouped 256-bit halves
+    HMAXABSW2      0, 1, 2, 3 ; presumably folds the final transform stage via max of absolute values - macro body not shown here
+    SATD_AVX512_END 1 ; sa8d variant (%1 = 1): horizontal sum, then return (sum + 1) >> 1 in eax
+
 ; Input 10bit, Output 8bit
 ;------------------------------------------------------------------------------------------------------------------------
 ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)


More information about the x265-devel mailing list