[x265] [PATCH] asm: psyCost_pp avx2 code for BLOCK_4x4

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Mar 24 07:24:54 CET 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1427122202 -19800
#      Mon Mar 23 20:20:02 2015 +0530
# Node ID 32c2596f4716520fd52d1685b5e2be176b5bc08c
# Parent  c6d268fedc36710dc77301c45816246a09f10ad7
asm: psyCost_pp avx2 code for BLOCK_4x4

Test bench results (columns: speedup over C, optimized cost, C reference cost):

AVX2:
psy_cost_pp[4x4]         10.30x   216.56          2230.77

SSE4:
psy_cost_pp[4x4]         6.53x    352.01          2297.35

diff -r c6d268fedc36 -r 32c2596f4716 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Mar 23 14:10:52 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Mar 23 20:20:02 2015 +0530
@@ -1417,6 +1417,7 @@
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_avx2;
         p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_avx2;
         p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_avx2;
diff -r c6d268fedc36 -r 32c2596f4716 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Mon Mar 23 14:10:52 2015 +0530
+++ b/source/common/x86/pixel-a.asm	Mon Mar 23 20:20:02 2015 +0530
@@ -38,7 +38,7 @@
            times 4 db 1, -1
            times 8 db 1
            times 4 db 1, -1
-hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+hmul_4p:   times 4 db 1, 1, 1, 1, 1, -1, 1, -1
 mask_10:   times 4 dw 0, -1
 mask_1100: times 2 dd 0, -1
 hmul_8w:   times 4 dw 1
@@ -8146,6 +8146,62 @@
 %endif ; HIGH_BIT_DEPTH
 %endif
 
+INIT_YMM avx2                                  ; x86inc: m# are ymm registers, xm# their low 128-bit halves
+cglobal psyCost_pp_4x4, 4, 5, 6                ; int psyCost_pp_4x4(source, sstride, recon, rstride): r0..r3 = args, r4 = scratch, m0-m5 used
+    lea             r4, [3 * r1]               ; r4 = 3 * sstride
+    movd            xm0, [r0]                  ; 4 bytes of source row 0
+    movd            xm1, [r0 + r1]             ; 4 bytes of source row 1
+    movd            xm2, [r0 + r1 * 2]         ; 4 bytes of source row 2
+    movd            xm3, [r0 + r4]             ; 4 bytes of source row 3
+    vshufps         xm0, xm1, 0                ; xm0 dwords = {row0, row0, row1, row1}
+    vshufps         xm2, xm3, 0                ; xm2 dwords = {row2, row2, row3, row3}
+
+    lea             r4, [3 * r3]               ; r4 = 3 * rstride
+    movd            xm1, [r2]                  ; 4 bytes of recon row 0
+    movd            xm3, [r2 + r3]             ; 4 bytes of recon row 1
+    movd            xm4, [r2 + r3 * 2]         ; 4 bytes of recon row 2
+    movd            xm5, [r2 + r4]             ; 4 bytes of recon row 3
+    vshufps         xm1, xm3, 0                ; xm1 dwords = {row0, row0, row1, row1} of recon
+    vshufps         xm4, xm5, 0                ; xm4 dwords = {row2, row2, row3, row3} of recon
+
+    vinserti128     m0, m0, xm1, 1             ; m0: low lane = source rows 0-1, high lane = recon rows 0-1
+    vinserti128     m2, m2, xm4, 1             ; m2: low lane = source rows 2-3, high lane = recon rows 2-3
+
+    mova            m4, [hmul_4p]              ; 32-byte multiplier table: 1,1,1,1,1,-1,1,-1 repeated
+    pmaddubsw       m0, m4                     ; per row: words (p0+p1, p2+p3, p0-p1, p2-p3)
+    pmaddubsw       m2, m4                     ; same horizontal sums/differences for rows 2-3
+
+    paddw           m5, m0, m2                 ; vertical sums: row0+row2 (low qword), row1+row3 (high qword) per lane
+    mova            m1, m5                     ; keep the vertical sums for the transform path below
+    psrldq          m4, m5, 8                  ; per lane: bring the row1+row3 terms down to the low qword
+    paddw           m5, m4                     ; low qword now holds the terms summed over all four rows
+    pmaddwd         m5, [pw_1]                 ; add adjacent words; dword 0 of each lane = sum of all 16 bytes (pw_1 presumably words of 1 - defined outside this hunk)
+    psrld           m5, 2                      ; dword 0 of each lane = block sum >> 2
+
+    vpsubw          m2, m2, m0                 ; vertical differences: rows 2-3 minus rows 0-1
+    vpunpckhqdq     m0, m1, m2                 ; m0 = high qwords of {sums, differences} per lane
+    vpunpcklqdq     m1, m1, m2                 ; m1 = low qwords of {sums, differences} per lane
+    vpaddw          m2, m1, m0                 ; butterfly: sum of the two halves
+    vpsubw          m0, m0, m1                 ; butterfly: difference of the two halves
+    vpblendw        m1, m2, m0, 10101010b      ; m1 = even words from m2, odd words from m0
+    vpslld          m0, m0, 10h                ; move even words of m0 into the odd word slots
+    vpsrld          m2, m2, 10h                ; move odd words of m2 into the even word slots
+    vpor            m0, m0, m2                 ; m0 = the complementary interleave of m2/m0 words
+    vpabsw          m1, m1                     ; |a| for each word pair member
+    vpabsw          m0, m0                     ; |b| for the matching partner
+    vpmaxsw         m1, m1, m0                 ; max(|a|,|b|); NOTE(review): looks like the (|a+b|+|a-b|)/2 identity folding the last butterfly stage - confirm
+    vpmaddwd        m1, m1, [pw_1]             ; add adjacent words into dwords
+    psrldq          m2, m1, 8                  ; horizontal add of the four dwords in each lane ...
+    paddd           m1, m2
+    psrldq          m3, m1, 4
+    paddd           m1, m3                     ; ... dword 0 of each lane = lane total
+    psubd           m1, m5                     ; per lane: transform total minus (block sum >> 2)
+    vextracti128    xm2, m1, 1                 ; xm2 = high-lane (recon) result
+    psubd           m1, m2                     ; source result minus recon result
+    pabsd           m1, m1                     ; absolute difference
+    movd            eax, xm1                   ; return value = dword 0
+    RET
+
 %macro PSY_PP_8x8 0
     movddup         m0, [r0 + r1 * 0]
     movddup         m1, [r0 + r1 * 1]
diff -r c6d268fedc36 -r 32c2596f4716 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Mon Mar 23 14:10:52 2015 +0530
+++ b/source/common/x86/pixel.h	Mon Mar 23 20:20:02 2015 +0530
@@ -260,6 +260,7 @@
 void x265_pixel_sub_ps_32x32_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_sub_ps_64x64_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 
+int x265_psyCost_pp_4x4_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
 int x265_psyCost_pp_8x8_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
 int x265_psyCost_pp_16x16_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
 int x265_psyCost_pp_32x32_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);


More information about the x265-devel mailing list