[x265] [PATCH] asm: avx2 code for satd_32xN

dnyaneshwar at multicorewareinc.com
Tue Apr 14 11:03:36 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1429001011 -19800
#      Tue Apr 14 14:13:31 2015 +0530
# Node ID 5644a5b24ce03290de3a5bb0fc4d49cc00a19ae2
# Parent  dd456de98c239b86e29bf349881854a699056240
asm: avx2 code for satd_32xN

AVX2 (speedup, optimized, C reference):
satd[ 32x8]        8.40x    957.22          8040.38
satd[32x16]        8.31x    1950.86         16214.44
satd[32x24]        8.50x    2897.62         24636.81
satd[32x32]        8.88x    3952.35         35115.40
satd[32x64]        9.18x    7334.90         67312.13

AVX:
satd[ 32x8]        4.63x    1738.62         8048.18
satd[32x16]        5.01x    3249.63         16295.51
satd[32x24]        5.30x    4767.54         25279.60
satd[32x32]        5.67x    6156.74         34895.57
satd[32x64]        5.59x    11708.14        65479.60
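
(The speedup column is simply the C-reference figure divided by the optimized
one, e.g. 35115.40 / 3952.35 ~= 8.88 for 32x32 with AVX2.)

For context, the cost these kernels compute is the usual SATD: the Hadamard
transform of the pixel differences, with the absolute transformed values
summed and halved. A minimal scalar sketch of a single 4x4 SATD follows as an
illustration only -- satd_4x4_ref is a made-up name, and both the x265 C
reference and the asm below use the same idea with a different block
decomposition and SWAR/SIMD tricks:

    /* needs <stdint.h> (uint8_t, intptr_t) and <stdlib.h> (abs) */
    static int satd_4x4_ref(const uint8_t *p1, intptr_t s1,
                            const uint8_t *p2, intptr_t s2)
    {
        int d[4][4], t[4][4], sum = 0;

        /* residual between the two blocks */
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[y][x] = p1[y * s1 + x] - p2[y * s2 + x];

        /* horizontal butterflies (4-point Hadamard per row) */
        for (int y = 0; y < 4; y++)
        {
            int a0 = d[y][0] + d[y][1], a1 = d[y][0] - d[y][1];
            int a2 = d[y][2] + d[y][3], a3 = d[y][2] - d[y][3];
            t[y][0] = a0 + a2; t[y][1] = a1 + a3;
            t[y][2] = a0 - a2; t[y][3] = a1 - a3;
        }

        /* vertical butterflies, then sum of absolute transformed values */
        for (int x = 0; x < 4; x++)
        {
            int a0 = t[0][x] + t[1][x], a1 = t[0][x] - t[1][x];
            int a2 = t[2][x] + t[3][x], a3 = t[2][x] - t[3][x];
            sum += abs(a0 + a2) + abs(a1 + a3) + abs(a0 - a2) + abs(a1 - a3);
        }
        return sum >> 1;
    }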

diff -r dd456de98c23 -r 5644a5b24ce0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 14 13:41:40 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp	Tue Apr 14 14:13:31 2015 +0530
@@ -1669,6 +1669,12 @@
         p.pu[LUMA_8x16].satd  = x265_pixel_satd_8x16_avx2;
         p.pu[LUMA_8x8].satd   = x265_pixel_satd_8x8_avx2;
 
+        p.pu[LUMA_32x8].satd  = x265_pixel_satd_32x8_avx2;
+        p.pu[LUMA_32x16].satd = x265_pixel_satd_32x16_avx2;
+        p.pu[LUMA_32x24].satd = x265_pixel_satd_32x24_avx2;
+        p.pu[LUMA_32x32].satd = x265_pixel_satd_32x32_avx2;
+        p.pu[LUMA_32x64].satd = x265_pixel_satd_32x64_avx2;
+
         p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
         p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
         p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_avx2;
diff -r dd456de98c23 -r 5644a5b24ce0 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Apr 14 13:41:40 2015 +0800
+++ b/source/common/x86/pixel-a.asm	Tue Apr 14 14:13:31 2015 +0530
@@ -10506,3 +10506,316 @@
     mov             rsp, r5
     RET
 %endif
+
+;;---------------------------------------------------------------
+;; SATD AVX2
+;; int pixel_satd(const pixel*, intptr_t, const pixel*, intptr_t)
+;;---------------------------------------------------------------
+;; r0   - pix0
+;; r1   - pix0Stride
+;; r2   - pix1
+;; r3   - pix1Stride
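+;;
+;; Each pixel_satd_32xN kernel below processes the 32-pixel-wide block as a
+;; left and a right 16-column half (r6/r7 keep the original row pointers so
+;; the second half can restart at an offset of +16), accumulating the cost
+;; of N/8 calc_satd_16x8 calls per half in m6.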
+
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal calc_satd_16x8    ; function to compute satd cost for 16 columns, 8 rows
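+    ; m7 holds the hmul_8p multiplier pattern: pmaddubsw against it yields
+    ; pairwise sums in one lane half and pairwise differences in the other
+    ; (first Hadamard stage); m6 accumulates dword sums (callers zero it)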
+    movu            xm4, [r0]
+    movu            xm5, [r0 + r1]
+    movu            xm0, [r2]
+    movu            xm1, [r2 + r3]
+
+    vpermq          m4, m4, 01010000b
+    vpermq          m5, m5, 01010000b
+    vpermq          m0, m0, 01010000b
+    vpermq          m1, m1, 01010000b
+
+    pmaddubsw       m4, m7
+    pmaddubsw       m0, m7
+    pmaddubsw       m5, m7
+    pmaddubsw       m1, m7
+    psubw           m0, m4
+    psubw           m1, m5
+
+    movu            xm4, [r0 + r1 * 2]
+    movu            xm5, [r0 + r4]
+    movu            xm2, [r2 + r3 * 2]
+    movu            xm3, [r2 + r5]
+
+    vpermq          m4, m4, 01010000b
+    vpermq          m5, m5, 01010000b
+    vpermq          m2, m2, 01010000b
+    vpermq          m3, m3, 01010000b
+
+    pmaddubsw       m4, m7
+    pmaddubsw       m2, m7
+    pmaddubsw       m5, m7
+    pmaddubsw       m3, m7
+    psubw           m2, m4
+    psubw           m3, m5
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    paddw           m4, m0, m1
+    psubw           m1, m1, m0
+    paddw           m0, m2, m3
+    psubw           m3, m3, m2
+    paddw           m2, m4, m0
+    psubw           m0, m0, m4
+    paddw           m4, m1, m3
+    psubw           m3, m3, m1
+    pabsw           m2, m2
+    pabsw           m0, m0
+    pabsw           m4, m4
+    pabsw           m3, m3
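+    ; pair adjacent words and keep the larger magnitude: |a+b| + |a-b| equals
+    ; 2 * max(|a|, |b|), so the max folds the last butterfly stage and the
+    ; SATD >>1 normalisation into a single step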
+    pblendw         m1, m2, m0, 10101010b
+    pslld           m0, m0, 16
+    psrld           m2, m2, 16
+    por             m0, m0, m2
+    pmaxsw          m1, m1, m0
+    pxor            m9, m9, m9
+    mova            m8, m1
+    punpcklwd       m8, m8, m9
+    paddd           m6, m6, m8
+    mova            m8, m1
+    punpckhwd       m8, m8, m9
+    paddd           m6, m6, m8
+    pblendw         m2, m4, m3, 10101010b
+    pslld           m3, m3, 16
+    psrld           m4, m4, 16
+    por             m3, m3, m4
+    pmaxsw          m2, m2, m3
+    pxor            m9, m9, m9
+    mova            m8, m2
+    punpcklwd       m8, m8, m9
+    paddd           m6, m6, m8
+    mova            m8, m2
+    punpckhwd       m8, m8, m9
+    paddd           m6, m6, m8
+
+    movu            xm4, [r0]
+    movu            xm5, [r0 + r1]
+    movu            xm1, [r2]
+    movu            xm2, [r2 + r3]
+
+    vpermq          m4, m4, 01010000b
+    vpermq          m5, m5, 01010000b
+    vpermq          m1, m1, 01010000b
+    vpermq          m2, m2, 01010000b
+
+    pmaddubsw       m4, m4, m7
+    pmaddubsw       m1, m1, m7
+    pmaddubsw       m5, m5, m7
+    pmaddubsw       m2, m2, m7
+    psubw           m1, m1, m4
+    psubw           m2, m2, m5
+
+    movu            xm4, [r0 + r1 * 2]
+    movu            xm5, [r0 + r4]
+    movu            xm0, [r2 + r3 * 2]
+    movu            xm3, [r2 + r5]
+
+    vpermq          m4, m4, 01010000b
+    vpermq          m5, m5, 01010000b
+    vpermq          m0, m0, 01010000b
+    vpermq          m3, m3, 01010000b
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmaddubsw       m4, m4, m7
+    pmaddubsw       m0, m0, m7
+    pmaddubsw       m5, m5, m7
+    pmaddubsw       m3, m3, m7
+    psubw           m0, m0, m4
+    psubw           m3, m3, m5
+    paddw           m4, m1, m2
+    psubw           m2, m2, m1
+    paddw           m1, m0, m3
+    psubw           m3, m3, m0
+    paddw           m0, m4, m1
+    psubw           m1, m1, m4
+    paddw           m4, m2, m3
+    psubw           m3, m3, m2
+    pabsw           m0, m0
+    pabsw           m1, m1
+    pabsw           m4, m4
+    pabsw           m3, m3
+    pblendw         m2, m0, m1, 10101010b
+    pslld           m1, m1, 16
+    psrld           m0, m0, 16
+    por             m1, m1, m0
+    pmaxsw          m2, m2, m1
+    pxor            m9, m9, m9
+    mova            m8, m2
+    punpcklwd       m8, m8, m9
+    paddd           m6, m6, m8
+    mova            m8, m2
+    punpckhwd       m8, m8, m9
+    paddd           m6, m6, m8
+    pblendw         m0, m4, m3, 10101010b
+    pslld           m3, m3, 16
+    psrld           m4, m4, 16
+    por             m3, m3, m4
+    pmaxsw          m0, m0, m3
+    pxor            m9, m9, m9
+    mova            m8, m0
+    punpcklwd       m8, m8, m9
+    paddd           m6, m6, m8
+    mova            m8, m0
+    punpckhwd       m8, m8, m9
+    paddd           m6, m6, m8
+    ret
+
+cglobal pixel_satd_32x8, 4,8,10         ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_8p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+
+    call            calc_satd_16x8
+
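+    ; horizontal reduction: fold the eight dword partial sums in m6 down to
+    ; a single total in eax (the same epilogue is used by all satd_32xN below)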
+    vextracti128    xm8, m6, 1
+    paddd           xm6, xm8
+    movhlps         xm7, xm6
+    paddd           xm6, xm7
+    pshufd          xm7, xm6, 1
+    paddd           xm6, xm7
+    movd            eax, xm6
+    RET
+
+cglobal pixel_satd_32x16, 4,8,10         ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_8p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+
+    vextracti128    xm8, m6, 1
+    paddd           xm6, xm8
+    movhlps         xm7, xm6
+    paddd           xm6, xm7
+    pshufd          xm7, xm6, 1
+    paddd           xm6, xm7
+    movd            eax, xm6
+    RET
+
+cglobal pixel_satd_32x24, 4,8,10         ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_8p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+
+    vextracti128    xm8, m6, 1
+    paddd           xm6, xm8
+    movhlps         xm7, xm6
+    paddd           xm6, xm7
+    pshufd          xm7, xm6, 1
+    paddd           xm6, xm7
+    movd            eax, xm6
+    RET
+
+cglobal pixel_satd_32x32, 4,8,10         ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_8p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+
+    vextracti128    xm8, m6, 1
+    paddd           xm6, xm8
+    movhlps         xm7, xm6
+    paddd           xm6, xm7
+    pshufd          xm7, xm6, 1
+    paddd           xm6, xm7
+    movd            eax, xm6
+    RET
+
+cglobal pixel_satd_32x64, 4,8,10         ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_8p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+
+    vextracti128    xm8, m6, 1
+    paddd           xm6, xm8
+    movhlps         xm7, xm6
+    paddd           xm6, xm7
+    pshufd          xm7, xm6, 1
+    paddd           xm6, xm7
+    movd            eax, xm6
+    RET
+
+%endif  ; if ARCH_X86_64 == 1

