[x265] [PATCH] asm: improve avx2 8bpp code for convert_p2s[64xN]

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Wed Apr 8 15:49:18 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1428500140 -19800
#      Wed Apr 08 19:05:40 2015 +0530
# Node ID af774236973b2b20de2258358df02d2a21bf6e61
# Parent  533ef0c879a75f22b2667d2d7e40c99ef96f18fa
asm: improve avx2 8bpp code for convert_p2s[64xN]

     convert_p2s[64x16](14.99x),convert_p2s[64x32](14.91x),
     convert_p2s[64x48](14.16x),convert_p2s[64x64](15.61x)

diff -r 533ef0c879a7 -r af774236973b source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Apr 08 19:00:35 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Apr 08 19:05:40 2015 +0530
@@ -8429,115 +8429,96 @@
 ;-----------------------------------------------------------------------------
 %macro P2S_H_64xN_avx2 1
 INIT_YMM avx2
-cglobal filterPixelToShort_64x%1, 3, 7, 6
+cglobal filterPixelToShort_64x%1, 3, 7, 5            ; new body references r0-r6 and only m0-m4
     mov         r3d, r3m
     add         r3d, r3d
-    lea         r4, [r3 * 3]
     lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]                         ; r6 = 3 * r3: destination offset of the 4th row in each group
 
     ; load height
-    mov         r6d, %1/4
+    mov         r4d, %1/4                            ; loop counter: each .loop pass handles 4 rows
 
     ; load constant
-    vbroadcasti128   m4, [pb_128]
-    vbroadcasti128   m5, [tab_c_64_n64]
-
-.loop:
-    movu        m0, [r0]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r3 * 0], m2
-    movu        [r2 + r3 * 0 + 32], m3
-
-    movu        m0, [r0 + r1]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r3 * 1], m2
-    movu        [r2 + r3 * 1 + 32], m3
-
-    movu        m0, [r0 + r1 * 2]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r3 * 2], m2
-    movu        [r2 + r3 * 2 + 32], m3
-
-    movu        m0, [r0 + r5]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r4], m2
-    movu        [r2 + r4 + 32], m3
-
-    add         r0, 32
-
-    movu        m0, [r0]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r3 * 0 + 64], m2
-    movu        [r2 + r3 * 0 + 96], m3
-
-    movu        m0, [r0 + r1]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r3 * 1 + 64], m2
-    movu        [r2 + r3 * 1 + 96], m3
-
-    movu        m0, [r0 + r1 * 2]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r3 * 2 + 64], m2
-    movu        [r2 + r3 * 2 + 96], m3
-
-    movu        m0, [r0 + r5]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r4 + 64], m2
-    movu        [r2 + r4 + 96], m3
-
-    lea         r0, [r0 + r1 * 4 - 32]
+    vpbroadcastd m4, [pw_2000]                       ; m4 = value subtracted from every output word (pw_2000 is defined elsewhere; presumably words of 0x2000 - confirm)
+
+.loop:
+    pmovzxbw    m0, [r0 + 0 * mmsize/2]              ; row 0: four loads of mmsize/2 bytes each, zero-extended byte -> word
+    pmovzxbw    m1, [r0 + 1 * mmsize/2]
+    pmovzxbw    m2, [r0 + 2 * mmsize/2]
+    pmovzxbw    m3, [r0 + 3 * mmsize/2]
+    psllw       m0, 6                                ; each word <<= 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4                               ; each word -= m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+
+    movu        [r2 + 0 * mmsize], m0                ; store row 0 as 4 x mmsize bytes at r2
+    movu        [r2 + 1 * mmsize], m1
+    movu        [r2 + 2 * mmsize], m2
+    movu        [r2 + 3 * mmsize], m3
+
+    pmovzxbw    m0, [r0 + r1 + 0 * mmsize/2]         ; row 1: same sequence at source offset r1
+    pmovzxbw    m1, [r0 + r1 + 1 * mmsize/2]
+    pmovzxbw    m2, [r0 + r1 + 2 * mmsize/2]
+    pmovzxbw    m3, [r0 + r1 + 3 * mmsize/2]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+
+    movu        [r2 + r3 + 0 * mmsize], m0           ; store row 1 at destination offset r3
+    movu        [r2 + r3 + 1 * mmsize], m1
+    movu        [r2 + r3 + 2 * mmsize], m2
+    movu        [r2 + r3 + 3 * mmsize], m3
+
+    pmovzxbw    m0, [r0 + r1 * 2 + 0 * mmsize/2]     ; row 2: same sequence at source offset 2 * r1
+    pmovzxbw    m1, [r0 + r1 * 2 + 1 * mmsize/2]
+    pmovzxbw    m2, [r0 + r1 * 2 + 2 * mmsize/2]
+    pmovzxbw    m3, [r0 + r1 * 2 + 3 * mmsize/2]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+
+    movu        [r2 + r3 * 2 + 0 * mmsize], m0       ; store row 2 at destination offset 2 * r3
+    movu        [r2 + r3 * 2 + 1 * mmsize], m1
+    movu        [r2 + r3 * 2 + 2 * mmsize], m2
+    movu        [r2 + r3 * 2 + 3 * mmsize], m3
+
+    pmovzxbw    m0, [r0 + r5 + 0 * mmsize/2]         ; row 3: same sequence at source offset r5 (= 3 * r1)
+    pmovzxbw    m1, [r0 + r5 + 1 * mmsize/2]
+    pmovzxbw    m2, [r0 + r5 + 2 * mmsize/2]
+    pmovzxbw    m3, [r0 + r5 + 3 * mmsize/2]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psllw       m3, 6
+    psubw       m0, m4
+    psubw       m1, m4
+    psubw       m2, m4
+    psubw       m3, m4
+
+    movu        [r2 + r6 + 0 * mmsize], m0           ; store row 3 at destination offset r6 (= 3 * r3)
+    movu        [r2 + r6 + 1 * mmsize], m1
+    movu        [r2 + r6 + 2 * mmsize], m2
+    movu        [r2 + r6 + 3 * mmsize], m3
+
+    lea         r0, [r0 + r1 * 4]                    ; advance source pointer by 4 * r1; no -32 fix-up since r0 is no longer bumped mid-iteration
     lea         r2, [r2 + r3 * 4]
 
-    dec         r6d
-    jnz         .loop
+    dec         r4d                                  ; one group of 4 rows done
+    jnz        .loop                                 ; repeat until the counter reaches zero
     RET
 %endmacro
 P2S_H_64xN_avx2 64


More information about the x265-devel mailing list