[x265] [PATCH] asm: improve avx2 8bpp code for convert_p2s[32xN]

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Wed Apr 8 15:48:31 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1428499835 -19800
#      Wed Apr 08 19:00:35 2015 +0530
# Node ID 533ef0c879a75f22b2667d2d7e40c99ef96f18fa
# Parent  3e416dec8024b8339b18568cf65e48eb3448bed1
asm: improve avx2 8bpp code for convert_p2s[32xN]

     convert_p2s[32x8](14.67x), convert_p2s[32x16](14.92x),
     convert_p2s[32x24](14.89x), convert_p2s[32x32](14.90x),
     convert_p2s[32x64](14.66x)

diff -r 3e416dec8024 -r 533ef0c879a7 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Wed Apr 08 19:00:35 2015 +0530
@@ -8149,69 +8149,60 @@
 ;-----------------------------------------------------------------------------
 %macro P2S_H_32xN_avx2 1
 INIT_YMM avx2
-cglobal filterPixelToShort_32x%1, 3, 7, 6
+cglobal filterPixelToShort_32x%1, 3, 7, 3 ; last count drops 6 -> 3: the new body only touches m0-m2
     mov         r3d, r3m
     add         r3d, r3d
-    lea         r4, [r3 * 3]
     lea         r5, [r1 * 3]
+    lea         r6, [r3 * 3]              ; r6 = 3 * r3 (r3 was doubled above); store offset of the 4th row
 
     ; load height
-    mov         r6d, %1/4
+    mov         r4d, %1/4                 ; loop count: each iteration below handles 4 rows
 
     ; load constant
-    vbroadcasti128   m4, [pb_128]
-    vbroadcasti128   m5, [tab_c_64_n64]
-
-.loop:
-    movu        m0, [r0]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r3 * 0], m2
-    movu        [r2 + r3 * 0 + 32], m3
-
-    movu        m0, [r0 + r1]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r3 * 1], m2
-    movu        [r2 + r3 * 1 + 32], m3
-
-    movu        m0, [r0 + r1 * 2]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r3 * 2], m2
-    movu        [r2 + r3 * 2 + 32], m3
-
-    movu        m0, [r0 + r5]
-    punpcklbw   m1, m0, m4
-    punpckhbw   m2, m0, m4
-    pmaddubsw   m1, m5
-    pmaddubsw   m2, m5
-    vperm2i128  m3, m1, m2, q0301
-    vperm2i128  m2, m1, m2, q0200
-
-    movu        [r2 + r4], m2
-    movu        [r2 + r4 + 32], m3
+    vpbroadcastd m2, [pw_2000]            ; m2 = bias subtracted from every output word (pw_2000 is defined elsewhere; presumably words of 0x2000 - confirm)
+
+.loop:
+    pmovzxbw    m0, [r0 + 0 * mmsize/2]   ; row 0: zero-extend the first mmsize/2 source bytes to words
+    pmovzxbw    m1, [r0 + 1 * mmsize/2]   ; row 0: zero-extend the next mmsize/2 source bytes to words
+    psllw       m0, 6                     ; word << 6, i.e. pixel * 64
+    psllw       m1, 6
+    psubw       m0, m2                    ; subtract the broadcast bias
+    psubw       m1, m2
+    movu        [r2 + 0 * mmsize], m0     ; widening needs no lane fix-up, so results are stored in source order
+    movu        [r2 + 1 * mmsize], m1
+
+    pmovzxbw    m0, [r0 + r1 + 0 * mmsize/2] ; row 1: same widen / shift / subtract sequence
+    pmovzxbw    m1, [r0 + r1 + 1 * mmsize/2]
+    psllw       m0, 6
+    psllw       m1, 6
+    psubw       m0, m2
+    psubw       m1, m2
+    movu        [r2 + r3 + 0 * mmsize], m0
+    movu        [r2 + r3 + 1 * mmsize], m1
+
+    pmovzxbw    m0, [r0 + r1 * 2 + 0 * mmsize/2] ; row 2
+    pmovzxbw    m1, [r0 + r1 * 2 + 1 * mmsize/2]
+    psllw       m0, 6
+    psllw       m1, 6
+    psubw       m0, m2
+    psubw       m1, m2
+    movu        [r2 + r3 * 2 + 0 * mmsize], m0
+    movu        [r2 + r3 * 2 + 1 * mmsize], m1
+
+    pmovzxbw    m0, [r0 + r5 + 0 * mmsize/2] ; row 3: r5 = 3 * r1 (set up before the loop)
+    pmovzxbw    m1, [r0 + r5 + 1 * mmsize/2]
+    psllw       m0, 6
+    psllw       m1, 6
+    psubw       m0, m2
+    psubw       m1, m2
+    movu        [r2 + r6 + 0 * mmsize], m0 ; r6 = 3 * r3 (set up before the loop)
+    movu        [r2 + r6 + 1 * mmsize], m1
 
     lea         r0, [r0 + r1 * 4]
     lea         r2, [r2 + r3 * 4]
 
-    dec         r6d
-    jnz         .loop
+    dec         r4d                       ; one group of 4 rows finished
+    jnz        .loop
     RET
 %endmacro
 P2S_H_32xN_avx2 32


More information about the x265-devel mailing list