[x265] [PATCH] asm: improve avx2 8bpp code for convert_p2s[32xN]
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Wed Apr 8 15:48:31 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1428499835 -19800
# Wed Apr 08 19:00:35 2015 +0530
# Node ID 533ef0c879a75f22b2667d2d7e40c99ef96f18fa
# Parent 3e416dec8024b8339b18568cf65e48eb3448bed1
asm: improve avx2 8bpp code for convert_p2s[32xN]
convert_p2s[32x8](14.67x), convert_p2s[32x16](14.92x),
convert_p2s[32x24](14.89x), convert_p2s[32x32](14.90x),
convert_p2s[32x64](14.66x)
diff -r 3e416dec8024 -r 533ef0c879a7 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/ipfilter8.asm Wed Apr 08 19:00:35 2015 +0530
@@ -8149,69 +8149,60 @@
;-----------------------------------------------------------------------------
%macro P2S_H_32xN_avx2 1
INIT_YMM avx2
-cglobal filterPixelToShort_32x%1, 3, 7, 6
+cglobal filterPixelToShort_32x%1, 3, 7, 3 ; last field 6 -> 3: only m0-m2 are used below
mov r3d, r3m
add r3d, r3d
- lea r4, [r3 * 3]
lea r5, [r1 * 3]
+ lea r6, [r3 * 3] ; r6 = 3 * r3 (r3 was doubled above); offset of the 4th store pair
; load height
- mov r6d, %1/4
+ mov r4d, %1/4 ; loop counter: %1/4 passes, each pass handles 4 rows
; load constant
- vbroadcasti128 m4, [pb_128]
- vbroadcasti128 m5, [tab_c_64_n64]
-
-.loop:
- movu m0, [r0]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r3 * 0], m2
- movu [r2 + r3 * 0 + 32], m3
-
- movu m0, [r0 + r1]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r3 * 1], m2
- movu [r2 + r3 * 1 + 32], m3
-
- movu m0, [r0 + r1 * 2]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r3 * 2], m2
- movu [r2 + r3 * 2 + 32], m3
-
- movu m0, [r0 + r5]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r4], m2
- movu [r2 + r4 + 32], m3
+ vpbroadcastd m2, [pw_2000] ; replicate the dword at pw_2000 across m2 (presumably words of 0x2000; defined outside this hunk)
+
+.loop: ; one pass = 4 source rows (r0, r0+r1, r0+r1*2, r0+r5), two half-register loads each
+ pmovzxbw m0, [r0 + 0 * mmsize/2] ; row 0, first half: zero-extend source bytes to words
+ pmovzxbw m1, [r0 + 1 * mmsize/2] ; row 0, second half
+ psllw m0, 6 ; each word << 6
+ psllw m1, 6
+ psubw m0, m2 ; subtract the broadcast constant from every word
+ psubw m1, m2
+ movu [r2 + 0 * mmsize], m0 ; output row 0, first register of words
+ movu [r2 + 1 * mmsize], m1 ; output row 0, second register of words
+
+ pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] ; row 1 (source at r0 + r1): same widen / shift / subtract sequence
+ pmovzxbw m1, [r0 + r1 + 1 * mmsize/2]
+ psllw m0, 6
+ psllw m1, 6
+ psubw m0, m2
+ psubw m1, m2
+ movu [r2 + r3 + 0 * mmsize], m0 ; output row 1 at r2 + r3
+ movu [r2 + r3 + 1 * mmsize], m1
+
+ pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] ; row 2 (source at r0 + 2 * r1)
+ pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2]
+ psllw m0, 6
+ psllw m1, 6
+ psubw m0, m2
+ psubw m1, m2
+ movu [r2 + r3 * 2 + 0 * mmsize], m0 ; output row 2 at r2 + 2 * r3
+ movu [r2 + r3 * 2 + 1 * mmsize], m1
+
+ pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] ; row 3 (source at r0 + r5, r5 = 3 * r1)
+ pmovzxbw m1, [r0 + r5 + 1 * mmsize/2]
+ psllw m0, 6
+ psllw m1, 6
+ psubw m0, m2
+ psubw m1, m2
+ movu [r2 + r6 + 0 * mmsize], m0 ; output row 3 at r2 + r6 (r6 = 3 * r3)
+ movu [r2 + r6 + 1 * mmsize], m1
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- dec r6d
- jnz .loop
+ dec r4d ; one group of 4 rows finished
+ jnz .loop ; continue until the counter reaches zero
RET
%endmacro
P2S_H_32xN_avx2 32
More information about the x265-devel mailing list