[x265] [PATCH] asm: improve avx2 8bpp code for convert_p2s[64xN]
rajesh at multicorewareinc.com
Wed Apr 8 15:49:18 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1428500140 -19800
# Wed Apr 08 19:05:40 2015 +0530
# Node ID af774236973b2b20de2258358df02d2a21bf6e61
# Parent 533ef0c879a75f22b2667d2d7e40c99ef96f18fa
asm: improve avx2 8bpp code for convert_p2s[64xN]
convert_p2s[64x16](14.99x), convert_p2s[64x32](14.91x),
convert_p2s[64x48](14.16x), convert_p2s[64x64](15.61x)
diff -r 533ef0c879a7 -r af774236973b source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Apr 08 19:00:35 2015 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Apr 08 19:05:40 2015 +0530
@@ -8429,115 +8429,96 @@
;-----------------------------------------------------------------------------
%macro P2S_H_64xN_avx2 1
INIT_YMM avx2
-cglobal filterPixelToShort_64x%1, 3, 7, 6
+cglobal filterPixelToShort_64x%1, 3, 7, 5
mov r3d, r3m
add r3d, r3d
- lea r4, [r3 * 3]
lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
; load height
- mov r6d, %1/4
+ mov r4d, %1/4
; load constant
- vbroadcasti128 m4, [pb_128]
- vbroadcasti128 m5, [tab_c_64_n64]
-
-.loop:
- movu m0, [r0]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r3 * 0], m2
- movu [r2 + r3 * 0 + 32], m3
-
- movu m0, [r0 + r1]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r3 * 1], m2
- movu [r2 + r3 * 1 + 32], m3
-
- movu m0, [r0 + r1 * 2]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r3 * 2], m2
- movu [r2 + r3 * 2 + 32], m3
-
- movu m0, [r0 + r5]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r4], m2
- movu [r2 + r4 + 32], m3
-
- add r0, 32
-
- movu m0, [r0]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r3 * 0 + 64], m2
- movu [r2 + r3 * 0 + 96], m3
-
- movu m0, [r0 + r1]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r3 * 1 + 64], m2
- movu [r2 + r3 * 1 + 96], m3
-
- movu m0, [r0 + r1 * 2]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r3 * 2 + 64], m2
- movu [r2 + r3 * 2 + 96], m3
-
- movu m0, [r0 + r5]
- punpcklbw m1, m0, m4
- punpckhbw m2, m0, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- vperm2i128 m3, m1, m2, q0301
- vperm2i128 m2, m1, m2, q0200
-
- movu [r2 + r4 + 64], m2
- movu [r2 + r4 + 96], m3
-
- lea r0, [r0 + r1 * 4 - 32]
+ vpbroadcastd m4, [pw_2000]
+
+.loop:
+ pmovzxbw m0, [r0 + 0 * mmsize/2]
+ pmovzxbw m1, [r0 + 1 * mmsize/2]
+ pmovzxbw m2, [r0 + 2 * mmsize/2]
+ pmovzxbw m3, [r0 + 3 * mmsize/2]
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ movu [r2 + 0 * mmsize], m0
+ movu [r2 + 1 * mmsize], m1
+ movu [r2 + 2 * mmsize], m2
+ movu [r2 + 3 * mmsize], m3
+
+ pmovzxbw m0, [r0 + r1 + 0 * mmsize/2]
+ pmovzxbw m1, [r0 + r1 + 1 * mmsize/2]
+ pmovzxbw m2, [r0 + r1 + 2 * mmsize/2]
+ pmovzxbw m3, [r0 + r1 + 3 * mmsize/2]
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ movu [r2 + r3 + 0 * mmsize], m0
+ movu [r2 + r3 + 1 * mmsize], m1
+ movu [r2 + r3 + 2 * mmsize], m2
+ movu [r2 + r3 + 3 * mmsize], m3
+
+ pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2]
+ pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2]
+ pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2]
+ pmovzxbw m3, [r0 + r1 * 2 + 3 * mmsize/2]
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ movu [r2 + r3 * 2 + 0 * mmsize], m0
+ movu [r2 + r3 * 2 + 1 * mmsize], m1
+ movu [r2 + r3 * 2 + 2 * mmsize], m2
+ movu [r2 + r3 * 2 + 3 * mmsize], m3
+
+ pmovzxbw m0, [r0 + r5 + 0 * mmsize/2]
+ pmovzxbw m1, [r0 + r5 + 1 * mmsize/2]
+ pmovzxbw m2, [r0 + r5 + 2 * mmsize/2]
+ pmovzxbw m3, [r0 + r5 + 3 * mmsize/2]
+ psllw m0, 6
+ psllw m1, 6
+ psllw m2, 6
+ psllw m3, 6
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ movu [r2 + r6 + 0 * mmsize], m0
+ movu [r2 + r6 + 1 * mmsize], m1
+ movu [r2 + r6 + 2 * mmsize], m2
+ movu [r2 + r6 + 3 * mmsize], m3
+
+ lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
- dec r6d
- jnz .loop
+ dec r4d
+ jnz .loop
RET
%endmacro
P2S_H_64xN_avx2 64
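
For reference, the removed pmaddubsw sequence and the new pmovzxbw/psllw/psubw sequence compute the same 8bpp pixel-to-short conversion: each byte is widened to 16 bits, scaled by 64, and offset by -8192 (the pw_2000 constant, 0x2000). Below is a minimal scalar sketch of that operation, assuming the usual x265 8bpp values (shift = 14 - 8 = 6, offset = 1 << 13); the helper name p2s_64xN_scalar is illustrative only, not the actual x265 reference function.

#include <cstdint>

// Scalar sketch of the 64xN p2s conversion: dst[x] = (src[x] << 6) - 8192.
// The old asm produced this as pixel*64 + 128*(-64) via punpck + pmaddubsw
// with tab_c_64_n64; the new asm uses pmovzxbw, psllw 6, psubw 0x2000.
static void p2s_64xN_scalar(const uint8_t* src, intptr_t srcStride,
                            int16_t* dst, intptr_t dstStride, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < 64; x++)
            dst[x] = (int16_t)((src[x] << 6) - 8192);
        src += srcStride;
        dst += dstStride;
    }
}

As the diff shows, the new loop also avoids the byte interleave and the cross-lane vperm2i128 shuffles of the old code, trading them for one widening load, one shift, and one subtract per 16 pixels, and it frees one ymm register (5 instead of 6).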