[x265] [PATCH] asm: avx2 assembly code for 8bpp transpose8x8 module
murugan at multicorewareinc.com
Fri Oct 10 15:48:08 CEST 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1412948445 -19800
# Fri Oct 10 19:10:45 2014 +0530
# Node ID df511e9c632a4207bb431105ffe01be776b320d5
# Parent 8b0020122e62abc061c6d4c70bd167e036507a5b
asm: avx2 assembly code for 8bpp transpose8x8 module
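For context, the transpose8 primitive reads an 8x8 block of pixels from src, whose rows are stride bytes apart, and writes it transposed and densely packed (pitch 8) to dest, matching the existing SSE2 path. A minimal scalar sketch of that contract, assuming pixel is uint8_t as in 8bpp builds (illustrative only, not part of the patch):

    #include <stdint.h>

    /* Illustrative C reference for the transpose8 contract (not part of the patch). */
    static void transpose8_ref(uint8_t *dest, const uint8_t *src, intptr_t stride)
    {
        for (int i = 0; i < 8; i++)          /* output row i = input column i */
            for (int j = 0; j < 8; j++)
                dest[i * 8 + j] = src[j * stride + i];
    }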
diff -r 8b0020122e62 -r df511e9c632a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Oct 10 19:04:17 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Oct 10 19:10:45 2014 +0530
@@ -1792,6 +1792,7 @@
p.idct[IDCT_8x8] = x265_idct8_avx2;
p.idct[IDCT_16x16] = x265_idct16_avx2;
p.idct[IDCT_32x32] = x265_idct32_avx2;
+ p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
#endif
diff -r 8b0020122e62 -r df511e9c632a source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Fri Oct 10 19:04:17 2014 +0530
+++ b/source/common/x86/pixel-util.h Fri Oct 10 19:10:45 2014 +0530
@@ -44,6 +44,7 @@
void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
+void x265_transpose8_avx2(pixel *dest, pixel *src, intptr_t stride);
void x265_transpose16_avx2(pixel *dest, pixel *src, intptr_t stride);
void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride);
diff -r 8b0020122e62 -r df511e9c632a source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Oct 10 19:04:17 2014 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Oct 10 19:10:45 2014 +0530
@@ -48,6 +48,8 @@
hmulw_16p: times 8 dw 1
times 4 dw 1, -1
+trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+
SECTION .text
cextern pw_1
@@ -1530,7 +1532,42 @@
mov r3, r0
mov r5, 16
call transpose8_internal
+ RET
%else
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal transpose8, 3, 4, 4
+ lea r3, [r2 * 3]
+ movq xm0, [r1]
+ movhps xm0, [r1 + 2 * r2]
+ movq xm1, [r1 + r2]
+ movhps xm1, [r1 + r3]
+ lea r1, [r1 + 4 * r2]
+ movq xm2, [r1]
+ movhps xm2, [r1 + 2 * r2]
+ movq xm3, [r1 + r2]
+ movhps xm3, [r1 + r3]
+
+ vinserti128 m0, m0, xm2, 1 ;[row1 row3 row5 row7]
+ vinserti128 m1, m1, xm3, 1 ;[row2 row4 row6 row8]
+
+ punpcklbw m2, m0, m1 ;[1 - 8; 1 - 8][row1row2; row5row6]
+ punpckhbw m0, m1 ;[1 - 8; 1 - 8][row3row4; row7row8]
+
+ punpcklwd m1, m2, m0 ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8]
+ punpckhwd m2, m0 ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8]
+
+ mova m0, [trans8_shuf]
+
+ vpermd m1, m0, m1 ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8]
+ vpermd m2, m0, m2 ;[5 - 6; 7 - 8][row1row2row3row4row5row6row7row8]
+
+ movu [r0], m1
+ movu [r0 + 32], m2
+ RET
+%endif
+
+INIT_XMM sse2
cglobal transpose8, 3, 5, 8, dest, src, stride
lea r3, [2 * r2]
lea r4, [3 * r2]
@@ -1562,8 +1599,8 @@
movu [r0 + 16], m2
movu [r0 + 32], m1
movu [r0 + 48], m3
+ RET
%endif
- RET
%macro TRANSPOSE_8x8 1
@@ -1636,7 +1673,7 @@
%else
%if ARCH_X86_64 == 1
INIT_YMM avx2
-cglobal transpose16, 3, 5, 16
+cglobal transpose16, 3, 5, 9
lea r3, [r2 * 3]
lea r4, [r1 + 8 * r2]
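
The new AVX2 kernel gathers rows 1, 3, 5, 7 (in the asm comments' 1-based numbering) into one ymm register and rows 2, 4, 6, 8 into another, byte-interleaves the two (punpcklbw/punpckhbw), then word-interleaves those results (punpcklwd/punpckhwd) so that each 32-bit dword holds one source column restricted to four rows; a final vpermd with trans8_shuf (0, 4, 1, 5, 2, 6, 3, 7) pairs the rows 1-4 and rows 5-8 halves of each column, leaving the transposed rows in order for two 32-byte stores. An equivalent intrinsics rendering of that sequence, given only as an illustration (the function name is hypothetical; the committed code is hand-written NASM):

    #include <immintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical intrinsics rendering of the new kernel's shuffle sequence,
     * for explanation only; the committed code is hand-written NASM. */
    static void transpose8_avx2_sketch(uint8_t *dest, const uint8_t *src, intptr_t stride)
    {
        uint64_t row[8];
        for (int i = 0; i < 8; i++)
            memcpy(&row[i], src + i * stride, 8);       /* one 8-byte source row */

        /* Same register layout the asm builds with movq/movhps + vinserti128:
         * m0 = rows 0,2 | 4,6   m1 = rows 1,3 | 5,7 (0-based) */
        __m256i m0 = _mm256_set_epi64x(row[6], row[4], row[2], row[0]);
        __m256i m1 = _mm256_set_epi64x(row[7], row[5], row[3], row[1]);

        /* Byte-interleave row pairs, then word-interleave the pairs: each
         * 32-bit dword now holds one source column over four rows. */
        __m256i lo  = _mm256_unpacklo_epi8(m0, m1);     /* rows 0&1 | rows 4&5 */
        __m256i hi  = _mm256_unpackhi_epi8(m0, m1);     /* rows 2&3 | rows 6&7 */
        __m256i c03 = _mm256_unpacklo_epi16(lo, hi);    /* columns 0-3 */
        __m256i c47 = _mm256_unpackhi_epi16(lo, hi);    /* columns 4-7 */

        /* trans8_shuf pairs each "rows 0-3" dword with its "rows 4-7" dword,
         * so whole transposed rows come out contiguous. */
        __m256i shuf = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
        __m256i out0 = _mm256_permutevar8x32_epi32(c03, shuf);  /* dest rows 0-3 */
        __m256i out1 = _mm256_permutevar8x32_epi32(c47, shuf);  /* dest rows 4-7 */

        _mm256_storeu_si256((__m256i*)(dest +  0), out0);
        _mm256_storeu_si256((__m256i*)(dest + 32), out1);
    }

Keeping the whole block in two ymm registers lets the kernel finish with two unaligned 32-byte stores, and the hand-written version needs only four vector registers (m0-m3), which is what its cglobal declaration reserves.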