[x265] [PATCH] asm: avx2 assembly code for 8bpp transpose8x8 module

murugan at multicorewareinc.com murugan at multicorewareinc.com
Fri Oct 10 15:48:08 CEST 2014


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1412948445 -19800
#      Fri Oct 10 19:10:45 2014 +0530
# Node ID df511e9c632a4207bb431105ffe01be776b320d5
# Parent  8b0020122e62abc061c6d4c70bd167e036507a5b
asm: avx2 assembly code for 8bpp transpose8x8 module

diff -r 8b0020122e62 -r df511e9c632a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Oct 10 19:04:17 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Oct 10 19:10:45 2014 +0530
@@ -1792,6 +1792,7 @@
         p.idct[IDCT_8x8] = x265_idct8_avx2;
         p.idct[IDCT_16x16] = x265_idct16_avx2;
         p.idct[IDCT_32x32] = x265_idct32_avx2;
+        p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
         p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
         p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
 #endif
diff -r 8b0020122e62 -r df511e9c632a source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Fri Oct 10 19:04:17 2014 +0530
+++ b/source/common/x86/pixel-util.h	Fri Oct 10 19:10:45 2014 +0530
@@ -44,6 +44,7 @@
 void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
 void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
 
+void x265_transpose8_avx2(pixel *dest, pixel *src, intptr_t stride);
 void x265_transpose16_avx2(pixel *dest, pixel *src, intptr_t stride);
 void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride);
 
diff -r 8b0020122e62 -r df511e9c632a source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Fri Oct 10 19:04:17 2014 +0530
+++ b/source/common/x86/pixel-util8.asm	Fri Oct 10 19:10:45 2014 +0530
@@ -48,6 +48,8 @@
 hmulw_16p:  times 8 dw 1
             times 4 dw 1, -1
 
+trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 ; vpermd indices: interleave dwords of the low and high 128-bit lanes
+
 SECTION .text
 
 cextern pw_1
@@ -1530,7 +1532,42 @@
     mov    r3,    r0
     mov    r5,    16
     call   transpose8_internal
+    RET
 %else
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal transpose8, 3, 4, 4                 ; 8x8 byte transpose; r0 = dest, r1 = src, r2 = stride
+    lea          r3, [r2 * 3]               ; r3 = 3 * stride
+    movq         xm0, [r1]                  ; low  8 bytes = row1
+    movhps       xm0, [r1 + 2 * r2]         ; high 8 bytes = row3
+    movq         xm1, [r1 + r2]             ; low  8 bytes = row2
+    movhps       xm1, [r1 + r3]             ; high 8 bytes = row4
+    lea          r1, [r1 + 4 * r2]          ; advance src by four rows
+    movq         xm2, [r1]                  ; low  8 bytes = row5
+    movhps       xm2, [r1 + 2 * r2]         ; high 8 bytes = row7
+    movq         xm3, [r1 + r2]             ; low  8 bytes = row6
+    movhps       xm3, [r1 + r3]             ; high 8 bytes = row8

+    vinserti128  m0, m0, xm2, 1             ;[row1 row3 row5 row7]
+    vinserti128  m1, m1, xm3, 1             ;[row2 row4 row6 row8]

+    punpcklbw    m2, m0, m1                 ;[1 - 8; 1 - 8][row1row2; row5row6]
+    punpckhbw    m0, m1                     ;[1 - 8; 1 - 8][row3row4; row7row8]

+    punpcklwd    m1, m2, m0                 ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8]
+    punpckhwd    m2, m0                     ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8]

+    mova         m0, [trans8_shuf]          ; dword indices for vpermd; NOTE(review): aligned load, confirm trans8_shuf is 32-byte aligned

+    vpermd       m1, m0, m1                 ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8]
+    vpermd       m2, m0, m2                 ;[5 - 6; 7 - 8][row1row2row3row4row5row6row7row8]

+    movu         [r0], m1                   ; dest bytes  0-31: transposed columns 1-4
+    movu         [r0 + 32], m2              ; dest bytes 32-63: transposed columns 5-8
+    RET
+%endif
+
+INIT_XMM sse2
 cglobal transpose8, 3, 5, 8, dest, src, stride
     lea          r3,    [2 * r2]
     lea          r4,    [3 * r2]
@@ -1562,8 +1599,8 @@
     movu         [r0 + 16],    m2
     movu         [r0 + 32],    m1
     movu         [r0 + 48],    m3
+    RET
 %endif
-    RET
 
 %macro TRANSPOSE_8x8 1
 
@@ -1636,7 +1673,7 @@
 %else
 %if ARCH_X86_64 == 1
 INIT_YMM avx2
-cglobal transpose16, 3, 5, 16
+cglobal transpose16, 3, 5, 9
     lea          r3, [r2 * 3]
     lea          r4, [r1 + 8 * r2]
 


More information about the x265-devel mailing list