[x265] [PATCH Review only] asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks

murugan at multicorewareinc.com murugan at multicorewareinc.com
Tue Dec 3 16:02:09 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386082906 -19800
#      Tue Dec 03 20:31:46 2013 +0530
# Node ID 99134096118bff621f56949e3922cd3f53afdf10
# Parent  126f3aefc79dad37e7985953c404ccff370d2729
asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks

diff -r 126f3aefc79d -r 99134096118b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Dec 03 18:33:13 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Dec 03 20:31:46 2013 +0530
@@ -520,6 +520,10 @@
         p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
         p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
 
+        p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
+        p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
+        p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
+
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
         PIXEL_AVG(sse2);
         PIXEL_AVG_W4(mmx2);
diff -r 126f3aefc79d -r 99134096118b source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Dec 03 18:33:13 2013 +0530
+++ b/source/common/x86/pixel-util8.asm	Tue Dec 03 20:31:46 2013 +0530
@@ -830,7 +830,20 @@
 ;-----------------------------------------------------------------
 INIT_XMM sse2
 cglobal transpose4, 3, 3, 4, dest, src, stride
-
+%if HIGH_BIT_DEPTH
+    add          r2,    r2                 ; double the stride; presumably pixels -> bytes for 2-byte pixels (stride units not shown here)
+    movh         m0,    [r1]               ; source row 0
+    movh         m1,    [r1 + r2]          ; source row 1
+    movh         m2,    [r1 + 2 * r2]      ; source row 2
+    lea          r1,    [r1 + 2 * r2]      ; step src down two rows so row 3 is reachable as [r1 + r2]
+    movh         m3,    [r1 + r2]          ; source row 3
+    punpcklwd    m0,    m1                 ; interleave words of rows 0 and 1: a0 b0 a1 b1 a2 b2 a3 b3
+    punpcklwd    m2,    m3                 ; interleave words of rows 2 and 3: c0 d0 c1 d1 c2 d2 c3 d3
+    punpckhdq    m1,    m0,    m2          ; m1 = a2 b2 c2 d2 | a3 b3 c3 d3 (source columns 2 and 3)
+    punpckldq    m0,    m2                 ; m0 = a0 b0 c0 d0 | a1 b1 c1 d1 (source columns 0 and 1)
+    movu         [r0],       m0            ; dest bytes 0-15: transposed rows 0 and 1
+    movu         [r0 + 16],  m1            ; dest bytes 16-31: transposed rows 2 and 3
+%else
     movd         m0,    [r1]
     movd         m1,    [r1 + r2]
     movd         m2,    [r1 + 2 * r2]
@@ -841,15 +854,88 @@
     punpcklbw    m2,    m3
     punpcklwd    m0,    m2
     movu         [r0],    m0
-
+%endif
     RET
 
 ;-----------------------------------------------------------------
 ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
 ;-----------------------------------------------------------------
 INIT_XMM sse2
-cglobal transpose8, 3, 3, 8, dest, src, stride
-
+%if HIGH_BIT_DEPTH
+%macro TRANSPOSE_4x4 1                     ; transpose a 4x4 tile of 16-bit words; %1 = byte offset between dest rows; clobbers m0-m3, advances r0 and r1
+    movh         m0,    [r1]               ; source row 0 (r1 = src cursor, r2 = src row offset)
+    movh         m1,    [r1 + r2]          ; source row 1
+    movh         m2,    [r1 + 2 * r2]      ; source row 2
+    lea          r1,    [r1 + 2 * r2]      ; side effect: r1 leaves the macro advanced by 2 * r2
+    movh         m3,    [r1 + r2]          ; source row 3
+    punpcklwd    m0,    m1                 ; a0 b0 a1 b1 a2 b2 a3 b3
+    punpcklwd    m2,    m3                 ; c0 d0 c1 d1 c2 d2 c3 d3
+    punpckhdq    m1,    m0,    m2          ; m1 = source columns 2 and 3
+    punpckldq    m0,    m2                 ; m0 = source columns 0 and 1
+    movlps         [r0],             m0    ; dest row 0 <- source column 0 (low 8 bytes of m0)
+    movhps         [r0 + %1],        m0    ; dest row 1 <- source column 1 (high 8 bytes of m0)
+    movlps         [r0 + 2 * %1],    m1    ; dest row 2 <- source column 2
+    lea            r0,               [r0 + 2 * %1]    ; side effect: r0 leaves the macro advanced by 2 * %1
+    movhps         [r0 + %1],        m1    ; dest row 3 <- source column 3
+%endmacro
+cglobal transpose8_internal                ; 8x8 word transpose as four 4x4 tiles; in: r0 = r3 = dest, r1 = src, r2 = src row bytes, r5 = dest row bytes
+    TRANSPOSE_4x4 r5                       ; src rows 0-3, cols 0-3 -> dest rows 0-3, cols 0-3; r1 is now src + 2*r2
+    lea    r1,    [r1 + 2 * r2]             ; r1 = src + 4*r2 (source row 4)
+    lea    r0,    [r3 + 8]                  ; dest row 0, 8 bytes in (column 4 of 2-byte elements)
+    TRANSPOSE_4x4 r5                       ; src rows 4-7, cols 0-3 -> dest rows 0-3, cols 4-7
+    lea    r1,    [r1 + 2 * r2]             ; r1 = src + 8*r2
+    neg    r2                               ; temporarily negate the row offset so lea can subtract 8 rows
+    lea    r1,    [r1 + r2 * 8 + 8]         ; r1 = src + 8: back to row 0, 8 bytes to the right
+    neg    r2                               ; restore the positive row offset
+    lea    r0,    [r3 + 4 * r5]             ; dest row 4, column 0
+    TRANSPOSE_4x4 r5                       ; src rows 0-3, cols 4-7 -> dest rows 4-7, cols 0-3
+    lea    r1,    [r1 + 2 * r2]             ; r1 = src + 8 + 4*r2 (source row 4, right half)
+    lea    r0,    [r3 + 8 + 4 * r5]         ; dest row 4, 8 bytes in
+    TRANSPOSE_4x4 r5                       ; src rows 4-7, cols 4-7 -> dest rows 4-7, cols 4-7
+    ret                                     ; plain near return (no prologue was requested); exit state: r1 = src + 8 + 6*r2
+cglobal transpose8, 3, 6, 4, dest, src, stride    ; HIGH_BIT_DEPTH 8x8 transpose; r0 = dest, r1 = src, r2 = stride
+    add    r2,    r2                        ; double the stride; presumably pixels -> bytes for 2-byte pixels
+    mov    r3,    r0                        ; r3 = dest base read by transpose8_internal
+    mov    r5,    16                        ; r5 = dest row size in bytes (8 elements * 2 bytes)
+    call   transpose8_internal              ; r4 is not read by the helper, so it is not initialised here
+    ; Deliberately no bare `ret`: six GPRs were requested from cglobal, so the
+    ; function must fall through to the shared RET after %endif (x86inc epilogue).
+%else
+cglobal transpose8, 3, 5, 8, dest, src, stride    ; 8x8 byte transpose (non-HIGH_BIT_DEPTH branch); r0 = dest, r1 = src, r2 = stride
+    lea          r3,    [2 * r2]            ; r3 = 2 * stride
+    lea          r4,    [3 * r2]            ; r4 = 3 * stride
+    movh         m0,    [r1]                ; source row 0 (movh presumably loads 8 bytes under INIT_XMM - confirm in x86inc)
+    movh         m1,    [r1 + r2]           ; source row 1
+    movh         m2,    [r1 + r3]           ; source row 2
+    movh         m3,    [r1 + r4]           ; source row 3
+    movh         m4,    [r1 + 4 * r2]       ; source row 4
+    lea          r1,    [r1 + 4 * r2]       ; move the src cursor to row 4 so rows 5-7 reuse r2/r3/r4
+    movh         m5,    [r1 + r2]           ; source row 5
+    movh         m6,    [r1 + r3]           ; source row 6
+    movh         m7,    [r1 + r4]           ; source row 7
+
+    punpcklbw    m0,    m1                  ; bytes of rows 0/1 interleaved: a0 b0 a1 b1 ... a7 b7
+    punpcklbw    m2,    m3                  ; rows 2/3 interleaved
+    punpcklbw    m4,    m5                  ; rows 4/5 interleaved
+    punpcklbw    m6,    m7                  ; rows 6/7 interleaved
+
+    punpckhwd    m1,    m0,    m2           ; m1 = rows 0-3, columns 4-7 (4 bytes per column)
+    punpcklwd    m0,    m2                  ; m0 = rows 0-3, columns 0-3
+    punpckhwd    m5,    m4,    m6           ; m5 = rows 4-7, columns 4-7
+    punpcklwd    m4,    m6                  ; m4 = rows 4-7, columns 0-3
+    punpckhdq    m2,    m0,    m4           ; m2 = full columns 2 and 3
+    punpckldq    m0,    m4                  ; m0 = full columns 0 and 1
+    punpckhdq    m3,    m1,    m5           ; m3 = full columns 6 and 7
+    punpckldq    m1,    m5                  ; m1 = full columns 4 and 5
+
+    movu         [r0],         m0           ; dest rows 0-1 (8 bytes each, stored contiguously)
+    movu         [r0 + 16],    m2           ; dest rows 2-3
+    movu         [r0 + 32],    m1           ; dest rows 4-5
+    movu         [r0 + 48],    m3           ; dest rows 6-7
+%endif
+    RET
+
+%macro TRANSPOSE_8x8 1
     movh         m0,    [r1]
     movh         m1,    [r1 + r2]
     movh         m2,    [r1 + 2 * r2]
@@ -866,42 +952,6 @@
     punpcklbw    m2,    m3
     punpcklbw    m4,    m5
     punpcklbw    m6,    m7
-
-    punpckhwd    m1,    m0,    m2
-    punpcklwd    m0,    m2
-    punpckhwd    m5,    m4,    m6
-    punpcklwd    m4,    m6
-    punpckhdq    m2,    m0,    m4
-    punpckldq    m0,    m4
-    punpckhdq    m3,    m1,    m5
-    punpckldq    m1,    m5
-
-    movu         [r0],         m0
-    movu         [r0 + 16],    m2
-    movu         [r0 + 32],    m1
-    movu         [r0 + 48],    m3
-
-    RET
-
-%macro TRANSPOSE_8x8 1
-
-    movh         m0,    [r1]
-    movh         m1,    [r1 + r2]
-    movh         m2,    [r1 + 2 * r2]
-    lea          r1,    [r1 + 2 * r2]
-    movh         m3,    [r1 + r2]
-    movh         m4,    [r1 + 2 * r2]
-    lea          r1,    [r1 + 2 * r2]
-    movh         m5,    [r1 + r2]
-    movh         m6,    [r1 + 2 * r2]
-    lea          r1,    [r1 + 2 * r2]
-    movh         m7,    [r1 + r2]
-
-    punpcklbw    m0,    m1
-    punpcklbw    m2,    m3
-    punpcklbw    m4,    m5
-    punpcklbw    m6,    m7
-
     punpckhwd    m1,    m0,    m2
     punpcklwd    m0,    m2
     punpckhwd    m5,    m4,    m6
@@ -922,14 +972,33 @@
     movlps         [r0 + 2 * %1],    m3
     lea            r0,               [r0 + 2 * %1]
     movhps         [r0 + %1],        m3
-
 %endmacro
 
-
 ;-----------------------------------------------------------------
 ; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
 ;-----------------------------------------------------------------
 INIT_XMM sse2
+%if HIGH_BIT_DEPTH
+cglobal transpose16, 3, 7, 4, dest, src, stride   ; HIGH_BIT_DEPTH 16x16 transpose built from four transpose8_internal calls
+    add    r2,    r2                        ; double the stride; presumably pixels -> bytes for 2-byte pixels
+    mov    r3,    r0                        ; r3 = dest base of the current 8x8 tile
+    mov    r4,    r1                        ; r4 = original src pointer, reused for the right-hand source tiles
+    mov    r5,    32                        ; r5 = dest row size in bytes (16 elements * 2 bytes)
+    mov    r6,    r0                        ; r6 = original dest pointer
+    call   transpose8_internal              ; src rows 0-7, cols 0-7 -> dest rows 0-7, cols 0-7
+    lea    r1,    [r1 - 8 + 2 * r2]         ; helper exits with r1 = src + 8 + 6*r2, so this yields src + 8*r2 (row 8, col 0)
+    lea    r0,    [r6 + 16]                 ; dest row 0, 16 bytes in (column 8)
+    mov    r3,    r0                        ; the helper recomputes r0 from r3, so keep them equal
+    call   transpose8_internal              ; src rows 8-15, cols 0-7 -> dest rows 0-7, cols 8-15
+    lea    r1,    [r4 + 16]                 ; src row 0, 16 bytes in (column 8)
+    lea    r0,    [r6 + 8 * 32]             ; dest row 8, column 0
+    mov    r3,    r0                        ; tile base for the helper
+    call   transpose8_internal              ; src rows 0-7, cols 8-15 -> dest rows 8-15, cols 0-7
+    lea    r1,    [r1 - 8 + 2 * r2]         ; r1 = src + 16 + 8*r2 (row 8, column 8)
+    lea    r0,    [r6 + 8 * 32 + 16]        ; dest row 8, column 8
+    mov    r3,    r0                        ; tile base for the helper
+    call   transpose8_internal              ; src rows 8-15, cols 8-15 -> dest rows 8-15, cols 8-15
+%else
 cglobal transpose16, 3, 5, 8, dest, src, stride
 
     mov    r3,    r0
@@ -944,7 +1013,7 @@
     lea    r1,    [r1 + 2 * r2]
     lea    r0,    [r3 + 8 * 16 + 8]
     TRANSPOSE_8x8 16
-
+%endif
     RET
 
 cglobal transpose16_internal


More information about the x265-devel mailing list