[x265] [PATCH Review only] asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks
murugan at multicorewareinc.com
Tue Dec 3 16:02:09 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386082906 -19800
# Tue Dec 03 20:31:46 2013 +0530
# Node ID 99134096118bff621f56949e3922cd3f53afdf10
# Parent 126f3aefc79dad37e7985953c404ccff370d2729
asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks
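
[Review note] For reference, these primitives must match the C primitive
transpose: dst is a packed NxN block, src is strided in pixels. A minimal
C++ sketch of that contract (assuming the HIGH_BIT_DEPTH pixel type is
16-bit; the function name is hypothetical):

    #include <cstdint>

    template<int N>
    void transpose_ref(uint16_t* dst, const uint16_t* src, intptr_t stride)
    {
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                dst[i * N + j] = src[j * stride + i];
    }

At 10bpp each pixel occupies two bytes, which is why every HIGH_BIT_DEPTH
path below begins with "add r2, r2" to turn the pixel stride into a byte
stride.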
diff -r 126f3aefc79d -r 99134096118b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 03 18:33:13 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 20:31:46 2013 +0530
@@ -520,6 +520,10 @@
p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
+ p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
+ p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
+ p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
+
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
PIXEL_AVG(sse2);
PIXEL_AVG_W4(mmx2);
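
[Review note] The 4x4 core added in pixel-util8.asm below is the standard
word-interleave transpose: punpcklwd merges row pairs, then
punpckldq/punpckhdq yield the transposed row pairs. An equivalent SSE2
intrinsics sketch for one 4x4 tile of 16-bit pixels (helper name
hypothetical, stride in pixels):

    #include <emmintrin.h>
    #include <cstdint>

    void transpose4x4_w(uint16_t* dst, const uint16_t* src, intptr_t stride)
    {
        __m128i r0 = _mm_loadl_epi64((const __m128i*)(src));              // a0 a1 a2 a3
        __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + stride));     // b0 b1 b2 b3
        __m128i r2 = _mm_loadl_epi64((const __m128i*)(src + 2 * stride)); // c0 c1 c2 c3
        __m128i r3 = _mm_loadl_epi64((const __m128i*)(src + 3 * stride)); // d0 d1 d2 d3
        __m128i t0 = _mm_unpacklo_epi16(r0, r1);                          // a0 b0 a1 b1 a2 b2 a3 b3
        __m128i t1 = _mm_unpacklo_epi16(r2, r3);                          // c0 d0 c1 d1 c2 d2 c3 d3
        _mm_storeu_si128((__m128i*)dst,       _mm_unpacklo_epi32(t0, t1)); // rows 0-1: a0 b0 c0 d0 ...
        _mm_storeu_si128((__m128i*)(dst + 8), _mm_unpackhi_epi32(t0, t1)); // rows 2-3
    }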
diff -r 126f3aefc79d -r 99134096118b source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Dec 03 18:33:13 2013 +0530
+++ b/source/common/x86/pixel-util8.asm Tue Dec 03 20:31:46 2013 +0530
@@ -830,7 +830,20 @@
;-----------------------------------------------------------------
INIT_XMM sse2
cglobal transpose4, 3, 3, 4, dest, src, stride
-
+%if HIGH_BIT_DEPTH
+    add       r2, r2                  ; stride: pixels -> bytes (2 bytes/pixel)
+    movh      m0, [r1]                ; row 0: 4 x 16-bit pixels
+    movh      m1, [r1 + r2]           ; row 1
+    movh      m2, [r1 + 2 * r2]       ; row 2
+    lea       r1, [r1 + 2 * r2]
+    movh      m3, [r1 + r2]           ; row 3
+    punpcklwd m0, m1                  ; a0 b0 a1 b1 a2 b2 a3 b3
+    punpcklwd m2, m3                  ; c0 d0 c1 d1 c2 d2 c3 d3
+    punpckhdq m1, m0, m2              ; transposed rows 2-3
+    punpckldq m0, m2                  ; transposed rows 0-1
+    movu      [r0], m0
+    movu      [r0 + 16], m1
+%else
movd m0, [r1]
movd m1, [r1 + r2]
movd m2, [r1 + 2 * r2]
@@ -841,15 +854,88 @@
punpcklbw m2, m3
punpcklwd m0, m2
movu [r0], m0
-
+%endif
RET
;-----------------------------------------------------------------
; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
INIT_XMM sse2
-cglobal transpose8, 3, 3, 8, dest, src, stride
-
+%if HIGH_BIT_DEPTH
+%macro TRANSPOSE_4x4 1               ; %1 = dest pitch in bytes
+    movh      m0, [r1]                ; 4x4 tile, 4 x 16-bit pixels per row
+    movh      m1, [r1 + r2]
+    movh      m2, [r1 + 2 * r2]
+    lea       r1, [r1 + 2 * r2]
+    movh      m3, [r1 + r2]
+    punpcklwd m0, m1
+    punpcklwd m2, m3
+    punpckhdq m1, m0, m2              ; transposed rows 2-3
+    punpckldq m0, m2                  ; transposed rows 0-1
+    movlps    [r0], m0                ; store the four output rows at pitch %1
+    movhps    [r0 + %1], m0
+    movlps    [r0 + 2 * %1], m1
+    lea       r0, [r0 + 2 * %1]
+    movhps    [r0 + %1], m1
+%endmacro
+cglobal transpose8_internal          ; r1 = src, r3 = dst base, r5 = dst pitch (bytes)
+    TRANSPOSE_4x4 r5                  ; src(0-3, 0-3)  -> dst(0-3, 0-3)
+    lea       r1, [r1 + 2 * r2]       ; src rows 4-7
+    lea       r0, [r3 + 8]            ; dst columns 4-7
+    TRANSPOSE_4x4 r5                  ; src(4-7, 0-3)  -> dst(0-3, 4-7)
+    lea       r1, [r1 + 2 * r2]
+    neg       r2
+    lea       r1, [r1 + r2 * 8 + 8]   ; rewind src 8 rows, step 4 pixels right
+    neg       r2
+    lea       r0, [r3 + 4 * r5]       ; dst rows 4-7
+    TRANSPOSE_4x4 r5                  ; src(0-3, 4-7)  -> dst(4-7, 0-3)
+    lea       r1, [r1 + 2 * r2]
+    lea       r0, [r3 + 8 + 4 * r5]   ; dst rows 4-7, columns 4-7
+    TRANSPOSE_4x4 r5                  ; src(4-7, 4-7)  -> dst(4-7, 4-7)
+    ret
+cglobal transpose8, 3, 6, 4, dest, src, stride
+    add       r2, r2                  ; stride: pixels -> bytes
+    mov       r3, r0                  ; dst base for transpose8_internal
+    mov       r4, r1                  ; save src
+    mov       r5, 16                  ; dst pitch: 8 pixels * 2 bytes
+    call      transpose8_internal
+    RET                               ; RET macro, so saved GPRs are restored on Win64
+%else
+cglobal transpose8, 3, 5, 8, dest, src, stride
+ lea r3, [2 * r2]
+ lea r4, [3 * r2]
+ movh m0, [r1]
+ movh m1, [r1 + r2]
+ movh m2, [r1 + r3]
+ movh m3, [r1 + r4]
+ movh m4, [r1 + 4 * r2]
+ lea r1, [r1 + 4 * r2]
+ movh m5, [r1 + r2]
+ movh m6, [r1 + r3]
+ movh m7, [r1 + r4]
+
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m5, m4, m6
+ punpcklwd m4, m6
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckhdq m3, m1, m5
+ punpckldq m1, m5
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + 32], m1
+ movu [r0 + 48], m3
+%endif
+ RET
+
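
[Review note] transpose8_internal builds the 8x8 transpose from four 4x4
tiles: the source tile at (rows i, cols j) lands at (rows j, cols i) in the
destination, with r3 the destination base and r5 the destination pitch in
bytes (16 for transpose8, 32 for transpose16). A scalar C++ sketch of the
tiling, hypothetical and for review only:

    #include <cstdint>

    void transpose8x8_tiled(uint16_t* dst, const uint16_t* src, intptr_t stride)
    {
        for (int i = 0; i < 8; i += 4)       // source tile rows
            for (int j = 0; j < 8; j += 4)   // source tile columns
                for (int y = 0; y < 4; y++)
                    for (int x = 0; x < 4; x++)
                        dst[(j + y) * 8 + (i + x)] = src[(i + x) * stride + (j + y)];
    }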
+%macro TRANSPOSE_8x8 1
movh m0, [r1]
movh m1, [r1 + r2]
movh m2, [r1 + 2 * r2]
@@ -866,42 +952,6 @@
punpcklbw m2, m3
punpcklbw m4, m5
punpcklbw m6, m7
-
- punpckhwd m1, m0, m2
- punpcklwd m0, m2
- punpckhwd m5, m4, m6
- punpcklwd m4, m6
- punpckhdq m2, m0, m4
- punpckldq m0, m4
- punpckhdq m3, m1, m5
- punpckldq m1, m5
-
- movu [r0], m0
- movu [r0 + 16], m2
- movu [r0 + 32], m1
- movu [r0 + 48], m3
-
- RET
-
-%macro TRANSPOSE_8x8 1
-
- movh m0, [r1]
- movh m1, [r1 + r2]
- movh m2, [r1 + 2 * r2]
- lea r1, [r1 + 2 * r2]
- movh m3, [r1 + r2]
- movh m4, [r1 + 2 * r2]
- lea r1, [r1 + 2 * r2]
- movh m5, [r1 + r2]
- movh m6, [r1 + 2 * r2]
- lea r1, [r1 + 2 * r2]
- movh m7, [r1 + r2]
-
- punpcklbw m0, m1
- punpcklbw m2, m3
- punpcklbw m4, m5
- punpcklbw m6, m7
-
punpckhwd m1, m0, m2
punpcklwd m0, m2
punpckhwd m5, m4, m6
@@ -922,14 +972,33 @@
movlps [r0 + 2 * %1], m3
lea r0, [r0 + 2 * %1]
movhps [r0 + %1], m3
-
%endmacro
-
;-----------------------------------------------------------------
; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
INIT_XMM sse2
+%if HIGH_BIT_DEPTH
+cglobal transpose16, 3, 7, 4, dest, src, stride
+    add       r2, r2                  ; stride: pixels -> bytes
+    mov       r3, r0
+    mov       r4, r1                  ; save src
+    mov       r5, 32                  ; dst pitch: 16 pixels * 2 bytes
+    mov       r6, r0                  ; save dst
+    call      transpose8_internal     ; src(0-7,  0-7)  -> dst(0-7,  0-7)
+    lea       r1, [r1 - 8 + 2 * r2]   ; src rows 8-15, columns 0-7
+    lea       r0, [r6 + 16]           ; dst columns 8-15
+    mov       r3, r0
+    call      transpose8_internal     ; src(8-15, 0-7)  -> dst(0-7,  8-15)
+    lea       r1, [r4 + 16]           ; src rows 0-7, columns 8-15
+    lea       r0, [r6 + 8 * 32]       ; dst rows 8-15
+    mov       r3, r0
+    call      transpose8_internal     ; src(0-7,  8-15) -> dst(8-15, 0-7)
+    lea       r1, [r1 - 8 + 2 * r2]   ; src rows 8-15, columns 8-15
+    lea       r0, [r6 + 8 * 32 + 16]  ; dst rows 8-15, columns 8-15
+    mov       r3, r0
+    call      transpose8_internal     ; src(8-15, 8-15) -> dst(8-15, 8-15)
+%else
cglobal transpose16, 3, 5, 8, dest, src, stride
mov r3, r0
@@ -944,7 +1013,7 @@
lea r1, [r1 + 2 * r2]
lea r0, [r3 + 8 * 16 + 8]
TRANSPOSE_8x8 16
-
+%endif
RET
cglobal transpose16_internal
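
[Review note] In the new HIGH_BIT_DEPTH transpose16 the destination pitch is
r5 = 32 bytes (16 pixels * 2 bytes), so "r6 + 16" addresses columns 8-15 and
"r6 + 8 * 32" addresses rows 8-15. The two off-diagonal 8x8 quadrants swap
places, as a transpose requires, while the diagonal quadrants transpose in
place; r4 keeps the original source pointer so the third call can restart
from "r4 + 16" (rows 0-7, columns 8-15).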