[x265] [PATCH 1 of 6] asm: 10bpp sse4 code for saoCuOrgE0, improved 8740c->974c, over C code
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Mon Jun 22 14:50:33 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1434712676 -19800
# Fri Jun 19 16:47:56 2015 +0530
# Node ID a94e9a1f0fde08e060a9b52e3353ce2f242d9257
# Parent 83a7d824442455ba5e0a6b53ea68e6b7043845de
asm: 10bpp sse4 code for saoCuOrgE0, improved 8740c->974c, over C code
diff -r 83a7d8244424 -r a94e9a1f0fde source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/CMakeLists.txt Fri Jun 19 16:47:56 2015 +0530
@@ -46,7 +46,7 @@
mc-a2.asm pixel-util8.asm blockcopy8.asm
pixeladd8.asm dct8.asm)
if(HIGH_BIT_DEPTH)
- set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm)
+ set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm)
else()
set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm)
endif()
diff -r 83a7d8244424 -r a94e9a1f0fde source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jun 19 16:47:56 2015 +0530
@@ -1089,6 +1089,8 @@
}
if (cpuMask & X265_CPU_SSE4)
{
+ p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
+
LUMA_ADDAVG(sse4);
CHROMA_420_ADDAVG(sse4);
CHROMA_422_ADDAVG(sse4);
diff -r 83a7d8244424 -r a94e9a1f0fde source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/loopfilter.asm Fri Jun 19 16:47:56 2015 +0530
@@ -38,6 +38,7 @@
cextern pb_128
cextern pb_2
cextern pw_2
+cextern pw_1023
cextern pb_movemask
@@ -45,6 +46,107 @@
; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
;============================================================================================================
INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgE0, 4,5,9 ; 16-bit-pixel SAO edge offset (class E0, horizontal) applied to two rows: rec and rec + stride; x86inc: 4 args loaded, 5 GPRs, 9 XMM regs
+ mov r4d, r4m ; r4 = stride (5th argument); 32-bit load - NOTE(review): assumes stride is non-negative and fits in 32 bits, confirm with callers
+ movh m6, [r1] ; m6 = offsetEo table, used below as the pshufb lookup source
+ movzx r1d, byte [r3] ; r1 = signLeft[0]
+ pxor m5, m5 ; m5 = 0 (lower clip bound and palignr filler)
+ neg r1b
+ movd m0, r1d ; m0 byte 0 = -signLeft[0], carry-in for the first 16 pixels of row 0
+ lea r1, [r0 + r4 * 2] ; r1 = second row pointer: rec + stride pixels of 2 bytes each
+ mov r4d, r2d ; r4 = lcuWidth, counter for .loopH (r2 is consumed by .loop)
+
+.loop: ; first row: 16 pixels per iteration
+ movu m7, [r0] ; m7 = pixels 0-7
+ movu m8, [r0 + 16] ; m8 = pixels 8-15
+ movu m2, [r0 + 2] ; m2 = right neighbours of pixels 0-7
+ movu m1, [r0 + 18] ; m1 = right neighbours of pixels 8-15 (reads one pixel past the 16 being processed)
+
+ pcmpgtw m3, m7, m2 ; m3 = -1 where cur > right (pixels 0-7)
+ pcmpgtw m2, m7 ; m2 = -1 where right > cur (pixels 0-7)
+ pcmpgtw m4, m8, m1 ; m4 = -1 where cur > right (pixels 8-15)
+ pcmpgtw m1, m8 ; m1 = -1 where right > cur (pixels 8-15)
+
+ packsswb m3, m4 ; narrow the word masks to 16 bytes (0 / -1)
+ packsswb m2, m1
+
+ pand m3, [pb_1] ; turn -1 into +1 where cur > right (pb_1 presumably holds 1 in every byte)
+ por m3, m2 ; m3 = signRight = sign(cur - right) per pixel, in {-1, 0, +1}
+
+ palignr m2, m3, m5, 15 ; m2 = signRight moved up one byte lane, byte 0 = 0
+ por m2, m0 ; byte 0 = carry-in (-signLeft of pixel 0)
+
+ mova m4, [pw_1023] ; upper clip bound, reloaded each pass because m4 is a temp above - NOTE(review): presumably words of 1023 (10-bit max); confirm for other bit depths
+ psignb m2, [pb_128] ; m2 = signLeft (bytes negated, assuming every pb_128 byte is negative)
+ pxor m0, m0
+ palignr m0, m3, 15 ; m0 byte 0 = signRight of pixel 15, carry-in for the next 16 pixels
+ paddb m3, m2 ; signRight + signLeft
+ paddb m3, [pb_2] ; m3 = uiEdgeType (sign sum biased by pb_2, presumably +2)
+ pshufb m2, m6, m3 ; m2 = offsetEo[uiEdgeType] for each of the 16 pixels
+ pmovsxbw m3, m2 ; offsetEo for pixels 0-7, sign-extended to words
+ punpckhbw m2, m2
+ psraw m2, 8 ; offsetEo for pixels 8-15, sign-extended to words
+ paddw m7, m3 ; add offsets to the pixels
+ paddw m8, m2
+ pmaxsw m7, m5 ; clip below at 0
+ pmaxsw m8, m5
+ pminsw m7, m4 ; clip above at the pw_1023 bound
+ pminsw m8, m4
+ movu [r0], m7 ; store the 16 filtered pixels back in place
+ movu [r0 + 16], m8
+
+ add r0q, 32 ; advance 16 pixels (32 bytes)
+ sub r2d, 16
+ jnz .loop ; exits only when the count hits exactly 0, so lcuWidth must be a positive multiple of 16
+
+ movzx r3d, byte [r3 + 1] ; r3 = signLeft[1] for the second row
+ neg r3b
+ movd m0, r3d ; m0 byte 0 = -signLeft[1], carry-in for the first 16 pixels of row 1
+.loopH: ; second row: same steps as .loop, using pointer r1 and counter r4
+ movu m7, [r1] ; m7 = pixels 0-7
+ movu m8, [r1 + 16] ; m8 = pixels 8-15
+ movu m2, [r1 + 2] ; m2 = right neighbours of pixels 0-7
+ movu m1, [r1 + 18] ; m1 = right neighbours of pixels 8-15 (reads one pixel past the 16 being processed)
+
+ pcmpgtw m3, m7, m2 ; m3 = -1 where cur > right (pixels 0-7)
+ pcmpgtw m2, m7 ; m2 = -1 where right > cur (pixels 0-7)
+ pcmpgtw m4, m8, m1 ; m4 = -1 where cur > right (pixels 8-15)
+ pcmpgtw m1, m8 ; m1 = -1 where right > cur (pixels 8-15)
+
+ packsswb m3, m4 ; narrow the word masks to 16 bytes (0 / -1)
+ packsswb m2, m1
+
+ pand m3, [pb_1] ; turn -1 into +1 where cur > right (pb_1 presumably holds 1 in every byte)
+ por m3, m2 ; m3 = signRight = sign(cur - right) per pixel, in {-1, 0, +1}
+
+ palignr m2, m3, m5, 15 ; m2 = signRight moved up one byte lane, byte 0 = 0
+ por m2, m0 ; byte 0 = carry-in (-signLeft of pixel 0)
+
+ mova m4, [pw_1023] ; upper clip bound, reloaded each pass because m4 is a temp above - NOTE(review): presumably words of 1023 (10-bit max); confirm for other bit depths
+ psignb m2, [pb_128] ; m2 = signLeft (bytes negated, assuming every pb_128 byte is negative)
+ pxor m0, m0
+ palignr m0, m3, 15 ; m0 byte 0 = signRight of pixel 15, carry-in for the next 16 pixels
+ paddb m3, m2 ; signRight + signLeft
+ paddb m3, [pb_2] ; m3 = uiEdgeType (sign sum biased by pb_2, presumably +2)
+ pshufb m2, m6, m3 ; m2 = offsetEo[uiEdgeType] for each of the 16 pixels
+ pmovsxbw m3, m2 ; offsetEo for pixels 0-7, sign-extended to words
+ punpckhbw m2, m2
+ psraw m2, 8 ; offsetEo for pixels 8-15, sign-extended to words
+ paddw m7, m3 ; add offsets to the pixels
+ paddw m8, m2
+ pmaxsw m7, m5 ; clip below at 0
+ pmaxsw m8, m5
+ pminsw m7, m4 ; clip above at the pw_1023 bound
+ pminsw m8, m4
+ movu [r1], m7 ; store the 16 filtered pixels back in place
+ movu [r1 + 16], m8
+
+ add r1q, 32 ; advance 16 pixels (32 bytes)
+ sub r4d, 16
+ jnz .loopH ; same multiple-of-16 width requirement as .loop
+ RET
+%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
mov r4d, r4m
@@ -130,6 +232,7 @@
sub r4d, 16
jnz .loopH
RET
+%endif
INIT_YMM avx2
cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
diff -r 83a7d8244424 -r a94e9a1f0fde source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jun 22 15:15:33 2015 +0530
+++ b/source/test/pixelharness.cpp Fri Jun 19 16:47:56 2015 +0530
@@ -901,8 +901,8 @@
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
- memset(ref_dest, 0xCD, sizeof(ref_dest));
- memset(opt_dest, 0xCD, sizeof(opt_dest));
+ for (int i = 0; i < 64 * 64; i++)
+ ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
int j = 0;
More information about the x265-devel mailing list