[x265] [PATCH 1 of 3] asm: generalize calSign to accelerate sao
Min Chen
chenm003 at 163.com
Fri Apr 3 13:10:44 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1428059407 -28800
# Node ID abf20efa2234fb7cd6a474d4dac6e3051a94b30c
# Parent 9a5fa67583feb6ffb7668f82632f7e93e5ec9415
asm: generalize calSign to accelerate sao
---
source/common/x86/const-a.asm | 3 ++
source/common/x86/loopfilter.asm | 69 ++++++++++++++++++++++++++-----------
source/encoder/sao.cpp | 14 ++------
source/test/pixelharness.cpp | 8 ++--
4 files changed, 58 insertions(+), 36 deletions(-)
diff -r 9a5fa67583fe -r abf20efa2234 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Thu Apr 02 13:21:32 2015 -0500
+++ b/source/common/x86/const-a.asm Fri Apr 03 19:10:07 2015 +0800
@@ -65,6 +65,9 @@
const pb_32, times 32 db 32
const pb_128, times 16 db 128
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
+const pb_movemask, times 16 db 0x00
+ times 16 db 0xFF
+
const pw_0_15, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
const pw_2, times 8 dw 2
diff -r 9a5fa67583fe -r abf20efa2234 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Thu Apr 02 13:21:32 2015 -0500
+++ b/source/common/x86/loopfilter.asm Fri Apr 03 19:10:07 2015 +0800
@@ -36,6 +36,7 @@
cextern pb_128
cextern pb_2
cextern pw_2
+cextern pb_movemask
;============================================================================================================
@@ -321,29 +322,55 @@
RET
;============================================================================================================
-; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int endX)
+; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
;============================================================================================================
INIT_XMM sse4
-cglobal calSign, 4, 4, 6
- mova m1, [pb_128]
- mova m0, [pb_1]
- shr r3d, 4
-.loop
- movu m2, [r1] ; m2 = pRec[x]
- movu m3, [r2] ; m3 = pTmpU[x]
+cglobal calSign, 4,5,6
+ mova m0, [pb_128]
+ mova m1, [pb_1]
- pxor m4, m2, m1
- pxor m3, m1
- pcmpgtb m5, m4, m3
- pcmpgtb m3, m4
- pand m5, m0
- por m5, m3
+ sub r1, r0
+ sub r2, r0
- movu [r0], m5
+ mov r4d, r3d
+ shr r3d, 4
+ jz .next
+.loop:
+ movu m2, [r0 + r1] ; m2 = pRec[x]
+ movu m3, [r0 + r2] ; m3 = pTmpU[x]
+ pxor m4, m2, m0
+ pxor m3, m0
+ pcmpgtb m5, m4, m3
+ pcmpgtb m3, m4
+ pand m5, m1
+ por m5, m3
+ movu [r0], m5
- add r0, 16
- add r1, 16
- add r2, 16
+ add r0, 16
dec r3d
jnz .loop
+
+ ; process partial
+.next:
+ and r4d, 15
+ jz .end
+
+ movu m2, [r0 + r1] ; m2 = pRec[x]
+ movu m3, [r0 + r2] ; m3 = pTmpU[x]
+ pxor m4, m2, m0
+ pxor m3, m0
+ pcmpgtb m5, m4, m3
+ pcmpgtb m3, m4
+ pand m5, m1
+ por m5, m3
+
+ lea r3, [pb_movemask + 16]
+ sub r3, r4
+ movu xmm0, [r3]
+ movu m3, [r0]
+ pblendvb m5, m5, m3, xmm0
+ movu [r0], m5
+
+.end:
RET
+
diff -r 9a5fa67583fe -r abf20efa2234 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Thu Apr 02 13:21:32 2015 -0500
+++ b/source/encoder/sao.cpp Fri Apr 03 19:10:07 2015 +0800
@@ -783,13 +783,7 @@
rec += stride;
}
- if (!(ctuWidth & 15))
- primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
- else
- {
- for (x = 0; x < ctuWidth; x++)
- upBuff1[x] = signOf(rec[x] - rec[x - stride]);
- }
+ primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
for (y = startY; y < endY; y++)
{
@@ -832,8 +826,7 @@
rec += stride;
}
- for (x = startX; x < endX; x++)
- upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);
+ primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
for (y = startY; y < endY; y++)
{
@@ -879,8 +872,7 @@
rec += stride;
}
- for (x = startX - 1; x < endX; x++)
- upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);
+ primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
for (y = startY; y < endY; y++)
{
diff -r 9a5fa67583fe -r abf20efa2234 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Apr 02 13:21:32 2015 -0500
+++ b/source/test/pixelharness.cpp Fri Apr 03 19:10:07 2015 +0800
@@ -870,8 +870,8 @@
bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
{
- ALIGN_VAR_16(int8_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int8_t, opt_dest[64 * 64]);
+ ALIGN_VAR_16(int8_t, ref_dest[64 * 2]);
+ ALIGN_VAR_16(int8_t, opt_dest[64 * 2]);
memset(ref_dest, 0xCD, sizeof(ref_dest));
memset(opt_dest, 0xCD, sizeof(opt_dest));
@@ -880,12 +880,12 @@
for (int i = 0; i < ITERS; i++)
{
- int width = 16 * (rand() % 4 + 1);
+ int width = (rand() % 64) + 1;
ref(ref_dest, pbuf2 + j, pbuf3 + j, width);
checked(opt, opt_dest, pbuf2 + j, pbuf3 + j, width);
- if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int8_t)))
+ if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
return false;
reportfail();
More information about the x265-devel mailing list