[x265] [PATCH] asm: add pixel restoration part in saoCuOrgE2 primitive
Divya Manivannan
divya at multicorewareinc.com
Fri Apr 24 10:49:32 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429864855 -19800
# Fri Apr 24 14:10:55 2015 +0530
# Node ID cfc321e81396f4ad93b1bb92f786d5d233acadf8
# Parent deea3a0293187e142884b9aa2a719468f1ce5be6
asm: add pixel restoration part in saoCuOrgE2 primitive
diff -r deea3a029318 -r cfc321e81396 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Fri Apr 24 13:47:58 2015 +0530
+++ b/source/common/x86/loopfilter.asm Fri Apr 24 14:10:55 2015 +0530
@@ -406,60 +406,66 @@
; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
;======================================================================================================================================================
INIT_XMM sse4
-cglobal saoCuOrgE2, 5, 7, 8, rec, bufft, buff1, offsetEo, lcuWidth
-
- mov r6, 16
+cglobal saoCuOrgE2, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth
+ mov r4d, r4m
mov r5d, r5m
pxor m0, m0 ; m0 = 0
mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
mova m7, [pb_128]
- shr r4d, 4
- inc r1q
+ inc r1
+ movh m5, [r0 + r4]
+ movhps m5, [r1 + r4]
- .loop
- movu m1, [r0] ; m1 = rec[x]
- movu m2, [r0 + r5 + 1] ; m2 = rec[x + stride + 1]
- pxor m3, m1, m7
- pxor m4, m2, m7
- pcmpgtb m2, m3, m4
- pcmpgtb m4, m3
- pand m2, [pb_1]
- por m2, m4
- movu m3, [r2] ; m3 = buff1
+.loop
+ movu m1, [r0] ; m1 = rec[x]
+ movu m2, [r0 + r5 + 1] ; m2 = rec[x + stride + 1]
+ pxor m3, m1, m7
+ pxor m4, m2, m7
+ pcmpgtb m2, m3, m4
+ pcmpgtb m4, m3
+ pand m2, [pb_1]
+ por m2, m4
+ movu m3, [r2] ; m3 = buff1
- paddb m3, m2
- paddb m3, m6 ; m3 = edgeType
+ paddb m3, m2
+ paddb m3, m6 ; m3 = edgeType
- movu m4, [r3] ; m4 = offsetEo
- pshufb m4, m3
+ movu m4, [r3] ; m4 = offsetEo
+ pshufb m4, m3
- psubb m3, m0, m2
- movu [r1], m3
+ psubb m3, m0, m2
+ movu [r1], m3
- pmovzxbw m2, m1
- punpckhbw m1, m0
- pmovsxbw m3, m4
- punpckhbw m4, m4
- psraw m4, 8
+ pmovzxbw m2, m1
+ punpckhbw m1, m0
+ pmovsxbw m3, m4
+ punpckhbw m4, m4
+ psraw m4, 8
- paddw m2, m3
- paddw m1, m4
- packuswb m2, m1
- movu [r0], m2
+ paddw m2, m3
+ paddw m1, m4
+ packuswb m2, m1
+ movu [r0], m2
- add r0, r6
- add r1, r6
- add r2, r6
- dec r4d
- jnz .loop
+ add r0, 16
+ add r1, 16
+ add r2, 16
+ sub r4, 16
+ jg .loop
+
+ movh [r0 + r4], m5
+ movhps [r1 + r4], m5
RET
INIT_YMM avx2
-cglobal saoCuOrgE2, 5, 6, 6, rec, bufft, buff1, offsetEo, lcuWidth
+cglobal saoCuOrgE2, 5, 6, 7, rec, bufft, buff1, offsetEo, lcuWidth
+ mov r4d, r4m
mov r5d, r5m
pxor xm0, xm0 ; xm0 = 0
mova xm5, [pb_128]
inc r1
+ movq xm6, [r0 + r4]
+ movhps xm6, [r1 + r4]
movu xm1, [r0] ; xm1 = rec[x]
movu xm2, [r0 + r5 + 1] ; xm2 = rec[x + stride + 1]
@@ -487,17 +493,21 @@
vextracti128 xm3, m2, 1
packuswb xm2, xm3
movu [r0], xm2
+
+ movq [r0 + r4], xm6
+ movhps [r1 + r4], xm6
RET
INIT_YMM avx2
cglobal saoCuOrgE2_32, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth
+ mov r4d, r4m
mov r5d, r5m
pxor m0, m0 ; m0 = 0
- mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
vbroadcasti128 m7, [pb_128]
vbroadcasti128 m5, [r3] ; m5 = offsetEo
- shr r4d, 5
inc r1
+ movq xm6, [r0 + r4]
+ movhps xm6, [r1 + r4]
.loop:
movu m1, [r0] ; m1 = rec[x]
@@ -511,7 +521,7 @@
movu m3, [r2] ; m3 = buff1
paddb m3, m2
- paddb m3, m6 ; m3 = edgeType
+ paddb m3, [pb_2] ; m3 = edgeType
pshufb m4, m5, m3
@@ -534,8 +544,11 @@
add r0, 32
add r1, 32
add r2, 32
- dec r4d
- jnz .loop
+ sub r4, 32
+ jg .loop
+
+ movq [r0 + r4], xm6
+ movhps [r1 + r4], xm6
RET
;=======================================================================================================
diff -r deea3a029318 -r cfc321e81396 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Fri Apr 24 13:47:58 2015 +0530
+++ b/source/encoder/sao.cpp Fri Apr 24 14:10:55 2015 +0530
@@ -437,23 +437,8 @@
for (y = startY; y < endY; y++)
{
int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
- pixel firstPxl = rec[0]; // copy first Pxl
- pixel lastPxl = rec[ctuWidth - 1];
- int8_t one = upBufft[1];
- int8_t two = upBufft[endX + 1];
- primitives.saoCuOrgE2[ctuWidth > 16](rec, upBufft, upBuff1, m_offsetEo, ctuWidth, stride);
- if (startX)
- {
- rec[0] = firstPxl;
- upBufft[1] = one;
- }
-
- if (rpelx == picWidth)
- {
- rec[ctuWidth - 1] = lastPxl;
- upBufft[endX + 1] = two;
- }
+ primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, m_offsetEo, endX - startX, stride);
upBufft[startX] = iSignDown2;
diff -r deea3a029318 -r cfc321e81396 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Fri Apr 24 13:47:58 2015 +0530
+++ b/source/test/pixelharness.cpp Fri Apr 24 14:10:55 2015 +0530
@@ -966,7 +966,7 @@
{
for (int i = 0; i < ITERS; i++)
{
- int width = 16 * (1 << (id * (rand() % 2 + 1)));
+ int width = 16 * (1 << (id * (rand() % 2 + 1))) - (rand() % 2);
int stride = width + 1;
ref[width > 16](ref_dest, psbuf1 + j, psbuf2 + j, psbuf3 + j, width, stride);
More information about the x265-devel mailing list