[x265] [PATCH] asm: saoCuOrgE0 avx2 code: 756c->629c
Divya Manivannan
divya at multicorewareinc.com
Thu Apr 2 12:51:40 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1427970584 -19800
# Thu Apr 02 15:59:44 2015 +0530
# Node ID 8f37dd7ec27deebbd44308df49c1ff5d040a3af3
# Parent 6f19f7a1ed620bdaef9ad4b63114da50d1dd5a15
asm: saoCuOrgE0 avx2 code: 756c->629c
diff -r 6f19f7a1ed62 -r 8f37dd7ec27d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 02 10:42:41 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Apr 02 15:59:44 2015 +0530
@@ -1448,6 +1448,7 @@
if (cpuMask & X265_CPU_AVX2)
{
p.scale2D_64to32 = x265_scale2D_64to32_avx2;
+ p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r 6f19f7a1ed62 -r 8f37dd7ec27d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Thu Apr 02 10:42:41 2015 +0530
+++ b/source/common/x86/loopfilter.asm Thu Apr 02 15:59:44 2015 +0530
@@ -128,6 +128,60 @@
jnz .loopH
RET
+INIT_YMM avx2 ; AVX2 variant: each loop pass filters 16 pixels of two rows (rec and rec + stride), one row per 128-bit lane
+cglobal saoCuOrgE0, 5, 6, 7, rec, offsetEo, lcuWidth, signLeft, stride ; r0 = rec, r1 = offsetEo, r2 = lcuWidth, r3 = signLeft, r4 = stride, r5 = scratch
+
+ mov r4d, r4m ; r4d = stride re-read as 32 bits; NOTE(review): cglobal already asks for 5 loaded args, so this looks redundant and keeps only the low 32 bits of the intptr_t stride - confirm
+ movzx r5d, byte [r3] ; r5d = signLeft[0], zero-extended
+ neg r5b ; negate the low byte only; psignb in the loop flips it back
+ movd xm0, r5d ; xm0 byte 0 = -signLeft[0], remaining bytes 0
+ movzx r5d, byte [r3 + 1] ; r5d = signLeft[1] (second row)
+ neg r5b ; same pre-negation for the second row
+ movd xm1, r5d ; xm1 byte 0 = -signLeft[1], remaining bytes 0
+ vinserti128 m0, m0, xm1, 1 ; m0 = per-lane carry-in: low lane for first row, high lane for second row
+ vbroadcasti128 m4, [pb_128] ; m4 = pb_128 in both lanes (presumably bytes of 0x80 - defined outside this hunk)
+ vbroadcasti128 m6, [r1] ; m6 = 16 bytes at offsetEo in both lanes, used as pshufb table; NOTE(review): reads 16 bytes - confirm the buffer is that large
+
+.loop:
+ movu xm5, [r0] ; xm5 = rec[x .. x+15], first row
+ movu xm2, [r0 + 1] ; xm2 = rec[x+1 .. x+16], first row (right neighbours)
+ vinserti128 m5, m5, [r0 + r4], 1 ; high lane of m5 = same 16 pixels of the row at rec + stride
+ vinserti128 m2, m2, [r0 + r4 + 1], 1 ; high lane of m2 = right neighbours of that second row
+
+ pxor m1, m5, m4 ; m1 = rec[x] ^ pb_128: bias so the signed pcmpgtb orders unsigned pixels
+ pxor m3, m2, m4 ; m3 = rec[x+1] ^ pb_128
+ pcmpgtb m2, m1, m3 ; m2 = 0xFF where rec[x] > rec[x+1]
+ pcmpgtb m3, m1 ; m3 = 0xFF (-1) where rec[x+1] > rec[x]
+ pand m2, [pb_1] ; m2 = +1 where rec[x] > rec[x+1] (pb_1 presumably bytes of 1)
+ por m2, m3 ; m2 = signRight = sign(rec[x] - rec[x+1]) in {-1, 0, +1}
+
+ pslldq m3, m2, 1 ; m3 = signRight moved up one byte within each lane; byte 0 becomes 0
+ por m3, m0 ; fill byte 0 of each lane with the carry-in held in m0
+
+ psignb m3, m4 ; m3 = signLeft: negated per byte, given the bytes of m4 are negative (0x80)
+ pxor m0, m0 ; clear m0 before rebuilding the carry
+ palignr m0, m2, 15 ; m0 byte 0 of each lane = signRight of pixel 15: carry-in for the next 16 pixels
+ paddb m2, m3 ; m2 = signRight + signLeft
+ paddb m2, [pb_2] ; m2 = uiEdgeType = signRight + signLeft + 2 (pb_2 presumably bytes of 2)
+ pshufb m3, m6, m2 ; m3 = offsetEo[uiEdgeType] for every pixel (byte table lookup)
+ pmovzxbw m2, xm5 ; m2 = first-row rec widened to 16 unsigned words
+ vextracti128 xm5, m5, 1 ; xm5 = second-row rec bytes (high lane)
+ pmovzxbw m5, xm5 ; m5 = second-row rec widened to 16 unsigned words
+ pmovsxbw m1, xm3 ; m1 = first-row offsets sign-extended to words
+ vextracti128 xm3, m3, 1 ; xm3 = second-row offsets (high lane)
+ pmovsxbw m3, xm3 ; m3 = second-row offsets sign-extended to words
+ paddw m2, m1 ; first row: rec + offset
+ paddw m5, m3 ; second row: rec + offset
+ packuswb m2, m5 ; saturate to unsigned bytes; packs per lane, so row halves come out interleaved
+ vpermq m2, m2, 11011000b ; reorder qwords 0,2,1,3: low lane = first row, high lane = second row
+ movu [r0], xm2 ; store 16 filtered pixels of the first row
+ vextracti128 [r0 + r4], m2, 1 ; store 16 filtered pixels of the second row
+
+ add r0q, 16 ; advance rec by 16 pixels
+ sub r2d, 16 ; 16 fewer columns remaining
+ jnz .loop ; NOTE(review): exits only when the count reaches exactly 0 - assumes lcuWidth is a positive multiple of 16, confirm with caller
+ RET
+
;==================================================================================================
; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth)
;==================================================================================================
diff -r 6f19f7a1ed62 -r 8f37dd7ec27d source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Thu Apr 02 10:42:41 2015 +0530
+++ b/source/common/x86/loopfilter.h Thu Apr 02 15:59:44 2015 +0530
@@ -26,6 +26,7 @@
#define X265_LOOPFILTER_H
void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
+void x265_saoCuOrgE0_avx2(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
More information about the x265-devel
mailing list