[x265] [PATCH] asm: saoCuOrgE3 avx2 code for width>16: improve 508c->427c
Divya Manivannan
divya at multicorewareinc.com
Wed Apr 22 09:01:09 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429685709 -19800
# Wed Apr 22 12:25:09 2015 +0530
# Node ID 4c123bec3fe04af65e3c875ff11b27f1f333f9be
# Parent 584211b333ac9640d81423b3f60a18956425e27c
asm: saoCuOrgE3 avx2 code for width>16: improve 508c->427c
diff -r 584211b333ac -r 4c123bec3fe0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 22 11:59:36 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 22 12:25:09 2015 +0530
@@ -1729,6 +1729,7 @@
p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
+ p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
diff -r 584211b333ac -r 4c123bec3fe0 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Apr 22 11:59:36 2015 +0530
+++ b/source/common/x86/loopfilter.asm Wed Apr 22 12:25:09 2015 +0530
@@ -582,6 +582,72 @@
movhps [r1 + r5 - 1], xm7
RET
+INIT_YMM avx2 ; the mN registers below are 256-bit YMM; xmN names their low 128 bits
+cglobal saoCuOrgE3_32, 3, 6, 8 ; SAO edge-offset E3 pass over one row, 32 bytes per iteration; r0 = rec, r1 = upBuff1, r2 = m_offsetEo (per the prototype in loopfilter.h)
+ mov r3d, r3m ; r3 = stride -- NOTE(review): 32-bit load of an argument declared intptr_t; confirm callers never pass a negative/large stride
+ mov r4d, r4m ; r4 = startX
+ mov r5d, r5m ; r5 = endX
+
+ ; back up the bytes at the end of the row that the 32-byte stores in the loop may overwrite; they are written back after the loop
+ movq xm7, [r0 + r5] ; xm7[63:0] = 8 bytes at rec + endX
+ movhps xm7, [r1 + r5 - 1] ; xm7[127:64] = 8 bytes at upBuff1 + endX - 1
+
+ ; move to startX+1
+ inc r4d ; r4 = startX + 1
+ add r0, r4 ; r0 = &rec[startX + 1]
+ add r1, r4 ; r1 = &upBuff1[startX + 1]
+ sub r5d, r4d ; r5 = endX - (startX + 1) = number of bytes still to process
+ pxor m0, m0 ; m0 = 0
+ mova m6, [pb_2] ; m6 = 2 in every byte lane (pb_2 is defined outside this hunk)
+ vbroadcasti128 m5, [r2] ; m5 = 16 bytes of m_iOffsetEo replicated into both 128-bit lanes (pshufb looks up per lane)
+
+.loop:
+ movu m1, [r0] ; m1 = pRec[x]
+ movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
+
+ psubusb m3, m2, m1 ; m3 = saturate(m2 - m1): 0 where pRec[x + iStride] <= pRec[x]
+ psubusb m4, m1, m2 ; m4 = saturate(m1 - m2): 0 where pRec[x] <= pRec[x + iStride]
+ pcmpeqb m3, m0 ; m3 = 0xFF where pRec[x] >= pRec[x + iStride]
+ pcmpeqb m4, m0 ; m4 = 0xFF where pRec[x] <= pRec[x + iStride]
+ pcmpeqb m2, m1 ; m2 = 0xFF where the two pixels are equal
+
+ pabsb m3, m3 ; 0xFF (-1) -> 1, so m3 = 1 where pRec[x] >= pRec[x + iStride]
+ por m4, m3 ; m4 = +1 (greater), 0xFF = -1 (less); equal lanes hold 0xFF until masked below
+ pandn m2, m4 ; m2 = iSignDown = sign(pRec[x] - pRec[x + iStride]); equal lanes are cleared to 0
+
+ movu m3, [r1] ; m3 = m_iUpBuff1
+
+ paddb m3, m2 ; m3 = m_iUpBuff1[x] + iSignDown
+ paddb m3, m6 ; m3 = uiEdgeType
+
+ pshufb m4, m5, m3 ; m4 = m_iOffsetEo[uiEdgeType] for each byte (table lookup)
+
+ psubb m3, m0, m2 ; m3 = -iSignDown
+ movu [r1 - 1], m3 ; m_iUpBuff1[x - 1] = -iSignDown (one byte to the left of the position just read)
+
+ pmovzxbw m2, xm1 ; m2 = pixels 0..15 zero-extended to 16-bit
+ vextracti128 xm1, m1, 1 ; xm1 = pixels 16..31
+ pmovzxbw m1, xm1 ; m1 = pixels 16..31 zero-extended to 16-bit
+ pmovsxbw m3, xm4 ; m3 = offsets 0..15 sign-extended to 16-bit
+ vextracti128 xm4, m4, 1 ; xm4 = offsets 16..31
+ pmovsxbw m4, xm4 ; m4 = offsets 16..31 sign-extended to 16-bit
+
+ paddw m2, m3 ; pixels 0..15 + offsets, 16-bit so nothing wraps
+ paddw m1, m4 ; pixels 16..31 + offsets
+ packuswb m2, m1 ; saturate to 0..255; packs per 128-bit lane, so qwords come out as [m2 lo, m1 lo, m2 hi, m1 hi]
+ vpermq m2, m2, 11011000b ; qword order 0,2,1,3 -> bytes back in linear pixel order
+ movu [r0], m2 ; store 32 filtered pixels
+
+ add r0, 32 ; advance rec pointer by one 32-byte step
+ add r1, 32 ; advance upBuff1 pointer by the same amount
+ sub r5, 32 ; 64-bit subtract so r5 can go negative and still index correctly below
+ jg .loop ; continue while bytes remain (signed > 0)
+
+ ; write back the 8 + 8 bytes saved before the loop: r0 + r5 == rec + endX and r1 + r5 - 1 == upBuff1 + endX - 1 again. NOTE(review): this covers at most 8 bytes of overrun past endX -- confirm against callers
+ movq [r0 + r5], xm7 ; restore 8 bytes at rec + endX
+ movhps [r1 + r5 - 1], xm7 ; restore 8 bytes at upBuff1 + endX - 1
+ RET
+
;=====================================================================================
; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
;=====================================================================================
diff -r 584211b333ac -r 4c123bec3fe0 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Wed Apr 22 11:59:36 2015 +0530
+++ b/source/common/x86/loopfilter.h Wed Apr 22 12:25:09 2015 +0530
@@ -34,6 +34,7 @@
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
+void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
More information about the x265-devel
mailing list