[x265] [PATCH] asm: saoCuOrgE3 avx2 code: 502c->473c
Divya Manivannan
divya at multicorewareinc.com
Wed Apr 15 11:33:20 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429088096 -19800
# Wed Apr 15 14:24:56 2015 +0530
# Node ID 55c9c34fb48998f21e96ce3a5767e49638ad092d
# Parent 737edf5ac0088867cbd9d6d0d85958d594ee6c05
asm: saoCuOrgE3 avx2 code: 502c->473c
diff -r 737edf5ac008 -r 55c9c34fb489 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 15 10:58:54 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 15 14:24:56 2015 +0530
@@ -1596,6 +1596,7 @@
p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
+ p.saoCuOrgE3 = x265_saoCuOrgE3_avx2;
p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
diff -r 737edf5ac008 -r 55c9c34fb489 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Apr 15 10:58:54 2015 +0530
+++ b/source/common/x86/loopfilter.asm Wed Apr 15 14:24:56 2015 +0530
@@ -520,6 +520,68 @@
movhps [r1 + r5 - 1], m7
RET
+INIT_YMM avx2
+cglobal saoCuOrgE3, 3, 6, 8 ; edge-offset filter, 16 pixels per pass: rec[x] += m_offsetEo[sign(rec[x] - rec[x + stride]) + upBuff1[x] + 2]; r0 = rec, r1 = upBuff1, r2 = m_offsetEo (argument order of the C prototype)
+ mov r3d, r3m ; r3d = stride; a 32-bit load zero-extends on x86-64 - NOTE(review): prototype declares intptr_t, confirm stride is never negative
+ mov r4d, r4m ; r4d = startX
+ mov r5d, r5m ; r5d = endX
+
+ ; save 8 bytes each from rec + endX and upBuff1 + endX - 1; they are written back after the loop (original note: needed for startX=1 or left_endX=15)
+ movq xm7, [r0 + r5] ; low half of xm7 = 8 bytes at rec + endX
+ movhps xm7, [r1 + r5 - 1] ; high half of xm7 = 8 bytes at upBuff1 + endX - 1
+
+ ; advance both pointers to startX + 1 and turn r5 into the remaining byte count
+ inc r4d ; r4 = startX + 1
+ add r0, r4 ; r0 = rec + startX + 1
+ add r1, r4 ; r1 = upBuff1 + startX + 1
+ sub r5d, r4d ; r5 = endX - (startX + 1)
+ pxor xm0, xm0 ; xm0 = 0 (compare operand and source for the negation below)
+ mova xm6, [pb_2] ; xm6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+ movu xm5, [r2] ; xm5 = 16 bytes of m_offsetEo, used as the pshufb lookup table
+
+.loop:
+ movu xm1, [r0] ; xm1 = a = 16 pixels at rec[x]
+ movu xm2, [r0 + r3] ; xm2 = b = 16 pixels at rec[x + stride]
+
+ psubusb xm3, xm2, xm1 ; xm3 = saturating b - a (0 where b <= a)
+ psubusb xm4, xm1, xm2 ; xm4 = saturating a - b (0 where a <= b)
+ pcmpeqb xm3, xm0 ; xm3 = 0xFF where a >= b
+ pcmpeqb xm4, xm0 ; xm4 = 0xFF where a <= b
+ pcmpeqb xm2, xm1 ; xm2 = 0xFF where a == b
+
+ pabsb xm3, xm3 ; xm3 = 1 where a >= b (0xFF is -1, abs gives 1)
+ por xm4, xm3 ; xm4 = 1 where a > b, 0xFF (-1) where a <= b
+ pandn xm2, xm4 ; xm2 = iSignDown = sign(a - b): the a == b lanes are cleared to 0
+
+ movu xm3, [r1] ; xm3 = 16 bytes at upBuff1[x]
+
+ paddb xm3, xm2 ; xm3 = upBuff1[x] + iSignDown
+ paddb xm3, xm6 ; xm3 = uiEdgeType = upBuff1[x] + iSignDown + 2
+
+ pshufb xm4, xm5, xm3 ; xm4 = per-lane lookup m_offsetEo[uiEdgeType]
+
+ psubb xm3, xm0, xm2 ; xm3 = -iSignDown
+ movu [r1 - 1], xm3 ; upBuff1[x - 1] = -iSignDown (16 bytes, one position to the left)
+
+ pmovzxbw m2, xm1 ; m2 = the 16 pixels zero-extended to words
+ pmovsxbw m3, xm4 ; m3 = the 16 offsets sign-extended to words
+
+ paddw m2, m3 ; m2 = pixel + offset in 16-bit precision
+ vextracti128 xm3, m2, 1 ; xm3 = upper 8 words of m2
+ packuswb xm2, xm3 ; xm2 = 16 results packed to bytes with unsigned saturation
+ movu [r0], xm2 ; rec[x] = filtered pixels
+
+ add r0, 16 ; next 16 pixels
+ add r1, 16 ; next 16 upBuff1 entries
+
+ sub r5, 16 ; 16 fewer bytes remaining; r5 is <= 0 once the loop exits
+ jg .loop ; tested after the body, so the loop always runs at least once
+
+ ; r0 + r5 and r1 + r5 - 1 are again rec + endX and upBuff1 + endX - 1: write back the bytes saved in xm7 (original note: restore last pixels, up to 2)
+ movq [r0 + r5], xm7 ; rec + endX gets its 8 saved bytes back
+ movhps [r1 + r5 - 1], xm7 ; upBuff1 + endX - 1 gets its 8 saved bytes back
+ RET
+
;=====================================================================================
; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
;=====================================================================================
diff -r 737edf5ac008 -r 55c9c34fb489 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Wed Apr 15 10:58:54 2015 +0530
+++ b/source/common/x86/loopfilter.h Wed Apr 15 14:24:56 2015 +0530
@@ -33,6 +33,7 @@
void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
+void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
More information about the x265-devel
mailing list