[x265] [PATCH] asm: saoCuOrgE1 avx2 code: 403c->331c
Divya Manivannan
divya at multicorewareinc.com
Wed Apr 8 11:32:37 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1428485346 -19800
# Wed Apr 08 14:59:06 2015 +0530
# Node ID 125a429989d35695971e1a1a8cff20c0b2649074
# Parent 7044924d68147152533fe1502df2c75a3512befb
asm: saoCuOrgE1 avx2 code: 403c->331c
diff -r 7044924d6814 -r 125a429989d3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Apr 07 17:11:42 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 08 14:59:06 2015 +0530
@@ -1557,6 +1557,7 @@
p.dst4x4 = x265_dst4_avx2;
p.scale2D_64to32 = x265_scale2D_64to32_avx2;
p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
+ p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r 7044924d6814 -r 125a429989d3 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Apr 07 17:11:42 2015 +0530
+++ b/source/common/x86/loopfilter.asm Wed Apr 08 14:59:06 2015 +0530
@@ -233,6 +233,49 @@
jnz .loop
RET
+;========================================================================================================
+; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, intptr_t iStride, int iLcuWidth)
+;
+; AVX2 version. Handles one row, 16 byte-sized pixels per iteration. For each x:
+;     sign           = sign(pRec[x] - pRec[x + iStride])        ; -1, 0 or +1
+;     edgeType       = sign + m_iUpBuff1[x] + 2
+;     m_iUpBuff1[x]  = -sign
+;     pRec[x]        = saturate_u8(pRec[x] + m_iOffsetEo[edgeType])
+;
+; Registers: r0 = pRec, r1 = m_iUpBuff1, r2 = m_iOffsetEo, r3 = iStride, r4d = iLcuWidth / 16
+;
+; Notes:
+;   * iStride is intptr_t, so it is taken as an auto-loaded argument (native pointer width)
+;     rather than through a 32-bit "mov r3d, r3m", which would zero-extend it on x86-64.
+;   * The loop executes iLcuWidth >> 4 times using dec/jnz, so iLcuWidth must be >= 16;
+;     pixels past the last full group of 16 are not touched.
+;   * 16 bytes are read from m_iOffsetEo; the table lookup assumes m_iUpBuff1[x] is in
+;     [-1, 1] so that edgeType stays in [0, 4] -- NOTE(review): confirm against the caller.
+;========================================================================================================
+INIT_YMM avx2
+cglobal saoCuOrgE1, 4, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
+    mov         r4d,    r4m                     ; r4d = iLcuWidth
+    movu        xm0,    [r2]                    ; xm0 = m_iOffsetEo (pshufb lookup table)
+    mova        xm6,    [pb_2]                  ; xm6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+    mova        xm7,    [pb_128]                ; bias for unsigned compare (presumably 16 x 0x80)
+    shr         r4d,    4                       ; r4d = number of 16-pixel groups
+.loop:
+    movu        xm1,    [r0]                    ; xm1 = pRec[x]
+    movu        xm2,    [r0 + r3]               ; xm2 = pRec[x + iStride]
+
+    ; pcmpgtb is a signed compare; flipping the top bit of both inputs makes it
+    ; order the original unsigned pixel values correctly.
+    pxor        xm3,    xm1,    xm7
+    pxor        xm4,    xm2,    xm7
+    pcmpgtb     xm2,    xm3,    xm4             ; 0xFF where pRec[x] >  pRec[x + iStride]
+    pcmpgtb     xm4,    xm3                     ; 0xFF where pRec[x] <  pRec[x + iStride]
+    pand        xm2,    [pb_1]                  ; "greater" mask -> +1 (pb_1 presumably 16 x 1)
+    por         xm2,    xm4                     ; xm2 = sign: +1, 0 or -1 (0xFF)
+
+    movu        xm3,    [r1]                    ; xm3 = m_iUpBuff1
+
+    paddb       xm3,    xm2
+    paddb       xm3,    xm6                     ; xm3 = edgeType = sign + m_iUpBuff1[x] + 2
+
+    pshufb      xm5,    xm0,    xm3             ; xm5 = m_iOffsetEo[edgeType]
+    pxor        xm4,    xm4
+    psubb       xm3,    xm4,    xm2             ; xm3 = -sign
+    movu        [r1],   xm3                     ; m_iUpBuff1[x] = -sign
+
+    pmovzxbw    m2,     xm1                     ; pixels  -> 16 x u16
+    pmovsxbw    m3,     xm5                     ; offsets -> 16 x s16
+
+    paddw       m2,     m3                      ; pRec[x] + offset in 16-bit precision
+    vextracti128 xm3,   m2,     1               ; xm3 = upper 8 words
+    packuswb    xm2,    xm3                     ; saturate to [0, 255] and narrow back to bytes
+    movu        [r0],   xm2
+
+    add         r0,     16
+    add         r1,     16
+    dec         r4d
+    jnz         .loop
+    RET
+
;========================================================================================================
; void saoCuOrgE1_2Rows(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth)
;========================================================================================================
diff -r 7044924d6814 -r 125a429989d3 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Apr 07 17:11:42 2015 +0530
+++ b/source/common/x86/loopfilter.h Wed Apr 08 14:59:06 2015 +0530
@@ -28,6 +28,7 @@
void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
void x265_saoCuOrgE0_avx2(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
+void x265_saoCuOrgE1_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
More information about the x265-devel
mailing list