[x265] [PATCH] asm: saoCuOrgE1_2Rows avx2 code: 657c->525c
Divya Manivannan
divya at multicorewareinc.com
Wed Apr 8 11:45:51 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1428485849 -19800
# Wed Apr 08 15:07:29 2015 +0530
# Node ID 5d215b2e0a64e748c2d4f2e1cb54ece6c0eb257d
# Parent 125a429989d35695971e1a1a8cff20c0b2649074
asm: saoCuOrgE1_2Rows avx2 code: 657c->525c
diff -r 125a429989d3 -r 5d215b2e0a64 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 08 14:59:06 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 08 15:07:29 2015 +0530
@@ -1558,6 +1558,7 @@
p.scale2D_64to32 = x265_scale2D_64to32_avx2;
p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
+ p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r 125a429989d3 -r 5d215b2e0a64 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Apr 08 14:59:06 2015 +0530
+++ b/source/common/x86/loopfilter.asm Wed Apr 08 15:07:29 2015 +0530
@@ -351,6 +351,55 @@
jnz .loop
RET
+INIT_YMM avx2 ; C prototype (see loopfilter.h hunk): void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
+cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth ; r0 = pRec, r1 = m_iUpBuff1, r2 = m_iOffsetEo; rewrites two rows of pRec and the m_iUpBuff1 entries in place (uses r0-r4 and m0-m6)
+ mov r3d, r3m ; r3d = iStride; NOTE(review): 32-bit load although the prototype declares intptr_t stride - confirm strides are non-negative and fit in 32 bits
+ mov r4d, r4m ; r4d = iLcuWidth
+ pxor m0, m0 ; m0 = 0 (used below to negate the sign bytes)
+ vbroadcasti128 m5, [pb_128] ; m5 = pb_128 in both lanes; constant defined outside this hunk, presumably 0x80 bytes so the XOR below makes unsigned pixels comparable with signed pcmpgtb - TODO confirm
+ vbroadcasti128 m6, [r2] ; m6 = 16 bytes of m_iOffsetEo copied to both lanes (pshufb lookup table); NOTE(review): reads 16 bytes - confirm the table is at least that large
+ shr r4d, 4 ; r4d = iLcuWidth / 16 = loop count; a remainder below 16 pixels is not processed here
+.loop ; one pass handles 16 bytes of two consecutive rows; NOTE(review): the body runs before the count is tested, so iLcuWidth < 16 (count 0) would wrap on dec r4d - confirm callers pass >= 16
+ movu xm1, [r0] ; xm1 = row 0: pRec[x .. x + 15]
+ movu xm2, [r0 + r3] ; xm2 = row 1: pRec[x + iStride .. x + iStride + 15]
+ vinserti128 m1, m1, xm2, 1 ; m1 = { low lane: row 0, high lane: row 1 } = current pixels
+ vinserti128 m2, m2, [r0 + r3 * 2], 1 ; m2 = { low lane: row 1, high lane: row 2 } = pixels one row below
+; per byte: m2 = sign(current - below) as 1 / 0 / -1 (assuming pb_128 = 0x80 bytes and pb_1 = 0x01 bytes - TODO confirm)
+ pxor m3, m1, m5 ; m3 = current pixels ^ pb_128
+ pxor m4, m2, m5 ; m4 = pixels below ^ pb_128
+ pcmpgtb m2, m3, m4 ; m2 = 0xFF where m3 > m4 (signed byte compare)
+ pcmpgtb m4, m3 ; m4 = 0xFF where m4 > m3
+ pand m2, [pb_1] ; reduce the "greater" mask to the pb_1 bits
+ por m2, m4 ; merge the "less" mask (0xFF = -1) into the same bytes
+; lookup index = sign against the row above + sign against the row below + pb_2, then fetch the offset from the m_iOffsetEo table
+ movu xm3, [r1] ; xm3 = 16 bytes of m_iUpBuff1 (presumably row 0's sign against the row above it - verify against caller)
+ psubb m4, m0, m2 ; m4 = 0 - m2 = negated signs, i.e. each lower row compared against the row above it
+ vinserti128 m3, m3, xm4, 1 ; m3 = { low lane: m_iUpBuff1, high lane: row 1's sign against row 0 }
+ paddb m3, m2 ; add the sign against the row below
+ paddb m3, [pb_2] ; add pb_2 (presumably 2 per byte, giving a non-negative table index - TODO confirm)
+ pshufb m2, m6, m3 ; m2 = table byte selected by each index byte (lookup is per 128-bit lane)
+ vextracti128 [r1], m4, 1 ; write the high lane of m4 (row 2's sign against row 1) back to m_iUpBuff1
+; widen pixels (zero-extend) and offsets (sign-extend) to 16-bit words so the addition cannot wrap
+ pmovzxbw m4, xm1 ; m4 = row 0 pixels as 16 words
+ vextracti128 xm3, m1, 1 ; xm3 = row 1 pixels
+ pmovzxbw m3, xm3 ; m3 = row 1 pixels as 16 words
+ pmovsxbw m1, xm2 ; m1 = row 0 offsets as 16 signed words
+ vextracti128 xm2, m2, 1 ; xm2 = row 1 offsets
+ pmovsxbw m2, xm2 ; m2 = row 1 offsets as 16 signed words
+; add the offsets, saturate back to unsigned bytes, undo the lane interleave and store both rows
+ paddw m4, m1 ; row 0 pixels + offsets
+ paddw m3, m2 ; row 1 pixels + offsets
+ packuswb m4, m3 ; saturate words to 0..255; packing is per 128-bit lane, so the two rows come out interleaved in 8-byte groups
+ vpermq m4, m4, 11011000b ; reorder qwords to 0,2,1,3: low lane = row 0, high lane = row 1
+ movu [r0], xm4 ; store the 16 updated bytes of row 0
+ vextracti128 [r0 + r3], m4, 1 ; store the 16 updated bytes of row 1
+; step to the next 16-byte group
+ add r0, 16 ; advance 16 bytes along the pixel rows
+ add r1, 16 ; advance 16 bytes in m_iUpBuff1
+ dec r4d ; one 16-byte group done
+ jnz .loop ; repeat until the count reaches zero
+ RET
+
;======================================================================================================================================================
; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
;======================================================================================================================================================
diff -r 125a429989d3 -r 5d215b2e0a64 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Wed Apr 08 14:59:06 2015 +0530
+++ b/source/common/x86/loopfilter.h Wed Apr 08 15:07:29 2015 +0530
@@ -30,6 +30,7 @@
void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE1_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
+void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
More information about the x265-devel
mailing list