[x265] [PATCH 3 of 6] asm: 10bpp AVX2 code for saoCuOrgE1_2Rows, improved 900c->614c over SSE
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Jun 25 10:23:51 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435213462 -19800
# Thu Jun 25 11:54:22 2015 +0530
# Node ID f43aa44673dcd8e96581c938cf22ad4bbb7657e3
# Parent 31da07b7198ca730bae37577d5053a3337477f7b
asm: 10bpp AVX2 code for saoCuOrgE1_2Rows, improved 900c->614c over SSE
diff -r 31da07b7198c -r f43aa44673dc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jun 25 11:49:07 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 11:54:22 2015 +0530
@@ -1286,6 +1286,7 @@
{
p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
+ p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
diff -r 31da07b7198c -r f43aa44673dc source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Thu Jun 25 11:49:07 2015 +0530
+++ b/source/common/x86/loopfilter.asm Thu Jun 25 11:54:22 2015 +0530
@@ -728,6 +728,62 @@
%endif
INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgE1_2Rows, 4,5,8 ; HIGH_BIT_DEPTH variant: filters 2 rows, 16 pixels each per iteration; r0=pRec, r1=m_iUpBuff1, r2=m_iOffsetEo, r3=iStride, r4m=iLcuWidth (names as in the non-HBD prototype)
+ add r3d, r3d ; stride: pixels -> bytes (16-bit pixels); NOTE(review): 32-bit op zero-extends, so this assumes a non-negative stride that fits in 32 bits
+ mov r4d, r4m ; r4d = iLcuWidth (5th argument, not auto-loaded by cglobal)
+ mova m4, [pw_1023] ; m4 = upper clamp bound for pminsw below (presumably words of 1023 = 10-bit max; constant defined elsewhere)
+ vbroadcasti128 m6, [r2] ; m6 = 16 bytes of m_iOffsetEo replicated in both 128-bit lanes (pshufb lookup table)
+ shr r4d, 4 ; loop count = width / 16; NOTE(review): no zero check, so width < 16 would make the dec below wrap
+.loop ; each pass handles 16 pixels of row 0 and row 1
+ movu m7, [r0] ; m7 = row 0 (16 words)
+ movu m5, [r0 + r3] ; m5 = row 1
+ movu m1, [r0 + r3 * 2] ; m1 = row 2 (only compared against, never written)

+ pcmpgtw m2, m7, m5 ; m2 = mask(row0 > row1)
+ pcmpgtw m3, m5, m7 ; m3 = mask(row0 < row1)
+ pcmpgtw m0, m5, m1 ; m0 = mask(row1 > row2)
+ pcmpgtw m1, m5 ; m1 = mask(row1 < row2)

+ packsswb m2, m0 ; narrow "greater" word masks to bytes (packs within each 128-bit lane)
+ packsswb m3, m1 ; narrow "less" word masks to bytes
+ vpermq m2, m2, 11011000b ; qword order 0,2,1,3: low lane = row0-vs-row1, high lane = row1-vs-row2
+ vpermq m3, m3, 11011000b ; same reordering for the "less" masks

+ pand m2, [pb_1] ; greater mask 0xFF -> value from pb_1 (presumably +1; constant defined elsewhere)
+ por m2, m3 ; less mask contributes 0xFF (-1); m2 = sign(current - below) per pixel

+ movu xm3, [r1] ; xm3 = 16 bytes of m_iUpBuff1, used as the "up" sign for row 0
+ pxor m0, m0 ; m0 = 0: negation source here, lower clamp bound later
+ psubb m1, m0, m2 ; m1 = -m2 (negated down-signs)
+ vinserti128 m3, m3, xm1, 1 ; high lane of m3 = negated row0/row1 sign, i.e. the "up" sign for row 1
+ vextracti128 [r1], m1, 1 ; store negated row1/row2 sign back into m_iUpBuff1

+ paddb m3, m2 ; up sign + down sign
+ paddb m3, [pb_2] ; add bias from pb_2 to form the table index (presumably +2; constant defined elsewhere)

+ pshufb m1, m6, m3 ; m1 = m_iOffsetEo[index] per pixel (byte lookup within each lane)
+ pmovsxbw m3, xm1 ; m3 = row 0 offsets sign-extended to words
+ vextracti128 xm1, m1, 1 ; bring the row 1 offsets down to the low lane
+ pmovsxbw m1, xm1 ; m1 = row 1 offsets sign-extended to words

+ paddw m7, m3 ; row 0 += offset
+ paddw m5, m1 ; row 1 += offset

+ pmaxsw m7, m0 ; clamp row 0 to >= 0
+ pmaxsw m5, m0 ; clamp row 1 to >= 0
+ pminsw m7, m4 ; clamp row 0 to <= m4
+ pminsw m5, m4 ; clamp row 1 to <= m4

+ movu [r0], m7 ; write back filtered row 0
+ movu [r0 + r3], m5 ; write back filtered row 1

+ add r0, 32 ; advance 16 pixels (32 bytes)
+ add r1, 16 ; advance 16 entries of m_iUpBuff1
+ dec r4d ; one 16-pixel group done
+ jnz .loop
+ RET
+%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
mov r3d, r3m
mov r4d, r4m
@@ -775,6 +831,7 @@
dec r4d
jnz .loop
RET
+%endif
;======================================================================================================================================================
; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
More information about the x265-devel
mailing list