[x265] [PATCH 1 of 6] asm: 10bpp AVX2 code for saoCuOrgE0, improved 974c->690c over SSE
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Jun 25 10:23:49 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435212794 -19800
# Thu Jun 25 11:43:14 2015 +0530
# Node ID faec09e1ab60531924f2d919d4f283fa91bfec81
# Parent b1af4c36f48a4500a4912373ebcda9a5540b5c15
asm: 10bpp AVX2 code for saoCuOrgE0, improved 974c->690c over SSE
diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 11:43:14 2015 +0530
@@ -1284,6 +1284,8 @@
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
+
p.cu[BLOCK_16x16].intra_pred[2] = PFX(intra_pred_ang16_2_avx2);
p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx2);
diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/const-a.asm Thu Jun 25 11:43:14 2015 +0530
@@ -41,7 +41,7 @@
const pb_16, times 32 db 16
const pb_32, times 32 db 32
const pb_64, times 32 db 64
-const pb_128, times 16 db 128
+const pb_128, times 32 db 128
const pb_a1, times 16 db 0xa1
const pb_01, times 8 db 0, 1
diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/loopfilter.asm Thu Jun 25 11:43:14 2015 +0530
@@ -235,6 +235,67 @@
%endif
INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgE0, 4,4,9 ; 10bpp AVX2: adds offsetEo[edgeType] to two rows of 16-bit pixels; args (names per the non-HIGH_BIT_DEPTH cglobal): r0=rec, r1=offsetEo, r2=lcuWidth, r3=signLeft, r4m=stride
+ vbroadcasti128 m6, [r1] ; m6 = 16 bytes at offsetEo copied into both 128-bit lanes (pshufb lookup table)
+ movzx r1d, byte [r3] ; r1 is free to reuse now that the table is loaded; fetch signLeft[0]
+ neg r1b ; low byte = -signLeft[0]; it is negated back by psignb inside the loop
+ movd xm0, r1d ; xm0 byte 0 = -signLeft[0], remaining bytes zero
+ movzx r1d, byte [r3 + 1] ; fetch signLeft[1] (used for the second row)
+ neg r1b ; low byte = -signLeft[1]
+ movd xm1, r1d ; xm1 byte 0 = -signLeft[1], remaining bytes zero
+ vinserti128 m0, m0, xm1, 1 ; m0 = carry-in: byte 0 of low lane for row 0, byte 0 of high lane for row 1
+ mova m5, [pw_1023] ; m5 = upper clamp bound (constant defined elsewhere; presumably 1023 in every word - confirm)
+ mov r1, r4m ; r1 = stride argument; NOTE(review): r4m may expand to a 32-bit register on some ABIs - confirm operand sizes match
+ add r1d, r1d ; r1 = 2 * stride = byte distance between rows (pixels are words); 32-bit op zero-extends into r1
+ shr r2d, 4 ; r2 = lcuWidth / 16 = loop count; NOTE(review): a width below 16 gives 0 and dec/jnz would wrap - assumes callers pass >= 16
+
+.loop: ; each iteration handles 16 pixels from each of two rows: [r0] and [r0 + r1]
+ movu m7, [r0] ; m7 = row 0, current 16 pixels
+ movu m8, [r0 + r1] ; m8 = row 1, current 16 pixels
+ movu m2, [r0 + 2] ; m2 = row 0 right neighbours (same data shifted by one pixel)
+ movu m1, [r0 + r1 + 2] ; m1 = row 1 right neighbours
+
+ pcmpgtw m3, m7, m2 ; m3 = word mask: row 0 current > right
+ pcmpgtw m2, m7 ; m2 = word mask: row 0 right > current
+ pcmpgtw m4, m8, m1 ; m4 = word mask: row 1 current > right
+ pcmpgtw m1, m8 ; m1 = word mask: row 1 right > current
+
+ packsswb m3, m4 ; narrow masks to bytes; each lane now holds 8 bytes of row 0 then 8 bytes of row 1
+ packsswb m2, m1 ; same narrowing for the "right > current" masks
+ vpermq m3, m3, 11011000b ; qword order 0,2,1,3: low lane = all 16 row 0 bytes, high lane = all 16 row 1 bytes
+ vpermq m2, m2, 11011000b ; same reordering for the "right > current" masks
+
+ pand m3, [pb_1] ; turn the 0xFF "current > right" mask into a small positive value (presumably +1; pb_1 defined elsewhere)
+ por m3, m2 ; OR in 0xFF (-1) where right > current; m3 = signRight = sign(current - right)
+
+ pslldq m2, m3, 1 ; per lane: m2[i] = signRight[i - 1], byte 0 cleared
+ por m2, m0 ; byte 0 of each lane = carry-in from m0; m2 now holds -signLeft for every pixel
+
+ psignb m2, [pb_128] ; m2 = signLeft (every pb_128 byte is negative, so psignb negates each byte of m2)
+ pxor m0, m0 ; clear m0 before rebuilding the carry
+ palignr m0, m3, 15 ; per lane: byte 0 = signRight of the last pixel, rest zero; carry-in for the next iteration
+ paddb m3, m2 ; m3 = signRight + signLeft
+ paddb m3, [pb_2] ; m3 = uiEdgeType = signRight + signLeft + bias (presumably 2 per byte; pb_2 defined elsewhere)
+ pshufb m2, m6, m3 ; m2 = table lookup: offsetEo[uiEdgeType] for each byte
+ pmovsxbw m3, xm2 ; offsetEo for row 0 (low lane), sign-extended from bytes to words
+ vextracti128 xm2, m2, 1 ; take the high lane, which holds the row 1 offsets
+ pmovsxbw m2, xm2 ; offsetEo for row 1, sign-extended from bytes to words
+ pxor m4, m4 ; m4 = 0, lower clamp bound
+ paddw m7, m3 ; row 0 pixels + offsets
+ paddw m8, m2 ; row 1 pixels + offsets
+ pmaxsw m7, m4 ; clamp row 0 to >= 0
+ pmaxsw m8, m4 ; clamp row 1 to >= 0
+ pminsw m7, m5 ; clamp row 0 to <= m5
+ pminsw m8, m5 ; clamp row 1 to <= m5
+ movu [r0], m7 ; write row 0 back in place
+ movu [r0 + r1], m8 ; write row 1 back in place
+
+ add r0q, 32 ; advance by 16 pixels (32 bytes)
+ dec r2d ; one 16-pixel column group done
+ jnz .loop ; repeat until lcuWidth / 16 groups are processed
+ RET
+%else ; HIGH_BIT_DEPTH
cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
mov r4d, r4m
@@ -287,6 +348,7 @@
sub r2d, 16
jnz .loop
RET
+%endif
;==================================================================================================
; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth)
More information about the x265-devel
mailing list