[x265] [PATCH 5 of 5] asm: AVX2 version of saoCuStatsE3, (136881c -> 45126c)
Min Chen
chenm003 at 163.com
Wed Dec 9 23:27:13 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1449698989 21600
# Node ID 32eb64163b1d3c5d7dceb6bfedb84b61e160094e
# Parent a5f81208a7ba8043261c009582995c48a1c40f37
asm: AVX2 version of saoCuStatsE3, (136881c -> 45126c)
---
source/common/x86/asm-primitives.cpp | 1 +
source/common/x86/loopfilter.asm | 186 ++++++++++++++++++++++++++++++++++
2 files changed, 187 insertions(+), 0 deletions(-)
diff -r a5f81208a7ba -r 32eb64163b1d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:45 2015 -0600
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:49 2015 -0600
@@ -3639,6 +3639,7 @@
p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2);
p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2);
p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
+ p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
if (cpuMask & X265_CPU_BMI2)
{
diff -r a5f81208a7ba -r 32eb64163b1d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Dec 09 16:09:45 2015 -0600
+++ b/source/common/x86/loopfilter.asm Wed Dec 09 16:09:49 2015 -0600
@@ -3074,6 +3074,192 @@
mov r6d, [rsp + 5 * 2 + 4 * 4]
add [r1 + 4 * 4], r6d
RET
+
+
+INIT_YMM avx2  ; saoCuStatsE3 (AVX2): over an r4d-wide, r5d-high block, accumulate per edgeType 0..4 a pixel count and the sum of the 16-bit values at [r0], then add the totals to the arrays at r6m (stats) and r7m (count)
+cglobal saoCuStatsE3, 4,10,16 ; x86inc: 4 args loaded into r0-r3, 10 GPRs, 16 vector regs; accumulators: count[0-4] in xm6-xm10 (bytes), stats[0-4] in m11-m15 (dwords)
+ mov r4d, r4m  ; r4d = 5th argument: number of pixels processed per row (inner loop bound)
+ mov r5d, r5m  ; r5d = 6th argument: number of rows (outer loop bound)
+  ; register roles below: r0 = 16-bit input values, r1 = 8-bit pixels, r2 = byte stride for r1, r3 = upBuff1 (signed bytes)
+ ; clear internal temporary buffer
+ pxor xm6, xm6 ; count[0]
+ pxor xm7, xm7 ; count[1]
+ pxor xm8, xm8 ; count[2]
+ pxor xm9, xm9 ; count[3]
+ pxor xm10, xm10 ; count[4]
+ pxor xm11, xm11 ; stats[0]
+ pxor xm12, xm12 ; stats[1]
+ pxor xm13, xm13 ; stats[2]
+ pxor xm14, xm14 ; stats[3]
+ pxor xm15, xm15 ; stats[4]
+ mova m0, [pb_128]  ; bias constant XORed into pixels before the signed compares (presumably 0x80 per byte -- confirm against the pb_128 definition)
+
+ ; unavailable mask
+ lea r9, [pb_movemask_32 + 32]  ; r9 = base for the per-group lane mask, indexed with a negative offset in .loopW
+ push qword [r3 + r4]  ; save the 8 bytes at upBuff1[width..]; the 16-byte stores in .loopW may overwrite them, they are restored by the pop after the loops
+
+.loopH:
+ mov r6d, r4d  ; r6d = pixels remaining in this row
+
+.loopW:
+ movu m1, [r1]  ; current-row pixels (32 bytes loaded, only the low 16 are used below)
+ movu m2, [r1 + r2 - 1]  ; pixels one row down and one column to the left
+
+ ; signDown
+ ; stats[edgeType]
+ pxor xm1, xm0  ; apply the pb_128 bias so pcmpgtb (signed) can order the pixel bytes
+ pxor xm2, xm0
+ pcmpgtb xm3, xm1, xm2  ; 0xFF where current > neighbour
+ pand xm3, [pb_1]  ; keep only the pb_1 bits of that mask (presumably +1 per lane)
+ pcmpgtb xm2, xm1  ; 0xFF (-1) where neighbour > current
+ por xm2, xm3  ; xm2 = per-lane sign of (current - neighbour)
+ pxor xm3, xm3
+ psubb xm3, xm2  ; xm3 = -sign, stored below for use by the next row
+
+ ; edgeType
+ movu xm4, [r3]  ; sign bytes carried in upBuff1
+ paddb xm4, [pb_2]
+ paddb xm2, xm4  ; xm2 = sign + upBuff1[x] + 2
+
+ ; update upBuff1
+ movu [r3 - 1], xm3  ; written one byte to the left: lane x lands at upBuff1[x - 1]
+
+ ; m[1-4] free in here
+
+ ; get current process group mask
+ mov r7d, 16  ; r7d = lanes handled per iteration
+ mov r8d, r6d
+ cmp r6d, r7d
+ cmovge r8d, r7d  ; r8d = min(remaining, 16)
+ neg r8  ; 64-bit negate: r8 becomes a negative offset from r9
+ movu xm1, [r9 + r8]  ; 16 mask bytes from pb_movemask_32 + 32 - min(remaining, 16) (presumably 0x00 for valid lanes, 0xFF for lanes past the row end -- confirm against the table definition)
+
+ ; tmp_count[edgeType]++
+ ; tmp_stats[edgeType] += (fenc[x] - rec[x])
+ pxor xm3, xm3
+ por xm1, xm2 ; apply unavailable pixel mask
+ movu m4, [r0] ; up to 14bits
+  ; each group below: xm3 = 0xFF where edgeType == k; psubb adds 1 to count[k] per match; pmaddwd with the sign-extended mask (-1/0) adds the NEGATED pair sums of m4 to stats[k]
+ pcmpeqb xm3, xm1, xm3  ; k = 0 (compare against the zeroed xm3)
+ psubb xm6, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m11, m3
+
+ pcmpeqb xm3, xm1, [pb_1]  ; k = 1
+ psubb xm7, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m12, m3
+
+ pcmpeqb xm3, xm1, [pb_2]  ; k = 2
+ psubb xm8, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m13, m3
+
+ pcmpeqb xm3, xm1, [pb_3]  ; k = 3
+ psubb xm9, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m14, m3
+
+ pcmpeqb xm3, xm1, [pb_4]  ; k = 4
+ psubb xm10, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m15, m3
+
+ sub r6d, r7d  ; remaining -= 16
+ jle .next  ; row finished; the pointers are NOT advanced for the last group
+
+ add r0, 16*2  ; 16 words
+ add r1, 16
+ add r3, 16
+ jmp .loopW
+
+.next:
+ ; restore pointer upBuff1
+ mov r6d, r4d
+ and r6d, ~15
+ neg r6 ; MUST BE 64-bit, it is negative
+  ; NOTE(review): .loopW advanced 16 * (number of groups - 1) bytes, which equals (r4d & ~15) only when r4d is not a multiple of 16 -- confirm callers never pass such a width
+ ; move to next row
+
+ ; move back to start point
+ add r3, r6
+
+ ; adjust with stride
+ lea r0, [r0 + (r6 + 64) * 2] ; 64 = MAX_CU_SIZE
+ add r1, r2
+ add r1, r6
+
+ dec r5d
+ jg .loopH
+
+ ; restore unavailable pixels
+ pop qword [r3 + r4]  ; put back the 8 bytes saved by the push before .loopH
+
+ ; sum to global buffer
+ mov r1, r6m  ; r1 = 7th argument: destination of the stats totals
+ mov r0, r7m  ; r0 = 8th argument: destination of the count totals
+
+ ; sum into word
+ ; WARNING: there is an overflow bug in the Block64x64 case when ALL pixels have the SAME type (the HM algorithm never passes Block64x64 into here)
+ pxor xm0, xm0  ; xm0 = 0, also reused as the minuend for the psubd negations below
+ psadbw xm1, xm6, xm0  ; sum the 8 count bytes of each half into one value per qword
+ psadbw xm2, xm7, xm0
+ psadbw xm3, xm8, xm0
+ psadbw xm4, xm9, xm0
+ psadbw xm5, xm10, xm0
+ pshufd xm1, xm1, q3120  ; gather both partial sums into the low qword (dwords 0 and 1)
+ pshufd xm2, xm2, q3120
+ pshufd xm3, xm3, q3120
+ pshufd xm4, xm4, q3120
+
+ ; sum count[4] only
+ movhlps xm6, xm5  ; xm6 is free here: count[0] was already reduced into xm1
+ paddd xm5, xm6
+
+ ; sum count[s_eoTable]
+ ; s_eoTable = {1, 2, 0, 3, 4}
+ punpcklqdq xm3, xm1  ; xm3 = { count[2] pair, count[0] pair }
+ punpcklqdq xm2, xm4  ; xm2 = { count[1] pair, count[3] pair }
+ phaddd xm3, xm2  ; xm3 = { count[2], count[0], count[1], count[3] }
+ movu xm1, [r0]
+ paddd xm3, xm1
+ movu [r0], xm3  ; accumulate into the first four dwords at r0
+ movd r5d, xm5
+ add [r0 + 4 * 4], r5d  ; count[4] goes to the fifth dword
+
+ ; sum stats[s_eoTable]
+ vextracti128 xm1, m11, 1  ; fold the upper 128-bit lane onto the lower one
+ paddd xm1, xm11
+ vextracti128 xm2, m12, 1
+ paddd xm2, xm12
+ vextracti128 xm3, m13, 1
+ paddd xm3, xm13
+ vextracti128 xm4, m14, 1
+ paddd xm4, xm14
+ vextracti128 xm5, m15, 1
+ paddd xm5, xm15
+
+ ; s_eoTable = {1, 2, 0, 3, 4}
+ phaddd xm3, xm1  ; xm3 = { stats[2] pair sums, stats[0] pair sums }
+ phaddd xm2, xm4  ; xm2 = { stats[1] pair sums, stats[3] pair sums }
+ phaddd xm3, xm2  ; xm3 = { stats[2], stats[0], stats[1], stats[3] }
+ psubd xm3, xm0, xm3 ; negate to compensate for the -1 mask multiplier used with PMADDWD
+
+ ; sum stats[4] only
+ HADDD xm5, xm6  ; horizontal dword add helper macro (defined outside this hunk); xm6 is passed as its second operand
+ psubd xm5, xm0, xm5  ; same sign compensation for stats[4]
+
+ movu xm1, [r1]
+ paddd xm3, xm1
+ movu [r1], xm3  ; accumulate into the first four dwords at r1
+ movd r6d, xm5
+ add [r1 + 4 * 4], r6d  ; stats[4] goes to the fifth dword
+ RET
%endif ; ARCH_X86_64
More information about the x265-devel
mailing list