[x265] [PATCH] asm: high_bit_depth sse4 version of saoCuStatsE2 & saoCuStatsE3
Min Chen
chenm003 at 163.com
Fri Feb 19 06:27:52 CET 2016
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1455859663 -28800
# Node ID 5cdbd129c0d840669758a11597a52aa53f0fcbfa
# Parent c2228fb8151ddce111a75fb1c02b25eca5a68604
asm: high_bit_depth sse4 version of saoCuStatsE2 & saoCuStatsE3
diff -r c2228fb8151d -r 5cdbd129c0d8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Feb 19 09:50:42 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Feb 19 13:27:43 2016 +0800
@@ -1169,6 +1169,8 @@
#if X86_64
p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
+ p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
+ p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
#endif
}
if (cpuMask & X265_CPU_AVX)
diff -r c2228fb8151d -r 5cdbd129c0d8 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Fri Feb 19 09:50:42 2016 +0530
+++ b/source/common/x86/loopfilter.asm Fri Feb 19 13:27:43 2016 +0800
@@ -2872,6 +2872,129 @@
;}
%if ARCH_X86_64
+
+%if HIGH_BIT_DEPTH == 1
+;-----------------------------------------------------------------------------
+; saoCuStatsE2 -- HIGH_BIT_DEPTH build, SSE4, x86-64 only (needs r7 and r8).
+;
+; For each pixel of a (width x rows) region: compare the pixel with the one a
+; row below and a column to the right, form
+;     edgeType = upBuff1[x] + 2 + signDown
+; then do count[edgeType]++ and stats[edgeType] += (16-bit value at r0[x]).
+; Totals are gathered in a 32-byte stack scratch area and added to the
+; caller's tables at the end, reordered by s_eoTable = {1,2,0,3,4}.
+;
+; Arguments as they are used below (the C-level names are not visible in this
+; file section -- confirm against the primitive's prototype):
+;   r0  = signed 16-bit values summed into stats; row pitch is 64 elements
+;   r1  = pixels, SIZEOF_PIXEL bytes each
+;   r2  = pixel stride (passed through FIX_STRIDES)
+;   r3  = int8 sign buffer read for the current row
+;   r4  = int8 sign buffer written for the next row (swapped with r3 per row)
+;   r5m = pixels per row; must be > 0 because the inner loop counts to zero
+;   r6m = number of rows; decremented in place as a signed byte
+;   r7m = int32 stats[5], updated in place
+;   r8m = int32 count[5], updated in place
+;
+; Scratch layout: [rsp + 0]  five 16-bit counters
+;                 [rsp + 10] five 32-bit sums
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal saoCuStatsE2, 5,9,7,0-32        ; Stack: 5 of stats and 5 of count
+    mov         r5d, r5m                        ; 32-bit write also clears the upper half of r5
+    FIX_STRIDES r2d                             ; macro defined elsewhere; presumably pixel -> byte stride
+
+    ; clear internal temporary buffer
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m5, [pw_1]                      ; masks a compare result down to +1 per word
+    mova        m6, [pb_2]                      ; the "+ 2" bias of edgeType, per byte
+
+.loopH:
+    ; TODO: merge into SIMD in below
+    ; next-row buffer[0] = sign(pixel below - pixel to the left), unsigned compare
+    mov         r6w, [r1 + r2]
+    sub         r6w, [r1 - 1 * SIZEOF_PIXEL]
+    seta        r6b                             ; 1 when below > left
+    setb        r7b                             ; 1 when below < left
+    sub         r6b, r7b
+    mov         [r4], r6b
+
+    ; backup unavailable pixels: 8 bytes just past the part of the next-row
+    ; buffer that belongs to this row, because the 16-byte store in .loopW
+    ; can run over them
+    movh        m0, [r4 + r5 + 1]
+
+    mov         r6d, r5d                        ; r6d = pixels left in this row
+.loopW:
+    ; m3 = -signDown for pixels 0..7 (words):
+    ;   -1 where rec[x] > rec[x + stride + 1], +1 where it is smaller, else 0
+    movu        m1, [r1]
+    movu        m2, [r1 + r2 + 1 * SIZEOF_PIXEL]
+    pcmpgtw     m3, m1, m2
+    pcmpgtw     m2, m1
+    pand        m2, m5
+    por         m3, m2
+
+    ; the same for pixels 8..15, then narrow all 16 results to int8 in m3
+    movu        m1, [r1 + mmsize]
+    movu        m2, [r1 + r2 + 1 * SIZEOF_PIXEL + mmsize]
+    pcmpgtw     m4, m1, m2
+    pcmpgtw     m2, m1
+    pand        m2, m5
+    por         m4, m2
+    packsswb    m3, m4
+
+    ; edgeType = upBuff1[x] + 2 - (-signDown)
+    movu        m4, [r3]
+    paddb       m4, m6
+    psubb       m4, m3
+
+    ; next-row buffer[x + 1] = -signDown
+    movu        [r4 + 1], m3
+
+    ; 16 pixels, one at a time; leaves through .next as soon as the row is
+    ; used up, i.e. BEFORE the pointer updates that follow the %rep block
+%assign x 0
+%rep 16
+    pextrb      r7d, m4, x                      ; r7 = edgeType of pixel x
+    inc         word [rsp + r7 * 2]             ; count[edgeType]++
+
+    movsx       r8d, word [r0 + x * 2]
+    add         [rsp + 5 * 2 + r7 * 4], r8d     ; stats[edgeType] += value
+
+    dec         r6d
+    jz          .next
+%assign x x+1
+%endrep
+
+    add         r0, mmsize * 2
+    add         r1, mmsize * SIZEOF_PIXEL
+    add         r3, mmsize
+    add         r4, mmsize
+    jmp         .loopW
+
+.next:
+    xchg        r3, r4                          ; the buffer just written feeds the next row
+
+    ; r6 = -(pixels the pointers were advanced by in .loopW).
+    ; The last group of a row, full or partial, always exits through jz
+    ; before the pointer updates, so the advance is (width - 1) & ~15.
+    ; Using width & ~15 gives the same value except when width is a multiple
+    ; of 16, where it would rewind every pointer 16 pixels too far.
+    lea         r6d, [r5 - 1]
+    and         r6d, ~15
+    neg         r6                              ; MUST BE 64-bits, it is Negative
+
+    ; move to next row
+
+    ; move back to start point
+    add         r3, r6
+    add         r4, r6
+
+    ; adjust with stride
+    lea         r0, [r0 + (r6 + 64) * 2]        ; 64 = MAX_CU_SIZE
+    add         r1, r2
+    lea         r1, [r1 + r6 * SIZEOF_PIXEL]
+
+    ; restore unavailable pixels (r3 is the buffer that was r4 at the backup)
+    movh        [r3 + r5 + 1], m0
+
+    dec         byte r6m                        ; row counter lives in the argument's stack slot
+    jg          .loopH
+
+    ; sum to global buffer
+    mov         r1, r7m                         ; r1 = stats table (32-bit sums)
+    mov         r0, r8m                         ; r0 = count table
+
+    ; s_eoTable = {1,2,0,3,4}
+    ; counters: widen entries 0..3 to dwords, reorder, add; entry 4 by itself
+    pmovzxwd    m0, [rsp + 0 * 2]
+    pshufd      m0, m0, q3102
+    movu        m1, [r0]
+    paddd       m0, m1
+    movu        [r0], m0
+    movzx       r5d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r5d
+
+    ; sums: same reorder for entries 0..3; entry 4 by itself
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
+    pshufd      m0, m0, q3102
+    movu        m1, [r1]
+    paddd       m0, m1
+    movu        [r1], m0
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+
+%else ; HIGH_BIT_DEPTH == 0
+
; TODO: x64 only because I need temporary register r7,r8, easy portab to x86
INIT_XMM sse4
cglobal saoCuStatsE2, 5,9,8,0-32 ; Stack: 5 of stats and 5 of count
@@ -2989,6 +3112,7 @@
add [r1 + 4 * 4], r6d
RET
+%endif ; HIGH_BIT_DEPTH == 0
INIT_YMM avx2
cglobal saoCuStatsE2, 5,10,16 ; Stack: 5 of stats and 5 of count
@@ -3216,6 +3340,119 @@
;}
%if ARCH_X86_64
+
+%if HIGH_BIT_DEPTH == 1
+;-----------------------------------------------------------------------------
+; saoCuStatsE3 -- HIGH_BIT_DEPTH build, SSE4, x86-64 only (needs r7 and r8).
+;
+; For each pixel of a (width x rows) region: compare the pixel with the one a
+; row below and a column to the left, form
+;     edgeType = upBuff1[x] + 2 + signDown
+; then do count[edgeType]++ and stats[edgeType] += (16-bit value at r0[x]).
+; Totals are gathered in a 32-byte stack scratch area and added to the
+; caller's tables at the end, reordered by s_eoTable = {1,2,0,3,4}.
+;
+; Arguments as they are used below (the C-level names are not visible in this
+; file section -- confirm against the primitive's prototype):
+;   r0  = signed 16-bit values summed into stats; row pitch is 64 elements
+;   r1  = pixels, SIZEOF_PIXEL bytes each
+;   r2  = pixel stride (passed through FIX_STRIDES)
+;   r3  = int8 sign buffer; entry x is read, entry x - 1 is rewritten
+;   r4m = pixels per row; must be > 0 because the inner loop counts to zero
+;   r5m = number of rows
+;   r6m = int32 stats[5], updated in place
+;   r7m = int32 count[5], updated in place
+;
+; Scratch layout: [rsp + 0]  five 16-bit counters
+;                 [rsp + 10] five 32-bit sums
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal saoCuStatsE3, 4,9,8,0-32        ; Stack: 5 of stats and 5 of count
+    mov         r4d, r4m                        ; 32-bit writes also clear the upper halves
+    mov         r5d, r5m
+    FIX_STRIDES r2d                             ; macro defined elsewhere; presumably pixel -> byte stride
+
+    ; clear internal temporary buffer
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m5, [pw_1]                      ; masks a compare result down to +1 per word
+    mova        m6, [pb_2]                      ; the "+ 2" bias of edgeType, per byte
+    movh        m7, [r3 + r4]                   ; backup 8 bytes past the row; put back after the last row
+
+.loopH:
+    mov         r6d, r4d                        ; r6d = pixels left in this row
+
+.loopW:
+    ; m3 = -signDown for pixels 0..7 (words):
+    ;   -1 where rec[x] > rec[x + stride - 1], +1 where it is smaller, else 0
+    movu        m1, [r1]
+    movu        m2, [r1 + r2 - 1 * SIZEOF_PIXEL]
+    pcmpgtw     m3, m1, m2
+    pcmpgtw     m2, m1
+    pand        m2, m5
+    por         m3, m2
+
+    ; the same for pixels 8..15, then narrow all 16 results to int8 in m3
+    movu        m1, [r1 + mmsize]
+    movu        m2, [r1 + r2 - 1 * SIZEOF_PIXEL + mmsize]
+    pcmpgtw     m4, m1, m2
+    pcmpgtw     m2, m1
+    pand        m2, m5
+    por         m4, m2
+    packsswb    m3, m4
+
+    ; edgeType = upBuff1[x] + 2 - (-signDown)
+    movu        m4, [r3]
+    paddb       m4, m6
+    psubb       m4, m3
+
+    ; update upBuff1: entry x - 1 = -signDown, stored only after entry x was read above
+    movu        [r3 - 1], m3
+
+    ; 16 pixels, one at a time; leaves through .next as soon as the row is
+    ; used up, i.e. BEFORE the pointer updates that follow the %rep block
+%assign x 0
+%rep 16
+    pextrb      r7d, m4, x                      ; r7 = edgeType of pixel x
+    inc         word [rsp + r7 * 2]             ; count[edgeType]++
+
+    movsx       r8d, word [r0 + x * 2]
+    add         [rsp + 5 * 2 + r7 * 4], r8d     ; stats[edgeType] += value
+
+    dec         r6d
+    jz          .next
+%assign x x+1
+%endrep
+
+    add         r0, 16 * 2
+    add         r1, 16 * SIZEOF_PIXEL
+    add         r3, 16
+    jmp         .loopW
+
+.next:
+    ; r6 = -(pixels the pointers were advanced by in .loopW).
+    ; The last group of a row, full or partial, always exits through jz
+    ; before the pointer updates, so the advance is (width - 1) & ~15.
+    ; Using width & ~15 gives the same value except when width is a multiple
+    ; of 16, where it would rewind every pointer 16 pixels too far.
+    lea         r6d, [r4 - 1]
+    and         r6d, ~15
+    neg         r6                              ; MUST BE 64-bits, it is Negative
+
+    ; move to next row
+
+    ; move back to start point
+    add         r3, r6
+
+    ; adjust with stride
+    lea         r0, [r0 + (r6 + 64) * 2]        ; 64 = MAX_CU_SIZE
+    add         r1, r2
+    lea         r1, [r1 + r6 * SIZEOF_PIXEL]
+
+    dec         r5d
+    jg          .loopH
+
+    ; restore unavailable pixels
+    movh        [r3 + r4], m7
+
+    ; sum to global buffer
+    mov         r1, r6m                         ; r1 = stats table (32-bit sums)
+    mov         r0, r7m                         ; r0 = count table
+
+    ; s_eoTable = {1,2,0,3,4}
+    ; counters: widen entries 0..3 to dwords, reorder, add; entry 4 by itself
+    pmovzxwd    m0, [rsp + 0 * 2]
+    pshufd      m0, m0, q3102
+    movu        m1, [r0]
+    paddd       m0, m1
+    movu        [r0], m0
+    movzx       r5d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r5d
+
+    ; sums: same reorder for entries 0..3; entry 4 by itself
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
+    pshufd      m0, m0, q3102
+    movu        m1, [r1]
+    paddd       m0, m1
+    movu        [r1], m0
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+
+%else ; HIGH_BIT_DEPTH == 0
+
INIT_XMM sse4
cglobal saoCuStatsE3, 4,9,8,0-32 ; Stack: 5 of stats and 5 of count
mov r4d, r4m
@@ -3321,6 +3558,7 @@
add [r1 + 4 * 4], r6d
RET
+%endif ; HIGH_BIT_DEPTH == 0
INIT_YMM avx2
cglobal saoCuStatsE3, 4,10,16 ; Stack: 5 of stats and 5 of count
More information about the x265-devel
mailing list