[x265] [PATCH 4 of 5] asm: AVX2 version of saoCuStatsE2, (138180c -> 44906c)
Min Chen
chenm003 at 163.com
Wed Dec 9 23:27:12 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1449698985 21600
# Node ID a5f81208a7ba8043261c009582995c48a1c40f37
# Parent 7ad2050bc2aaa8083b4e2de14d5846e5074b7b73
asm: AVX2 version of saoCuStatsE2, (138180c -> 44906c)
---
source/common/x86/asm-primitives.cpp | 1 +
source/common/x86/loopfilter.asm | 707 ++++++++++++++++++++++------------
2 files changed, 453 insertions(+), 255 deletions(-)
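For readers of the new kernel, here is a minimal scalar sketch of the statistics loop it vectorizes, reconstructed from the saoCuStatsE2_c prototype and the comments in the asm below; it is not the upstream reference code. signOf, NUM_EDGETYPE and saoCuStatsE2_ref are illustrative names, and the fenc argument is assumed to hold precomputed 16-bit (fenc - rec) differences with a MAX_CU_SIZE row stride, as the asm comments ("up to 14bits", "64 = MAX_CU_SIZE") suggest:

    #include <stdint.h>
    #include <algorithm>

    typedef uint8_t pixel;
    enum { MAX_CU_SIZE = 64, NUM_EDGETYPE = 5 };
    static const int s_eoTable[NUM_EDGETYPE] = { 1, 2, 0, 3, 4 };

    static inline int signOf(int x) { return (x > 0) - (x < 0); }

    static void saoCuStatsE2_ref(const int16_t* fenc, const pixel* rec, intptr_t stride,
                                 int8_t* upBuff1, int8_t* upBufft, int endX, int endY,
                                 int32_t* stats, int32_t* count)
    {
        int32_t tmp_stats[NUM_EDGETYPE] = { 0 };
        int32_t tmp_count[NUM_EDGETYPE] = { 0 };

        for (int y = 0; y < endY; y++)
        {
            // sign of the down-right neighbour of the left column (upBuffX[0] in the asm)
            upBufft[0] = (int8_t)signOf(rec[stride] - rec[-1]);

            for (int x = 0; x < endX; x++)
            {
                // signDown: compare current pixel against its down-right neighbour
                int signDown = signOf(rec[x] - rec[x + stride + 1]);
                int edgeType = signDown + upBuff1[x] + 2;   // 0..4
                upBufft[x + 1] = (int8_t)(-signDown);

                tmp_count[edgeType]++;
                tmp_stats[edgeType] += fenc[x];             // precomputed fenc - rec difference
            }

            std::swap(upBuff1, upBufft);                    // xchg r3, r4 in the asm
            rec += stride;
            fenc += MAX_CU_SIZE;
        }

        // fold the per-edgeType sums into the caller's arrays via s_eoTable
        for (int i = 0; i < NUM_EDGETYPE; i++)
        {
            stats[s_eoTable[i]] += tmp_stats[i];
            count[s_eoTable[i]] += tmp_count[i];
        }
    }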
diff -r 7ad2050bc2aa -r a5f81208a7ba source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:43 2015 -0600
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:45 2015 -0600
@@ -3638,6 +3638,7 @@
p.propagateCost = PFX(mbtree_propagate_cost_avx2);
p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2);
p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2);
+ p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
if (cpuMask & X265_CPU_BMI2)
{
diff -r 7ad2050bc2aa -r a5f81208a7ba source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Dec 09 16:09:43 2015 -0600
+++ b/source/common/x86/loopfilter.asm Wed Dec 09 16:09:45 2015 -0600
@@ -2467,7 +2467,7 @@
; m[1-4] free in here
- ; get current process mask
+ ; get current process group mask
mov r7d, 16
mov r8d, r6d
cmp r6d, r7d
@@ -2592,248 +2592,6 @@
RET
%endif ; ARCH_X86_64
-%if ARCH_X86_64
-;; argument registers used -
-; r0 - src
-; r1 - srcStep
-; r2 - offset
-; r3 - tcP
-; r4 - tcQ
-
-INIT_XMM sse4
-cglobal pelFilterLumaStrong_H, 5,7,10
- mov r1, r2
- neg r3d
- neg r4d
- neg r1
-
- lea r5, [r2 * 3]
- lea r6, [r1 * 3]
-
- pmovzxbw m4, [r0] ; src[0]
- pmovzxbw m3, [r0 + r1] ; src[-offset]
- pmovzxbw m2, [r0 + r1 * 2] ; src[-offset * 2]
- pmovzxbw m1, [r0 + r6] ; src[-offset * 3]
- pmovzxbw m0, [r0 + r1 * 4] ; src[-offset * 4]
- pmovzxbw m5, [r0 + r2] ; src[offset]
- pmovzxbw m6, [r0 + r2 * 2] ; src[offset * 2]
- pmovzxbw m7, [r0 + r5] ; src[offset * 3]
-
- paddw m0, m0 ; m0*2
- mova m8, m2
- paddw m8, m3 ; m2 + m3
- paddw m8, m4 ; m2 + m3 + m4
- mova m9, m8
- paddw m9, m9 ; 2*m2 + 2*m3 + 2*m4
- paddw m8, m1 ; m2 + m3 + m4 + m1
- paddw m0, m8 ; 2*m0 + m2+ m3 + m4 + m1
- paddw m9, m1
- paddw m0, m1
- paddw m9, m5 ; m1 + 2*m2 + 2*m3 + 2*m4 + m5
- paddw m0, m1 ; 2*m0 + 3*m1 + m2 + m3 + m4
-
- punpcklqdq m0, m9
- punpcklqdq m1, m3
-
- paddw m3, m4
- mova m9, m5
- paddw m9, m6
- paddw m7, m7 ; 2*m7
- paddw m9, m3 ; m3 + m4 + m5 + m6
- mova m3, m9
- paddw m3, m3 ; 2*m3 + 2*m4 + 2*m5 + 2*m6
- paddw m7, m9 ; 2*m7 + m3 + m4 + m5 + m6
- paddw m7, m6
- psubw m3, m6 ; 2*m3 + 2*m4 + 2*m5 + m6
- paddw m7, m6 ; m3 + m4 + m5 + 3*m6 + 2*m7
- paddw m3, m2 ; m2 + 2*m3 + 2*m4 + 2*m5 + m6
-
- punpcklqdq m9, m8
- punpcklqdq m3, m7
- punpcklqdq m5, m2
- punpcklqdq m4, m6
-
- movd m7, r3d ; -tcP
- movd m2, r4d ; -tcQ
- pshufb m7, [pb_01]
- pshufb m2, [pb_01]
- mova m6, m2
- punpcklqdq m6, m7
-
- paddw m0, [pw_4]
- paddw m3, [pw_4]
- paddw m9, [pw_2]
-
- psraw m0, 3
- psraw m3, 3
- psraw m9, 2
-
- psubw m0, m1
- psubw m3, m4
- psubw m9, m5
-
- pmaxsw m0, m7
- pmaxsw m3, m2
- pmaxsw m9, m6
- psignw m7, [pw_n1]
- psignw m2, [pw_n1]
- psignw m6, [pw_n1]
- pminsw m0, m7
- pminsw m3, m2
- pminsw m9, m6
-
- paddw m0, m1
- paddw m3, m4
- paddw m9, m5
- packuswb m0, m0
- packuswb m3, m9
-
- movd [r0 + r6], m0
- pextrd [r0 + r1], m0, 1
- movd [r0], m3
- pextrd [r0 + r2 * 2], m3, 1
- pextrd [r0 + r2 * 1], m3, 2
- pextrd [r0 + r1 * 2], m3, 3
- RET
-
-INIT_XMM sse4
-cglobal pelFilterLumaStrong_V, 5,5,10
- neg r3d
- neg r4d
- lea r2, [r1 * 3]
-
- movh m0, [r0 - 4] ; src[-offset * 4] row 0
- movh m1, [r0 + r1 * 1 - 4] ; src[-offset * 4] row 1
- movh m2, [r0 + r1 * 2 - 4] ; src[-offset * 4] row 2
- movh m3, [r0 + r2 * 1 - 4] ; src[-offset * 4] row 3
-
- punpcklbw m0, m1
- punpcklbw m2, m3
- mova m4, m0
- punpcklwd m0, m2
- punpckhwd m4, m2
- mova m1, m0
- mova m2, m0
- mova m3, m0
- pshufd m0, m0, 0
- pshufd m1, m1, 1
- pshufd m2, m2, 2
- pshufd m3, m3, 3
- mova m5, m4
- mova m6, m4
- mova m7, m4
- pshufd m4, m4, 0
- pshufd m5, m5, 1
- pshufd m6, m6, 2
- pshufd m7, m7, 3
- pmovzxbw m0, m0
- pmovzxbw m1, m1
- pmovzxbw m2, m2
- pmovzxbw m3, m3
- pmovzxbw m4, m4
- pmovzxbw m5, m5
- pmovzxbw m6, m6
- pmovzxbw m7, m7
-
- paddw m0, m0 ; m0*2
- mova m8, m2
- paddw m8, m3 ; m2 + m3
- paddw m8, m4 ; m2 + m3 + m4
- mova m9, m8
- paddw m9, m9 ; 2*m2 + 2*m3 + 2*m4
- paddw m8, m1 ; m2 + m3 + m4 + m1
- paddw m0, m8 ; 2*m0 + m2+ m3 + m4 + m1
- paddw m9, m1
- paddw m0, m1
- paddw m9, m5 ; m1 + 2*m2 + 2*m3 + 2*m4 + m5
- paddw m0, m1 ; 2*m0 + 3*m1 + m2 + m3 + m4
-
- punpcklqdq m0, m9
- punpcklqdq m1, m3
-
- paddw m3, m4
- mova m9, m5
- paddw m9, m6
- paddw m7, m7 ; 2*m7
- paddw m9, m3 ; m3 + m4 + m5 + m6
- mova m3, m9
- paddw m3, m3 ; 2*m3 + 2*m4 + 2*m5 + 2*m6
- paddw m7, m9 ; 2*m7 + m3 + m4 + m5 + m6
- paddw m7, m6
- psubw m3, m6 ; 2*m3 + 2*m4 + 2*m5 + m6
- paddw m7, m6 ; m3 + m4 + m5 + 3*m6 + 2*m7
- paddw m3, m2 ; m2 + 2*m3 + 2*m4 + 2*m5 + m6
-
- punpcklqdq m9, m8
- punpcklqdq m3, m7
- punpcklqdq m5, m2
- punpcklqdq m4, m6
-
- movd m7, r3d ; -tcP
- movd m2, r4d ; -tcQ
- pshufb m7, [pb_01]
- pshufb m2, [pb_01]
- mova m6, m2
- punpcklqdq m6, m7
-
- paddw m0, [pw_4]
- paddw m3, [pw_4]
- paddw m9, [pw_2]
-
- psraw m0, 3
- psraw m3, 3
- psraw m9, 2
-
- psubw m0, m1
- psubw m3, m4
- psubw m9, m5
-
- pmaxsw m0, m7
- pmaxsw m3, m2
- pmaxsw m9, m6
- psignw m7, [pw_n1]
- psignw m2, [pw_n1]
- psignw m6, [pw_n1]
- pminsw m0, m7
- pminsw m3, m2
- pminsw m9, m6
-
- paddw m0, m1
- paddw m3, m4
- paddw m9, m5
- packuswb m0, m0
- packuswb m3, m9
-
- ; 4x6 output rows -
- ; m0 - col 0
- ; m3 - col 3
- mova m1, m0
- mova m2, m3
- mova m4, m3
- mova m5, m3
- pshufd m1, m1, 1 ; col 2
- pshufd m2, m2, 1 ; col 5
- pshufd m4, m4, 2 ; col 4
- pshufd m5, m5, 3 ; col 1
-
- ; transpose 4x6 to 6x4
- punpcklbw m0, m5
- punpcklbw m1, m3
- punpcklbw m4, m2
- punpcklwd m0, m1
-
- movd [r0 + r1 * 0 - 3], m0
- pextrd [r0 + r1 * 1 - 3], m0, 1
- pextrd [r0 + r1 * 2 - 3], m0, 2
- pextrd [r0 + r2 * 1 - 3], m0, 3
- pextrw [r0 + r1 * 0 + 1], m4, 0
- pextrw [r0 + r1 * 1 + 1], m4, 1
- pextrw [r0 + r1 * 2 + 1], m4, 2
- pextrw [r0 + r2 * 1 + 1], m4, 3
- RET
-%endif ; ARCH_X86_64
-
-
;void saoCuStatsE2_c(const int16_t *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
;{
@@ -2900,6 +2658,7 @@
movu m2, [r1 + r2 + 1]
; signDown
+ ; stats[edgeType]
pxor m1, m0
pxor m2, m0
pcmpgtb m3, m1, m2
@@ -2917,9 +2676,6 @@
; update upBuff1
movu [r4 + 1], m3
- ; stats[edgeType]
- pxor m1, m0
-
; 16 pixels
%assign x 0
%rep 16
@@ -2986,6 +2742,205 @@
mov r6d, [rsp + 5 * 2 + 4 * 4]
add [r1 + 4 * 4], r6d
RET
+
+
+INIT_YMM avx2
+cglobal saoCuStatsE2, 5,10,16 ; Stack: 5 of stats and 5 of count
+ mov r5d, r5m
+
+ ; clear internal temporary buffer
+ pxor xm6, xm6 ; count[0]
+ pxor xm7, xm7 ; count[1]
+ pxor xm8, xm8 ; count[2]
+ pxor xm9, xm9 ; count[3]
+ pxor xm10, xm10 ; count[4]
+ pxor xm11, xm11 ; stats[0]
+ pxor xm12, xm12 ; stats[1]
+ pxor xm13, xm13 ; stats[2]
+ pxor xm14, xm14 ; stats[3]
+ pxor xm15, xm15 ; stats[4]
+ mova m0, [pb_128]
+
+ ; unavailable mask
+ lea r9, [pb_movemask_32 + 32]
+
+.loopH:
+ ; TODO: merge into the SIMD below
+ ; get upBuffX[0]
+ mov r6b, [r1 + r2]
+ sub r6b, [r1 - 1]
+ seta r6b
+ setb r7b
+ sub r6b, r7b
+ mov [r4], r6b
+
+ ; backup unavailable pixels
+ movq xm5, [r4 + r5 + 1]
+
+ mov r6d, r5d
+.loopW:
+ movu m1, [r1]
+ movu m2, [r1 + r2 + 1]
+
+ ; signDown
+ ; stats[edgeType]
+ pxor xm1, xm0
+ pxor xm2, xm0
+ pcmpgtb xm3, xm1, xm2
+ pand xm3, [pb_1]
+ pcmpgtb xm2, xm1
+ por xm2, xm3
+ psignb xm3, xm2, xm0
+
+ ; edgeType
+ movu xm4, [r3]
+ paddb xm4, [pb_2]
+ paddb xm2, xm4
+
+ ; update upBuff1
+ movu [r4 + 1], xm3
+
+ ; m1-m4 are free here
+
+ ; get current process group mask
+ mov r7d, 16
+ mov r8d, r6d
+ cmp r6d, r7d
+ cmovge r8d, r7d
+ neg r8
+ movu xm1, [r9 + r8]
+
+ ; tmp_count[edgeType]++
+ ; tmp_stats[edgeType] += (fenc[x] - rec[x])
+ pxor xm3, xm3
+ por xm1, xm2 ; apply unavailable pixel mask
+ movu m4, [r0] ; up to 14bits
+
+ pcmpeqb xm3, xm1, xm3
+ psubb xm6, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m11, m3
+
+ pcmpeqb xm3, xm1, [pb_1]
+ psubb xm7, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m12, m3
+
+ pcmpeqb xm3, xm1, [pb_2]
+ psubb xm8, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m13, m3
+
+ pcmpeqb xm3, xm1, [pb_3]
+ psubb xm9, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m14, m3
+
+ pcmpeqb xm3, xm1, [pb_4]
+ psubb xm10, xm3
+ pmovsxbw m2, xm3
+ pmaddwd m3, m4, m2
+ paddd m15, m3
+
+ sub r6d, r7d
+ jle .next
+
+ add r0, 16*2
+ add r1, 16
+ add r3, 16
+ add r4, 16
+ jmp .loopW
+
+.next:
+ xchg r3, r4
+
+ ; restore pointer upBuff1
+ ; TODO: BZHI
+ mov r6d, r5d
+ and r6d, ~15
+ neg r6 ; MUST be 64-bit, the value is negative
+
+ ; move to next row
+
+ ; move back to start point
+ add r3, r6
+ add r4, r6
+
+ ; adjust with stride
+ lea r0, [r0 + (r6 + 64) * 2] ; 64 = MAX_CU_SIZE
+ add r1, r2
+ add r1, r6
+
+ ; restore unavailable pixels
+ movq [r3 + r5 + 1], xm5
+
+ dec byte r6m
+ jg .loopH
+
+ ; sum to global buffer
+ mov r1, r7m
+ mov r0, r8m
+
+ ; sum into word
+ ; WARNING: there is an overflow risk when ALL pixels of a 64x64 block have the SAME edge type (the HM algorithm never passes a 64x64 block in here)
+ pxor xm0, xm0
+ psadbw xm1, xm6, xm0
+ psadbw xm2, xm7, xm0
+ psadbw xm3, xm8, xm0
+ psadbw xm4, xm9, xm0
+ psadbw xm5, xm10, xm0
+ pshufd xm1, xm1, q3120
+ pshufd xm2, xm2, q3120
+ pshufd xm3, xm3, q3120
+ pshufd xm4, xm4, q3120
+
+ ; sum count[4] only
+ movhlps xm6, xm5
+ paddd xm5, xm6
+
+ ; sum count[s_eoTable]
+ ; s_eoTable = {1, 2, 0, 3, 4}
+ punpcklqdq xm3, xm1
+ punpcklqdq xm2, xm4
+ phaddd xm3, xm2
+ movu xm1, [r0]
+ paddd xm3, xm1
+ movu [r0], xm3
+ movd r5d, xm5
+ add [r0 + 4 * 4], r5d
+
+ ; sum stats[s_eoTable]
+ vextracti128 xm1, m11, 1
+ paddd xm1, xm11
+ vextracti128 xm2, m12, 1
+ paddd xm2, xm12
+ vextracti128 xm3, m13, 1
+ paddd xm3, xm13
+ vextracti128 xm4, m14, 1
+ paddd xm4, xm14
+ vextracti128 xm5, m15, 1
+ paddd xm5, xm15
+
+ ; s_eoTable = {1, 2, 0, 3, 4}
+ phaddd xm3, xm1
+ phaddd xm2, xm4
+ phaddd xm3, xm2
+ psubd xm3, xm0, xm3 ; negate to compensate for the PMADDWD sign trick (mask lanes are -1)
+
+ ; sum stats[4] only
+ HADDD xm5, xm6
+ psubd xm5, xm0, xm5
+
+ movu xm1, [r1]
+ paddd xm3, xm1
+ movu [r1], xm3
+ movd r6d, xm5
+ add [r1 + 4 * 4], r6d
+ RET
%endif ; ARCH_X86_64
@@ -3120,3 +3075,245 @@
add [r1 + 4 * 4], r6d
RET
%endif ; ARCH_X86_64
+
+
+%if ARCH_X86_64
+;; argument registers used -
+; r0 - src
+; r1 - srcStep
+; r2 - offset
+; r3 - tcP
+; r4 - tcQ
+
+INIT_XMM sse4
+cglobal pelFilterLumaStrong_H, 5,7,10
+ mov r1, r2
+ neg r3d
+ neg r4d
+ neg r1
+
+ lea r5, [r2 * 3]
+ lea r6, [r1 * 3]
+
+ pmovzxbw m4, [r0] ; src[0]
+ pmovzxbw m3, [r0 + r1] ; src[-offset]
+ pmovzxbw m2, [r0 + r1 * 2] ; src[-offset * 2]
+ pmovzxbw m1, [r0 + r6] ; src[-offset * 3]
+ pmovzxbw m0, [r0 + r1 * 4] ; src[-offset * 4]
+ pmovzxbw m5, [r0 + r2] ; src[offset]
+ pmovzxbw m6, [r0 + r2 * 2] ; src[offset * 2]
+ pmovzxbw m7, [r0 + r5] ; src[offset * 3]
+
+ paddw m0, m0 ; m0*2
+ mova m8, m2
+ paddw m8, m3 ; m2 + m3
+ paddw m8, m4 ; m2 + m3 + m4
+ mova m9, m8
+ paddw m9, m9 ; 2*m2 + 2*m3 + 2*m4
+ paddw m8, m1 ; m2 + m3 + m4 + m1
+ paddw m0, m8 ; 2*m0 + m2+ m3 + m4 + m1
+ paddw m9, m1
+ paddw m0, m1
+ paddw m9, m5 ; m1 + 2*m2 + 2*m3 + 2*m4 + m5
+ paddw m0, m1 ; 2*m0 + 3*m1 + m2 + m3 + m4
+
+ punpcklqdq m0, m9
+ punpcklqdq m1, m3
+
+ paddw m3, m4
+ mova m9, m5
+ paddw m9, m6
+ paddw m7, m7 ; 2*m7
+ paddw m9, m3 ; m3 + m4 + m5 + m6
+ mova m3, m9
+ paddw m3, m3 ; 2*m3 + 2*m4 + 2*m5 + 2*m6
+ paddw m7, m9 ; 2*m7 + m3 + m4 + m5 + m6
+ paddw m7, m6
+ psubw m3, m6 ; 2*m3 + 2*m4 + 2*m5 + m6
+ paddw m7, m6 ; m3 + m4 + m5 + 3*m6 + 2*m7
+ paddw m3, m2 ; m2 + 2*m3 + 2*m4 + 2*m5 + m6
+
+ punpcklqdq m9, m8
+ punpcklqdq m3, m7
+ punpcklqdq m5, m2
+ punpcklqdq m4, m6
+
+ movd m7, r3d ; -tcP
+ movd m2, r4d ; -tcQ
+ pshufb m7, [pb_01]
+ pshufb m2, [pb_01]
+ mova m6, m2
+ punpcklqdq m6, m7
+
+ paddw m0, [pw_4]
+ paddw m3, [pw_4]
+ paddw m9, [pw_2]
+
+ psraw m0, 3
+ psraw m3, 3
+ psraw m9, 2
+
+ psubw m0, m1
+ psubw m3, m4
+ psubw m9, m5
+
+ pmaxsw m0, m7
+ pmaxsw m3, m2
+ pmaxsw m9, m6
+ psignw m7, [pw_n1]
+ psignw m2, [pw_n1]
+ psignw m6, [pw_n1]
+ pminsw m0, m7
+ pminsw m3, m2
+ pminsw m9, m6
+
+ paddw m0, m1
+ paddw m3, m4
+ paddw m9, m5
+ packuswb m0, m0
+ packuswb m3, m9
+
+ movd [r0 + r6], m0
+ pextrd [r0 + r1], m0, 1
+ movd [r0], m3
+ pextrd [r0 + r2 * 2], m3, 1
+ pextrd [r0 + r2 * 1], m3, 2
+ pextrd [r0 + r1 * 2], m3, 3
+ RET
+
+INIT_XMM sse4
+cglobal pelFilterLumaStrong_V, 5,5,10
+ neg r3d
+ neg r4d
+ lea r2, [r1 * 3]
+
+ movh m0, [r0 - 4] ; src[-offset * 4] row 0
+ movh m1, [r0 + r1 * 1 - 4] ; src[-offset * 4] row 1
+ movh m2, [r0 + r1 * 2 - 4] ; src[-offset * 4] row 2
+ movh m3, [r0 + r2 * 1 - 4] ; src[-offset * 4] row 3
+
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ mova m4, m0
+ punpcklwd m0, m2
+ punpckhwd m4, m2
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ pshufd m0, m0, 0
+ pshufd m1, m1, 1
+ pshufd m2, m2, 2
+ pshufd m3, m3, 3
+ mova m5, m4
+ mova m6, m4
+ mova m7, m4
+ pshufd m4, m4, 0
+ pshufd m5, m5, 1
+ pshufd m6, m6, 2
+ pshufd m7, m7, 3
+ pmovzxbw m0, m0
+ pmovzxbw m1, m1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ pmovzxbw m4, m4
+ pmovzxbw m5, m5
+ pmovzxbw m6, m6
+ pmovzxbw m7, m7
+
+ paddw m0, m0 ; m0*2
+ mova m8, m2
+ paddw m8, m3 ; m2 + m3
+ paddw m8, m4 ; m2 + m3 + m4
+ mova m9, m8
+ paddw m9, m9 ; 2*m2 + 2*m3 + 2*m4
+ paddw m8, m1 ; m2 + m3 + m4 + m1
+ paddw m0, m8 ; 2*m0 + m2+ m3 + m4 + m1
+ paddw m9, m1
+ paddw m0, m1
+ paddw m9, m5 ; m1 + 2*m2 + 2*m3 + 2*m4 + m5
+ paddw m0, m1 ; 2*m0 + 3*m1 + m2 + m3 + m4
+
+ punpcklqdq m0, m9
+ punpcklqdq m1, m3
+
+ paddw m3, m4
+ mova m9, m5
+ paddw m9, m6
+ paddw m7, m7 ; 2*m7
+ paddw m9, m3 ; m3 + m4 + m5 + m6
+ mova m3, m9
+ paddw m3, m3 ; 2*m3 + 2*m4 + 2*m5 + 2*m6
+ paddw m7, m9 ; 2*m7 + m3 + m4 + m5 + m6
+ paddw m7, m6
+ psubw m3, m6 ; 2*m3 + 2*m4 + 2*m5 + m6
+ paddw m7, m6 ; m3 + m4 + m5 + 3*m6 + 2*m7
+ paddw m3, m2 ; m2 + 2*m3 + 2*m4 + 2*m5 + m6
+
+ punpcklqdq m9, m8
+ punpcklqdq m3, m7
+ punpcklqdq m5, m2
+ punpcklqdq m4, m6
+
+ movd m7, r3d ; -tcP
+ movd m2, r4d ; -tcQ
+ pshufb m7, [pb_01]
+ pshufb m2, [pb_01]
+ mova m6, m2
+ punpcklqdq m6, m7
+
+ paddw m0, [pw_4]
+ paddw m3, [pw_4]
+ paddw m9, [pw_2]
+
+ psraw m0, 3
+ psraw m3, 3
+ psraw m9, 2
+
+ psubw m0, m1
+ psubw m3, m4
+ psubw m9, m5
+
+ pmaxsw m0, m7
+ pmaxsw m3, m2
+ pmaxsw m9, m6
+ psignw m7, [pw_n1]
+ psignw m2, [pw_n1]
+ psignw m6, [pw_n1]
+ pminsw m0, m7
+ pminsw m3, m2
+ pminsw m9, m6
+
+ paddw m0, m1
+ paddw m3, m4
+ paddw m9, m5
+ packuswb m0, m0
+ packuswb m3, m9
+
+ ; 4x6 output rows -
+ ; m0 - col 0
+ ; m3 - col 3
+ mova m1, m0
+ mova m2, m3
+ mova m4, m3
+ mova m5, m3
+ pshufd m1, m1, 1 ; col 2
+ pshufd m2, m2, 1 ; col 5
+ pshufd m4, m4, 2 ; col 4
+ pshufd m5, m5, 3 ; col 1
+
+ ; transpose 4x6 to 6x4
+ punpcklbw m0, m5
+ punpcklbw m1, m3
+ punpcklbw m4, m2
+ punpcklwd m0, m1
+
+ movd [r0 + r1 * 0 - 3], m0
+ pextrd [r0 + r1 * 1 - 3], m0, 1
+ pextrd [r0 + r1 * 2 - 3], m0, 2
+ pextrd [r0 + r2 * 1 - 3], m0, 3
+ pextrw [r0 + r1 * 0 + 1], m4, 0
+ pextrw [r0 + r1 * 1 + 1], m4, 1
+ pextrw [r0 + r1 * 2 + 1], m4, 2
+ pextrw [r0 + r2 * 1 + 1], m4, 3
+ RET
+%endif ; ARCH_X86_64