[x265] [PATCH 051 of 307] x86: AVX512 cleanup addAvg, copy_ps and copy_sp
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:49 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500980858 -19800
# Tue Jul 25 16:37:38 2017 +0530
# Node ID 984cad60283b474ed756238cf904b08df290e103
# Parent 09159f73f47b7eda15c8d0294774fe6eafdadea7
x86: AVX512 cleanup addAvg, copy_ps and copy_sp
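
[Editor's note] The hunks below replace the loop-based AVX512 macros (BLOCKCOPY_SP_W64_H4_avx512, BLOCKCOPY_PS_W32_H4_avx512, the blockcopy_ps_64x64 loop and ADDAVG_W64_H2_AVX512) with fully unrolled PROCESS_*_AVX512 building blocks. Dropping the loop counter lets each routine declare one fewer general-purpose register (e.g. "4, 6, 4" instead of "4, 7, 4"), and the 48-wide addAvg AVX2 macro is moved ahead of the new AVX512 section. For orientation, here is a minimal scalar sketch of the copy_sp operation the first kernels implement, assuming 8-bit pixels and element-unit strides; the function name and clamp are illustrative (the 0..255 saturation mirrors packuswb), not x265's actual C reference:

#include <stdint.h>

/* Hedged sketch: copy a block of int16_t coefficients to 8-bit pixels,
 * saturating to 0..255 the way packuswb does in the AVX512 kernel below. */
static void blockcopy_sp_ref(uint8_t *dst, intptr_t dstStride,
                             const int16_t *src, intptr_t srcStride,
                             int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int v = src[x];
            dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
        dst += dstStride;
        src += srcStride;
    }
}
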
diff -r 09159f73f47b -r 984cad60283b source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Jul 25 12:58:16 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Jul 25 16:37:38 2017 +0530
@@ -2162,15 +2162,7 @@
BLOCKCOPY_SP_W64_H4_avx2 64, 64
-%macro BLOCKCOPY_SP_W64_H4_avx512 2
-INIT_ZMM avx512
-cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
- mov r4d, %2/4
- add r3, r3
- lea r5, [3 * r3]
- lea r6, [3 * r1]
-
-.loop:
+%macro PROCESS_BLOCKCOPY_SP_64x8_AVX512 0
movu m0, [r2]
movu m1, [r2 + 64]
movu m2, [r2 + r3]
@@ -2187,8 +2179,8 @@
movu m0, [r2 + 2 * r3]
movu m1, [r2 + 2 * r3 + 64]
- movu m2, [r2 + r5]
- movu m3, [r2 + r5 + 64]
+ movu m2, [r2 + r4]
+ movu m3, [r2 + r4 + 64]
packuswb m0, m1
packuswb m2, m3
@@ -2197,17 +2189,69 @@
vshufi64x2 m0, m0, 11011000b
vshufi64x2 m2, m2, 11011000b
movu [r0 + 2 * r1], m0
- movu [r0 + r6], m2
+ movu [r0 + r5], m2
lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
- dec r4d
- jnz .loop
+ movu m0, [r2]
+ movu m1, [r2 + 64]
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 64]
+
+ packuswb m0, m1
+ packuswb m2, m3
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vshufi64x2 m0, m0, 11011000b
+ vshufi64x2 m2, m2, 11011000b
+ movu [r0], m0
+ movu [r0 + r1], m2
+
+ movu m0, [r2 + 2 * r3]
+ movu m1, [r2 + 2 * r3 + 64]
+ movu m2, [r2 + r4]
+ movu m3, [r2 + r4 + 64]
+
+ packuswb m0, m1
+ packuswb m2, m3
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vshufi64x2 m0, m0, 11011000b
+ vshufi64x2 m2, m2, 11011000b
+ movu [r0 + 2 * r1], m0
+ movu [r0 + r5], m2
+%endmacro
+
+INIT_ZMM avx512
+cglobal blockcopy_sp_64x64, 4, 6, 4
+ add r3, r3
+ lea r4, [3 * r3]
+ lea r5, [3 * r1]
+
+ PROCESS_BLOCKCOPY_SP_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_SP_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_SP_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_SP_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_SP_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_SP_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_SP_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_SP_64x8_AVX512
RET
-%endmacro
-
-BLOCKCOPY_SP_W64_H4_avx512 64, 64
;-----------------------------------------------------------------------------
; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
@@ -3184,35 +3228,78 @@
BLOCKCOPY_PS_W32_H4_avx2 32, 32
BLOCKCOPY_PS_W32_H4_avx2 32, 64
-%macro BLOCKCOPY_PS_W32_H4_avx512 2
-INIT_ZMM avx512
-cglobal blockcopy_ps_%1x%2, 4, 7, 4
- add r1, r1
- mov r4d, %2/8
- lea r5, [3 * r3]
- lea r6, [3 * r1]
-.loop:
-%rep 2
+%macro PROCESS_BLOCKCOPY_PS_32x8_AVX512 0
pmovzxbw m0, [r2]
pmovzxbw m1, [r2 + r3]
pmovzxbw m2, [r2 + r3 * 2]
- pmovzxbw m3, [r2 + r5]
+ pmovzxbw m3, [r2 + r4]
movu [r0], m0
movu [r0 + r1], m1
movu [r0 + r1 * 2], m2
- movu [r0 + r6], m3
+ movu [r0 + r5], m3
lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
-%endrep
- dec r4d
- jnz .loop
+
+ pmovzxbw m0, [r2]
+ pmovzxbw m1, [r2 + r3]
+ pmovzxbw m2, [r2 + r3 * 2]
+ pmovzxbw m3, [r2 + r4]
+
+ movu [r0], m0
+ movu [r0 + r1], m1
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r5], m3
+%endmacro
+
+INIT_ZMM avx512
+cglobal blockcopy_ps_32x32, 4, 6, 4
+ add r1, r1
+ lea r4, [3 * r3]
+ lea r5, [3 * r1]
+
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
RET
-%endmacro
-
-BLOCKCOPY_PS_W32_H4_avx512 32, 32
-BLOCKCOPY_PS_W32_H4_avx512 32, 64
+
+INIT_ZMM avx512
+cglobal blockcopy_ps_32x64, 4, 6, 4
+ add r1, r1
+ lea r4, [3 * r3]
+ lea r5, [3 * r1]
+
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
+ RET
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
@@ -3399,17 +3486,7 @@
jnz .loop
RET
-;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
-;-----------------------------------------------------------------------------
-INIT_ZMM avx512
-cglobal blockcopy_ps_64x64, 4, 7, 4
- add r1, r1
- mov r4d, 64/8
- lea r5, [3 * r3]
- lea r6, [3 * r1]
-.loop:
-%rep 2
+%macro PROCESS_BLOCKCOPY_PS_64x8_AVX512 0
pmovzxbw m0, [r2]
pmovzxbw m1, [r2 + 32]
pmovzxbw m2, [r2 + r3]
@@ -3421,18 +3498,65 @@
pmovzxbw m0, [r2 + r3 * 2]
pmovzxbw m1, [r2 + r3 * 2 + 32]
- pmovzxbw m2, [r2 + r5]
- pmovzxbw m3, [r2 + r5 + 32]
+ pmovzxbw m2, [r2 + r4]
+ pmovzxbw m3, [r2 + r4 + 32]
movu [r0 + r1 * 2], m0
movu [r0 + r1 * 2 + 64], m1
- movu [r0 + r6], m2
- movu [r0 + r6 + 64], m3
+ movu [r0 + r5], m2
+ movu [r0 + r5 + 64], m3
lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
-%endrep
- dec r4d
- jnz .loop
+
+ pmovzxbw m0, [r2]
+ pmovzxbw m1, [r2 + 32]
+ pmovzxbw m2, [r2 + r3]
+ pmovzxbw m3, [r2 + r3 + 32]
+ movu [r0], m0
+ movu [r0 + 64], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 64], m3
+
+ pmovzxbw m0, [r2 + r3 * 2]
+ pmovzxbw m1, [r2 + r3 * 2 + 32]
+ pmovzxbw m2, [r2 + r4]
+ pmovzxbw m3, [r2 + r4 + 32]
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r1 * 2 + 64], m1
+ movu [r0 + r5], m2
+ movu [r0 + r5 + 64], m3
+%endmacro
+;-----------------------------------------------------------------------------
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal blockcopy_ps_64x64, 4, 6, 4
+ add r1, r1
+ lea r4, [3 * r3]
+ lea r5, [3 * r1]
+
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
RET
;-----------------------------------------------------------------------------
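
[Editor's note] The blockcopy_ps kernels above widen 8-bit pixels to int16_t with pmovzxbw, 32 or 64 pixels per row segment. A minimal scalar sketch of that operation, assuming 8-bit pixels and element-unit strides (the name is illustrative, not x265's actual C reference):

#include <stdint.h>

/* Hedged sketch: copy a block of 8-bit pixels to int16_t, zero-extending
 * each byte to a word as pmovzxbw does in the AVX512 kernels above. */
static void blockcopy_ps_ref(int16_t *dst, intptr_t dstStride,
                             const uint8_t *src, intptr_t srcStride,
                             int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
            dst[x] = (int16_t)src[x];   /* zero-extend byte to word */
        dst += dstStride;
        src += srcStride;
    }
}
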
diff -r 09159f73f47b -r 984cad60283b source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Tue Jul 25 12:58:16 2017 +0530
+++ b/source/common/x86/mc-a.asm Tue Jul 25 16:37:38 2017 +0530
@@ -2892,17 +2892,85 @@
ADDAVG_W64_H2_AVX2 48
ADDAVG_W64_H2_AVX2 64
-%macro ADDAVG_W64_H2_AVX512 1
-INIT_ZMM avx512
-cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
- vbroadcasti32x8 m4, [pw_256]
- vbroadcasti32x8 m5, [pw_128]
+%macro ADDAVG_W48_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
+ mova m4, [pw_256]
+ mova m5, [pw_128]
add r3, r3
add r4, r4
- mov r6d, %1/16
+ mov r6d, %1/2
.loop:
-%rep 8
+ movu m0, [r0]
+ movu m1, [r1]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + 32]
+ movu m2, [r1 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2], m0
+
+ movu m0, [r0 + 64]
+ movu m1, [r1 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ vpermq m0, m0, 11011000b
+ vextracti128 [r2 + 32], m0, 0
+
+ movu m0, [r0 + r3]
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ movu m1, [r0 + r3 + 32]
+ movu m2, [r1 + r4 + 32]
+ paddw m1, m2
+ pmulhrsw m1, m4
+ paddw m1, m5
+
+ packuswb m0, m1
+ vpermq m0, m0, 11011000b
+ movu [r2 + r5], m0
+
+ movu m0, [r0 + r3 + 64]
+ movu m1, [r1 + r4 + 64]
+ paddw m0, m1
+ pmulhrsw m0, m4
+ paddw m0, m5
+
+ packuswb m0, m0
+ vpermq m0, m0, 11011000b
+ vextracti128 [r2 + r5 + 32], m0, 0
+
+ lea r2, [r2 + 2 * r5]
+ lea r0, [r0 + 2 * r3]
+ lea r1, [r1 + 2 * r4]
+
+ dec r6d
+ jnz .loop
+ RET
+%endmacro
+
+ADDAVG_W48_H2_AVX2 64
+
+;-----------------------------------------------------------------------------
+; addAvg avx2 code end
+;-----------------------------------------------------------------------------
+; addAvg avx512 code start
+;-----------------------------------------------------------------------------
+%macro PROCESS_ADDAVG_64x2_AVX512 0
movu m0, [r0]
movu m1, [r1]
movu m2, [r0 + 64]
@@ -2919,7 +2987,6 @@
vshufi64x2 m0, m0, 11011000b
movu [r2], m0
-
movu m0, [r0 + r3]
movu m1, [r1 + r4]
movu m2, [r0 + r3 + 64]
@@ -2935,99 +3002,35 @@
vpermq m0, m0, 11011000b
vshufi64x2 m0, m0, 11011000b
movu [r2 + r5], m0
-
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------------
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+;--------------------------------------------------------------------------------------------------------------------
+%macro ADDAVG_W64_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_64x%1, 6,6,6
+ vbroadcasti32x8 m4, [pw_256]
+ vbroadcasti32x8 m5, [pw_128]
+ add r3, r3
+ add r4, r4
+
+%rep %1/2 - 1
+ PROCESS_ADDAVG_64x2_AVX512
lea r2, [r2 + 2 * r5]
lea r0, [r0 + 2 * r3]
lea r1, [r1 + 2 * r4]
%endrep
-
- dec r6d
- jnz .loop
+ PROCESS_ADDAVG_64x2_AVX512
RET
%endmacro
-ADDAVG_W64_H2_AVX512 16
-ADDAVG_W64_H2_AVX512 32
-ADDAVG_W64_H2_AVX512 48
-ADDAVG_W64_H2_AVX512 64
-
-%macro ADDAVG_W48_H2_AVX2 1
-INIT_YMM avx2
-cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
- mova m4, [pw_256]
- mova m5, [pw_128]
- add r3, r3
- add r4, r4
- mov r6d, %1/2
-
-.loop:
- movu m0, [r0]
- movu m1, [r1]
- paddw m0, m1
- pmulhrsw m0, m4
- paddw m0, m5
-
- movu m1, [r0 + 32]
- movu m2, [r1 + 32]
- paddw m1, m2
- pmulhrsw m1, m4
- paddw m1, m5
-
- packuswb m0, m1
- vpermq m0, m0, 11011000b
- movu [r2], m0
-
- movu m0, [r0 + 64]
- movu m1, [r1 + 64]
- paddw m0, m1
- pmulhrsw m0, m4
- paddw m0, m5
-
- packuswb m0, m0
- vpermq m0, m0, 11011000b
- vextracti128 [r2 + 32], m0, 0
-
- movu m0, [r0 + r3]
- movu m1, [r1 + r4]
- paddw m0, m1
- pmulhrsw m0, m4
- paddw m0, m5
-
- movu m1, [r0 + r3 + 32]
- movu m2, [r1 + r4 + 32]
- paddw m1, m2
- pmulhrsw m1, m4
- paddw m1, m5
-
- packuswb m0, m1
- vpermq m0, m0, 11011000b
- movu [r2 + r5], m0
-
- movu m0, [r0 + r3 + 64]
- movu m1, [r1 + r4 + 64]
- paddw m0, m1
- pmulhrsw m0, m4
- paddw m0, m5
-
- packuswb m0, m0
- vpermq m0, m0, 11011000b
- vextracti128 [r2 + r5 + 32], m0, 0
-
- lea r2, [r2 + 2 * r5]
- lea r0, [r0 + 2 * r3]
- lea r1, [r1 + 2 * r4]
-
- dec r6d
- jnz .loop
- RET
-%endmacro
-
-ADDAVG_W48_H2_AVX2 64
-
+ADDAVG_W64_AVX512 16
+ADDAVG_W64_AVX512 32
+ADDAVG_W64_AVX512 48
+ADDAVG_W64_AVX512 64
;-----------------------------------------------------------------------------
-; addAvg avx2 code end
-;-----------------------------------------------------------------------------
-
+; addAvg avx512 code end
;-----------------------------------------------------------------------------
%macro ADDAVG_W24_H2 2
INIT_XMM sse4
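
[Editor's note] For the 8-bit addAvg path above, assuming pw_256 and pw_128 hold the word constants 256 and 128, the per-pixel arithmetic reduces to: pmulhrsw by 256 is a rounded (v + 64) >> 7, paddw of 128 re-centres the result into the pixel range, and packuswb saturates to 0..255. A minimal scalar sketch (function name and element-unit strides are illustrative, not x265's actual C reference):

#include <stdint.h>

/* Hedged sketch of the addAvg arithmetic realized by PROCESS_ADDAVG_64x2_AVX512:
 * average two 16-bit intermediate blocks with rounding and write 8-bit pixels. */
static void addAvg_ref(const int16_t *src0, const int16_t *src1, uint8_t *dst,
                       intptr_t src0Stride, intptr_t src1Stride,
                       intptr_t dstStride, int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int v = ((src0[x] + src1[x] + 64) >> 7) + 128; /* rounded shift + offset */
            dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* packuswb saturation */
        }
        src0 += src0Stride;
        src1 += src1Stride;
        dst  += dstStride;
    }
}
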