[x265] [PATCH 210 of 307] x86: AVX512 cleanup main profile sad, sad_x3, sad_x4 implementation
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:28 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1511870877 -19800
# Tue Nov 28 17:37:57 2017 +0530
# Node ID 240ae5a46e63d3bebd8a4db63a5662a4000d70a7
# Parent af867976d51969b1770e6bcffd80e0389c88b561
x86: AVX512 cleanup main profile sad, sad_x3, sad_x4 implementation
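
Context for the refactor: the duplicated 8-row kernels are folded into 4-row PROCESS_* macros, and each pixel_sad[_x3|_x4]_WxH entry point is now emitted by a parameterized macro that repeats the 4-row block via %rep (height/4 - 1 iterations with the pointer advance, plus one final block), instead of being written out by hand per block size. As a rough scalar sketch of what the sad_x4 kernels compute; the function name, prototype and the FENC_STRIDE value are illustrative, not taken from this patch:

#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 64   /* assumption: fixed encoder-block stride, as used by the asm */

/* Reference model: SAD of one fenc block against four candidate blocks that
 * share a single stride.  The AVX512 kernels vectorize the inner loops with
 * psadbw/paddd and keep one running sum per candidate (m0..m3). */
static void sad_x4_ref(int width, int height, const uint8_t *fenc,
                       const uint8_t *ref0, const uint8_t *ref1,
                       const uint8_t *ref2, const uint8_t *ref3,
                       intptr_t refStride, int32_t res[4])
{
    const uint8_t *refs[4] = { ref0, ref1, ref2, ref3 };
    for (int k = 0; k < 4; k++)
    {
        int32_t sum = 0;
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++)
                sum += abs(fenc[y * FENC_STRIDE + x] - refs[k][y * refStride + x]);
        res[k] = sum;
    }
}

The sad_x3 path is the same computation minus the fourth candidate, which is why its macros mirror the x4 ones with one accumulator fewer.
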
diff -r af867976d519 -r 240ae5a46e63 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Tue Nov 28 15:52:13 2017 +0530
+++ b/source/common/x86/sad-a.asm Tue Nov 28 17:37:57 2017 +0530
@@ -4132,7 +4132,7 @@
;------------------------------------------------------------
;sad_x4 avx512 code start
;------------------------------------------------------------
-%macro SAD_X4_64x8_AVX512 0
+%macro PROCESS_SAD_X4_64x4_AVX512 0
movu m4, [r0]
movu m5, [r1]
movu m6, [r2]
@@ -4140,12 +4140,13 @@
movu m8, [r4]
psadbw m9, m4, m5
+ psadbw m5, m4, m6
+ psadbw m6, m4, m7
+ psadbw m4, m8
+
paddd m0, m9
- psadbw m5, m4, m6
paddd m1, m5
- psadbw m6, m4, m7
paddd m2, m6
- psadbw m4, m8
paddd m3, m4
movu m4, [r0 + FENC_STRIDE]
@@ -4155,12 +4156,12 @@
movu m8, [r4 + r5]
psadbw m9, m4, m5
+ psadbw m5, m4, m6
+ psadbw m6, m4, m7
+ psadbw m4, m8
paddd m0, m9
- psadbw m5, m4, m6
paddd m1, m5
- psadbw m6, m4, m7
paddd m2, m6
- psadbw m4, m8
paddd m3, m4
movu m4, [r0 + FENC_STRIDE * 2]
@@ -4170,12 +4171,13 @@
movu m8, [r4 + r5 * 2]
psadbw m9, m4, m5
+ psadbw m5, m4, m6
+ psadbw m6, m4, m7
+ psadbw m4, m8
+
paddd m0, m9
- psadbw m5, m4, m6
paddd m1, m5
- psadbw m6, m4, m7
paddd m2, m6
- psadbw m4, m8
paddd m3, m4
movu m4, [r0 + FENC_STRIDE * 3]
@@ -4185,314 +4187,135 @@
movu m8, [r4 + r7]
psadbw m9, m4, m5
+ psadbw m5, m4, m6
+ psadbw m6, m4, m7
+ psadbw m4, m8
paddd m0, m9
- psadbw m5, m4, m6
paddd m1, m5
- psadbw m6, m4, m7
paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
-
- movu m4, [r0]
- movu m5, [r1]
- movu m6, [r2]
- movu m7, [r3]
- movu m8, [r4]
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- movu m4, [r0 + FENC_STRIDE]
- movu m5, [r1 + r5]
- movu m6, [r2 + r5]
- movu m7, [r3 + r5]
- movu m8, [r4 + r5]
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- movu m4, [r0 + FENC_STRIDE * 2]
- movu m5, [r1 + r5 * 2]
- movu m6, [r2 + r5 * 2]
- movu m7, [r3 + r5 * 2]
- movu m8, [r4 + r5 * 2]
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- movu m4, [r0 + FENC_STRIDE * 3]
- movu m5, [r1 + r7]
- movu m6, [r2 + r7]
- movu m7, [r3 + r7]
- movu m8, [r4 + r7]
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
paddd m3, m4
%endmacro
-%macro SAD_X4_32x8_AVX512 0
+%macro PROCESS_SAD_X4_32x4_AVX512 0
movu ym4, [r0]
+ movu ym5, [r1]
+ movu ym6, [r2]
+ movu ym7, [r3]
+ movu ym8, [r4]
+
vinserti32x8 m4, [r0 + FENC_STRIDE], 1
- movu ym5, [r1]
vinserti32x8 m5, [r1 + r5], 1
- movu ym6, [r2]
vinserti32x8 m6, [r2 + r5], 1
- movu ym7, [r3]
vinserti32x8 m7, [r3 + r5], 1
- movu ym8, [r4]
vinserti32x8 m8, [r4 + r5], 1
psadbw m9, m4, m5
+ psadbw m5, m4, m6
+ psadbw m6, m4, m7
+ psadbw m4, m8
+
paddd m0, m9
+ paddd m1, m5
+ paddd m2, m6
+ paddd m3, m4
+
+ movu ym4, [r0 + FENC_STRIDE * 2]
+ movu ym5, [r1 + r5 * 2]
+ movu ym6, [r2 + r5 * 2]
+ movu ym7, [r3 + r5 * 2]
+ movu ym8, [r4 + r5 * 2]
+
+ vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
+ vinserti32x8 m5, [r1 + r7], 1
+ vinserti32x8 m6, [r2 + r7], 1
+ vinserti32x8 m7, [r3 + r7], 1
+ vinserti32x8 m8, [r4 + r7], 1
+
+ psadbw m9, m4, m5
psadbw m5, m4, m6
+ psadbw m6, m4, m7
+ psadbw m4, m8
+
+ paddd m0, m9
paddd m1, m5
- psadbw m6, m4, m7
paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- movu ym4, [r0 + FENC_STRIDE * 2]
- vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
- movu ym5, [r1 + r5 * 2]
- vinserti32x8 m5, [r1 + r7], 1
- movu ym6, [r2 + r5 * 2]
- vinserti32x8 m6, [r2 + r7], 1
- movu ym7, [r3 + r5 * 2]
- vinserti32x8 m7, [r3 + r7], 1
- movu ym8, [r4 + r5 * 2]
- vinserti32x8 m8, [r4 + r7], 1
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
-
- movu ym4, [r0]
- vinserti32x8 m4, [r0 + FENC_STRIDE], 1
- movu ym5, [r1]
- vinserti32x8 m5, [r1 + r5], 1
- movu ym6, [r2]
- vinserti32x8 m6, [r2 + r5], 1
- movu ym7, [r3]
- vinserti32x8 m7, [r3 + r5], 1
- movu ym8, [r4]
- vinserti32x8 m8, [r4 + r5], 1
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- movu ym4, [r0 + FENC_STRIDE * 2]
- vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
- movu ym5, [r1 + r5 * 2]
- vinserti32x8 m5, [r1 + r7], 1
- movu ym6, [r2 + r5 * 2]
- vinserti32x8 m6, [r2 + r7], 1
- movu ym7, [r3 + r5 * 2]
- vinserti32x8 m7, [r3 + r7], 1
- movu ym8, [r4 + r5 * 2]
- vinserti32x8 m8, [r4 + r7], 1
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
paddd m3, m4
%endmacro
-%macro SAD_X4_48x8_AVX512 0
+%macro PROCESS_SAD_X4_48x4_AVX512 0
movu ym4, [r0]
+ movu ym5, [r1]
+ movu ym6, [r2]
+ movu ym7, [r3]
+ movu ym8, [r4]
+
vinserti32x8 m4, [r0 + FENC_STRIDE], 1
- movu ym5, [r1]
vinserti32x8 m5, [r1 + r5], 1
- movu ym6, [r2]
vinserti32x8 m6, [r2 + r5], 1
- movu ym7, [r3]
vinserti32x8 m7, [r3 + r5], 1
- movu ym8, [r4]
vinserti32x8 m8, [r4 + r5], 1
psadbw m9, m4, m5
+ psadbw m5, m4, m6
+ psadbw m6, m4, m7
+ psadbw m4, m8
+
paddd m0, m9
+ paddd m1, m5
+ paddd m2, m6
+ paddd m3, m4
+
+ movu ym4, [r0 + FENC_STRIDE * 2]
+ movu ym5, [r1 + r5 * 2]
+ movu ym6, [r2 + r5 * 2]
+ movu ym7, [r3 + r5 * 2]
+ movu ym8, [r4 + r5 * 2]
+
+ vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
+ vinserti32x8 m5, [r1 + r7], 1
+ vinserti32x8 m6, [r2 + r7], 1
+ vinserti32x8 m7, [r3 + r7], 1
+ vinserti32x8 m8, [r4 + r7], 1
+
+ psadbw m9, m4, m5
psadbw m5, m4, m6
+ psadbw m6, m4, m7
+ psadbw m4, m8
+
+ paddd m0, m9
paddd m1, m5
+ paddd m2, m6
+ paddd m3, m4
+
+ movu xm4, [r0 + mmsize/2]
+ movu xm5, [r1 + mmsize/2]
+ movu xm6, [r2 + mmsize/2]
+ movu xm7, [r3 + mmsize/2]
+ movu xm8, [r4 + mmsize/2]
+ vinserti32x4 m4, [r0 + FENC_STRIDE + mmsize/2], 1
+ vinserti32x4 m5, [r1 + r5 + mmsize/2], 1
+ vinserti32x4 m6, [r2 + r5 + mmsize/2], 1
+ vinserti32x4 m7, [r3 + r5 + mmsize/2], 1
+ vinserti32x4 m8, [r4 + r5 + mmsize/2], 1
+
+ vinserti32x4 m4, [r0 + FENC_STRIDE * 2 + mmsize/2], 2
+ vinserti32x4 m5, [r1 + r5 * 2 + mmsize/2], 2
+ vinserti32x4 m6, [r2 + r5 * 2 + mmsize/2], 2
+ vinserti32x4 m7, [r3 + r5 * 2 + mmsize/2], 2
+ vinserti32x4 m8, [r4 + r5 * 2 + mmsize/2], 2
+ vinserti32x4 m4, [r0 + FENC_STRIDE * 3 + mmsize/2], 3
+ vinserti32x4 m5, [r1 + r7 + mmsize/2], 3
+ vinserti32x4 m6, [r2 + r7 + mmsize/2], 3
+ vinserti32x4 m7, [r3 + r7 + mmsize/2], 3
+ vinserti32x4 m8, [r4 + r7 + mmsize/2], 3
+
+ psadbw m9, m4, m5
+ psadbw m5, m4, m6
psadbw m6, m4, m7
+ psadbw m4, m8
+ paddd m0, m9
+ paddd m1, m5
paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- movu ym4, [r0 + FENC_STRIDE * 2]
- vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
- movu ym5, [r1 + r5 * 2]
- vinserti32x8 m5, [r1 + r7], 1
- movu ym6, [r2 + r5 * 2]
- vinserti32x8 m6, [r2 + r7], 1
- movu ym7, [r3 + r5 * 2]
- vinserti32x8 m7, [r3 + r7], 1
- movu ym8, [r4 + r5 * 2]
- vinserti32x8 m8, [r4 + r7], 1
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- movu xm4, [r0 + 32]
- vinserti32x4 m4, [r0 + FENC_STRIDE + 32], 1
- vinserti32x4 m4, [r0 + FENC_STRIDE * 2 + 32], 2
- vinserti32x4 m4, [r0 + FENC_STRIDE * 3 + 32], 3
- movu xm5, [r1 + 32]
- vinserti32x4 m5, [r1 + r5 + 32], 1
- vinserti32x4 m5, [r1 + r5 * 2 + 32], 2
- vinserti32x4 m5, [r1 + r7 + 32], 3
- movu xm6, [r2 + 32]
- vinserti32x4 m6, [r2 + r5 + 32], 1
- vinserti32x4 m6, [r2 + r5 * 2 + 32], 2
- vinserti32x4 m6, [r2 + r7 + 32], 3
- movu xm7, [r3 + 32]
- vinserti32x4 m7, [r3 + r5 + 32], 1
- vinserti32x4 m7, [r3 + r5 * 2 + 32], 2
- vinserti32x4 m7, [r3 + r7 + 32], 3
- movu xm8, [r4 + 32]
- vinserti32x4 m8, [r4 + r5 + 32], 1
- vinserti32x4 m8, [r4 + r5 * 2 + 32], 2
- vinserti32x4 m8, [r4 + r7 + 32], 3
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
-
- movu ym4, [r0]
- vinserti32x8 m4, [r0 + FENC_STRIDE], 1
- movu ym5, [r1]
- vinserti32x8 m5, [r1 + r5], 1
- movu ym6, [r2]
- vinserti32x8 m6, [r2 + r5], 1
- movu ym7, [r3]
- vinserti32x8 m7, [r3 + r5], 1
- movu ym8, [r4]
- vinserti32x8 m8, [r4 + r5], 1
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- movu ym4, [r0 + FENC_STRIDE * 2]
- vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
- movu ym5, [r1 + r5 * 2]
- vinserti32x8 m5, [r1 + r7], 1
- movu ym6, [r2 + r5 * 2]
- vinserti32x8 m6, [r2 + r7], 1
- movu ym7, [r3 + r5 * 2]
- vinserti32x8 m7, [r3 + r7], 1
- movu ym8, [r4 + r5 * 2]
- vinserti32x8 m8, [r4 + r7], 1
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
- paddd m3, m4
-
- movu xm4, [r0 + 32]
- vinserti32x4 m4, [r0 + FENC_STRIDE + 32], 1
- vinserti32x4 m4, [r0 + FENC_STRIDE * 2 + 32], 2
- vinserti32x4 m4, [r0 + FENC_STRIDE * 3 + 32], 3
- movu xm5, [r1 + 32]
- vinserti32x4 m5, [r1 + r5 + 32], 1
- vinserti32x4 m5, [r1 + r5 * 2 + 32], 2
- vinserti32x4 m5, [r1 + r7 + 32], 3
- movu xm6, [r2 + 32]
- vinserti32x4 m6, [r2 + r5 + 32], 1
- vinserti32x4 m6, [r2 + r5 * 2 + 32], 2
- vinserti32x4 m6, [r2 + r7 + 32], 3
- movu xm7, [r3 + 32]
- vinserti32x4 m7, [r3 + r5 + 32], 1
- vinserti32x4 m7, [r3 + r5 * 2 + 32], 2
- vinserti32x4 m7, [r3 + r7 + 32], 3
- movu xm8, [r4 + 32]
- vinserti32x4 m8, [r4 + r5 + 32], 1
- vinserti32x4 m8, [r4 + r5 * 2 + 32], 2
- vinserti32x4 m8, [r4 + r7 + 32], 3
-
- psadbw m9, m4, m5
- paddd m0, m9
- psadbw m5, m4, m6
- paddd m1, m5
- psadbw m6, m4, m7
- paddd m2, m6
- psadbw m4, m8
paddd m3, m4
%endmacro
@@ -4527,341 +4350,38 @@
movd [r6 + 12], xm3
%endmacro
+%macro SAD_X4_AVX512 2
INIT_ZMM avx512
-cglobal pixel_sad_x4_64x16, 7,8,10
+cglobal pixel_sad_x4_%1x%2, 7,8,10
pxor m0, m0
pxor m1, m1
pxor m2, m2
pxor m3, m3
lea r7, [r5 * 3]
- SAD_X4_64x8_AVX512
+%rep %2/4 - 1
+ PROCESS_SAD_X4_%1x4_AVX512
add r0, FENC_STRIDE * 4
lea r1, [r1 + r5 * 4]
lea r2, [r2 + r5 * 4]
lea r3, [r3 + r5 * 4]
lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
+%endrep
+ PROCESS_SAD_X4_%1x4_AVX512
PIXEL_SAD_X4_END_AVX512
RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_64x32, 7,8,10
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- lea r7, [r5 * 3]
-
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- PIXEL_SAD_X4_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_64x48, 7,8,10
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- lea r7, [r5 * 3]
-
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- PIXEL_SAD_X4_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_64x64, 7,8,10
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- lea r7, [r5 * 3]
-
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_64x8_AVX512
- PIXEL_SAD_X4_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x8, 7,8,10
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- lea r7, [r5 * 3]
-
- SAD_X4_32x8_AVX512
- PIXEL_SAD_X4_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x16, 7,8,10
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- lea r7, [r5 * 3]
-
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- PIXEL_SAD_X4_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x24, 7,8,10
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- lea r7, [r5 * 3]
-
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- PIXEL_SAD_X4_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x32, 7,8,10
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- lea r7, [r5 * 3]
-
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- PIXEL_SAD_X4_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_32x64, 7,8,10
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- lea r7, [r5 * 3]
-
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_32x8_AVX512
- PIXEL_SAD_X4_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x4_48x64, 7,8,10
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- lea r7, [r5 * 3]
-
- SAD_X4_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r5 * 4]
- lea r2, [r2 + r5 * 4]
- lea r3, [r3 + r5 * 4]
- lea r4, [r4 + r5 * 4]
- SAD_X4_48x8_AVX512
- PIXEL_SAD_X4_END_AVX512
- RET
+%endmacro
+
+SAD_X4_AVX512 64, 64
+SAD_X4_AVX512 64, 48
+SAD_X4_AVX512 64, 32
+SAD_X4_AVX512 64, 16
+SAD_X4_AVX512 32, 64
+SAD_X4_AVX512 32, 32
+SAD_X4_AVX512 32, 24
+SAD_X4_AVX512 32, 16
+SAD_X4_AVX512 32, 8
+SAD_X4_AVX512 48, 64
;------------------------------------------------------------
;sad_x4 avx512 code end
;------------------------------------------------------------
@@ -6334,17 +5854,18 @@
;------------------------------------------------------------
;sad_x3 avx512 code start
;------------------------------------------------------------
-%macro SAD_X3_64x8_AVX512 0
+%macro PROCESS_SAD_X3_64x4_AVX512 0
movu m3, [r0]
movu m4, [r1]
movu m5, [r2]
movu m6, [r3]
psadbw m7, m3, m4
+ psadbw m4, m3, m5
+ psadbw m3, m6
+
paddd m0, m7
- psadbw m4, m3, m5
paddd m1, m4
- psadbw m3, m6
paddd m2, m3
movu m3, [r0 + FENC_STRIDE]
@@ -6353,10 +5874,11 @@
movu m6, [r3 + r4]
psadbw m7, m3, m4
+ psadbw m4, m3, m5
+ psadbw m3, m6
+
paddd m0, m7
- psadbw m4, m3, m5
paddd m1, m4
- psadbw m3, m6
paddd m2, m3
movu m3, [r0 + FENC_STRIDE * 2]
@@ -6365,10 +5887,11 @@
movu m6, [r3 + r4 * 2]
psadbw m7, m3, m4
+ psadbw m4, m3, m5
+ psadbw m3, m6
+
paddd m0, m7
- psadbw m4, m3, m5
paddd m1, m4
- psadbw m3, m6
paddd m2, m3
movu m3, [r0 + FENC_STRIDE * 3]
@@ -6377,253 +5900,108 @@
movu m6, [r3 + r6]
psadbw m7, m3, m4
+ psadbw m4, m3, m5
+ psadbw m3, m6
+
paddd m0, m7
- psadbw m4, m3, m5
paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
-
- movu m3, [r0]
- movu m4, [r1]
- movu m5, [r2]
- movu m6, [r3]
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- movu m3, [r0 + FENC_STRIDE]
- movu m4, [r1 + r4]
- movu m5, [r2 + r4]
- movu m6, [r3 + r4]
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- movu m3, [r0 + FENC_STRIDE * 2]
- movu m4, [r1 + r4 * 2]
- movu m5, [r2 + r4 * 2]
- movu m6, [r3 + r4 * 2]
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- movu m3, [r0 + FENC_STRIDE * 3]
- movu m4, [r1 + r6]
- movu m5, [r2 + r6]
- movu m6, [r3 + r6]
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
paddd m2, m3
%endmacro
-%macro SAD_X3_32x8_AVX512 0
+%macro PROCESS_SAD_X3_32x4_AVX512 0
movu ym3, [r0]
+ movu ym4, [r1]
+ movu ym5, [r2]
+ movu ym6, [r3]
vinserti32x8 m3, [r0 + FENC_STRIDE], 1
- movu ym4, [r1]
vinserti32x8 m4, [r1 + r4], 1
- movu ym5, [r2]
vinserti32x8 m5, [r2 + r4], 1
- movu ym6, [r3]
vinserti32x8 m6, [r3 + r4], 1
psadbw m7, m3, m4
+ psadbw m4, m3, m5
+ psadbw m3, m6
+
paddd m0, m7
+ paddd m1, m4
+ paddd m2, m3
+
+ movu ym3, [r0 + FENC_STRIDE * 2]
+ movu ym4, [r1 + r4 * 2]
+ movu ym5, [r2 + r4 * 2]
+ movu ym6, [r3 + r4 * 2]
+ vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
+ vinserti32x8 m4, [r1 + r6], 1
+ vinserti32x8 m5, [r2 + r6], 1
+ vinserti32x8 m6, [r3 + r6], 1
+
+ psadbw m7, m3, m4
psadbw m4, m3, m5
+ psadbw m3, m6
+
+ paddd m0, m7
paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- movu ym3, [r0 + FENC_STRIDE * 2]
- vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
- movu ym4, [r1 + r4 * 2]
- vinserti32x8 m4, [r1 + r6], 1
- movu ym5, [r2 + r4 * 2]
- vinserti32x8 m5, [r2 + r6], 1
- movu ym6, [r3 + r4 * 2]
- vinserti32x8 m6, [r3 + r6], 1
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
-
- movu ym3, [r0]
- vinserti32x8 m3, [r0 + FENC_STRIDE], 1
- movu ym4, [r1]
- vinserti32x8 m4, [r1 + r4], 1
- movu ym5, [r2]
- vinserti32x8 m5, [r2 + r4], 1
- movu ym6, [r3]
- vinserti32x8 m6, [r3 + r4], 1
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- movu ym3, [r0 + FENC_STRIDE * 2]
- vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
- movu ym4, [r1 + r4 * 2]
- vinserti32x8 m4, [r1 + r6], 1
- movu ym5, [r2 + r4 * 2]
- vinserti32x8 m5, [r2 + r6], 1
- movu ym6, [r3 + r4 * 2]
- vinserti32x8 m6, [r3 + r6], 1
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
paddd m2, m3
%endmacro
-%macro SAD_X3_48x8_AVX512 0
+%macro PROCESS_SAD_X3_48x4_AVX512 0
movu ym3, [r0]
+ movu ym4, [r1]
+ movu ym5, [r2]
+ movu ym6, [r3]
vinserti32x8 m3, [r0 + FENC_STRIDE], 1
- movu ym4, [r1]
vinserti32x8 m4, [r1 + r4], 1
- movu ym5, [r2]
vinserti32x8 m5, [r2 + r4], 1
- movu ym6, [r3]
vinserti32x8 m6, [r3 + r4], 1
psadbw m7, m3, m4
+ psadbw m4, m3, m5
+ psadbw m3, m6
+
paddd m0, m7
+ paddd m1, m4
+ paddd m2, m3
+
+ movu ym3, [r0 + FENC_STRIDE * 2]
+ movu ym4, [r1 + r4 * 2]
+ movu ym5, [r2 + r4 * 2]
+ movu ym6, [r3 + r4 * 2]
+ vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
+ vinserti32x8 m4, [r1 + r6], 1
+ vinserti32x8 m5, [r2 + r6], 1
+ vinserti32x8 m6, [r3 + r6], 1
+
+ psadbw m7, m3, m4
psadbw m4, m3, m5
+ psadbw m3, m6
+
+ paddd m0, m7
paddd m1, m4
+ paddd m2, m3
+
+ movu xm3, [r0 + mmsize/2]
+ movu xm4, [r1 + mmsize/2]
+ movu xm5, [r2 + mmsize/2]
+ movu xm6, [r3 + mmsize/2]
+ vinserti32x4 m3, [r0 + FENC_STRIDE + mmsize/2], 1
+ vinserti32x4 m4, [r1 + r4 + mmsize/2], 1
+ vinserti32x4 m5, [r2 + r4 + mmsize/2], 1
+ vinserti32x4 m6, [r3 + r4 + mmsize/2], 1
+
+ vinserti32x4 m3, [r0 + 2 * FENC_STRIDE + mmsize/2], 2
+ vinserti32x4 m4, [r1 + 2 * r4 + mmsize/2], 2
+ vinserti32x4 m5, [r2 + 2 * r4 + mmsize/2], 2
+ vinserti32x4 m6, [r3 + 2 * r4 + mmsize/2], 2
+ vinserti32x4 m3, [r0 + 3 * FENC_STRIDE + mmsize/2], 3
+ vinserti32x4 m4, [r1 + r6 + mmsize/2], 3
+ vinserti32x4 m5, [r2 + r6 + mmsize/2], 3
+ vinserti32x4 m6, [r3 + r6 + mmsize/2], 3
+
+ psadbw m7, m3, m4
+ psadbw m4, m3, m5
psadbw m3, m6
- paddd m2, m3
-
- movu ym3, [r0 + FENC_STRIDE * 2]
- vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
- movu ym4, [r1 + r4 * 2]
- vinserti32x8 m4, [r1 + r6], 1
- movu ym5, [r2 + r4 * 2]
- vinserti32x8 m5, [r2 + r6], 1
- movu ym6, [r3 + r4 * 2]
- vinserti32x8 m6, [r3 + r6], 1
-
- psadbw m7, m3, m4
paddd m0, m7
- psadbw m4, m3, m5
paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- movu xm3, [r0 + 32]
- vinserti32x4 m3, [r0 + FENC_STRIDE + 32], 1
- vinserti32x4 m3, [r0 + 2 * FENC_STRIDE + 32], 2
- vinserti32x4 m3, [r0 + 3 * FENC_STRIDE + 32], 3
- movu xm4, [r1 + 32]
- vinserti32x4 m4, [r1 + r4 + 32], 1
- vinserti32x4 m4, [r1 + 2 * r4 + 32], 2
- vinserti32x4 m4, [r1 + r6 + 32], 3
- movu xm5, [r2 + 32]
- vinserti32x4 m5, [r2 + r4 + 32], 1
- vinserti32x4 m5, [r2 + 2 * r4 + 32], 2
- vinserti32x4 m5, [r2 + r6 + 32], 3
- movu xm6, [r3 + 32]
- vinserti32x4 m6, [r3 + r4 + 32], 1
- vinserti32x4 m6, [r3 + 2 * r4 + 32], 2
- vinserti32x4 m6, [r3 + r6 + 32], 3
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
-
- movu ym3, [r0]
- vinserti32x8 m3, [r0 + FENC_STRIDE], 1
- movu ym4, [r1]
- vinserti32x8 m4, [r1 + r4], 1
- movu ym5, [r2]
- vinserti32x8 m5, [r2 + r4], 1
- movu ym6, [r3]
- vinserti32x8 m6, [r3 + r4], 1
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- movu ym3, [r0 + FENC_STRIDE * 2]
- vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
- movu ym4, [r1 + r4 * 2]
- vinserti32x8 m4, [r1 + r6], 1
- movu ym5, [r2 + r4 * 2]
- vinserti32x8 m5, [r2 + r6], 1
- movu ym6, [r3 + r4 * 2]
- vinserti32x8 m6, [r3 + r6], 1
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
- paddd m2, m3
-
- movu xm3, [r0 + 32]
- vinserti32x4 m3, [r0 + FENC_STRIDE + 32], 1
- vinserti32x4 m3, [r0 + 2 * FENC_STRIDE + 32], 2
- vinserti32x4 m3, [r0 + 3 * FENC_STRIDE + 32], 3
- movu xm4, [r1 + 32]
- vinserti32x4 m4, [r1 + r4 + 32], 1
- vinserti32x4 m4, [r1 + 2 * r4 + 32], 2
- vinserti32x4 m4, [r1 + r6 + 32], 3
- movu xm5, [r2 + 32]
- vinserti32x4 m5, [r2 + r4 + 32], 1
- vinserti32x4 m5, [r2 + 2 * r4 + 32], 2
- vinserti32x4 m5, [r2 + r6 + 32], 3
- movu xm6, [r3 + 32]
- vinserti32x4 m6, [r3 + r4 + 32], 1
- vinserti32x4 m6, [r3 + 2 * r4 + 32], 2
- vinserti32x4 m6, [r3 + r6 + 32], 3
-
- psadbw m7, m3, m4
- paddd m0, m7
- psadbw m4, m3, m5
- paddd m1, m4
- psadbw m3, m6
paddd m2, m3
%endmacro
@@ -6651,295 +6029,36 @@
movd [r5 + 8], xm2
%endmacro
+%macro SAD_X3_AVX512 2
INIT_ZMM avx512
-cglobal pixel_sad_x3_64x16, 6,7,8
+cglobal pixel_sad_x3_%1x%2, 6,7,8
pxor m0, m0
pxor m1, m1
pxor m2, m2
lea r6, [r4 * 3]
- SAD_X3_64x8_AVX512
+%rep %2/4 - 1
+ PROCESS_SAD_X3_%1x4_AVX512
add r0, FENC_STRIDE * 4
lea r1, [r1 + r4 * 4]
lea r2, [r2 + r4 * 4]
lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
+%endrep
+ PROCESS_SAD_X3_%1x4_AVX512
PIXEL_SAD_X3_END_AVX512
RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_64x32, 6,7,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- lea r6, [r4 * 3]
-
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- PIXEL_SAD_X3_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_64x48, 6,7,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- lea r6, [r4 * 3]
-
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- PIXEL_SAD_X3_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_64x64, 6,7,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- lea r6, [r4 * 3]
-
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_64x8_AVX512
- PIXEL_SAD_X3_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x8, 6,7,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- lea r6, [r4 * 3]
-
- SAD_X3_32x8_AVX512
- PIXEL_SAD_X3_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x16, 6,7,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- lea r6, [r4 * 3]
-
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- PIXEL_SAD_X3_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x24, 6,7,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- lea r6, [r4 * 3]
-
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- PIXEL_SAD_X3_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x32, 6,7,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- lea r6, [r4 * 3]
-
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- PIXEL_SAD_X3_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_32x64, 6,7,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- lea r6, [r4 * 3]
-
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_32x8_AVX512
- PIXEL_SAD_X3_END_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_x3_48x64, 6,7,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- lea r6, [r4 * 3]
-
- SAD_X3_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_48x8_AVX512
- add r0, FENC_STRIDE * 4
- lea r1, [r1 + r4 * 4]
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r4 * 4]
- SAD_X3_48x8_AVX512
- PIXEL_SAD_X3_END_AVX512
- RET
+%endmacro
+
+SAD_X3_AVX512 64, 64
+SAD_X3_AVX512 64, 48
+SAD_X3_AVX512 64, 32
+SAD_X3_AVX512 64, 16
+SAD_X3_AVX512 32, 64
+SAD_X3_AVX512 32, 32
+SAD_X3_AVX512 32, 24
+SAD_X3_AVX512 32, 16
+SAD_X3_AVX512 32, 8
+SAD_X3_AVX512 48, 64
;------------------------------------------------------------
;sad_x3 avx512 code end
;------------------------------------------------------------
@@ -7565,7 +6684,7 @@
movd eax, xm0
RET
-%macro PROCESS_SAD_64x8_AVX512 0
+%macro PROCESS_SAD_64x4_AVX512 0
movu m1, [r0]
movu m2, [r2]
movu m3, [r0 + r1]
@@ -7573,7 +6692,7 @@
psadbw m1, m2
psadbw m3, m4
paddd m0, m1
- paddd m5, m3
+ paddd m0, m3
movu m1, [r0 + 2 * r1]
movu m2, [r2 + 2 * r3]
movu m3, [r0 + r5]
@@ -7581,53 +6700,16 @@
psadbw m1, m2
psadbw m3, m4
paddd m0, m1
- paddd m5, m3
-
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
-
- movu m1, [r0]
- movu m2, [r2]
- movu m3, [r0 + r1]
- movu m4, [r2 + r3]
- psadbw m1, m2
- psadbw m3, m4
- paddd m0, m1
- paddd m5, m3
- movu m1, [r0 + 2 * r1]
- movu m2, [r2 + 2 * r3]
- movu m3, [r0 + r5]
- movu m4, [r2 + r6]
- psadbw m1, m2
- psadbw m3, m4
- paddd m0, m1
- paddd m5, m3
+ paddd m0, m3
%endmacro
-%macro PROCESS_SAD_32x8_AVX512 0
+%macro PROCESS_SAD_32x4_AVX512 0
movu ym1, [r0]
- movu ym2, [r2]
- vinserti32x8 m1, [r0 + r1], 1
- vinserti32x8 m2, [r2 + r3], 1
+ movu ym2, [r2]
movu ym3, [r0 + 2 * r1]
movu ym4, [r2 + 2 * r3]
- vinserti32x8 m3, [r0 + r5], 1
- vinserti32x8 m4, [r2 + r6], 1
-
- psadbw m1, m2
- psadbw m3, m4
- paddd m0, m1
- paddd m0, m3
-
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
-
- movu ym1, [r0]
- movu ym2, [r2]
vinserti32x8 m1, [r0 + r1], 1
vinserti32x8 m2, [r2 + r3], 1
- movu ym3, [r0 + 2 * r1]
- movu ym4, [r2 + 2 * r3]
vinserti32x8 m3, [r0 + r5], 1
vinserti32x8 m4, [r2 + r6], 1
@@ -7649,191 +6731,30 @@
;-----------------------------------------------------------------------------
; int pixel_sad_64x%1( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
+%macro SAD_MxN_AVX512 2
INIT_ZMM avx512
-cglobal pixel_sad_64x16, 4,5,6
- xorps m0, m0
- xorps m5, m5
+cglobal pixel_sad_%1x%2, 4, 7, 5
+ pxor m0, m0
lea r5, [3 * r1]
lea r6, [3 * r3]
- PROCESS_SAD_64x8_AVX512
+%rep %2/4 - 1
+ PROCESS_SAD_%1x4_AVX512
lea r2, [r2 + 4 * r3]
lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- paddd m0, m5
+%endrep
+ PROCESS_SAD_%1x4_AVX512
PROCESS_SAD_AVX512_END
RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_64x32, 4,5,6
- xorps m0, m0
- xorps m5, m5
- lea r5, [3 * r1]
- lea r6, [3 * r3]
-
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- paddd m0, m5
- PROCESS_SAD_AVX512_END
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_64x48, 4,5,6
- xorps m0, m0
- xorps m5, m5
- lea r5, [3 * r1]
- lea r6, [3 * r3]
-
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- paddd m0, m5
- PROCESS_SAD_AVX512_END
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_64x64, 4,5,6
- xorps m0, m0
- xorps m5, m5
- lea r5, [3 * r1]
- lea r6, [3 * r3]
-
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_64x8_AVX512
- paddd m0, m5
- PROCESS_SAD_AVX512_END
- RET
-
-;-----------------------------------------------------------------------------
-; int pixel_sad_32x%1( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-INIT_ZMM avx512
-cglobal pixel_sad_32x8, 4,7,5
- xorps m0, m0
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
-
- PROCESS_SAD_32x8_AVX512
- PROCESS_SAD_AVX512_END
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_32x16, 4,7,5
- xorps m0, m0
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
-
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- PROCESS_SAD_AVX512_END
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_32x24, 4,7,5
- xorps m0, m0
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
-
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- PROCESS_SAD_AVX512_END
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_32x32, 4,7,5
- xorps m0, m0
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
-
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- PROCESS_SAD_AVX512_END
- RET
-
-INIT_ZMM avx512
-cglobal pixel_sad_32x64, 4,7,5
- xorps m0, m0
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
-
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- lea r2, [r2 + 4 * r3]
- lea r0, [r0 + 4 * r1]
- PROCESS_SAD_32x8_AVX512
- PROCESS_SAD_AVX512_END
- RET
+%endmacro
+
+SAD_MxN_AVX512 64, 16
+SAD_MxN_AVX512 64, 32
+SAD_MxN_AVX512 64, 48
+SAD_MxN_AVX512 64, 64
+SAD_MxN_AVX512 32, 8
+SAD_MxN_AVX512 32, 16
+SAD_MxN_AVX512 32, 24
+SAD_MxN_AVX512 32, 32
+SAD_MxN_AVX512 32, 64
%endif
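
The plain single-candidate SAD path above follows the same pattern: PROCESS_SAD_%1x4_AVX512 handles four rows, SAD_MxN_AVX512 repeats it with %rep, and PROCESS_SAD_AVX512_END folds the psadbw partials into the scalar result. A scalar model of what each generated pixel_sad_WxH returns, with an illustrative prototype:

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of a single-candidate SAD with independent source and
 * reference strides; the AVX512 kernels accumulate per-lane psadbw partials
 * in m0 and reduce them to this value at the end. */
static int sad_ref(int width, int height,
                   const uint8_t *pix1, intptr_t stride1,
                   const uint8_t *pix2, intptr_t stride2)
{
    int sum = 0;
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += stride1;
        pix2 += stride2;
    }
    return sum;
}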