[x265] [PATCH 137 of 307] x86: dct8x8 avx512 asm kernel - improved by 5% over avx2
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:32:15 CEST 2018
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1509515697 -19800
# Wed Nov 01 11:24:57 2017 +0530
# Node ID 709fffa188b3160e05576b7ec77525db2776d146
# Parent 9368a50489d05ef0887f75d9f7697ce69a7e2bf3
x86: dct8x8 avx512 asm kernel - improved by 5% over avx2
diff -r 9368a50489d0 -r 709fffa188b3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Oct 30 17:07:09 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Nov 01 11:24:57 2017 +0530
@@ -2635,6 +2635,8 @@
p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
+ p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
+
}
#endif
}
@@ -4567,6 +4569,8 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512);
+ p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
+
}
#endif
}
diff -r 9368a50489d0 -r 709fffa188b3 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Oct 30 17:07:09 2017 +0530
+++ b/source/common/x86/const-a.asm Wed Nov 01 11:24:57 2017 +0530
@@ -28,7 +28,7 @@
%include "x86inc.asm"
-SECTION_RODATA 32
+SECTION_RODATA 64
;; 8-bit constants
diff -r 9368a50489d0 -r 709fffa188b3 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Mon Oct 30 17:07:09 2017 +0530
+++ b/source/common/x86/dct8.asm Wed Nov 01 11:24:57 2017 +0530
@@ -28,7 +28,15 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA 32
+SECTION_RODATA 64
+
+dct8_shuf5_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
+dct8_shuf6_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
+dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
+dct8_shuf_AVX512: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11
+
tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64
dw 89, 75, 50, 18, -18, -50, -75, -89
dw 83, 36, -36, -83, -83, -36, 36, 83
@@ -38,7 +46,10 @@
dw 36, -83, 83, -36, -36, 83, -83, 36
dw 18, -50, 75, -89, 89, -75, 50, -18
-dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
+tab_dct8_avx512: dw 64, 64, 64, 64, 89, 75, 50, 18
+ dw 83, 36, -36, -83, 75, -18, -89, -50
+ dw 64, -64, -64, 64, 50, -89, 18, 75
+ dw 36, -83, 83, -36, 18, -50, 75, -89
tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64
dw 90, 87, 80, 70, 57, 43, 25, 9
@@ -57,7 +68,6 @@
dw 18, -50, 75, -89, 89, -75, 50, -18
dw 9, -25, 43, -57, 70, -80, 87, -90
-
tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64
dw -9, -25, -43, -57, -70, -80, -87, -90
dw -89, -75, -50, -18, 18, 50, 75, 89
@@ -2270,6 +2280,162 @@
movu [r1 + 96], m10
RET
+
+%macro DCT8_AVX512_PASS_1 4
+ vpmaddwd m%2, m3, m%1
+ vpshufb m8, m%2, m6
+ vpaddd m%2, m8
+ vpermd m%2, m17, m%2
+
+ vpmaddwd m%4, m2, m%3
+ vpshufb m8, m%4, m6
+ vpaddd m%4, m8
+ vpermd m%4, m17, m%4
+
+ vinserti64x4 m%2, m%2, ym%4, 1
+ vpaddd m%2, m5
+ vpsrad m%2, DCT8_SHIFT1
+ vpackssdw m%2, m%2
+ vpermq m%2, m1, m%2
+%endmacro
+
+%macro DCT8_AVX512_PASS_2 4
+ vpmaddwd m0, m9, m%1
+ vpmaddwd m1, m10, m%1
+ vpshufb m2, m0, m6
+ vpshufb m3, m1, m6
+ vpaddd m0, m2
+ vpaddd m1, m3
+ vpermd m0, m18, m0
+ vpermd m1, m18, m1
+ vinserti64x4 m0, m0, ym1, 1
+ vpshufb m1, m0, m6
+ vpaddd m0, m1
+ vpermd m0, m18, m0
+
+ vpmaddwd m1, m9, m%2
+ vpmaddwd m2, m10, m%2
+ vpshufb m3, m1, m6
+ vpshufb m4, m2, m6
+ vpaddd m1, m3
+ vpaddd m2, m4
+ vpermd m1, m18, m1
+ vpermd m2, m18, m2
+ vinserti64x4 m1, m1, ym2, 1
+ vpshufb m2, m1, m6
+ vpaddd m1, m2
+ vpermd m1, m18, m1
+
+ vinserti64x4 m0, m0, ym1, 1
+ vpaddd m0, m5
+ vpsrad m0, DCT8_SHIFT2
+
+ vpmaddwd m1, m9, m%3
+ vpmaddwd m2, m10, m%3
+ vpshufb m3, m1, m6
+ vpshufb m4, m2, m6
+ vpaddd m1, m3
+ vpaddd m2, m4
+ vpermd m1, m18, m1
+ vpermd m2, m18, m2
+ vinserti64x4 m1, m1, ym2, 1
+ vpshufb m2, m1, m6
+ vpaddd m1, m2
+ vpermd m1, m18, m1
+
+ vpmaddwd m2, m9, m%4
+ vpmaddwd m3, m10, m%4
+ vpshufb m4, m2, m6
+ vpshufb m7, m3, m6
+ vpaddd m2, m4
+ vpaddd m3, m7
+ vpermd m2, m18, m2
+ vpermd m3, m18, m3
+ vinserti64x4 m2, m2, ym3, 1
+ vpshufb m3, m2, m6
+ vpaddd m2, m3
+ vpermd m2, m18, m2
+
+ vinserti64x4 m1, m1, ym2, 1
+ vpaddd m1, m5
+ vpsrad m1, DCT8_SHIFT2
+
+ vpackssdw m0, m1
+ vpermq m0, m19, m0
+%endmacro
+
+INIT_ZMM avx512
+cglobal dct8, 3, 7, 28
+
+ vbroadcasti32x4 m5, [pd_ %+ DCT8_ROUND1]
+ vbroadcasti32x4 m6, [dct8_shuf_AVX512]
+ vbroadcasti32x8 m18, [dct8_shuf4_AVX512]
+ vbroadcasti32x8 m4, [dct8_shuf]
+ mova m19, [dct8_shuf5_AVX512]
+ mova m17, [dct8_shuf8_AVX512]
+
+ add r2d, r2d
+ lea r3, [r2 * 3]
+ lea r4, [r0 + r2 * 4]
+ lea r5, [tab_dct8]
+ lea r6, [tab_dct8_avx512]
+
+ ;pass1
+ mova xm0, [r0]
+ vinserti128 ym0, ym0, [r4], 1
+ mova xm1, [r0 + r2]
+ vinserti128 ym1, ym1, [r4 + r2], 1
+ mova xm2, [r0 + r2 * 2]
+ vinserti128 ym2, ym2, [r4 + r2 * 2], 1
+ mova xm3, [r0 + r3]
+ vinserti128 ym3, ym3, [r4 + r3], 1
+
+ vinserti64x4 m0, m0, ym2, 1
+ vinserti64x4 m1, m1, ym3, 1
+
+ vpunpcklqdq m2, m0, m1
+ vpunpckhqdq m0, m1
+
+ vpshufb m0, m4
+ vpaddw m3, m2, m0
+ vpsubw m2, m0
+ mova m1, [dct8_shuf6_AVX512]
+
+ ; Load all the coefficients together for better caching
+ vpbroadcastq m20, [r6 + 0 * 8]
+ vpbroadcastq m21, [r6 + 1 * 8]
+ vpbroadcastq m22, [r6 + 2 * 8]
+ vpbroadcastq m23, [r6 + 3 * 8]
+ vpbroadcastq m24, [r6 + 4 * 8]
+ vpbroadcastq m25, [r6 + 5 * 8]
+ vpbroadcastq m26, [r6 + 6 * 8]
+ vpbroadcastq m27, [r6 + 7 * 8]
+
+ DCT8_AVX512_PASS_1 20, 9, 21, 10
+ DCT8_AVX512_PASS_1 22, 11, 23, 12
+ DCT8_AVX512_PASS_1 24, 13, 25, 14
+ DCT8_AVX512_PASS_1 26, 15, 27, 16
+
+ ;pass2
+ vbroadcasti32x4 m5, [pd_ %+ DCT8_ROUND2]
+
+ vinserti64x4 m9, m9, ym11, 1
+ vinserti64x4 m10, m13, ym15, 1
+
+ ; Load all the coefficients together for better caching
+ vbroadcasti32x4 m21, [r5 + 1 * 16]
+ vbroadcasti32x4 m22, [r5 + 2 * 16]
+ vbroadcasti32x4 m23, [r5 + 3 * 16]
+ vbroadcasti32x4 m25, [r5 + 5 * 16]
+ vbroadcasti32x4 m26, [r5 + 6 * 16]
+ vbroadcasti32x4 m27, [r5 + 7 * 16]
+
+ DCT8_AVX512_PASS_2 20, 21, 22, 23
+ movu [r1], m0
+ DCT8_AVX512_PASS_2 24, 25, 26, 27
+ movu [r1 + 64], m0
+ RET
+
%macro DCT16_PASS_1_E 2
vpbroadcastq m7, [r7 + %1]
diff -r 9368a50489d0 -r 709fffa188b3 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Mon Oct 30 17:07:09 2017 +0530
+++ b/source/common/x86/dct8.h Wed Nov 01 11:24:57 2017 +0530
@@ -43,4 +43,6 @@
void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
+void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+
#endif // ifndef X265_DCT8_H
More information about the x265-devel
mailing list