[x265] [PATCH 158 of 307] x86: dct8 PASS1 optimize for shuffle instructions
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:32:36 CEST 2018
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1510574887 28800
# Mon Nov 13 04:08:07 2017 -0800
# Node ID a7ce91c5db95ac0eb3f58b5c993ace3bfe0bbe2f
# Parent 4b01781203a4e7a08cee94346f52a24ac78a3478
x86: dct8 PASS1 optimize for shuffle instructions
diff -r 4b01781203a4 -r a7ce91c5db95 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Sun Nov 12 23:17:58 2017 -0800
+++ b/source/common/x86/dct8.asm Mon Nov 13 04:08:07 2017 -0800
@@ -34,6 +34,7 @@
dct8_shuf6_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30
dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
dct8_shuf_AVX512: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11
@@ -2306,20 +2307,19 @@
%macro DCT8_AVX512_PASS_1 4
vpmaddwd m%2, m3, m%1
- vpshufb m8, m%2, m6
+ vpsrlq m8, m%2, 32
vpaddd m%2, m8
- vpermd m%2, m17, m%2
-
- vpmaddwd m%4, m2, m%3
- vpshufb m8, m%4, m6
- vpaddd m%4, m8
- vpermd m%4, m17, m%4
-
- vinserti64x4 m%2, m%2, ym%4, 1
vpaddd m%2, m5
vpsrad m%2, DCT8_SHIFT1
- vpackssdw m%2, m%2
- vpermq m%2, m19, m%2
+
+ vpmaddwd m%4, m2, m%3
+ vpsrlq m8, m%4, 32
+ vpaddd m%4, m8
+ vpaddd m%4, m5
+ vpsrad m%4, DCT8_SHIFT1
+
+ vpackssdw m%2, m%4
+ vpermw m%2, m1, m%2
%endmacro
%macro DCT8_AVX512_PASS_2 4
@@ -2423,6 +2423,8 @@
vpaddw m3, m2, m0
vpsubw m2, m0
+ vbroadcasti32x8 m1, [dct8_shuf7_AVX512]
+
; Load all the coefficients togather for better caching
vpbroadcastq m20, [r6 + 0 * 8]
vpbroadcastq m21, [r6 + 1 * 8]
More information about the x265-devel mailing list