[x264-devel] [Git][videolan/x264][master] 6 commits: x86inc: Properly sort instructions in alphabetical order
Henrik Gramner (@gramner)
gitlab at videolan.org
Tue Feb 20 00:39:51 UTC 2024
Henrik Gramner pushed to branch master at VideoLAN / x264
Commits:
436be41f by Henrik Gramner at 2024-02-19T23:49:36+01:00
x86inc: Properly sort instructions in alphabetical order
- - - - -
5207a74e by Henrik Gramner at 2024-02-20T00:02:59+01:00
x86inc: Add template defines for EVEX broadcasts
Broadcasting a memory operand is a binary flag: you either broadcast
or you don't, and there is only a single possible element size for
any given instruction.
The instruction syntax, however, requires the broadcast semantics
to be spelled out explicitly, which is an issue when using macros to
template code for multiple register widths.
Add some helper defines to alleviate the issue.
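A minimal sketch of the intent (hypothetical macro and operands, not
part of the patch): with the new defines, the same templated body
assembles for any register width, because bcstd expands to the 1toN
decorator matching the current INIT_XMM/YMM/ZMM mode.

    ; Hypothetical helper: broadcast-add a single dword from memory.
    %macro ADD_BCAST 1 ; %1 = source pointer register
        vpaddd m0, m1, [%1]{bcstd} ; bcstd -> 1to4/1to8/1to16
    %endmacro

    INIT_ZMM avx512
    ADD_BCAST r2 ; expands to vpaddd m0, m1, [r2]{1to16}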
- - - - -
a6b56179 by Henrik Gramner at 2024-02-20T00:03:09+01:00
x86inc: Add CLMUL cpu flag
Also make the GFNI cpu flag imply the presence of both AESNI and CLMUL.
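A hedged usage sketch (hypothetical function name and signature): code
guarded by cpuflag(clmul) is now also reached by functions declared
with the gfni suffix, since gfni implies clmul.

    INIT_XMM gfni
    cglobal clmul_example, 2,2,2 ; hypothetical name/signature
    %if cpuflag(clmul)           ; true for clmul, gfni, avx512icl, ...
        pclmulqdq m0, m1, 0x00   ; carry-less multiply of low qwords
    %endif
        RET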
- - - - -
87476b4c by Henrik Gramner at 2024-02-20T00:03:09+01:00
x86inc: Add a cpu flag for the Ice Lake AVX-512 subset
- - - - -
6fc4480c by Henrik Gramner at 2024-02-20T00:03:09+01:00
x86inc.asm: Add the crc32 SSE4.2 GPR instruction
- - - - -
12426f5f by Henrik Gramner at 2024-02-20T00:03:09+01:00
x86inc: Add support for ELF CET properties
Automatically flag x86-64 asm object files as SHSTK-compatible.
Shadow Stack (SHSTK) is a part of Control-flow Enforcement Technology
(CET), a feature aimed at defending against ROP attacks by verifying
that 'call' and 'ret' instructions are correctly matched.
For well-written code this works transparently without any code changes,
as return addresses popped from the shadow stack should already match
those popped from the normal stack; keeping 'call' and 'ret' pairs
balanced is something such code does for performance reasons (return
address prediction) anyway.
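As a side note (assuming GNU binutils), the added property can be
inspected on an assembled object file with 'readelf -n', which should
list a GNU_PROPERTY_TYPE_0 note reporting the x86 feature SHSTK.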
- - - - -
1 changed file:
- common/x86/x86inc.asm
Changes:
=====================================
common/x86/x86inc.asm
=====================================
@@ -852,9 +852,26 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%1: %2
%endmacro
-; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
+ ; The GNU linker assumes the stack is executable by default.
[SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+
+ %ifdef __NASM_VER__
+ %if __NASM_VER__ >= 0x020e0300 ; 2.14.03
+ %if ARCH_X86_64
+ ; Control-flow Enforcement Technology (CET) properties.
+ [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
+ dd 0x00000004 ; n_namesz
+ dd gprsize + 8 ; n_descsz
+ dd 0x00000005 ; n_type = NT_GNU_PROPERTY_TYPE_0
+ db "GNU",0 ; n_name
+ dd 0xc0000002 ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
+ dd 0x00000004 ; pr_datasz
+ dd 0x00000002 ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
+ dd 0x00000000 ; pr_padding
+ %endif
+ %endif
+ %endif
%endif
; Tell debuggers how large the function was.
@@ -877,33 +894,35 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
; cpuflags
-%assign cpuflags_mmx (1<<0)
-%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
-%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
-%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
-%assign cpuflags_sse (1<<4) | cpuflags_mmx2
-%assign cpuflags_sse2 (1<<5) | cpuflags_sse
-%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
-%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
-%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
-%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
-%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
-%assign cpuflags_aesni (1<<12)| cpuflags_sse42
-%assign cpuflags_gfni (1<<13)| cpuflags_sse42
-%assign cpuflags_avx (1<<14)| cpuflags_sse42
-%assign cpuflags_xop (1<<15)| cpuflags_avx
-%assign cpuflags_fma4 (1<<16)| cpuflags_avx
-%assign cpuflags_fma3 (1<<17)| cpuflags_avx
-%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1
-%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2
-%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
-
-%assign cpuflags_cache32 (1<<22)
-%assign cpuflags_cache64 (1<<23)
-%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<25)
+%assign cpuflags_mmx (1<<0)
+%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2 (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3
+%assign cpuflags_sse42 (1<<11) | cpuflags_sse4
+%assign cpuflags_aesni (1<<12) | cpuflags_sse42
+%assign cpuflags_clmul (1<<13) | cpuflags_sse42
+%assign cpuflags_gfni (1<<14) | cpuflags_aesni|cpuflags_clmul
+%assign cpuflags_avx (1<<15) | cpuflags_sse42
+%assign cpuflags_xop (1<<16) | cpuflags_avx
+%assign cpuflags_fma4 (1<<17) | cpuflags_avx
+%assign cpuflags_fma3 (1<<18) | cpuflags_avx
+%assign cpuflags_bmi1 (1<<19) | cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<20) | cpuflags_bmi1
+%assign cpuflags_avx2 (1<<21) | cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
+
+%assign cpuflags_cache32 (1<<24)
+%assign cpuflags_cache64 (1<<25)
+%assign cpuflags_aligned (1<<26) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<27)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -1041,6 +1060,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%if WIN64
AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
%endif
+ %xdefine bcstw 1to8
+ %xdefine bcstd 1to4
+ %xdefine bcstq 1to2
%endmacro
%macro INIT_YMM 0-1+
@@ -1054,6 +1076,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
INIT_CPUFLAGS %1
DEFINE_MMREGS ymm
AVX512_MM_PERMUTATION
+ %xdefine bcstw 1to16
+ %xdefine bcstd 1to8
+ %xdefine bcstq 1to4
%endmacro
%macro INIT_ZMM 0-1+
@@ -1067,6 +1092,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
INIT_CPUFLAGS %1
DEFINE_MMREGS zmm
AVX512_MM_PERMUTATION
+ %xdefine bcstw 1to32
+ %xdefine bcstd 1to16
+ %xdefine bcstq 1to8
%endmacro
INIT_XMM
@@ -1588,18 +1616,18 @@ AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
-AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packssdw, mmx, 0, 0, 0
-AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
-AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
@@ -1607,71 +1635,71 @@ AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR pblendw, sse4, 0, 1, 0
-AVX_INSTR pclmulqdq, fnord, 0, 1, 0
-AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
-AVX_INSTR pcmpestri, sse42
-AVX_INSTR pcmpestrm, sse42
-AVX_INSTR pcmpistri, sse42
-AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
+AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
+AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
+AVX_INSTR pclmulqdq, clmul, 0, 1, 0
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
-AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
-AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
-AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
-AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4, 0, 1, 0
AVX_INSTR pinsrd, sse4, 0, 1, 0
AVX_INSTR pinsrq, sse4, 0, 1, 0
AVX_INSTR pinsrw, mmx2, 0, 1, 0
-AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaxsb, sse4, 0, 0, 1
-AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
-AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
-AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
-AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
-AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
-AVX_INSTR pmovsxdq, sse4
-AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
-AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
-AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
@@ -1680,35 +1708,35 @@ AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
-AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
-AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
-AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
-AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
-AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
-AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
-AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
-AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
-AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1
AVX_INSTR rcpss, sse, 1, 0, 0
@@ -1740,8 +1768,8 @@ AVX_INSTR xorps, sse, 1, 0, 1
; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 3dnow, 1, 0, 1
-AVX_INSTR pfsub, 3dnow, 1, 0, 0
AVX_INSTR pfmul, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
;%1 == instruction
;%2 == minimal instruction set
@@ -1763,9 +1791,10 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1
GPR_INSTR andn, bmi1
GPR_INSTR bextr, bmi1
GPR_INSTR blsi, bmi1
-GPR_INSTR blsr, bmi1
GPR_INSTR blsmsk, bmi1
+GPR_INSTR blsr, bmi1
GPR_INSTR bzhi, bmi2
+GPR_INSTR crc32, sse42
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
@@ -1806,9 +1835,9 @@ GPR_INSTR shrx, bmi2
%endmacro
%endmacro
-FMA_INSTR pmacsww, pmullw, paddw
-FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
-FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/4815ccadb1890572f2bf8b9d9553d56f6c9122ad...12426f5f4906e34d483a07da0debb6d56c6f8434