[x264-devel] x86: Always use PIC in x86-64 asm
Henrik Gramner
git at videolan.org
Tue Mar 12 19:31:47 CET 2019
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Aug 12 17:00:13 2018 +0200| [275ef5332dffec445a0c5a78dbc00c3e0766011d] | committer: Anton Mitrofanov
x86: Always use PIC in x86-64 asm
Most x86-64 operating systems nowadays don't even allow .text relocations
in object files any more, and there is no measurable overall performance
difference from using RIP-relative addressing in x264 asm.
Enforcing PIC reduces complexity and simplifies testing.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=275ef5332dffec445a0c5a78dbc00c3e0766011d
---
common/x86/cabac-a.asm | 47 ++++++++++++-----------------------------------
common/x86/mc-a.asm | 4 ++--
common/x86/pixel-a.asm | 6 +++---
common/x86/predict-a.asm | 2 +-
common/x86/quant-a.asm | 17 ++++++-----------
common/x86/sad-a.asm | 2 +-
common/x86/trellis-64.asm | 12 ------------
configure | 6 +++---
8 files changed, 28 insertions(+), 68 deletions(-)
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index fcafd9c4..f9d223a3 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -36,11 +36,7 @@ SECTION_RODATA 64
%xdefine %%funccpu2 %3 ; last64
%xdefine %%funccpu3 %4 ; last15/last16
coeff_last_%1:
- %ifdef PIC
- %xdefine %%base coeff_last_%1 ; offset relative to the start of the table
- %else
- %xdefine %%base 0 ; absolute address
- %endif
+ %xdefine %%base coeff_last_%1
%rep 14
%ifidn %5, 4
dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu1) - %%base
@@ -121,15 +117,13 @@ struc cb
endstruc
%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
-%ifdef PIC
- %ifidn %4, 0
- movzx %1, byte [%2+%3+r7-$$]
- %else
- lea %5, [r7+%4]
- movzx %1, byte [%2+%3+%5-$$]
- %endif
-%else
+%if ARCH_X86_64 == 0
movzx %1, byte [%2+%3+%4]
+%elifidn %4, 0
+ movzx %1, byte [%2+%3+r7-$$]
+%else
+ lea %5, [r7+%4]
+ movzx %1, byte [%2+%3+%5-$$]
%endif
%endmacro
@@ -154,9 +148,9 @@ cglobal cabac_encode_decision_%1, 1,7
shr t5d, 6
movifnidn t2d, r2m
%if WIN64
- PUSH r7
+ PUSH r7
%endif
-%ifdef PIC
+%if ARCH_X86_64
lea r7, [$$]
%endif
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
@@ -183,7 +177,7 @@ cglobal cabac_encode_decision_%1, 1,7
shl t6d, t3b
%endif
%if WIN64
- POP r7
+ POP r7
%endif
mov [t0+cb.range], t4d
add t3d, [t0+cb.queue]
@@ -278,6 +272,7 @@ cabac_putbyte_%1:
CABAC asm
CABAC bmi2
+%if ARCH_X86_64
; %1 = label name
; %2 = node_ctx init?
%macro COEFF_ABS_LEVEL_GT1 2
@@ -409,13 +404,9 @@ CABAC bmi2
%endmacro
%macro COEFF_LAST 2 ; table, ctx_block_cat
-%ifdef PIC
lea r1, [%1 GLOBAL]
movsxd r6, [r1+4*%2]
add r6, r1
-%else
- movsxd r6, [%1+4*%2]
-%endif
call r6
%endmacro
@@ -436,15 +427,9 @@ CABAC bmi2
%define dct r4
%endif
-%ifdef PIC
- cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
+cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
lea r12, [$$]
%define GLOBAL +r12-$$
-%else
- cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF
- %define GLOBAL
-%endif
-
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8
@@ -554,7 +539,6 @@ CABAC bmi2
RET
%endmacro
-%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
@@ -575,7 +559,6 @@ INIT_YMM avx512
CABAC_RESIDUAL_RD 0, coeff_last_avx512
INIT_ZMM avx512
CABAC_RESIDUAL_RD 1, coeff_last_avx512
-%endif
;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
@@ -653,15 +636,10 @@ CABAC_RESIDUAL_RD 1, coeff_last_avx512
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15,0,-4*64
-%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
lea r7, [$$]
%define lastm [rsp+4*1]
%define GLOBAL +r7-$$
-%else
- %define lastm r7d
- %define GLOBAL
-%endif
shl r1d, 4
%define sigoffq r8
@@ -779,7 +757,6 @@ cglobal cabac_block_residual_internal, 4,15,0,-4*64
RET
%endmacro
-%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM lzcnt
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 48a201b1..440cf4d5 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -1331,7 +1331,7 @@ cglobal pixel_avg2_w16_cache64_ssse3
sub r4, r2
shl r6, 4 ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
-%ifdef PIC
+%if ARCH_X86_64
lea r7, [avg_w16_addr]
add r6, r7
%else
@@ -2020,7 +2020,7 @@ cglobal mc_chroma
%if cpuflag(cache64)
mov t0d, r3d
and t0d, 7
-%ifdef PIC
+%if ARCH_X86_64
lea t1, [ch_shuf_adj]
movddup xm5, [t1 + t0*4]
%else
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 1dfb2897..101a2ae5 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -2866,7 +2866,7 @@ cglobal intra_satd_x3_8x8c, 0,6
; output the predicted samples
mov r3d, eax
shr r3d, 16
-%ifdef PIC
+%if ARCH_X86_64
lea r2, [%2_lut]
movzx r2d, byte [r2+r3]
%else
@@ -5103,7 +5103,7 @@ cglobal pixel_ssim_end4, 2,3
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
-%ifdef PIC
+%if ARCH_X86_64
lea r3, [mask_ff + 16]
%xdefine %%mask r3
%else
@@ -5553,7 +5553,7 @@ ads_mvs_ssse3:
add r5, r6
xor r0d, r0d ; nmv
mov [r5], r0d
-%ifdef PIC
+%if ARCH_X86_64
lea r1, [$$]
%define GLOBAL +r1-$$
%else
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index cf2b4649..7d7495ba 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -688,7 +688,7 @@ INIT_XMM cpuname
je .fix_lt_2
.do_top:
and r2d, 4
-%ifdef PIC
+%if ARCH_X86_64
lea r3, [shuf_fixtr]
pshufb m3, [r3+r2*4]
%else
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index cf15badd..e168a564 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -673,7 +673,7 @@ cglobal dequant_%1x%1_flat16, 0,3
sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %2
-%ifdef PIC
+%if ARCH_X86_64
lea r1, [dequant%1_scale]
add r1, t2
%else
@@ -761,7 +761,7 @@ DEQUANT 8, 6, 4
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %1
%if %2
-%ifdef PIC
+%if ARCH_X86_64
%define dmf r1+t2
lea r1, [dequant8_scale]
%else
@@ -1449,7 +1449,7 @@ cglobal decimate_score%1, 1,3
shr edx, 1
%endif
%endif
-%ifdef PIC
+%if ARCH_X86_64
lea r4, [decimate_mask_table4]
%define mask_table r4
%else
@@ -1580,16 +1580,11 @@ cglobal decimate_score64, 1,5
add eax, r3d
jnz .ret9
%endif
-%ifdef PIC
- lea r4, [decimate_table8]
- %define table r4
-%else
- %define table decimate_table8
-%endif
+ lea r4, [decimate_table8]
mov al, -6
.loop:
tzcnt rcx, r1
- add al, byte [table + rcx]
+ add al, byte [r4 + rcx]
jge .ret9
shr r1, 1
SHRX r1, rcx
@@ -2165,7 +2160,7 @@ COEFF_LEVELRUN 16
%macro COEFF_LEVELRUN_LUT 1
cglobal coeff_level_run%1,2,4+(%1/9)
-%ifdef PIC
+%if ARCH_X86_64
lea r5, [$$]
%define GLOBAL +r5-$$
%else
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index e98d6132..741aa95b 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -1920,7 +1920,7 @@ cglobal pixel_sad_16x%2_cache64_%1
shl r4d, 4 ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
-%ifdef PIC
+%if ARCH_X86_64
lea r5, [sad_w16_addr]
add r5, r4
%else
diff --git a/common/x86/trellis-64.asm b/common/x86/trellis-64.asm
index 590d5518..62ba781b 100644
--- a/common/x86/trellis-64.asm
+++ b/common/x86/trellis-64.asm
@@ -202,7 +202,6 @@ cglobal %1, 4,15,9
paddd m6, m6
%define unquant_mf m6
%endif
-%ifdef PIC
%if dc == 0
mov unquant_mfm, unquant_mfq
%endif
@@ -212,9 +211,6 @@ cglobal %1, 4,15,9
; (Any address in .text would work, this one was just convenient.)
lea r0, [$$]
%define GLOBAL +r0-$$
-%else
- %define GLOBAL
-%endif
TRELLIS_LOOP 0 ; node_ctx 0..3
TRELLIS_LOOP 1 ; node_ctx 1..7
@@ -304,12 +300,8 @@ cglobal %1, 4,15,9
mov r10, cabac_state_sigm
%if num_coefs == 64
mov r6d, b_interlacedm
-%ifdef PIC
add r6d, iid
movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
-%else
- movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
-%endif
movzx r10, byte [r10 + r6]
%elif num_coefs == 8
movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
@@ -408,12 +400,8 @@ cglobal %1, 4,15,9
%if dc
pmuludq m0, unquant_mf
%else
-%ifdef PIC
mov r10, unquant_mfm
LOAD_DUP m3, [r10 + zigzagiq*4]
-%else
- LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
-%endif
pmuludq m0, m3
%endif
paddd m0, [pq_128]
diff --git a/configure b/configure
index 3cf63e09..91c2855c 100755
--- a/configure
+++ b/configure
@@ -734,11 +734,11 @@ case $host_cpu in
ARCH="X86_64"
AS="${AS-nasm}"
AS_EXT=".asm"
- ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/"
+ ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -DPIC -I\$(SRCPATH)/common/x86/"
stack_alignment=16
[ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS"
if [ "$SYS" = MACOSX ]; then
- ASFLAGS="$ASFLAGS -f macho64 -DPIC -DPREFIX"
+ ASFLAGS="$ASFLAGS -f macho64 -DPREFIX"
if cc_check '' "-arch x86_64"; then
CFLAGS="$CFLAGS -arch x86_64"
LDFLAGS="$LDFLAGS -arch x86_64"
@@ -1253,7 +1253,7 @@ cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {
if [ "$pic" = "yes" ] ; then
[ "$SYS" != WINDOWS -a "$SYS" != CYGWIN ] && CFLAGS="$CFLAGS -fPIC"
- ASFLAGS="$ASFLAGS -DPIC"
+ [[ "$ASFLAGS" != *"-DPIC"* ]] && ASFLAGS="$ASFLAGS -DPIC"
# resolve textrels in the x86 asm
cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic"
[ $SYS = SunOS -a "$ARCH" = "X86" ] && SOFLAGS="$SOFLAGS -mimpure-text"
More information about the x264-devel
mailing list