[x264-devel] x86: Always use PIC in x86-64 asm
Henrik Gramner
git at videolan.org
Tue Mar 12 19:31:47 CET 2019
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Aug 12 17:00:13 2018 +0200| [275ef5332dffec445a0c5a78dbc00c3e0766011d] | committer: Anton Mitrofanov
x86: Always use PIC in x86-64 asm
Most x86-64 operating systems nowadays don't even allow .text relocations
in object files any more, and there is no measurable overall performance
difference from using RIP-relative addressing in x264 asm.
Enforcing PIC reduces complexity and simplifies testing.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=275ef5332dffec445a0c5a78dbc00c3e0766011d
---
common/x86/cabac-a.asm | 47 ++++++++++++-----------------------------------
common/x86/mc-a.asm | 4 ++--
common/x86/pixel-a.asm | 6 +++---
common/x86/predict-a.asm | 2 +-
common/x86/quant-a.asm | 17 ++++++-----------
common/x86/sad-a.asm | 2 +-
common/x86/trellis-64.asm | 12 ------------
configure | 6 +++---
8 files changed, 28 insertions(+), 68 deletions(-)
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index fcafd9c4..f9d223a3 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -36,11 +36,7 @@ SECTION_RODATA 64
%xdefine %%funccpu2 %3 ; last64
%xdefine %%funccpu3 %4 ; last15/last16
coeff_last_%1:
- %ifdef PIC
- %xdefine %%base coeff_last_%1 ; offset relative to the start of the table
- %else
- %xdefine %%base 0 ; absolute address
- %endif
+ %xdefine %%base coeff_last_%1
%rep 14
%ifidn %5, 4
dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu1) - %%base
@@ -121,15 +117,13 @@ struc cb
endstruc
%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
-%ifdef PIC
- %ifidn %4, 0
- movzx %1, byte [%2+%3+r7-$$]
- %else
- lea %5, [r7+%4]
- movzx %1, byte [%2+%3+%5-$$]
- %endif
-%else
+%if ARCH_X86_64 == 0
movzx %1, byte [%2+%3+%4]
+%elifidn %4, 0
+ movzx %1, byte [%2+%3+r7-$$]
+%else
+ lea %5, [r7+%4]
+ movzx %1, byte [%2+%3+%5-$$]
%endif
%endmacro
@@ -154,9 +148,9 @@ cglobal cabac_encode_decision_%1, 1,7
shr t5d, 6
movifnidn t2d, r2m
%if WIN64
- PUSH r7
+ PUSH r7
%endif
-%ifdef PIC
+%if ARCH_X86_64
lea r7, [$$]
%endif
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
@@ -183,7 +177,7 @@ cglobal cabac_encode_decision_%1, 1,7
shl t6d, t3b
%endif
%if WIN64
- POP r7
+ POP r7
%endif
mov [t0+cb.range], t4d
add t3d, [t0+cb.queue]
@@ -278,6 +272,7 @@ cabac_putbyte_%1:
CABAC asm
CABAC bmi2
+%if ARCH_X86_64
; %1 = label name
; %2 = node_ctx init?
%macro COEFF_ABS_LEVEL_GT1 2
@@ -409,13 +404,9 @@ CABAC bmi2
%endmacro
%macro COEFF_LAST 2 ; table, ctx_block_cat
-%ifdef PIC
lea r1, [%1 GLOBAL]
movsxd r6, [r1+4*%2]
add r6, r1
-%else
- movsxd r6, [%1+4*%2]
-%endif
call r6
%endmacro
@@ -436,15 +427,9 @@ CABAC bmi2
%define dct r4
%endif
-%ifdef PIC
- cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
+cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
lea r12, [$$]
%define GLOBAL +r12-$$
-%else
- cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF
- %define GLOBAL
-%endif
-
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8
@@ -554,7 +539,6 @@ CABAC bmi2
RET
%endmacro
-%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
@@ -575,7 +559,6 @@ INIT_YMM avx512
CABAC_RESIDUAL_RD 0, coeff_last_avx512
INIT_ZMM avx512
CABAC_RESIDUAL_RD 1, coeff_last_avx512
-%endif
;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
@@ -653,15 +636,10 @@ CABAC_RESIDUAL_RD 1, coeff_last_avx512
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15,0,-4*64
-%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
lea r7, [$$]
%define lastm [rsp+4*1]
%define GLOBAL +r7-$$
-%else
- %define lastm r7d
- %define GLOBAL
-%endif
shl r1d, 4
%define sigoffq r8
@@ -779,7 +757,6 @@ cglobal cabac_block_residual_internal, 4,15,0,-4*64
RET
%endmacro
-%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM lzcnt
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 48a201b1..440cf4d5 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -1331,7 +1331,7 @@ cglobal pixel_avg2_w16_cache64_ssse3
sub r4, r2
shl r6, 4 ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
-%ifdef PIC
+%if ARCH_X86_64
lea r7, [avg_w16_addr]
add r6, r7
%else
@@ -2020,7 +2020,7 @@ cglobal mc_chroma
%if cpuflag(cache64)
mov t0d, r3d
and t0d, 7
-%ifdef PIC
+%if ARCH_X86_64
lea t1, [ch_shuf_adj]
movddup xm5, [t1 + t0*4]
%else
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 1dfb2897..101a2ae5 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -2866,7 +2866,7 @@ cglobal intra_satd_x3_8x8c, 0,6
; output the predicted samples
mov r3d, eax
shr r3d, 16
-%ifdef PIC
+%if ARCH_X86_64
lea r2, [%2_lut]
movzx r2d, byte [r2+r3]
%else
@@ -5103,7 +5103,7 @@ cglobal pixel_ssim_end4, 2,3
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
-%ifdef PIC
+%if ARCH_X86_64
lea r3, [mask_ff + 16]
%xdefine %%mask r3
%else
@@ -5553,7 +5553,7 @@ ads_mvs_ssse3:
add r5, r6
xor r0d, r0d ; nmv
mov [r5], r0d
-%ifdef PIC
+%if ARCH_X86_64
lea r1, [$$]
%define GLOBAL +r1-$$
%else
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index cf2b4649..7d7495ba 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -688,7 +688,7 @@ INIT_XMM cpuname
je .fix_lt_2
.do_top:
and r2d, 4
-%ifdef PIC
+%if ARCH_X86_64
lea r3, [shuf_fixtr]
pshufb m3, [r3+r2*4]
%else
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index cf15badd..e168a564 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -673,7 +673,7 @@ cglobal dequant_%1x%1_flat16, 0,3
sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %2
-%ifdef PIC
+%if ARCH_X86_64
lea r1, [dequant%1_scale]
add r1, t2
%else
@@ -761,7 +761,7 @@ DEQUANT 8, 6, 4
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %1
%if %2
-%ifdef PIC
+%if ARCH_X86_64
%define dmf r1+t2
lea r1, [dequant8_scale]
%else
@@ -1449,7 +1449,7 @@ cglobal decimate_score%1, 1,3
shr edx, 1
%endif
%endif
-%ifdef PIC
+%if ARCH_X86_64
lea r4, [decimate_mask_table4]
%define mask_table r4
%else
@@ -1580,16 +1580,11 @@ cglobal decimate_score64, 1,5
add eax, r3d
jnz .ret9
%endif
-%ifdef PIC
- lea r4, [decimate_table8]
- %define table r4
-%else
- %define table decimate_table8
-%endif
+ lea r4, [decimate_table8]
mov al, -6
.loop:
tzcnt rcx, r1
- add al, byte [table + rcx]
+ add al, byte [r4 + rcx]
jge .ret9
shr r1, 1
SHRX r1, rcx
@@ -2165,7 +2160,7 @@ COEFF_LEVELRUN 16
%macro COEFF_LEVELRUN_LUT 1
cglobal coeff_level_run%1,2,4+(%1/9)
-%ifdef PIC
+%if ARCH_X86_64
lea r5, [$$]
%define GLOBAL +r5-$$
%else
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index e98d6132..741aa95b 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -1920,7 +1920,7 @@ cglobal pixel_sad_16x%2_cache64_%1
shl r4d, 4 ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
-%ifdef PIC
+%if ARCH_X86_64
lea r5, [sad_w16_addr]
add r5, r4
%else
diff --git a/common/x86/trellis-64.asm b/common/x86/trellis-64.asm
index 590d5518..62ba781b 100644
--- a/common/x86/trellis-64.asm
+++ b/common/x86/trellis-64.asm
@@ -202,7 +202,6 @@ cglobal %1, 4,15,9
paddd m6, m6
%define unquant_mf m6
%endif
-%ifdef PIC
%if dc == 0
mov unquant_mfm, unquant_mfq
%endif
@@ -212,9 +211,6 @@ cglobal %1, 4,15,9
; (Any address in .text would work, this one was just convenient.)
lea r0, [$$]
%define GLOBAL +r0-$$
-%else
- %define GLOBAL
-%endif
TRELLIS_LOOP 0 ; node_ctx 0..3
TRELLIS_LOOP 1 ; node_ctx 1..7
@@ -304,12 +300,8 @@ cglobal %1, 4,15,9
mov r10, cabac_state_sigm
%if num_coefs == 64
mov r6d, b_interlacedm
-%ifdef PIC
add r6d, iid
movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
-%else
- movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
-%endif
movzx r10, byte [r10 + r6]
%elif num_coefs == 8
movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
@@ -408,12 +400,8 @@ cglobal %1, 4,15,9
%if dc
pmuludq m0, unquant_mf
%else
-%ifdef PIC
mov r10, unquant_mfm
LOAD_DUP m3, [r10 + zigzagiq*4]
-%else
- LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
-%endif
pmuludq m0, m3
%endif
paddd m0, [pq_128]
diff --git a/configure b/configure
index 3cf63e09..91c2855c 100755
--- a/configure
+++ b/configure
@@ -734,11 +734,11 @@ case $host_cpu in
ARCH="X86_64"
AS="${AS-nasm}"
AS_EXT=".asm"
- ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/"
+ ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -DPIC -I\$(SRCPATH)/common/x86/"
stack_alignment=16
[ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS"
if [ "$SYS" = MACOSX ]; then
- ASFLAGS="$ASFLAGS -f macho64 -DPIC -DPREFIX"
+ ASFLAGS="$ASFLAGS -f macho64 -DPREFIX"
if cc_check '' "-arch x86_64"; then
CFLAGS="$CFLAGS -arch x86_64"
LDFLAGS="$LDFLAGS -arch x86_64"
@@ -1253,7 +1253,7 @@ cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {
if [ "$pic" = "yes" ] ; then
[ "$SYS" != WINDOWS -a "$SYS" != CYGWIN ] && CFLAGS="$CFLAGS -fPIC"
- ASFLAGS="$ASFLAGS -DPIC"
+ [[ "$ASFLAGS" != *"-DPIC"* ]] && ASFLAGS="$ASFLAGS -DPIC"
# resolve textrels in the x86 asm
cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic"
[ $SYS = SunOS -a "$ARCH" = "X86" ] && SOFLAGS="$SOFLAGS -mimpure-text"
More information about the x264-devel
mailing list