[vlc-commits] Contribs: fix dav1d issue on Android/x86
Jean-Baptiste Kempf
git at videolan.org
Thu Jun 25 11:27:00 CEST 2020
vlc | branch: master | Jean-Baptiste Kempf <jb at videolan.org> | Thu Jun 25 11:25:58 2020 +0200| [9c7e8fd666017121331be4bb615809cf5b7b4d20] | committer: Jean-Baptiste Kempf
Contribs: fix dav1d issue on Android/x86
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=9c7e8fd666017121331be4bb615809cf5b7b4d20
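
The underlying issue: 32-bit x86 has no PC-relative data addressing, so absolute constant references such as [pw_8] or [pw_1] require text relocations, which position-independent builds (mandatory on Android) do not allow. The patch below switches those references to base-register-relative forms ([base+pw_1], [t1-prep_sse2+pw_8]) and reshuffles scratch registers so the base register stays live. As a minimal sketch of the same addressing technique, in plain NASM without dav1d's x86inc macros (the function and label names here are illustrative only, not taken from dav1d):

SECTION .text

align 16
pw_1:   times 8 dw 1                  ; constant kept in .text so pw_1-.pic below
                                      ; is a plain same-section, assembly-time offset

global add_one_sse2                   ; void add_one_sse2(int16_t dst[8]);
add_one_sse2:
        mov     ecx, [esp+4]          ; dst
        call    .pic                  ; call/pop idiom: read EIP without a relocation
.pic:   pop     eax                   ; eax = runtime address of .pic
        movdqa  xmm1, [eax+pw_1-.pic] ; PIC-safe: base register + constant offset
        movdqu  xmm0, [ecx]
        paddw   xmm0, xmm1            ; add 1 to each of the 8 words
        movdqu  [ecx], xmm0
        ret

dav1d's actual code obtains the base once per function with the x86inc LEA macro and a per-function %define base, as visible in the hunks below.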
---
contrib/src/dav1d/0001-SSE2-PIC-464ca6c2.patch | 380 +++++++++++++++++++++++++
contrib/src/dav1d/rules.mak | 1 +
2 files changed, 381 insertions(+)
diff --git a/contrib/src/dav1d/0001-SSE2-PIC-464ca6c2.patch b/contrib/src/dav1d/0001-SSE2-PIC-464ca6c2.patch
new file mode 100644
index 0000000000..097ae63117
--- /dev/null
+++ b/contrib/src/dav1d/0001-SSE2-PIC-464ca6c2.patch
@@ -0,0 +1,380 @@
+From 464ca6c2f37b93180cc27ea41889ffaf1eab388e Mon Sep 17 00:00:00 2001
+From: Henrik Gramner <gramner at twoorioles.com>
+Date: Thu, 25 Jun 2020 01:27:28 +0200
+Subject: [PATCH] x86: Fix 32-bit build with PIC enabled
+
+---
+ src/x86/mc_sse.asm | 147 +++++++++++++++++----------------------------
+ 1 file changed, 56 insertions(+), 91 deletions(-)
+
+diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm
+index d98ac621..5d5c5e3f 100644
+--- a/src/x86/mc_sse.asm
++++ b/src/x86/mc_sse.asm
+@@ -1263,7 +1263,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ %if ARCH_X86_64
+ mova m8, [pw_8]
+ %else
+- %define m8 [pw_8]
++ %define m8 [t1-prep_sse2+pw_8]
+ %endif
+ pxor m7, m7
+ %endif
+@@ -1272,13 +1272,11 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ pshuflw m6, m6, q0000
+ %if cpuflag(ssse3)
+ punpcklqdq m6, m6
+-%else
+- %if ARCH_X86_64
++%elif ARCH_X86_64
+ psrlw m0, m8, 3
+ punpcklwd m6, m0
+- %else
++%else
+ punpcklwd m6, [base+pw_1]
+- %endif
+ %endif
+ %if ARCH_X86_32
+ mov t1, t2 ; save base reg for w4
+@@ -1396,8 +1394,8 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ PUSH r7
+ %endif
+ mov r7, tmpq
++ mov r5, srcq
+ %endif
+- mov t1, srcq
+ .hv_w16_hloop:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+@@ -1440,14 +1438,17 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ sub hd, 2
+ jg .hv_w16_vloop
+ movzx hd, t2w
+- add t1, 16
+- mov srcq, t1
+ %if ARCH_X86_64
++ add r5, 16
+ add r7, 2*16
++ mov srcq, r5
+ mov tmpq, r7
+ %else
++ mov srcq, srcmp
+ mov tmpq, tmpmp
++ add srcq, 16
+ add tmpq, 2*16
++ mov srcmp, srcq
+ mov tmpmp, tmpq
+ %endif
+ sub t2d, 1<<16
+@@ -2624,22 +2625,20 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ %macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
+ %if cpuflag(ssse3)
+ phaddw %1, %2
+- %else
+- %ifnidn %1, %2
++ %elifnidn %1, %2
+ %if %4 == 1
+- mova %3, [pw_1]
++ mova %3, [base+pw_1]
+ %endif
+ pmaddwd %1, %3
+ pmaddwd %2, %3
+ packssdw %1, %2
+- %else
++ %else
+ %if %4 == 1
+- pmaddwd %1, [pw_1]
++ pmaddwd %1, [base+pw_1]
+ %else
+ pmaddwd %1, %3
+ %endif
+ packssdw %1, %1
+- %endif
+ %endif
+ %endmacro
+
+@@ -2795,11 +2794,9 @@ PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+ %if ARCH_X86_32
+ %define base_reg r2
+ %define base base_reg-prep%+SUFFIX
+- %define W32_RESTORE_SSQ mov strideq, stridem
+ %else
+ %define base_reg r7
+ %define base 0
+- %define W32_RESTORE_SSQ
+ %endif
+ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ %assign org_stack_offset stack_offset
+@@ -2834,6 +2831,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ WIN64_SPILL_XMM 12
+ %else
+ WIN64_SPILL_XMM 16
++%endif
++%if ARCH_X86_32
++ %define strideq r6
++ mov strideq, stridem
+ %endif
+ cmp wd, 4
+ je .h_w4
+@@ -2894,7 +2895,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ punpcklbw m4, m4
+ psraw m4, 8
+ %endif
+- W32_RESTORE_SSQ
+ %if ARCH_X86_64
+ lea stride3q, [strideq*3]
+ %endif
+@@ -2916,8 +2916,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ pshufb m1, m5
+ pshufb m2, m5
+ pshufb m3, m5
+-%else
+- %if ARCH_X86_64
++%elif ARCH_X86_64
+ movd m0, [srcq+strideq*0+0]
+ movd m12, [srcq+strideq*0+1]
+ movd m1, [srcq+strideq*1+0]
+@@ -2947,7 +2946,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ punpcklqdq m1, m5 ; 1
+ punpcklqdq m2, m13 ; 2
+ punpcklqdq m3, m7 ; 3
+- %else
++%else
+ movd m0, [srcq+strideq*0+0]
+ movd m1, [srcq+strideq*0+1]
+ movd m2, [srcq+strideq*0+2]
+@@ -2978,7 +2977,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ lea srcq, [srcq+strideq*2]
+ punpckldq m7, m5
+ punpcklqdq m3, m7 ; 3
+- %endif
+ %endif
+ PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
+ PMADDUBSW m1, m4, m5, m7, 0
+@@ -2994,14 +2992,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+- ;
+ .h_w8:
+-%if ARCH_X86_32
+- mov r3, r2
+- %define base_reg r3
+- W32_RESTORE_SSQ
+-%endif
+-.h_w8_loop:
+ %if cpuflag(ssse3)
+ PREP_8TAP_H 0, srcq+strideq*0
+ PREP_8TAP_H 1, srcq+strideq*1
+@@ -3017,51 +3008,42 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ add tmpq, 16
+ dec hd
+ %endif
+- jg .h_w8_loop
++ jg .h_w8
+ RET
+ .h_w16:
+- mov r6, -16*1
++ mov r3, -16*1
+ jmp .h_start
+ .h_w32:
+- mov r6, -16*2
++ mov r3, -16*2
+ jmp .h_start
+ .h_w64:
+- mov r6, -16*4
++ mov r3, -16*4
+ jmp .h_start
+ .h_w128:
+- mov r6, -16*8
++ mov r3, -16*8
+ .h_start:
+-%if ARCH_X86_32
+- mov r3, r2
+- %define base_reg r3
+-%endif
+- sub srcq, r6
+- mov r5, r6
+- W32_RESTORE_SSQ
++ sub srcq, r3
++ mov r5, r3
+ .h_loop:
+ %if cpuflag(ssse3)
+- PREP_8TAP_H 0, srcq+r6+8*0
+- PREP_8TAP_H 1, srcq+r6+8*1
++ PREP_8TAP_H 0, srcq+r3+8*0
++ PREP_8TAP_H 1, srcq+r3+8*1
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 32
+- add r6, 16
++ add r3, 16
+ %else
+- PREP_8TAP_H 0, srcq+r6
++ PREP_8TAP_H 0, srcq+r3
+ mova [tmpq], m0
+ add tmpq, 16
+- add r6, 8
++ add r3, 8
+ %endif
+ jl .h_loop
+ add srcq, strideq
+- mov r6, r5
++ mov r3, r5
+ dec hd
+ jg .h_loop
+ RET
+-%if ARCH_X86_32
+- %define base_reg r2
+-%endif
+- ;
+ .v:
+ LEA base_reg, prep%+SUFFIX
+ %if ARCH_X86_32
+@@ -3086,7 +3068,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+-%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
++%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
+ %if cpuflag(ssse3)
+ ALLOC_STACK -mmsize*4
+ %else
+@@ -3105,15 +3087,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ movd m0, [myq+6]
+ PSHUFB_0X1X m0, m2
+ mova subpel3, m0
+- %if notcpuflag(ssse3)
+- mov r6, base_reg
+- %define base_reg r6
+- %endif
+- mov strideq, [rstk+stack_offset+gprsize*3]
+- lea strideq, [strideq*3]
+- sub [rstk+stack_offset+gprsize*2], strideq
+ mov strideq, [rstk+stack_offset+gprsize*3]
+- mov srcq, [rstk+stack_offset+gprsize*2]
++ lea r5, [strideq*3]
++ sub srcq, r5
+ %else
+ %define subpel0 m8
+ %define subpel1 m9
+@@ -3245,10 +3221,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ jg .v_w4_loop0
+ %endif
+ RET
+-%if ARCH_X86_32 && notcpuflag(ssse3)
+- %define base_reg r2
+-%endif
+- ;
+ %if ARCH_X86_64
+ .v_w8:
+ lea r5d, [wq - 8] ; horizontal loop
+@@ -3373,16 +3345,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+- mov r5, r2; use as new base
+- %define base_reg r5
+- %assign regs_used 2
++ mov strideq, stridem
++ %assign regs_used 6
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+- mov strideq, [rstk+stack_offset+gprsize*3]
+- lea strideq, [strideq*3 + 1]
+- sub [rstk+stack_offset+gprsize*2], strideq
+- mov strideq, [rstk+stack_offset+gprsize*3]
+- mov srcq, [rstk+stack_offset+gprsize*2]
++ lea r5, [strideq*3+1]
++ sub srcq, r5
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+@@ -3445,9 +3413,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ %define hv4_line_1_3 13
+ %if ARCH_X86_32
+ %if cpuflag(ssse3)
+- %define w8192reg [base+pw_8192]
++ %define w8192reg [base+pw_8192]
+ %else
+- %define w8192reg [base+pw_2]
++ %define w8192reg [base+pw_2]
+ %endif
+ %define d32reg [base+pd_32]
+ %else
+@@ -3676,7 +3644,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ %define hv8_line_6 4
+ shr mxd, 16
+ %if ARCH_X86_32
+- %define base_reg r2
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+@@ -3692,16 +3659,16 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+- ALLOC_STACK -mmsize*13
++ mov strideq, stridem
++ %assign regs_used 6
++ ALLOC_STACK -mmsize*14
++ %assign regs_used 7
+ %if STACK_ALIGNMENT < mmsize
+- mov rstk, r2m
+- %define tmpm [rsp+mmsize*13+gprsize*1]
+- %define srcm [rsp+mmsize*13+gprsize*2]
+- %define stridem [rsp+mmsize*13+gprsize*3]
+- mov stridem, rstk
++ %define tmpm [rsp+mmsize*13+gprsize*1]
++ %define srcm [rsp+mmsize*13+gprsize*2]
++ %define stridem [rsp+mmsize*13+gprsize*3]
++ mov stridem, strideq
+ %endif
+- mov r6, r2
+- %define base_reg r6
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ punpcklbw m5, m5
+@@ -3724,12 +3691,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+- W32_RESTORE_SSQ
+- lea strided, [strided*3]
+- sub srcd, strided
+- sub srcd, 3
+- mov srcm, srcd
+- W32_RESTORE_SSQ
++ lea r5, [strideq*3+3]
++ sub srcq, r5
++ mov srcm, srcq
+ %else
+ ALLOC_STACK mmsize*5, 16
+ %define subpelh0 m10
+@@ -3765,7 +3729,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ %if notcpuflag(ssse3)
+ mova m7, [base+pw_2]
+ %endif
+- lea stride3q, [strideq*3]
++ lea stride3q, [strideq*3]
+ sub srcq, 3
+ sub srcq, stride3q
+ mov r6, srcq
+@@ -3939,11 +3903,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+ .hv_w8_outer:
+ movzx hd, r5w
+ %if ARCH_X86_32
+- add dword tmpm, 8
+- mov tmpq, tmpm
+ mov srcq, srcm
++ mov tmpq, tmpm
+ add srcq, 4
++ add tmpq, 8
+ mov srcm, srcq
++ mov tmpm, tmpq
+ %else
+ add r8, 8
+ mov tmpq, r8
+--
+2.26.2
+
diff --git a/contrib/src/dav1d/rules.mak b/contrib/src/dav1d/rules.mak
index e7bc7ea7d9..fe0e222b16 100644
--- a/contrib/src/dav1d/rules.mak
+++ b/contrib/src/dav1d/rules.mak
@@ -18,6 +18,7 @@ $(TARBALLS)/dav1d-$(DAV1D_VERSION).tar.xz:
dav1d: dav1d-$(DAV1D_VERSION).tar.xz .sum-dav1d
$(UNPACK)
+ $(APPLY) $(SRC)/dav1d/0001-SSE2-PIC-464ca6c2.patch
$(MOVE)
.dav1d: dav1d crossfile.meson