[x264-devel] [PATCH 3/3] x32 asm/checkasm various fixes
Matthias Räncker
theonetruecamper at gmx.de
Sun Jan 27 00:00:10 CET 2013
This patch tries to be minimal and only fixes issues with x32.
It is not optimized for that ABI and leaves the code unmodified
when built for the traditional architectures.
Signed-off-by: Matthias Räncker <theonetruecamper at gmx.de>
---
common/x86/cabac-a.asm | 19 +++++------
common/x86/deblock-a.asm | 28 ++++++++++++++++
common/x86/mc-a.asm | 64 +++++++++++++++++++++++++++++++----
common/x86/mc-a2.asm | 52 ++++++++++++++++++++++++-----
common/x86/pixel-a.asm | 86 ++++++++++++++++++++++++++++++++++++++++++++----
common/x86/quant-a.asm | 8 ++---
common/x86/sad-a.asm | 26 +++++++++++++--
common/x86/sad16-a.asm | 9 +++--
tools/checkasm-a.asm | 12 +++----
tools/checkasm.c | 11 +++++++
10 files changed, 267 insertions(+), 48 deletions(-)
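A quick note on the approach for reviewers unfamiliar with x32: most of the
additions below are movsxdifnidn on the intptr_t stride arguments. Under x32,
intptr_t is only 32 bits wide, so a negative stride arrives in the low half of
its 64-bit register while addresses are still formed with full 64-bit
registers; the value therefore has to be sign-extended before it can take part
in pointer arithmetic. The sketch below is only an illustration of that
widening and assumes the r1p / movsxdifnidn / resp macros introduced earlier
in this series expand roughly like this on x32 (and to nothing on LP64 x86-64,
where source and destination registers are already identical):

; hypothetical helper, not part of the patch
; uint8_t *prev_row( uint8_t *pix, intptr_t stride );
section .text
global prev_row
prev_row:
    mov     eax, edi    ; pointer argument: high bits are zero when passed in a register
    movsxd  rsi, esi    ; 32-bit stride: must be sign-extended by the callee
    sub     rax, rsi    ; pix - stride, computed with 64-bit registers
    ret
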
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 19d2aa2..8b1b936 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -36,13 +36,10 @@ cextern cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
%if WIN64
DECLARE_REG_TMP 3,1,2,0,6,5,4,2
- %define pointer resq
%elif ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,6
- %define pointer resq
%else
DECLARE_REG_TMP 0,4,2,1,3,5,6,2
- %define pointer resd
%endif
struc cb
@@ -50,9 +47,9 @@ struc cb
.range: resd 1
.queue: resd 1
.bytes_outstanding: resd 1
- .start: pointer 1
- .p: pointer 1
- .end: pointer 1
+ .start: resp 1
+ .p: resp 1
+ .end: resp 1
align 16, resb 1
.bits_encoded: resd 1
.state: resb 1024
@@ -72,7 +69,7 @@ endstruc
%endmacro
cglobal cabac_encode_decision_asm, 0,7
- movifnidn t0, r0mp
+ movifnidn t0p, r0mp
movifnidn t1d, r1m
mov t5d, [t0+cb.range]
movzx t6d, byte [t0+cb.state+t1]
@@ -112,7 +109,7 @@ cglobal cabac_encode_decision_asm, 0,7
RET
cglobal cabac_encode_bypass_asm, 0,3
- movifnidn t0, r0mp
+ movifnidn t0p, r0mp
movifnidn t3d, r1m
mov t7d, [t0+cb.low]
and t3d, [t0+cb.range]
@@ -133,7 +130,7 @@ cglobal cabac_encode_bypass_asm, 0,3
jmp cabac_putbyte
cglobal cabac_encode_terminal_asm, 0,3
- movifnidn t0, r0mp
+ movifnidn t0p, r0mp
sub dword [t0+cb.range], 2
; shortcut: the renormalization shift in terminal
; can only be 0 or 1 and is zero over 99% of the time.
@@ -167,7 +164,7 @@ cabac_putbyte:
mov t5d, [t0+cb.bytes_outstanding]
cmp t2b, 0xff ; FIXME is a 32bit op faster?
jz .postpone
- mov t1, [t0+cb.p]
+ mov t1p, [t0+cb.p]
add [t1-1], t2h
dec t2h
.loop_outstanding:
@@ -176,7 +173,7 @@ cabac_putbyte:
dec t5d
jge .loop_outstanding
mov [t1-1], t2b
- mov [t0+cb.p], t1
+ mov [t0+cb.p], t1p
.postpone:
inc t5d
mov [t0+cb.bytes_outstanding], t5d
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index b1c9a88..b85080e 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -163,6 +163,7 @@ cextern pw_pixel_max
; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma, 5,5,8
+ movsxdifnidn r1, r1p
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
@@ -217,6 +218,7 @@ cglobal deblock_v_luma, 5,5,8
RET
cglobal deblock_h_luma, 5,6,8
+ movsxdifnidn r1, r1p
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
@@ -345,6 +347,7 @@ cglobal deblock_h_luma, 5,6,8
%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma, 5,5,15
+ movsxdifnidn r1, r1p
%define p2 m8
%define p1 m0
%define p0 m1
@@ -381,6 +384,7 @@ cglobal deblock_v_luma, 5,5,15
RET
cglobal deblock_h_luma, 5,7,15
+ movsxdifnidn r1, r1p
add r1, r1
LOAD_AB m12, m13, r2d, r3d
mov r2, r1
@@ -607,6 +611,7 @@ DEBLOCK_LUMA_64
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra, 4,7,16
+ movsxdifnidn r1, r1p
%define t0 m1
%define t1 m2
%define t2 m4
@@ -656,6 +661,7 @@ cglobal deblock_v_luma_intra, 4,7,16
; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,16
+ movsxdifnidn r1, r1p
%define t0 m15
%define t1 m14
%define t2 m2
@@ -725,6 +731,7 @@ DEBLOCK_LUMA_INTRA_64
; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra, 4,7,8
+ movsxdifnidn r1, r1p
LUMA_INTRA_INIT 3
lea r4, [r1*4]
lea r5, [r1*3]
@@ -751,6 +758,7 @@ cglobal deblock_v_luma_intra, 4,7,8
; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,8
+ movsxdifnidn r1, r1p
LUMA_INTRA_INIT 8
%if mmsize == 8
lea r4, [r1*3]
@@ -1094,6 +1102,7 @@ DEBLOCK_LUMA_INTRA
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma, 5,5,10
+ movsxdifnidn r1, r1p
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
@@ -1139,6 +1148,7 @@ cglobal deblock_v_luma, 5,5,10
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma, 5,9
+ movsxdifnidn r1, r1p
lea r8, [r1*3]
lea r6, [r0-4]
lea r5, [r0-4+r8]
@@ -1449,6 +1459,7 @@ DEBLOCK_LUMA v, 16
; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50
+ movsxdifnidn r1, r1p
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
dec r2d ; alpha-1
@@ -1505,6 +1516,7 @@ INIT_MMX cpuname
; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,9
+ movsxdifnidn r1, r1p
lea r8, [r1*3]
lea r6, [r0-4]
lea r5, [r0-4+r8]
@@ -1673,6 +1685,7 @@ cglobal deblock_inter_body
; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 5,7,8
+ movsxdifnidn r1, r1p
FIX_STRIDES r1
mov r5, r0
sub r0, r1
@@ -1693,6 +1706,7 @@ cglobal deblock_v_chroma, 5,7,8
; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
+ movsxdifnidn r1, r1p
add r1, r1
mov r5, 32/mmsize
%if mmsize == 16
@@ -1719,6 +1733,7 @@ cglobal deblock_intra_body
; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,6,8
+ movsxdifnidn r1, r1p
add r1, r1
mov r5, 32/mmsize
movd m5, r3d
@@ -1740,6 +1755,7 @@ cglobal deblock_v_chroma_intra, 4,6,8
; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
+ movsxdifnidn r1, r1p
add r1, r1
mov r4, 32/mmsize
%if mmsize == 16
@@ -1758,6 +1774,7 @@ cglobal deblock_h_chroma_intra, 4,6,8
; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
+ movsxdifnidn r1, r1p
add r1, r1
%if mmsize == 8
mov r4, 16/mmsize
@@ -1781,6 +1798,7 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_mbaff, 5,7,8
+ movsxdifnidn r1, r1p
add r1, r1
lea r6, [r1*3]
%if mmsize == 8
@@ -1809,6 +1827,7 @@ cglobal deblock_h_chroma_mbaff, 5,7,8
; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422_intra, 4,6,8
+ movsxdifnidn r1, r1p
add r1, r1
mov r4, 64/mmsize
%if mmsize == 16
@@ -1827,6 +1846,7 @@ cglobal deblock_h_chroma_422_intra, 4,6,8
; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422, 5,7,8
+ movsxdifnidn r1, r1p
add r1, r1
mov r5, 64/mmsize
lea r6, [r1*3]
@@ -1928,6 +1948,7 @@ cglobal chroma_inter_body
; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 5,6,8
+ movsxdifnidn r1, r1p
CHROMA_V_START
mova m0, [t5]
mova m1, [t5+r1]
@@ -1943,6 +1964,7 @@ cglobal deblock_v_chroma, 5,6,8
; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
+ movsxdifnidn r1, r1p
CHROMA_H_START
%if mmsize==8
mov dword r0m, 2
@@ -1969,6 +1991,7 @@ DEBLOCK_CHROMA
;-----------------------------------------------------------------------------
%macro DEBLOCK_H_CHROMA_420_MBAFF 0
cglobal deblock_h_chroma_mbaff, 5,7,8
+ movsxdifnidn r1, r1p
dec r2d
dec r3d
sub r0, 4
@@ -1994,6 +2017,7 @@ DEBLOCK_H_CHROMA_420_MBAFF
%macro DEBLOCK_H_CHROMA_422 0
cglobal deblock_h_chroma_422, 5,8,8
+ movsxdifnidn r1, r1p
%if ARCH_X86_64
%define cntr r7
%else
@@ -2064,6 +2088,7 @@ cglobal chroma_intra_body
; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,5,8
+ movsxdifnidn r1, r1p
CHROMA_V_START
mova m0, [t5]
mova m1, [t5+r1]
@@ -2079,6 +2104,7 @@ cglobal deblock_v_chroma_intra, 4,5,8
; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
+ movsxdifnidn r1, r1p
CHROMA_H_START
%if mmsize==8
mov dword r0m, 2
@@ -2091,6 +2117,7 @@ cglobal deblock_h_chroma_intra, 4,6,8
RET
cglobal deblock_h_chroma_422_intra, 4,7,8
+ movsxdifnidn r1, r1p
CHROMA_H_START
mov r6d, 32/mmsize
.loop:
@@ -2121,6 +2148,7 @@ DEBLOCK_CHROMA_INTRA
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
+ movsxdifnidn r1, r1p
CHROMA_H_START
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
call chroma_intra_body
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 8e568cf..44dd5a5 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -193,6 +193,9 @@ cextern pd_32
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 1-2 0
cglobal pixel_avg_weight_w%1
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
+ movsxdifnidn r5, r5p
BIWEIGHT_START
AVG_START %2
%if HIGH_BIT_DEPTH
@@ -407,6 +410,8 @@ AVG_WEIGHT 16, 7
%macro WEIGHTER 1
cglobal mc_weight_w%1, 6,6,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3
WEIGHT_START %1
.loop:
@@ -482,6 +487,8 @@ WEIGHTER 20
;-----------------------------------------------------------------------------
%macro OFFSET 2
cglobal mc_offset%2_w%1, 6,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3
mova m2, [r4]
%if HIGH_BIT_DEPTH
@@ -528,6 +535,9 @@ OFFSETPN 8
;-----------------------------------------------------------------------------
%macro AVGH 2
cglobal pixel_avg_%1x%2
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
+ movsxdifnidn r5, r5p
mov eax, %2
cmp dword r6m, 32
jne pixel_avg_weight_w%1 %+ SUFFIX
@@ -545,6 +555,9 @@ cglobal pixel_avg_%1x%2
%macro AVG_FUNC 3
cglobal pixel_avg_w%1
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
+ movsxdifnidn r5, r5p
AVG_START
.height_loop:
%assign x 0
@@ -652,6 +665,8 @@ AVGH 4, 2
;-----------------------------------------------------------------------------
%macro AVG2_W_ONE 1
cglobal pixel_avg2_w%1, 6,7,4
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
@@ -677,6 +692,8 @@ cglobal pixel_avg2_w%1, 6,7,4
%macro AVG2_W_TWO 3
cglobal pixel_avg2_w%1, 6,7,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
@@ -720,6 +737,8 @@ AVG2_W_TWO 16, movu, mova
INIT_MMX
cglobal pixel_avg2_w10_mmx2, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
@@ -748,6 +767,8 @@ cglobal pixel_avg2_w10_mmx2, 6,7
RET
cglobal pixel_avg2_w16_mmx2, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
@@ -782,6 +803,8 @@ cglobal pixel_avg2_w16_mmx2, 6,7
RET
cglobal pixel_avg2_w18_mmx2, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r4, r2
.height_loop:
movu m0, [r2+ 0]
@@ -807,6 +830,8 @@ cglobal pixel_avg2_w18_mmx2, 6,7
INIT_XMM
cglobal pixel_avg2_w18_sse2, 6,7,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r4, r2
.height_loop:
movu m0, [r2+ 0]
@@ -836,6 +861,8 @@ cglobal pixel_avg2_w18_sse2, 6,7,6
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
cglobal pixel_avg2_w%1_mmx2, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r4, r2
lea r6, [r4+r3]
.height_loop:
@@ -858,6 +885,8 @@ AVG2_W8 8, movq
%macro AVG2_W16 2
cglobal pixel_avg2_w%1_mmx2, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r2, r4
lea r6, [r2+r3]
.height_loop:
@@ -884,6 +913,8 @@ AVG2_W16 12, movd
AVG2_W16 16, movq
cglobal pixel_avg2_w20_mmx2, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r2, r4
lea r6, [r2+r3]
.height_loop:
@@ -912,6 +943,8 @@ cglobal pixel_avg2_w20_mmx2, 6,7
RET
cglobal pixel_avg2_w16_sse2, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r4, r2
lea r6, [r4+r3]
.height_loop:
@@ -931,6 +964,8 @@ cglobal pixel_avg2_w16_sse2, 6,7
%macro AVG2_W20 1
cglobal pixel_avg2_w20_%1, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
sub r2, r4
lea r6, [r2+r3]
.height_loop:
@@ -1033,6 +1068,8 @@ pixel_avg2_w%1_cache_mmx2:
%define cachesplit pixel_avg2_w%1_cache_mmx2
%endif
cglobal pixel_avg2_w%1_cache%2_%3
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
mov eax, r2m
and eax, %2-1
cmp eax, (%2-%1-(%1 % 8))
@@ -1116,6 +1153,8 @@ avg_w16_align%1_%2_ssse3:
%endmacro
cglobal pixel_avg2_w16_cache64_ssse3
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
mov eax, r2m
and eax, 0x3f
@@ -1197,6 +1236,8 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k
;-----------------------------------------------------------------------------
INIT_MMX
cglobal mc_copy_w4_mmx, 4,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3
cmp dword r4m, 4
lea r5, [r3*3]
@@ -1217,6 +1258,8 @@ cglobal mc_copy_w4_mmx, 4,6
%assign %%w %1*SIZEOF_PIXEL/mmsize
%if %%w > 0
cglobal mc_copy_w%1, 5,7,8*(%%w/2)
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
@@ -1255,6 +1298,8 @@ MC_COPY 16
%macro PREFETCH_FENC 1
%if ARCH_X86_64
cglobal prefetch_fenc_%1, 5,5
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3
and r4d, 3
mov eax, r4d
@@ -1319,6 +1364,7 @@ PREFETCH_FENC 422
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal prefetch_ref, 3,3
+ movsxdifnidn r1, r1p
FIX_STRIDES r1
dec r2d
and r2d, r1d
@@ -1353,7 +1399,7 @@ cglobal prefetch_ref, 3,3
%else
PROLOGUE 0,6,%1
%endif
- movifnidn r3, r3mp
+ movifnidn r3p, r3mp
movifnidn r4d, r4m
movifnidn r5d, r5m
movifnidn t0d, r6m
@@ -1402,6 +1448,8 @@ cglobal prefetch_ref, 3,3
;-----------------------------------------------------------------------------
%macro MC_CHROMA 0
cglobal mc_chroma
+ movsxdifnidn r2, r2p
+ movsxdifnidn r4, r4p
MC_CHROMA_START 0
FIX_STRIDES r4
and r5d, 7
@@ -1426,8 +1474,8 @@ cglobal mc_chroma
WIN64_SPILL_XMM 9
%endif
movd m5, t2d
- movifnidn r0, r0mp
- movifnidn r1, r1mp
+ movifnidn r0p, r0mp
+ movifnidn r1p, r1mp
movifnidn r2d, r2m
movifnidn r5d, r8m
pxor m6, m6
@@ -1671,8 +1719,8 @@ ALIGN 4
mova m4, [pw_8]
SPLATW m5, m5
psubw m4, m5
- movifnidn r0, r0mp
- movifnidn r1, r1mp
+ movifnidn r0p, r0mp
+ movifnidn r1p, r1mp
movifnidn r2d, r2m
FIX_STRIDES r2
movifnidn r5d, r8m
@@ -1781,6 +1829,8 @@ ALIGN 4
%macro MC_CHROMA_SSSE3 0
cglobal mc_chroma
+ movsxdifnidn r2, r2p
+ movsxdifnidn r4, r4p
MC_CHROMA_START 9
and r5d, 7
and t2d, 7
@@ -1808,8 +1858,8 @@ cglobal mc_chroma
%else
mova m5, [ch_shuf]
%endif
- movifnidn r0, r0mp
- movifnidn r1, r1mp
+ movifnidn r0p, r0mp
+ movifnidn r1p, r1mp
movifnidn r2d, r2m
movifnidn r5d, r8m
SPLATW m6, m6
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 86769c5..7b93459 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -159,6 +159,8 @@ cextern pd_ffff
;-----------------------------------------------------------------------------
%macro HPEL_FILTER 0
cglobal hpel_filter_v, 5,6,11
+ movsxdifnidn r3, r3p
+ movsxdifnidn r4, r4p
FIX_STRIDES r3, r4
lea r5, [r1+r3]
sub r1, r3
@@ -216,6 +218,7 @@ cglobal hpel_filter_v, 5,6,11
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
+ movsxdifnidn r2, r2p
add r2, r2
add r0, r2
add r1, r2
@@ -265,6 +268,7 @@ cglobal hpel_filter_c, 3,3,10
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
+ movsxdifnidn r2, r2p
%define src r1+r2
add r2, r2
add r0, r2
@@ -317,6 +321,8 @@ HPEL_FILTER
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
+ movsxdifnidn r3, r3p
+ movsxdifnidn r4, r4p
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
@@ -373,6 +379,7 @@ cglobal hpel_filter_v, 5,6,%1
;-----------------------------------------------------------------------------
INIT_MMX
cglobal hpel_filter_c_mmx2, 3,3
+ movsxdifnidn r2, r2p
add r0, r2
lea r1, [r1+r2*2]
neg r2
@@ -402,6 +409,7 @@ cglobal hpel_filter_c_mmx2, 3,3
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_mmx2, 3,3
+ movsxdifnidn r2, r2p
add r0, r2
add r1, r2
neg r2
@@ -449,6 +457,7 @@ INIT_XMM
; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
+ movsxdifnidn r2, r2p
add r0, r2
lea r1, [r1+r2*2]
neg r2
@@ -517,6 +526,7 @@ cglobal hpel_filter_c, 3,3,9
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_sse2, 3,3,8
+ movsxdifnidn r2, r2p
add r0, r2
add r1, r2
neg r2
@@ -566,6 +576,7 @@ cglobal hpel_filter_h_sse2, 3,3,8
;-----------------------------------------------------------------------------
%macro HPEL_H 0
cglobal hpel_filter_h, 3,3
+ movsxdifnidn r2, r2p
add r0, r2
add r1, r2
neg r2
@@ -736,6 +747,7 @@ HPEL_H
; uint8_t *src, intptr_t stride, int width, int height )
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,9,16
+ movsxdifnidn r4, r4p
mov r7, r3
sub r5d, 16
mov r8, r1
@@ -811,6 +823,8 @@ HPEL
; assumes i_dst and w are multiples of 16, and i_dst>w
INIT_MMX
cglobal plane_copy_core_mmx2, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3, r4d
%if HIGH_BIT_DEPTH == 0
movsxdifnidn r4, r4d
@@ -854,8 +868,8 @@ cglobal plane_copy_core_mmx2, 6,7
sub r6d, 16
jg .loop16
.end16:
- add r0, r1
- add r2, r3
+ add r0p, r1p
+ add r2p, r3p
dec r5d
jg .loopy
sfence
@@ -953,11 +967,14 @@ cglobal plane_copy_core_mmx2, 6,7
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
cglobal plane_copy_interleave_core, 6,9
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
+ movsxdifnidn r5, r5p
mov r6d, r6m
%if HIGH_BIT_DEPTH
FIX_STRIDES r1, r3, r5, r6d
- movifnidn r1mp, r1
- movifnidn r3mp, r3
+ movifnidn r1mp, r1p
+ movifnidn r3mp, r3p
mov r6m, r6d
%endif
lea r0, [r0+r6*2]
@@ -968,7 +985,7 @@ cglobal plane_copy_interleave_core, 6,9
%else
DECLARE_REG_TMP 1,3
%endif
- mov t1, r1
+ mov t1p, r1p
shr t1, SIZEOF_PIXEL
sub t1, r6
mov t0d, r7m
@@ -1002,11 +1019,11 @@ cglobal plane_copy_interleave_core, 6,9
%assign n n+32
%endrep
add r6, 16*SIZEOF_PIXEL
- cmp r6, t1
+ cmp r6p, t1p
jl .pad
- add r0, r1mp
- add r2, r3mp
- add r4, r5
+ add r0p, r1mp
+ add r2p, r3mp
+ add r4p, r5p
dec t0d
jg .loopy
sfence
@@ -1017,6 +1034,7 @@ cglobal plane_copy_interleave_core, 6,9
; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
+ movsxdifnidn r1, r1p
FIX_STRIDES r1
.loop:
INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
@@ -1046,6 +1064,9 @@ cglobal store_interleave_chroma, 5,5
; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
+ movsxdifnidn r5, r5p
DEINTERLEAVE_START
mov r6d, r6m
FIX_STRIDES r1, r3, r5, r6d
@@ -1074,6 +1095,7 @@ cglobal plane_copy_deinterleave, 6,7
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
+ movsxdifnidn r2, r2p
DEINTERLEAVE_START
FIX_STRIDES r2
.loop:
@@ -1089,6 +1111,7 @@ cglobal load_deinterleave_chroma_fenc, 4,4
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
+ movsxdifnidn r2, r2p
DEINTERLEAVE_START
FIX_STRIDES r2
.loop:
@@ -1133,6 +1156,7 @@ PLANE_DEINTERLEAVE
;-----------------------------------------------------------------------------
INIT_MMX
cglobal memcpy_aligned_mmx, 3,3
+IFNIDN r2, r2p, mov r2p, r2p
test r2d, 16
jz .copy32start
movq mm0, [r1 + r2 - 16]
@@ -1161,6 +1185,7 @@ cglobal memcpy_aligned_mmx, 3,3
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_sse2, 3,3
+IFNIDN r2, r2p, mov r2p, r2p
test r2d, 16
jz .copy32
movdqa xmm0, [r1 + r2 - 16]
@@ -1196,6 +1221,7 @@ cglobal memcpy_aligned_sse2, 3,3
;-----------------------------------------------------------------------------
%macro MEMZERO 0
cglobal memzero_aligned, 2,2
+IFNIDN r2, r2p, mov r2p, r2p
add r0, r1
neg r1
pxor m0, m0
@@ -1223,6 +1249,7 @@ MEMZERO
;-----------------------------------------------------------------------------
INIT_XMM
cglobal integral_init4h_sse4, 3,4
+ movsxdifnidn r2, r2p
lea r3, [r0+r2*2]
add r1, r2
neg r2
@@ -1243,6 +1270,7 @@ cglobal integral_init4h_sse4, 3,4
%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
+ movsxdifnidn r2, r2p
lea r3, [r0+r2*2]
add r1, r2
neg r2
@@ -1277,6 +1305,7 @@ INTEGRAL_INIT8H
; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init8v, 3,3
+ movsxdifnidn r1, r1p
shl r1, 1
add r0, r1
lea r2, [r0+r1*8]
@@ -1303,6 +1332,7 @@ INTEGRAL_INIT_8V
;-----------------------------------------------------------------------------
INIT_MMX
cglobal integral_init4v_mmx, 3,5
+ movsxdifnidn r2, r2p
shl r2, 1
lea r3, [r0+r2*4]
lea r4, [r0+r2*8]
@@ -1325,6 +1355,7 @@ cglobal integral_init4v_mmx, 3,5
INIT_XMM
cglobal integral_init4v_sse2, 3,5
+ movsxdifnidn r2, r2p
shl r2, 1
add r0, r2
add r1, r2
@@ -1350,6 +1381,7 @@ cglobal integral_init4v_sse2, 3,5
RET
cglobal integral_init4v_ssse3, 3,5
+ movsxdifnidn r2, r2p
shl r2, 1
add r0, r2
add r1, r2
@@ -1492,6 +1524,8 @@ cglobal integral_init4v_ssse3, 3,5
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+ movsxdifnidn r5, r5p
+ movsxdifnidn r6, r6p
%if HIGH_BIT_DEPTH
shl dword r6m, 1
FIX_STRIDES r5
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index d5d3e90..6a582b5 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -148,6 +148,8 @@ cextern hsub_mul
;-----------------------------------------------------------------------------
%macro SSD_ONE 2
cglobal pixel_ssd_%1x%2, 4,5,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
mov r4, %1*%2/mmsize
pxor m0, m0
.loop
@@ -178,6 +180,8 @@ cglobal pixel_ssd_%1x%2, 4,5,6
%macro SSD_16_MMX 2
cglobal pixel_ssd_%1x%2, 4,5
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
mov r4, %1*%2/mmsize/2
pxor m0, m0
.loop
@@ -370,6 +374,8 @@ SSD_ONE 16, 16
%assign function_align 16
%endif
cglobal pixel_ssd_%1x%2, 0,0,0
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
mov al, %1*%2/mmsize/2
%if %1 != %2
@@ -480,6 +486,8 @@ SSD 8, 4
%if HIGH_BIT_DEPTH
%macro SSD_NV12 0
cglobal pixel_ssd_nv12_core, 6,7,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
shl r4d, 2
FIX_STRIDES r1, r3
add r0, r4
@@ -569,6 +577,8 @@ cglobal pixel_ssd_nv12_core, 6,7,7
;-----------------------------------------------------------------------------
%macro SSD_NV12 0
cglobal pixel_ssd_nv12_core, 6,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
shl r4d, 1
add r0, r4
add r2, r4
@@ -701,18 +711,21 @@ SSD_NV12
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
+ movsxdifnidn r1, r1p
FIX_STRIDES r1
VAR_START 0
VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
cglobal pixel_var_8x16, 2,3
+ movsxdifnidn r1, r1p
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 8, 16
cglobal pixel_var_8x8, 2,3
+ movsxdifnidn r1, r1p
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 4
@@ -721,12 +734,14 @@ cglobal pixel_var_8x8, 2,3
%if HIGH_BIT_DEPTH
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
+ movsxdifnidn r1, r1p
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 16, 16
cglobal pixel_var_8x8, 2,3,8
+ movsxdifnidn r1, r1p
lea r2, [r1*3]
VAR_START 0
mova m0, [r0]
@@ -754,6 +769,7 @@ VAR
%if HIGH_BIT_DEPTH == 0
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
+ movsxdifnidn r1, r1p
VAR_START 1
mov r2d, 8
.loop:
@@ -767,6 +783,7 @@ cglobal pixel_var_16x16, 2,3,8
VAR_END 16, 16
cglobal pixel_var_8x8, 2,4,8
+ movsxdifnidn r1, r1p
VAR_START 1
mov r2d, 2
lea r3, [r1*3]
@@ -783,6 +800,7 @@ cglobal pixel_var_8x8, 2,4,8
VAR_END 8, 8
cglobal pixel_var_8x16, 2,4,8
+ movsxdifnidn r1, r1p
VAR_START 1
mov r2d, 4
lea r3, [r1*3]
@@ -824,6 +842,8 @@ VAR
;-----------------------------------------------------------------------------
%macro VAR2_8x8_MMX 2
cglobal pixel_var2_8x%1, 5,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3
VAR_START 0
mov r5d, %1
@@ -866,6 +886,8 @@ VAR2_8x8_MMX 16, 7
%macro VAR2_8x8_SSE2 2
cglobal pixel_var2_8x%1, 5,6,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
VAR_START 1
mov r5d, %1/2
.loop:
@@ -903,6 +925,8 @@ VAR2_8x8_SSE2 16, 7
%if HIGH_BIT_DEPTH == 0
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 5,6,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
@@ -1157,6 +1181,8 @@ pixel_satd_8x4_internal_mmx2:
%if HIGH_BIT_DEPTH
%macro SATD_MxN_MMX 3
cglobal pixel_satd_%1x%2, 4,7
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
pxor m0, m0
call pixel_satd_%1x%3_internal_mmx2
@@ -1184,6 +1210,8 @@ SATD_MxN_MMX 8, 16, 8
%if HIGH_BIT_DEPTH == 0
cglobal pixel_satd_16x16, 4,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
pxor m0, m0
%rep 3
@@ -1197,6 +1225,8 @@ cglobal pixel_satd_16x16, 4,6
RET
cglobal pixel_satd_16x8, 4,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
pxor m0, m0
call pixel_satd_16x4_internal_mmx2
@@ -1206,6 +1236,8 @@ cglobal pixel_satd_16x8, 4,6
SATD_END_MMX
cglobal pixel_satd_8x16, 4,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
pxor m0, m0
call pixel_satd_8x8_internal_mmx2
@@ -1216,18 +1248,24 @@ cglobal pixel_satd_8x16, 4,6
%endif ; !HIGH_BIT_DEPTH
cglobal pixel_satd_8x8, 4,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
pxor m0, m0
call pixel_satd_8x8_internal_mmx2
SATD_END_MMX
cglobal pixel_satd_8x4, 4,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
pxor m0, m0
call pixel_satd_8x4_internal_mmx2
SATD_END_MMX
cglobal pixel_satd_4x16, 4,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
SATD_4x4_MMX m1, 0, 1
@@ -1239,6 +1277,8 @@ cglobal pixel_satd_4x16, 4,6
SATD_END_MMX
cglobal pixel_satd_4x8, 4,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
SATD_4x4_MMX m1, 0, 0
@@ -1246,6 +1286,8 @@ cglobal pixel_satd_4x8, 4,6
SATD_END_MMX
cglobal pixel_satd_4x4, 4,6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
@@ -1340,6 +1382,8 @@ cglobal pixel_satd_4x4, 4,6
%macro SATDS_SSE2 0
%if cpuflag(ssse3)
cglobal pixel_satd_4x4, 4, 6, 6
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
mova m4, [hmul_4p]
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
@@ -1356,6 +1400,8 @@ cglobal pixel_satd_4x4, 4, 6, 6
%endif
cglobal pixel_satd_4x8, 4, 6, 8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
%if cpuflag(ssse3)
mova m7, [hmul_4p]
@@ -1366,6 +1412,8 @@ cglobal pixel_satd_4x8, 4, 6, 8
RET
cglobal pixel_satd_4x16, 4, 6, 8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_MMX
%if cpuflag(ssse3)
mova m7, [hmul_4p]
@@ -1397,6 +1445,8 @@ cglobal pixel_satd_16x4_internal
ret
cglobal pixel_satd_16x8, 4,6,12
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_SSE2 m10, m7
%if notcpuflag(ssse3)
mova m7, [pw_00ff]
@@ -1404,6 +1454,8 @@ cglobal pixel_satd_16x8, 4,6,12
jmp %%pixel_satd_16x8_internal
cglobal pixel_satd_16x16, 4,6,12
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_SSE2 m10, m7
%if notcpuflag(ssse3)
mova m7, [pw_00ff]
@@ -1416,6 +1468,8 @@ cglobal pixel_satd_16x16, 4,6,12
SATD_END_SSE2 m10
%else
cglobal pixel_satd_16x8, 4,6,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_SSE2 m6, m7
BACKUP_POINTERS
call pixel_satd_8x8_internal
@@ -1424,6 +1478,8 @@ cglobal pixel_satd_16x8, 4,6,8
SATD_END_SSE2 m6
cglobal pixel_satd_16x16, 4,6,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_SSE2 m6, m7
BACKUP_POINTERS
call pixel_satd_8x8_internal
@@ -1435,17 +1491,23 @@ cglobal pixel_satd_16x16, 4,6,8
%endif
cglobal pixel_satd_8x16, 4,6,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_SSE2 m6, m7
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
cglobal pixel_satd_8x8, 4,6,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_SSE2 m6, m7
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
cglobal pixel_satd_8x4, 4,6,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SATD_START_SSE2 m6, m7
call %%pixel_satd_8x4_internal
SATD_END_SSE2 m6
@@ -1479,6 +1541,8 @@ cglobal pixel_satd_8x4, 4,6,8
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
lea r6, [r0+4*r1]
lea r7, [r2+4*r3]
LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
@@ -1495,6 +1559,8 @@ cglobal pixel_sa8d_8x8_internal
ret
cglobal pixel_sa8d_8x8, 4,8,12
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
@@ -1513,6 +1579,8 @@ cglobal pixel_sa8d_8x8, 4,8,12
RET
cglobal pixel_sa8d_16x16, 4,8,12
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
@@ -1958,7 +2026,7 @@ cglobal intra_satd_x3_16x16, 0,5
%define sums rsp+64 ; size 56
%define top_1d rsp+32 ; size 32
%define left_1d rsp ; size 32
- movifnidn r1, r1mp
+ movifnidn r1p, r1mp
pxor m7, m7
mova [sums+ 0], m7
@@ -1985,7 +2053,7 @@ cglobal intra_satd_x3_16x16, 0,5
pand m6, [sw_f0] ; dc
; 2D hadamards
- movifnidn r0, r0mp
+ movifnidn r0p, r0mp
mov r3, -4
.loop_y:
mov r4, -4
@@ -2035,7 +2103,7 @@ cglobal intra_satd_x3_16x16, 0,5
jl .loop_y
; horizontal sum
- movifnidn r2, r2mp
+ movifnidn r2p, r2mp
%if HIGH_BIT_DEPTH
mova m1, m5
paddd m5, m3
@@ -2079,7 +2147,7 @@ cglobal intra_satd_x3_8x8c, 0,6
%define dc_1d rsp+32 ; size 16
%define top_1d rsp+16 ; size 16
%define left_1d rsp ; size 16
- movifnidn r1, r1mp
+ movifnidn r1p, r1mp
pxor m7, m7
mova [sums+ 0], m7
mova [sums+ 8], m7
@@ -2115,8 +2183,8 @@ cglobal intra_satd_x3_8x8c, 0,6
lea r5, [dc_1d]
; 2D hadamards
- movifnidn r0, r0mp
- movifnidn r2, r2mp
+ movifnidn r0p, r0mp
+ movifnidn r2p, r2mp
mov r3, -2
.loop_y:
mov r4, -2
@@ -3490,6 +3558,7 @@ cglobal hadamard_ac_8x8
%macro HADAMARD_AC_WXH_MMX 2
cglobal pixel_hadamard_ac_%1x%2, 2,4
+ movsxdifnidn r1, r1p
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
FIX_STRIDES r1
@@ -3719,6 +3788,7 @@ HADAMARD_AC_WXH_SSE2 8, 8
; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
%macro HADAMARD_AC_WXH_SSE2 2
cglobal pixel_hadamard_ac_%1x%2, 2,3,11
+ movsxdifnidn r1, r1p
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
FIX_STRIDES r1
@@ -3875,6 +3945,8 @@ HADAMARD_AC_SSE2
%macro SSIM 0
cglobal pixel_ssim_4x4x2_core, 4,4,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
FIX_STRIDES r1, r3
pxor m0, m0
SSIM_ITER 0
@@ -4006,6 +4078,8 @@ SSIM
;-----------------------------------------------------------------------------
%macro ASD8 0
cglobal pixel_asd8, 5,5
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
pxor m0, m0
pxor m1, m1
.loop:
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 183f16a..aa0e7de 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -508,7 +508,7 @@ cglobal dequant_%1x%1_flat16, 0,3
%else
lea r1, [dequant%1_scale + t2]
%endif
- movifnidn r0, r0mp
+ movifnidn r0p, r0mp
movd m4, t0d
%if %1 == 4
%if mmsize == 8
@@ -650,7 +650,7 @@ DEQUANT_DC w, pmullw
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
%endif
cglobal optimize_chroma_2x2_dc, 0,%%regs,7
- movifnidn t0, r0mp
+ movifnidn t0p, r0mp
movd m2, r1m
movq m1, [t0]
%if cpuflag(sse4)
@@ -1343,8 +1343,8 @@ COEFF_LAST
%macro COEFF_LEVELRUN 1
cglobal coeff_level_run%1,0,7
- movifnidn t0, r0mp
- movifnidn t1, r1mp
+ movifnidn t0p, r0mp
+ movifnidn t1p, r1mp
pxor m2, m2
LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
%if %1==15
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 5723199..eb7dd13 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -84,6 +84,8 @@ cextern sw_64
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2_mmx2, 4,4
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
pxor mm0, mm0
%rep %2/2
SAD_INC_2x%1P
@@ -119,6 +121,8 @@ SAD 4, 4
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16, 4,4,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
movu m0, [r2]
movu m1, [r2+r3]
lea r2, [r2+2*r3]
@@ -186,6 +190,8 @@ cglobal pixel_sad_16x16, 4,4,8
; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8, 4,4
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
movu m0, [r2]
movu m2, [r2+r3]
lea r2, [r2+2*r3]
@@ -249,6 +255,8 @@ SAD_W16
INIT_XMM
;Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal pixel_sad_8x16_sse2, 4,4
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SAD_INC_4x8P_SSE 0
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
@@ -263,6 +271,7 @@ cglobal pixel_sad_8x16_sse2, 4,4
%if ARCH_X86_64 == 0
INIT_MMX
cglobal pixel_vsad_mmx2, 3,3
+ movsxdifnidn r1, r1p
mova m0, [r0]
mova m1, [r0+8]
mova m2, [r0+r1]
@@ -299,6 +308,7 @@ cglobal pixel_vsad_mmx2, 3,3
INIT_XMM
cglobal pixel_vsad_sse2, 3,3
+ movsxdifnidn r1, r1p
mova m0, [r0]
mova m1, [r0+r1]
lea r0, [r0+r1*2]
@@ -857,7 +867,7 @@ INTRA_SAD16
%endmacro
%macro SAD_X4_END 0
- mov r0, r6mp
+ mov r0p, r6mp
movd [r0+0], mm0
movd [r0+4], mm1
movd [r0+8], mm2
@@ -871,6 +881,7 @@ INTRA_SAD16
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
+IF %1 == 3, movsxdifnidn, {r4, r4p}, {r5, r5p}
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
@@ -1170,7 +1181,7 @@ SAD_X 4, 4, 4
%endmacro
%macro SAD_X4_END_SSE2 0
- mov r0, r6mp
+ mov r0p, r6mp
psllq xmm1, 32
psllq xmm3, 32
paddw xmm0, xmm1
@@ -1190,6 +1201,7 @@ SAD_X 4, 4, 4
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 3
cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
+IF %1 == 3, movsxdifnidn, {r4, r4p}, {r5, r5p}
SAD_X%1_2x%2P_SSE2 1
%rep %3/2-1
SAD_X%1_2x%2P_SSE2 0
@@ -1293,6 +1305,8 @@ sad_w16_align%1_ssse3:
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal pixel_sad_16x%2_cache64_%1
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
mov eax, r2m
and eax, 0x37
cmp eax, 0x30
@@ -1341,6 +1355,8 @@ cglobal pixel_sad_16x%2_cache64_%1
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_16x%1_cache%2_mmx2
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
movq mm1, [r2]
@@ -1367,6 +1383,8 @@ cglobal pixel_sad_16x%1_cache%2_mmx2
%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_8x%1_cache%2_mmx2
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
movq mm1, [r2+8]
@@ -1403,6 +1421,7 @@ cglobal pixel_sad_8x%1_cache%2_mmx2
%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x3_%1x%2_cache%3_%6
+ movsxdifnidn r4, r4p
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
@@ -1468,6 +1487,7 @@ cglobal pixel_sad_x3_%1x%2_cache%3_%6
%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x4_%1x%2_cache%3_%6
+ movsxdifnidn r5, r5p
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
@@ -1476,7 +1496,7 @@ cglobal pixel_sad_x4_%1x%2_cache%3_%6
.split:
%if ARCH_X86_64
PROLOGUE 6,9
- mov r8, r6mp
+ mov r8p, r6mp
push r4
push r3
push r2
diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index 8e3dba7..fe67d3b 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -91,6 +91,8 @@ cextern pw_8
;-----------------------------------------------------------------------------
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,4
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
pxor m0, m0
%rep %2/%3
SAD_INC_%3x%1P_MMX
@@ -156,6 +158,8 @@ SAD_MMX 4, 4, 2
;-----------------------------------------------------------------------------
%macro SAD_XMM 2
cglobal pixel_sad_%1x%2, 4,4,8
+ movsxdifnidn r1, r1p
+ movsxdifnidn r3, r3p
pxor m0, m0
%rep %2/2
SAD_INC_2x%1P_XMM
@@ -241,7 +245,7 @@ SAD_XMM 8, 8
movd [r5+4], m1
movd [r5+8], m2
%else
- mov r0, r5mp
+ mov r0p, r5mp
movd [r0+0], m0
movd [r0+4], m1
movd [r0+8], m2
@@ -332,7 +336,7 @@ SAD_XMM 8, 8
HADDW m2, m6
HADDW m3, m7
%endif
- mov r0, r6mp
+ mov r0p, r6mp
movd [r0+ 0], m0
movd [r0+ 4], m1
movd [r0+ 8], m2
@@ -406,6 +410,7 @@ PIXEL_VSAD
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
+ movsxdifnidn r4, r4p
%assign regnum %1+1
%xdefine STRIDE r %+ regnum
mov r6, %3/2-1
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index 7b39d93..76fa9c3 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -95,13 +95,13 @@ cglobal checkasm_call, 2,15,16
; All arguments have been pushed on the stack instead of registers in order to
; test for incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
- mov r0, r6mp
- mov r1, r7mp
- mov r2, r8mp
- mov r3, r9mp
+ mov r0, r6mq
+ mov r1, r7mq
+ mov r2, r8mq
+ mov r3, r9mq
%if UNIX64
- mov r4, r10mp
- mov r5, r11mp
+ mov r4, r10mq
+ mov r5, r11mq
%assign i 6
%rep max_args-6
mov r9, [rsp+stack_offset+(i+1)*8]
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 89d444e..756250a 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -218,10 +218,21 @@ intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
* detect all functions that assumes zero-extension.
*/
void x264_checkasm_stack_clobber( uint64_t clobber, ... );
+#if ARCH_X86_64_X64
#define call_a1(func,...) ({ \
uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
+#else
+/* If passed in registers, the high-order bits of pointer arguments are zero.
+ * The call wrapper would have to know which arguments are pointers in order to load
+ * them correctly in the presence of an otherwise clobbered stack. We can still use the
+ * clobber function to check ordinary stack-based arguments. */
+#define call_a1(func,...) ({ \
+ uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
+ x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
+ func( __VA_ARGS__ ); })
+#endif
#elif ARCH_X86
#define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
#else
--
1.8.1.1