[x264-devel] [PATCH 3/3] x32 asm/checkasm various fixes

Matthias Räncker theonetruecamper at gmx.de
Sun Jan 27 00:00:10 CET 2013


This patch tries to be minimal and only fixes issues with x32. x32 is the
Linux ILP32 ABI on x86-64: it keeps the full 64-bit register file and
instruction set but makes pointers, size_t and intptr_t 32 bits wide, so
asm that treats "pointer-sized" as 64 bits breaks. The patch is not
optimized for that ABI and leaves the generated code unmodified on
traditional arches.

Signed-off-by: Matthias Räncker <theonetruecamper at gmx.de>
---
 common/x86/cabac-a.asm   | 19 +++++------
 common/x86/deblock-a.asm | 28 ++++++++++++++++
 common/x86/mc-a.asm      | 64 +++++++++++++++++++++++++++++++----
 common/x86/mc-a2.asm     | 52 ++++++++++++++++++++++++-----
 common/x86/pixel-a.asm   | 86 ++++++++++++++++++++++++++++++++++++++++++++----
 common/x86/quant-a.asm   |  8 ++---
 common/x86/sad-a.asm     | 26 +++++++++++++--
 common/x86/sad16-a.asm   |  9 +++--
 tools/checkasm-a.asm     | 12 +++----
 tools/checkasm.c         | 11 +++++++
 10 files changed, 267 insertions(+), 48 deletions(-)
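
The recurring fix is to widen or narrow register operands to pointer size.
Nearly every hunk below uses one of two conditional-mov helpers. movifnidn
comes from x86inc.asm; the movsxdifnidn sketch below follows the same
pattern, with the r1p-style pointer-width aliases assumed to come from the
earlier patches in this series:

    ; movifnidn: emit the mov only when the operands differ, e.g. when an
    ; argument is on the stack on one ABI but already in place on another.
    %macro movifnidn 2
        %ifnidn %1, %2
            mov %1, %2
        %endif
    %endmacro

    ; movsxdifnidn: same idea, but sign-extending. On x32 the pointer-sized
    ; view of r1 is r1d, so "movsxdifnidn r1, r1p" becomes "movsxd r1, r1d";
    ; on x86-64 proper r1p is r1 itself and nothing is emitted.
    %macro movsxdifnidn 2
        %ifnidn %1, %2
            movsxd %1, %2
        %endif
    %endmacro
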

diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 19d2aa2..8b1b936 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -36,13 +36,10 @@ cextern cabac_renorm_shift
 ; t3 must be ecx, since it's used for shift.
 %if WIN64
     DECLARE_REG_TMP 3,1,2,0,6,5,4,2
-    %define pointer resq
 %elif ARCH_X86_64
     DECLARE_REG_TMP 0,1,2,3,4,5,6,6
-    %define pointer resq
 %else
     DECLARE_REG_TMP 0,4,2,1,3,5,6,2
-    %define pointer resd
 %endif
 
 struc cb
@@ -50,9 +47,9 @@ struc cb
     .range: resd 1
     .queue: resd 1
     .bytes_outstanding: resd 1
-    .start: pointer 1
-    .p: pointer 1
-    .end: pointer 1
+    .start: resp 1
+    .p: resp 1
+    .end: resp 1
     align 16, resb 1
     .bits_encoded: resd 1
     .state: resb 1024
@@ -72,7 +69,7 @@ endstruc
 %endmacro
 
 cglobal cabac_encode_decision_asm, 0,7
-    movifnidn t0,  r0mp
+    movifnidn t0p, r0mp
     movifnidn t1d, r1m
     mov   t5d, [t0+cb.range]
     movzx t6d, byte [t0+cb.state+t1]
@@ -112,7 +109,7 @@ cglobal cabac_encode_decision_asm, 0,7
     RET
 
 cglobal cabac_encode_bypass_asm, 0,3
-    movifnidn  t0, r0mp
+    movifnidn t0p, r0mp
     movifnidn t3d, r1m
     mov       t7d, [t0+cb.low]
     and       t3d, [t0+cb.range]
@@ -133,7 +130,7 @@ cglobal cabac_encode_bypass_asm, 0,3
     jmp cabac_putbyte
 
 cglobal cabac_encode_terminal_asm, 0,3
-    movifnidn  t0, r0mp
+    movifnidn t0p, r0mp
     sub  dword [t0+cb.range], 2
 ; shortcut: the renormalization shift in terminal
 ; can only be 0 or 1 and is zero over 99% of the time.
@@ -167,7 +164,7 @@ cabac_putbyte:
     mov   t5d, [t0+cb.bytes_outstanding]
     cmp   t2b, 0xff ; FIXME is a 32bit op faster?
     jz    .postpone
-    mov    t1, [t0+cb.p]
+    mov   t1p, [t0+cb.p]
     add   [t1-1], t2h
     dec   t2h
 .loop_outstanding:
@@ -176,7 +173,7 @@ cabac_putbyte:
     dec   t5d
     jge .loop_outstanding
     mov   [t1-1], t2b
-    mov   [t0+cb.p], t1
+    mov   [t0+cb.p], t1p
 .postpone:
     inc   t5d
     mov   [t0+cb.bytes_outstanding], t5d
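
The struct members above switch from the per-arch "pointer" define to a
resp helper, presumably introduced by patch 1/3. A minimal sketch of such
a definition, with PTR_SIZE as an assumed name for a pointer-width
constant:

    %if PTR_SIZE == 8
        %define resp resq   ; 64-bit pointers: one qword per member
    %else
        %define resp resd   ; x32 and x86-32: one dword per member
    %endif
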
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index b1c9a88..b85080e 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -163,6 +163,7 @@ cextern pw_pixel_max
 ; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 cglobal deblock_v_luma, 5,5,8
+    movsxdifnidn r1, r1p
     %assign pad 5*mmsize+12-(stack_offset&15)
     %define tcm [rsp]
     %define ms1 [rsp+mmsize]
@@ -217,6 +218,7 @@ cglobal deblock_v_luma, 5,5,8
     RET
 
 cglobal deblock_h_luma, 5,6,8
+    movsxdifnidn r1, r1p
     %assign pad 7*mmsize+12-(stack_offset&15)
     %define tcm [rsp]
     %define ms1 [rsp+mmsize]
@@ -345,6 +347,7 @@ cglobal deblock_h_luma, 5,6,8
 
 %macro DEBLOCK_LUMA_64 0
 cglobal deblock_v_luma, 5,5,15
+    movsxdifnidn r1, r1p
     %define p2 m8
     %define p1 m0
     %define p0 m1
@@ -381,6 +384,7 @@ cglobal deblock_v_luma, 5,5,15
     RET
 
 cglobal deblock_h_luma, 5,7,15
+    movsxdifnidn r1, r1p
     add         r1, r1
     LOAD_AB    m12, m13, r2d, r3d
     mov         r2, r1
@@ -607,6 +611,7 @@ DEBLOCK_LUMA_64
 ;-----------------------------------------------------------------------------
 %macro DEBLOCK_LUMA_INTRA_64 0
 cglobal deblock_v_luma_intra, 4,7,16
+    movsxdifnidn r1, r1p
     %define t0 m1
     %define t1 m2
     %define t2 m4
@@ -656,6 +661,7 @@ cglobal deblock_v_luma_intra, 4,7,16
 ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_luma_intra, 4,7,16
+    movsxdifnidn r1, r1p
     %define t0 m15
     %define t1 m14
     %define t2 m2
@@ -725,6 +731,7 @@ DEBLOCK_LUMA_INTRA_64
 ; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_v_luma_intra, 4,7,8
+    movsxdifnidn r1, r1p
     LUMA_INTRA_INIT 3
     lea     r4, [r1*4]
     lea     r5, [r1*3]
@@ -751,6 +758,7 @@ cglobal deblock_v_luma_intra, 4,7,8
 ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_luma_intra, 4,7,8
+    movsxdifnidn r1, r1p
     LUMA_INTRA_INIT 8
 %if mmsize == 8
     lea     r4, [r1*3]
@@ -1094,6 +1102,7 @@ DEBLOCK_LUMA_INTRA
 ;-----------------------------------------------------------------------------
 %macro DEBLOCK_LUMA 0
 cglobal deblock_v_luma, 5,5,10
+    movsxdifnidn r1, r1p
     movd    m8, [r4] ; tc0
     lea     r4, [r1*3]
     dec     r2d        ; alpha-1
@@ -1139,6 +1148,7 @@ cglobal deblock_v_luma, 5,5,10
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
 cglobal deblock_h_luma, 5,9
+    movsxdifnidn r1, r1p
     lea    r8, [r1*3]
     lea    r6, [r0-4]
     lea    r5, [r0-4+r8]
@@ -1449,6 +1459,7 @@ DEBLOCK_LUMA v, 16
 ; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50
+    movsxdifnidn r1, r1p
     lea     r4, [r1*4]
     lea     r5, [r1*3] ; 3*stride
     dec     r2d        ; alpha-1
@@ -1505,6 +1516,7 @@ INIT_MMX cpuname
 ; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_luma_intra, 4,9
+    movsxdifnidn r1, r1p
     lea    r8, [r1*3]
     lea    r6, [r0-4]
     lea    r5, [r0-4+r8]
@@ -1673,6 +1685,7 @@ cglobal deblock_inter_body
 ; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 cglobal deblock_v_chroma, 5,7,8
+    movsxdifnidn r1, r1p
     FIX_STRIDES r1
     mov         r5, r0
     sub         r0, r1
@@ -1693,6 +1706,7 @@ cglobal deblock_v_chroma, 5,7,8
 ; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma, 5,7,8
+    movsxdifnidn r1, r1p
     add         r1, r1
     mov         r5, 32/mmsize
 %if mmsize == 16
@@ -1719,6 +1733,7 @@ cglobal deblock_intra_body
 ; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_v_chroma_intra, 4,6,8
+    movsxdifnidn r1, r1p
     add         r1, r1
     mov         r5, 32/mmsize
     movd        m5, r3d
@@ -1740,6 +1755,7 @@ cglobal deblock_v_chroma_intra, 4,6,8
 ; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma_intra, 4,6,8
+    movsxdifnidn r1, r1p
     add         r1, r1
     mov         r4, 32/mmsize
 %if mmsize == 16
@@ -1758,6 +1774,7 @@ cglobal deblock_h_chroma_intra, 4,6,8
 ; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma_intra_mbaff, 4,6,8
+    movsxdifnidn r1, r1p
     add         r1, r1
 %if mmsize == 8
     mov         r4, 16/mmsize
@@ -1781,6 +1798,7 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
 ; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma_mbaff, 5,7,8
+    movsxdifnidn r1, r1p
     add         r1, r1
     lea         r6, [r1*3]
 %if mmsize == 8
@@ -1809,6 +1827,7 @@ cglobal deblock_h_chroma_mbaff, 5,7,8
 ; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma_422_intra, 4,6,8
+    movsxdifnidn r1, r1p
     add         r1, r1
     mov         r4, 64/mmsize
 %if mmsize == 16
@@ -1827,6 +1846,7 @@ cglobal deblock_h_chroma_422_intra, 4,6,8
 ; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma_422, 5,7,8
+    movsxdifnidn r1, r1p
     add         r1, r1
     mov         r5, 64/mmsize
     lea         r6, [r1*3]
@@ -1928,6 +1948,7 @@ cglobal chroma_inter_body
 ; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 cglobal deblock_v_chroma, 5,6,8
+    movsxdifnidn r1, r1p
     CHROMA_V_START
     mova  m0, [t5]
     mova  m1, [t5+r1]
@@ -1943,6 +1964,7 @@ cglobal deblock_v_chroma, 5,6,8
 ; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma, 5,7,8
+    movsxdifnidn r1, r1p
     CHROMA_H_START
 %if mmsize==8
     mov   dword r0m, 2
@@ -1969,6 +1991,7 @@ DEBLOCK_CHROMA
 ;-----------------------------------------------------------------------------
 %macro DEBLOCK_H_CHROMA_420_MBAFF 0
 cglobal deblock_h_chroma_mbaff, 5,7,8
+    movsxdifnidn r1, r1p
     dec    r2d
     dec    r3d
     sub    r0, 4
@@ -1994,6 +2017,7 @@ DEBLOCK_H_CHROMA_420_MBAFF
 
 %macro DEBLOCK_H_CHROMA_422 0
 cglobal deblock_h_chroma_422, 5,8,8
+    movsxdifnidn r1, r1p
 %if ARCH_X86_64
     %define cntr r7
 %else
@@ -2064,6 +2088,7 @@ cglobal chroma_intra_body
 ; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_v_chroma_intra, 4,5,8
+    movsxdifnidn r1, r1p
     CHROMA_V_START
     mova  m0, [t5]
     mova  m1, [t5+r1]
@@ -2079,6 +2104,7 @@ cglobal deblock_v_chroma_intra, 4,5,8
 ; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma_intra, 4,6,8
+    movsxdifnidn r1, r1p
     CHROMA_H_START
 %if mmsize==8
     mov   dword r0m, 2
@@ -2091,6 +2117,7 @@ cglobal deblock_h_chroma_intra, 4,6,8
     RET
 
 cglobal deblock_h_chroma_422_intra, 4,7,8
+    movsxdifnidn r1, r1p
     CHROMA_H_START
     mov   r6d, 32/mmsize
 .loop:
@@ -2121,6 +2148,7 @@ DEBLOCK_CHROMA_INTRA
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
 cglobal deblock_h_chroma_intra_mbaff, 4,6,8
+    movsxdifnidn r1, r1p
     CHROMA_H_START
     TRANSPOSE4x8W_LOAD  PASS8ROWS(t5, r0, r1, t6)
     call chroma_intra_body
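
Every deblock entry point gains the same one-line prologue because these
functions do 64-bit address arithmetic on the stride. A sketch of the
failure mode being fixed, under the register-alias assumptions above:

    ; On x32 an intptr_t stride arrives as a signed 32-bit value and the
    ; upper half of the full register is undefined, so without the
    ; explicit extension the lea below would scale garbage high bits
    ; into the address.
    movsxdifnidn r1, r1p    ; x32: movsxd r1, r1d ; elsewhere: no-op
    lea          r4, [r1*3] ; safe: r1 now holds the widened stride
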
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 8e568cf..44dd5a5 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -193,6 +193,9 @@ cextern pd_32
 ;-----------------------------------------------------------------------------
 %macro AVG_WEIGHT 1-2 0
 cglobal pixel_avg_weight_w%1
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
+    movsxdifnidn r5, r5p
     BIWEIGHT_START
     AVG_START %2
 %if HIGH_BIT_DEPTH
@@ -407,6 +410,8 @@ AVG_WEIGHT 16, 7
 
 %macro WEIGHTER 1
 cglobal mc_weight_w%1, 6,6,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3
     WEIGHT_START %1
 .loop:
@@ -482,6 +487,8 @@ WEIGHTER 20
 ;-----------------------------------------------------------------------------
 %macro OFFSET 2
 cglobal mc_offset%2_w%1, 6,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3
     mova m2, [r4]
 %if HIGH_BIT_DEPTH
@@ -528,6 +535,9 @@ OFFSETPN  8
 ;-----------------------------------------------------------------------------
 %macro AVGH 2
 cglobal pixel_avg_%1x%2
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
+    movsxdifnidn r5, r5p
     mov eax, %2
     cmp dword r6m, 32
     jne pixel_avg_weight_w%1 %+ SUFFIX
@@ -545,6 +555,9 @@ cglobal pixel_avg_%1x%2
 
 %macro AVG_FUNC 3
 cglobal pixel_avg_w%1
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
+    movsxdifnidn r5, r5p
     AVG_START
 .height_loop:
 %assign x 0
@@ -652,6 +665,8 @@ AVGH  4,  2
 ;-----------------------------------------------------------------------------
 %macro AVG2_W_ONE 1
 cglobal pixel_avg2_w%1, 6,7,4
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub     r4, r2
     lea     r6, [r4+r3*2]
 .height_loop:
@@ -677,6 +692,8 @@ cglobal pixel_avg2_w%1, 6,7,4
 
 %macro AVG2_W_TWO 3
 cglobal pixel_avg2_w%1, 6,7,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub     r4, r2
     lea     r6, [r4+r3*2]
 .height_loop:
@@ -720,6 +737,8 @@ AVG2_W_TWO 16, movu, mova
 
 INIT_MMX
 cglobal pixel_avg2_w10_mmx2, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub     r4, r2
     lea     r6, [r4+r3*2]
 .height_loop:
@@ -748,6 +767,8 @@ cglobal pixel_avg2_w10_mmx2, 6,7
     RET
 
 cglobal pixel_avg2_w16_mmx2, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub     r4, r2
     lea     r6, [r4+r3*2]
 .height_loop:
@@ -782,6 +803,8 @@ cglobal pixel_avg2_w16_mmx2, 6,7
     RET
 
 cglobal pixel_avg2_w18_mmx2, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub     r4, r2
 .height_loop:
     movu    m0, [r2+ 0]
@@ -807,6 +830,8 @@ cglobal pixel_avg2_w18_mmx2, 6,7
 
 INIT_XMM
 cglobal pixel_avg2_w18_sse2, 6,7,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub     r4, r2
 .height_loop:
     movu    m0, [r2+ 0]
@@ -836,6 +861,8 @@ cglobal pixel_avg2_w18_sse2, 6,7,6
 ;-----------------------------------------------------------------------------
 %macro AVG2_W8 2
 cglobal pixel_avg2_w%1_mmx2, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub    r4, r2
     lea    r6, [r4+r3]
 .height_loop:
@@ -858,6 +885,8 @@ AVG2_W8 8, movq
 
 %macro AVG2_W16 2
 cglobal pixel_avg2_w%1_mmx2, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub    r2, r4
     lea    r6, [r2+r3]
 .height_loop:
@@ -884,6 +913,8 @@ AVG2_W16 12, movd
 AVG2_W16 16, movq
 
 cglobal pixel_avg2_w20_mmx2, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub    r2, r4
     lea    r6, [r2+r3]
 .height_loop:
@@ -912,6 +943,8 @@ cglobal pixel_avg2_w20_mmx2, 6,7
     RET
 
 cglobal pixel_avg2_w16_sse2, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub    r4, r2
     lea    r6, [r4+r3]
 .height_loop:
@@ -931,6 +964,8 @@ cglobal pixel_avg2_w16_sse2, 6,7
 
 %macro AVG2_W20 1
 cglobal pixel_avg2_w20_%1, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     sub    r2, r4
     lea    r6, [r2+r3]
 .height_loop:
@@ -1033,6 +1068,8 @@ pixel_avg2_w%1_cache_mmx2:
 %define cachesplit pixel_avg2_w%1_cache_mmx2
 %endif
 cglobal pixel_avg2_w%1_cache%2_%3
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     mov    eax, r2m
     and    eax, %2-1
     cmp    eax, (%2-%1-(%1 % 8))
@@ -1116,6 +1153,8 @@ avg_w16_align%1_%2_ssse3:
 %endmacro
 
 cglobal pixel_avg2_w16_cache64_ssse3
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
 %if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
     mov   eax, r2m
     and   eax, 0x3f
@@ -1197,6 +1236,8 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal mc_copy_w4_mmx, 4,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3
     cmp dword r4m, 4
     lea     r5, [r3*3]
@@ -1217,6 +1258,8 @@ cglobal mc_copy_w4_mmx, 4,6
 %assign %%w %1*SIZEOF_PIXEL/mmsize
 %if %%w > 0
 cglobal mc_copy_w%1, 5,7,8*(%%w/2)
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3
     lea     r6, [r3*3]
     lea     r5, [r1*3]
@@ -1255,6 +1298,8 @@ MC_COPY 16
 %macro PREFETCH_FENC 1
 %if ARCH_X86_64
 cglobal prefetch_fenc_%1, 5,5
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3
     and    r4d, 3
     mov    eax, r4d
@@ -1319,6 +1364,7 @@ PREFETCH_FENC 422
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
 cglobal prefetch_ref, 3,3
+    movsxdifnidn r1, r1p
     FIX_STRIDES r1
     dec    r2d
     and    r2d, r1d
@@ -1353,7 +1399,7 @@ cglobal prefetch_ref, 3,3
 %else
     PROLOGUE 0,6,%1
 %endif
-    movifnidn r3,  r3mp
+    movifnidn r3p, r3mp
     movifnidn r4d, r4m
     movifnidn r5d, r5m
     movifnidn t0d, r6m
@@ -1402,6 +1448,8 @@ cglobal prefetch_ref, 3,3
 ;-----------------------------------------------------------------------------
 %macro MC_CHROMA 0
 cglobal mc_chroma
+    movsxdifnidn r2, r2p
+    movsxdifnidn r4, r4p
     MC_CHROMA_START 0
     FIX_STRIDES r4
     and       r5d, 7
@@ -1426,8 +1474,8 @@ cglobal mc_chroma
     WIN64_SPILL_XMM 9
 %endif
     movd       m5, t2d
-    movifnidn  r0, r0mp
-    movifnidn  r1, r1mp
+    movifnidn r0p, r0mp
+    movifnidn r1p, r1mp
     movifnidn r2d, r2m
     movifnidn r5d, r8m
     pxor       m6, m6
@@ -1671,8 +1719,8 @@ ALIGN 4
     mova       m4, [pw_8]
     SPLATW     m5, m5
     psubw      m4, m5
-    movifnidn  r0, r0mp
-    movifnidn  r1, r1mp
+    movifnidn r0p, r0mp
+    movifnidn r1p, r1mp
     movifnidn r2d, r2m
     FIX_STRIDES r2
     movifnidn r5d, r8m
@@ -1781,6 +1829,8 @@ ALIGN 4
 
 %macro MC_CHROMA_SSSE3 0
 cglobal mc_chroma
+    movsxdifnidn r2, r2p
+    movsxdifnidn r4, r4p
     MC_CHROMA_START 9
     and       r5d, 7
     and       t2d, 7
@@ -1808,8 +1858,8 @@ cglobal mc_chroma
 %else
     mova       m5, [ch_shuf]
 %endif
-    movifnidn  r0, r0mp
-    movifnidn  r1, r1mp
+    movifnidn r0p, r0mp
+    movifnidn r1p, r1mp
     movifnidn r2d, r2m
     movifnidn r5d, r8m
     SPLATW     m6, m6
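
The r0p/r1p destinations paired with r0mp/r1mp above keep the operand size
of a pointer load matched to the pointer width. A sketch of the presumed
aliases (names and the PTR_SIZE constant assumed from patch 1/3):

    %if ARCH_X86_64 && PTR_SIZE == 4   ; x32: 64-bit GPRs, 32-bit pointers
        %define r0p r0d                ; dword view of r0
    %else
        %define r0p r0                 ; pointer width == register width
    %endif

A dword mov such as "mov r0d, [mem]" zero-extends into the full register,
which is exactly right for a 32-bit pointer that will later be used in
64-bit addressing.
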
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 86769c5..7b93459 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -159,6 +159,8 @@ cextern pd_ffff
 ;-----------------------------------------------------------------------------
 %macro HPEL_FILTER 0
 cglobal hpel_filter_v, 5,6,11
+    movsxdifnidn r3, r3p
+    movsxdifnidn r4, r4p
     FIX_STRIDES r3, r4
     lea        r5, [r1+r3]
     sub        r1, r3
@@ -216,6 +218,7 @@ cglobal hpel_filter_v, 5,6,11
 ; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
 ;-----------------------------------------------------------------------------
 cglobal hpel_filter_c, 3,3,10
+    movsxdifnidn r2, r2p
     add        r2, r2
     add        r0, r2
     add        r1, r2
@@ -265,6 +268,7 @@ cglobal hpel_filter_c, 3,3,10
 ; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
 ;-----------------------------------------------------------------------------
 cglobal hpel_filter_h, 3,4,8
+    movsxdifnidn r2, r2p
     %define src r1+r2
     add        r2, r2
     add        r0, r2
@@ -317,6 +321,8 @@ HPEL_FILTER
 ; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
 ;-----------------------------------------------------------------------------
 cglobal hpel_filter_v, 5,6,%1
+    movsxdifnidn r3, r3p
+    movsxdifnidn r4, r4p
     lea r5, [r1+r3]
     sub r1, r3
     sub r1, r3
@@ -373,6 +379,7 @@ cglobal hpel_filter_v, 5,6,%1
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal hpel_filter_c_mmx2, 3,3
+    movsxdifnidn r2, r2p
     add r0, r2
     lea r1, [r1+r2*2]
     neg r2
@@ -402,6 +409,7 @@ cglobal hpel_filter_c_mmx2, 3,3
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
 ;-----------------------------------------------------------------------------
 cglobal hpel_filter_h_mmx2, 3,3
+    movsxdifnidn r2, r2p
     add r0, r2
     add r1, r2
     neg r2
@@ -449,6 +457,7 @@ INIT_XMM
 ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
 ;-----------------------------------------------------------------------------
 cglobal hpel_filter_c, 3,3,9
+    movsxdifnidn r2, r2p
     add r0, r2
     lea r1, [r1+r2*2]
     neg r2
@@ -517,6 +526,7 @@ cglobal hpel_filter_c, 3,3,9
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
 ;-----------------------------------------------------------------------------
 cglobal hpel_filter_h_sse2, 3,3,8
+    movsxdifnidn r2, r2p
     add r0, r2
     add r1, r2
     neg r2
@@ -566,6 +576,7 @@ cglobal hpel_filter_h_sse2, 3,3,8
 ;-----------------------------------------------------------------------------
 %macro HPEL_H 0
 cglobal hpel_filter_h, 3,3
+    movsxdifnidn r2, r2p
     add r0, r2
     add r1, r2
     neg r2
@@ -736,6 +747,7 @@ HPEL_H
 ;                   uint8_t *src, intptr_t stride, int width, int height )
 ;-----------------------------------------------------------------------------
 cglobal hpel_filter, 7,9,16
+    movsxdifnidn r4, r4p
     mov       r7, r3
     sub      r5d, 16
     mov       r8, r1
@@ -811,6 +823,8 @@ HPEL
 ; assumes i_dst and w are multiples of 16, and i_dst>w
 INIT_MMX
 cglobal plane_copy_core_mmx2, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3, r4d
 %if HIGH_BIT_DEPTH == 0
     movsxdifnidn r4, r4d
@@ -854,8 +868,8 @@ cglobal plane_copy_core_mmx2, 6,7
     sub    r6d, 16
     jg .loop16
 .end16:
-    add    r0, r1
-    add    r2, r3
+    add    r0p, r1p
+    add    r2p, r3p
     dec    r5d
     jg .loopy
     sfence
@@ -953,11 +967,14 @@ cglobal plane_copy_core_mmx2, 6,7
 ;-----------------------------------------------------------------------------
 ; assumes i_dst and w are multiples of 16, and i_dst>2*w
 cglobal plane_copy_interleave_core, 6,9
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
+    movsxdifnidn r5, r5p
     mov   r6d, r6m
 %if HIGH_BIT_DEPTH
     FIX_STRIDES r1, r3, r5, r6d
-    movifnidn r1mp, r1
-    movifnidn r3mp, r3
+    movifnidn r1mp, r1p
+    movifnidn r3mp, r3p
     mov  r6m, r6d
 %endif
     lea    r0, [r0+r6*2]
@@ -968,7 +985,7 @@ cglobal plane_copy_interleave_core, 6,9
 %else
     DECLARE_REG_TMP 1,3
 %endif
-    mov  t1, r1
+    mov  t1p, r1p
     shr  t1, SIZEOF_PIXEL
     sub  t1, r6
     mov  t0d, r7m
@@ -1002,11 +1019,11 @@ cglobal plane_copy_interleave_core, 6,9
     %assign n n+32
 %endrep
     add    r6, 16*SIZEOF_PIXEL
-    cmp    r6, t1
+    cmp   r6p, t1p
     jl .pad
-    add    r0, r1mp
-    add    r2, r3mp
-    add    r4, r5
+    add   r0p, r1mp
+    add   r2p, r3mp
+    add   r4p, r5p
     dec    t0d
     jg .loopy
     sfence
@@ -1017,6 +1034,7 @@ cglobal plane_copy_interleave_core, 6,9
 ; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
 ;-----------------------------------------------------------------------------
 cglobal store_interleave_chroma, 5,5
+    movsxdifnidn r1, r1p
     FIX_STRIDES r1
 .loop:
     INTERLEAVE r0+ 0, r2+           0, r3+           0, a
@@ -1046,6 +1064,9 @@ cglobal store_interleave_chroma, 5,5
 ;                               pixel *src,  intptr_t i_src, int w, int h )
 ;-----------------------------------------------------------------------------
 cglobal plane_copy_deinterleave, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
+    movsxdifnidn r5, r5p
     DEINTERLEAVE_START
     mov    r6d, r6m
     FIX_STRIDES r1, r3, r5, r6d
@@ -1074,6 +1095,7 @@ cglobal plane_copy_deinterleave, 6,7
 ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
 ;-----------------------------------------------------------------------------
 cglobal load_deinterleave_chroma_fenc, 4,4
+    movsxdifnidn r2, r2p
     DEINTERLEAVE_START
     FIX_STRIDES r2
 .loop:
@@ -1089,6 +1111,7 @@ cglobal load_deinterleave_chroma_fenc, 4,4
 ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
 ;-----------------------------------------------------------------------------
 cglobal load_deinterleave_chroma_fdec, 4,4
+    movsxdifnidn r2, r2p
     DEINTERLEAVE_START
     FIX_STRIDES r2
 .loop:
@@ -1133,6 +1156,7 @@ PLANE_DEINTERLEAVE
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal memcpy_aligned_mmx, 3,3
+IFNIDN r2, r2p, mov r2p, r2p
     test r2d, 16
     jz .copy32start
     movq mm0, [r1 + r2 - 16]
@@ -1161,6 +1185,7 @@ cglobal memcpy_aligned_mmx, 3,3
 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
 ;-----------------------------------------------------------------------------
 cglobal memcpy_aligned_sse2, 3,3
+IFNIDN r2, r2p, mov r2p, r2p
     test r2d, 16
     jz .copy32
     movdqa xmm0, [r1 + r2 - 16]
@@ -1196,6 +1221,7 @@ cglobal memcpy_aligned_sse2, 3,3
 ;-----------------------------------------------------------------------------
 %macro MEMZERO 0
 cglobal memzero_aligned, 2,2
+IFNIDN r2, r2p, mov r2p, r2p
     add  r0, r1
     neg  r1
     pxor m0, m0
@@ -1223,6 +1249,7 @@ MEMZERO
 ;-----------------------------------------------------------------------------
 INIT_XMM
 cglobal integral_init4h_sse4, 3,4
+    movsxdifnidn r2, r2p
     lea     r3, [r0+r2*2]
     add     r1, r2
     neg     r2
@@ -1243,6 +1270,7 @@ cglobal integral_init4h_sse4, 3,4
 
 %macro INTEGRAL_INIT8H 0
 cglobal integral_init8h, 3,4
+    movsxdifnidn r2, r2p
     lea     r3, [r0+r2*2]
     add     r1, r2
     neg     r2
@@ -1277,6 +1305,7 @@ INTEGRAL_INIT8H
 ; void integral_init8v( uint16_t *sum8, intptr_t stride )
 ;-----------------------------------------------------------------------------
 cglobal integral_init8v, 3,3
+    movsxdifnidn r1, r1p
     shl   r1, 1
     add   r0, r1
     lea   r2, [r0+r1*8]
@@ -1303,6 +1332,7 @@ INTEGRAL_INIT_8V
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal integral_init4v_mmx, 3,5
+    movsxdifnidn r2, r2p
     shl   r2, 1
     lea   r3, [r0+r2*4]
     lea   r4, [r0+r2*8]
@@ -1325,6 +1355,7 @@ cglobal integral_init4v_mmx, 3,5
 
 INIT_XMM
 cglobal integral_init4v_sse2, 3,5
+    movsxdifnidn r2, r2p
     shl     r2, 1
     add     r0, r2
     add     r1, r2
@@ -1350,6 +1381,7 @@ cglobal integral_init4v_sse2, 3,5
     RET
 
 cglobal integral_init4v_ssse3, 3,5
+    movsxdifnidn r2, r2p
     shl     r2, 1
     add     r0, r2
     add     r1, r2
@@ -1492,6 +1524,8 @@ cglobal integral_init4v_ssse3, 3,5
 ;-----------------------------------------------------------------------------
 %macro FRAME_INIT_LOWRES 0
 cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+    movsxdifnidn r5, r5p
+    movsxdifnidn r6, r6p
 %if HIGH_BIT_DEPTH
     shl   dword r6m, 1
     FIX_STRIDES r5
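
The bare IFNIDN lines in memcpy_aligned/memzero_aligned above handle the
one widened argument that is unsigned: on x32, a size_t count wants zero
extension, not the sign extension used for intptr_t strides. Presumed
expansion on x32 (where r2p is r2d); on other arches the operands are
identical and IFNIDN emits nothing:

    mov r2d, r2d    ; writing the dword view of r2 clears bits 63:32
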
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index d5d3e90..6a582b5 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -148,6 +148,8 @@ cextern hsub_mul
 ;-----------------------------------------------------------------------------
 %macro SSD_ONE 2
 cglobal pixel_ssd_%1x%2, 4,5,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     mov     r4, %1*%2/mmsize
     pxor    m0, m0
 .loop
@@ -178,6 +180,8 @@ cglobal pixel_ssd_%1x%2, 4,5,6
 
 %macro SSD_16_MMX 2
 cglobal pixel_ssd_%1x%2, 4,5
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     mov     r4, %1*%2/mmsize/2
     pxor    m0, m0
 .loop
@@ -370,6 +374,8 @@ SSD_ONE    16, 16
     %assign function_align 16
 %endif
 cglobal pixel_ssd_%1x%2, 0,0,0
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     mov     al, %1*%2/mmsize/2
 
 %if %1 != %2
@@ -480,6 +486,8 @@ SSD  8,  4
 %if HIGH_BIT_DEPTH
 %macro SSD_NV12 0
 cglobal pixel_ssd_nv12_core, 6,7,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     shl        r4d, 2
     FIX_STRIDES r1, r3
     add         r0, r4
@@ -569,6 +577,8 @@ cglobal pixel_ssd_nv12_core, 6,7,7
 ;-----------------------------------------------------------------------------
 %macro SSD_NV12 0
 cglobal pixel_ssd_nv12_core, 6,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     shl    r4d, 1
     add     r0, r4
     add     r2, r4
@@ -701,18 +711,21 @@ SSD_NV12
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
 cglobal pixel_var_16x16, 2,3
+    movsxdifnidn r1, r1p
     FIX_STRIDES r1
     VAR_START 0
     VAR_2ROW 8*SIZEOF_PIXEL, 16
     VAR_END 16, 16
 
 cglobal pixel_var_8x16, 2,3
+    movsxdifnidn r1, r1p
     FIX_STRIDES r1
     VAR_START 0
     VAR_2ROW r1, 8
     VAR_END 8, 16
 
 cglobal pixel_var_8x8, 2,3
+    movsxdifnidn r1, r1p
     FIX_STRIDES r1
     VAR_START 0
     VAR_2ROW r1, 4
@@ -721,12 +734,14 @@ cglobal pixel_var_8x8, 2,3
 %if HIGH_BIT_DEPTH
 %macro VAR 0
 cglobal pixel_var_16x16, 2,3,8
+    movsxdifnidn r1, r1p
     FIX_STRIDES r1
     VAR_START 0
     VAR_2ROW r1, 8
     VAR_END 16, 16
 
 cglobal pixel_var_8x8, 2,3,8
+    movsxdifnidn r1, r1p
     lea       r2, [r1*3]
     VAR_START 0
     mova      m0, [r0]
@@ -754,6 +769,7 @@ VAR
 %if HIGH_BIT_DEPTH == 0
 %macro VAR 0
 cglobal pixel_var_16x16, 2,3,8
+    movsxdifnidn r1, r1p
     VAR_START 1
     mov      r2d, 8
 .loop:
@@ -767,6 +783,7 @@ cglobal pixel_var_16x16, 2,3,8
     VAR_END 16, 16
 
 cglobal pixel_var_8x8, 2,4,8
+    movsxdifnidn r1, r1p
     VAR_START 1
     mov      r2d, 2
     lea       r3, [r1*3]
@@ -783,6 +800,7 @@ cglobal pixel_var_8x8, 2,4,8
     VAR_END 8, 8
 
 cglobal pixel_var_8x16, 2,4,8
+    movsxdifnidn r1, r1p
     VAR_START 1
     mov      r2d, 4
     lea       r3, [r1*3]
@@ -824,6 +842,8 @@ VAR
 ;-----------------------------------------------------------------------------
 %macro VAR2_8x8_MMX 2
 cglobal pixel_var2_8x%1, 5,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3
     VAR_START 0
     mov      r5d, %1
@@ -866,6 +886,8 @@ VAR2_8x8_MMX 16, 7
 
 %macro VAR2_8x8_SSE2 2
 cglobal pixel_var2_8x%1, 5,6,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     VAR_START 1
     mov      r5d, %1/2
 .loop:
@@ -903,6 +925,8 @@ VAR2_8x8_SSE2 16, 7
 %if HIGH_BIT_DEPTH == 0
 %macro VAR2_8x8_SSSE3 2
 cglobal pixel_var2_8x%1, 5,6,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     pxor      m5, m5    ; sum
     pxor      m6, m6    ; sum squared
     mova      m7, [hsub_mul]
@@ -1157,6 +1181,8 @@ pixel_satd_8x4_internal_mmx2:
 %if HIGH_BIT_DEPTH
 %macro SATD_MxN_MMX 3
 cglobal pixel_satd_%1x%2, 4,7
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     pxor   m0, m0
     call pixel_satd_%1x%3_internal_mmx2
@@ -1184,6 +1210,8 @@ SATD_MxN_MMX  8, 16, 8
 
 %if HIGH_BIT_DEPTH == 0
 cglobal pixel_satd_16x16, 4,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     pxor   m0, m0
 %rep 3
@@ -1197,6 +1225,8 @@ cglobal pixel_satd_16x16, 4,6
     RET
 
 cglobal pixel_satd_16x8, 4,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     pxor   m0, m0
     call pixel_satd_16x4_internal_mmx2
@@ -1206,6 +1236,8 @@ cglobal pixel_satd_16x8, 4,6
     SATD_END_MMX
 
 cglobal pixel_satd_8x16, 4,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     pxor   m0, m0
     call pixel_satd_8x8_internal_mmx2
@@ -1216,18 +1248,24 @@ cglobal pixel_satd_8x16, 4,6
 %endif ; !HIGH_BIT_DEPTH
 
 cglobal pixel_satd_8x8, 4,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     pxor   m0, m0
     call pixel_satd_8x8_internal_mmx2
     SATD_END_MMX
 
 cglobal pixel_satd_8x4, 4,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     pxor   m0, m0
     call pixel_satd_8x4_internal_mmx2
     SATD_END_MMX
 
 cglobal pixel_satd_4x16, 4,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     SATD_4x4_MMX m0, 0, 1
     SATD_4x4_MMX m1, 0, 1
@@ -1239,6 +1277,8 @@ cglobal pixel_satd_4x16, 4,6
     SATD_END_MMX
 
 cglobal pixel_satd_4x8, 4,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     SATD_4x4_MMX m0, 0, 1
     SATD_4x4_MMX m1, 0, 0
@@ -1246,6 +1286,8 @@ cglobal pixel_satd_4x8, 4,6
     SATD_END_MMX
 
 cglobal pixel_satd_4x4, 4,6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     SATD_4x4_MMX m0, 0, 0
     SATD_END_MMX
@@ -1340,6 +1382,8 @@ cglobal pixel_satd_4x4, 4,6
 %macro SATDS_SSE2 0
 %if cpuflag(ssse3)
 cglobal pixel_satd_4x4, 4, 6, 6
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
     mova m4, [hmul_4p]
     LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
@@ -1356,6 +1400,8 @@ cglobal pixel_satd_4x4, 4, 6, 6
 %endif
 
 cglobal pixel_satd_4x8, 4, 6, 8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
 %if cpuflag(ssse3)
     mova m7, [hmul_4p]
@@ -1366,6 +1412,8 @@ cglobal pixel_satd_4x8, 4, 6, 8
     RET
 
 cglobal pixel_satd_4x16, 4, 6, 8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_MMX
 %if cpuflag(ssse3)
     mova m7, [hmul_4p]
@@ -1397,6 +1445,8 @@ cglobal pixel_satd_16x4_internal
     ret
 
 cglobal pixel_satd_16x8, 4,6,12
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_SSE2 m10, m7
 %if notcpuflag(ssse3)
     mova m7, [pw_00ff]
@@ -1404,6 +1454,8 @@ cglobal pixel_satd_16x8, 4,6,12
     jmp %%pixel_satd_16x8_internal
 
 cglobal pixel_satd_16x16, 4,6,12
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_SSE2 m10, m7
 %if notcpuflag(ssse3)
     mova m7, [pw_00ff]
@@ -1416,6 +1468,8 @@ cglobal pixel_satd_16x16, 4,6,12
     SATD_END_SSE2 m10
 %else
 cglobal pixel_satd_16x8, 4,6,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_SSE2 m6, m7
     BACKUP_POINTERS
     call pixel_satd_8x8_internal
@@ -1424,6 +1478,8 @@ cglobal pixel_satd_16x8, 4,6,8
     SATD_END_SSE2 m6
 
 cglobal pixel_satd_16x16, 4,6,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_SSE2 m6, m7
     BACKUP_POINTERS
     call pixel_satd_8x8_internal
@@ -1435,17 +1491,23 @@ cglobal pixel_satd_16x16, 4,6,8
 %endif
 
 cglobal pixel_satd_8x16, 4,6,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_SSE2 m6, m7
     call pixel_satd_8x8_internal
     call pixel_satd_8x8_internal
     SATD_END_SSE2 m6
 
 cglobal pixel_satd_8x8, 4,6,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_SSE2 m6, m7
     call pixel_satd_8x8_internal
     SATD_END_SSE2 m6
 
 cglobal pixel_satd_8x4, 4,6,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SATD_START_SSE2 m6, m7
     call %%pixel_satd_8x4_internal
     SATD_END_SSE2 m6
@@ -1479,6 +1541,8 @@ cglobal pixel_satd_8x4, 4,6,8
 ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sa8d_8x8_internal
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     lea  r6, [r0+4*r1]
     lea  r7, [r2+4*r3]
     LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
@@ -1495,6 +1559,8 @@ cglobal pixel_sa8d_8x8_internal
     ret
 
 cglobal pixel_sa8d_8x8, 4,8,12
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3
     lea  r4, [3*r1]
     lea  r5, [3*r3]
@@ -1513,6 +1579,8 @@ cglobal pixel_sa8d_8x8, 4,8,12
     RET
 
 cglobal pixel_sa8d_16x16, 4,8,12
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3
     lea  r4, [3*r1]
     lea  r5, [3*r3]
@@ -1958,7 +2026,7 @@ cglobal intra_satd_x3_16x16, 0,5
 %define sums    rsp+64 ; size 56
 %define top_1d  rsp+32 ; size 32
 %define left_1d rsp    ; size 32
-    movifnidn   r1,  r1mp
+    movifnidn  r1p, r1mp
 
     pxor        m7, m7
     mova [sums+ 0], m7
@@ -1985,7 +2053,7 @@ cglobal intra_satd_x3_16x16, 0,5
     pand        m6, [sw_f0] ; dc
 
     ; 2D hadamards
-    movifnidn   r0, r0mp
+    movifnidn  r0p, r0mp
     mov         r3, -4
 .loop_y:
     mov         r4, -4
@@ -2035,7 +2103,7 @@ cglobal intra_satd_x3_16x16, 0,5
     jl  .loop_y
 
 ; horizontal sum
-    movifnidn   r2, r2mp
+    movifnidn  r2p, r2mp
 %if HIGH_BIT_DEPTH
     mova        m1, m5
     paddd       m5, m3
@@ -2079,7 +2147,7 @@ cglobal intra_satd_x3_8x8c, 0,6
 %define  dc_1d   rsp+32 ; size 16
 %define  top_1d  rsp+16 ; size 16
 %define  left_1d rsp    ; size 16
-    movifnidn   r1,  r1mp
+    movifnidn  r1p,  r1mp
     pxor        m7, m7
     mova [sums+ 0], m7
     mova [sums+ 8], m7
@@ -2115,8 +2183,8 @@ cglobal intra_satd_x3_8x8c, 0,6
     lea         r5, [dc_1d]
 
     ; 2D hadamards
-    movifnidn   r0,  r0mp
-    movifnidn   r2,  r2mp
+    movifnidn  r0p,  r0mp
+    movifnidn  r2p,  r2mp
     mov         r3,  -2
 .loop_y:
     mov         r4,  -2
@@ -3490,6 +3558,7 @@ cglobal hadamard_ac_8x8
 
 %macro HADAMARD_AC_WXH_MMX 2
 cglobal pixel_hadamard_ac_%1x%2, 2,4
+    movsxdifnidn r1, r1p
     %assign pad 16-gprsize-(stack_offset&15)
     %define ysub r1
     FIX_STRIDES r1
@@ -3719,6 +3788,7 @@ HADAMARD_AC_WXH_SSE2  8,  8
 ; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
 %macro HADAMARD_AC_WXH_SSE2 2
 cglobal pixel_hadamard_ac_%1x%2, 2,3,11
+    movsxdifnidn r1, r1p
     %assign pad 16-gprsize-(stack_offset&15)
     %define ysub r1
     FIX_STRIDES r1
@@ -3875,6 +3945,8 @@ HADAMARD_AC_SSE2
 
 %macro SSIM 0
 cglobal pixel_ssim_4x4x2_core, 4,4,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     FIX_STRIDES r1, r3
     pxor      m0, m0
     SSIM_ITER 0
@@ -4006,6 +4078,8 @@ SSIM
 ;-----------------------------------------------------------------------------
 %macro ASD8 0
 cglobal pixel_asd8, 5,5
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     pxor     m0, m0
     pxor     m1, m1
 .loop:
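
The intra_satd functions declare zero register arguments, so their
pointers come from argument slots via rNmp. On x32 that memory operand is
only 32 bits wide, hence the switch to matching rNp destinations above.
Roughly, under the same alias assumptions:

    ; x32:  mov r1d, dword [stack slot]  (zero-extends into full r1)
    ; x64:  mov r1, qword [stack slot], or nothing at all if the
    ;       argument is already sitting in r1
    movifnidn r1p, r1mp
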
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 183f16a..aa0e7de 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -508,7 +508,7 @@ cglobal dequant_%1x%1_flat16, 0,3
 %else
     lea  r1, [dequant%1_scale + t2]
 %endif
-    movifnidn r0, r0mp
+    movifnidn r0p, r0mp
     movd m4, t0d
 %if %1 == 4
 %if mmsize == 8
@@ -650,7 +650,7 @@ DEQUANT_DC w, pmullw
     %assign %%regs %%regs+1      ; t0-t4 are volatile on x86-64
 %endif
 cglobal optimize_chroma_2x2_dc, 0,%%regs,7
-    movifnidn t0, r0mp
+    movifnidn t0p, r0mp
     movd      m2, r1m
     movq      m1, [t0]
 %if cpuflag(sse4)
@@ -1343,8 +1343,8 @@ COEFF_LAST
 
 %macro COEFF_LEVELRUN 1
 cglobal coeff_level_run%1,0,7
-    movifnidn t0, r0mp
-    movifnidn t1, r1mp
+    movifnidn t0p, r0mp
+    movifnidn t1p, r1mp
     pxor    m2, m2
     LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
 %if %1==15
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 5723199..eb7dd13 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -84,6 +84,8 @@ cextern sw_64
 ;-----------------------------------------------------------------------------
 %macro SAD 2
 cglobal pixel_sad_%1x%2_mmx2, 4,4
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     pxor    mm0, mm0
 %rep %2/2
     SAD_INC_2x%1P
@@ -119,6 +121,8 @@ SAD  4,  4
 ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sad_16x16, 4,4,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     movu    m0, [r2]
     movu    m1, [r2+r3]
     lea     r2, [r2+2*r3]
@@ -186,6 +190,8 @@ cglobal pixel_sad_16x16, 4,4,8
 ; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sad_16x8, 4,4
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     movu    m0, [r2]
     movu    m2, [r2+r3]
     lea     r2, [r2+2*r3]
@@ -249,6 +255,8 @@ SAD_W16
 INIT_XMM
 ;Even on Nehalem, no sizes other than 8x16 benefit from this method.
 cglobal pixel_sad_8x16_sse2, 4,4
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SAD_INC_4x8P_SSE 0
     SAD_INC_4x8P_SSE 1
     SAD_INC_4x8P_SSE 1
@@ -263,6 +271,7 @@ cglobal pixel_sad_8x16_sse2, 4,4
 %if ARCH_X86_64 == 0
 INIT_MMX
 cglobal pixel_vsad_mmx2, 3,3
+    movsxdifnidn r1, r1p
     mova      m0, [r0]
     mova      m1, [r0+8]
     mova      m2, [r0+r1]
@@ -299,6 +308,7 @@ cglobal pixel_vsad_mmx2, 3,3
 
 INIT_XMM
 cglobal pixel_vsad_sse2, 3,3
+    movsxdifnidn r1, r1p
     mova      m0, [r0]
     mova      m1, [r0+r1]
     lea       r0, [r0+r1*2]
@@ -857,7 +867,7 @@ INTRA_SAD16
 %endmacro
 
 %macro SAD_X4_END 0
-    mov     r0, r6mp
+    mov     r0p, r6mp
     movd    [r0+0], mm0
     movd    [r0+4], mm1
     movd    [r0+8], mm2
@@ -871,6 +881,7 @@ INTRA_SAD16
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
 cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
+IF %1 == 3, movsxdifnidn, {r4, r4p}, {r5, r5p}
     SAD_X%1_2x%2P 1
 %rep %3/2-1
     SAD_X%1_2x%2P 0
@@ -1170,7 +1181,7 @@ SAD_X 4,  4,  4
 %endmacro
 
 %macro SAD_X4_END_SSE2 0
-    mov       r0, r6mp
+    mov      r0p, r6mp
     psllq   xmm1, 32
     psllq   xmm3, 32
     paddw   xmm0, xmm1
@@ -1190,6 +1201,7 @@ SAD_X 4,  4,  4
 ;-----------------------------------------------------------------------------
 %macro SAD_X_SSE2 3
 cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
+IF %1 == 3, movsxdifnidn, {r4, r4p}, {r5, r5p}
     SAD_X%1_2x%2P_SSE2 1
 %rep %3/2-1
     SAD_X%1_2x%2P_SSE2 0
@@ -1293,6 +1305,8 @@ sad_w16_align%1_ssse3:
 
 %macro SAD16_CACHELINE_FUNC 2 ; cpu, height
 cglobal pixel_sad_16x%2_cache64_%1
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     mov     eax, r2m
     and     eax, 0x37
     cmp     eax, 0x30
@@ -1341,6 +1355,8 @@ cglobal pixel_sad_16x%2_cache64_%1
 
 %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
 cglobal pixel_sad_16x%1_cache%2_mmx2
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SAD_CACHELINE_START_MMX2 16, %1, %1, %2
 .loop:
     movq   mm1, [r2]
@@ -1367,6 +1383,8 @@ cglobal pixel_sad_16x%1_cache%2_mmx2
 
 %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
 cglobal pixel_sad_8x%1_cache%2_mmx2
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
 .loop:
     movq   mm1, [r2+8]
@@ -1403,6 +1421,7 @@ cglobal pixel_sad_8x%1_cache%2_mmx2
 
 %macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
 cglobal pixel_sad_x3_%1x%2_cache%3_%6
+    movsxdifnidn r4, r4p
     CHECK_SPLIT r1m, %1, %3
     CHECK_SPLIT r2m, %1, %3
     CHECK_SPLIT r3m, %1, %3
@@ -1468,6 +1487,7 @@ cglobal pixel_sad_x3_%1x%2_cache%3_%6
 
 %macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
 cglobal pixel_sad_x4_%1x%2_cache%3_%6
+    movsxdifnidn r5, r5p
     CHECK_SPLIT r1m, %1, %3
     CHECK_SPLIT r2m, %1, %3
     CHECK_SPLIT r3m, %1, %3
@@ -1476,7 +1496,7 @@ cglobal pixel_sad_x4_%1x%2_cache%3_%6
 .split:
 %if ARCH_X86_64
     PROLOGUE 6,9
-    mov  r8,  r6mp
+    mov  r8p, r6mp
     push r4
     push r3
     push r2
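
The single-line IF invocations in SAD_X and SAD_X_SSE2 pick which register
holds the stride: the x3 variants take it in r4, the x4 variants in r5.
The IF macro itself is presumably from patch 1/3; a sketch of a definition
that would satisfy those call sites (the braces keep each operand pair
together as one macro parameter and are stripped on expansion):

    %macro IF 4    ; condition, instruction, {ops-if-true}, {ops-if-false}
        %if %1
            %2 %3
        %else
            %2 %4
        %endif
    %endmacro
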
diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index 8e3dba7..fe67d3b 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -91,6 +91,8 @@ cextern pw_8
 ;-----------------------------------------------------------------------------
 %macro SAD_MMX 3
 cglobal pixel_sad_%1x%2, 4,4
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     pxor    m0, m0
 %rep %2/%3
     SAD_INC_%3x%1P_MMX
@@ -156,6 +158,8 @@ SAD_MMX  4,  4, 2
 ;-----------------------------------------------------------------------------
 %macro SAD_XMM 2
 cglobal pixel_sad_%1x%2, 4,4,8
+    movsxdifnidn r1, r1p
+    movsxdifnidn r3, r3p
     pxor    m0, m0
 %rep %2/2
     SAD_INC_2x%1P_XMM
@@ -241,7 +245,7 @@ SAD_XMM  8,  8
     movd [r5+4], m1
     movd [r5+8], m2
 %else
-    mov      r0, r5mp
+    mov     r0p, r5mp
     movd [r0+0], m0
     movd [r0+4], m1
     movd [r0+8], m2
@@ -332,7 +336,7 @@ SAD_XMM  8,  8
     HADDW     m2, m6
     HADDW     m3, m7
 %endif
-    mov       r0, r6mp
+    mov      r0p, r6mp
     movd [r0+ 0], m0
     movd [r0+ 4], m1
     movd [r0+ 8], m2
@@ -406,6 +410,7 @@ PIXEL_VSAD
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
 cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
+    movsxdifnidn r4, r4p
     %assign regnum %1+1
     %xdefine STRIDE r %+ regnum
     mov     r6, %3/2-1
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index 7b39d93..76fa9c3 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -95,13 +95,13 @@ cglobal checkasm_call, 2,15,16
 
     ; All arguments have been pushed on the stack instead of registers in order to
     ; test for incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
-    mov  r0, r6mp
-    mov  r1, r7mp
-    mov  r2, r8mp
-    mov  r3, r9mp
+    mov  r0, r6mq
+    mov  r1, r7mq
+    mov  r2, r8mq
+    mov  r3, r9mq
 %if UNIX64
-    mov  r4, r10mp
-    mov  r5, r11mp
+    mov  r4, r10mq
+    mov  r5, r11mq
     %assign i 6
     %rep max_args-6
         mov  r9, [rsp+stack_offset+(i+1)*8]
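
The mp-to-mq change is the one place where a pointer-sized load would be
wrong: checkasm deliberately fills the argument slots with full 64-bit
clobber patterns, so the reads must always be qword. Presumed suffix
semantics, with r6mq assumed to be added by patch 1/3:

    ; r6mp: pointer-sized stack argument -> dword on x32, qword on
    ;       x86-64, i.e. on x32 it would read back only half of the
    ;       clobber pattern.
    ; r6mq: unconditionally a qword stack argument, matching what
    ;       x264_checkasm_stack_clobber wrote.
    mov  r0, r6mq
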
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 89d444e..756250a 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -218,10 +218,21 @@ intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
  * detect all functions that assumes zero-extension.
  */
 void x264_checkasm_stack_clobber( uint64_t clobber, ... );
+#if ARCH_X86_64_X64
 #define call_a1(func,...) ({ \
     uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
     x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
     x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
+#else
+/* If passed in registers, the high-order bits of pointer arguments are zero.
+ * The call wrapper would have to know which arguments are pointers in order to
+ * load them correctly in the presence of an otherwise clobbered stack. We can
+ * still use the clobber function to check ordinary stack-based arguments. */
+#define call_a1(func,...) ({ \
+    uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
+    x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
+    func( __VA_ARGS__ ); })
+#endif
 #elif ARCH_X86
 #define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
 #else
-- 
1.8.1.1


