[x265] [PATCH] asm: fix for 32-bit build satd overflow issue

Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
Wed Jan 29 14:16:56 CET 2014


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1391001289 -19800
#      Wed Jan 29 18:44:49 2014 +0530
# Branch stable
# Node ID 86743912a5b0459645e5aeccd1c35313e3f0af58
# Parent  d6091cb46ae1afeeec40d247d5d5247f26e3372c
asm: fix for 32-bit build satd overflow issue.

diff -r d6091cb46ae1 -r 86743912a5b0 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Jan 29 12:05:06 2014 +0530
+++ b/source/common/x86/pixel-a.asm	Wed Jan 29 18:44:49 2014 +0530
@@ -626,23 +626,17 @@
     movd eax, m7
     RET
 
-cglobal pixel_satd_8x8_internal
-    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
-    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
-%%pixel_satd_8x4_internal:
-    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
-    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
-    ret
-
 cglobal pixel_satd_8x8_internal2
 %if WIN64
     LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
     SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
+%%pixel_satd_8x4_internal2:
     LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
     SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
 %else
     LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
     SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
+%%pixel_satd_8x4_internal2:
     LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
     SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
 %endif
@@ -652,15 +646,6 @@
 ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
 %if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)
 
-cglobal pixel_satd_16x4_internal
-    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
-    lea  r2, [r2+4*r3]
-    lea  r0, [r0+4*r1]
-    ; always use horizontal mode here
-    SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
-    SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
-    ret
-
 cglobal pixel_satd_16x4_internal2
     LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
     lea  r2, [r2+4*r3]
@@ -1250,20 +1235,26 @@
     SATD_START_SSE2 m6, m7
     mov r6, r0
     mov [rsp], r2
-    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 8*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 8*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 16*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 16*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 24*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 24*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %endif
 
 %if WIN64
@@ -1297,24 +1288,30 @@
     SATD_START_SSE2 m6, m7
     mov r6, r0
     mov [rsp], r2
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 8*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 8*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 16*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 16*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 24*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 24*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %endif
 
 %if WIN64
@@ -1352,34 +1349,34 @@
     SATD_START_SSE2 m6, m7
     mov r6, r0
     mov [rsp], r2
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-%if HIGH_BIT_DEPTH
-    pxor       m7, m7
-%endif
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 8*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 8*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 16*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 16*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 24*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 24*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %endif
 
 %if WIN64
@@ -1755,54 +1752,50 @@
     SATD_START_SSE2 m6, m7
     mov r6, r0
     mov [rsp], r2
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-%if HIGH_BIT_DEPTH
-    pxor       m7, m7
-%endif
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 8*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2,8*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 16*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2,16*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 24*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2,24*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 32*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2,32*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 40*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2,40*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 48*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2,48*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 56*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2,56*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %endif
 
 %if WIN64
@@ -2279,78 +2272,110 @@
 cglobal pixel_satd_16x4, 4,6,8
     SATD_START_SSE2 m6, m7
     BACKUP_POINTERS
-    call %%pixel_satd_8x4_internal
+    call %%pixel_satd_8x4_internal2
     RESTORE_AND_INC_POINTERS
-    call %%pixel_satd_8x4_internal
-    SATD_END_SSE2 m6
+    call %%pixel_satd_8x4_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 
 cglobal pixel_satd_16x8, 4,6,8
     SATD_START_SSE2 m6, m7
     BACKUP_POINTERS
-    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal2
     RESTORE_AND_INC_POINTERS
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 
 cglobal pixel_satd_16x12, 4,6,8
     SATD_START_SSE2 m6, m7, 1
     BACKUP_POINTERS
-    call pixel_satd_8x8_internal
-    call %%pixel_satd_8x4_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call %%pixel_satd_8x4_internal2
     RESTORE_AND_INC_POINTERS
-    call pixel_satd_8x8_internal
-    call %%pixel_satd_8x4_internal
-    SATD_END_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    call %%pixel_satd_8x4_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 
 cglobal pixel_satd_16x16, 4,6,8
     SATD_START_SSE2 m6, m7, 1
     BACKUP_POINTERS
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     RESTORE_AND_INC_POINTERS
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 
 cglobal pixel_satd_16x32, 4,6,8
     SATD_START_SSE2 m6, m7, 1
     BACKUP_POINTERS
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     RESTORE_AND_INC_POINTERS
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 
 cglobal pixel_satd_16x64, 4,6,8
     SATD_START_SSE2 m6, m7, 1
     BACKUP_POINTERS
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     RESTORE_AND_INC_POINTERS
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
 %endif
 
 %if WIN64
@@ -2444,37 +2469,20 @@
     SATD_START_SSE2 m6, m7
     mov r6, r0
     mov [rsp], r2
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-%if HIGH_BIT_DEPTH
-    pxor       m7, m7
-%endif
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 8*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 8*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_ACCUM m6, m0, m7
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     lea r0, [r6 + 16*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 16*SIZEOF_PIXEL
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6, m7
-%endif    ;WIN64
-
-cglobal pixel_satd_8x32, 4,6,8
-    SATD_START_SSE2 m6, m7
-%if vertical
-    mova m7, [pw_00ff]
-%endif
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
@@ -2486,9 +2494,15 @@
     paddd   m6, m7
     movd   eax, m6
     RET
-
-cglobal pixel_satd_8x16, 4,6,8
+%endif    ;WIN64
+
+cglobal pixel_satd_8x32, 4,6,8
     SATD_START_SSE2 m6, m7
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     pxor    m7, m7
@@ -2499,9 +2513,10 @@
     movd   eax, m6
     RET
 
-cglobal pixel_satd_8x8, 4,6,8
+cglobal pixel_satd_8x16, 4,6,8
     SATD_START_SSE2 m6, m7
     call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
     pxor    m7, m7
     movhlps m7, m6
     paddd   m6, m7
@@ -2510,9 +2525,20 @@
     movd   eax, m6
     RET
 
+cglobal pixel_satd_8x8, 4,6,8
+    SATD_START_SSE2 m6, m7
+    call pixel_satd_8x8_internal2
+    pxor    m7, m7
+    movhlps m7, m6
+    paddd   m6, m7
+    pshufd  m7, m6, 1
+    paddd   m6, m7
+    movd   eax, m6
+    RET
+
 cglobal pixel_satd_8x4, 4,6,8
     SATD_START_SSE2 m6, m7
-    call %%pixel_satd_8x4_internal
+    call %%pixel_satd_8x4_internal2
     SATD_END_SSE2 m6
 %endmacro ; SATDS_SSE2
 


More information about the x265-devel mailing list