[x265] [PATCH] asm: modified satd and sad asm functions in 16bpp to avoid overflow
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Wed Feb 5 10:33:21 CET 2014
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1391592757 -19800
# Wed Feb 05 15:02:37 2014 +0530
# Node ID b14a8528c478bf3068ed95aeef68c050014785cd
# Parent 1374f1168c5cbb97a893172e37bd9f5c6ed5690c
asm: modified satd and sad asm functions in 16bpp to avoid overflow
diff -r 1374f1168c5c -r b14a8528c478 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Feb 05 12:59:57 2014 +0530
+++ b/source/common/x86/pixel-a.asm Wed Feb 05 15:02:37 2014 +0530
@@ -511,7 +511,7 @@
%endif
%endmacro
-%macro SATD_4x8_SSE 3
+%macro SATD_4x8_SSE 3-4
%if HIGH_BIT_DEPTH
movh m0, [r0+0*r1]
movh m4, [r2+0*r3]
@@ -577,7 +577,11 @@
DIFFOP 2, 6, 3, 5, 7
%endif
%endif ; HIGH_BIT_DEPTH
+%if %0 == 4
+ SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
+%else
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
+%endif
%endmacro
;-----------------------------------------------------------------------------
@@ -2391,56 +2395,66 @@
SATD_START_MMX
mov r6, r0
mov r7, r2
+ pxor m7, m7
%if vertical==0
mova m7, [hmul_4p]
%endif
- SATD_4x8_SSE vertical, 0, swap
+ SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 4*SIZEOF_PIXEL]
lea r2, [r7 + 4*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
- HADDW m7, m1
- movd eax, m7
+ SATD_4x8_SSE vertical, 1, 4, 5
+ pxor m1, m1
+ movhlps m1, m7
+ paddd m7, m1
+ pshufd m1, m7, 1
+ paddd m7, m1
+ movd eax, m7
RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
SATD_START_MMX
mov r6, r0
mov [rsp], r2
+ pxor m7, m7
%if vertical==0
mova m7, [hmul_4p]
%endif
- SATD_4x8_SSE vertical, 0, swap
+ SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 4*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 4*SIZEOF_PIXEL
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
- HADDW m7, m1
- movd eax, m7
+ SATD_4x8_SSE vertical, 1, 4, 5
+ pxor m1, m1
+ movhlps m1, m7
+ paddd m7, m1
+ pshufd m1, m7, 1
+ paddd m7, m1
+ movd eax, m7
RET
%endif
diff -r 1374f1168c5c -r b14a8528c478 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Wed Feb 05 12:59:57 2014 +0530
+++ b/source/common/x86/sad16-a.asm Wed Feb 05 15:02:37 2014 +0530
@@ -274,9 +274,10 @@
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
ABSW2 m3, m4, m3, m4, m7, m5
- paddd m1, m2
- paddd m3, m4
- paddd m0, m1
+ paddw m1, m2
+ paddw m3, m4
+ paddw m3, m1
+ pmaddwd m3, [pw_1]
paddd m0, m3
%else
movu m1, [r2]
@@ -286,8 +287,9 @@
ABSW2 m1, m2, m1, m2, m3, m4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- paddw m0, m1
- paddw m0, m2
+ paddw m2, m1
+ pmaddwd m2, [pw_1]
+ paddd m0, m2
%endif
%endmacro
@@ -308,7 +310,7 @@
jg .loop
%endif
- HADDW m0, m1
+ HADDD m0, m1
movd eax, xm0
RET
%endmacro
More information about the x265-devel mailing list