[x265] [PATCH] asm: modified satd and sad asm functions in 16bpp to avoid overflow
yuvaraj at multicorewareinc.com
Thu Feb 6 07:59:39 CET 2014
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1391669912 -19800
# Thu Feb 06 12:28:32 2014 +0530
# Node ID bbc13f3fa80fcf13b005829e33efd518e549edcd
# Parent 634bc0b1c24653dd254df77cd80f96f81e71e888
asm: modified satd and sad asm functions in 16bpp to avoid overflow
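At HIGH_BIT_DEPTH the samples are 16-bit words, and the SSE2 SATD/SAD routines previously dispatched for these block sizes keep their running sums in 16-bit word lanes. With 10-bit input a single absolute difference can be as large as 1023, so the word accumulators wrap on the larger partitions; the fix is to fold partial sums into 32-bit dwords before they can overflow. A rough C++ reference of the safe accumulation (illustrative only; sad_ref is not a function in the tree):

// Rough illustration (not part of the patch): with 10-bit samples the
// per-pixel absolute difference can be up to 1023, so a 32x32 SAD can
// reach 1023 * 1024 = 1,047,552, far beyond the 65,535 a 16-bit word can
// hold. Accumulating in 32 bits cannot overflow for any HEVC block size.
#include <cstdint>
#include <cstdlib>

static uint32_t sad_ref(const uint16_t *pix1, intptr_t stride1,
                        const uint16_t *pix2, intptr_t stride2,
                        int width, int height)
{
    uint32_t sum = 0;                      // 32-bit accumulator: safe
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
            sum += (uint32_t)abs(pix1[x] - pix2[x]);
        pix1 += stride1;
        pix2 += stride2;
    }
    return sum;
}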
diff -r 634bc0b1c246 -r bbc13f3fa80f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Feb 05 23:10:22 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Thu Feb 06 12:28:32 2014 +0530
@@ -638,16 +638,6 @@
INIT6(satd, _sse2);
HEVC_SATD(sse2);
p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
- p.satd[LUMA_4x16] = x265_pixel_satd_4x16_sse2;
- p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
- p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
- p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
- p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
- p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
- p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
- p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse2;
- p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
- p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
SA8D_INTER_FROM_BLOCK(sse2);
@@ -692,27 +682,6 @@
PIXEL_AVG_W4(mmx2);
LUMA_VAR(_sse2);
- INIT8(sad, _mmx2);
- p.sad[LUMA_8x32] = x265_pixel_sad_8x32_sse2;
- p.sad[LUMA_16x4] = x265_pixel_sad_16x4_sse2;
- p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
- p.sad[LUMA_16x32] = x265_pixel_sad_16x32_sse2;
-
- p.sad[LUMA_32x8] = x265_pixel_sad_32x8_sse2;
- p.sad[LUMA_32x16] = x265_pixel_sad_32x16_sse2;
- p.sad[LUMA_32x24] = x265_pixel_sad_32x24_sse2;
- p.sad[LUMA_32x32] = x265_pixel_sad_32x32_sse2;
- p.sad[LUMA_32x64] = x265_pixel_sad_32x64_sse2;
-
- p.sad[LUMA_64x16] = x265_pixel_sad_64x16_sse2;
- p.sad[LUMA_64x32] = x265_pixel_sad_64x32_sse2;
- p.sad[LUMA_64x48] = x265_pixel_sad_64x48_sse2;
- p.sad[LUMA_64x64] = x265_pixel_sad_64x64_sse2;
-
- p.sad[LUMA_48x64] = x265_pixel_sad_48x64_sse2;
- p.sad[LUMA_24x32] = x265_pixel_sad_24x32_sse2;
- p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
-
SAD_X3(sse2);
p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2;
p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2;
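Removing these table entries means the affected partition sizes fall back to whatever was installed before the SSE2 overrides, normally the C reference primitives, so the word-accumulating SSE2 versions are simply no longer dispatched at 16bpp. A minimal sketch of that dispatch pattern, with hypothetical names standing in for the real x265 setup code and symbols:

// Sketch of the dispatch idea (illustrative names only): the C setup fills
// every slot first and the asm setup only overrides slots whose routine is
// safe at the current bit depth, so deleting an override leaves the C
// fallback in place.
#include <cstdint>
#include <cstddef>

typedef uint32_t (*sad_fn)(const uint16_t*, intptr_t, const uint16_t*, intptr_t);

static uint32_t sad_c(const uint16_t*, intptr_t, const uint16_t*, intptr_t)
{ return 0; /* stand-in for the C reference implementation */ }

static uint32_t sad_asm(const uint16_t*, intptr_t, const uint16_t*, intptr_t)
{ return 0; /* stand-in for an asm routine known to be overflow-safe */ }

struct Primitives { sad_fn sad[25]; };   // one entry per LUMA_* partition

static void setupPrimitives(Primitives &p)
{
    for (size_t i = 0; i < 25; i++)
        p.sad[i] = sad_c;                // safe defaults everywhere
#if !HIGH_BIT_DEPTH
    p.sad[3] = sad_asm;                  // override only where it cannot overflow
#endif
}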
diff -r 634bc0b1c246 -r bbc13f3fa80f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Feb 05 23:10:22 2014 -0600
+++ b/source/common/x86/pixel-a.asm Thu Feb 06 12:28:32 2014 +0530
@@ -511,7 +511,7 @@
%endif
%endmacro
-%macro SATD_4x8_SSE 3
+%macro SATD_4x8_SSE 3-4
%if HIGH_BIT_DEPTH
movh m0, [r0+0*r1]
movh m4, [r2+0*r3]
@@ -577,7 +577,11 @@
DIFFOP 2, 6, 3, 5, 7
%endif
%endif ; HIGH_BIT_DEPTH
+%if %0 == 4
+ SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
+%else
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
+%endif
%endmacro
;-----------------------------------------------------------------------------
@@ -2386,63 +2390,127 @@
RET
%endif
+%if HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x16, 4,8,8
SATD_START_MMX
mov r6, r0
mov r7, r2
-%if vertical==0
- mova m7, [hmul_4p]
-%endif
- SATD_4x8_SSE vertical, 0, swap
+ pxor m7, m7
+ SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 4*SIZEOF_PIXEL]
lea r2, [r7 + 4*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
- HADDW m7, m1
- movd eax, m7
+ SATD_4x8_SSE vertical, 1, 4, 5
+ pxor m1, m1
+ movhlps m1, m7
+ paddd m7, m1
+ pshufd m1, m7, 1
+ paddd m7, m1
+ movd eax, m7
RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
SATD_START_MMX
mov r6, r0
mov [rsp], r2
-%if vertical==0
- mova m7, [hmul_4p]
-%endif
- SATD_4x8_SSE vertical, 0, swap
+ pxor m7, m7
+ SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 4*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 4*SIZEOF_PIXEL
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
- SATD_4x8_SSE vertical, 1, add
+ SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE vertical, 1, add
- HADDW m7, m1
- movd eax, m7
+ SATD_4x8_SSE vertical, 1, 4, 5
+ pxor m1, m1
+ movhlps m1, m7
+ paddd m7, m1
+ pshufd m1, m7, 1
+ paddd m7, m1
+ movd eax, m7
RET
%endif
+%else ;HIGH_BIT_DEPTH
+%if WIN64
+cglobal pixel_satd_12x16, 4,8,8
+ SATD_START_MMX
+ mov r6, r0
+ mov r7, r2
+%if vertical==0
+ mova m7, [hmul_4p]
+%endif
+ SATD_4x8_SSE vertical, 0, swap
+ lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+ lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+ SATD_4x8_SSE vertical, 1, add
+ lea r0, [r6 + 4*SIZEOF_PIXEL]
+ lea r2, [r7 + 4*SIZEOF_PIXEL]
+ SATD_4x8_SSE vertical, 1, add
+ lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+ lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+ SATD_4x8_SSE vertical, 1, add
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ SATD_4x8_SSE vertical, 1, add
+ lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+ lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+ SATD_4x8_SSE vertical, 1, add
+ HADDW m7, m1
+ movd eax, m7
+ RET
+%else
+cglobal pixel_satd_12x16, 4,7,8,0-gprsize
+ SATD_START_MMX
+ mov r6, r0
+ mov [rsp], r2
+%if vertical==0
+ mova m7, [hmul_4p]
+%endif
+ SATD_4x8_SSE vertical, 0, swap
+ lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+ lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+ SATD_4x8_SSE vertical, 1, add
+ lea r0, [r6 + 4*SIZEOF_PIXEL]
+ mov r2, [rsp]
+ add r2, 4*SIZEOF_PIXEL
+ SATD_4x8_SSE vertical, 1, add
+ lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+ lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+ SATD_4x8_SSE vertical, 1, add
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ mov r2, [rsp]
+ add r2, 8*SIZEOF_PIXEL
+ SATD_4x8_SSE vertical, 1, add
+ lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+ lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+ SATD_4x8_SSE vertical, 1, add
+ HADDW m7, m1
+ movd eax, m7
+ RET
+%endif
+%endif
%if WIN64
cglobal pixel_satd_24x32, 4,8,8
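For HIGH_BIT_DEPTH the 12x16 SATD now passes two extra arguments to SATD_4x8_SSE, which makes the macro expand to SATD_8x4_1_SSE and accumulate the transform sums as dwords in m7 (zeroed with pxor up front) instead of words; the final HADDW becomes an explicit dword reduction (movhlps/paddd/pshufd/paddd). Roughly what that epilogue does, written with SSE2 intrinsics for clarity (hadd_u32 is an illustrative name, not code from the tree):

// Horizontal sum of four 32-bit partial sums, mirroring the new epilogue.
#include <emmintrin.h>
#include <cstdint>

static uint32_t hadd_u32(__m128i acc)           // acc = 4 x 32-bit partial sums
{
    __m128i hi = _mm_unpackhi_epi64(acc, acc);  // movhlps-style: bring down upper 64 bits
    acc = _mm_add_epi32(acc, hi);               // paddd
    hi  = _mm_shuffle_epi32(acc, 1);            // pshufd ..., 1
    acc = _mm_add_epi32(acc, hi);               // paddd
    return (uint32_t)_mm_cvtsi128_si32(acc);    // movd eax, xmm
}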
diff -r 634bc0b1c246 -r bbc13f3fa80f source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Wed Feb 05 23:10:22 2014 -0600
+++ b/source/common/x86/sad16-a.asm Thu Feb 06 12:28:32 2014 +0530
@@ -274,9 +274,10 @@
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
ABSW2 m3, m4, m3, m4, m7, m5
- paddd m1, m2
- paddd m3, m4
- paddd m0, m1
+ paddw m1, m2
+ paddw m3, m4
+ paddw m3, m1
+ pmaddwd m3, [pw_1]
paddd m0, m3
%else
movu m1, [r2]
@@ -286,8 +287,9 @@
ABSW2 m1, m2, m1, m2, m3, m4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- paddw m0, m1
- paddw m0, m2
+ paddw m2, m1
+ pmaddwd m2, [pw_1]
+ paddd m0, m2
%endif
%endmacro
@@ -308,7 +310,7 @@
jg .loop
%endif
- HADDW m0, m1
+ HADDD m0, m1
movd eax, xm0
RET
%endmacro
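In the 16bpp SAD loop the per-iteration word sums are still formed with paddw, but they are now widened to dwords with pmaddwd against pw_1 (multiply each word by 1 and add adjacent pairs) before being added into the 32-bit accumulator m0, and the final reduction changes from HADDW to HADDD. Each word lane only ever holds a few rows' worth of absolute differences before it is widened, which stays well within 16-bit range even for 10-bit input. The widening step in SSE2 intrinsics, with an illustrative function name:

// Widen 8 word partial sums to 4 dwords and fold them into the 32-bit
// running total, as the reworked loop body does with pmaddwd/paddd.
#include <emmintrin.h>

static __m128i widen_and_accumulate(__m128i acc_d, __m128i rowsum_w)
{
    const __m128i one = _mm_set1_epi16(1);      // the pw_1 constant
    __m128i d = _mm_madd_epi16(rowsum_w, one);  // pmaddwd: 8 words -> 4 dwords
    return _mm_add_epi32(acc_d, d);             // paddd into the dword accumulator
}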