[x265] [PATCH] asm: modified satd and sad asm functions in 16bpp to avoid overflow

yuvaraj at multicorewareinc.com
Thu Feb 6 07:59:39 CET 2014


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1391669912 -19800
#      Thu Feb 06 12:28:32 2014 +0530
# Node ID bbc13f3fa80fcf13b005829e33efd518e549edcd
# Parent  634bc0b1c24653dd254df77cd80f96f81e71e888
asm: modified satd and sad asm functions in 16bpp to avoid overflow
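In HIGH_BIT_DEPTH (16bpp) builds pixels are stored as 16-bit values holding 10/12-bit content, so the sums of absolute differences for the larger partitions no longer fit in 16-bit lanes. This patch drops the overflow-prone SSE2 satd/sad bindings for those block sizes and widens the accumulation to 32 bits: the per-row 16-bit absolute differences are still combined with paddw, then folded into 32-bit lanes with pmaddwd against pw_1 and accumulated with paddd, and the final horizontal reduction changes from HADDW to HADDD. A minimal scalar sketch of the failure mode and of the fix (the C below is illustrative only, not code from the tree):

    /* Illustrative only: why a 16-bit SAD accumulator wraps at 10-bit
     * depth, and how 32-bit accumulation (what pmaddwd with [pw_1] plus
     * HADDD achieve in the SIMD code) avoids it. */
    #include <stdint.h>
    #include <stdlib.h>

    /* 64x64 block of 10-bit pixels: worst-case |a-b| is 1023, so the true
     * SAD can reach 64*64*1023 = 4,190,208, far above UINT16_MAX. */
    static uint32_t sad_16bit_acc(const uint16_t *a, const uint16_t *b, int n)
    {
        uint16_t acc = 0;               /* wraps for large high-bit-depth blocks */
        for (int i = 0; i < n; i++)
            acc += (uint16_t)abs(a[i] - b[i]);
        return acc;                     /* wrong for 10bpp 64x64 */
    }

    static uint32_t sad_32bit_acc(const uint16_t *a, const uint16_t *b, int n)
    {
        uint32_t acc = 0;               /* 32-bit accumulator cannot wrap here */
        for (int i = 0; i < n; i++)
            acc += (uint32_t)abs(a[i] - b[i]);
        return acc;
    }

The 12x16 satd path gets the same treatment: the new 4-argument form of SATD_4x8_SSE routes through SATD_8x4_1_SSE so the running sums are kept in 32-bit lanes of m7, and the closing movhlps/pshufd/paddd sequence replaces HADDW.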

diff -r 634bc0b1c246 -r bbc13f3fa80f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 05 23:10:22 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp	Thu Feb 06 12:28:32 2014 +0530
@@ -638,16 +638,6 @@
         INIT6(satd, _sse2);
         HEVC_SATD(sse2);
         p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
-        p.satd[LUMA_4x16] = x265_pixel_satd_4x16_sse2;
-        p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
-        p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
-        p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
-        p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
-        p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
-        p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
-        p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse2;
-        p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
-        p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
 
         p.sa8d_inter[LUMA_4x4]  = x265_pixel_satd_4x4_mmx2;
         SA8D_INTER_FROM_BLOCK(sse2);
@@ -692,27 +682,6 @@
         PIXEL_AVG_W4(mmx2);
         LUMA_VAR(_sse2);
 
-        INIT8(sad, _mmx2);
-        p.sad[LUMA_8x32]  = x265_pixel_sad_8x32_sse2;
-        p.sad[LUMA_16x4]  = x265_pixel_sad_16x4_sse2;
-        p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
-        p.sad[LUMA_16x32] = x265_pixel_sad_16x32_sse2;
-
-        p.sad[LUMA_32x8]  = x265_pixel_sad_32x8_sse2;
-        p.sad[LUMA_32x16] = x265_pixel_sad_32x16_sse2;
-        p.sad[LUMA_32x24] = x265_pixel_sad_32x24_sse2;
-        p.sad[LUMA_32x32] = x265_pixel_sad_32x32_sse2;
-        p.sad[LUMA_32x64] = x265_pixel_sad_32x64_sse2;
-
-        p.sad[LUMA_64x16] = x265_pixel_sad_64x16_sse2;
-        p.sad[LUMA_64x32] = x265_pixel_sad_64x32_sse2;
-        p.sad[LUMA_64x48] = x265_pixel_sad_64x48_sse2;
-        p.sad[LUMA_64x64] = x265_pixel_sad_64x64_sse2;
-
-        p.sad[LUMA_48x64] = x265_pixel_sad_48x64_sse2;
-        p.sad[LUMA_24x32] = x265_pixel_sad_24x32_sse2;
-        p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
-
         SAD_X3(sse2);
         p.sad_x3[LUMA_4x4] = x265_pixel_sad_x3_4x4_mmx2;
         p.sad_x3[LUMA_4x8] = x265_pixel_sad_x3_4x8_mmx2;
diff -r 634bc0b1c246 -r bbc13f3fa80f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Feb 05 23:10:22 2014 -0600
+++ b/source/common/x86/pixel-a.asm	Thu Feb 06 12:28:32 2014 +0530
@@ -511,7 +511,7 @@
 %endif
 %endmacro
 
-%macro SATD_4x8_SSE 3
+%macro SATD_4x8_SSE 3-4
 %if HIGH_BIT_DEPTH
     movh    m0, [r0+0*r1]
     movh    m4, [r2+0*r3]
@@ -577,7 +577,11 @@
     DIFFOP 2, 6, 3, 5, 7
 %endif
 %endif ; HIGH_BIT_DEPTH
+%if %0 == 4
+    SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
+%else
     SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
+%endif
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -2386,63 +2390,127 @@
     RET
 %endif
 
+%if HIGH_BIT_DEPTH
 %if WIN64
 cglobal pixel_satd_12x16, 4,8,8
     SATD_START_MMX
     mov r6, r0
     mov r7, r2
-%if vertical==0
-    mova m7, [hmul_4p]
-%endif
-    SATD_4x8_SSE vertical, 0, swap
+    pxor m7, m7
+    SATD_4x8_SSE vertical, 0, 4, 5
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
-    SATD_4x8_SSE vertical, 1, add
+    SATD_4x8_SSE vertical, 1, 4, 5
     lea r0, [r6 + 4*SIZEOF_PIXEL]
     lea r2, [r7 + 4*SIZEOF_PIXEL]
-    SATD_4x8_SSE vertical, 1, add
+    SATD_4x8_SSE vertical, 1, 4, 5
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
-    SATD_4x8_SSE vertical, 1, add
+    SATD_4x8_SSE vertical, 1, 4, 5
     lea r0, [r6 + 8*SIZEOF_PIXEL]
     lea r2, [r7 + 8*SIZEOF_PIXEL]
-    SATD_4x8_SSE vertical, 1, add
+    SATD_4x8_SSE vertical, 1, 4, 5
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
-    SATD_4x8_SSE vertical, 1, add
-    HADDW m7, m1
-    movd eax, m7
+    SATD_4x8_SSE vertical, 1, 4, 5
+    pxor    m1, m1
+    movhlps m1, m7
+    paddd   m7, m1
+    pshufd  m1, m7, 1
+    paddd   m7, m1
+    movd   eax, m7
     RET
 %else
 cglobal pixel_satd_12x16, 4,7,8,0-gprsize
     SATD_START_MMX
     mov r6, r0
     mov [rsp], r2
-%if vertical==0
-    mova m7, [hmul_4p]
-%endif
-    SATD_4x8_SSE vertical, 0, swap
+    pxor m7, m7
+    SATD_4x8_SSE vertical, 0, 4, 5
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
-    SATD_4x8_SSE vertical, 1, add
+    SATD_4x8_SSE vertical, 1, 4, 5
     lea r0, [r6 + 4*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 4*SIZEOF_PIXEL
-    SATD_4x8_SSE vertical, 1, add
+    SATD_4x8_SSE vertical, 1, 4, 5
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
-    SATD_4x8_SSE vertical, 1, add
+    SATD_4x8_SSE vertical, 1, 4, 5
     lea r0, [r6 + 8*SIZEOF_PIXEL]
     mov r2, [rsp]
     add r2, 8*SIZEOF_PIXEL
-    SATD_4x8_SSE vertical, 1, add
+    SATD_4x8_SSE vertical, 1, 4, 5
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
-    SATD_4x8_SSE vertical, 1, add
-    HADDW m7, m1
-    movd eax, m7
+    SATD_4x8_SSE vertical, 1, 4, 5
+    pxor    m1, m1
+    movhlps m1, m7
+    paddd   m7, m1
+    pshufd  m1, m7, 1
+    paddd   m7, m1
+    movd   eax, m7
     RET
 %endif
+%else    ;HIGH_BIT_DEPTH
+%if WIN64
+cglobal pixel_satd_12x16, 4,8,8
+    SATD_START_MMX
+    mov r6, r0
+    mov r7, r2
+%if vertical==0
+    mova m7, [hmul_4p]
+%endif
+    SATD_4x8_SSE vertical, 0, swap
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add
+    lea r0, [r6 + 4*SIZEOF_PIXEL]
+    lea r2, [r7 + 4*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add
+    HADDW m7, m1
+    movd eax, m7
+    RET
+%else
+cglobal pixel_satd_12x16, 4,7,8,0-gprsize
+    SATD_START_MMX
+    mov r6, r0
+    mov [rsp], r2
+%if vertical==0
+    mova m7, [hmul_4p]
+%endif
+    SATD_4x8_SSE vertical, 0, swap
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add
+    lea r0, [r6 + 4*SIZEOF_PIXEL]
+    mov r2, [rsp]
+    add r2, 4*SIZEOF_PIXEL
+    SATD_4x8_SSE vertical, 1, add
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    mov r2, [rsp]
+    add r2, 8*SIZEOF_PIXEL
+    SATD_4x8_SSE vertical, 1, add
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add
+    HADDW m7, m1
+    movd eax, m7
+    RET
+%endif
+%endif
 
 %if WIN64
 cglobal pixel_satd_24x32, 4,8,8
diff -r 634bc0b1c246 -r bbc13f3fa80f source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Wed Feb 05 23:10:22 2014 -0600
+++ b/source/common/x86/sad16-a.asm	Thu Feb 06 12:28:32 2014 +0530
@@ -274,9 +274,10 @@
     lea     r0, [r0+4*r1]
     lea     r2, [r2+4*r3]
     ABSW2   m3, m4, m3, m4, m7, m5
-    paddd   m1, m2
-    paddd   m3, m4
-    paddd   m0, m1
+    paddw   m1, m2
+    paddw   m3, m4
+    paddw   m3, m1
+    pmaddwd m3, [pw_1]
     paddd   m0, m3
 %else
     movu    m1, [r2]
@@ -286,8 +287,9 @@
     ABSW2   m1, m2, m1, m2, m3, m4
     lea     r0, [r0+4*r1]
     lea     r2, [r2+4*r3]
-    paddw   m0, m1
-    paddw   m0, m2
+    paddw   m2, m1
+    pmaddwd m2, [pw_1]
+    paddd   m0, m2
 %endif
 %endmacro
 
@@ -308,7 +310,7 @@
     jg .loop
 %endif
 
-    HADDW   m0, m1
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 %endmacro
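For reference, the widening idiom used above in intrinsics form: pmaddwd with a vector of ones turns eight 16-bit partial sums into four overflow-safe 32-bit sums, and HADDD then folds those four lanes into the final result. This is a sketch with illustrative names, not code from the tree:

    /* Sketch only: pmaddwd m, [pw_1] multiplies each 16-bit lane by 1 and
     * adds adjacent pairs, i.e. widens eight 16-bit partial sums to four
     * 32-bit sums. */
    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stdint.h>

    static __m128i widen_sums_16_to_32(__m128i partial16)
    {
        const __m128i ones = _mm_set1_epi16(1);
        return _mm_madd_epi16(partial16, ones);
    }

    /* The HADDD step: horizontally add the four 32-bit lanes to one scalar. */
    static uint32_t hadd_epi32(__m128i v)
    {
        v = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)));
        v = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
        return (uint32_t)_mm_cvtsi128_si32(v);
    }

pmaddwd treats its inputs as signed words, which is safe here because only a few rows of absolute differences (each at most 4095 for 12-bit input) are combined with paddw before widening, so every 16-bit lane stays well below 2^15.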

