[x265] [PATCH] asm: modified some of 8bpp satd routines to support 16bpp

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Mon Dec 2 09:23:50 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1385972529 -19800
#      Mon Dec 02 13:52:09 2013 +0530
# Node ID 78d667f42623d0aace99a2774337fdc86c9b540e
# Parent  df0b4f81609e611989c5b1743e7729adeb51cb01
asm: modified some of 8bpp satd routines to support 16bpp

diff -r df0b4f81609e -r 78d667f42623 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Dec 02 12:19:34 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Dec 02 13:52:09 2013 +0530
@@ -495,6 +495,9 @@
         p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
         p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
         p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
+        p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
+        p.satd[LUMA_24x32] = x265_pixel_satd_24x32_sse2;
+        p.satd[LUMA_48x64] = x265_pixel_satd_48x64_sse2;
 
         p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
         p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
diff -r df0b4f81609e -r 78d667f42623 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Mon Dec 02 12:19:34 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Mon Dec 02 13:52:09 2013 +0530
@@ -4100,48 +4100,48 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    lea r0, [r6 + 8]
-    lea r2, [r7 + 8]
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    lea r0, [r6 + 16]
-    lea r2, [r7 + 16]
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    lea r0, [r6 + 24]
-    lea r2, [r7 + 24]
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    lea r0, [r6 + 32]
-    lea r2, [r7 + 32]
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    lea r0, [r6 + 40]
-    lea r2, [r7 + 40]
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
+    lea r2, [r7 + 16*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24*SIZEOF_PIXEL]
+    lea r2, [r7 + 24*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 32*SIZEOF_PIXEL]
+    lea r2, [r7 + 32*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 40*SIZEOF_PIXEL]
+    lea r2, [r7 + 40*SIZEOF_PIXEL]
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
@@ -4170,53 +4170,53 @@
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
-    lea r0, [r6 + 8]
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
     mov r2, [rsp]
-    add r2,8
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    lea r0, [r6 + 16]
+    add r2,8*SIZEOF_PIXEL
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
     mov r2, [rsp]
-    add r2,16
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    lea r0, [r6 + 24]
+    add r2,16*SIZEOF_PIXEL
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 24*SIZEOF_PIXEL]
     mov r2, [rsp]
-    add r2,24
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    lea r0, [r6 + 32]
+    add r2,24*SIZEOF_PIXEL
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 32*SIZEOF_PIXEL]
     mov r2, [rsp]
-    add r2,32
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    call pixel_satd_8x8_internal2
-    lea r0, [r6 + 40]
+    add r2,32*SIZEOF_PIXEL
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    call pixel_satd_8x8_internal2
+    lea r0, [r6 + 40*SIZEOF_PIXEL]
     mov r2, [rsp]
-    add r2,40
+    add r2,40*SIZEOF_PIXEL
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
     call pixel_satd_8x8_internal2
@@ -4876,14 +4876,14 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, add
-    lea r0, [r6 + 4]
-    lea r2, [r7 + 4]
+    lea r0, [r6 + 4*SIZEOF_PIXEL]
+    lea r2, [r7 + 4*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, add
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, add
-    lea r0, [r6 + 8]
-    lea r2, [r7 + 8]
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, add
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
@@ -4903,16 +4903,16 @@
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, add
-    lea r0, [r6 + 4]
+    lea r0, [r6 + 4*SIZEOF_PIXEL]
     mov r2, [rsp]
-    add r2, 4
+    add r2, 4*SIZEOF_PIXEL
     SATD_4x8_SSE vertical, 1, add
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE vertical, 1, add
-    lea r0, [r6 + 8]
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
     mov r2, [rsp]
-    add r2, 8
+    add r2, 8*SIZEOF_PIXEL
     SATD_4x8_SSE vertical, 1, add
     lea r0, [r0 + r1*2*SIZEOF_PIXEL]
     lea r2, [r2 + r3*2*SIZEOF_PIXEL]
@@ -4931,19 +4931,21 @@
     call pixel_satd_8x8_internal
     call pixel_satd_8x8_internal
     call pixel_satd_8x8_internal
-    lea r0, [r6 + 8]
-    lea r2, [r7 + 8]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    lea r0, [r6 + 16]
-    lea r2, [r7 + 16]
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
+    SATD_ACCUM m6, m0, m7
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_ACCUM m6, m0, m7
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
+    lea r2, [r7 + 16*SIZEOF_PIXEL]
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_END_SSE2 m6, m7
 %else
 cglobal pixel_satd_24x32, 4,7,8,0-4
     SATD_START_SSE2 m6, m7
@@ -4953,21 +4955,24 @@
     call pixel_satd_8x8_internal
     call pixel_satd_8x8_internal
     call pixel_satd_8x8_internal
-    lea r0, [r6 + 8]
+    pxor       m7, m7
+    SATD_ACCUM m6, m0, m7
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
     mov r2, [rsp]
-    add r2, 8
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    lea r0, [r6 + 16]
+    add r2, 8*SIZEOF_PIXEL
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_ACCUM m6, m0, m7
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
     mov r2, [rsp]
-    add r2, 16
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
+    add r2, 16*SIZEOF_PIXEL
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    call pixel_satd_8x8_internal
+    SATD_END_SSE2 m6, m7
 %endif    ;WIN64
 
 cglobal pixel_satd_8x32, 4,6,8


More information about the x265-devel mailing list