[x265] [PATCH] asm: 16bpp support for sa8d - 24x32 and 48x64

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Wed Dec 4 08:38:21 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386142683 -19800
#      Wed Dec 04 13:08:03 2013 +0530
# Node ID 546523046d990119dc910b87ebe3f4c8ab25f236
# Parent  6a41cb559feb98056d30482651f5a83f5e326300
asm: 16bpp support for sa8d - 24x32 and 48x64

diff -r 6a41cb559feb -r 546523046d99 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 04 12:39:42 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Dec 04 13:08:03 2013 +0530
@@ -516,6 +516,8 @@
         p.sa8d_inter[LUMA_16x12]  = x265_pixel_satd_16x12_sse2;
         p.sa8d_inter[LUMA_16x32]  = x265_pixel_sa8d_16x32_sse2;
         p.sa8d_inter[LUMA_16x64]  = x265_pixel_sa8d_16x64_sse2;
+        p.sa8d_inter[LUMA_24x32]  = x265_pixel_sa8d_24x32_sse2;
+        p.sa8d_inter[LUMA_48x64]  = x265_pixel_sa8d_48x64_sse2;
         p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
         p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
 
diff -r 6a41cb559feb -r 546523046d99 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Dec 04 12:39:42 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Wed Dec 04 13:08:03 2013 +0530
@@ -2683,38 +2683,38 @@
     mova m7, [hmul_8p]
 %endif
     SA8D_8x8
-    add r0, 8
-    add r2, 8
+    add r0, 8*SIZEOF_PIXEL
+    add r2, 8*SIZEOF_PIXEL
     SA8D_8x8
-    add r0, 8
-    add r2, 8
+    add r0, 8*SIZEOF_PIXEL
+    add r2, 8*SIZEOF_PIXEL
     SA8D_8x8
     lea r0, [r0 + r1*8]
     lea r2, [r2 + r3*8]
     SA8D_8x8
-    sub r0, 8
-    sub r2, 8
+    sub r0, 8*SIZEOF_PIXEL
+    sub r2, 8*SIZEOF_PIXEL
     SA8D_8x8
-    sub r0, 8
-    sub r2, 8
+    sub r0, 8*SIZEOF_PIXEL
+    sub r2, 8*SIZEOF_PIXEL
     SA8D_8x8
     lea r0, [r0 + r1*8]
     lea r2, [r2 + r3*8]
     SA8D_8x8
-    add r0, 8
-    add r2, 8
+    add r0, 8*SIZEOF_PIXEL
+    add r2, 8*SIZEOF_PIXEL
     SA8D_8x8
-    add r0, 8
-    add r2, 8
+    add r0, 8*SIZEOF_PIXEL
+    add r2, 8*SIZEOF_PIXEL
     SA8D_8x8
     lea r0, [r0 + r1*8]
     lea r2, [r2 + r3*8]
     SA8D_8x8
-    sub r0, 8
-    sub r2, 8
+    sub r0, 8*SIZEOF_PIXEL
+    sub r2, 8*SIZEOF_PIXEL
     SA8D_8x8
-    sub r0, 8
-    sub r2, 8
+    sub r0, 8*SIZEOF_PIXEL
+    sub r2, 8*SIZEOF_PIXEL
     SA8D_8x8
     movd eax, m12
     RET
@@ -2909,8 +2909,8 @@
     lea  r5, [8*r3]
     sub  r2, r4
     sub  r0, r5
-    add  r2, 16
-    add  r0, 16
+    add  r2, 16*SIZEOF_PIXEL
+    add  r0, 16*SIZEOF_PIXEL
     lea  r4, [3*r1]
     lea  r5, [3*r3]
     SA8D_16x16
@@ -2918,8 +2918,8 @@
     lea  r5, [8*r3]
     sub  r2, r4
     sub  r0, r5
-    add  r2, 16
-    add  r0, 16
+    add  r2, 16*SIZEOF_PIXEL
+    add  r0, 16*SIZEOF_PIXEL
     lea  r4, [3*r1]
     lea  r5, [3*r3]
     SA8D_16x16
@@ -2930,8 +2930,8 @@
     lea  r5, [8*r3]
     sub  r2, r4
     sub  r0, r5
-    sub  r2, 16
-    sub  r0, 16
+    sub  r2, 16*SIZEOF_PIXEL
+    sub  r0, 16*SIZEOF_PIXEL
     lea  r4, [3*r1]
     lea  r5, [3*r3]
     SA8D_16x16
@@ -2939,8 +2939,8 @@
     lea  r5, [8*r3]
     sub  r2, r4
     sub  r0, r5
-    sub  r2, 16
-    sub  r0, 16
+    sub  r2, 16*SIZEOF_PIXEL
+    sub  r0, 16*SIZEOF_PIXEL
     lea  r4, [3*r1]
     lea  r5, [3*r3]
     SA8D_16x16
@@ -2951,8 +2951,8 @@
     lea  r5, [8*r3]
     sub  r2, r4
     sub  r0, r5
-    add  r2, 16
-    add  r0, 16
+    add  r2, 16*SIZEOF_PIXEL
+    add  r0, 16*SIZEOF_PIXEL
     lea  r4, [3*r1]
     lea  r5, [3*r3]
     SA8D_16x16
@@ -2960,8 +2960,8 @@
     lea  r5, [8*r3]
     sub  r2, r4
     sub  r0, r5
-    add  r2, 16
-    add  r0, 16
+    add  r2, 16*SIZEOF_PIXEL
+    add  r0, 16*SIZEOF_PIXEL
     lea  r4, [3*r1]
     lea  r5, [3*r3]
     SA8D_16x16
@@ -2972,8 +2972,8 @@
     lea  r5, [8*r3]
     sub  r2, r4
     sub  r0, r5
-    sub  r2, 16
-    sub  r0, 16
+    sub  r2, 16*SIZEOF_PIXEL
+    sub  r0, 16*SIZEOF_PIXEL
     lea  r4, [3*r1]
     lea  r5, [3*r3]
     SA8D_16x16
@@ -2981,8 +2981,8 @@
     lea  r5, [8*r3]
     sub  r2, r4
     sub  r0, r5
-    sub  r2, 16
-    sub  r0, 16
+    sub  r2, 16*SIZEOF_PIXEL
+    sub  r0, 16*SIZEOF_PIXEL
     lea  r4, [3*r1]
     lea  r5, [3*r3]
     SA8D_16x16
@@ -4577,6 +4577,9 @@
     lea  r4, [r1 + 2*r1]
     lea  r5, [r3 + 2*r3]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [rsp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4590,8 +4593,10 @@
     SA8D_INTER
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
-    paddusw m0, [esp+48]
+    SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
     HADDUW m0, m1
+%endif
     movd r4d, m0
     add  r4d, 1
     shr  r4d, 1
@@ -4603,6 +4608,9 @@
     add  r2, 16*SIZEOF_PIXEL
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4624,6 +4632,9 @@
     add  r2, 32*SIZEOF_PIXEL
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4650,6 +4661,9 @@
 
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4671,6 +4685,9 @@
     add  r2, 16*SIZEOF_PIXEL
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4692,6 +4709,9 @@
     add  r2, 32*SIZEOF_PIXEL
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4718,6 +4738,9 @@
 
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4739,6 +4762,9 @@
     add  r2, 16*SIZEOF_PIXEL
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4760,6 +4786,9 @@
     add  r2, 32*SIZEOF_PIXEL
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4786,6 +4815,9 @@
 
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4807,6 +4839,9 @@
     add  r2, 16*SIZEOF_PIXEL
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4828,6 +4863,9 @@
     add  r2, 32*SIZEOF_PIXEL
     lea  r4, [r1 + 2*r1]
     call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+    HADDUW m0, m1
+%endif
     mova [esp+48], m0
     call pixel_sa8d_8x8_internal2
     SA8D_INTER
@@ -4841,8 +4879,10 @@
     SA8D_INTER
     mova [esp+64-mmsize], m0
     call pixel_sa8d_8x8_internal2
-    paddusw m0, [esp+48]
+    SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
     HADDUW m0, m1
+%endif
     movd r4d, m0
     add  r4d, 1
     shr  r4d, 1


More information about the x265-devel mailing list