[x265] [PATCH] asm: 16bpp asm code for pixel_sa8d_16xN
yuvaraj at multicorewareinc.com
Wed Dec 4 08:10:51 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386140982 -19800
# Wed Dec 04 12:39:42 2013 +0530
# Node ID 6a41cb559feb98056d30482651f5a83f5e326300
# Parent 55c0bf9d99661073a7acdb5749e2625379d8393a
asm: 16bpp asm code for pixel_sa8d_16xN
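For readers coming from the 8bpp code: sa8d is the sum of absolute values of the residual after an 8x8 Hadamard transform, computed per 8x8 block. A minimal C-style sketch of one 8x8 block follows; the helper names (hadamard8, sa8d_8x8_ref), the pixel typedef and the (sum + 2) >> 2 normalization mentioned in the final comment are illustrative assumptions, not code from this patch.

    /* Illustrative reference for one 8x8 sa8d block: Hadamard-transform
     * the residual and sum the absolute coefficients.  A sketch only;
     * names and the pixel typedef are assumptions for a 16bpp build. */
    #include <stdlib.h>
    #include <stdint.h>

    typedef uint16_t pixel;                /* 16bpp: pixels are 16-bit */

    static void hadamard8(int v[8])        /* in-place 1-D length-8 WHT */
    {
        for (int step = 1; step < 8; step <<= 1)
            for (int i = 0; i < 8; i += step << 1)
                for (int j = i; j < i + step; j++)
                {
                    int a = v[j], b = v[j + step];
                    v[j] = a + b;
                    v[j + step] = a - b;
                }
    }

    static int sa8d_8x8_ref(const pixel* pix1, intptr_t stride1,
                            const pixel* pix2, intptr_t stride2)
    {
        int d[8][8];
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                d[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];

        for (int y = 0; y < 8; y++)        /* transform rows */
            hadamard8(d[y]);
        for (int x = 0; x < 8; x++)        /* transform columns */
        {
            int col[8];
            for (int y = 0; y < 8; y++) col[y] = d[y][x];
            hadamard8(col);
            for (int y = 0; y < 8; y++) d[y][x] = col[y];
        }

        int sum = 0;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                sum += abs(d[y][x]);
        return sum;    /* the standalone 8x8 primitive is assumed to
                          return (sum + 2) >> 2 of this raw total */
    }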
diff -r 55c0bf9d9966 -r 6a41cb559feb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 03 14:14:44 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 12:39:42 2013 +0530
@@ -504,6 +504,18 @@
p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
+ p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
+ p.sa8d_inter[LUMA_4x8] = x265_pixel_satd_4x8_sse2;
+ p.sa8d_inter[LUMA_4x16] = x265_pixel_satd_4x16_sse2;
+ p.sa8d_inter[LUMA_8x4] = x265_pixel_satd_8x4_sse2;
+ p.sa8d_inter[LUMA_8x16] = x265_pixel_sa8d_8x16_sse2;
+ p.sa8d_inter[LUMA_8x32] = x265_pixel_sa8d_8x32_sse2;
+ p.sa8d_inter[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
+ p.sa8d_inter[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
+ p.sa8d_inter[LUMA_16x8] = x265_pixel_sa8d_16x8_sse2;
+ p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
+ p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_sse2;
+ p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_sse2;
p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
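Note that the entries whose width or height is not a multiple of 8 (the 4xN, Nx4, 12x16 and 16x12 sizes) are pointed at satd kernels here, since sa8d only operates on whole 8x8 blocks; the 16xN sizes added by this patch get real sa8d kernels. Conceptually each 16xN function just tiles 8x8 sa8d over the block and rounds once at the end, roughly as in the sketch below (it reuses the hypothetical sa8d_8x8_ref from above, and the (sum + 2) >> 2 rounding is an assumption borrowed from the C reference style, not taken from this patch):

    /* Hypothetical composition of a 16xN sa8d from raw 8x8 sums. */
    static int sa8d_16xN_ref(const pixel* pix1, intptr_t stride1,
                             const pixel* pix2, intptr_t stride2, int height)
    {
        int sum = 0;
        for (int y = 0; y < height; y += 8)
            for (int x = 0; x < 16; x += 8)
                sum += sa8d_8x8_ref(pix1 + y * stride1 + x, stride1,
                                    pix2 + y * stride2 + x, stride2);
        return (sum + 2) >> 2;             /* single rounding at the end */
    }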
diff -r 55c0bf9d9966 -r 6a41cb559feb source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Dec 03 14:14:44 2013 -0600
+++ b/source/common/x86/pixel-a.asm Wed Dec 04 12:39:42 2013 +0530
@@ -2501,8 +2501,10 @@
%endmacro
%macro AVG_16x16 0
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
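The paddusw -> SA8D_INTER change makes the accumulation bit-depth aware: at 8bpp it is still a saturating word add, but at high bit depth the running total is kept in 32-bit dwords, which is also why the trailing HADDUW (the word-to-dword reduction) is now restricted to the 8bpp path. A rough intrinsics picture of the two accumulation styles (an illustration of the idea, not the macros themselves; the real HADDUW additionally folds the dwords into a single scalar):

    #include <emmintrin.h>

    /* 8bpp: saturating unsigned word add (paddusw).
     * 16bpp: widen each new partial sum from words to dwords, then paddd,
     * so the running total is not limited to 16 bits. */
    static inline __m128i accumulate_sa8d(__m128i acc, __m128i block,
                                          int highBitDepth)
    {
        if (!highBitDepth)
            return _mm_adds_epu16(acc, block);                   /* paddusw */

        __m128i lo = _mm_and_si128(block, _mm_set1_epi32(0xffff));
        __m128i hi = _mm_srli_epi32(block, 16);
        return _mm_add_epi32(acc, _mm_add_epi32(lo, hi));        /* paddd */
    }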
@@ -2630,8 +2632,8 @@
mova m7, [hmul_8p]
%endif
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
movd eax, m12
RET
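The 8 -> 8*SIZEOF_PIXEL change is the usual high-bit-depth address fix: r0 and r2 are byte pointers, so stepping 8 pixels to the right is 8 bytes at 8bpp but 16 bytes at 16bpp. Spelled out in C (a trivial sketch; x265 defines pixel and SIZEOF_PIXEL itself):

    #include <stdint.h>

    #if HIGH_BIT_DEPTH
    typedef uint16_t pixel;
    #else
    typedef uint8_t  pixel;
    #endif
    #define SIZEOF_PIXEL sizeof(pixel)

    /* byte-pointer equivalent of "add r0, 8*SIZEOF_PIXEL" */
    static const uint8_t* step_right_8_pixels(const uint8_t* plane)
    {
        return plane + 8 * SIZEOF_PIXEL;   /* plain "+ 8" is wrong at 16bpp */
    }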
@@ -3601,6 +3603,9 @@
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [rsp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3614,8 +3619,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
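The same pattern repeats in the remaining 16xN functions: immediately after the first pixel_sa8d_8x8_internal2 call, the high-bit-depth path applies HADDUW so the value saved at [esp+48]/[rsp+48] is already in dwords, and every later block is folded in through SA8D_INTER. The reason the reduction cannot wait until the end is overflow: with 10-bit residuals (the usual 16bpp configuration) a single 8x8 Hadamard sum can already approach half a million, so 16-bit word lanes cannot safely accumulate more than one block. A back-of-the-envelope check (an estimate, not taken from the patch):

    #include <stdio.h>

    /* Worst-case 8x8 sa8d sum at 10-bit depth: the sum of absolute 8x8
     * Hadamard coefficients is bounded by 8 * 64 * max_diff, far beyond
     * the 65535 that a 16-bit word lane can hold. */
    int main(void)
    {
        const long maxDiff10 = (1 << 10) - 1;       /* 1023            */
        const long worst8x8  = 8 * 64 * maxDiff10;  /* 523776, ~2^19   */
        printf("worst-case 8x8 sum (10-bit): %ld\n", worst8x8);
        printf("16-bit word lane limit:      %d\n", (1 << 16) - 1);
        return 0;
    }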
@@ -3629,6 +3636,9 @@
lea r2, [r2 + r3*8]
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3646,8 +3656,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -3665,6 +3677,9 @@
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [rsp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3678,8 +3693,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -3696,6 +3713,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3722,6 +3742,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3748,6 +3771,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3761,8 +3787,10 @@
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1