[x265] [PATCH] asm: 16bpp support for sa8d - 24x32 and 48x64
Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
Wed Dec 4 12:40:08 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386157198 -19800
# Wed Dec 04 17:09:58 2013 +0530
# Node ID f79e21247dc3de0a21e3adfd5d800220d285e631
# Parent 9b062eb8124e9fb12bc16e32eab524ba080cf258
asm: 16bpp support for sa8d - 24x32 and 48x64
diff -r 9b062eb8124e -r f79e21247dc3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 04 14:54:59 2013 +0550
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 17:09:58 2013 +0530
@@ -516,6 +516,8 @@
p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_sse2;
p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_sse2;
+ p.sa8d_inter[LUMA_24x32] = x265_pixel_sa8d_24x32_sse2;
+ p.sa8d_inter[LUMA_48x64] = x265_pixel_sa8d_48x64_sse2;
p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
diff -r 9b062eb8124e -r f79e21247dc3 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Dec 04 14:54:59 2013 +0550
+++ b/source/common/x86/pixel-a.asm Wed Dec 04 17:09:58 2013 +0530
@@ -2683,38 +2683,38 @@
mova m7, [hmul_8p]
%endif
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
- sub r0, 8
- sub r2, 8
+ sub r0, 8*SIZEOF_PIXEL
+ sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
- sub r0, 8
- sub r2, 8
+ sub r0, 8*SIZEOF_PIXEL
+ sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
- sub r0, 8
- sub r2, 8
+ sub r0, 8*SIZEOF_PIXEL
+ sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
- sub r0, 8
- sub r2, 8
+ sub r0, 8*SIZEOF_PIXEL
+ sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
movd eax, m12
RET
@@ -2909,8 +2909,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- add r2, 16
- add r0, 16
+ add r2, 16*SIZEOF_PIXEL
+ add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2918,8 +2918,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- add r2, 16
- add r0, 16
+ add r2, 16*SIZEOF_PIXEL
+ add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2930,8 +2930,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- sub r2, 16
- sub r0, 16
+ sub r2, 16*SIZEOF_PIXEL
+ sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2939,8 +2939,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- sub r2, 16
- sub r0, 16
+ sub r2, 16*SIZEOF_PIXEL
+ sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2951,8 +2951,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- add r2, 16
- add r0, 16
+ add r2, 16*SIZEOF_PIXEL
+ add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2960,8 +2960,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- add r2, 16
- add r0, 16
+ add r2, 16*SIZEOF_PIXEL
+ add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2972,8 +2972,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- sub r2, 16
- sub r0, 16
+ sub r2, 16*SIZEOF_PIXEL
+ sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2981,8 +2981,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- sub r2, 16
- sub r0, 16
+ sub r2, 16*SIZEOF_PIXEL
+ sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -4577,6 +4577,9 @@
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [rsp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4590,8 +4593,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -4603,6 +4608,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4624,6 +4632,9 @@
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4650,6 +4661,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4671,6 +4685,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4692,6 +4709,9 @@
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4718,6 +4738,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4739,6 +4762,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4760,6 +4786,9 @@
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4786,6 +4815,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4807,6 +4839,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4828,6 +4863,9 @@
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4841,8 +4879,10 @@
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
More information about the x265-devel mailing list