[x265] [PATCH] asm: 16bpp asm code for pixel_sa8d_16xN
yuvaraj at multicorewareinc.com
Wed Dec 4 08:10:51 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386140982 -19800
# Wed Dec 04 12:39:42 2013 +0530
# Node ID 6a41cb559feb98056d30482651f5a83f5e326300
# Parent 55c0bf9d99661073a7acdb5749e2625379d8393a
asm: 16bpp asm code for pixel_sa8d_16xN
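For readers coming from the 8bpp code: sa8d is the sum of absolute values of the residual after an 8x8 Hadamard transform, computed per 8x8 block. A minimal C-style sketch of one 8x8 block follows; the helper names (hadamard8, sa8d_8x8_ref), the pixel typedef and the (sum + 2) >> 2 normalization mentioned in the final comment are illustrative assumptions, not code from this patch.

    /* Illustrative reference for one 8x8 sa8d block: Hadamard-transform
     * the residual and sum the absolute coefficients.  A sketch only;
     * names and the pixel typedef are assumptions for a 16bpp build. */
    #include <stdlib.h>
    #include <stdint.h>

    typedef uint16_t pixel;                /* 16bpp: pixels are 16-bit */

    static void hadamard8(int v[8])        /* in-place 1-D length-8 WHT */
    {
        for (int step = 1; step < 8; step <<= 1)
            for (int i = 0; i < 8; i += step << 1)
                for (int j = i; j < i + step; j++)
                {
                    int a = v[j], b = v[j + step];
                    v[j] = a + b;
                    v[j + step] = a - b;
                }
    }

    static int sa8d_8x8_ref(const pixel* pix1, intptr_t stride1,
                            const pixel* pix2, intptr_t stride2)
    {
        int d[8][8];
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                d[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];

        for (int y = 0; y < 8; y++)        /* transform rows */
            hadamard8(d[y]);
        for (int x = 0; x < 8; x++)        /* transform columns */
        {
            int col[8];
            for (int y = 0; y < 8; y++) col[y] = d[y][x];
            hadamard8(col);
            for (int y = 0; y < 8; y++) d[y][x] = col[y];
        }

        int sum = 0;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                sum += abs(d[y][x]);
        return sum;    /* the standalone 8x8 primitive is assumed to
                          return (sum + 2) >> 2 of this raw total */
    }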
diff -r 55c0bf9d9966 -r 6a41cb559feb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 03 14:14:44 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 12:39:42 2013 +0530
@@ -504,6 +504,18 @@
p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
+ p.sa8d_inter[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
+ p.sa8d_inter[LUMA_4x8] = x265_pixel_satd_4x8_sse2;
+ p.sa8d_inter[LUMA_4x16] = x265_pixel_satd_4x16_sse2;
+ p.sa8d_inter[LUMA_8x4] = x265_pixel_satd_8x4_sse2;
+ p.sa8d_inter[LUMA_8x16] = x265_pixel_sa8d_8x16_sse2;
+ p.sa8d_inter[LUMA_8x32] = x265_pixel_sa8d_8x32_sse2;
+ p.sa8d_inter[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
+ p.sa8d_inter[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
+ p.sa8d_inter[LUMA_16x8] = x265_pixel_sa8d_16x8_sse2;
+ p.sa8d_inter[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
+ p.sa8d_inter[LUMA_16x32] = x265_pixel_sa8d_16x32_sse2;
+ p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_sse2;
p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
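Note that the entries whose width or height is not a multiple of 8 (the 4xN, Nx4, 12x16 and 16x12 sizes) are pointed at satd kernels here, since sa8d only operates on whole 8x8 blocks; the 16xN sizes added by this patch get real sa8d kernels. Conceptually each 16xN function just tiles 8x8 sa8d over the block and rounds once at the end, roughly as in the sketch below (it reuses the hypothetical sa8d_8x8_ref from above, and the (sum + 2) >> 2 rounding is an assumption borrowed from the C reference style, not taken from this patch):

    /* Hypothetical composition of a 16xN sa8d from raw 8x8 sums. */
    static int sa8d_16xN_ref(const pixel* pix1, intptr_t stride1,
                             const pixel* pix2, intptr_t stride2, int height)
    {
        int sum = 0;
        for (int y = 0; y < height; y += 8)
            for (int x = 0; x < 16; x += 8)
                sum += sa8d_8x8_ref(pix1 + y * stride1 + x, stride1,
                                    pix2 + y * stride2 + x, stride2);
        return (sum + 2) >> 2;             /* single rounding at the end */
    }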
diff -r 55c0bf9d9966 -r 6a41cb559feb source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Dec 03 14:14:44 2013 -0600
+++ b/source/common/x86/pixel-a.asm Wed Dec 04 12:39:42 2013 +0530
@@ -2501,8 +2501,10 @@
%endmacro
%macro AVG_16x16 0
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
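The paddusw -> SA8D_INTER change makes the accumulation bit-depth aware: at 8bpp it is still a saturating word add, but at high bit depth the running total is kept in 32-bit dwords, which is also why the trailing HADDUW (the word-to-dword reduction) is now restricted to the 8bpp path. A rough intrinsics picture of the two accumulation styles (an illustration of the idea, not the macros themselves; the real HADDUW additionally folds the dwords into a single scalar):

    #include <emmintrin.h>

    /* 8bpp: saturating unsigned word add (paddusw).
     * 16bpp: widen each new partial sum from words to dwords, then paddd,
     * so the running total is not limited to 16 bits. */
    static inline __m128i accumulate_sa8d(__m128i acc, __m128i block,
                                          int highBitDepth)
    {
        if (!highBitDepth)
            return _mm_adds_epu16(acc, block);                   /* paddusw */

        __m128i lo = _mm_and_si128(block, _mm_set1_epi32(0xffff));
        __m128i hi = _mm_srli_epi32(block, 16);
        return _mm_add_epi32(acc, _mm_add_epi32(lo, hi));        /* paddd */
    }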
@@ -2630,8 +2632,8 @@
mova m7, [hmul_8p]
%endif
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
movd eax, m12
RET
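The 8 -> 8*SIZEOF_PIXEL change is the usual high-bit-depth address fix: r0 and r2 are byte pointers, so stepping 8 pixels to the right is 8 bytes at 8bpp but 16 bytes at 16bpp. Spelled out in C (a trivial sketch; x265 defines pixel and SIZEOF_PIXEL itself):

    #include <stdint.h>

    #if HIGH_BIT_DEPTH
    typedef uint16_t pixel;
    #else
    typedef uint8_t  pixel;
    #endif
    #define SIZEOF_PIXEL sizeof(pixel)

    /* byte-pointer equivalent of "add r0, 8*SIZEOF_PIXEL" */
    static const uint8_t* step_right_8_pixels(const uint8_t* plane)
    {
        return plane + 8 * SIZEOF_PIXEL;   /* plain "+ 8" is wrong at 16bpp */
    }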
@@ -3601,6 +3603,9 @@
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [rsp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3614,8 +3619,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
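The same pattern repeats in the remaining 16xN functions: immediately after the first pixel_sa8d_8x8_internal2 call, the high-bit-depth path applies HADDUW so the value saved at [esp+48]/[rsp+48] is already in dwords, and every later block is folded in through SA8D_INTER. The reason the reduction cannot wait until the end is overflow: with 10-bit residuals (the usual 16bpp configuration) a single 8x8 Hadamard sum can already approach half a million, so 16-bit word lanes cannot safely accumulate more than one block. A back-of-the-envelope check (an estimate, not taken from the patch):

    #include <stdio.h>

    /* Worst-case 8x8 sa8d sum at 10-bit depth: the sum of absolute 8x8
     * Hadamard coefficients is bounded by 8 * 64 * max_diff, far beyond
     * the 65535 that a 16-bit word lane can hold. */
    int main(void)
    {
        const long maxDiff10 = (1 << 10) - 1;       /* 1023            */
        const long worst8x8  = 8 * 64 * maxDiff10;  /* 523776, ~2^19   */
        printf("worst-case 8x8 sum (10-bit): %ld\n", worst8x8);
        printf("16-bit word lane limit:      %d\n", (1 << 16) - 1);
        return 0;
    }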
@@ -3629,6 +3636,9 @@
lea r2, [r2 + r3*8]
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3646,8 +3656,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -3665,6 +3677,9 @@
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [rsp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3678,8 +3693,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -3696,6 +3713,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3722,6 +3742,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3748,6 +3771,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -3761,8 +3787,10 @@
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1