[x265] [PATCH] asm: 16bpp support for sa8d_32xN
yuvaraj at multicorewareinc.com
Wed Dec 4 12:50:23 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386157811 -19800
# Wed Dec 04 17:20:11 2013 +0530
# Node ID 4ce258ca871d23f73fa1aff580a22c2c36dcc27b
# Parent f79e21247dc3de0a21e3adfd5d800220d285e631
asm: 16bpp support for sa8d_32xN
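
For 16bpp (HIGH_BIT_DEPTH) builds this patch makes two kinds of changes: the
pointer advances between 8x8 and 16x16 blocks are scaled by SIZEOF_PIXEL,
because a pixel occupies two bytes instead of one, and the per-block word sums
are widened with HADDUW before accumulation so the larger 16bpp values do not
saturate the 16-bit unsigned adds that the 8bpp path relies on. A minimal
C-style sketch of the offset scaling (the typedef and function name below are
illustrative only, not taken from the source tree):

    #include <stdint.h>

    #if HIGH_BIT_DEPTH
    typedef uint16_t pixel;   /* 16bpp build: each pixel is 2 bytes */
    #else
    typedef uint8_t  pixel;   /* 8bpp build: each pixel is 1 byte  */
    #endif

    /* Stepping to the next 8x8 block along a row: C pointer arithmetic
     * scales by sizeof(pixel) automatically, but the assembly works in
     * bytes, hence "add r0, 8*SIZEOF_PIXEL" rather than "add r0, 8". */
    static const uint8_t *next_block_bytes(const pixel *blk)
    {
        return (const uint8_t *)blk + 8 * sizeof(pixel);
    }
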
diff -r f79e21247dc3 -r 4ce258ca871d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 04 17:09:58 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 17:20:11 2013 +0530
@@ -518,6 +518,11 @@
p.sa8d_inter[LUMA_16x64] = x265_pixel_sa8d_16x64_sse2;
p.sa8d_inter[LUMA_24x32] = x265_pixel_sa8d_24x32_sse2;
p.sa8d_inter[LUMA_48x64] = x265_pixel_sa8d_48x64_sse2;
+ p.sa8d_inter[LUMA_32x8] = x265_pixel_sa8d_32x8_sse2;
+ p.sa8d_inter[LUMA_32x16] = x265_pixel_sa8d_32x16_sse2;
+ p.sa8d_inter[LUMA_32x24] = x265_pixel_sa8d_32x24_sse2;
+ p.sa8d_inter[LUMA_32x32] = x265_pixel_sa8d_32x32_sse2;
+ p.sa8d_inter[LUMA_32x64] = x265_pixel_sa8d_32x64_sse2;
p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
diff -r f79e21247dc3 -r 4ce258ca871d source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Dec 04 17:09:58 2013 +0530
+++ b/source/common/x86/pixel-a.asm Wed Dec 04 17:20:11 2013 +0530
@@ -2728,14 +2728,14 @@
mova m7, [hmul_8p]
%endif
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
movd eax, m12
RET
@@ -2753,8 +2753,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- add r2, 16
- add r0, 16
+ add r2, 16*SIZEOF_PIXEL
+ add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2770,38 +2770,38 @@
mova m7, [hmul_8p]
%endif
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
- sub r0, 8
- sub r2, 8
+ sub r0, 8*SIZEOF_PIXEL
+ sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
- sub r0, 8
- sub r2, 8
+ sub r0, 8*SIZEOF_PIXEL
+ sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
- sub r0, 8
- sub r2, 8
+ sub r0, 8*SIZEOF_PIXEL
+ sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
SA8D_8x8
movd eax, m12
RET
@@ -2819,8 +2819,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- add r2, 16
- add r0, 16
+ add r2, 16*SIZEOF_PIXEL
+ add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2831,8 +2831,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- sub r2, 16
- sub r0, 16
+ sub r2, 16*SIZEOF_PIXEL
+ sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2852,8 +2852,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- add r2, 16
- add r0, 16
+ add r2, 16*SIZEOF_PIXEL
+ add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2864,8 +2864,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- sub r2, 16
- sub r0, 16
+ sub r2, 16*SIZEOF_PIXEL
+ sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2876,8 +2876,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- add r2, 16
- add r0, 16
+ add r2, 16*SIZEOF_PIXEL
+ add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -2888,8 +2888,8 @@
lea r5, [8*r3]
sub r2, r4
sub r0, r5
- sub r2, 16
- sub r0, 16
+ sub r2, 16*SIZEOF_PIXEL
+ sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
@@ -4030,6 +4030,9 @@
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [rsp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4043,8 +4046,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -4056,6 +4061,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4069,8 +4077,10 @@
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -4252,6 +4262,9 @@
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [rsp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4265,8 +4278,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -4278,6 +4293,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4291,13 +4309,7 @@
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
- HADDUW m0, m1
- movd r4d, m0
- add r4d, 1
- shr r4d, 1
- add r4d, dword [esp+36]
- mov dword [esp+36], r4d
+ AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
@@ -4307,6 +4319,9 @@
lea r2, [r2 + r3*8]
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4324,13 +4339,7 @@
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
- HADDUW m0, m1
- movd r4d, m0
- add r4d, 1
- shr r4d, 1
- add r4d, dword [esp+36]
- mov dword [esp+36], r4d
+ AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
@@ -4342,6 +4351,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4359,8 +4371,10 @@
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -4378,6 +4392,9 @@
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [rsp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4391,8 +4408,10 @@
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
@@ -4404,6 +4423,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4430,6 +4452,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4451,6 +4476,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4477,6 +4505,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4498,6 +4529,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4524,6 +4558,9 @@
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4545,6 +4582,9 @@
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
+%if HIGH_BIT_DEPTH
+ HADDUW m0, m1
+%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
@@ -4558,8 +4598,10 @@
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
- paddusw m0, [esp+48]
+ SA8D_INTER
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
+%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
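
For reference, the scalar tail repeated after each 16x16 sub-block above
("movd r4d, m0", "add r4d, 1", "shr r4d, 1", then the add into dword [esp+36])
applies a rounded halving to the partial sum before folding it into the running
total kept on the stack; at two of the sites above the patch folds that
open-coded sequence, together with the preceding sum reduction, into the
AVG_16x16 macro. A sketch of the arithmetic in C (the helper name is
illustrative only):

    #include <stdint.h>

    /* Rounded halving of a 16x16 partial sum, then accumulation into the
     * running total -- what the movd/add/shr/add sequence computes. */
    static inline uint32_t accumulate_block(uint32_t running_total, uint32_t block_sum)
    {
        return running_total + ((block_sum + 1) >> 1);
    }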