[x265] [PATCH] asm: pixel_satd_32xN for 16bpp
Yuvaraj Venkatesh
yuvaraj at multicorewareinc.com
Tue Dec 3 10:56:26 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386064572 -19800
# Tue Dec 03 15:26:12 2013 +0530
# Node ID 31c21157620ce37d18c8d11132caf8c79a7e449a
# Parent a616349e2a19c18369a9cf4524202fa6ebe5b6be
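asm: pixel_satd_32xN for 16bpp

Scale the 8/16/24-pixel column offsets in the 32xN SATD routines of
pixel-a.asm by SIZEOF_PIXEL instead of using fixed byte offsets. In
pixel_satd_32x24 and pixel_satd_32x32, add a SATD_ACCUM step after each
8-pixel-wide column and pass m7 to SATD_END_SSE2. In asm-primitives.cpp,
add HEVC_SATD(sse2) plus explicit 4x4, 32x8, 32x16 and 32x24 SSE2 entries,
and drop the explicit 24x32, 48x64 and 64xN assignments.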
diff -r a616349e2a19 -r 31c21157620c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 03 15:13:25 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 15:26:12 2013 +0530
@@ -491,6 +491,8 @@
if (cpuMask & X265_CPU_SSE2)
{
INIT6(satd, _sse2);
+ HEVC_SATD(sse2);
+ p.satd[LUMA_4x4] = x265_pixel_satd_4x4_mmx2;
p.satd[LUMA_4x16] = x265_pixel_satd_4x16_sse2;
p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
@@ -498,12 +500,9 @@
p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
p.satd[LUMA_16x64] = x265_pixel_satd_16x64_sse2;
p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
- p.satd[LUMA_24x32] = x265_pixel_satd_24x32_sse2;
- p.satd[LUMA_48x64] = x265_pixel_satd_48x64_sse2;
- p.satd[LUMA_64x16] = x265_pixel_satd_64x16_sse2;
- p.satd[LUMA_64x32] = x265_pixel_satd_64x32_sse2;
- p.satd[LUMA_64x48] = x265_pixel_satd_64x48_sse2;
- p.satd[LUMA_64x64] = x265_pixel_satd_64x64_sse2;
+ p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse2;
+ p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
+ p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
diff -r a616349e2a19 -r 31c21157620c source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Dec 03 15:13:25 2013 +0530
+++ b/source/common/x86/pixel-a.asm Tue Dec 03 15:26:12 2013 +0530
@@ -1190,14 +1190,14 @@
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
%else
@@ -1206,17 +1206,17 @@
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
+ add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
%endif
@@ -1228,16 +1228,16 @@
mov r7, r2
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
@@ -1248,19 +1248,19 @@
mov [rsp], r2
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
+ add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
@@ -1274,22 +1274,25 @@
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
%else
cglobal pixel_satd_32x24, 4,7,8,0-4 ;if !WIN64
SATD_START_SSE2 m6, m7
@@ -1298,25 +1301,31 @@
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
+%if HIGH_BIT_DEPTH
+ pxor m7, m7
+%endif
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ add r2, 24*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
%endif
%if WIN64
@@ -1328,28 +1337,29 @@
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
-
-
-%else
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
+%else
cglobal pixel_satd_32x32, 4,7,8,0-4 ;if !WIN64
SATD_START_SSE2 m6, m7
@@ -1359,28 +1369,34 @@
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
+%if HIGH_BIT_DEPTH
+ pxor m7, m7
+%endif
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ add r2, 24*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
%endif
@@ -1397,28 +1413,28 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
@@ -1447,31 +1463,31 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- lea r0, [r6 + 8]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
+ add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
More information about the x265-devel mailing list