[x265] [PATCH] asm: 16bpp support for satd_32xN
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Mon Dec 2 13:20:29 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1385986812 -19800
# Mon Dec 02 17:50:12 2013 +0530
# Node ID fa3a3eced7228599400f9403dba159d433d05222
# Parent d0bed5d188d772231f946d0e54bb96c2eff5d38a
asm: 16bpp support for satd_32xN
diff -r d0bed5d188d7 -r fa3a3eced722 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Dec 02 15:29:22 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Dec 02 17:50:12 2013 +0530
@@ -498,6 +498,12 @@
p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
p.satd[LUMA_24x32] = x265_pixel_satd_24x32_sse2;
p.satd[LUMA_48x64] = x265_pixel_satd_48x64_sse2;
+ p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse2;
+ p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse2;
+ p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse2;
+ p.satd[LUMA_32x32] = x265_pixel_satd_32x32_sse2;
+ p.satd[LUMA_32x64] = x265_pixel_satd_32x64_sse2;
+
p.sa8d_inter[LUMA_8x8] = x265_pixel_sa8d_8x8_sse2;
p.sa8d_inter[LUMA_16x16] = x265_pixel_sa8d_16x16_sse2;
diff -r d0bed5d188d7 -r fa3a3eced722 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Dec 02 15:29:22 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Dec 02 17:50:12 2013 +0530
@@ -3788,14 +3788,14 @@
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
%else
@@ -3804,17 +3804,17 @@
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
+ add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
%endif
@@ -3826,16 +3826,16 @@
mov r7, r2
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
@@ -3846,19 +3846,19 @@
mov [rsp], r2
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
+ add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
@@ -3872,22 +3872,25 @@
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
%else
cglobal pixel_satd_32x24, 4,7,8,0-4 ;if !WIN64
SATD_START_SSE2 m6, m7
@@ -3896,25 +3899,29 @@
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
+ pxor m7, m7
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ add r2, 24*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
%endif
%if WIN64
@@ -3926,28 +3933,29 @@
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
-
-
-%else
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
+%else
cglobal pixel_satd_32x32, 4,7,8,0-4 ;if !WIN64
SATD_START_SSE2 m6, m7
@@ -3957,28 +3965,32 @@
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- lea r0, [r6 + 8]
+ pxor m7, m7
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ add r2, 24*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ call pixel_satd_8x8_internal
+ SATD_END_SSE2 m6, m7
%endif
@@ -3995,28 +4007,28 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- lea r0, [r6 + 8]
- lea r2, [r7 + 8]
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- lea r0, [r6 + 16]
- lea r2, [r7 + 16]
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- lea r0, [r6 + 24]
- lea r2, [r7 + 24]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
+ lea r2, [r7 + 8*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
+ lea r2, [r7 + 16*SIZEOF_PIXEL]
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
+ lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
@@ -4045,31 +4057,31 @@
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
- lea r0, [r6 + 8]
+ lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 8
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- lea r0, [r6 + 16]
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 16
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- call pixel_satd_8x8_internal2
- lea r0, [r6 + 24]
+ add r2, 16*SIZEOF_PIXEL
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ call pixel_satd_8x8_internal2
+ lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
- add r2, 24
+ add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
@@ -4087,6 +4099,7 @@
RET
%endif
+
%if WIN64
cglobal pixel_satd_48x64, 4,8,8 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
More information about the x265-devel mailing list