[x265] [PATCH] asm: assembly code for pixel_satd_12x16
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Tue Nov 12 14:56:21 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1384264561 -19800
# Tue Nov 12 19:26:01 2013 +0530
# Node ID 794c74c7339734a2fa5c87d05462c76d43adafba
# Parent 76120c6aa908b35734f8e09541090e1834584070
asm: assembly code for pixel_satd_12x16
diff -r 76120c6aa908 -r 794c74c73397 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Nov 12 16:34:37 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Nov 12 19:26:01 2013 +0530
@@ -319,7 +319,7 @@
INIT8(satd, _mmx2);
HEVC_SATD(mmx2);
p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
- p.satd[LUMA_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_mmx2>;
+ p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse2;
p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
p.satd[LUMA_16x12] = x265_pixel_satd_16x12_sse2;
p.satd[LUMA_16x32] = x265_pixel_satd_16x32_sse2;
@@ -467,7 +467,7 @@
if (cpuMask & X265_CPU_SSE4)
{
p.satd[LUMA_4x16] = x265_pixel_satd_4x16_sse4;
- p.satd[LUMA_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_sse4>;
+ p.satd[LUMA_12x16] = x265_pixel_satd_12x16_sse4;
p.satd[LUMA_32x8] = x265_pixel_satd_32x8_sse4;
p.satd[LUMA_32x16] = x265_pixel_satd_32x16_sse4;
p.satd[LUMA_32x24] = x265_pixel_satd_32x24_sse4;
@@ -486,7 +486,7 @@
{
p.frame_init_lowres_core = x265_frame_init_lowres_core_avx;
p.satd[LUMA_4x16] = x265_pixel_satd_4x16_avx;
- p.satd[LUMA_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_avx>;
+ p.satd[LUMA_12x16] = x265_pixel_satd_12x16_avx;
p.satd[LUMA_32x8] = x265_pixel_satd_32x8_avx;
p.satd[LUMA_32x16] = x265_pixel_satd_32x16_avx;
p.satd[LUMA_32x24] = x265_pixel_satd_32x24_avx;
diff -r 76120c6aa908 -r 794c74c73397 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Nov 12 16:34:37 2013 +0530
+++ b/source/common/x86/pixel-a.asm Tue Nov 12 19:26:01 2013 +0530
@@ -1993,6 +1993,64 @@
SATD_END_SSE2 m6, m7
%endif
+%if ARCH_X86_64 ; WIN64 and UNIX64 both receive the pointer args in registers, so keep copies in r6/r7
+cglobal pixel_satd_12x16, 4,8,8 ; 12x16 SATD built from three 4-pixel-wide columns, two SATD_4x8_SSE calls each
+    SATD_START_MMX
+    mov r6, r0 ; r6 = r0 as passed in, base for the later column offsets
+    mov r7, r2 ; r7 = r2 as passed in
+%if vertical==0
+    mova m7, [hmul_4p]
+%endif
+    SATD_4x8_SSE vertical, 0, swap ; column 0, first block (presumably `swap` starts the running sum - confirm in macro)
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add ; column 0, second block
+    lea r0, [r6 + 4*SIZEOF_PIXEL] ; rewind to the saved base, then step 4 pixels right (column 1)
+    lea r2, [r7 + 4*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add ; column 1, first block
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add ; column 1, second block
+    lea r0, [r6 + 8*SIZEOF_PIXEL] ; rewind to the saved base, then step 8 pixels right (column 2)
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add ; column 2, first block
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add ; column 2, second block
+    HADDW m7, m1
+    movd eax, m7 ; low dword of m7 is the return value
+    RET
+%else ; x86-32: args live on the stack, so r0mp/r2mp reload the pointers exactly as the caller passed them
+cglobal pixel_satd_12x16, 4,6,8 ; same computation as the 64-bit path, without spare GPRs for pointer copies
+    SATD_START_MMX
+%if vertical==0
+    mova m7, [hmul_4p]
+%endif
+    SATD_4x8_SSE vertical, 0, swap ; column 0, first block (presumably `swap` starts the running sum - confirm in macro)
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add ; column 0, second block
+    mov r0, r0mp ; reload the original first pointer from its stack slot
+    mov r2, r2mp ; reload the original second pointer from its stack slot
+    add r0, 4*SIZEOF_PIXEL ; step 4 pixels right (column 1)
+    add r2, 4*SIZEOF_PIXEL
+    SATD_4x8_SSE vertical, 1, add ; column 1, first block
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add ; column 1, second block
+    mov r0, r0mp ; reload the original first pointer again
+    mov r2, r2mp ; reload the original second pointer again
+    add r0, 8*SIZEOF_PIXEL ; step 8 pixels right (column 2)
+    add r2, 8*SIZEOF_PIXEL
+    SATD_4x8_SSE vertical, 1, add ; column 2, first block
+    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
+    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
+    SATD_4x8_SSE vertical, 1, add ; column 2, second block
+    HADDW m7, m1
+    movd eax, m7 ; low dword of m7 is the return value
+    RET
+%endif ; ARCH_X86_64
+
cglobal pixel_satd_8x32, 4,6,8
SATD_START_SSE2 m6, m7
%if vertical
More information about the x265-devel
mailing list