[x265] [PATCH Review only] asm: code for scale2D_64to32 routine
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Thu Nov 14 15:52:08 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1384440689 -19800
# Thu Nov 14 20:21:29 2013 +0530
# Node ID 939b58fa36f56506f32ad761f6c3df72e20e0f2b
# Parent 4526a727f0b4975eeaa1094e0ced0a3b3b5c5a7d
asm: code for scale2D_64to32 routine
diff -r 4526a727f0b4 -r 939b58fa36f5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 14 16:57:19 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 14 20:21:29 2013 +0530
@@ -464,6 +464,7 @@
PIXEL_AVG_W4(ssse3);
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+ p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
diff -r 4526a727f0b4 -r 939b58fa36f5 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Nov 14 16:57:19 2013 +0530
+++ b/source/common/x86/pixel-a.asm Thu Nov 14 20:21:29 2013 +0530
@@ -153,6 +153,7 @@
cextern pb_0
cextern pb_1
cextern pw_1
+cextern pw_2
cextern pw_8
cextern pw_16
cextern pw_32
@@ -6846,3 +6847,102 @@
movu [r0 + 48], m4
RET
+
+;-----------------------------------------------------------------
+; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+;
+; Downscales a 64x64 block of byte pixels to 32x32. Each output
+; pixel is the rounded mean of one 2x2 group of input pixels:
+;     dst = (a + b + c + d + 2) >> 2
+;
+; r0 = dst    - written as 32 consecutive rows of 32 bytes
+; r1 = src    - read with unaligned loads, advanced 2 rows per pass
+; r2 = stride - byte distance between two source rows
+; r3 = output rows still to produce
+; m4 = pb_1, m5 = pw_2 (loop-invariant constants)
+;-----------------------------------------------------------------
+INIT_XMM ssse3
+cglobal scale2D_64to32, 3, 4, 6, dest, src, stride
+    mova        m4, [pb_1]
+    mova        m5, [pw_2]
+    mov         r3d, 32
+
+.loop:
+    ; pmaddubsw against a vector of ones adds each horizontal byte
+    ; pair into a word (max 510, so the signed saturation never
+    ; triggers); adding the two rows then yields the 2x2 sum.
+
+    ; source columns 0..31 -> output columns 0..15
+    movu        m0, [r1]
+    movu        m1, [r1 + r2]
+    movu        m2, [r1 + 16]
+    movu        m3, [r1 + r2 + 16]
+    pmaddubsw   m0, m4
+    pmaddubsw   m1, m4
+    pmaddubsw   m2, m4
+    pmaddubsw   m3, m4
+    paddw       m0, m1
+    paddw       m2, m3
+    paddw       m0, m5                  ; + 2 for rounding
+    paddw       m2, m5
+    psrlw       m0, 2
+    psrlw       m2, 2
+    packuswb    m0, m2                  ; results are <= 255, no clipping
+    movu        [r0], m0
+
+    ; source columns 32..63 -> output columns 16..31
+    movu        m0, [r1 + 32]
+    movu        m1, [r1 + r2 + 32]
+    movu        m2, [r1 + 48]
+    movu        m3, [r1 + r2 + 48]
+    pmaddubsw   m0, m4
+    pmaddubsw   m1, m4
+    pmaddubsw   m2, m4
+    pmaddubsw   m3, m4
+    paddw       m0, m1
+    paddw       m2, m3
+    paddw       m0, m5
+    paddw       m2, m5
+    psrlw       m0, 2
+    psrlw       m2, 2
+    packuswb    m0, m2
+    movu        [r0 + 16], m0
+
+    lea         r1, [r1 + 2 * r2]       ; next pair of source rows
+    add         r0, 32                  ; next output row
+    dec         r3d
+    jnz         .loop
+    RET
diff -r 4526a727f0b4 -r 939b58fa36f5 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Thu Nov 14 16:57:19 2013 +0530
+++ b/source/common/x86/pixel.h Thu Nov 14 20:21:29 2013 +0530
@@ -117,6 +117,7 @@
int x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
int x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t);
void x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t);
+void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t);
DECL_PIXELS(uint64_t, var, mmx2, (pixel * pix, intptr_t i_stride))
DECL_PIXELS(uint64_t, var, sse2, (pixel * pix, intptr_t i_stride))
More information about the x265-devel
mailing list