[x265] [PATCH] asm: code for scale2D_64to32 routine
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Mon Nov 18 12:20:04 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1384773570 -19800
# Mon Nov 18 16:49:30 2013 +0530
# Node ID c355ba4b6711bfad87ff37d650a8f1946f878eec
# Parent 2321ebe0bf64e5f3c0034076c7edb3ecbcd48039
asm: code for scale2D_64to32 routine
diff -r 2321ebe0bf64 -r c355ba4b6711 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Nov 18 11:32:06 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 18 16:49:30 2013 +0530
@@ -530,6 +530,7 @@
PIXEL_AVG_W4(ssse3);
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+ p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
diff -r 2321ebe0bf64 -r c355ba4b6711 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Nov 18 11:32:06 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Nov 18 16:49:30 2013 +0530
@@ -8230,3 +8230,113 @@
movu [r0 + 48], m4
RET
+
+;-----------------------------------------------------------------
+; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+;
+; 2:1 downscale in both directions.  Each pass of the loop takes two
+; source rows (r1 and r1 + stride), 64 bytes each, and stores one
+; 32-byte row at r0; 32 passes are made.  Per 2x2 group (i j / k l)
+; the right-hand bytes j and l are the odd bytes of the same loads as
+; i and k, so no byte beyond offset 63 of a source row is ever read.
+;-----------------------------------------------------------------
+INIT_XMM ssse3
+cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
+
+    mova      m7, [deinterleave_shuf]   ;presumably even bytes -> low qword
+    mov       r3d, 32                   ;output rows still to produce
+
+.loop:
+    ; source columns 0..15 -> m0 (only the even byte lanes are meaningful)
+    movu      m0, [r1]                  ;i
+    psrlw     m1, m0, 8                 ;j = odd bytes moved to even lanes
+    movu      m2, [r1 + r2]             ;k
+    psrlw     m3, m2, 8                 ;l = odd bytes moved to even lanes
+    mova      m4, m0
+    mova      m5, m2
+    pxor      m4, m1                    ;i^j
+    pxor      m5, m3                    ;k^l
+    por       m4, m5                    ;ij|kl
+
+    pavgb     m0, m1                    ;s
+    pavgb     m2, m3                    ;t
+    mova      m5, m0
+    pavgb     m0, m2                    ;(s+t+1)/2
+    pxor      m5, m2                    ;s^t
+    pand      m4, m5                    ;(ij|kl)&st
+    pand      m4, [hmul_16p]            ;keep bit 0 (mask assumed 16 x 0x01)
+    psubb     m0, m4                    ;Result (rounding correction applied)
+    ; source columns 16..31 -> m1
+    movu      m1, [r1 + 16]             ;i
+    psrlw     m2, m1, 8                 ;j
+    movu      m3, [r1 + r2 + 16]        ;k
+    psrlw     m4, m3, 8                 ;l
+    mova      m5, m1
+    mova      m6, m3
+    pxor      m5, m2                    ;i^j
+    pxor      m6, m4                    ;k^l
+    por       m5, m6                    ;ij|kl
+
+    pavgb     m1, m2                    ;s
+    pavgb     m3, m4                    ;t
+    mova      m6, m1
+    pavgb     m1, m3                    ;(s+t+1)/2
+    pxor      m6, m3                    ;s^t
+    pand      m5, m6                    ;(ij|kl)&st
+    pand      m5, [hmul_16p]
+    psubb     m1, m5                    ;Result
+    ; gather the 8 useful bytes of m0 and m1 and store output bytes 0..15
+    pshufb    m0, m7
+    pshufb    m1, m7
+    punpcklqdq m0, m1
+    movu      [r0], m0
+    ; source columns 32..47 -> m0
+    movu      m0, [r1 + 32]             ;i
+    psrlw     m1, m0, 8                 ;j
+    movu      m2, [r1 + r2 + 32]        ;k
+    psrlw     m3, m2, 8                 ;l
+    mova      m4, m0
+    mova      m5, m2
+    pxor      m4, m1                    ;i^j
+    pxor      m5, m3                    ;k^l
+    por       m4, m5                    ;ij|kl
+
+    pavgb     m0, m1                    ;s
+    pavgb     m2, m3                    ;t
+    mova      m5, m0
+    pavgb     m0, m2                    ;(s+t+1)/2
+    pxor      m5, m2                    ;s^t
+    pand      m4, m5                    ;(ij|kl)&st
+    pand      m4, [hmul_16p]
+    psubb     m0, m4                    ;Result
+    ; source columns 48..63 -> m1
+    movu      m1, [r1 + 48]             ;i
+    psrlw     m2, m1, 8                 ;j
+    movu      m3, [r1 + r2 + 48]        ;k
+    psrlw     m4, m3, 8                 ;l
+    mova      m5, m1
+    mova      m6, m3
+    pxor      m5, m2                    ;i^j
+    pxor      m6, m4                    ;k^l
+    por       m5, m6                    ;ij|kl
+
+    pavgb     m1, m2                    ;s
+    pavgb     m3, m4                    ;t
+    mova      m6, m1
+    pavgb     m1, m3                    ;(s+t+1)/2
+    pxor      m6, m3                    ;s^t
+    pand      m5, m6                    ;(ij|kl)&st
+    pand      m5, [hmul_16p]
+    psubb     m1, m5                    ;Result
+    ; gather the 8 useful bytes of m0 and m1 and store output bytes 16..31
+    pshufb    m0, m7
+    pshufb    m1, m7
+    punpcklqdq m0, m1
+    movu      [r0 + 16], m0
+
+    lea       r0, [r0 + 32]             ;next output row
+    lea       r1, [r1 + 2 * r2]         ;skip the two source rows just used
+    dec       r3d
+    jnz       .loop
+
+    RET
diff -r 2321ebe0bf64 -r c355ba4b6711 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Nov 18 11:32:06 2013 +0530
+++ b/source/common/x86/pixel.h Mon Nov 18 16:49:30 2013 +0530
@@ -117,6 +117,7 @@
int x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
int x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t);
void x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t);
+void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t);
DECL_PIXELS(uint64_t, var, mmx2, (pixel * pix, intptr_t i_stride))
DECL_PIXELS(uint64_t, var, sse2, (pixel * pix, intptr_t i_stride))
More information about the x265-devel mailing list