<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><DIV>reg0: i0 j0 i1 j1...</DIV>
<DIV>reg1: k0 l0 k1 l1... <BR>(reg0 ^ reg1) = (i ^ k) (j ^ l)...</DIV><PRE>So we can reduce the number of load operations.</PRE><PRE>Of course, this code is correct. We have already spent several days on it, so keep this implementation and move on to other functions.</PRE><PRE>At 2013-11-18 19:20:04,murugan@multicorewareinc.com wrote:</PRE>
># HG changeset patch
># User Murugan Vairavel <murugan@multicorewareinc.com>
># Date 1384773570 -19800
># Mon Nov 18 16:49:30 2013 +0530
># Node ID c355ba4b6711bfad87ff37d650a8f1946f878eec
># Parent 2321ebe0bf64e5f3c0034076c7edb3ecbcd48039
>asm: code for scale2D_64to32 routine>
>diff -r 2321ebe0bf64 -r c355ba4b6711 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Mon Nov 18 11:32:06 2013 +0530
>+++ b/source/common/x86/asm-primitives.cpp Mon Nov 18 16:49:30 2013 +0530
>@@ -530,6 +530,7 @@
> PIXEL_AVG_W4(ssse3);
>
> p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
>+ p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
>
> p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
> p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
>diff -r 2321ebe0bf64 -r c355ba4b6711 source/common/x86/pixel-a.asm
>--- a/source/common/x86/pixel-a.asm Mon Nov 18 11:32:06 2013 +0530
>+++ b/source/common/x86/pixel-a.asm Mon Nov 18 16:49:30 2013 +0530
>@@ -8230,3 +8230,113 @@
> movu [r0 + 48], m4
> > RET>+
>+;-----------------------------------------------------------------
>+; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
>+;-----------------------------------------------------------------
>+INIT_XMM ssse3 ; x86inc: use XMM registers and name the function with the ssse3 suffix (pshufb below needs SSSE3)
>+cglobal scale2D_64to32, 3, 4, 8, dest, src, stride ; x86inc: 3 args, 4 GPRs, 8 XMM regs; by argument order r0 = dst, r1 = src, r2 = stride, r3 = scratch
>+ ; Each loop pass reads two 64-byte source rows and stores 32 bytes to dst. Presumably every output byte is the rounded mean of a 2x2 block, (i+j+k+l+2)>>2 - TODO confirm against the C reference.
>+ mova m7, [deinterleave_shuf] ; m7 = pshufb mask (defined elsewhere); presumably gathers the even-indexed bytes into the low 8 bytes - confirm
>+ mov r3d, 32 ; r3d = loop counter: 32 passes, two source rows consumed per pass
>+.loop>+
>+ movu m0, [r1] ;i: 16 bytes of the upper row, offsets 0..15
>+ movu m1, [r1 + 1] ;j: upper row shifted by one byte (right-hand neighbour of each i)
>+ movu m2, [r1 + r2] ;k: row below (src + stride), offsets 0..15
>+ movu m3, [r1 + r2 + 1] ;l: row below, shifted by one byte
>+ movu m4, m0 ; copy of i, because pavgb below overwrites m0
>+ movu m5, m2 ; copy of k, because pavgb below overwrites m2
>+
>+ pxor m4, m1 ;i^j: bits in which the upper pair differs
>+ pxor m5, m3 ;k^l: bits in which the lower pair differs
>+ por m4, m5 ;(i^j)|(k^l)
>+
>+ pavgb m0, m1 ;s = (i+j+1)>>1 per byte
>+ pavgb m2, m3 ;t = (k+l+1)>>1 per byte
>+ movu m5, m0 ; keep s for the s^t term
>+ pavgb m0, m2 ;(s+t+1)/2: nested average, can be one too high because both levels round up
>+ pxor m5, m2 ;s^t
>+ pand m4, m5 ;((i^j)|(k^l)) & (s^t)
>+ pand m4, [hmul_16p] ; mask with a constant defined elsewhere; presumably keeps only bit 0 of each byte - confirm
>+ psubb m0, m4 ;Result: nested average minus the rounding correction
>+ ; Same sequence as above for source offsets 16..31, using m1..m6 so the result in m0 stays live.
>+ movu m1, [r1 + 16] ;i
>+ movu m2, [r1 + 16 + 1] ;j
>+ movu m3, [r1 + r2 + 16] ;k
>+ movu m4, [r1 + r2 + 16 + 1] ;l
>+ movu m5, m1 ; copy of i
>+ movu m6, m3 ; copy of k
>+
>+ pxor m5, m2 ;i^j
>+ pxor m6, m4 ;k^l
>+ por m5, m6 ;(i^j)|(k^l)
>+
>+ pavgb m1, m2 ;s
>+ pavgb m3, m4 ;t
>+ movu m6, m1 ; keep s for the s^t term
>+ pavgb m1, m3 ;(s+t+1)/2
>+ pxor m6, m3 ;s^t
>+ pand m5, m6 ;((i^j)|(k^l)) & (s^t)
>+ pand m5, [hmul_16p] ; same constant mask as in the first block
>+ psubb m1, m5 ;Result
>+ ; m0 and m1 hold a result for every byte offset; only every second one corresponds to a distinct 2x2 block.
>+ pshufb m0, m0, m7 ; reorder the result bytes of offsets 0..15 with the mask in m7
>+ pshufb m1, m1, m7 ; reorder the result bytes of offsets 16..31 with the mask in m7
>+
>+ punpcklqdq m0, m1 ; m0 = low 8 bytes of m0 followed by low 8 bytes of m1
>+ movu [r0], m0 ; store output bytes 0..15 of this row
>+ ; Second half of the row: identical computation for source offsets 32..47 and 48..63.
>+ movu m0, [r1 + 32] ;i
>+ movu m1, [r1 + 32 + 1] ;j
>+ movu m2, [r1 + r2 + 32] ;k
>+ movu m3, [r1 + r2 + 32 + 1] ;l
>+ movu m4, m0 ; copy of i
>+ movu m5, m2 ; copy of k
>+
>+ pxor m4, m1 ;i^j
>+ pxor m5, m3 ;k^l
>+ por m4, m5 ;(i^j)|(k^l)
>+
>+ pavgb m0, m1 ;s
>+ pavgb m2, m3 ;t
>+ movu m5, m0 ; keep s for the s^t term
>+ pavgb m0, m2 ;(s+t+1)/2
>+ pxor m5, m2 ;s^t
>+ pand m4, m5 ;((i^j)|(k^l)) & (s^t)
>+ pand m4, [hmul_16p] ; same constant mask as in the first block
>+ psubb m0, m4 ;Result
>+
>+ movu m1, [r1 + 48] ;i
>+ movu m2, [r1 + 48 + 1] ;j  NOTE(review): this 16-byte load covers row offsets 49..64, one byte past offset 63 - confirm the source buffer is padded
>+ movu m3, [r1 + r2 + 48] ;k
>+ movu m4, [r1 + r2 + 48 + 1] ;l  NOTE(review): also covers offsets 49..64, here in the lower row - confirm padding
>+ movu m5, m1 ; copy of i
>+ movu m6, m3 ; copy of k
>+
>+ pxor m5, m2 ;i^j
>+ pxor m6, m4 ;k^l
>+ por m5, m6 ;(i^j)|(k^l)
>+
>+ pavgb m1, m2 ;s
>+ pavgb m3, m4 ;t
>+ movu m6, m1 ; keep s for the s^t term
>+ pavgb m1, m3 ;(s+t+1)/2
>+ pxor m6, m3 ;s^t
>+ pand m5, m6 ;((i^j)|(k^l)) & (s^t)
>+ pand m5, [hmul_16p] ; same constant mask as in the first block
>+ psubb m1, m5 ;Result
>+
>+ pshufb m0, m0, m7 ; reorder the result bytes of offsets 32..47 with the mask in m7
>+ pshufb m1, m1, m7 ; reorder the result bytes of offsets 48..63 with the mask in m7
>+
>+ punpcklqdq m0, m1 ; m0 = low 8 bytes of m0 followed by low 8 bytes of m1
>+ movu [r0 + 16], m0 ; store output bytes 16..31 of this row
>+
>+ lea r0, [r0 + 32] ; dst advances by the 32 bytes just written
>+ lea r1, [r1 + 2 * r2] ; src advances by two rows (2 * stride)
>+ dec r3d>+
>+ jnz .loop>+>+RET
>diff -r 2321ebe0bf64 -r c355ba4b6711 source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Mon Nov 18 11:32:06 2013 +0530
>+++ b/source/common/x86/pixel.h Mon Nov 18 16:49:30 2013 +0530
>@@ -117,6 +117,7 @@
> int x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t);
> int x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t);
> void x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t);
>+void x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t);
>
> DECL_PIXELS(uint64_t, var, mmx2, (pixel * pix, intptr_t i_stride))
> DECL_PIXELS(uint64_t, var, sse2, (pixel * pix, intptr_t i_stride))
>_______________________________________________
>x265-devel mailing list>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel</div>