[x265] [PATCH] asm: 10bpp code for scale2D_64to32 routine
murugan at multicorewareinc.com
Thu Dec 5 08:59:24 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386230342 -19800
# Thu Dec 05 13:29:02 2013 +0530
# Node ID dbfde5222782eec2ba414d473fd4ba2494c6f333
# Parent e4a7885f377e37841c3ecd8e2419454fa1ba03db
asm: 10bpp code for scale2D_64to32 routine
diff -r e4a7885f377e -r dbfde5222782 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 04 13:45:29 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp Thu Dec 05 13:29:02 2013 +0530
@@ -567,6 +567,7 @@
if (cpuMask & X265_CPU_SSSE3)
{
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+ p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
diff -r e4a7885f377e -r dbfde5222782 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Dec 04 13:45:29 2013 -0600
+++ b/source/common/x86/pixel-util8.asm Thu Dec 05 13:29:02 2013 +0530
@@ -47,6 +47,8 @@
deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 15, 15
hmul_16p: times 16 db 1
times 8 db 1, -1
+hmulw_16p: times 8 dw 1
+ times 4 dw 1, -1
SECTION .text
@@ -1797,9 +1799,173 @@
;-----------------------------------------------------------------
INIT_XMM ssse3
cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
-
+ mov r3d, 32
+%if HIGH_BIT_DEPTH
+ mova m7, [deinterleave_word_shuf]
+ add r2, r2
+.loop
+ movu m0, [r1] ;i
+ movu m1, [r1 + 2] ;j
+ movu m2, [r1 + r2] ;k
+ movu m3, [r1 + r2 + 2] ;l
+ movu m4, m0
+ movu m5, m2
+ pxor m4, m1 ;i^j
+ pxor m5, m3 ;k^l
+ por m4, m5 ;ij|kl
+ pavgw m0, m1 ;s
+ pavgw m2, m3 ;t
+ movu m5, m0
+ pavgw m0, m2 ;(s+t+1)/2
+ pxor m5, m2 ;s^t
+ pand m4, m5 ;(ij|kl)&st
+ pand m4, [hmulw_16p]
+ psubw m0, m4 ;Result
+ movu m1, [r1 + 16] ;i
+ movu m2, [r1 + 16 + 2] ;j
+ movu m3, [r1 + r2 + 16] ;k
+ movu m4, [r1 + r2 + 16 + 2] ;l
+ movu m5, m1
+ movu m6, m3
+ pxor m5, m2 ;i^j
+ pxor m6, m4 ;k^l
+ por m5, m6 ;ij|kl
+ pavgw m1, m2 ;s
+ pavgw m3, m4 ;t
+ movu m6, m1
+ pavgw m1, m3 ;(s+t+1)/2
+ pxor m6, m3 ;s^t
+ pand m5, m6 ;(ij|kl)&st
+ pand m5, [hmulw_16p]
+ psubw m1, m5 ;Result
+ pshufb m0, m0, m7
+ pshufb m1, m1, m7
+
+ punpcklqdq m0, m1
+ movu [r0], m0
+
+ movu m0, [r1 + 32] ;i
+ movu m1, [r1 + 32 + 2] ;j
+ movu m2, [r1 + r2 + 32] ;k
+ movu m3, [r1 + r2 + 32 + 2] ;l
+ movu m4, m0
+ movu m5, m2
+ pxor m4, m1 ;i^j
+ pxor m5, m3 ;k^l
+ por m4, m5 ;ij|kl
+ pavgw m0, m1 ;s
+ pavgw m2, m3 ;t
+ movu m5, m0
+ pavgw m0, m2 ;(s+t+1)/2
+ pxor m5, m2 ;s^t
+ pand m4, m5 ;(ij|kl)&st
+ pand m4, [hmulw_16p]
+ psubw m0, m4 ;Result
+ movu m1, [r1 + 48] ;i
+ movu m2, [r1 + 48 + 2] ;j
+ movu m3, [r1 + r2 + 48] ;k
+ movu m4, [r1 + r2 + 48 + 2] ;l
+ movu m5, m1
+ movu m6, m3
+ pxor m5, m2 ;i^j
+ pxor m6, m4 ;k^l
+ por m5, m6 ;ij|kl
+ pavgw m1, m2 ;s
+ pavgw m3, m4 ;t
+ movu m6, m1
+ pavgw m1, m3 ;(s+t+1)/2
+ pxor m6, m3 ;s^t
+ pand m5, m6 ;(ij|kl)&st
+ pand m5, [hmulw_16p]
+ psubw m1, m5 ;Result
+ pshufb m0, m0, m7
+ pshufb m1, m1, m7
+
+ punpcklqdq m0, m1
+ movu [r0 + 16], m0
+
+ movu m0, [r1 + 64] ;i
+ movu m1, [r1 + 64 + 2] ;j
+ movu m2, [r1 + r2 + 64] ;k
+ movu m3, [r1 + r2 + 64 + 2] ;l
+ movu m4, m0
+ movu m5, m2
+ pxor m4, m1 ;i^j
+ pxor m5, m3 ;k^l
+ por m4, m5 ;ij|kl
+ pavgw m0, m1 ;s
+ pavgw m2, m3 ;t
+ movu m5, m0
+ pavgw m0, m2 ;(s+t+1)/2
+ pxor m5, m2 ;s^t
+ pand m4, m5 ;(ij|kl)&st
+ pand m4, [hmulw_16p]
+ psubw m0, m4 ;Result
+ movu m1, [r1 + 80] ;i
+ movu m2, [r1 + 80 + 2] ;j
+ movu m3, [r1 + r2 + 80] ;k
+ movu m4, [r1 + r2 + 80 + 2] ;l
+ movu m5, m1
+ movu m6, m3
+ pxor m5, m2 ;i^j
+ pxor m6, m4 ;k^l
+ por m5, m6 ;ij|kl
+ pavgw m1, m2 ;s
+ pavgw m3, m4 ;t
+ movu m6, m1
+ pavgw m1, m3 ;(s+t+1)/2
+ pxor m6, m3 ;s^t
+ pand m5, m6 ;(ij|kl)&st
+ pand m5, [hmulw_16p]
+ psubw m1, m5 ;Result
+ pshufb m0, m0, m7
+ pshufb m1, m1, m7
+
+ punpcklqdq m0, m1
+ movu [r0 + 32], m0
+
+ movu m0, [r1 + 96] ;i
+ movu m1, [r1 + 96 + 2] ;j
+ movu m2, [r1 + r2 + 96] ;k
+ movu m3, [r1 + r2 + 96 + 2] ;l
+ movu m4, m0
+ movu m5, m2
+ pxor m4, m1 ;i^j
+ pxor m5, m3 ;k^l
+ por m4, m5 ;ij|kl
+ pavgw m0, m1 ;s
+ pavgw m2, m3 ;t
+ movu m5, m0
+ pavgw m0, m2 ;(s+t+1)/2
+ pxor m5, m2 ;s^t
+ pand m4, m5 ;(ij|kl)&st
+ pand m4, [hmulw_16p]
+ psubw m0, m4 ;Result
+ movu m1, [r1 + 112] ;i
+ movu m2, [r1 + 112 + 2] ;j
+ movu m3, [r1 + r2 + 112] ;k
+ movu m4, [r1 + r2 + 112 + 2] ;l
+ movu m5, m1
+ movu m6, m3
+ pxor m5, m2 ;i^j
+ pxor m6, m4 ;k^l
+ por m5, m6 ;ij|kl
+ pavgw m1, m2 ;s
+ pavgw m3, m4 ;t
+ movu m6, m1
+ pavgw m1, m3 ;(s+t+1)/2
+ pxor m6, m3 ;s^t
+ pand m5, m6 ;(ij|kl)&st
+ pand m5, [hmulw_16p]
+ psubw m1, m5 ;Result
+ pshufb m0, m0, m7
+ pshufb m1, m1, m7
+
+ punpcklqdq m0, m1
+ movu [r0 + 48], m0
+ lea r0, [r0 + 64]
+%else
mova m7, [deinterleave_shuf]
- mov r3d, 32
.loop
movu m0, [r1] ;i
@@ -1895,9 +2061,9 @@
movu [r0 + 16], m0
lea r0, [r0 + 32]
+%endif
lea r1, [r1 + 2 * r2]
dec r3d
-
jnz .loop
RET
More information about the x265-devel mailing list