[x265] [PATCH Review only] asm: code for scale2D_64to32 routine
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Fri Nov 15 15:59:51 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1384527565 -19800
# Fri Nov 15 20:29:25 2013 +0530
# Node ID c795b4a126875f24d9df0bfeb48ab28396c9d7c8
# Parent 939b58fa36f56506f32ad761f6c3df72e20e0f2b
asm: code for scale2D_64to32 routine
diff -r 939b58fa36f5 -r c795b4a12687 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Nov 14 20:21:29 2013 +0530
+++ b/source/common/x86/pixel-a.asm Fri Nov 15 20:29:25 2013 +0530
@@ -6848,101 +6848,112 @@
RET
-;-----------------------------------------------------------------
-; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
-;-----------------------------------------------------------------
-INIT_XMM ssse3
-cglobal scale2D_64to32, 3, 7, 8, dest, src, stride
-
- mova m7, [pw_00ff]
- mova m6, [pw_2]
- xor r3, r3
- mov r6d, 32
-.loop
-
- mov r4, r3
- imul r4, r2
-
- mov r5, r3
- inc r5
- imul r5, r2
-
- movu m0, [r1 + r4]
- palignr m1, m0, 1
- movu m2, [r1 + r5]
- palignr m3, m2, 1
-
- pand m0, m7
- pand m1, m7
- pand m2, m7
- pand m3, m7
-
- paddusw m0, m1
- paddusw m0, m2
- paddusw m0, m3
- paddusw m0, m6
-
- psrlw m0, 2
-
- movu m4, [r1 + r4 + 16]
- palignr m5, m4, 1
- movu m1, [r1 + r5 + 16]
- palignr m2, m1, 1
-
- pand m4, m7
- pand m5, m7
- pand m1, m7
- pand m2, m7
-
- paddusw m4, m5
- paddusw m4, m1
- paddusw m4, m2
- paddusw m4, m6
- psrlw m4, 2
-
- packuswb m0, m4
- movu [r0], m0
-
- movu m0, [r1 + r4 + 32]
- palignr m1, m0, 1
- movu m2, [r1 + r5 + 32]
- palignr m3, m2, 1
-
- pand m0, m7
- pand m1, m7
- pand m2, m7
- pand m3, m7
-
- paddusw m0, m1
- paddusw m0, m2
- paddusw m0, m3
- paddusw m0, m6
-
- psrlw m0, 2
-
- movu m4, [r1 + r4 + 48]
- palignr m5, m4, 1
- movu m1, [r1 + r5 + 48]
- palignr m2, m1, 1
-
- pand m4, m7
- pand m5, m7
- pand m1, m7
- pand m2, m7
-
- paddusw m4, m5
- paddusw m4, m1
- paddusw m4, m2
- paddusw m4, m6
- psrlw m4, 2
-
- packuswb m0, m4
- movu [r0 + 16], m0
-
- lea r0, [r0 + 32]
- add r3, 2
- dec r6d
-
- jnz .loop
-
+;-----------------------------------------------------------------
+; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+;-----------------------------------------------------------------
+INIT_XMM ssse3
+cglobal scale2D_64to32, 3, 4, 8, dest, src, stride ; in the body: r0 = store pointer, r1 = load pointer, r2 = row pitch added to r1, r3d = pass counter
+ ; Each of the 32 passes reads 64 bytes from row [r1] and 64 bytes from row [r1 + r2], then stores 32 bytes at [r0].
+ mova m7, [deinterleave_shuf] ; pshufb control, defined outside this hunk; presumably gathers the even-indexed bytes into the low qword - TODO confirm
+ mov r3d, 32 ; number of passes; counted down to zero at the bottom of the loop
+.loop ; NOTE(review): label is written without a trailing colon
+ ; ---- source bytes 0..15 of both rows -> m0 ----
+ movu m0, [r1] ; i: 16 bytes of the first row
+ palignr m1, m0, 1 ; j: NOTE(review): presumably the 2-operand SSSE3 form, so m1 = m0 shifted down one byte with byte 15 taken from m1's old contents; that lane looks unused after the pshufb/punpcklqdq below - confirm
+ movu m2, [r1 + r2] ; k: 16 bytes of the second row (one pitch below)
+ palignr m3, m2, 1 ; l: k shifted down one byte, same caveat about byte 15 as for j
+ movu m4, m0 ; copy of i, consumed by the xor below; NOTE(review): register copy spelled movu
+ movu m5, m2 ; copy of k, consumed by the xor below
+ ; bits in which the members of each horizontal pair differ
+ pxor m4, m1 ;i^j
+ pxor m5, m3 ;k^l
+ por m4, m5 ;(i^j)|(k^l)
+ ; pavgb is a rounding-up byte average, (a+b+1)>>1, so averaging twice can round up once too often
+ pavgb m0, m1 ;s = avg(i,j)
+ pavgb m2, m3 ;t = avg(k,l)
+ movu m5, m0 ; keep s, because the next pavgb overwrites m0
+ pavgb m0, m2 ;(s+t+1)/2
+ pxor m5, m2 ;s^t
+ pand m4, m5 ;((i^j)|(k^l))&(s^t)
+ pand m4, [hmul_16p] ; constant defined outside this hunk; presumably 0x01 in every byte so only bit 0 survives - TODO confirm (legacy-SSE memory operand: assumes 16-byte alignment)
+ psubb m0, m4 ;Result: double average minus the masked term; with a 0x01 mask this equals (i+j+k+l+2)>>2 in the even byte lanes
+ ; ---- source bytes 16..31 of both rows -> m1 (same steps, registers shifted up by one) ----
+ movu m1, [r1 + 16] ;i
+ palignr m2, m1, 1 ;j: byte 15 comes from m2's old contents, see the note on the first palignr
+ movu m3, [r1 + r2 + 16] ;k
+ palignr m4, m3, 1 ;l
+ movu m5, m1 ; copy of i for the xor
+ movu m6, m3 ; copy of k for the xor
+ ; bits in which the members of each horizontal pair differ
+ pxor m5, m2 ;i^j
+ pxor m6, m4 ;k^l
+ por m5, m6 ;(i^j)|(k^l)
+ ; two rounding-up averages followed by the correction term
+ pavgb m1, m2 ;s = avg(i,j)
+ pavgb m3, m4 ;t = avg(k,l)
+ movu m6, m1 ; keep s, because the next pavgb overwrites m1
+ pavgb m1, m3 ;(s+t+1)/2
+ pxor m6, m3 ;s^t
+ pand m5, m6 ;((i^j)|(k^l))&(s^t)
+ pand m5, [hmul_16p] ; same mask constant as above
+ psubb m1, m5 ;Result for source bytes 16..31
+ ; rearrange both result vectors with the control loaded into m7
+ pshufb m0, m0, m7 ; presumably leaves the 8 wanted bytes in the low qword - confirm against deinterleave_shuf
+ pshufb m1, m1, m7
+ ; low qword of m0 followed by low qword of m1 = output bytes 0..15 of this row
+ punpcklqdq m0, m1
+ movu [r0], m0 ; unaligned 16-byte store
+ ; ---- source bytes 32..47 of both rows -> m0 ----
+ movu m0, [r1 + 32] ;i
+ palignr m1, m0, 1 ;j: byte 15 comes from m1's old contents, see the note on the first palignr
+ movu m2, [r1 + r2 + 32] ;k
+ palignr m3, m2, 1 ;l
+ movu m4, m0 ; copy of i for the xor
+ movu m5, m2 ; copy of k for the xor
+ ; bits in which the members of each horizontal pair differ
+ pxor m4, m1 ;i^j
+ pxor m5, m3 ;k^l
+ por m4, m5 ;(i^j)|(k^l)
+ ; two rounding-up averages followed by the correction term
+ pavgb m0, m1 ;s = avg(i,j)
+ pavgb m2, m3 ;t = avg(k,l)
+ movu m5, m0 ; keep s, because the next pavgb overwrites m0
+ pavgb m0, m2 ;(s+t+1)/2
+ pxor m5, m2 ;s^t
+ pand m4, m5 ;((i^j)|(k^l))&(s^t)
+ pand m4, [hmul_16p] ; same mask constant as above
+ psubb m0, m4 ;Result for source bytes 32..47
+ ; ---- source bytes 48..63 of both rows -> m1 ----
+ movu m1, [r1 + 48] ;i
+ palignr m2, m1, 1 ;j: byte 15 comes from m2's old contents, see the note on the first palignr
+ movu m3, [r1 + r2 + 48] ;k
+ palignr m4, m3, 1 ;l
+ movu m5, m1 ; copy of i for the xor
+ movu m6, m3 ; copy of k for the xor
+ ; bits in which the members of each horizontal pair differ
+ pxor m5, m2 ;i^j
+ pxor m6, m4 ;k^l
+ por m5, m6 ;(i^j)|(k^l)
+ ; two rounding-up averages followed by the correction term
+ pavgb m1, m2 ;s = avg(i,j)
+ pavgb m3, m4 ;t = avg(k,l)
+ movu m6, m1 ; keep s, because the next pavgb overwrites m1
+ pavgb m1, m3 ;(s+t+1)/2
+ pxor m6, m3 ;s^t
+ pand m5, m6 ;((i^j)|(k^l))&(s^t)
+ pand m5, [hmul_16p] ; same mask constant as above
+ psubb m1, m5 ;Result for source bytes 48..63
+ ; rearrange both result vectors with the control loaded into m7
+ pshufb m0, m0, m7
+ pshufb m1, m1, m7
+ ; low qword of m0 followed by low qword of m1 = output bytes 16..31 of this row
+ punpcklqdq m0, m1
+ movu [r0 + 16], m0 ; unaligned 16-byte store
+ ; advance: 32 bytes written per pass, two input rows consumed per pass
+ lea r0, [r0 + 32] ; output rows are packed back to back, 32 bytes apart; r2 is never applied to r0
+ lea r1, [r1 + 2 * r2] ; skip the two rows just read
+ dec r3d ; sets ZF for the jnz below (the two lea above leave flags untouched)
+ ; repeat until all 32 passes are done
+ jnz .loop
+
 RET
More information about the x265-devel
mailing list