[x265] [PATCH] asm: apply new algorithm on upShift_8_sse4
Min Chen
chenm003 at 163.com
Tue Jul 21 02:56:25 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1437437934 25200
# Node ID 024500300a23bc490c66cd06d2edb1e772ed4321
# Parent b2ba7df1fc6992516cfce36fac1ff4fc6bac1a5e
asm: apply new algorithm on upShift_8_sse4
---
source/common/x86/pixel-a.asm | 90 +++++++++++-----------------------------
1 files changed, 25 insertions(+), 65 deletions(-)
diff -r b2ba7df1fc69 -r 024500300a23 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/x86/pixel-a.asm Mon Jul 20 17:18:54 2015 -0700
@@ -7302,7 +7302,7 @@
;---------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal upShift_8, 6,7,3
- movd m2, r6m ; m0 = shift
+ movd xm2, r6m
add r3d, r3d
dec r5d
@@ -7310,84 +7310,44 @@
xor r6, r6
.loopW:
pmovzxbw m0,[r0 + r6]
- pmovzxbw m1,[r0 + r6 + 8]
+ pmovzxbw m1,[r0 + r6 + mmsize/2]
psllw m0, m2
psllw m1, m2
movu [r2 + r6 * 2], m0
- movu [r2 + r6 * 2 + 16], m1
-
- add r6, 16
+ movu [r2 + r6 * 2 + mmsize], m1
+
+ add r6d, mmsize
cmp r6d, r4d
- jl .loopW
+ jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
- jnz .loopH
-
-;processing last row of every frame [To handle width which not a multiple of 16]
-
-.loop16:
- pmovzxbw m0,[r0]
- pmovzxbw m1,[r0 + 8]
- psllw m0, m2
- psllw m1, m2
- movu [r2], m0
- movu [r2 + 16], m1
-
- add r0, mmsize
- add r2, 2 * mmsize
- sub r4d, 16
- jz .end
- cmp r4d, 15
- jg .loop16
-
- cmp r4d, 8
- jl .process4
+ jg .loopH
+
+ ; processing last row of every frame [to handle a width which is not a multiple of 16]
+ mov r1d, (mmsize/2 - 1)
+ and r1d, r4d
+ sub r1, mmsize/2
+
+ ; NOTE: Width MUST BE more than or equal to 8
+ shr r4d, 3 ; log2(mmsize/2)
+.loopW8:
pmovzxbw m0,[r0]
psllw m0, m2
movu [r2], m0
-
- add r0, 8
+ add r0, mmsize/2
add r2, mmsize
- sub r4d, 8
- jz .end
-
-.process4:
- cmp r4d, 4
- jl .process2
- movd m0,[r0]
- pmovzxbw m0,m0
+ dec r4d
+ jg .loopW8
+
+ ; Mac OS X can't read beyond the array bound, so roll back a few bytes and redo the final mmsize/2 pixels of the row
+ pmovzxbw m0,[r0 + r1]
psllw m0, m2
- movh [r2], m0
-
- add r0, 4
- add r2, 8
- sub r4d, 4
- jz .end
-
-.process2:
- cmp r4d, 2
- jl .process1
- movzx r3d, byte [r0]
- shl r3d, 2
- mov [r2], r3w
- movzx r3d, byte [r0 + 1]
- shl r3d, 2
- mov [r2 + 2], r3w
-
- add r0, 2
- add r2, 4
- sub r4d, 2
- jz .end
-
-.process1:
- movzx r3d, byte [r0]
- shl r3d, 2
- mov [r2], r3w
-.end:
- RET
+ movu [r2 + r1 * 2], m0
+ RET
+
;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
@@ -7420,7 +7380,7 @@
jg .loopH
; processing last row of every frame [To handle width which not a multiple of 32]
- mov r1d, 15
+ mov r1d, (mmsize/2 - 1)
and r1d, r4d
sub r1, mmsize/2
More information about the x265-devel
mailing list