[x265] [PATCH] asm : Optimisations in blockcopy_sp asm routines(2x4, 2x8, 6x8)
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Tue Feb 11 10:51:09 CET 2014
# HG changeset patch
# User Nabajit Deka
# Date 1392112254 -19800
# Tue Feb 11 15:20:54 2014 +0530
# Node ID e6e9310bc545a84fd30533fc7739912c55179d17
# Parent 07b5d6b82f5fbcb78ecab12cb8abcf13c78fe552
asm : Optimisations in blockcopy_sp asm routines(2x4, 2x8, 6x8)
diff -r 07b5d6b82f5f -r e6e9310bc545 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Mon Feb 10 15:05:04 2014 -0600
+++ b/source/common/x86/blockcopy8.asm Tue Feb 11 15:20:54 2014 +0530
@@ -1246,31 +1246,27 @@
; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_sp_2x4, 4, 5, 4, dest, destStride, src, srcStride
-
-add r3, r3
-
-movd m0, [r2]
-movd m1, [r2 + r3]
-movd m2, [r2 + 2 * r3]
-lea r2, [r2 + 2 * r3]
-movd m3, [r2 + r3]
-
-packuswb m0, m1
-packuswb m2, m3
-
-pextrw r4, m0, 0
-mov [r0], r4w
-
-pextrw r4, m0, 4
-mov [r0 + r1], r4w
-
-pextrw r4, m2, 0
+cglobal blockcopy_sp_2x4, 4, 5, 2
+
+add r3, r3
+
+;Row 0-1
+movd m0, [r2]
+movd m1, [r2 + r3]
+packuswb m0, m1
+movd r4d, m0
+mov [r0], r4w
+pextrw [r0 + r1], m0, 4
+
+;Row 2-3
+movd m0, [r2 + 2 * r3]
+lea r2, [r2 + 2 * r3]
+movd m1, [r2 + r3]
+packuswb m0, m1
+movd r4d, m0
mov [r0 + 2 * r1], r4w
-
-lea r0, [r0 + 2 * r1]
-pextrw r4, m2, 4
-mov [r0 + r1], r4w
+lea r0, [r0 + 2 * r1]
+pextrw [r0 + r1], m0, 4
RET
@@ -1279,53 +1275,47 @@
; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_sp_2x8, 4, 5, 8, dest, destStride, src, srcStride
-
-add r3, r3
-
-movd m0, [r2]
-movd m1, [r2 + r3]
-movd m2, [r2 + 2 * r3]
-lea r2, [r2 + 2 * r3]
-movd m3, [r2 + r3]
-movd m4, [r2 + 2 * r3]
-lea r2, [r2 + 2 * r3]
-movd m5, [r2 + r3]
-movd m6, [r2 + 2 * r3]
-lea r2, [r2 + 2 * r3]
-movd m7, [r2 + r3]
-
-packuswb m0, m1
-packuswb m2, m3
-packuswb m4, m5
-packuswb m6, m7
-
-pextrw r4, m0, 0
-mov [r0], r4w
-
-pextrw r4, m0, 4
-mov [r0 + r1], r4w
-
-pextrw r4, m2, 0
+cglobal blockcopy_sp_2x8, 4, 5, 2
+
+add r3, r3
+
+;Row 0-1
+movd m0, [r2]
+movd m1, [r2 + r3]
+packuswb m0, m1
+movd r4d, m0
+mov [r0], r4w
+pextrw [r0 + r1], m0, 4
+
+;Row 2-3
+movd m0, [r2 + 2 * r3]
+lea r2, [r2 + 2 * r3]
+movd m1, [r2 + r3]
+packuswb m0, m1
+movd r4d, m0
mov [r0 + 2 * r1], r4w
-
-lea r0, [r0 + 2 * r1]
-pextrw r4, m2, 4
-mov [r0 + r1], r4w
-
-pextrw r4, m4, 0
+lea r0, [r0 + 2 * r1]
+pextrw [r0 + r1], m0, 4
+
+;Row 4-5
+movd m0, [r2 + 2 * r3]
+lea r2, [r2 + 2 * r3]
+movd m1, [r2 + r3]
+packuswb m0, m1
+movd r4d, m0
mov [r0 + 2 * r1], r4w
-
-lea r0, [r0 + 2 * r1]
-pextrw r4, m4, 4
-mov [r0 + r1], r4w
-
-pextrw r4, m6, 0
+lea r0, [r0 + 2 * r1]
+pextrw [r0 + r1], m0, 4
+
+;Row 6-7
+movd m0, [r2 + 2 * r3]
+lea r2, [r2 + 2 * r3]
+movd m1, [r2 + r3]
+packuswb m0, m1
+movd r4d, m0
mov [r0 + 2 * r1], r4w
-
-lea r0, [r0 + 2 * r1]
-pextrw r4, m6, 4
-mov [r0 + r1], r4w
+lea r0, [r0 + 2 * r1]
+pextrw [r0 + r1], m0, 4
RET
@@ -1477,40 +1467,65 @@
;-----------------------------------------------------------------------------
; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
-%macro BLOCKCOPY_SP_W6_H4 2
INIT_XMM sse4
-cglobal blockcopy_sp_6x8, 4, 6, 2, dest, destStride, src, srcStride
-
-mov r4d, %2/2
-
-add r3, r3
-
-.loop
- movu m0, [r2]
- movu m1, [r2 + r3]
-
-
- packuswb m0, m1
-
- movd [r0], m0
- pextrw r5, m0, 2
- mov [r0 + 4], r5w
-
- pextrw r5, m0, 6
- pshufd m0, m0, 2
- movd [r0 + r1], m0
- mov [r0 + r1 + 4], r5w
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loop
-
-RET
-%endmacro
-
-BLOCKCOPY_SP_W6_H4 6, 8
+cglobal blockcopy_sp_6x8, 4, 4, 2
+
+ add r3, r3
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ packuswb m0, m1
+
+ movd [r0], m0
+ pextrw [r0 + 4], m0, 2
+
+ movhlps m0, m0
+ movd [r0 + r1], m0
+ pextrw [r0 + r1 + 4], m0, 2
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ packuswb m0, m1
+
+ movd [r0], m0
+ pextrw [r0 + 4], m0, 2
+
+ movhlps m0, m0
+ movd [r0 + r1], m0
+ pextrw [r0 + r1 + 4], m0, 2
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ packuswb m0, m1
+
+ movd [r0], m0
+ pextrw [r0 + 4], m0, 2
+
+ movhlps m0, m0
+ movd [r0 + r1], m0
+ pextrw [r0 + r1 + 4], m0, 2
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ packuswb m0, m1
+
+ movd [r0], m0
+ pextrw [r0 + 4], m0, 2
+
+ movhlps m0, m0
+ movd [r0 + r1], m0
+ pextrw [r0 + r1 + 4], m0, 2
+
+ RET
;-----------------------------------------------------------------------------
; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
More information about the x265-devel mailing list