[x265] [PATCH] asm: improve on intra_dc32
Min Chen
chenm003 at 163.com
Sat Mar 7 02:07:18 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1425690429 28800
# Node ID 63d132c844b9d299081b40e7589275b78fe71093
# Parent 043c2418864b0a3ada6f597e6def6ead73d90b5f
asm: improve on intra_dc32
---
source/common/x86/intrapred8.asm | 71 ++++++++++++--------------------------
1 files changed, 22 insertions(+), 49 deletions(-)
diff -r 043c2418864b -r 63d132c844b9 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Fri Mar 06 13:15:55 2015 -0600
+++ b/source/common/x86/intrapred8.asm Fri Mar 06 17:07:09 2015 -0800
@@ -524,15 +524,21 @@
pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...]
pshufd m1, m1, 0x00
+ lea r2, [r0 + r1 * 2]
%assign x 0
-%rep 16
+%rep 8
; store DC 16x16
movu [r0], m1
+ movu [r0 + 16], m1
movu [r0 + r1], m1
- movu [r0 + 16], m1
movu [r0 + r1 + 16], m1
-%if x < 16
- lea r0, [r0 + 2 * r1]
+ movu [r2], m1
+ movu [r2 + 16], m1
+ movu [r2 + r1], m1
+ movu [r2 + r1 + 16], m1
+%if x < 8
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r1]
%endif
%assign x x+1
%endrep
@@ -996,14 +1002,13 @@
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_dc32, 3, 5, 5
- lea r3, [r2 + 65]
+cglobal intra_pred_dc32, 3,3,5
inc r2
pxor m0, m0
movu m1, [r2]
movu m2, [r2 + 16]
- movu m3, [r3]
- movu m4, [r3 + 16]
+ movu m3, [r2 + 64]
+ movu m4, [r2 + 64 + 16]
psadbw m1, m0
psadbw m2, m0
psadbw m3, m0
@@ -1014,54 +1019,22 @@
pshufd m2, m1, 2
paddw m1, m2
- movd r4d, m1
- add r4d, 32
- shr r4d, 6 ; sum = sum / 64
- movd m1, r4d
- pshufb m1, m0 ; m1 = byte [dc_val ...]
-
-%rep 2
+ paddw m1, [pw_32] ; sum = (sum + 32) / 64
+ psrlw m1, 6
+ pshufb m1, m0 ; m1 = byte [dc_val ...]
+
+ ; store DC 32x32: 16 iterations, each writing 2 rows of 32 bytes
+%assign x 0
+%rep 16
; store DC 16x16
movu [r0], m1
+ movu [r0 + 16], m1
movu [r0 + r1], m1
- movu [r0 + 16], m1
movu [r0 + r1 + 16],m1
+ %if (x < 16)
lea r0, [r0 + 2 * r1]
- movu [r0], m1
- movu [r0 + r1], m1
- movu [r0 + 16], m1
- movu [r0 + r1 + 16],m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m1
- movu [r0 + r1], m1
- movu [r0 + 16], m1
- movu [r0 + r1 + 16],m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m1
- movu [r0 + r1], m1
- movu [r0 + 16], m1
- movu [r0 + r1 + 16],m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m1
- movu [r0 + r1], m1
- movu [r0 + 16], m1
- movu [r0 + r1 + 16],m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m1
- movu [r0 + r1], m1
- movu [r0 + 16], m1
- movu [r0 + r1 + 16],m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m1
- movu [r0 + r1], m1
- movu [r0 + 16], m1
- movu [r0 + r1 + 16],m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m1
- movu [r0 + r1], m1
- movu [r0 + 16], m1
- movu [r0 + r1 + 16],m1
- lea r0, [r0 + 2 * r1]
+ %endif
+%assign x x+1
%endrep
RET
More information about the x265-devel
mailing list