[x265] [PATCH] asm: improve intra_pred_dc4_sse4 by merging duplicated code
Min Chen
chenm003 at 163.com
Tue Mar 3 03:29:13 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1425349743 28800
# Node ID fc34a9f443e8537c7e6c653fc4823556c6044ae3
# Parent 64214b2faa324d91a015190b8dc69716ebab41f8
asm: improve intra_pred_dc4_sse4 by merging duplicated code
---
source/common/x86/asm-primitives.cpp | 1 +
source/common/x86/intrapred8.asm | 21 ++++++++++-----------
2 files changed, 11 insertions(+), 11 deletions(-)
diff -r 64214b2faa32 -r fc34a9f443e8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Mar 02 16:54:00 2015 -0800
+++ b/source/common/x86/asm-primitives.cpp Mon Mar 02 18:29:03 2015 -0800
@@ -1349,6 +1349,7 @@
p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
+ //p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2;
#if X86_64
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
diff -r 64214b2faa32 -r fc34a9f443e8 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Mon Mar 02 16:54:00 2015 -0800
+++ b/source/common/x86/intrapred8.asm Mon Mar 02 18:29:03 2015 -0800
@@ -123,6 +123,7 @@
cextern pw_32
cextern pw_257
cextern pw_1024
+cextern pw_4096
cextern pb_unpackbd1
cextern multiL
cextern multiH
@@ -218,9 +219,7 @@
test r4d, r4d
- mov r4d, 4096
- movd m2, r4d
- pmulhrsw m1, m2 ; m1 = (sum + 4) / 8
+ pmulhrsw m1, [pw_4096] ; m1 = (sum + 4) / 8
movd r4d, m1 ; r4d = dc_val
pshufb m1, m0 ; m1 = byte [dc_val ...]
@@ -237,9 +236,13 @@
add r4d, r3d ; r4d = DC * 3 + 2
movd m1, r4d
pshuflw m1, m1, 0 ; m1 = pixDCx3
+ pshufd m1, m1, 0
; filter top
- pmovzxbw m2, [r2]
+ movd m2, [r2]
+ movd m0, [r2 + 9]
+ punpckldq m2, m0
+ pmovzxbw m2, m2
paddw m2, m1
psraw m2, 2
packuswb m2, m2
@@ -255,13 +258,9 @@
; filter left
add r0, r1
- pmovzxbw m2, [r2 + 9]
- paddw m2, m1
- psraw m2, 2
- packuswb m2, m2
- pextrb [r0], m2, 0
- pextrb [r0 + r1], m2, 1
- pextrb [r0 + r1 * 2], m2, 2
+ pextrb [r0], m2, 4
+ pextrb [r0 + r1], m2, 5
+ pextrb [r0 + r1 * 2], m2, 6
.end:
RET
More information about the x265-devel mailing list