[x265] [PATCH] asm: improve intra_pred_dc4_sse4 by merging the top and left filters to reduce code

Min Chen chenm003 at 163.com
Tue Mar 3 03:29:13 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1425349743 28800
# Node ID fc34a9f443e8537c7e6c653fc4823556c6044ae3
# Parent  64214b2faa324d91a015190b8dc69716ebab41f8
asm: improve intra_pred_dc4_sse4 by merging the top and left filters to reduce code
---
 source/common/x86/asm-primitives.cpp |    1 +
 source/common/x86/intrapred8.asm     |   21 ++++++++++-----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff -r 64214b2faa32 -r fc34a9f443e8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Mar 02 16:54:00 2015 -0800
+++ b/source/common/x86/asm-primitives.cpp	Mon Mar 02 18:29:03 2015 -0800
@@ -1349,6 +1349,7 @@
         p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
 
+        //p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2;
 #if X86_64
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
         ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
diff -r 64214b2faa32 -r fc34a9f443e8 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Mar 02 16:54:00 2015 -0800
+++ b/source/common/x86/intrapred8.asm	Mon Mar 02 18:29:03 2015 -0800
@@ -123,6 +123,7 @@
 cextern pw_32
 cextern pw_257
 cextern pw_1024
+cextern pw_4096
 cextern pb_unpackbd1
 cextern multiL
 cextern multiH
@@ -218,9 +219,7 @@
 
     test        r4d, r4d
 
-    mov         r4d, 4096
-    movd        m2, r4d
-    pmulhrsw    m1, m2              ; m1 = (sum + 4) / 8
+    pmulhrsw    m1, [pw_4096]       ; m1 = (sum + 4) / 8
     movd        r4d, m1             ; r4d = dc_val
     pshufb      m1, m0              ; m1 = byte [dc_val ...]
 
@@ -237,9 +236,13 @@
     add         r4d, r3d            ; r4d = DC * 3 + 2
     movd        m1, r4d
     pshuflw     m1, m1, 0           ; m1 = pixDCx3
+    pshufd      m1, m1, 0
 
     ; filter top
-    pmovzxbw    m2, [r2]
+    movd        m2, [r2]
+    movd        m0, [r2 + 9]
+    punpckldq   m2, m0
+    pmovzxbw    m2, m2
     paddw       m2, m1
     psraw       m2, 2
     packuswb    m2, m2
@@ -255,13 +258,9 @@
 
     ; filter left
     add         r0, r1
-    pmovzxbw    m2, [r2 + 9]
-    paddw       m2, m1
-    psraw       m2, 2
-    packuswb    m2, m2
-    pextrb      [r0], m2, 0
-    pextrb      [r0 + r1], m2, 1
-    pextrb      [r0 + r1 * 2], m2, 2
+    pextrb      [r0], m2, 4
+    pextrb      [r0 + r1], m2, 5
+    pextrb      [r0 + r1 * 2], m2, 6
 
 .end:
     RET



More information about the x265-devel mailing list