[x265] [PATCH 4 of 9] asm:intra pred dc32 sse2

dtyx265 at gmail.com dtyx265 at gmail.com
Fri Mar 6 01:19:57 CET 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1425594719 28800
# Node ID 912c42dcb4d9b399515e6c1ed6be70db3bf5f675
# Parent  c5fa433ffda0a95889e99f4df787f3edc5880d0f
asm:intra pred dc32 sse2

This replaces the C code path on processors that support SSE2 up to SSSE3.
The code is backported from intrapred dc32 sse4

64-bit

./test/TestBench --testbench intrapred | grep intra_dc_32x32
intra_dc_32x32[f=0]	4.53x 	 1650.00  	 7474.94

32-bit

./test/TestBench --testbench intrapred | grep intra_dc_32x32
intra_dc_32x32[f=0]	7.79x 	 1749.94  	 13627.45

diff -r c5fa433ffda0 -r 912c42dcb4d9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Mar 05 13:54:48 2015 -0800
+++ b/source/common/x86/asm-primitives.cpp	Thu Mar 05 14:31:59 2015 -0800
@@ -1213,6 +1213,7 @@
         p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2;
         p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2;
         p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_sse2;
+        p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2;
 
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = x265_intra_pred_planar4_sse2;
 
diff -r c5fa433ffda0 -r 912c42dcb4d9 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Thu Mar 05 13:54:48 2015 -0800
+++ b/source/common/x86/intrapred.h	Thu Mar 05 14:31:59 2015 -0800
@@ -29,6 +29,7 @@
 void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 void x265_intra_pred_dc16_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
+void x265_intra_pred_dc32_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
diff -r c5fa433ffda0 -r 912c42dcb4d9 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Thu Mar 05 13:54:48 2015 -0800
+++ b/source/common/x86/intrapred8.asm	Thu Mar 05 14:31:59 2015 -0800
@@ -495,6 +495,46 @@
 ;---------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
 ;---------------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal intra_pred_dc32, 3, 3, 5
+    pxor            m0,            m0             ; m0 = 0, second operand for the psadbw byte sums
+    movu            m1,            [r2 + 1]       ; bytes [r2 + 1  .. r2 + 16]
+    movu            m2,            [r2 + 17]      ; bytes [r2 + 17 .. r2 + 32]
+    movu            m3,            [r2 + 65]      ; bytes [r2 + 65 .. r2 + 80]
+    movu            m4,            [r2 + 81]      ; bytes [r2 + 81 .. r2 + 96]
+    psadbw          m1,            m0             ; |b - 0| summed per 8 bytes: one sum in each 64-bit half
+    psadbw          m2,            m0
+    psadbw          m3,            m0
+    psadbw          m4,            m0
+    paddw           m1,            m2             ; combine the four vectors of partial sums
+    paddw           m3,            m4
+    paddw           m1,            m3
+    pshufd          m2,            m1, 2          ; dword 0 of m2 = dword 2 of m1 (upper-half sum)
+    paddw           m1,            m2             ; word 0 of m1 = sum of all 64 loaded bytes
+    ; dc_val = (sum + 32) >> 6, i.e. the rounded mean of the 64 bytes (assumes pw_32 holds words of 32)
+    paddw           m1,            [pw_32]
+    psraw           m1,            6
+    pmullw          m1,            [pw_257]       ; assuming words of 257: dc_val lands in both bytes of word 0
+    pshuflw         m1,            m1, 0x00       ; m1 = byte [dc_val ...]
+    pshufd          m1,            m1, 0x00       ; replicate dword 0 across the whole register
+    ; fill the block: 16 iterations x 2 rows x 32 bytes per row
+%assign x 0
+%rep 16
+    ; store two 32-byte rows of DC; the pointer advance is skipped after the last pair
+    movu            [r0],               m1        ; row 2x,     bytes  0..15
+    movu            [r0 + r1],          m1        ; row 2x + 1, bytes  0..15
+    movu            [r0 + 16],          m1        ; row 2x,     bytes 16..31
+    movu            [r0 + r1 + 16],     m1        ; row 2x + 1, bytes 16..31
+%if x < 15
+    lea             r0,            [r0 + 2 * r1]
+%endif
+%assign x x+1
+%endrep
+    RET
+
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal intra_pred_dc4, 5,5,3
     inc         r2


More information about the x265-devel mailing list