[x265] [PATCH] asm:intra pred dc16 sse2

dtyx265 at gmail.com dtyx265 at gmail.com
Mon Mar 2 23:48:13 CET 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1425335534 28800
# Node ID a89ced7112fdff2c1a586f29f6bfbf584ec2f697
# Parent  ff2b3397092367123e936957cd57a9dd2985b03f
asm:intra pred dc16 sse2

This replaces the C primitive for processors that have SSE2 through SSSE3 but not SSE4.
The code is backported from the high bit depth intra_pred_dc16 SSE4 version.
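
For reference, a minimal C sketch of what the primitive computes (this is not the
actual x265 C code; it assumes the neighbour layout the asm reads from, i.e. the 16
above pixels at srcPix[1..16] and the 16 left pixels at srcPix[33..48], 8-bit pixels;
the name intra_pred_dc16_ref is only for illustration):

#include <stdint.h>

static void intra_pred_dc16_ref(uint8_t* dst, intptr_t dstStride,
                                const uint8_t* srcPix, int filter)
{
    int sum = 0;
    for (int i = 0; i < 16; i++)
        sum += srcPix[1 + i] + srcPix[33 + i];   /* 16 above + 16 left neighbours */

    int dc = (sum + 16) >> 5;                    /* rounded average */

    /* fill the 16x16 block with the DC value */
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            dst[y * dstStride + x] = (uint8_t)dc;

    if (filter)
    {
        /* smooth the first row and first column against the neighbours */
        dst[0] = (uint8_t)((srcPix[1] + srcPix[33] + 2 * dc + 2) >> 2);
        for (int x = 1; x < 16; x++)
            dst[x] = (uint8_t)((srcPix[1 + x] + 3 * dc + 2) >> 2);
        for (int y = 1; y < 16; y++)
            dst[y * dstStride] = (uint8_t)((srcPix[33 + y] + 3 * dc + 2) >> 2);
    }
}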

64-bit

./test/TestBench --testbench intrapred | grep 16x16
intra_dc_16x16[f=0]	2.43x 	 580.09   	 1412.50
intra_dc_16x16[f=1]	2.36x 	 1017.66  	 2400.02

32-bit

./test/TestBench --testbench intrapred | grep 16x16
intra_dc_16x16[f=0]	3.58x 	 754.99   	 2705.04
intra_dc_16x16[f=1]	3.00x 	 1230.08  	 3687.46

diff -r ff2b33970923 -r a89ced7112fd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 27 14:56:56 2015 -0800
+++ b/source/common/x86/asm-primitives.cpp	Mon Mar 02 14:32:14 2015 -0800
@@ -1206,6 +1206,7 @@
 
         p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2;
         p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2;
+        p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_sse2;
 
         p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
         p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
diff -r ff2b33970923 -r a89ced7112fd source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Fri Feb 27 14:56:56 2015 -0800
+++ b/source/common/x86/intrapred.h	Mon Mar 02 14:32:14 2015 -0800
@@ -28,6 +28,7 @@
 
 void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
+void x265_intra_pred_dc16_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
diff -r ff2b33970923 -r a89ced7112fd source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Feb 27 14:56:56 2015 -0800
+++ b/source/common/x86/intrapred8.asm	Mon Mar 02 14:32:14 2015 -0800
@@ -291,6 +291,206 @@
 .end:
     RET
 
+;--------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;--------------------------------------------------------------------------------------------
+INIT_XMM sse2
+%if ARCH_X86_64
+cglobal intra_pred_dc16, 5, 10, 4
+%else
+cglobal intra_pred_dc16, 5, 7, 4
+%endif
+    pxor            m0,            m0
+    movu            m1,            [r2 + 1]       ; 16 above neighbours
+    movu            m2,            [r2 + 33]      ; 16 left neighbours
+    psadbw          m1,            m0             ; SAD against zero = sums of the bytes
+    psadbw          m2,            m0
+    paddw           m1,            m2
+    pshufd          m2,            m1, 2
+    paddw           m1,            m2             ; m1 = sum of all 32 neighbours
+
+    paddw           m1,            [pw_16]
+    psraw           m1,            5              ; dc_val = (sum + 16) >> 5
+    pmullw          m1,            [pw_257]       ; dc_val * 257 = dc_val in both bytes of each word
+    pshuflw         m1,            m1, 0x00       ; m1 = byte [dc_val ...]
+    pshufd          m1,            m1, 0x00
+
+
+    test            r4d,           r4d            ; bFilter flag, tested by the jz after the stores
+
+    ; store DC 16x16
+%if ARCH_X86_64
+    lea             r6,            [r1 + r1 * 2]        ;index 3
+    lea             r7,            [r1 + r1 * 4]        ;index 5
+    lea             r8,            [r6 + r1 * 4]        ;index 7
+    lea             r9,            [r0 + r8]            ;base + 7
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    movu            [r0 + r1 * 2], m1
+    movu            [r0 + r6],     m1
+    movu            [r0 + r1 * 4], m1
+    movu            [r0 + r7],     m1
+    movu            [r0 + r6 * 2], m1
+    movu            [r0 + r8],     m1
+    movu            [r0 + r1 * 8], m1
+    movu            [r9 + r1 * 2], m1
+    movu            [r0 + r7 * 2], m1
+    movu            [r9 + r1 * 4], m1
+    movu            [r0 + r6 * 4], m1
+    movu            [r9 + r6 * 2], m1
+    movu            [r0 + r8 * 2], m1
+    movu            [r9 + r1 * 8], m1
+%else ;32 bit
+    mov             r6,            r0
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+%endif
+    ; Do DC Filter
+    jz              .end
+    psrlw           m1,            8              ; m1 = dc_val in each word
+    mova            m2,            [pw_2]
+    pmullw          m2,            m1
+    paddw           m2,            [pw_2]         ; m2 = dc_val * 2 + 2
+    movd            r4d,           m2             ; low word = dc_val * 2 + 2 (for the top-left filter)
+    paddw           m1,            m2             ; m1 = dc_val * 3 + 2
+
+    ; filter top
+    movh            m2,            [r2 + 1]
+    punpcklbw       m2,            m0
+    paddw           m2,            m1
+    psraw           m2,            2
+    packuswb        m2,            m2
+    movh            m3,            [r2 + 9]
+    punpcklbw       m3,            m0
+    paddw           m3,            m1
+    psraw           m3,            2
+    packuswb        m3,            m3
+
+    ; filter top-left
+    movzx           r5d, byte      [r2 + 33]
+    add             r4d,           r5d
+    movzx           r3d, byte      [r2 + 1]
+    add             r3d,           r4d
+    shr             r3d,           2
+
+%if ARCH_X86_64
+    movh            [r0],          m2
+    movh            [r0 + 8],      m3
+    mov             [r0],          r3b
+%else ;32 bit
+    movh            [r6],          m2
+    movh            [r6 + 8],      m3
+    mov             [r6],          r3b
+    add             r6,            r1
+%endif
+
+    ; filter left
+    movh            m2,            [r2 + 34]
+    punpcklbw       m2,            m0
+    paddw           m2,            m1
+    psraw           m2,            2
+    packuswb        m2,            m2
+
+    movh            m3,            [r2 + 42]
+    punpcklbw       m3,            m0
+    paddw           m3,            m1
+    psraw           m3,            2
+    packuswb        m3,            m3
+%if ARCH_X86_64
+    movh            r3,            m2
+    mov             [r0 + r1],     r3b
+    shr             r3,            8
+    mov             [r0 + r1 * 2], r3b
+    shr             r3,            8
+    mov             [r0 + r6],     r3b
+    shr             r3,            8
+    mov             [r0 + r1 * 4], r3b
+    shr             r3,            8
+    mov             [r0 + r7],     r3b
+    shr             r3,            8
+    mov             [r0 + r6 * 2], r3b
+    shr             r3,            8
+    mov             [r0 + r8],     r3b
+    shr             r3,            8
+    mov             [r0 + r1 * 8], r3b
+    movh            r3,            m3
+    mov             [r9 + r1 * 2], r3b
+    shr             r3,            8
+    mov             [r0 + r7 * 2], r3b
+    shr             r3,            8
+    mov             [r9 + r1 * 4], r3b
+    shr             r3,            8
+    mov             [r0 + r6 * 4], r3b
+    shr             r3,            8
+    mov             [r9 + r6 * 2], r3b
+    shr             r3,            8
+    mov             [r0 + r8 * 2], r3b
+    shr             r3,            8
+    mov             [r9 + r1 * 8], r3b
+%else ;32 bit
+    movd            r2d,           m2
+    pshufd          m2,            m2, 0x01
+    mov             [r6],          r2b
+    shr             r2,            8
+    mov             [r6 + r1],     r2b
+    shr             r2,            8
+    mov             [r6 + r1 * 2], r2b
+    lea             r6,            [r6 + r1 * 2]
+    shr             r2,            8
+    mov             [r6 + r1],     r2b
+    movd            r2d,           m2
+    mov             [r6 + r1 * 2], r2b
+    lea             r6,            [r6 + r1 * 2]
+    shr             r2,            8
+    mov             [r6 + r1],     r2b
+    shr             r2,            8
+    mov             [r6 + r1 * 2], r2b
+    lea             r6,            [r6 + r1 * 2]
+    shr             r2,            8
+    mov             [r6 + r1],     r2b
+    movd            r2d,           m3
+    pshufd          m3,            m3, 0x01
+    mov             [r6 + r1 * 2], r2b
+    lea             r6,            [r6 + r1 * 2]
+    shr             r2,            8
+    mov             [r6 + r1],     r2b
+    shr             r2,            8
+    mov             [r6 + r1 * 2], r2b
+    lea             r6,            [r6 + r1 * 2]
+    shr             r2,            8
+    mov             [r6 + r1],     r2b
+    movd            r2d,           m3
+    mov             [r6 + r1 * 2], r2b
+    lea             r6,            [r6 + r1 * 2]
+    shr             r2,            8
+    mov             [r6 + r1],     r2b
+    shr             r2,            8
+    mov             [r6 + r1 * 2], r2b
+%endif
+.end:
+    RET
+
 ;---------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
 ;---------------------------------------------------------------------------------------------

