[x265] [PATCH] asm: dct8 sse2 1.88x improvement over C code

dtyx265 at gmail.com dtyx265 at gmail.com
Thu Feb 19 23:45:47 CET 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1424385856 28800
# Node ID 28287b57013e9c43488bfba1570ded5cfb4af16d
# Parent  039ea966d5ebccab1de2c3766fb7b4f125d2020a
asm: dct8 sse2 1.88x improvement over C code

This is backported from the dct8 sse4 implementation.
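
For reference, the new SSE2 routine computes the same result as the scalar primitive it replaces: the standard HEVC 8-point partial butterfly, applied once over rows and once over columns, with the shift and rounding constants shown in the asm below. A simplified sketch follows (partialButterfly8 and dct8_ref are illustrative names, not the exact x265 dct.cpp code):

    #include <cstdint>

    static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int srcStride)
    {
        const int add = 1 << (shift - 1);
        for (int j = 0; j < 8; j++, src += srcStride, dst++)
        {
            int E[4], O[4];
            for (int k = 0; k < 4; k++)
            {
                E[k] = src[k] + src[7 - k];   // even part
                O[k] = src[k] - src[7 - k];   // odd part
            }
            int EE0 = E[0] + E[3], EO0 = E[0] - E[3];
            int EE1 = E[1] + E[2], EO1 = E[1] - E[2];

            // output is written transposed, so the second pass works on rows again
            dst[0 * 8] = (int16_t)((64 * EE0 + 64 * EE1 + add) >> shift);
            dst[4 * 8] = (int16_t)((64 * EE0 - 64 * EE1 + add) >> shift);
            dst[2 * 8] = (int16_t)((83 * EO0 + 36 * EO1 + add) >> shift);
            dst[6 * 8] = (int16_t)((36 * EO0 - 83 * EO1 + add) >> shift);

            dst[1 * 8] = (int16_t)((89 * O[0] + 75 * O[1] + 50 * O[2] + 18 * O[3] + add) >> shift);
            dst[3 * 8] = (int16_t)((75 * O[0] - 18 * O[1] - 89 * O[2] - 50 * O[3] + add) >> shift);
            dst[5 * 8] = (int16_t)((50 * O[0] - 89 * O[1] + 18 * O[2] + 75 * O[3] + add) >> shift);
            dst[7 * 8] = (int16_t)((18 * O[0] - 50 * O[1] + 75 * O[2] - 89 * O[3] + add) >> shift);
        }
    }

    static void dct8_ref(const int16_t* src, int16_t* dst, intptr_t srcStride, int bitDepth)
    {
        int16_t tmp[8 * 8];
        partialButterfly8(src, tmp, bitDepth - 6, (int)srcStride); // shift1 = log2(8) - 1 + bitDepth - 8
        partialButterfly8(tmp, dst, 9, 8);                         // shift2 = log2(8) + 6
    }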

diff -r 039ea966d5eb -r 28287b57013e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 18 19:04:02 2015 -0600
+++ b/source/common/x86/asm-primitives.cpp	Thu Feb 19 14:44:16 2015 -0800
@@ -872,6 +872,7 @@
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_32x64_sse2;
 
         p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
+        p.cu[BLOCK_8x8].dct = x265_dct8_sse2;
         p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
 #if X86_64
         p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
@@ -1080,6 +1081,7 @@
         p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
 
         p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
+        p.cu[BLOCK_8x8].dct = x265_dct8_sse2;
         p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
 #if X86_64
         p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
diff -r 039ea966d5eb -r 28287b57013e source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Feb 18 19:04:02 2015 -0600
+++ b/source/common/x86/dct8.asm	Thu Feb 19 14:44:16 2015 -0800
@@ -748,6 +748,368 @@
     movhps      [r1 + r2], m1
     RET
 
+;-------------------------------------------------------
+; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
+;-------------------------------------------------------
+INIT_XMM sse2
+cglobal dct8, 3,6,8,0-16*mmsize
+    ;------------------------
+    ; Stack Mapping(dword)
+    ;------------------------
+    ; Row0[0-3] Row1[0-3]
+    ; ...
+    ; Row6[0-3] Row7[0-3]
+; Row0[4-7] Row1[4-7]
+    ; ...
+    ; Row6[4-7] Row7[4-7]
+    ;------------------------
+%if BIT_DEPTH == 10
+  %define       DCT_SHIFT1 4
+  %define       DCT_ADD1 [pd_8]
+%elif BIT_DEPTH == 8
+  %define       DCT_SHIFT1 2
+  %define       DCT_ADD1 [pd_2]
+%else
+  %error Unsupported BIT_DEPTH!
+%endif
+%define         DCT_ADD2 [pd_256]
+%define         DCT_SHIFT2 9
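+; Note: DCT_SHIFT1 follows the HEVC first-pass rule log2(8) - 1 + BIT_DEPTH - 8
+; (2 for 8-bit, 4 for 10-bit), with DCT_ADD1 = 1 << (DCT_SHIFT1 - 1) for rounding.
+; The second pass always uses DCT_SHIFT2 = log2(8) + 6 = 9 and DCT_ADD2 = 256.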
+
+    add         r2, r2
+    lea         r3, [r2 * 3]
+    mov         r5, rsp
+%assign x 0
+%rep 2
+    movu        m0, [r0]
+    movu        m1, [r0 + r2]
+    movu        m2, [r0 + r2 * 2]
+    movu        m3, [r0 + r3]
+
+    punpcklwd   m4, m0, m1
+    punpckhwd   m0, m1
+    punpcklwd   m5, m2, m3
+    punpckhwd   m2, m3
+    punpckldq   m1, m4, m5          ; m1 = [1 0]
+    punpckhdq   m4, m5              ; m4 = [3 2]
+    punpckldq   m3, m0, m2
+    punpckhdq   m0, m2
+    pshufd      m2, m3, 0x4E        ; m2 = [4 5]
+    pshufd      m0, m0, 0x4E        ; m0 = [6 7]
+
+    paddw       m3, m1, m0
+    psubw       m1, m0              ; m1 = [d1 d0]
+    paddw       m0, m4, m2
+    psubw       m4, m2              ; m4 = [d3 d2]
+    punpcklqdq  m2, m3, m0          ; m2 = [s2 s0]
+    punpckhqdq  m3, m0
+    pshufd      m3, m3, 0x4E        ; m3 = [s1 s3]
+
+    punpcklwd   m0, m1, m4          ; m0 = [d2/d0]
+    punpckhwd   m1, m4              ; m1 = [d3/d1]
+    punpckldq   m4, m0, m1          ; m4 = [d3 d1 d2 d0]
+    punpckhdq   m0, m1              ; m0 = [d3 d1 d2 d0]
+
+    ; odd
+    lea         r4, [tab_dct8_1]
+    pmaddwd     m1, m4, [r4 + 0*16]
+    pmaddwd     m5, m0, [r4 + 0*16]
+    pshufd      m1, m1, 0xD8
+    pshufd      m5, m5, 0xD8
+    mova        m7, m1
+    punpckhqdq  m7, m5
+    punpcklqdq  m1, m5
+    paddd       m1, m7
+    paddd       m1, DCT_ADD1
+    psrad       m1, DCT_SHIFT1
+  %if x == 1
+    pshufd      m1, m1, 0x1B
+  %endif
+    mova        [r5 + 1*2*mmsize], m1 ; Row 1
+
+    pmaddwd     m1, m4, [r4 + 1*16]
+    pmaddwd     m5, m0, [r4 + 1*16]
+    pshufd      m1, m1, 0xD8
+    pshufd      m5, m5, 0xD8
+    mova        m7, m1
+    punpckhqdq  m7, m5
+    punpcklqdq  m1, m5
+    paddd       m1, m7
+    paddd       m1, DCT_ADD1
+    psrad       m1, DCT_SHIFT1
+  %if x == 1
+    pshufd      m1, m1, 0x1B
+  %endif
+    mova        [r5 + 3*2*mmsize], m1 ; Row 3
+
+    pmaddwd     m1, m4, [r4 + 2*16]
+    pmaddwd     m5, m0, [r4 + 2*16]
+    pshufd      m1, m1, 0xD8
+    pshufd      m5, m5, 0xD8
+    mova        m7, m1
+    punpckhqdq  m7, m5
+    punpcklqdq  m1, m5
+    paddd       m1, m7
+    paddd       m1, DCT_ADD1
+    psrad       m1, DCT_SHIFT1
+  %if x == 1
+    pshufd      m1, m1, 0x1B
+  %endif
+    mova        [r5 + 5*2*mmsize], m1 ; Row 5
+
+    pmaddwd     m4, [r4 + 3*16]
+    pmaddwd     m0, [r4 + 3*16]
+    pshufd      m4, m4, 0xD8
+    pshufd      m0, m0, 0xD8
+    mova        m7, m4
+    punpckhqdq  m7, m0
+    punpcklqdq  m4, m0
+    paddd       m4, m7
+    paddd       m4, DCT_ADD1
+    psrad       m4, DCT_SHIFT1
+  %if x == 1
+    pshufd      m4, m4, 0x1B
+  %endif
+    mova        [r5 + 7*2*mmsize], m4 ; Row 7
+
+    ; even
+    lea         r4, [tab_dct4]
+    paddw       m0, m2, m3          ; m0 = [EE1 EE0]
+    pshufd      m0, m0, 0xD8
+    pshuflw     m0, m0, 0xD8
+    pshufhw     m0, m0, 0xD8
+    psubw       m2, m3              ; m2 = [EO1 EO0]
+    pmullw      m2, [pw_ppppmmmm]
+    pshufd      m2, m2, 0xD8
+    pshuflw     m2, m2, 0xD8
+    pshufhw     m2, m2, 0xD8
+    pmaddwd     m3, m0, [r4 + 0*16]
+    paddd       m3, DCT_ADD1
+    psrad       m3, DCT_SHIFT1
+  %if x == 1
+    pshufd      m3, m3, 0x1B
+  %endif
+    mova        [r5 + 0*2*mmsize], m3 ; Row 0
+    pmaddwd     m0, [r4 + 2*16]
+    paddd       m0, DCT_ADD1
+    psrad       m0, DCT_SHIFT1
+  %if x == 1
+    pshufd      m0, m0, 0x1B
+  %endif
+    mova        [r5 + 4*2*mmsize], m0 ; Row 4
+    pmaddwd     m3, m2, [r4 + 1*16]
+    paddd       m3, DCT_ADD1
+    psrad       m3, DCT_SHIFT1
+  %if x == 1
+    pshufd      m3, m3, 0x1B
+  %endif
+    mova        [r5 + 2*2*mmsize], m3 ; Row 2
+    pmaddwd     m2, [r4 + 3*16]
+    paddd       m2, DCT_ADD1
+    psrad       m2, DCT_SHIFT1
+  %if x == 1
+    pshufd      m2, m2, 0x1B
+  %endif
+    mova        [r5 + 6*2*mmsize], m2 ; Row 6
+
+  %if x != 1
+    lea         r0, [r0 + r2 * 4]
+    add         r5, mmsize
+  %endif
+%assign x x+1
+%endrep
+
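+    ; Second pass: reload the 32-bit first-pass results from the stack and
+    ; transform them column-wise.  SSE2 has no packed 32-bit multiply
+    ; (pmulld is SSE4.1), so each product is built from two pmuludq (even
+    ; dwords, plus the odd dwords gathered via pshufd 0xF5) and the low
+    ; dwords are kept with pshufd 0x88 / punpckldq; the low 32 bits of an
+    ; unsigned multiply match the signed result, so pmuludq is safe here.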
+    mov         r0, rsp                 ; r0 = pointer to Low Part
+    lea         r4, [tab_dct8_2]
+
+%assign x 0
+%rep 4
+    mova        m0, [r0 + 0*2*mmsize]     ; [3 2 1 0]
+    mova        m1, [r0 + 1*2*mmsize]
+    paddd       m2, m0, [r0 + (0*2+1)*mmsize]
+    pshufd      m2, m2, 0x9C            ; m2 = [s2 s1 s3 s0]
+    paddd       m3, m1, [r0 + (1*2+1)*mmsize]
+    pshufd      m3, m3, 0x9C            ; m3 = ^^
+    psubd       m0, [r0 + (0*2+1)*mmsize]     ; m0 = [d3 d2 d1 d0]
+    psubd       m1, [r0 + (1*2+1)*mmsize]     ; m1 = ^^
+
+    ; even
+    pshufd      m4, m2, 0xD8
+    pshufd      m3, m3, 0xD8
+    mova        m7, m4
+    punpckhqdq  m7, m3
+    punpcklqdq  m4, m3
+    mova        m2, m4
+    paddd       m4, m7                  ; m4 = [EE1 EE0 EE1 EE0]
+    psubd       m2, m7                  ; m2 = [EO1 EO0 EO1 EO0]
+
+    pslld       m4, 6                   ; m4 = [64*EE1 64*EE0]
+    mova        m5, m2
+    pmuludq     m5, [r4 + 0*16]
+    pshufd      m7, m2, 0xF5
+    movu        m6, [r4 + 0*16 + 4]
+    pmuludq     m7, m6
+    pshufd      m5, m5, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m5, m7                  ; m5 = [36*EO1 83*EO0]
+    pshufd      m7, m2, 0xF5
+    pmuludq     m2, [r4 + 1*16]
+    movu        m6, [r4 + 1*16 + 4]
+    pmuludq     m7, m6
+    pshufd      m2, m2, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m2, m7                  ; m2 = [83*EO1 36*EO0]
+
+    pshufd      m3, m4, 0xD8
+    pshufd      m5, m5, 0xD8
+    mova        m7, m3
+    punpckhqdq  m7, m5
+    punpcklqdq  m3, m5
+    paddd       m3, m7                  ; m3 = [Row2 Row0]
+    paddd       m3, DCT_ADD2
+    psrad       m3, DCT_SHIFT2
+    pshufd      m4, m4, 0xD8
+    pshufd      m2, m2, 0xD8
+    mova        m7, m4
+    punpckhqdq  m7, m2
+    punpcklqdq  m4, m2
+    psubd       m4, m7                  ; m4 = [Row6 Row4]
+    paddd       m4, DCT_ADD2
+    psrad       m4, DCT_SHIFT2
+
+    packssdw    m3, m3
+    movd        [r1 + 0*mmsize], m3
+    pshufd      m3, m3, 1
+    movd        [r1 + 2*mmsize], m3
+
+    packssdw    m4, m4
+    movd        [r1 + 4*mmsize], m4
+    pshufd      m4, m4, 1
+    movd        [r1 + 6*mmsize], m4
+
+    ; odd
+    mova        m2, m0
+    pmuludq     m2, [r4 + 2*16]
+    pshufd      m7, m0, 0xF5
+    movu        m6, [r4 + 2*16 + 4]
+    pmuludq     m7, m6
+    pshufd      m2, m2, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m2, m7
+    mova        m3, m1
+    pmuludq     m3, [r4 + 2*16]
+    pshufd      m7, m1, 0xF5
+    pmuludq     m7, m6
+    pshufd      m3, m3, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m3, m7
+    mova        m4, m0
+    pmuludq     m4, [r4 + 3*16]
+    pshufd      m7, m0, 0xF5
+    movu        m6, [r4 + 3*16 + 4]
+    pmuludq     m7, m6
+    pshufd      m4, m4, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m4, m7
+    mova        m5, m1
+    pmuludq     m5, [r4 + 3*16]
+    pshufd      m7, m1, 0xF5
+    pmuludq     m7, m6
+    pshufd      m5, m5, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m5, m7
+    pshufd      m2, m2, 0xD8
+    pshufd      m3, m3, 0xD8
+    mova        m7, m2
+    punpckhqdq  m7, m3
+    punpcklqdq  m2, m3
+    paddd       m2, m7
+    pshufd      m4, m4, 0xD8
+    pshufd      m5, m5, 0xD8
+    mova        m7, m4
+    punpckhqdq  m7, m5
+    punpcklqdq  m4, m5
+    paddd       m4, m7
+    pshufd      m2, m2, 0xD8
+    pshufd      m4, m4, 0xD8
+    mova        m7, m2
+    punpckhqdq  m7, m4
+    punpcklqdq  m2, m4
+    paddd       m2, m7                  ; m2 = [Row3 Row1]
+    paddd       m2, DCT_ADD2
+    psrad       m2, DCT_SHIFT2
+
+    packssdw    m2, m2
+    movd        [r1 + 1*mmsize], m2
+    pshufd      m2, m2, 1
+    movd        [r1 + 3*mmsize], m2
+
+    mova        m2, m0
+    pmuludq     m2, [r4 + 4*16]
+    pshufd      m7, m0, 0xF5
+    movu        m6, [r4 + 4*16 + 4]
+    pmuludq     m7, m6
+    pshufd      m2, m2, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m2, m7
+    mova        m3, m1
+    pmuludq     m3, [r4 + 4*16]
+    pshufd      m7, m1, 0xF5
+    pmuludq     m7, m6
+    pshufd      m3, m3, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m3, m7
+    mova        m4, m0
+    pmuludq     m4, [r4 + 5*16]
+    pshufd      m7, m0, 0xF5
+    movu        m6, [r4 + 5*16 + 4]
+    pmuludq     m7, m6
+    pshufd      m4, m4, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m4, m7
+    mova        m5, m1
+    pmuludq     m5, [r4 + 5*16]
+    pshufd      m7, m1, 0xF5
+    pmuludq     m7, m6
+    pshufd      m5, m5, 0x88
+    pshufd      m7, m7, 0x88
+    punpckldq   m5, m7
+    pshufd      m2, m2, 0xD8
+    pshufd      m3, m3, 0xD8
+    mova        m7, m2
+    punpckhqdq  m7, m3
+    punpcklqdq  m2, m3
+    paddd       m2, m7
+    pshufd      m4, m4, 0xD8
+    pshufd      m5, m5, 0xD8
+    mova        m7, m4
+    punpckhqdq  m7, m5
+    punpcklqdq  m4, m5
+    paddd       m4, m7
+    pshufd      m2, m2, 0xD8
+    pshufd      m4, m4, 0xD8
+    mova        m7, m2
+    punpckhqdq  m7, m4
+    punpcklqdq  m2, m4
+    paddd       m2, m7                  ; m2 = [Row7 Row5]
+    paddd       m2, DCT_ADD2
+    psrad       m2, DCT_SHIFT2
+
+    packssdw    m2, m2
+    movd        [r1 + 5*mmsize], m2
+    pshufd      m2, m2, 1
+    movd        [r1 + 7*mmsize], m2
+%if x < 3
+    add         r1, mmsize/4
+    add         r0, 2*2*mmsize
+%endif
+%assign x x+1
+%endrep
+
+    RET
+%undef DCT_SHIFT1
+%undef DCT_ADD1
+%undef DCT_SHIFT2
+%undef DCT_ADD2
 
 ;-------------------------------------------------------
 ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
diff -r 039ea966d5eb -r 28287b57013e source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Feb 18 19:04:02 2015 -0600
+++ b/source/common/x86/dct8.h	Thu Feb 19 14:44:16 2015 -0800
@@ -24,6 +24,7 @@
 #ifndef X265_DCT8_H
 #define X265_DCT8_H
 void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);

