[x265] [PATCH] asm: avx2 code for high_bit_depth intra_dc_16x16

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue May 12 08:18:53 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431409071 -19800
#      Tue May 12 11:07:51 2015 +0530
# Node ID ac2832c459edc3e1417d6fb62f89203e23484ec8
# Parent  5bf9096a4d93fd8c47e9c81e3dafba5d37391114
asm: avx2 code for high_bit_depth intra_dc_16x16

AVX2:
intra_dc_16x16[filter=0]     18.78x   231.65          4350.17
intra_dc_16x16[filter=1]     10.76x   467.37          5028.99

SSE:
intra_dc_16x16[filter=0]     9.46x    459.22          4345.05
intra_dc_16x16[filter=1]     7.19x    692.54          4976.13

diff -r 5bf9096a4d93 -r ac2832c459ed source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue May 12 11:04:20 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue May 12 11:07:51 2015 +0530
@@ -1181,6 +1181,7 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_avx2;
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
 
         p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_avx2;
diff -r 5bf9096a4d93 -r ac2832c459ed source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Tue May 12 11:04:20 2015 +0530
+++ b/source/common/x86/const-a.asm	Tue May 12 11:07:51 2015 +0530
@@ -62,7 +62,7 @@
 ;; 16-bit constants
 
 const pw_1,                 times 16 dw 1
-const pw_2,                 times  8 dw 2
+const pw_2,                 times 16 dw 2
 const pw_m2,                times  8 dw -2
 const pw_4,                 times  8 dw 4
 const pw_8,                 times  8 dw 8
diff -r 5bf9096a4d93 -r ac2832c459ed source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Tue May 12 11:04:20 2015 +0530
+++ b/source/common/x86/intrapred.h	Tue May 12 11:07:51 2015 +0530
@@ -34,6 +34,7 @@
 void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
+void x265_intra_pred_dc16_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 
 void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
diff -r 5bf9096a4d93 -r ac2832c459ed source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Tue May 12 11:04:20 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Tue May 12 11:07:51 2015 +0530
@@ -448,6 +448,118 @@
 %endrep
     RET
 
+;-------------------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_pred_dc16, 3, 9, 4
+    mov             r3d,                 r4m
+    add             r1d,                 r1d
+    movu            m0,                  [r2 + 66]
+    movu            m2,                  [r2 +  2]
+    paddw           m0,                  m2
+
+    vextracti128    xm1,                 m0, 1
+    paddw           xm0,                 xm1
+    movhlps         xm1,                 xm0
+    paddw           xm0,                 xm1
+    phaddw          xm0,                 xm0
+    pmaddwd         xm0,                 [pw_1]
+    paddd           xm0,                 [pd_16]
+    psrad           xm0,                 5
+    movd            r5d,                 xm0
+    vpbroadcastw    m0,                  xm0
+
+    test            r3d,                 r3d
+
+    ; store DC 16x16
+    lea             r6,                  [r1 + r1 * 2]        ; index 3
+    lea             r7,                  [r1 + r1 * 4]        ; index 5
+    lea             r8,                  [r6 + r1 * 4]        ; index 7
+    lea             r4,                  [r0 + r8 * 1]        ; base + 7
+
+    movu            [r0],                m0
+    movu            [r0 + r1],           m0
+    movu            [r0 + r1 * 2],       m0
+    movu            [r0 + r6],           m0
+    movu            [r0 + r1 * 4],       m0
+    movu            [r0 + r7],           m0
+    movu            [r0 + r6 * 2],       m0
+    movu            [r4],                m0
+    movu            [r0 + r1 * 8],       m0
+    movu            [r4 + r1 * 2],       m0
+    movu            [r0 + r7 * 2],       m0
+    movu            [r4 + r1 * 4],       m0
+    movu            [r0 + r6 * 4],       m0
+    movu            [r4 + r6 * 2],       m0
+    movu            [r4 + r8],           m0
+    movu            [r4 + r1 * 8],       m0
+
+    ; Do DC Filter
+    jz              .end
+    mova            m1,                  [pw_2]
+    pmullw          m1,                  m0
+    paddw           m1,                  [pw_2]
+    movd            r3d,                 xm1
+    paddw           m1,                  m0
+
+    ; filter top
+    movu            m2,                  [r2 + 2]
+    paddw           m2,                  m1
+    psraw           m2,                  2
+    movu            [r0],                m2
+
+    ; filter top-left
+    movzx           r3d,                 r3w
+    movzx           r5d, word            [r2 + 66]
+    add             r3d,                 r5d
+    movzx           r5d, word            [r2 + 2]
+    add             r5d,                 r3d
+    shr             r5d,                 2
+    mov             [r0],                r5w
+
+    ; filter left
+    movu            m2,                  [r2 + 68]
+    paddw           m2,                  m1
+    psraw           m2,                  2
+    vextracti128    xm3,                 m2, 1
+
+    movq            r3,                  xm2
+    pshufd          xm2,                 xm2, 0xEE
+    mov             [r0 + r1],           r3w
+    shr             r3,                  16
+    mov             [r0 + r1 * 2],       r3w
+    shr             r3,                  16
+    mov             [r0 + r6],           r3w
+    shr             r3,                  16
+    mov             [r0 + r1 * 4],       r3w
+    movq            r3,                  xm2
+    mov             [r0 + r7],           r3w
+    shr             r3,                  16
+    mov             [r0 + r6 * 2],       r3w
+    shr             r3,                  16
+    mov             [r4],                r3w
+    shr             r3,                  16
+    mov             [r0 + r1 * 8],       r3w
+
+    movq            r3,                  xm3
+    pshufd          xm3,                 xm3, 0xEE
+    mov             [r4 + r1 * 2],       r3w
+    shr             r3,                  16
+    mov             [r0 + r7 * 2],       r3w
+    shr             r3,                  16
+    mov             [r4 + r1 * 4],       r3w
+    shr             r3,                  16
+    mov             [r0 + r6 * 4],       r3w
+    movq            r3,                  xm3
+    mov             [r4 + r6 * 2],       r3w
+    shr             r3,                  16
+    mov             [r4 + r8],           r3w
+    shr             r3,                  16
+    mov             [r4 + r1 * 8],       r3w
+.end:
+    RET
+
 ;---------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
 ;---------------------------------------------------------------------------------------------


More information about the x265-devel mailing list