[x265] [PATCH] asm: intra_pred_dc32 high_bit_depth code

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Mon May 4 15:17:18 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1430744823 -19800
#      Mon May 04 18:37:03 2015 +0530
# Node ID 9ca2b6bcb92969ddefe75d04f5a6c2caf4fd994b
# Parent  4cf55e54fe3ec33f540b7678b02de34074c0527b
asm: intra_pred_dc32 high_bit_depth code

AVX2 (speedup, optimized time, baseline time; speedup = baseline / optimized):
intra_dc_32x32    18.93x   801.28          15169.64

SSE (same columns):
intra_dc_32x32    10.41x   1457.15         15167.84

diff -r 4cf55e54fe3e -r 9ca2b6bcb929 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sat May 02 10:58:05 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Mon May 04 18:37:03 2015 +0530
@@ -1181,6 +1181,8 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
+
         p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
         p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2;
 
diff -r 4cf55e54fe3e -r 9ca2b6bcb929 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Sat May 02 10:58:05 2015 -0500
+++ b/source/common/x86/intrapred16.asm	Mon May 04 18:37:03 2015 +0530
@@ -448,6 +448,107 @@
 %endrep
     RET
 
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_pred_dc32, 3, 3, 2
+    ; r0 = dst, r1 = dstStride, r2 = srcPix (16-bit pixels); dirMode and bFilter are not read
+    add             r1, r1              ; pixels -> bytes; full register since dstStride is intptr_t
+    movu            m0, [r2 + 130]      ; 130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel)
+    movu            m1, [r2 + 162]      ; 162 = 130+32
+    paddw           m0, m1
+    movu            m1, [r2 + 2]        ;   2 = 1*sizeof(pixel)
+    paddw           m0, m1
+    movu            m1, [r2 + 34]       ;  34 = 2+32
+    paddw           m0, m1              ; 16 words, each the sum of 4 reference pixels
+    ; widen to dwords now: pmaddwd is signed, and 32-pixel word sums pass 32767 above 10-bit
+    pmaddwd         m0, [pw_1]          ; 8 dwords, each the sum of 8 pixels
+    vextracti128    xm1, m0, 1
+    paddd           xm0, xm1            ; 4 dwords, 16 pixels each
+    movhlps         xm1, xm0
+    paddd           xm0, xm1            ; dwords 0-1, 32 pixels each
+    pshufd          xm1, xm0, 1
+    paddd           xm0, xm1            ; dword 0 = sum of all 64 pixels
+    paddd           xm0, [pd_32]        ; sum = sum + 32
+    psrld           xm0, 6              ; sum = sum / 64
+    vpbroadcastw    m0, xm0             ; DC value in all 16 word lanes
+
+    lea             r2, [r1 * 3]        ; srcPix is consumed; r2 = 3 * stride
+    ; store DC 32x32
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 0 + 32], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 1 + 32], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    movu            [r0 + r2 * 1 + 32], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 0 + 32], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 1 + 32], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    movu            [r0 + r2 * 1 + 32], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 0 + 32], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 1 + 32], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    movu            [r0 + r2 * 1 + 32], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 0 + 32], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 1 + 32], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    movu            [r0 + r2 * 1 + 32], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 0 + 32], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 1 + 32], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    movu            [r0 + r2 * 1 + 32], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 0 + 32], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 1 + 32], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    movu            [r0 + r2 * 1 + 32], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 0 + 32], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 1 + 32], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    movu            [r0 + r2 * 1 + 32], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 0 + 32], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 1 + 32], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    movu            [r0 + r2 * 1 + 32], m0
+    RET
+
 ;---------------------------------------------------------------------------------------
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------


More information about the x265-devel mailing list