[x265] [PATCH] asm: addAvg avx2 code for luma width >= 8

dnyaneshwar at multicorewareinc.com
Thu Mar 19 06:54:47 CET 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1426744326 -19800
#      Thu Mar 19 11:22:06 2015 +0530
# Node ID 2807d9a5a494de78340ab6d09867205b6676330b
# Parent  cbfa66e0b50cc2393ccbcf6471406504c6c06011
asm: addAvg avx2 code for luma width >= 8

AVX2 improvement over SSE4 asm (in CPU cycles):
                AVX2      SSE4
 addAvg[  8x8]  207.85    257.95
 addAvg[  8x4]  160.07    166.73
 addAvg[16x16]  517.91    704.17
 addAvg[ 16x8]  265.46    366.31
 addAvg[ 8x16]  426.98    510.07
 addAvg[ 16x4]  145.12    213.41
 addAvg[16x12]  358.53    545.57
 addAvg[12x16]  768.90    808.42
 addAvg[32x32]  1386.84   2566.90
 addAvg[32x16]  692.19    1088.32
 addAvg[16x32]  847.97    1355.29
 addAvg[ 32x8]  397.61    650.59
 addAvg[32x24]  1245.11   1860.74
 addAvg[ 8x32]  941.76    885.06
 addAvg[24x32]  1745.70   2055.57
 addAvg[64x64]  5541.37   9395.08
 addAvg[64x32]  2566.79   4392.35
 addAvg[32x64]  3033.06   4320.25
 addAvg[64x16]  1493.78   2148.42
 addAvg[64x48]  4478.40   7165.04
 addAvg[16x64]  1681.48   2201.52
 addAvg[48x64]  4322.58   6869.51
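
For reference, each addAvg primitive averages two blocks of 16-bit prediction
intermediates and converts the result back to pixels. Below is a minimal
scalar sketch of the 8-bit operation these kernels vectorize (assuming the
usual x265 constants IF_INTERNAL_PREC = 14 and IF_INTERNAL_OFFS = 8192;
addAvg_ref, bx and by are illustrative names, not identifiers from this patch):

    #include <cstdint>

    static void addAvg_ref(const int16_t* src0, const int16_t* src1, uint8_t* dst,
                           intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                           int bx, int by)
    {
        const int shift  = 14 + 1 - 8;                    // IF_INTERNAL_PREC + 1 - bitDepth
        const int offset = (1 << (shift - 1)) + 2 * 8192; // rounding term + 2 * IF_INTERNAL_OFFS

        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
            {
                // average the two predictions and rescale to the 8-bit range
                int v = (src0[x] + src1[x] + offset) >> shift;
                dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); // clip to [0, 255]
            }
            src0 += src0Stride;
            src1 += src1Stride;
            dst  += dstStride;
        }
    }

The AVX2 kernels below vectorize this arithmetic a row pair at a time (four
rows per loop iteration for the widths below 24).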

diff -r cbfa66e0b50c -r 2807d9a5a494 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Mar 18 18:16:51 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Mar 19 11:22:06 2015 +0530
@@ -1417,6 +1417,35 @@
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.pu[LUMA_8x4].addAvg = x265_addAvg_8x4_avx2;
+        p.pu[LUMA_8x8].addAvg = x265_addAvg_8x8_avx2;
+        p.pu[LUMA_8x16].addAvg = x265_addAvg_8x16_avx2;
+        p.pu[LUMA_8x32].addAvg = x265_addAvg_8x32_avx2;
+
+        p.pu[LUMA_12x16].addAvg = x265_addAvg_12x16_avx2;
+
+        p.pu[LUMA_16x4].addAvg = x265_addAvg_16x4_avx2;
+        p.pu[LUMA_16x8].addAvg = x265_addAvg_16x8_avx2;
+        p.pu[LUMA_16x12].addAvg = x265_addAvg_16x12_avx2;
+        p.pu[LUMA_16x16].addAvg = x265_addAvg_16x16_avx2;
+        p.pu[LUMA_16x32].addAvg = x265_addAvg_16x32_avx2;
+        p.pu[LUMA_16x64].addAvg = x265_addAvg_16x64_avx2;
+
+        p.pu[LUMA_24x32].addAvg = x265_addAvg_24x32_avx2;
+
+        p.pu[LUMA_32x8].addAvg = x265_addAvg_32x8_avx2;
+        p.pu[LUMA_32x16].addAvg = x265_addAvg_32x16_avx2;
+        p.pu[LUMA_32x24].addAvg = x265_addAvg_32x24_avx2;
+        p.pu[LUMA_32x32].addAvg = x265_addAvg_32x32_avx2;
+        p.pu[LUMA_32x64].addAvg = x265_addAvg_32x64_avx2;
+
+        p.pu[LUMA_48x64].addAvg = x265_addAvg_48x64_avx2;
+
+        p.pu[LUMA_64x16].addAvg = x265_addAvg_64x16_avx2;
+        p.pu[LUMA_64x32].addAvg = x265_addAvg_64x32_avx2;
+        p.pu[LUMA_64x48].addAvg = x265_addAvg_64x48_avx2;
+        p.pu[LUMA_64x64].addAvg = x265_addAvg_64x64_avx2;
+
         p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
         p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
         p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2;
diff -r cbfa66e0b50c -r 2807d9a5a494 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Wed Mar 18 18:16:51 2015 -0500
+++ b/source/common/x86/mc-a.asm	Thu Mar 19 11:22:06 2015 +0530
@@ -1759,7 +1759,492 @@
 ADDAVG_W16_H4 24
 
 ;-----------------------------------------------------------------------------
-
+; addAvg avx2 code start
+;-----------------------------------------------------------------------------
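+; per pixel the kernels compute: dst = clip_uint8(((src0 + src1 + 64) >> 7) + 128)
+; pmulhrsw against pw_256 performs the rounded shift, the paddw of pw_128 adds
+; back the bias carried by the 16-bit prediction intermediates, and packuswb
+; saturates the result to 8-bit pixels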
+
+%macro ADDAVG_W8_H4_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_8x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova            m4, [pw_256]
+    mova            m5, [pw_128]
+    add             r3, r3
+    add             r4, r4
+    mov             r6d, %1/4
+
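+    ; each loop iteration handles four rows, two at a time: row n is kept in
+    ; the low 128-bit lane of the ymm register and row n+1 in the high lane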
+.loop:
+    movu            xm0, [r0]
+    vinserti128     m0, m0, [r0 + r3], 1
+
+    movu            xm2, [r1]
+    vinserti128     m2, m2, [r1 + r4], 1
+
+    paddw           m0, m2
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    packuswb        m0, m0
+    vextracti128    xm1, m0, 1
+    movq            [r2], xm0
+    movq            [r2 + r5], xm1
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    movu            xm0, [r0]
+    vinserti128     m0, m0, [r0 + r3], 1
+
+    movu            xm2, [r1]
+    vinserti128     m2, m2, [r1 + r4], 1
+
+    paddw           m0, m2
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    packuswb        m0, m0
+    vextracti128    xm1, m0, 1
+    movq            [r2], xm0
+    movq            [r2 + r5], xm1
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    dec             r6d
+    jnz             .loop
+    RET
+%endmacro
+
+ADDAVG_W8_H4_AVX2 4
+ADDAVG_W8_H4_AVX2 8
+ADDAVG_W8_H4_AVX2 16
+ADDAVG_W8_H4_AVX2 32
+
+%macro ADDAVG_W12_H4_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_12x%1, 6,7,7, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova            m4, [pw_256]
+    mova            m5, [pw_128]
+    add             r3, r3
+    add             r4, r4
+    mov             r6d, %1/4
+
+.loop:
+    movu            xm0, [r0]
+    movu            xm1, [r1]
+    movq            xm2, [r0 + 16]
+    movq            xm3, [r1 + 16]
+    vinserti128     m0, m0, xm2, 1
+    vinserti128     m1, m1, xm3, 1
+
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            xm1, [r0 + r3]
+    movu            xm2, [r1 + r4]
+    movq            xm3, [r0 + r3 + 16]
+    movq            xm6, [r1 + r4 + 16]
+    vinserti128     m1, m1, xm3, 1
+    vinserti128     m2, m2, xm6, 1
+
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vextracti128    xm1, m0, 1
+    movq            [r2], xm0
+    movd            [r2 + 8], xm1
+    vpshufd         m1, m1, 2
+    movhps          [r2 + r5], xm0
+    movd            [r2 + r5 + 8], xm1
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    movu            xm0, [r0]
+    movu            xm1, [r1]
+    movq            xm2, [r0 + 16]
+    movq            xm3, [r1 + 16]
+    vinserti128     m0, m0, xm2, 1
+    vinserti128     m1, m1, xm3, 1
+
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            xm1, [r0 + r3]
+    movu            xm2, [r1 + r4]
+    movq            xm3, [r0 + r3 + 16]
+    movq            xm6, [r1 + r4 + 16]
+    vinserti128     m1, m1, xm3, 1
+    vinserti128     m2, m2, xm6, 1
+
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vextracti128    xm1, m0, 1
+    movq            [r2], xm0
+    movd            [r2 + 8], xm1
+    vpshufd         m1, m1, 2
+    movhps          [r2 + r5], xm0
+    movd            [r2 + r5 + 8], xm1
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    dec             r6d
+    jnz             .loop
+    RET
+%endmacro
+
+ADDAVG_W12_H4_AVX2 16
+
+%macro ADDAVG_W16_H4_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_16x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova            m4, [pw_256]
+    mova            m5, [pw_128]
+    add             r3, r3
+    add             r4, r4
+    mov             r6d, %1/4
+
+.loop:
+    movu            m0, [r0]
+    movu            m1, [r1]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + r3]
+    movu            m2, [r1 + r4]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
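+    ; packuswb packs per 128-bit lane, so the two rows end up interleaved
+    ; across lanes; vpermq 11011000b regroups the qwords so row n lands in
+    ; the low xmm and row n+1 in the high xmm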
+    vpermq          m0, m0, 11011000b
+    vextracti128    [r2], m0, 0
+    vextracti128    [r2 + r5], m0, 1
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    movu            m0, [r0]
+    movu            m1, [r1]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + r3]
+    movu            m2, [r1 + r4]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    vextracti128    [r2], m0, 0
+    vextracti128    [r2 + r5], m0, 1
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    dec             r6d
+    jnz             .loop
+    RET
+%endmacro
+
+ADDAVG_W16_H4_AVX2 4
+ADDAVG_W16_H4_AVX2 8
+ADDAVG_W16_H4_AVX2 12
+ADDAVG_W16_H4_AVX2 16
+ADDAVG_W16_H4_AVX2 32
+ADDAVG_W16_H4_AVX2 64
+
+%macro ADDAVG_W24_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_24x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova            m4, [pw_256]
+    mova            m5, [pw_128]
+    add             r3, r3
+    add             r4, r4
+    mov             r6d, %1/2
+
+.loop:
+    movu            m0, [r0]
+    movu            m1, [r1]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            xm1, [r0 + 32]
+    movu            xm2, [r1 + 32]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 10001101b
+    vextracti128    [r2], m0, 1
+    movq            [r2 + 16], xm0
+
+    movu            m0, [r0 + r3]
+    movu            m1, [r1 + r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            xm1, [r0 + r3 + 32]
+    movu            xm2, [r1 + r4 + 32]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 10001101b
+    vextracti128    [r2 + r5], m0, 1
+    movq            [r2 + r5 + 16], xm0
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    dec             r6d
+    jnz             .loop
+    RET
+%endmacro
+
+ADDAVG_W24_H2_AVX2 32
+
+%macro ADDAVG_W32_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_32x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova            m4, [pw_256]
+    mova            m5, [pw_128]
+    add             r3, r3
+    add             r4, r4
+    mov             r6d, %1/2
+
+.loop:
+    movu            m0, [r0]
+    movu            m1, [r1]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + 32]
+    movu            m2, [r1 + 32]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    movu            [r2], m0
+
+    movu            m0, [r0 + r3]
+    movu            m1, [r1 + r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + r3 + 32]
+    movu            m2, [r1 + r4 + 32]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    movu            [r2 + r5], m0
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    dec             r6d
+    jnz             .loop
+    RET
+%endmacro
+
+ADDAVG_W32_H2_AVX2 8
+ADDAVG_W32_H2_AVX2 16
+ADDAVG_W32_H2_AVX2 24
+ADDAVG_W32_H2_AVX2 32
+ADDAVG_W32_H2_AVX2 64
+
+%macro ADDAVG_W64_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_64x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova            m4, [pw_256]
+    mova            m5, [pw_128]
+    add             r3, r3
+    add             r4, r4
+    mov             r6d, %1/2
+
+.loop:
+    movu            m0, [r0]
+    movu            m1, [r1]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + 32]
+    movu            m2, [r1 + 32]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    movu            [r2], m0
+
+    movu            m0, [r0 + 64]
+    movu            m1, [r1 + 64]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + 96]
+    movu            m2, [r1 + 96]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    movu            [r2 + 32], m0
+
+    movu            m0, [r0 + r3]
+    movu            m1, [r1 + r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + r3 + 32]
+    movu            m2, [r1 + r4 + 32]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    movu            [r2 + r5], m0
+
+    movu            m0, [r0 + r3 + 64]
+    movu            m1, [r1 + r4 + 64]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + r3 + 96]
+    movu            m2, [r1 + r4 + 96]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    movu            [r2 + r5 + 32], m0
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    dec             r6d
+    jnz             .loop
+    RET
+%endmacro
+
+ADDAVG_W64_H2_AVX2 16
+ADDAVG_W64_H2_AVX2 32
+ADDAVG_W64_H2_AVX2 48
+ADDAVG_W64_H2_AVX2 64
+
+%macro ADDAVG_W48_H2_AVX2 1
+INIT_YMM avx2
+cglobal addAvg_48x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova            m4, [pw_256]
+    mova            m5, [pw_128]
+    add             r3, r3
+    add             r4, r4
+    mov             r6d, %1/2
+
+.loop:
+    movu            m0, [r0]
+    movu            m1, [r1]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + 32]
+    movu            m2, [r1 + 32]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    movu            [r2], m0
+
+    movu            m0, [r0 + 64]
+    movu            m1, [r1 + 64]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    packuswb        m0, m0
+    vpermq          m0, m0, 11011000b
+    vextracti128    [r2 + 32], m0, 0
+
+    movu            m0, [r0 + r3]
+    movu            m1, [r1 + r4]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    movu            m1, [r0 + r3 + 32]
+    movu            m2, [r1 + r4 + 32]
+    paddw           m1, m2
+    pmulhrsw        m1, m4
+    paddw           m1, m5
+
+    packuswb        m0, m1
+    vpermq          m0, m0, 11011000b
+    movu            [r2 + r5], m0
+
+    movu            m0, [r0 + r3 + 64]
+    movu            m1, [r1 + r4 + 64]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+
+    packuswb        m0, m0
+    vpermq          m0, m0, 11011000b
+    vextracti128    [r2 + r5 + 32], m0, 0
+
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+
+    dec             r6d
+    jnz             .loop
+    RET
+%endmacro
+
+ADDAVG_W48_H2_AVX2 64
+
+;-----------------------------------------------------------------------------
+; addAvg avx2 code end
+;-----------------------------------------------------------------------------
 
 ;-----------------------------------------------------------------------------
 %macro ADDAVG_W24_H2 2
diff -r cbfa66e0b50c -r 2807d9a5a494 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Wed Mar 18 18:16:51 2015 -0500
+++ b/source/common/x86/pixel.h	Thu Mar 19 11:22:06 2015 +0530
@@ -180,7 +180,8 @@
 int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t);
 
 #define ADDAVG(func)  \
-    void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
+    void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+    void x265_ ## func ## _avx2(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
 ADDAVG(addAvg_2x4)
 ADDAVG(addAvg_2x8)
 ADDAVG(addAvg_4x2);
