[x265] [PATCH] asm: addAvg avx2 code for high_bit_depth sizes >= 8, improved by ~45% over previous code

dnyaneshwar at multicorewareinc.com
Thu May 14 07:45:34 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431582271 -19800
#      Thu May 14 11:14:31 2015 +0530
# Node ID 722ec5cd93ab072a182f0d94ad53ce4e8ad34f94
# Parent  479087422e29a672d6e9bc8d0cd2a65649d71fe2
asm: addAvg avx2 code for high_bit_depth sizes >= 8, improved by ~45% over previous code
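
Each kernel performs the same per-pixel operation: sum the two 16-bit
interpolation intermediates, rounding-shift right by 5 via pmulhrsw
against pw_1024 (pmulhrsw(x, 1024) == (x + 16) >> 5), add back the
shifted bias with pw_512, and clamp to the 10-bit range with
pmaxsw/pminsw against zero and pw_1023. pw_1023 is widened from 8 to
16 words so the 256-bit ymm loads of it stay in bounds. As a rough
scalar sketch of the arithmetic for the 10-bit case (the function name
and signature below are illustrative, not the exact x265 C primitive):

    #include <stdint.h>

    /* Scalar sketch of 10-bit addAvg. src0/src1 hold the biased 16-bit
     * interpolation intermediates. ((sum + 16) >> 5) + 512 equals
     * (sum + 16 + 2 * IF_INTERNAL_OFFS) >> 5 exactly, because
     * 2 * IF_INTERNAL_OFFS = 16384 is a multiple of 32. */
    static void addAvg_10bit_ref(const int16_t* src0, const int16_t* src1,
                                 uint16_t* dst, intptr_t stride0,
                                 intptr_t stride1, intptr_t dstStride,
                                 int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = ((src0[x] + src1[x] + 16) >> 5) + 512;
                dst[x] = (uint16_t)(v < 0 ? 0 : (v > 1023 ? 1023 : v));
            }
            src0 += stride0;
            src1 += stride1;
            dst  += dstStride;
        }
    }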

diff -r 479087422e29 -r 722ec5cd93ab source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Thu May 14 11:14:31 2015 +0530
@@ -1181,6 +1181,29 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.pu[LUMA_8x4].addAvg   = x265_addAvg_8x4_avx2;
+        p.pu[LUMA_8x8].addAvg   = x265_addAvg_8x8_avx2;
+        p.pu[LUMA_8x16].addAvg  = x265_addAvg_8x16_avx2;
+        p.pu[LUMA_8x32].addAvg  = x265_addAvg_8x32_avx2;
+        p.pu[LUMA_12x16].addAvg = x265_addAvg_12x16_avx2;
+        p.pu[LUMA_16x4].addAvg  = x265_addAvg_16x4_avx2;
+        p.pu[LUMA_16x8].addAvg  = x265_addAvg_16x8_avx2;
+        p.pu[LUMA_16x12].addAvg = x265_addAvg_16x12_avx2;
+        p.pu[LUMA_16x16].addAvg = x265_addAvg_16x16_avx2;
+        p.pu[LUMA_16x32].addAvg = x265_addAvg_16x32_avx2;
+        p.pu[LUMA_16x64].addAvg = x265_addAvg_16x64_avx2;
+        p.pu[LUMA_24x32].addAvg = x265_addAvg_24x32_avx2;
+        p.pu[LUMA_32x8].addAvg  = x265_addAvg_32x8_avx2;
+        p.pu[LUMA_32x16].addAvg = x265_addAvg_32x16_avx2;
+        p.pu[LUMA_32x24].addAvg = x265_addAvg_32x24_avx2;
+        p.pu[LUMA_32x32].addAvg = x265_addAvg_32x32_avx2;
+        p.pu[LUMA_32x64].addAvg = x265_addAvg_32x64_avx2;
+        p.pu[LUMA_48x64].addAvg = x265_addAvg_48x64_avx2;
+        p.pu[LUMA_64x16].addAvg = x265_addAvg_64x16_avx2;
+        p.pu[LUMA_64x32].addAvg = x265_addAvg_64x32_avx2;
+        p.pu[LUMA_64x48].addAvg = x265_addAvg_64x48_avx2;
+        p.pu[LUMA_64x64].addAvg = x265_addAvg_64x64_avx2;
+
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
         p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
diff -r 479087422e29 -r 722ec5cd93ab source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/const-a.asm	Thu May 14 11:14:31 2015 +0530
@@ -75,7 +75,7 @@
 const pw_256,               times 16 dw 256
 const pw_257,               times 16 dw 257
 const pw_512,               times 16 dw 512
-const pw_1023,              times  8 dw 1023
+const pw_1023,              times 16 dw 1023
 const pw_1024,              times 16 dw 1024
 const pw_4096,              times 16 dw 4096
 const pw_00ff,              times 16 dw 0x00ff
diff -r 479087422e29 -r 722ec5cd93ab source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/mc-a.asm	Thu May 14 11:14:31 2015 +0530
@@ -1017,6 +1017,454 @@
 ADDAVG_W64_H1 32
 ADDAVG_W64_H1 48
 ADDAVG_W64_H1 64
+
+;------------------------------------------------------------------------------
+; avx2 asm for addAvg high_bit_depth
+;------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro ADDAVG_W8_H4_AVX2 1
+cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,          [pw_512]
+    mova        m5,          [pw_1023]
+    mova        m3,          [pw_1024]
+    pxor        m1,          m1
+    add         r3d,         r3d
+    add         r4d,         r4d
+    add         r5d,         r5d
+    mov         r6d,         %1/4
+
+.loop:
+    movu        m0,          [r0]
+    vinserti128 m0,          m0, [r0 + r3], 1
+    movu        m2,          [r1]
+    vinserti128 m2,          m2, [r1 + r4], 1
+
+    paddw       m0,          m2
+    pmulhrsw    m0,          m3
+    paddw       m0,          m4
+    pmaxsw      m0,          m1
+    pminsw      m0,          m5
+    vextracti128 xm2,        m0, 1
+    movu        [r2],        xm0
+    movu        [r2 + r5],   xm2
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    movu        m0,          [r0]
+    vinserti128 m0,          m0, [r0 + r3], 1
+    movu        m2,          [r1]
+    vinserti128 m2,          m2, [r1 + r4], 1
+
+    paddw       m0,          m2
+    pmulhrsw    m0,          m3
+    paddw       m0,          m4
+    pmaxsw      m0,          m1
+    pminsw      m0,          m5
+    vextracti128 xm2,        m0, 1
+    movu        [r2],        xm0
+    movu        [r2 + r5],   xm2
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W8_H4_AVX2 4
+ADDAVG_W8_H4_AVX2 8
+ADDAVG_W8_H4_AVX2 12
+ADDAVG_W8_H4_AVX2 16
+ADDAVG_W8_H4_AVX2 32
+ADDAVG_W8_H4_AVX2 64
+
+cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova           m4,             [pw_512]
+    mova           m5,             [pw_1023]
+    mova           m3,             [pw_1024]
+    pxor           m1,             m1
+    add            r3,             r3
+    add            r4,             r4
+    add            r5,             r5
+    mov            r6d,            4
+
+.loop:
+%rep 2
+    movu           m0,             [r0]
+    movu           m2,             [r1]
+    paddw          m0,             m2
+    pmulhrsw       m0,             m3
+    paddw          m0,             m4
+    pmaxsw         m0,             m1
+    pminsw         m0,             m5
+    vextracti128   xm2,            m0, 1
+    movu           [r2],           xm0
+    movq           [r2 + 16],      xm2
+
+    movu           m0,             [r0 + r3]
+    movu           m2,             [r1 + r4]
+    paddw          m0,             m2
+    pmulhrsw       m0,             m3
+    paddw          m0,             m4
+    pmaxsw         m0,             m1
+    pminsw         m0,             m5
+    vextracti128   xm2,            m0, 1
+    movu           [r2 + r5],      xm0
+    movq           [r2 + r5 + 16], xm2
+
+    lea            r2,             [r2 + 2 * r5]
+    lea            r0,             [r0 + 2 * r3]
+    lea            r1,             [r1 + 2 * r4]
+%endrep
+    dec            r6d
+    jnz            .loop
+    RET
+
+%macro ADDAVG_W16_H4_AVX2 1
+cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m3,              [pw_1024]
+    pxor        m2,              m2
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+    mov         r6d,             %1/4
+
+.loop:
+%rep 2
+    movu        m0,              [r0]
+    movu        m1,              [r1]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + r3]
+    movu        m1,              [r1 + r4]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5],       m0
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+%endrep
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W16_H4_AVX2 4
+ADDAVG_W16_H4_AVX2 8
+ADDAVG_W16_H4_AVX2 12
+ADDAVG_W16_H4_AVX2 16
+ADDAVG_W16_H4_AVX2 24
+ADDAVG_W16_H4_AVX2 32
+ADDAVG_W16_H4_AVX2 64
+
+cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m3,              [pw_1024]
+    pxor        m1,              m1
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+
+    mov         r6d,             16
+
+.loop:
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m1
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        xm0,             [r0 + 32]
+    movu        xm2,             [r1 + 32]
+    paddw       xm0,             xm2
+    pmulhrsw    xm0,             xm3
+    paddw       xm0,             xm4
+    pmaxsw      xm0,             xm1
+    pminsw      xm0,             xm5
+    movu        [r2 + 32],       xm0
+
+    movu        m0,              [r0 + r3]
+    movu        m2,              [r1 + r4]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m1
+    pminsw      m0,              m5
+    movu        [r2 + r5],       m0
+
+    movu        xm2,             [r0 + r3 + 32]
+    movu        xm0,             [r1 + r4 + 32]
+    paddw       xm2,             xm0
+    pmulhrsw    xm2,             xm3
+    paddw       xm2,             xm4
+    pmaxsw      xm2,             xm1
+    pminsw      xm2,             xm5
+    movu        [r2 + r5 + 32],  xm2
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+
+%macro ADDAVG_W32_H2_AVX2 1
+cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m3,              [pw_1024]
+    pxor        m2,              m2
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+
+    mov         r6d,             %1/2
+
+.loop:
+    movu        m0,              [r0]
+    movu        m1,              [r1]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m1,              [r1 + 32]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + 32],       m0
+
+    movu        m0,              [r0 + r3]
+    movu        m1,              [r1 + r4]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5],       m0
+
+    movu        m0,              [r0 + r3 + 32]
+    movu        m1,              [r1 + r4 + 32]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5 + 32],  m0
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W32_H2_AVX2 8
+ADDAVG_W32_H2_AVX2 16
+ADDAVG_W32_H2_AVX2 24
+ADDAVG_W32_H2_AVX2 32
+ADDAVG_W32_H2_AVX2 48
+ADDAVG_W32_H2_AVX2 64
+
+cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m3,              [pw_1024]
+    pxor        m2,              m2
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+
+    mov         r6d,             32
+
+.loop:
+    movu        m0,              [r0]
+    movu        m1,              [r1]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m1,              [r1 + 32]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + 32],       m0
+
+    movu        m0,              [r0 + 64]
+    movu        m1,              [r1 + 64]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + 64],       m0
+
+    movu        m0,              [r0 + r3]
+    movu        m1,              [r1 + r4]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5],       m0
+
+    movu        m0,              [r0 + r3 + 32]
+    movu        m1,              [r1 + r4 + 32]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5 + 32],  m0
+
+    movu        m0,              [r0 + r3 + 64]
+    movu        m1,              [r1 + r4 + 64]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5 + 64],  m0
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+
+%macro ADDAVG_W64_H1_AVX2 1
+cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m3,              [pw_1024]
+    pxor        m2,              m2
+    add         r3d,             r3d
+    add         r4d,             r4d
+    add         r5d,             r5d
+
+    mov         r6d,             %1/2
+
+.loop:
+    movu        m0,              [r0]
+    movu        m1,              [r1]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m1,              [r1 + 32]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + 32],       m0
+
+    movu        m0,              [r0 + 64]
+    movu        m1,              [r1 + 64]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + 64],       m0
+
+    movu        m0,              [r0 + 96]
+    movu        m1,              [r1 + 96]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + 96],       m0
+
+    movu        m0,              [r0 + r3]
+    movu        m1,              [r1 + r4]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5],       m0
+
+    movu        m0,              [r0 + r3 + 32]
+    movu        m1,              [r1 + r4 + 32]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5 + 32],  m0
+
+    movu        m0,              [r0 + r3 + 64]
+    movu        m1,              [r1 + r4 + 64]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5 + 64],  m0
+
+    movu        m0,              [r0 + r3 + 96]
+    movu        m1,              [r1 + r4 + 96]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5 + 96],  m0
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W64_H1_AVX2 16
+ADDAVG_W64_H1_AVX2 32
+ADDAVG_W64_H1_AVX2 48
+ADDAVG_W64_H1_AVX2 64
 ;-----------------------------------------------------------------------------
 %else ; !HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
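
Note for anyone applying this locally: the new primitives can be
validated and timed against the C reference with the testbench built
from source/test, which is how speedups like the ~45% in the subject
are normally measured.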

