[x265] [PATCH] asm: pixelavg_pp[32xN], [64xN], 48x64 avx2 code for 10bpp

rajesh at multicorewareinc.com
Wed Jun 24 15:18:18 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1435149234 -19800
#      Wed Jun 24 18:03:54 2015 +0530
# Node ID 6b51492f87f036b6e58f5a92d2b3e85da4e57906
# Parent  3a5cd130f9084147168c02f26de102faf59d193b
asm: pixelavg_pp[32xN],[64xN],48x64 avx2 code for 10bpp

avx2 (speedup, avx2 cycles, C cycles):
avg_pp[ 32x8]  13.95x   345.28          4815.70
avg_pp[32x16]  18.23x   535.22          9759.08
avg_pp[32x24]  19.25x   753.64          14506.10
avg_pp[32x32]  19.68x   975.15          19192.85
avg_pp[32x64]  21.43x   1841.33         39462.92
avg_pp[64x16]  19.15x   987.13          18901.01
avg_pp[64x32]  20.18x   1874.47         37825.34
avg_pp[64x48]  19.89x   2837.11         56439.58
avg_pp[64x64]  19.76x   3774.05         74572.41
avg_pp[48x64]  19.65x   2752.09         54082.53

sse2 (speedup, sse2 cycles, C cycles):
avg_pp[ 32x8]  10.37x   470.87          4883.57
avg_pp[32x16]  11.15x   873.08          9737.43
avg_pp[32x24]  11.34x   1287.71         14596.59
avg_pp[32x32]  11.41x   1697.46         19369.11
avg_pp[32x64]  12.52x   3220.95         40330.95
avg_pp[64x16]  10.94x   1670.19         18267.47
avg_pp[64x32]  11.49x   3274.41         37635.54
avg_pp[64x48]  11.79x   4802.15         56622.23
avg_pp[64x64]  11.30x   6667.17         75332.41
avg_pp[48x64]  10.56x   5138.91         54275.12
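
In both tables, speedup is the C cycle count divided by the vectorized
cycle count, e.g. 4815.70 / 345.28 = about 13.95x for avg_pp[32x8].

pavgw computes a rounded average per 16-bit lane, (a + b + 1) >> 1, so
each kernel below is the vector form of a simple scalar loop. A minimal
C sketch for orientation (pixel_avg_c and the explicit width/height
parameters are illustrative, not the x265 reference code; note the asm
doubles the incoming strides because they are passed in pixels and each
10bpp pixel occupies two bytes):

    #include <stdint.h>

    typedef uint16_t pixel; /* 10bpp pixels are stored as 16-bit words */

    /* Rounded average of two prediction blocks, matching pavgw's
     * (a + b + 1) >> 1 semantics per pixel. Strides are in pixels. */
    static void pixel_avg_c(pixel* dst, intptr_t dstride,
                            const pixel* src0, intptr_t sstride0,
                            const pixel* src1, intptr_t sstride1,
                            int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (pixel)((src0[x] + src1[x] + 1) >> 1);

            dst += dstride;
            src0 += sstride0;
            src1 += sstride1;
        }
    }

Each AVX2 kernel below unrolls this over four rows per loop iteration,
using two/three/four 32-byte ymm loads per row for widths 32/48/64.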

diff -r 3a5cd130f908 -r 6b51492f87f0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jun 22 17:39:54 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Wed Jun 24 18:03:54 2015 +0530
@@ -1358,6 +1358,16 @@
         p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
         p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
         p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
+        p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
+        p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
+        p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
+        p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx2);
+        p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx2);
+        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx2);
+        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
+        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
+        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
+        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
 
         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_avx2);
         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_avx2);
diff -r 3a5cd130f908 -r 6b51492f87f0 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Mon Jun 22 17:39:54 2015 -0700
+++ b/source/common/x86/mc-a.asm	Wed Jun 24 18:03:54 2015 +0530
@@ -4578,6 +4578,349 @@
     RET
 %endif
 
+%macro  pixel_avg_W32 0
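+    ; average four rows of a 32-pixel-wide block per call; at 10bpp each
+    ; row is 64 bytes, i.e. two 32-byte ymm loads (offsets 0 and 32);
+    ; r6/r7/r8 carry 3 * stride so rows 2 and 3 need no extra pointer math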
+    movu    m0, [r2]
+    movu    m1, [r4]
+    pavgw   m0, m1
+    movu    [r0], m0
+    movu    m2, [r2 + r3]
+    movu    m3, [r4 + r5]
+    pavgw   m2, m3
+    movu    [r0 + r1], m2
+
+    movu    m0, [r2 + 32]
+    movu    m1, [r4 + 32]
+    pavgw   m0, m1
+    movu    [r0 + 32], m0
+    movu    m2, [r2 + r3 + 32]
+    movu    m3, [r4 + r5 + 32]
+    pavgw   m2, m3
+    movu    [r0 + r1 + 32], m2
+
+    movu    m0, [r2 + r3 * 2]
+    movu    m1, [r4 + r5 * 2]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2], m0
+    movu    m2, [r2 + r6]
+    movu    m3, [r4 + r7]
+    pavgw   m2, m3
+    movu    [r0 + r8], m2
+
+    movu    m0, [r2 + r3 * 2 + 32]
+    movu    m1, [r4 + r5 * 2 + 32]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2 + 32], m0
+    movu    m2, [r2 + r6 + 32]
+    movu    m3, [r4 + r7 + 32]
+    pavgw   m2, m3
+    movu    [r0 + r8 + 32], m2
+
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
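+; strides are passed in pixels; each entry point doubles them to byte units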
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_avg_32x8, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 2
+.loop:
+    pixel_avg_W32
+    dec     r9d
+    jnz     .loop
+    RET
+
+cglobal pixel_avg_32x16, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 4
+.loop:
+    pixel_avg_W32
+    dec     r9d
+    jnz     .loop
+    RET
+
+cglobal pixel_avg_32x24, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 6
+.loop:
+    pixel_avg_W32
+    dec     r9d
+    jnz     .loop
+    RET
+
+cglobal pixel_avg_32x32, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 8
+.loop:
+    pixel_avg_W32
+    dec     r9d
+    jnz     .loop
+    RET
+
+cglobal pixel_avg_32x64, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 16
+.loop:
+    pixel_avg_W32
+    dec     r9d
+    jnz     .loop
+    RET
+%endif
+
+%macro  pixel_avg_W64 0
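+    ; same scheme for 64-pixel rows: 128 bytes per row, i.e. four 32-byte
+    ; ymm loads at offsets 0/32/64/96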
+    movu    m0, [r2]
+    movu    m1, [r4]
+    pavgw   m0, m1
+    movu    [r0], m0
+    movu    m2, [r2 + r3]
+    movu    m3, [r4 + r5]
+    pavgw   m2, m3
+    movu    [r0 + r1], m2
+
+    movu    m0, [r2 + 32]
+    movu    m1, [r4 + 32]
+    pavgw   m0, m1
+    movu    [r0 + 32], m0
+    movu    m2, [r2 + r3 + 32]
+    movu    m3, [r4 + r5 + 32]
+    pavgw   m2, m3
+    movu    [r0 + r1 + 32], m2
+
+    movu    m0, [r2 + 64]
+    movu    m1, [r4 + 64]
+    pavgw   m0, m1
+    movu    [r0 + 64], m0
+    movu    m2, [r2 + r3 + 64]
+    movu    m3, [r4 + r5 + 64]
+    pavgw   m2, m3
+    movu    [r0 + r1 + 64], m2
+
+    movu    m0, [r2 + 96]
+    movu    m1, [r4 + 96]
+    pavgw   m0, m1
+    movu    [r0 + 96], m0
+    movu    m2, [r2 + r3 + 96]
+    movu    m3, [r4 + r5 + 96]
+    pavgw   m2, m3
+    movu    [r0 + r1 + 96], m2
+
+    movu    m0, [r2 + r3 * 2]
+    movu    m1, [r4 + r5 * 2]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2], m0
+    movu    m2, [r2 + r6]
+    movu    m3, [r4 + r7]
+    pavgw   m2, m3
+    movu    [r0 + r8], m2
+
+    movu    m0, [r2 + r3 * 2 + 32]
+    movu    m1, [r4 + r5 * 2 + 32]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2 + 32], m0
+    movu    m2, [r2 + r6 + 32]
+    movu    m3, [r4 + r7 + 32]
+    pavgw   m2, m3
+    movu    [r0 + r8 + 32], m2
+
+    movu    m0, [r2 + r3 * 2 + 64]
+    movu    m1, [r4 + r5 * 2 + 64]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2 + 64], m0
+    movu    m2, [r2 + r6 + 64]
+    movu    m3, [r4 + r7 + 64]
+    pavgw   m2, m3
+    movu    [r0 + r8 + 64], m2
+
+    movu    m0, [r2 + r3 * 2 + 96]
+    movu    m1, [r4 + r5 * 2 + 96]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2 + 96], m0
+    movu    m2, [r2 + r6 + 96]
+    movu    m3, [r4 + r7 + 96]
+    pavgw   m2, m3
+    movu    [r0 + r8 + 96], m2
+
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_avg_64x16, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 4
+.loop:
+    pixel_avg_W64
+    dec     r9d
+    jnz     .loop
+    RET
+
+cglobal pixel_avg_64x32, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 8
+.loop:
+    pixel_avg_W64
+    dec     r9d
+    jnz     .loop
+    RET
+
+cglobal pixel_avg_64x48, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 12
+.loop:
+    pixel_avg_W64
+    dec     r9d
+    jnz     .loop
+    RET
+
+cglobal pixel_avg_64x64, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 16
+.loop:
+    pixel_avg_W64
+    dec     r9d
+    jnz     .loop
+    RET
+%endif
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_avg_48x64, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 16
+
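+; 48-pixel rows are 96 bytes: three 32-byte ymm loads at offsets 0/32/64;
+; four rows per iteration, 16 iterations cover the 64-row block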
+.loop:
+    movu    m0, [r2]
+    movu    m1, [r4]
+    pavgw   m0, m1
+    movu    [r0], m0
+    movu    m2, [r2 + r3]
+    movu    m3, [r4 + r5]
+    pavgw   m2, m3
+    movu    [r0 + r1], m2
+
+    movu    m0, [r2 + 32]
+    movu    m1, [r4 + 32]
+    pavgw   m0, m1
+    movu    [r0 + 32], m0
+    movu    m2, [r2 + r3 + 32]
+    movu    m3, [r4 + r5 + 32]
+    pavgw   m2, m3
+    movu    [r0 + r1 + 32], m2
+
+    movu    m0, [r2 + 64]
+    movu    m1, [r4 + 64]
+    pavgw   m0, m1
+    movu    [r0 + 64], m0
+    movu    m2, [r2 + r3 + 64]
+    movu    m3, [r4 + r5 + 64]
+    pavgw   m2, m3
+    movu    [r0 + r1 + 64], m2
+
+    movu    m0, [r2 + r3 * 2]
+    movu    m1, [r4 + r5 * 2]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2], m0
+    movu    m2, [r2 + r6]
+    movu    m3, [r4 + r7]
+    pavgw   m2, m3
+    movu    [r0 + r8], m2
+
+    movu    m0, [r2 + r3 * 2 + 32]
+    movu    m1, [r4 + r5 * 2 + 32]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2 + 32], m0
+    movu    m2, [r2 + r6 + 32]
+    movu    m3, [r4 + r7 + 32]
+    pavgw   m2, m3
+    movu    [r0 + r8 + 32], m2
+
+    movu    m0, [r2 + r3 * 2 + 64]
+    movu    m1, [r4 + r5 * 2 + 64]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2 + 64], m0
+    movu    m2, [r2 + r6 + 64]
+    movu    m3, [r4 + r7 + 64]
+    pavgw   m2, m3
+    movu    [r0 + r8 + 64], m2
+
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+    dec     r9d
+    jnz     .loop
+    RET
+%endif
+
 %endif ; HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0

