[x265] [PATCH] asm: pixelavg_pp[12x16], [24x32] avx2 code for 10bpp

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Wed Jun 24 15:18:44 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1435150086 -19800
#      Wed Jun 24 18:18:06 2015 +0530
# Node ID 9d8d2bf23696f2329aa553604898e460cb10bf84
# Parent  6b51492f87f036b6e58f5a92d2b3e85da4e57906
asm: pixelavg_pp[12x16],[24x32] avx2 code for 10bpp

avx2:
avg_pp[24x32]  14.35x   965.89          13860.97
avg_pp[12x16]  7.78x    487.43          3791.49

sse2:
avg_pp[24x32]  5.49x    2566.36         14091.85
avg_pp[12x16]  4.95x    744.74          3683.95

diff -r 6b51492f87f0 -r 9d8d2bf23696 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jun 24 18:03:54 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jun 24 18:18:06 2015 +0530
@@ -1352,12 +1352,14 @@
         p.cu[BLOCK_32x32].intra_pred[33]    = PFX(intra_pred_ang32_33_avx2);
         p.cu[BLOCK_32x32].intra_pred[34]    = PFX(intra_pred_ang32_2_avx2);
 
+        p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
         p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
         p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
         p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
         p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
         p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
         p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
+        p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_avx2);
         p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
         p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
         p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
diff -r 6b51492f87f0 -r 9d8d2bf23696 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Wed Jun 24 18:03:54 2015 +0530
+++ b/source/common/x86/mc-a.asm	Wed Jun 24 18:18:06 2015 +0530
@@ -4439,6 +4439,57 @@
 INIT_YMM avx2
 PIXEL_AVG_W18
 
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64                             ; body uses GPRs r0-r9
+INIT_YMM avx2
+cglobal pixel_avg_12x16, 6,10,4             ; r0=dst r1=dstride r2=src0 r3=sstride0 r4=src1 r5=sstride1
+    add     r1, r1                          ; double each stride: pixels are 16-bit words, addresses are bytes
+    add     r3, r3                          ; 64-bit adds: strides are intptr_t, a 32-bit add would drop sign/upper bits
+    add     r5, r5
+    lea     r6, [r3 * 3]                    ; r6 = 3 * sstride0 (bytes)
+    lea     r7, [r5 * 3]                    ; r7 = 3 * sstride1 (bytes)
+    lea     r8, [r1 * 3]                    ; r8 = 3 * dstride  (bytes)
+    mov     r9d, 4                          ; 4 iterations x 4 rows = 16 rows
+
+.loop:
+    movu    m0, [r2]                        ; row 0: 32-byte loads, although only 24 bytes (12 pixels) get stored
+    movu    m1, [r4]
+    pavgw   m0, m1                          ; per-word rounded average of src0 and src1
+    movu    [r0], xm0                       ; row 0, pixels 0-7 (low 128-bit lane)
+    movu    m2, [r2 + r3]                   ; row 1
+    movu    m3, [r4 + r5]
+    pavgw   m2, m3
+    movu    [r0 + r1], xm2                  ; row 1, pixels 0-7
+
+    vextracti128 xm0, m0, 1                 ; move the upper 128-bit lane down for the tail store
+    vextracti128 xm2, m2, 1
+    movq    [r0 + 16], xm0                  ; row 0, pixels 8-11 (8 bytes at byte offset 16)
+    movq    [r0 + r1 + 16], xm2             ; row 1, pixels 8-11
+
+    movu    m0, [r2 + r3 * 2]               ; row 2
+    movu    m1, [r4 + r5 * 2]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2], xm0              ; row 2, pixels 0-7
+    movu    m2, [r2 + r6]                   ; row 3 (r6/r7/r8 hold the 3x strides)
+    movu    m3, [r4 + r7]
+    pavgw   m2, m3
+    movu    [r0 + r8], xm2                  ; row 3, pixels 0-7
+
+    vextracti128 xm0, m0, 1
+    vextracti128 xm2, m2, 1
+    movq    [r0 + r1 * 2 + 16], xm0         ; row 2, pixels 8-11
+    movq    [r0 + r8 + 16], xm2             ; row 3, pixels 8-11
+
+    lea     r0, [r0 + 4 * r1]               ; advance dst, src0 and src1 by four rows
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+    dec     r9d
+    jnz     .loop                           ; repeat until all 16 rows are written
+    RET
+%endif
+
 %macro  pixel_avg_H4 0
     movu    m0, [r2]
     movu    m1, [r4]
@@ -4578,6 +4629,65 @@
     RET
 %endif
 
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64                             ; body uses GPRs r0-r9
+INIT_YMM avx2
+cglobal pixel_avg_24x32, 6,10,4             ; r0=dst r1=dstride r2=src0 r3=sstride0 r4=src1 r5=sstride1
+    add     r1, r1                          ; double each stride: pixels are 16-bit words, addresses are bytes
+    add     r3, r3                          ; 64-bit adds: strides are intptr_t, a 32-bit add would drop sign/upper bits
+    add     r5, r5
+    lea     r6, [r3 * 3]                    ; r6 = 3 * sstride0 (bytes)
+    lea     r7, [r5 * 3]                    ; r7 = 3 * sstride1 (bytes)
+    lea     r8, [r1 * 3]                    ; r8 = 3 * dstride  (bytes)
+    mov     r9d, 8                          ; 8 iterations x 4 rows = 32 rows
+
+.loop:
+    movu    m0, [r2]                        ; row 0, pixels 0-15 (32 bytes)
+    movu    m1, [r4]
+    pavgw   m0, m1                          ; per-word rounded average of src0 and src1
+    movu    [r0], m0
+    movu    m2, [r2 + r3]                   ; row 1, pixels 0-15
+    movu    m3, [r4 + r5]
+    pavgw   m2, m3
+    movu    [r0 + r1], m2
+
+    movu    xm0, [r2 + 32]                  ; row 0, pixels 16-23 (16 bytes at byte offset 32)
+    movu    xm1, [r4 + 32]
+    pavgw   xm0, xm1
+    movu    [r0 + 32], xm0
+    movu    xm2, [r2 + r3 + 32]             ; row 1, pixels 16-23
+    movu    xm3, [r4 + r5 + 32]
+    pavgw   xm2, xm3
+    movu    [r0 + r1 + 32], xm2
+
+    movu    m0, [r2 + r3 * 2]               ; row 2, pixels 0-15
+    movu    m1, [r4 + r5 * 2]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2], m0
+    movu    m2, [r2 + r6]                   ; row 3, pixels 0-15 (r6/r7/r8 hold the 3x strides)
+    movu    m3, [r4 + r7]
+    pavgw   m2, m3
+    movu    [r0 + r8], m2
+
+    movu    xm0, [r2 + r3 * 2 + 32]         ; row 2, pixels 16-23
+    movu    xm1, [r4 + r5 * 2 + 32]
+    pavgw   xm0, xm1
+    movu    [r0 + r1 * 2 + 32], xm0
+    movu    xm2, [r2 + r6 + 32]             ; row 3, pixels 16-23
+    movu    xm3, [r4 + r7 + 32]
+    pavgw   xm2, xm3
+    movu    [r0 + r8 + 32], xm2
+
+    lea     r0, [r0 + 4 * r1]               ; advance dst, src0 and src1 by four rows
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+    dec     r9d
+    jnz     .loop                           ; repeat until all 32 rows are written
+    RET
+%endif
+
 %macro  pixel_avg_W32 0
     movu    m0, [r2]
     movu    m1, [r4]


More information about the x265-devel mailing list