[x265] [PATCH] asm: pixelavg_pp[16xN] avx2 code for 10bpp

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Mon Jun 22 16:29:58 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1434981872 -19800
#      Mon Jun 22 19:34:32 2015 +0530
# Node ID d4c7638a0d5b842ca2657969b0f1a2bcd8a82d0b
# Parent  83a7d824442455ba5e0a6b53ea68e6b7043845de
asm: pixelavg_pp[16xN] avx2 code for 10bpp

avx2:
avg_pp[ 16x4]  9.60x    140.07          1344.66
avg_pp[ 16x8]  12.90x   200.11          2580.72
avg_pp[16x12]  14.62x   265.30          3878.63
avg_pp[16x16]  15.00x   339.53          5094.42
avg_pp[16x32]  17.80x   578.67          10300.56
avg_pp[16x64]  19.37x   1050.96         20357.99

sse2:
avg_pp[ 16x4]  7.87x    170.18          1339.60
avg_pp[ 16x8]  8.22x    313.15          2575.54
avg_pp[16x12]  9.78x    394.35          3856.47
avg_pp[16x16]  10.41x   486.99          5070.16
avg_pp[16x32]  11.34x   902.48          10236.26
avg_pp[16x64]  11.96x   1686.64         20171.16

diff -r 83a7d8244424 -r d4c7638a0d5b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jun 22 19:34:32 2015 +0530
@@ -1343,6 +1343,13 @@
         p.cu[BLOCK_32x32].intra_pred[33]    = PFX(intra_pred_ang32_33_avx2);
         p.cu[BLOCK_32x32].intra_pred[34]    = PFX(intra_pred_ang32_2_avx2);
 
+        p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
+        p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
+        p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
+        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
+        p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
+        p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
+
         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_avx2);
         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_avx2);
         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_avx2);
diff -r 83a7d8244424 -r d4c7638a0d5b source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/mc-a.asm	Mon Jun 22 19:34:32 2015 +0530
@@ -4439,6 +4439,145 @@
 INIT_YMM avx2
 PIXEL_AVG_W18
 
+%macro  pixel_avg_H4 0
+    movu    m0, [r2]
+    movu    m1, [r4]
+    pavgw   m0, m1
+    movu    [r0], m0
+    movu    m2, [r2 + r3]
+    movu    m3, [r4 + r5]
+    pavgw   m2, m3
+    movu    [r0 + r1], m2
+
+    movu    m0, [r2 + r3 * 2]
+    movu    m1, [r4 + r5 * 2]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2], m0
+    movu    m2, [r2 + r6]
+    movu    m3, [r4 + r7]
+    pavgw   m2, m3
+    movu    [r0 + r8], m2
+
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_avg_16x4, 6,9,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    pixel_avg_H4
+    RET
+
+cglobal pixel_avg_16x8, 6,9,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    pixel_avg_H4
+    pixel_avg_H4
+    RET
+
+cglobal pixel_avg_16x12, 6,9,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    pixel_avg_H4
+    pixel_avg_H4
+    pixel_avg_H4
+    RET
+%endif
+
+%macro  pixel_avg_H16 0
+    movu    m0, [r2]
+    movu    m1, [r4]
+    pavgw   m0, m1
+    movu    [r0], m0
+    movu    m2, [r2 + r3]
+    movu    m3, [r4 + r5]
+    pavgw   m2, m3
+    movu    [r0 + r1], m2
+
+    movu    m0, [r2 + r3 * 2]
+    movu    m1, [r4 + r5 * 2]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2], m0
+    movu    m2, [r2 + r6]
+    movu    m3, [r4 + r7]
+    pavgw   m2, m3
+    movu    [r0 + r8], m2
+
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_avg_16x16, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 4
+.loop
+    pixel_avg_H16
+    dec r9d
+    jnz .loop
+    RET
+
+cglobal pixel_avg_16x32, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 4
+.loop
+    pixel_avg_H16
+    pixel_avg_H16
+    dec r9d
+    jnz .loop
+    RET
+
+cglobal pixel_avg_16x64, 6,10,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+    mov     r9d, 4
+.loop
+    pixel_avg_H16
+    pixel_avg_H16
+    pixel_avg_H16
+    pixel_avg_H16
+    dec r9d
+    jnz .loop
+    RET
+%endif
+
 %endif ; HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0


More information about the x265-devel mailing list