[x265] [PATCH] asm: pixelavg_pp[16xN] avx2 code for 10bpp
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Mon Jun 22 16:29:58 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1434981872 -19800
# Mon Jun 22 19:34:32 2015 +0530
# Node ID d4c7638a0d5b842ca2657969b0f1a2bcd8a82d0b
# Parent 83a7d824442455ba5e0a6b53ea68e6b7043845de
asm: pixelavg_pp[16xN] avx2 code for 10bpp
avx2:
avg_pp[ 16x4] 9.60x 140.07 1344.66
avg_pp[ 16x8] 12.90x 200.11 2580.72
avg_pp[16x12] 14.62x 265.30 3878.63
avg_pp[16x16] 15.00x 339.53 5094.42
avg_pp[16x32] 17.80x 578.67 10300.56
avg_pp[16x64] 19.37x 1050.96 20357.99
sse2:
avg_pp[ 16x4] 7.87x 170.18 1339.60
avg_pp[ 16x8] 8.22x 313.15 2575.54
avg_pp[16x12] 9.78x 394.35 3856.47
avg_pp[16x16] 10.41x 486.99 5070.16
avg_pp[16x32] 11.34x 902.48 10236.26
avg_pp[16x64] 11.96x 1686.64 20171.16
diff -r 83a7d8244424 -r d4c7638a0d5b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jun 22 19:34:32 2015 +0530
@@ -1343,6 +1343,13 @@
p.cu[BLOCK_32x32].intra_pred[33] = PFX(intra_pred_ang32_33_avx2);
p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_2_avx2);
+ p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
+ p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
+ p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
+ p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
+ p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
+ p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
+
p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2);
p.pu[LUMA_8x8].addAvg = PFX(addAvg_8x8_avx2);
p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_avx2);
diff -r 83a7d8244424 -r d4c7638a0d5b source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/mc-a.asm Mon Jun 22 19:34:32 2015 +0530
@@ -4439,6 +4439,145 @@
INIT_YMM avx2
PIXEL_AVG_W18
+%macro pixel_avg_H4 0 ; avg 4 rows x 16 words (one ymm/row); r0=dst r2=src0 r4=src1; r6/r7/r8 = 3*stride, precomputed by caller
+ movu m0, [r2] ; src0 row 0
+ movu m1, [r4] ; src1 row 0
+ pavgw m0, m1 ; rounded average of packed 16-bit words
+ movu [r0], m0 ; dst row 0
+ movu m2, [r2 + r3] ; src0 row 1
+ movu m3, [r4 + r5] ; src1 row 1
+ pavgw m2, m3
+ movu [r0 + r1], m2 ; dst row 1
+
+ movu m0, [r2 + r3 * 2] ; src0 row 2
+ movu m1, [r4 + r5 * 2] ; src1 row 2
+ pavgw m0, m1
+ movu [r0 + r1 * 2], m0 ; dst row 2
+ movu m2, [r2 + r6] ; src0 row 3 (r6 = 3*sstride0)
+ movu m3, [r4 + r7] ; src1 row 3 (r7 = 3*sstride1)
+ pavgw m2, m3
+ movu [r0 + r8], m2 ; dst row 3 (r8 = 3*dstride)
+
+ lea r0, [r0 + 4 * r1] ; advance dst by 4 rows
+ lea r2, [r2 + 4 * r3] ; advance src0 by 4 rows
+ lea r4, [r4 + 4 * r5] ; advance src1 by 4 rows
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64 ; needs r6-r8 scratch GPRs, so 64-bit register file only
+INIT_YMM avx2
+cglobal pixel_avg_16x4, 6,9,4 ; 6 args, 9 GPRs (r0-r8), 4 ymm regs
+ add r1d, r1d ; dstride: pixels -> bytes (samples are 16-bit at 10bpp)
+ add r3d, r3d ; sstride0: pixels -> bytes
+ add r5d, r5d ; sstride1: pixels -> bytes
+ lea r6, [r3 * 3] ; r6 = 3 * sstride0
+ lea r7, [r5 * 3] ; r7 = 3 * sstride1
+ lea r8, [r1 * 3] ; r8 = 3 * dstride
+ pixel_avg_H4 ; rows 0-3
+ RET
+
+cglobal pixel_avg_16x8, 6,9,4 ; same setup, 8 rows = 2 macro expansions
+ add r1d, r1d ; dstride in bytes
+ add r3d, r3d ; sstride0 in bytes
+ add r5d, r5d ; sstride1 in bytes
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ pixel_avg_H4 ; rows 0-3
+ pixel_avg_H4 ; rows 4-7
+ RET
+
+cglobal pixel_avg_16x12, 6,9,4 ; 12 rows = 3 macro expansions, fully unrolled
+ add r1d, r1d ; dstride in bytes
+ add r3d, r3d ; sstride0 in bytes
+ add r5d, r5d ; sstride1 in bytes
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ pixel_avg_H4 ; rows 0-3
+ pixel_avg_H4 ; rows 4-7
+ pixel_avg_H4 ; rows 8-11
+ RET
+%endif
+
+%macro pixel_avg_H16 0 ; NOTE(review): byte-identical to pixel_avg_H4 above -- could be consolidated into one macro
+ movu m0, [r2] ; src0 row 0
+ movu m1, [r4] ; src1 row 0
+ pavgw m0, m1 ; rounded average of packed 16-bit words
+ movu [r0], m0 ; dst row 0
+ movu m2, [r2 + r3] ; src0 row 1
+ movu m3, [r4 + r5] ; src1 row 1
+ pavgw m2, m3
+ movu [r0 + r1], m2 ; dst row 1
+
+ movu m0, [r2 + r3 * 2] ; src0 row 2
+ movu m1, [r4 + r5 * 2] ; src1 row 2
+ pavgw m0, m1
+ movu [r0 + r1 * 2], m0 ; dst row 2
+ movu m2, [r2 + r6] ; src0 row 3 (r6 = 3*sstride0)
+ movu m3, [r4 + r7] ; src1 row 3 (r7 = 3*sstride1)
+ pavgw m2, m3
+ movu [r0 + r8], m2 ; dst row 3 (r8 = 3*dstride)
+
+ lea r0, [r0 + 4 * r1] ; advance dst by 4 rows
+ lea r2, [r2 + 4 * r3] ; advance src0 by 4 rows
+ lea r4, [r4 + 4 * r5] ; advance src1 by 4 rows
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64 ; needs r6-r9 scratch GPRs, so 64-bit register file only
+INIT_YMM avx2
+cglobal pixel_avg_16x16, 6,10,4 ; 6 args, 10 GPRs (r0-r9), 4 ymm regs
+ add r1d, r1d ; dstride: pixels -> bytes (samples are 16-bit at 10bpp)
+ add r3d, r3d ; sstride0: pixels -> bytes
+ add r5d, r5d ; sstride1: pixels -> bytes
+ lea r6, [r3 * 3] ; r6 = 3 * sstride0
+ lea r7, [r5 * 3] ; r7 = 3 * sstride1
+ lea r8, [r1 * 3] ; r8 = 3 * dstride
+ mov r9d, 4 ; 4 iterations x 4 rows = 16 rows
+.loop: ; colon added: avoids NASM orphan-label warning
+ pixel_avg_H16
+ dec r9d
+ jnz .loop
+ RET
+
+cglobal pixel_avg_16x32, 6,10,4 ; 4 iterations x 8 rows = 32 rows
+ add r1d, r1d ; dstride in bytes
+ add r3d, r3d ; sstride0 in bytes
+ add r5d, r5d ; sstride1 in bytes
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ mov r9d, 4 ; loop counter
+.loop: ; colon added: avoids NASM orphan-label warning
+ pixel_avg_H16
+ pixel_avg_H16
+ dec r9d
+ jnz .loop
+ RET
+
+cglobal pixel_avg_16x64, 6,10,4 ; 4 iterations x 16 rows = 64 rows
+ add r1d, r1d ; dstride in bytes
+ add r3d, r3d ; sstride0 in bytes
+ add r5d, r5d ; sstride1 in bytes
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ mov r9d, 4 ; loop counter
+.loop: ; colon added: avoids NASM orphan-label warning
+ pixel_avg_H16
+ pixel_avg_H16
+ pixel_avg_H16
+ pixel_avg_H16
+ dec r9d
+ jnz .loop
+ RET
+%endif
+
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
More information about the x265-devel
mailing list