[x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Thu Jun 25 10:50:50 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1435220688 -19800
# Thu Jun 25 13:54:48 2015 +0530
# Node ID c8d1630fc5ccb85aa7d98a198895bad31ccc33b0
# Parent 26e8eff8eb5abc1c2fa5dd94f59f620c6040caf9
asm: pixelavg_pp[8xN] avx2 code for 10bpp
avx2:
avg_pp[ 8x4] 4.39x 145.09 636.75
avg_pp[ 8x8] 5.33x 215.27 1146.55
avg_pp[ 8x16] 6.50x 336.88 2190.68
avg_pp[ 8x32] 7.71x 579.86 4470.84
sse2:
avg_pp[ 8x4] 2.31x 287.63 663.94
avg_pp[ 8x8] 3.26x 370.21 1205.26
avg_pp[ 8x16] 3.99x 581.63 2323.25
avg_pp[ 8x32] 4.78x 995.79 4755.58
diff -r 26e8eff8eb5a -r c8d1630fc5cc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jun 25 13:45:55 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 13:54:48 2015 +0530
@@ -1353,6 +1353,10 @@
p.cu[BLOCK_32x32].intra_pred[33] = PFX(intra_pred_ang32_33_avx2);
p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_2_avx2);
+ p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2);
+ p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2);
+ p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2);
+ p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2);
p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
diff -r 26e8eff8eb5a -r c8d1630fc5cc source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Thu Jun 25 13:45:55 2015 +0530
+++ b/source/common/x86/mc-a.asm Thu Jun 25 13:54:48 2015 +0530
@@ -4439,6 +4439,88 @@
INIT_YMM avx2
PIXEL_AVG_W18
+%macro pixel_avg_W8 0
+ ; Averages four rows of eight 16-bit pixels: [r0] = pavgw([r2], [r4]).
+ ; Expects r1/r3/r5 = dst/src0/src1 row strides as prepared by the caller,
+ ; and r8/r6/r7 = three times r1/r3/r5 (offset of the fourth row).
+ ; src1 is read directly by pavgw from unaligned memory; this relies on the callers assembling under INIT_YMM avx2 (VEX encoding).
+ movu xm0, [r2]                 ; row 0
+ pavgw xm0, [r4]
+ movu [r0], xm0
+ movu xm1, [r2 + r3]            ; row 1
+ pavgw xm1, [r4 + r5]
+ movu [r0 + r1], xm1
+
+ movu xm2, [r2 + r3 * 2]        ; row 2
+ pavgw xm2, [r4 + r5 * 2]
+ movu [r0 + r1 * 2], xm2
+ movu xm3, [r2 + r6]            ; row 3
+ pavgw xm3, [r4 + r7]
+ movu [r0 + r8], xm3
+
+ lea r4, [r4 + 4 * r5]          ; advance src1, src0 and dst by four rows
+ lea r2, [r2 + 4 * r3]
+ lea r0, [r0 + 4 * r1]
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_avg_8x4, 6,9,4    ; 8x4 average; only r0-r8 are used, so r9 is not requested
+ add r1, r1                     ; scale strides by 2 (16-bit pixels); full width because they are intptr_t
+ add r3, r3
+ add r5, r5
+ lea r6, [r3 * 3]               ; r6 = 3 * src0 stride (fourth-row offset used by pixel_avg_W8)
+ lea r7, [r5 * 3]               ; r7 = 3 * src1 stride
+ lea r8, [r1 * 3]               ; r8 = 3 * dst stride
+ pixel_avg_W8                   ; a single expansion covers all four rows
+ RET
+
+cglobal pixel_avg_8x8, 6,10,4   ; 8x8 average; r9 holds the loop counter
+ add r1, r1                     ; scale strides by 2 (16-bit pixels); full width because they are intptr_t
+ add r3, r3
+ add r5, r5
+ lea r6, [r3 * 3]               ; r6 = 3 * src0 stride (fourth-row offset used by pixel_avg_W8)
+ lea r7, [r5 * 3]               ; r7 = 3 * src1 stride
+ lea r8, [r1 * 3]               ; r8 = 3 * dst stride
+ mov r9d, 2                     ; 8 rows / 4 rows per pixel_avg_W8 expansion
+.loop:
+ pixel_avg_W8                   ; averages four rows and advances r0, r2, r4
+ dec r9d
+ jnz .loop
+ RET
+
+cglobal pixel_avg_8x16, 6,10,4  ; 8x16 average; r9 holds the loop counter
+ add r1, r1                     ; scale strides by 2 (16-bit pixels); full width because they are intptr_t
+ add r3, r3
+ add r5, r5
+ lea r6, [r3 * 3]               ; r6 = 3 * src0 stride (fourth-row offset used by pixel_avg_W8)
+ lea r7, [r5 * 3]               ; r7 = 3 * src1 stride
+ lea r8, [r1 * 3]               ; r8 = 3 * dst stride
+ mov r9d, 4                     ; 16 rows / 4 rows per pixel_avg_W8 expansion
+.loop:
+ pixel_avg_W8                   ; averages four rows and advances r0, r2, r4
+ dec r9d
+ jnz .loop
+ RET
+
+cglobal pixel_avg_8x32, 6,10,4  ; 8x32 average; r9 holds the loop counter
+ add r1, r1                     ; scale strides by 2 (16-bit pixels); full width because they are intptr_t
+ add r3, r3
+ add r5, r5
+ lea r6, [r3 * 3]               ; r6 = 3 * src0 stride (fourth-row offset used by pixel_avg_W8)
+ lea r7, [r5 * 3]               ; r7 = 3 * src1 stride
+ lea r8, [r1 * 3]               ; r8 = 3 * dst stride
+ mov r9d, 8                     ; 32 rows / 4 rows per pixel_avg_W8 expansion
+.loop:
+ pixel_avg_W8                   ; averages four rows and advances r0, r2, r4
+ dec r9d
+ jnz .loop
+ RET
+%endif
+
%macro pixel_avg_H4 0
movu m0, [r2]
movu m1, [r4]
More information about the x265-devel
mailing list