[x265] [PATCH] asm: avx2 10bit code for luma_hpp[48x64] (82440.47 -> 44731.61)
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Tue May 19 10:59:22 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1432024172 -19800
# Tue May 19 13:59:32 2015 +0530
# Node ID 6fad8107d1a6bebf92d7b38e57528b3cedf5cbd6
# Parent 9d394ee847ae33abb2a3ae06bf934eb5ebac3d03
asm: avx2 10bit code for luma_hpp[48x64] (82440.47 -> 44731.61)
diff -r 9d394ee847ae -r 6fad8107d1a6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue May 19 14:11:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue May 19 13:59:32 2015 +0530
@@ -1409,6 +1409,7 @@
p.pu[LUMA_64x64].luma_hpp = x265_interp_8tap_horiz_pp_64x64_avx2;
p.pu[LUMA_12x16].luma_hpp = x265_interp_8tap_horiz_pp_12x16_avx2;
p.pu[LUMA_24x32].luma_hpp = x265_interp_8tap_horiz_pp_24x32_avx2;
+ p.pu[LUMA_48x64].luma_hpp = x265_interp_8tap_horiz_pp_48x64_avx2;
if (cpuMask & X265_CPU_BMI2)
p.scanPosLast = x265_scanPosLast_avx2_bmi2;
diff -r 9d394ee847ae -r 6fad8107d1a6 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue May 19 14:11:35 2015 +0530
+++ b/source/common/x86/ipfilter16.asm Tue May 19 13:59:32 2015 +0530
@@ -1370,6 +1370,129 @@
jnz .loop
RET
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_48x64, 4,6,8    ; x86inc: 4 args in regs (r0=src, r1=srcStride, r2=dst, r3=dstStride), 6 GPRs, 8 vector regs
+ add r1d, r1d    ; srcStride *= 2 (presumably pixel units -> bytes for 16-bit samples). NOTE(review): 32-bit add zeroes the upper half on x86-64, so a non-negative stride is assumed - confirm
+ add r3d, r3d    ; dstStride *= 2, same assumption as above
+ sub r0, 6    ; start reading 6 bytes (three 16-bit samples) before src
+ mov r4d, r4m    ; r4d = coeffIdx (5th argument)
+ shl r4d, 4    ; coeffIdx * 16 = byte offset of the entry in tab_LumaCoeff (two 8-byte halves are read below)
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]    ; take the table address into a register so it can be indexed by r4
+ vpbroadcastq m0, [r5 + r4]    ; m0 = first 8 bytes of the entry, repeated in every qword
+ vpbroadcastq m1, [r5 + r4 + 8]    ; m1 = second 8 bytes of the entry, repeated in every qword
+%else
+ vpbroadcastq m0, [tab_LumaCoeff + r4]    ; m0 = first 8 bytes of the entry, repeated in every qword
+ vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]    ; m1 = second 8 bytes of the entry, repeated in every qword
+%endif
+ mova m3, [interp8_hpp_shuf]    ; byte-shuffle mask applied to every loaded window (table defined outside this hunk)
+ mova m7, [pd_32]    ; added before the >> 6 below (presumably 32 per dword, i.e. rounding - confirm against pd_32)
+ pxor m2, m2    ; m2 = 0, passed to CLIPW as the lower bound
+
+ ; register map
+ ; m0 , m1 interpolate coeff (first / second 8 bytes of the selected tab_LumaCoeff entry)
+ ; m2 zero, m3 shuffle mask, m7 pre-shift offset, m4-m6 scratch
+ mov r4d, 64    ; r4d is reused as the row counter: 64 rows
+
+.loop:    ; one iteration per row
+%assign x 0
+%rep 2
+ vbroadcasti128 m4, [r0 + x]    ; 16 source bytes copied into both 128-bit lanes
+ vbroadcasti128 m5, [r0 + 8 + x]    ; same, starting 8 bytes (four samples) further on
+ pshufb m4, m3    ; rearrange bytes per interp8_hpp_shuf
+ pshufb m5, m3
+
+ pmaddwd m4, m0    ; multiply words by the m0 coefficients, add adjacent pairs -> dwords
+ pmaddwd m5, m1    ; same with the m1 coefficients on the window 8 bytes later
+ paddd m4, m5    ; combine the two partial sums
+
+ vbroadcasti128 m5, [r0 + 8 + x]    ; repeat the steps above with both windows moved 8 bytes on
+ vbroadcasti128 m6, [r0 + 16 + x]
+ pshufb m5, m3
+ pshufb m6, m3
+
+ pmaddwd m5, m0
+ pmaddwd m6, m1
+ paddd m5, m6
+
+ phaddd m4, m5    ; add adjacent dwords within each lane: m4 sums then m5 sums -> 8 dword results
+ vpermq m4, m4, q3120    ; swap the two middle qwords (presumably restores sample order after the in-lane phaddd)
+ paddd m4, m7    ; add the pd_32 offset
+ psrad m4, 6    ; arithmetic shift right by 6
+
+ packusdw m4, m4    ; dwords -> words with unsigned saturation; each lane's four words appear in both of its halves
+ vpermq m4, m4, q2020    ; gather the low qword of each lane so the low 128 bits hold all 8 words
+ CLIPW m4, m2, [pw_pixel_max]    ; clamp words between m2 (zero) and pw_pixel_max (CLIPW macro defined elsewhere)
+ movu [r2 + x], xm4    ; store 16 bytes (8 samples) at dst + x
+
+ vbroadcasti128 m4, [r0 + 16 + x]    ; second group of 8 samples: same sequence, source offset +16 bytes
+ vbroadcasti128 m5, [r0 + 24 + x]
+ pshufb m4, m3
+ pshufb m5, m3
+
+ pmaddwd m4, m0
+ pmaddwd m5, m1
+ paddd m4, m5
+
+ vbroadcasti128 m5, [r0 + 24 + x]
+ vbroadcasti128 m6, [r0 + 32 + x]
+ pshufb m5, m3
+ pshufb m6, m3
+
+ pmaddwd m5, m0
+ pmaddwd m6, m1
+ paddd m5, m6
+
+ phaddd m4, m5
+ vpermq m4, m4, q3120
+ paddd m4, m7
+ psrad m4, 6
+
+ packusdw m4, m4
+ vpermq m4, m4, q2020
+ CLIPW m4, m2, [pw_pixel_max]
+ movu [r2 + 16 + x], xm4    ; store 16 bytes (8 samples) at dst + 16 + x
+
+ vbroadcasti128 m4, [r0 + 32 + x]    ; third group of 8 samples: same sequence, source offset +32 bytes
+ vbroadcasti128 m5, [r0 + 40 + x]
+ pshufb m4, m3
+ pshufb m5, m3
+
+ pmaddwd m4, m0
+ pmaddwd m5, m1
+ paddd m4, m5
+
+ vbroadcasti128 m5, [r0 + 40 + x]
+ vbroadcasti128 m6, [r0 + 48 + x]
+ pshufb m5, m3
+ pshufb m6, m3
+
+ pmaddwd m5, m0
+ pmaddwd m6, m1
+ paddd m5, m6
+
+ phaddd m4, m5
+ vpermq m4, m4, q3120
+ paddd m4, m7
+ psrad m4, 6
+
+ packusdw m4, m4
+ vpermq m4, m4, q2020
+ CLIPW m4, m2, [pw_pixel_max]
+ movu [r2 + 32 + x], xm4    ; store 16 bytes (8 samples) at dst + 32 + x
+ ; each %rep pass covers 48 bytes (3 stores of 16); two passes write 96 bytes per row
+%assign x x+48
+%endrep
+
+ add r2, r3    ; dst += doubled dstStride
+ add r0, r1    ; src += doubled srcStride
+ dec r4d
+ jnz .loop    ; loop until all 64 rows are done
+ RET
+
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list