[x265] [PATCH] asm: avx2 10bit code for luma_hpp[8xN]
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Tue May 19 10:57:08 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1432018444 -19800
# Tue May 19 12:24:04 2015 +0530
# Node ID 712f3f1950098d1603a662944359978e19e39752
# Parent d7b100e51e828833eee006f1da93e499ac161d28
asm: avx2 10bit code for luma_hpp[8xN]
avx2:
luma_hpp[ 8x4] 7.30x 507.64 3706.06
luma_hpp[ 8x8] 7.64x 982.30 7503.45
luma_hpp[ 8x16] 7.78x 1898.72 14779.64
luma_hpp[ 8x32] 7.93x 3778.05 29954.26
sse4:
luma_hpp[ 8x4] 4.34x 877.69 3806.35
luma_hpp[ 8x8] 4.45x 1702.32 7569.03
luma_hpp[ 8x16] 4.44x 3335.36 14812.65
luma_hpp[ 8x32] 4.39x 6785.18 29815.67
diff -r d7b100e51e82 -r 712f3f195009 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Tue May 19 12:24:04 2015 +0530
@@ -1388,6 +1388,11 @@
p.pu[LUMA_4x8].luma_hps = x265_interp_8tap_horiz_ps_4x8_avx2;
p.pu[LUMA_4x16].luma_hps = x265_interp_8tap_horiz_ps_4x16_avx2;
+ p.pu[LUMA_8x4].luma_hpp = x265_interp_8tap_horiz_pp_8x4_avx2;
+ p.pu[LUMA_8x8].luma_hpp = x265_interp_8tap_horiz_pp_8x8_avx2;
+ p.pu[LUMA_8x16].luma_hpp = x265_interp_8tap_horiz_pp_8x16_avx2;
+ p.pu[LUMA_8x32].luma_hpp = x265_interp_8tap_horiz_pp_8x32_avx2;
+
if (cpuMask & X265_CPU_BMI2)
p.scanPosLast = x265_scanPosLast_avx2_bmi2;
}
diff -r d7b100e51e82 -r 712f3f195009 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/const-a.asm Tue May 19 12:24:04 2015 +0530
@@ -110,7 +110,7 @@
const pd_4, times 4 dd 4
const pd_8, times 4 dd 8
const pd_16, times 4 dd 16
-const pd_32, times 4 dd 32
+const pd_32, times 8 dd 32 ; 8 dwords (32 bytes) so a full ymm register can be loaded from it; NOTE(review): users loading it with mova into ymm need 32-byte alignment here - confirm
const pd_64, times 4 dd 64
const pd_128, times 4 dd 128
const pd_256, times 4 dd 256
diff -r d7b100e51e82 -r 712f3f195009 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/ipfilter16.asm Tue May 19 12:24:04 2015 +0530
@@ -115,6 +115,9 @@
const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
+const interp8_hpp_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 ; pshufb pattern, lane 0: source words 0-3, then words 1-4
+ db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 ; lane 1: source words 2-5, then words 3-6
+
SECTION .text
cextern pd_32
cextern pw_pixel_max
@@ -859,6 +862,102 @@
movhps [r2 + r3], m3
%endmacro
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W8 1 ; %1 = block height in rows; each row produces 8 output pixels
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_8x%1, 4,6,8 ; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; coeffIdx is read from r4m below
+ add r1d, r1d ; srcStride *= 2 (pixels are 16-bit words); NOTE(review): 32-bit op drops the upper half of the intptr_t stride on x86-64 - assumes a non-negative stride, confirm
+ add r3d, r3d ; dstStride *= 2, same caveat as above
+ sub r0, 6 ; step back 6 bytes = 3 pixels, so the 8-tap window starts at src[-3]
+ mov r4d, r4m ; r4 = coeffIdx
+ shl r4d, 4 ; r4 = coeffIdx * 16: byte offset of the filter entry (two qwords are read from it below)
+%ifdef PIC
+ lea r5, [tab_LumaCoeff] ; table base in a register so it can be combined with the r4 index
+ vpbroadcastq m0, [r5 + r4] ; m0 = first qword of the entry (taps 0-3) repeated in every qword
+ vpbroadcastq m1, [r5 + r4 + 8] ; m1 = second qword of the entry (taps 4-7) repeated in every qword
+%else
+ vpbroadcastq m0, [tab_LumaCoeff + r4] ; m0 = taps 0-3 repeated in every qword
+ vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] ; m1 = taps 4-7 repeated in every qword
+%endif
+ mova m3, [interp8_hpp_shuf] ; m3 = pshufb pattern that builds overlapping 4-pixel windows; NOTE(review): aligned 32-byte load - confirm the table is 32-byte aligned
+ mova m7, [pd_32] ; m7 = 32 in every dword (rounding term for the >> 6 below); NOTE(review): aligned 32-byte load - confirm pd_32 is 32-byte aligned
+ pxor m2, m2 ; m2 = 0, passed to CLIPW as the lower bound
+
+ ; register map
+ ; m0 , m1 interpolate coeff (taps 0-3 , taps 4-7); m2 zero; m3 shuffle pattern; m7 rounding term; m4-m6 scratch
+
+ mov r4d, %1/2 ; r4 = iteration count; the loop body handles two rows
+
+.loop:
+ vbroadcasti128 m4, [r0] ; row 0: window pixels 0-7 copied into both lanes
+ vbroadcasti128 m5, [r0 + 8] ; row 0: window pixels 4-11 copied into both lanes
+ pshufb m4, m3 ; lane 0: px 0-3 | 1-4, lane 1: px 2-5 | 3-6
+ pshufb m5, m3 ; lane 0: px 4-7 | 5-8, lane 1: px 6-9 | 7-10
+
+ pmaddwd m4, m0 ; multiply by taps 0-3, adding adjacent pairs into dwords
+ pmaddwd m5, m1 ; multiply by taps 4-7, adding adjacent pairs into dwords
+ paddd m4, m5 ; outputs 0-3, each still split across two adjacent dwords
+
+ vbroadcasti128 m5, [r0 + 8] ; reload pixels 4-11 (m5 was overwritten above), now as the first half of outputs 4-7
+ vbroadcasti128 m6, [r0 + 16] ; window pixels 8-15
+ pshufb m5, m3
+ pshufb m6, m3
+
+ pmaddwd m5, m0
+ pmaddwd m6, m1
+ paddd m5, m6 ; outputs 4-7, each still split across two adjacent dwords
+
+ phaddd m4, m5 ; add the dword pairs: lane 0 = out 0,1,4,5; lane 1 = out 2,3,6,7
+ vpermq m4, m4, q3120 ; swap the two middle qwords -> out 0..7 in order
+ paddd m4, m7 ; add rounding term
+ psrad m4, 6 ; arithmetic shift right by 6
+
+ packusdw m4, m4 ; dwords -> unsigned-saturated words; each lane holds its four words twice
+ vpermq m4, m4, q2020 ; low 128 bits = out 0-3 followed by out 4-7
+ CLIPW m4, m2, [pw_pixel_max] ; CLIPW is defined elsewhere; presumably clamps each word to [0, pw_pixel_max] - confirm
+ movu [r2], xm4 ; store the 8 pixels (16 bytes) of row 0
+
+ vbroadcasti128 m4, [r0 + r1] ; row 1: same sequence as row 0, reading from src + srcStride
+ vbroadcasti128 m5, [r0 + r1 + 8]
+ pshufb m4, m3
+ pshufb m5, m3
+
+ pmaddwd m4, m0
+ pmaddwd m5, m1
+ paddd m4, m5 ; row 1 outputs 0-3 (split across dword pairs)
+
+ vbroadcasti128 m5, [r0 + r1 + 8]
+ vbroadcasti128 m6, [r0 + r1 + 16]
+ pshufb m5, m3
+ pshufb m6, m3
+
+ pmaddwd m5, m0
+ pmaddwd m6, m1
+ paddd m5, m6 ; row 1 outputs 4-7 (split across dword pairs)
+
+ phaddd m4, m5
+ vpermq m4, m4, q3120
+ paddd m4, m7
+ psrad m4, 6
+
+ packusdw m4, m4
+ vpermq m4, m4, q2020
+ CLIPW m4, m2, [pw_pixel_max]
+ movu [r2 + r3], xm4 ; store the 8 pixels of row 1 at dst + dstStride
+
+ lea r2, [r2 + 2 * r3] ; dst advances two rows
+ lea r0, [r0 + 2 * r1] ; src advances two rows
+ dec r4d
+ jnz .loop ; repeat until all %1/2 row pairs are done
+ RET
+%endmacro
+FILTER_HOR_LUMA_W8 4 ; interp_8tap_horiz_pp_8x4
+FILTER_HOR_LUMA_W8 8 ; interp_8tap_horiz_pp_8x8
+FILTER_HOR_LUMA_W8 16 ; interp_8tap_horiz_pp_8x16
+FILTER_HOR_LUMA_W8 32 ; interp_8tap_horiz_pp_8x32
+
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list