[x265] [PATCH 3 of 3] asm: rewrite interpolate hps width of [32, 48, 64], improve ~20%
Min Chen
chenm003 at 163.com
Tue Apr 12 19:31:02 CEST 2016
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1460482251 18000
# Node ID f74e220607e15ea4c00645e59055996767303aaa
# Parent 37e80d50caf51a74e85c83f24317935171a5d375
asm: rewrite interpolate hps for widths [32,48,64], improve ~20%
OLD (columns: primitive, speedup over C, optimized time, C reference time):
luma_hps[32x32] 6.32x 16429.69 103771.02
luma_hps[32x16] 6.04x 10121.56 61140.21
luma_hps[32x64] 6.47x 30813.70 199438.95
luma_hps[32x24] 6.23x 13277.26 82747.75
luma_hps[48x64] 6.13x 46002.25 282176.44
luma_hps[64x64] 6.15x 61393.88 377670.03
luma_hps[64x32] 6.79x 33001.77 224096.58
luma_hps[64x48] 6.21x 47242.66 293529.16
luma_hps[64x16] 6.51x 19207.61 125016.56
NEW (columns: primitive, speedup over C, optimized time, C reference time):
luma_hps[32x32] 7.66x 13404.22 102730.96
luma_hps[32x16] 7.32x 8355.57 61133.25
luma_hps[32x64] 7.68x 24496.17 188086.11
luma_hps[32x24] 8.00x 10879.09 87077.93
luma_hps[48x64] 7.62x 37094.37 282758.94
luma_hps[64x64] 7.82x 48535.86 379390.78
luma_hps[64x32] 7.91x 26512.17 209755.50
luma_hps[64x48] 8.06x 37020.63 298498.28
luma_hps[64x16] 7.95x 15479.03 123132.41
---
source/common/x86/ipfilter16.asm | 100 +++++++++++++++++++-------------------
1 files changed, 50 insertions(+), 50 deletions(-)
diff -r 37e80d50caf5 -r f74e220607e1 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue Apr 12 12:30:48 2016 -0500
+++ b/source/common/x86/ipfilter16.asm Tue Apr 12 12:30:51 2016 -0500
@@ -116,6 +116,7 @@
dw -1, 4, -11, 40, 40, -11, 4, -1
dw 0, 1, -5, 17, 58, -10, 4, -1
+ALIGN 32
tab_LumaCoeffV: times 4 dw 0, 0
times 4 dw 0, 64
times 4 dw 0, 0
@@ -161,9 +162,8 @@
const interp8_hpp_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
-const pb_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
- db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
-
+const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+ db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
SECTION .text
cextern pd_8
@@ -10407,7 +10407,7 @@
vpbroadcastq m0, [tab_LumaCoeff + r4]
vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -10475,7 +10475,7 @@
vpbroadcastq m0, [tab_LumaCoeff + r4]
vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -10536,16 +10536,16 @@
add r3d, r3d
mov r4d, r4m
mov r5d, r5m
- shl r4d, 4
-%ifdef PIC
- lea r6, [tab_LumaCoeff]
- vpbroadcastq m0, [r6 + r4]
- vpbroadcastq m1, [r6 + r4 + 8]
-%else
- vpbroadcastq m0, [tab_LumaCoeff + r4]
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
-%endif
- mova m3, [pb_shuf]
+ shl r4d, 6
+%ifdef PIC
+ lea r6, [tab_LumaCoeffV]
+ movu m0, [r6 + r4]
+ movu m1, [r6 + r4 + mmsize]
+%else
+ movu m0, [tab_LumaCoeffV + r4]
+ movu m1, [tab_LumaCoeffV + r4 + mmsize]
+%endif
+ mova m3, [interp8_hpp_shuf_new]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -10554,7 +10554,7 @@
sub r0, 6
test r5d, r5d
mov r4d, %2
- jz .loop0
+ jz .loop0
lea r6, [r1*3]
sub r0, r6
add r4d, 7
@@ -10563,64 +10563,64 @@
%assign x 0
%rep %1/16
vbroadcasti128 m4, [r0 + x]
- vbroadcasti128 m5, [r0 + 8 + x]
+ vbroadcasti128 m5, [r0 + 4 * SIZEOF_PIXEL + x]
pshufb m4, m3
- pshufb m7, m5, m3
+ pshufb m5, m3
pmaddwd m4, m0
- pmaddwd m7, m1
+ pmaddwd m7, m5, m1
paddd m4, m7
+ vextracti128 xm7, m4, 1
+ paddd xm4, xm7
+ paddd xm4, xm2
+ psrad xm4, INTERP_SHIFT_PS
vbroadcasti128 m6, [r0 + 16 + x]
+ pshufb m6, m3
+
+ pmaddwd m5, m0
+ pmaddwd m7, m6, m1
+ paddd m5, m7
+ vextracti128 xm7, m5, 1
+ paddd xm5, xm7
+ paddd xm5, xm2
+ psrad xm5, INTERP_SHIFT_PS
+
+ packssdw xm4, xm5
+ movu [r2 + x], xm4
+
+ vbroadcasti128 m5, [r0 + 24 + x]
pshufb m5, m3
- pshufb m7, m6, m3
+
+ pmaddwd m6, m0
+ pmaddwd m7, m5, m1
+ paddd m6, m7
+ vextracti128 xm7, m6, 1
+ paddd xm6, xm7
+ paddd xm6, xm2
+ psrad xm6, INTERP_SHIFT_PS
+
+ vbroadcasti128 m7, [r0 + 32 + x]
+ pshufb m7, m3
pmaddwd m5, m0
pmaddwd m7, m1
paddd m5, m7
-
- phaddd m4, m5
- vpermq m4, m4, q3120
- paddd m4, m2
- vextracti128 xm5,m4, 1
- psrad xm4, INTERP_SHIFT_PS
+ vextracti128 xm7, m5, 1
+ paddd xm5, xm7
+ paddd xm5, xm2
psrad xm5, INTERP_SHIFT_PS
- packssdw xm4, xm5
-
- movu [r2 + x], xm4
-
- vbroadcasti128 m5, [r0 + 24 + x]
- pshufb m6, m3
- pshufb m7, m5, m3
-
- pmaddwd m6, m0
- pmaddwd m7, m1
- paddd m6, m7
-
- vbroadcasti128 m7, [r0 + 32 + x]
- pshufb m5, m3
- pshufb m7, m3
-
- pmaddwd m5, m0
- pmaddwd m7, m1
- paddd m5, m7
-
- phaddd m6, m5
- vpermq m6, m6, q3120
- paddd m6, m2
- vextracti128 xm5,m6, 1
- psrad xm6, INTERP_SHIFT_PS
- psrad xm5, INTERP_SHIFT_PS
+
packssdw xm6, xm5
-
movu [r2 + 16 + x], xm6
- %assign x x+32
- %endrep
+
+%assign x x+32
+%endrep
add r2, r3
add r0, r1
dec r4d
- jnz .loop0
+ jnz .loop0
RET
%endif
%endmacro
@@ -10656,7 +10656,7 @@
vpbroadcastq m0, [tab_LumaCoeff + r4]
vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -10749,7 +10749,7 @@
vpbroadcastq m0, [tab_LumaCoeff + r4]
vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -10824,7 +10824,7 @@
%else
vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -10883,7 +10883,7 @@
%else
vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -10956,7 +10956,7 @@
%else
vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -11038,7 +11038,7 @@
%else
vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -11103,7 +11103,7 @@
%else
vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -11204,7 +11204,7 @@
%else
vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -11357,7 +11357,7 @@
%else
vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
@@ -11477,7 +11477,7 @@
%else
vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
%endif
- mova m3, [pb_shuf]
+ mova m3, [interp8_hpp_shuf]
vbroadcasti128 m2, [INTERP_OFFSET_PS]
; register map
More information about the x265-devel mailing list