[x265] [PATCH] asm: new AVX2 algorithm on interp_4tap_horiz_ps_32x32, 3664c -> 3362c
Min Chen
chenm003 at 163.com
Sat Aug 29 03:19:48 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1440809514 25200
# Node ID 8212b13bb4435124d15ced73e850444dce59e39b
# Parent d8091487bc9749e702c468786b0cd9e663478a91
asm: new AVX2 algorithm on interp_4tap_horiz_ps_32x32, 3664c -> 3362c
---
source/common/x86/ipfilter8.asm | 90 +++++++++++++++++---------------------
1 files changed, 40 insertions(+), 50 deletions(-)
diff -r d8091487bc97 -r 8212b13bb443 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Aug 25 16:39:12 2015 -0700
+++ b/source/common/x86/ipfilter8.asm Fri Aug 28 17:51:54 2015 -0700
@@ -25016,67 +25016,57 @@
; void interp_4tap_horiz_ps_32x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------;
INIT_YMM avx2
-cglobal interp_4tap_horiz_ps_32x32, 4,7,6
- mov r4d, r4m
- mov r5d, r5m
- add r3d, r3d
-
-%ifdef PIC
- lea r6, [tab_ChromaCoeff]
- vpbroadcastd m0, [r6 + r4 * 4]
-%else
- vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- vbroadcasti128 m2, [pw_1]
- vbroadcasti128 m5, [pw_2000]
- mova m1, [tab_Tm]
-
- ; register map
- ; m0 - interpolate coeff
- ; m1 - shuffle order table
- ; m2 - constant word 1
- mov r6d, 32
- dec r0
- test r5d, r5d
- je .loop
- sub r0 , r1
- add r6d , 3
+cglobal interp_4tap_horiz_ps_32x32, 4,6,8
+ mov r4d, r4m ; r4d = coeffIdx (used below to index tab_ChromaCoeff)
+ add r3d, r3d ; r3 = 2 * dstStride; presumably converts an int16_t element stride to bytes - TODO confirm
+ dec r0 ; src -= 1, so the loads at [r0 + 0..3] cover src[x-1] .. src[x+2]
+
+ ; check isRowExt: the flags set here are consumed by 'je .loop' below,
+ cmp r5m, byte 0 ; so nothing in between may modify EFLAGS. NOTE(review): if r5m expands to a memory operand, 'byte' may make this a byte-wide compare - confirm
+
+ lea r5, [tab_ChromaCoeff] ; r5 = coefficient table base (lea leaves the flags untouched)
+ vpbroadcastw m0, [r5 + r4 * 4 + 0] ; bytes 0-1 of the 4-byte entry for coeffIdx, replicated to every word
+ vpbroadcastw m1, [r5 + r4 * 4 + 2] ; bytes 2-3 of the same entry, replicated to every word
+ mova m7, [pw_2000] ; constant subtracted from every 16-bit result before the store
+
+ ; register map
+ ; m0 - interpolate coeff Low (byte pair applied to pixels x, x+1 relative to r0)
+ ; m1 - interpolate coeff High (byte pair applied to pixels x+2, x+3 relative to r0)
+ ; m7 - constant pw_2000
+ mov r4d, 32 ; r4d = row counter; coeffIdx is no longer needed (mov leaves the flags untouched)
+ je .loop ; isRowExt == 0: process exactly 32 rows
+ sub r0, r1 ; isRowExt != 0: start one source row earlier (r1 = srcStride)
+ add r4d, 3 ; ... and process 3 extra rows, 35 in total
.loop
; Row 0
- vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
- pshufb m3, m1
- pmaddubsw m3, m0
- pmaddwd m3, m2
- vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
- pshufb m4, m1
- pmaddubsw m4, m0
- pmaddwd m4, m2
-
- packssdw m3, m4
- psubw m3, m5
- vpermq m3, m3, 11011000b
- movu [r2], m3
-
- vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
- pshufb m3, m1
- pmaddubsw m3, m0
- pmaddwd m3, m2
- vbroadcasti128 m4, [r0 + 24] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
- pshufb m4, m1
- pmaddubsw m4, m0
- pmaddwd m4, m2
-
- packssdw m3, m4
- psubw m3, m5
- vpermq m3, m3, 11011000b
- movu [r2 + 32], m3
-
- add r2, r3
- add r0, r1
- dec r6d
- jnz .loop
+ movu m2, [r0] ; 32 source bytes p[0..31] (p = r0, i.e. one byte before the current row start)
+ movu m3, [r0 + 1] ; p[1..32]
+ punpckhbw m4, m2, m3 ; byte pairs (p[x], p[x+1]) for x = 8..15 (low lane) and 24..31 (high lane)
+ punpcklbw m2, m3 ; byte pairs (p[x], p[x+1]) for x = 0..7 (low lane) and 16..23 (high lane)
+ pmaddubsw m4, m0 ; per word: p[x]*c0 + p[x+1]*c1 (unsigned pixel bytes times signed coeff bytes)
+ pmaddubsw m2, m0 ; same for the low-half pairs
+
+ movu m3, [r0 + 2] ; p[2..33]
+ movu m5, [r0 + 3] ; p[3..34]
+ punpckhbw m6, m3, m5 ; byte pairs (p[x+2], p[x+3]) for x = 8..15 and 24..31
+ punpcklbw m3, m5 ; byte pairs (p[x+2], p[x+3]) for x = 0..7 and 16..23
+ pmaddubsw m6, m1 ; per word: p[x+2]*c2 + p[x+3]*c3
+ pmaddubsw m3, m1 ; same for the low-half pairs
+
+ paddw m4, m6 ; full 4-tap sums for x = 8..15 | 24..31
+ paddw m2, m3 ; full 4-tap sums for x = 0..7 | 16..23
+ psubw m4, m7 ; subtract the pw_2000 constant from each word
+ psubw m2, m7 ; subtract the pw_2000 constant from each word
+ vperm2i128 m3, m2, m4, 0x20 ; m3 = low lanes of m2 and m4 -> results for x = 0..15 in order
+ vperm2i128 m5, m2, m4, 0x31 ; m5 = high lanes of m2 and m4 -> results for x = 16..31 in order
+ movu [r2], m3 ; store 16 int16 results for x = 0..15
+ movu [r2 + mmsize], m5 ; store results for x = 16..31 (mmsize presumably 32 under INIT_YMM, matching the old '+ 32')
+
+ add r2, r3 ; advance dst by one row (r3 was doubled above)
+ add r0, r1 ; advance src by one row
+ dec r4d ; one fewer row remaining
+ jnz .loop ; repeat until the row counter reaches zero
RET
;-----------------------------------------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list