[x265] [arm64] port filterPixelToShort
chen
chenm003 at 163.com
Thu Jun 24 02:16:01 UTC 2021
It looks good to me, thanks.
Btw: ARM64 has new instructions, CBZ / CBNZ.
At 2021-06-24 10:11:32, "Pop, Sebastian" <spop at amazon.com> wrote:
I added the following change in the attached patch.
Performance is better with ldp, as it allows the instructions to be rescheduled independently of each other:
function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
+ sub x1, x1, #0x20
movi v4.8h, #0xe0, lsl #8
mov x9, #\r
.loop_filterP2S_64x\h:
subs x9, x9, #1
.rept 2
- ld1 {v0.16b-v3.16b}, [x0], x1
+ ldp q0, q1, [x0], #0x20
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
ushll v18.8h, v1.8b, #6
ushll2 v19.8h, v1.16b, #6
- ushll v20.8h, v2.8b, #6
- ushll2 v21.8h, v2.16b, #6
- ushll v22.8h, v3.8b, #6
- ushll2 v23.8h, v3.16b, #6
add v16.8h, v16.8h, v4.8h
add v17.8h, v17.8h, v4.8h
add v18.8h, v18.8h, v4.8h
add v19.8h, v19.8h, v4.8h
+ st1 {v16.16b-v19.16b}, [x2], #0x40
+
+ ldp q2, q3, [x0]
+ add x0, x0, x1
+ ushll v20.8h, v2.8b, #6
+ ushll2 v21.8h, v2.16b, #6
+ ushll v22.8h, v3.8b, #6
+ ushll2 v23.8h, v3.16b, #6
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v4.8h
add v22.8h, v22.8h, v4.8h
add v23.8h, v23.8h, v4.8h
- st1 {v16.16b-v19.16b}, [x2], #0x40
st1 {v20.16b-v23.16b}, [x2], x3
.endr
bgt .loop_filterP2S_64x\h
Before:
convert_p2s[64x16] 1.46x 105.51 154.37
convert_p2s[64x32] 1.47x 212.07 312.12
convert_p2s[64x48] 1.46x 318.76 466.80
convert_p2s[64x64] 1.47x 425.34 623.56
After:
convert_p2s[64x16] 1.47x 105.24 154.46
convert_p2s[64x32] 1.50x 207.42 312.09
convert_p2s[64x48] 1.49x 312.30 466.27
convert_p2s[64x64] 1.50x 415.77 623.56
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210624/43ad555f/attachment.html>
More information about the x265-devel
mailing list