[x265] [arm64] port filterPixelToShort
Pop, Sebastian
spop at amazon.com
Thu Jun 24 02:25:22 UTC 2021
I added this change to the patch.
The same re-scheduling with ld1 still has better performance than with ldp:
With ld1 instead of ldp+ldp+add:
convert_p2s[64x16] 1.53x 100.98 154.35
convert_p2s[64x32] 1.54x 203.14 313.34
convert_p2s[64x48] 1.54x 304.66 468.15
convert_p2s[64x64] 1.54x 407.16 625.61
function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
- sub x1, x1, #0x20
movi v4.8h, #0xe0, lsl #8
mov x9, #\r
.loop_filterP2S_64x\h:
subs x9, x9, #1
.rept 2
- ldp q0, q1, [x0], #0x20
+ ld1 {v0.16b-v3.16b}, [x0], x1
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
ushll v18.8h, v1.8b, #6
@@ -194,8 +193,6 @@ function x265_filterPixelToShort_64x\h\()_neon
add v19.8h, v19.8h, v4.8h
st1 {v16.16b-v19.16b}, [x2], #0x40
- ldp q2, q3, [x0]
- add x0, x0, x1
ushll v20.8h, v2.8b, #6
ushll2 v21.8h, v2.16b, #6
ushll v22.8h, v3.8b, #6
I also applied the same scheduling to the other functions:
Before:
convert_p2s[ 4x4] 1.36x 4.44 6.04
convert_p2s[ 4x8] 1.79x 6.39 11.43
convert_p2s[ 4x16] 1.70x 13.99 23.72
convert_p2s[ 8x4] 2.21x 3.00 6.63
convert_p2s[ 8x8] 2.26x 6.06 13.72
convert_p2s[ 8x16] 1.95x 13.31 25.95
convert_p2s[ 8x32] 1.63x 29.09 47.49
After:
convert_p2s[ 4x4] 1.39x 4.31 5.98
convert_p2s[ 4x8] 1.88x 6.10 11.43
convert_p2s[ 4x16] 1.69x 14.01 23.73
convert_p2s[ 8x4] 2.27x 2.91 6.59
convert_p2s[ 8x8] 2.34x 5.84 13.67
convert_p2s[ 8x16] 1.95x 13.31 25.94
convert_p2s[ 8x32] 1.60x 29.57 47.24
From: x265-devel <x265-devel-bounces at videolan.org> on behalf of "Pop, Sebastian" <spop at amazon.com>
Reply-To: Development for x265 <x265-devel at videolan.org>
Date: Wednesday, June 23, 2021 at 9:12 PM
To: Development for x265 <x265-devel at videolan.org>
Subject: Re: [x265] [arm64] port filterPixelToShort
I added the following change in the attached patch.
It has better performance with ldp, as that allows the instructions to be re-scheduled independently:
function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
+ sub x1, x1, #0x20
movi v4.8h, #0xe0, lsl #8
mov x9, #\r
.loop_filterP2S_64x\h:
subs x9, x9, #1
.rept 2
- ld1 {v0.16b-v3.16b}, [x0], x1
+ ldp q0, q1, [x0], #0x20
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
ushll v18.8h, v1.8b, #6
ushll2 v19.8h, v1.16b, #6
- ushll v20.8h, v2.8b, #6
- ushll2 v21.8h, v2.16b, #6
- ushll v22.8h, v3.8b, #6
- ushll2 v23.8h, v3.16b, #6
add v16.8h, v16.8h, v4.8h
add v17.8h, v17.8h, v4.8h
add v18.8h, v18.8h, v4.8h
add v19.8h, v19.8h, v4.8h
+ st1 {v16.16b-v19.16b}, [x2], #0x40
+
+ ldp q2, q3, [x0]
+ add x0, x0, x1
+ ushll v20.8h, v2.8b, #6
+ ushll2 v21.8h, v2.16b, #6
+ ushll v22.8h, v3.8b, #6
+ ushll2 v23.8h, v3.16b, #6
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v4.8h
add v22.8h, v22.8h, v4.8h
add v23.8h, v23.8h, v4.8h
- st1 {v16.16b-v19.16b}, [x2], #0x40
st1 {v20.16b-v23.16b}, [x2], x3
.endr
bgt .loop_filterP2S_64x\h
Before:
convert_p2s[64x16] 1.46x 105.51 154.37
convert_p2s[64x32] 1.47x 212.07 312.12
convert_p2s[64x48] 1.46x 318.76 466.80
convert_p2s[64x64] 1.47x 425.34 623.56
After:
convert_p2s[64x16] 1.47x 105.24 154.46
convert_p2s[64x32] 1.50x 207.42 312.09
convert_p2s[64x48] 1.49x 312.30 466.27
convert_p2s[64x64] 1.50x 415.77 623.56
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210624/f1449503/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-arm64-port-x265_filterPixelToShort_-_neon.patch
Type: application/octet-stream
Size: 13359 bytes
Desc: 0001-arm64-port-x265_filterPixelToShort_-_neon.patch
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210624/f1449503/attachment-0001.obj>
More information about the x265-devel
mailing list