[x265] [arm64] port filterPixelToShort
Pop, Sebastian
spop at amazon.com
Thu Jun 24 02:01:40 UTC 2021
Thanks again, Chen, for your careful review and recommendations.
I added the following change to the attached patch, as it gives better performance:
--- a/source/common/aarch64/ipfilter8.S
+++ b/source/common/aarch64/ipfilter8.S
@@ -35,14 +35,14 @@ function x265_filterPixelToShort_4x4_neon
movi v2.8h, #0xe0, lsl #8
ld1 {v0.s}[0], [x0], x1
ld1 {v0.s}[1], [x0], x1
- ld1 {v1.s}[2], [x0], x1
- ld1 {v1.s}[3], [x0], x1
ushll v3.8h, v0.8b, #6
- ushll2 v4.8h, v1.16b, #6
add v3.8h, v3.8h, v2.8h
- add v4.8h, v4.8h, v2.8h
st1 {v3.d}[0], [x2], x3
st1 {v3.d}[1], [x2], x3
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+ ushll v4.8h, v1.8b, #6
+ add v4.8h, v4.8h, v2.8h
st1 {v4.d}[0], [x2], x3
st1 {v4.d}[1], [x2], x3
ret
Before:
convert_p2s[ 4x4] 1.20x 4.99 6.01
After:
convert_p2s[ 4x4] 1.38x 4.20 5.78
I tried the ldp with post-increment as you recommended.
Performance is slightly lower with the change:
function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
+ sub x1, x1, #0x20
movi v4.8h, #0xe0, lsl #8
mov x9, #\r
.loop_filterP2S_64x\h:
subs x9, x9, #1
.rept 2
- ld1 {v0.16b-v3.16b}, [x0], x1
+ ldp q0, q1, [x0], #0x20
+ ldp q2, q3, [x0]
+ add x0, x0, x1
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
ushll v18.8h, v1.8b, #6
Before:
convert_p2s[64x16] 1.46x 105.52 154.47
convert_p2s[64x32] 1.47x 212.06 312.14
convert_p2s[64x48] 1.47x 318.75 467.61
convert_p2s[64x64] 1.46x 425.61 622.36
After:
convert_p2s[64x16] 1.42x 108.41 154.37
convert_p2s[64x32] 1.45x 215.18 312.12
convert_p2s[64x48] 1.44x 325.01 468.76
convert_p2s[64x64] 1.44x 432.46 622.36
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210624/65464694/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-arm64-port-x265_filterPixelToShort_-_neon.patch
Type: application/octet-stream
Size: 13359 bytes
Desc: 0001-arm64-port-x265_filterPixelToShort_-_neon.patch
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210624/65464694/attachment-0001.obj>
More information about the x265-devel
mailing list