[x265] [arm64] port filterPixelToShort
chen
chenm003 at 163.com
Thu Jun 24 02:07:53 UTC 2021
You are welcome.
On your CPU, ldp is still slower, so we can keep the original version and improve it again in the future.
This version looks good to me; thank you for your contribution.
At 2021-06-24 10:01:40, "Pop, Sebastian" <spop at amazon.com> wrote:
Thanks again Chen for your careful review and recommendations.
I added the following change to the attached patch as we get better performance:
--- a/source/common/aarch64/ipfilter8.S
+++ b/source/common/aarch64/ipfilter8.S
@@ -35,14 +35,14 @@ function x265_filterPixelToShort_4x4_neon
movi v2.8h, #0xe0, lsl #8
ld1 {v0.s}[0], [x0], x1
ld1 {v0.s}[1], [x0], x1
- ld1 {v1.s}[2], [x0], x1
- ld1 {v1.s}[3], [x0], x1
ushll v3.8h, v0.8b, #6
- ushll2 v4.8h, v1.16b, #6
add v3.8h, v3.8h, v2.8h
- add v4.8h, v4.8h, v2.8h
st1 {v3.d}[0], [x2], x3
st1 {v3.d}[1], [x2], x3
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+ ushll v4.8h, v1.8b, #6
+ add v4.8h, v4.8h, v2.8h
st1 {v4.d}[0], [x2], x3
st1 {v4.d}[1], [x2], x3
ret
Before:
convert_p2s[ 4x4] 1.20x 4.99 6.01
After:
convert_p2s[ 4x4] 1.38x 4.20 5.78
I tried the ldp with post-increment as you recommended.
Performance is slightly lower with the change:
function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
+ sub x1, x1, #0x20
movi v4.8h, #0xe0, lsl #8
mov x9, #\r
.loop_filterP2S_64x\h:
subs x9, x9, #1
.rept 2
- ld1 {v0.16b-v3.16b}, [x0], x1
+ ldp q0, q1, [x0], #0x20
+ ldp q0, q1, [x0]
+ add x0, x0, x1
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
ushll v18.8h, v1.8b, #6
Before:
convert_p2s[64x16] 1.46x 105.52 154.47
convert_p2s[64x32] 1.47x 212.06 312.14
convert_p2s[64x48] 1.47x 318.75 467.61
convert_p2s[64x64] 1.46x 425.61 622.36
After:
convert_p2s[64x16] 1.42x 108.41 154.37
convert_p2s[64x32] 1.45x 215.18 312.12
convert_p2s[64x48] 1.44x 325.01 468.76
convert_p2s[64x64] 1.44x 432.46 622.36
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210624/c670f7e5/attachment.html>
More information about the x265-devel
mailing list