[x265] [arm64] port filterPixelToShort
chen
chenm003 at 163.com
Thu Jun 24 01:36:31 UTC 2021
Thanks for your response; comments are inline.
At 2021-06-24 08:57:20, "Pop, Sebastian" <spop at amazon.com> wrote:
Hi Chen,
Thanks for your review!
> +function x265_filterPixelToShort_4x4_neon
> + add x3, x3, x3
> + movi v2.8h, #0xe0, lsl #8
> Doesn't your compiler handle the constant 0xe000 automatically? That would be more readable.
GNU assembler errors with that immediate:
ipfilter8.S:35: Error: immediate value out of range -128 to 255 at operand 2 -- `movi v2.8h,#0xe000'
Looks like an old binutils issue, so we can keep your original version.
>
> + ld1 {v0.s}[0], [x0], x1
> + ld1 {v0.s}[1], [x0], x1
> + ld1 {v1.s}[2], [x0], x1
> Why not v0.s?
It is slightly faster to use an independent register for the upper part:
when using {v0.s}[2] and {v0.s}[3]
convert_p2s[ 4x4] 1.13x 5.35 6.03
performance is lower than when using {v1.s}[2] and {v1.s}[3]
convert_p2s[ 4x4] 1.21x 4.99 6.03
Yes, here an independent register may be faster, but we could load into the lower part and ushll later; writing directly into the high part of a single register may create a false register dependency chain.
>
> + ld1 {v1.s}[3], [x0], x1
>
> +.macro filterPixelToShort_32xN h
> +function x265_filterPixelToShort_32x\h\()_neon
> + add x3, x3, x3
> + movi v6.8h, #0xe0, lsl #8
> +.rept \h
> + ld1 {v0.16b-v1.16b}, [x0], x1
> ldp may provide more bandwidth
ld1 could be replaced with ldp + add, like this:
ldp q0, q1, [x0]
add x0, x0, x1
convert_p2s[ 32x8] 1.39x 26.62 37.07
convert_p2s[32x16] 1.42x 53.19 75.58
convert_p2s[32x24] 1.41x 80.23 113.11
convert_p2s[32x32] 1.42x 107.08 151.63
convert_p2s[32x64] 1.41x 215.11 303.37
Performance with ldp + add is lower than with ld1:
convert_p2s[ 32x8] 1.48x 25.00 37.06
convert_p2s[32x16] 1.49x 50.64 75.56
convert_p2s[32x24] 1.48x 76.46 113.31
convert_p2s[32x32] 1.49x 101.97 151.63
convert_p2s[32x64] 1.48x 205.15 303.31
An ldp immediately followed by an add may cause a pipeline stall or a similar issue; if there is no better choice, we can keep the original version.
>
> +.macro filterPixelToShort_64xN h
> +function x265_filterPixelToShort_64x\h\()_neon
> + add x3, x3, x3
> + sub x3, x3, #0x40
> + movi v4.8h, #0xe0, lsl #8
> +.rept \h
> I guess fully unrolling by N is not a good idea, because the code section becomes too large, which will most probably cause instruction-cache flushes and misses.
Performance is slightly lower with a loop, i.e., with this change:
--- a/source/common/aarch64/ipfilter8.S
+++ b/source/common/aarch64/ipfilter8.S
@@ -173,12 +173,15 @@ filterPixelToShort_32xN 24
filterPixelToShort_32xN 32
filterPixelToShort_32xN 64
-.macro filterPixelToShort_64xN h
+.macro filterPixelToShort_64xN h r
function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
movi v4.8h, #0xe0, lsl #8
-.rept \h
+ mov x9, #\r
+.loop_filterP2S_64x\h:
+ subs x9, x9, #1
+.rept 2
ld1 {v0.16b-v3.16b}, [x0], x1
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
@@ -199,14 +202,15 @@ function x265_filterPixelToShort_64x\h\()_neon
st1 {v16.16b-v19.16b}, [x2], #0x40
st1 {v20.16b-v23.16b}, [x2], x3
.endr
+ bgt .loop_filterP2S_64x\h
ret
endfunc
.endm
-filterPixelToShort_64xN 16
-filterPixelToShort_64xN 32
-filterPixelToShort_64xN 48
-filterPixelToShort_64xN 64
+filterPixelToShort_64xN 16 8
+filterPixelToShort_64xN 32 16
+filterPixelToShort_64xN 48 24
+filterPixelToShort_64xN 64 32
.macro qpel_filter_0_32b
movi v24.8h, #64
With the above change adding a loop I get
convert_p2s[64x16] 1.46x 105.52 154.34
convert_p2s[64x32] 1.47x 212.07 311.71
convert_p2s[64x48] 1.47x 318.75 468.04
convert_p2s[64x64] 1.46x 425.61 622.25
whereas with the fully unrolled version performance is slightly higher:
convert_p2s[64x16] 1.48x 104.14 154.36
convert_p2s[64x32] 1.49x 209.43 312.13
convert_p2s[64x48] 1.48x 315.33 466.37
convert_p2s[64x64] 1.49x 420.45 624.63
I do not have a preference for this one, so I will follow your recommendations.
Please let me know if I need to amend the patch to add the loop.
Our testbench is a small piece of code, so it does not report many cache misses; in a real system, a large function is a big problem.
>
> + ld1 {v0.16b-v3.16b}, [x0], x1
> + ushll v16.8h, v0.8b, #6
> + ushll2 v17.8h, v0.16b, #6
> + ushll v18.8h, v1.8b, #6
> + ushll2 v19.8h, v1.16b, #6
> + ushll v20.8h, v2.8b, #6
> + ushll2 v21.8h, v2.16b, #6
> + ushll v22.8h, v3.8b, #6
> + ushll2 v23.8h, v3.16b, #6
> + add v16.8h, v16.8h, v4.8h
> + add v17.8h, v17.8h, v4.8h
> + add v18.8h, v18.8h, v4.8h
> + add v19.8h, v19.8h, v4.8h
> + add v20.8h, v20.8h, v4.8h
> + add v21.8h, v21.8h, v4.8h
> + add v22.8h, v22.8h, v4.8h
> + add v23.8h, v23.8h, v4.8h
> + st1 {v16.16b-v19.16b}, [x2], #0x40
> ldp may reduce pipeline stalls and provide more bandwidth
ldp is not beneficial as it requires 2 ldp + 2 add instead of 1 ld1:
An ldp can be followed by a constant post-increment, such as:
ldp q16, q17, [x2], #0x20
--- a/source/common/aarch64/ipfilter8.S
+++ b/source/common/aarch64/ipfilter8.S
@@ -178,8 +178,13 @@ function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
movi v4.8h, #0xe0, lsl #8
+ sub x1, x1, #32
.rept \h
- ld1 {v0.16b-v3.16b}, [x0], x1
+ ldp q0, q1, [x0]
+ add x0, x0, #32
+ ldp q2, q3, [x0]
+ add x0, x0, x1
+
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
ushll v18.8h, v1.8b, #6
This adds overhead to instruction decoding.
The CPU back-end will issue the same loads for ldp and ld1.
Here is performance with the above change:
convert_p2s[64x16] 1.43x 108.20 154.47
convert_p2s[64x32] 1.44x 216.43 312.10
convert_p2s[64x48] 1.43x 325.81 466.80
convert_p2s[64x64] 1.44x 433.60 624.30
With ld1 performance is higher:
convert_p2s[64x16] 1.48x 104.14 154.44
convert_p2s[64x32] 1.49x 209.44 312.10
convert_p2s[64x48] 1.48x 314.76 466.79
convert_p2s[64x64] 1.48x 420.30 622.38
Thanks,
Sebastian
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210624/c019bbe4/attachment-0001.html>
More information about the x265-devel
mailing list