[x265] [arm64] port filterPixelToShort
Pop, Sebastian
spop at amazon.com
Thu Jun 24 00:57:20 UTC 2021
Hi Chen,
Thanks for your review!
> +function x265_filterPixelToShort_4x4_neon
> + add x3, x3, x3
> + movi v2.8h, #0xe0, lsl #8
> does your compiler not handle the constant 0xe000 automatically? it is more readable
GNU assembler errors with that immediate:
ipfilter8.S:35: Error: immediate value out of range -128 to 255 at operand 2 -- `movi v2.8h,#0xe000'
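For reference, the architectural encoding behind this: MOVI on a .8h vector only
holds an 8-bit immediate plus an optional left shift of 0 or 8, so the constant
has to be spelled as a shifted 8-bit value (a side-by-side sketch, same 0xe000
per lane either way):
    // rejected by this GNU assembler: there is no 16-bit immediate field
    // movi            v2.8h, #0xe000
    // accepted: 8-bit immediate 0xe0 shifted left by 8 gives 0xe000 per lane
    movi            v2.8h, #0xe0, lsl #8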
>
> + ld1 {v0.s}[0], [x0], x1
> + ld1 {v0.s}[1], [x0], x1
> + ld1 {v1.s}[2], [x0], x1
> Why not v0.s?
It is slightly faster to use an independent register for the upper half:
when loading all four lanes into v0, i.e., using {v0.s}[2] and {v0.s}[3],
convert_p2s[ 4x4] 1.13x 5.35 6.03
performance is lower than when using {v1.s}[2] and {v1.s}[3]:
convert_p2s[ 4x4] 1.21x 4.99 6.03
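Spelled out, the two 4x4 load sequences being compared are (the first is only
the alternative I benchmarked, the second is what the patch uses):
    // single-register variant: each ld1 lane insert is a read-modify-write
    // of v0, so every load depends on the previous one
    ld1             {v0.s}[0], [x0], x1
    ld1             {v0.s}[1], [x0], x1
    ld1             {v0.s}[2], [x0], x1
    ld1             {v0.s}[3], [x0], x1
    // patch variant: the upper two rows go into v1, breaking the dependency
    // between the first two and the last two loads
    ld1             {v0.s}[0], [x0], x1
    ld1             {v0.s}[1], [x0], x1
    ld1             {v1.s}[2], [x0], x1
    ld1             {v1.s}[3], [x0], x1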
>
> + ld1 {v1.s}[3], [x0], x1
>
> +.macro filterPixelToShort_32xN h
> +function x265_filterPixelToShort_32x\h\()_neon
> + add x3, x3, x3
> + movi v6.8h, #0xe0, lsl #8
> +.rept \h
> + ld1 {v0.16b-v1.16b}, [x0], x1
> ldp may provide more bandwidth
ld1 could be replaced with ldp + add, like this:
    ldp             q0, q1, [x0]
    add             x0, x0, x1
Here is performance with ldp + add:
convert_p2s[ 32x8] 1.39x 26.62 37.07
convert_p2s[32x16] 1.42x 53.19 75.58
convert_p2s[32x24] 1.41x 80.23 113.11
convert_p2s[32x32] 1.42x 107.08 151.63
convert_p2s[32x64] 1.41x 215.11 303.37
With ld1 performance is higher:
convert_p2s[ 32x8] 1.48x 25.00 37.06
convert_p2s[32x16] 1.49x 50.64 75.56
convert_p2s[32x24] 1.48x 76.46 113.31
convert_p2s[32x32] 1.49x 101.97 151.63
convert_p2s[32x64] 1.48x 205.15 303.31
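For context, one .rept iteration of the 32xN body that these numbers come from
looks roughly like this (a sketch by analogy with the 64xN body quoted below;
only v0, v1 and v6 appear in the quoted 32xN code, the v16-v19 destinations are
my assumption):
    ld1             {v0.16b-v1.16b}, [x0], x1
    // widen the 8-bit pixels to 16 bits and shift left by 6
    ushll           v16.8h, v0.8b, #6
    ushll2          v17.8h, v0.16b, #6
    ushll           v18.8h, v1.8b, #6
    ushll2          v19.8h, v1.16b, #6
    // add the offset 0xe000 (-8192) prepared in v6
    add             v16.8h, v16.8h, v6.8h
    add             v17.8h, v17.8h, v6.8h
    add             v18.8h, v18.8h, v6.8h
    add             v19.8h, v19.8h, v6.8h
    // 32 pixels * 2 bytes = 64 bytes of output per row
    st1             {v16.16b-v19.16b}, [x2], x3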
>
> +.macro filterPixelToShort_64xN h
> +function x265_filterPixelToShort_64x\h\()_neon
> + add x3, x3, x3
> + sub x3, x3, #0x40
> + movi v4.8h, #0xe0, lsl #8
> +.rept \h
> I guess fully unrolling over N is not a good idea, because the code section becomes too large, which makes instruction cache flushes and misses more likely.
Performance is slightly lower with a loop, i.e., with this change:
--- a/source/common/aarch64/ipfilter8.S
+++ b/source/common/aarch64/ipfilter8.S
@@ -173,12 +173,15 @@ filterPixelToShort_32xN 24
filterPixelToShort_32xN 32
filterPixelToShort_32xN 64
-.macro filterPixelToShort_64xN h
+.macro filterPixelToShort_64xN h r
function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
movi v4.8h, #0xe0, lsl #8
-.rept \h
+ mov x9, #\r
+.loop_filterP2S_64x\h:
+ subs x9, x9, #1
+.rept 2
ld1 {v0.16b-v3.16b}, [x0], x1
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
@@ -199,14 +202,15 @@ function x265_filterPixelToShort_64x\h\()_neon
st1 {v16.16b-v19.16b}, [x2], #0x40
st1 {v20.16b-v23.16b}, [x2], x3
.endr
+ bgt .loop_filterP2S_64x\h
ret
endfunc
.endm
-filterPixelToShort_64xN 16
-filterPixelToShort_64xN 32
-filterPixelToShort_64xN 48
-filterPixelToShort_64xN 64
+filterPixelToShort_64xN 16 8
+filterPixelToShort_64xN 32 16
+filterPixelToShort_64xN 48 24
+filterPixelToShort_64xN 64 32
.macro qpel_filter_0_32b
movi v24.8h, #64
With the above change (adding a loop) I get:
convert_p2s[64x16] 1.46x 105.52 154.34
convert_p2s[64x32] 1.47x 212.07 311.71
convert_p2s[64x48] 1.47x 318.75 468.04
convert_p2s[64x64] 1.46x 425.61 622.25
whereas with the fully unrolled version performance is slightly higher:
convert_p2s[64x16] 1.48x 104.14 154.36
convert_p2s[64x32] 1.49x 209.43 312.13
convert_p2s[64x48] 1.48x 315.33 466.37
convert_p2s[64x64] 1.49x 420.45 624.63
I do not have a preference for this one, so I will follow your recommendations.
Please let me know if I need to amend the patch to add the loop.
>
> + ld1 {v0.16b-v3.16b}, [x0], x1
> + ushll v16.8h, v0.8b, #6
> + ushll2 v17.8h, v0.16b, #6
> + ushll v18.8h, v1.8b, #6
> + ushll2 v19.8h, v1.16b, #6
> + ushll v20.8h, v2.8b, #6
> + ushll2 v21.8h, v2.16b, #6
> + ushll v22.8h, v3.8b, #6
> + ushll2 v23.8h, v3.16b, #6
> + add v16.8h, v16.8h, v4.8h
> + add v17.8h, v17.8h, v4.8h
> + add v18.8h, v18.8h, v4.8h
> + add v19.8h, v19.8h, v4.8h
> + add v20.8h, v20.8h, v4.8h
> + add v21.8h, v21.8h, v4.8h
> + add v22.8h, v22.8h, v4.8h
> + add v23.8h, v23.8h, v4.8h
> + st1 {v16.16b-v19.16b}, [x2], #0x40
> ldp may reduce pipeline stalls and provide more bandwidth
ldp is not beneficial as it requires 2 ldp + 2 add instead of 1 ld1:
--- a/source/common/aarch64/ipfilter8.S
+++ b/source/common/aarch64/ipfilter8.S
@@ -178,8 +178,13 @@ function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
movi v4.8h, #0xe0, lsl #8
+ sub x1, x1, #32
.rept \h
- ld1 {v0.16b-v3.16b}, [x0], x1
+ ldp q0, q1, [x0]
+ add x0, x0, #32
+ ldp q2, q3, [x0]
+ add x0, x0, x1
+
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
ushll v18.8h, v1.8b, #6
This adds overhead to instruction fetch and decode: 4 instructions per row instead of 1, while the CPU back-end issues the same load micro-operations for ldp and ld1.
Here is performance with the above change:
convert_p2s[64x16] 1.43x 108.20 154.47
convert_p2s[64x32] 1.44x 216.43 312.10
convert_p2s[64x48] 1.43x 325.81 466.80
convert_p2s[64x64] 1.44x 433.60 624.30
With ld1 performance is higher:
convert_p2s[64x16] 1.48x 104.14 154.44
convert_p2s[64x32] 1.49x 209.44 312.10
convert_p2s[64x48] 1.48x 314.76 466.79
convert_p2s[64x64] 1.48x 420.30 622.38
Thanks,
Sebastian