[x265] [arm64] port filterPixelToShort
Pop, Sebastian
spop at amazon.com
Thu Jun 24 00:57:20 UTC 2021
Hi Chen,
Thanks for your review!
> +function x265_filterPixelToShort_4x4_neon
> + add x3, x3, x3
> + movi v2.8h, #0xe0, lsl #8
> does your compiler not handle the constant 0xe000 automatically? it is more readable
GNU assembler errors with that immediate:
ipfilter8.S:35: Error: immediate value out of range -128 to 255 at operand 2 -- `movi v2.8h,#0xe000'
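For reference, the architectural encoding behind this: MOVI on a .8h vector only
holds an 8-bit immediate plus an optional left shift of 0 or 8, so the constant
has to be spelled as a shifted 8-bit value (a side-by-side sketch, same 0xe000
per lane either way):
    // rejected by this GNU assembler: there is no 16-bit immediate field
    // movi            v2.8h, #0xe000
    // accepted: 8-bit immediate 0xe0 shifted left by 8 gives 0xe000 per lane
    movi            v2.8h, #0xe0, lsl #8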
>
> + ld1 {v0.s}[0], [x0], x1
> + ld1 {v0.s}[1], [x0], x1
> + ld1 {v1.s}[2], [x0], x1
> Why not v0.s?
It is slightly faster to use an independent register for the upper half:
when loading all four lanes into v0, i.e., using {v0.s}[2] and {v0.s}[3],
convert_p2s[ 4x4] 1.13x 5.35 6.03
performance is lower than when using {v1.s}[2] and {v1.s}[3]:
convert_p2s[ 4x4] 1.21x 4.99 6.03
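Spelled out, the two 4x4 load sequences being compared are (the first is only
the alternative I benchmarked, the second is what the patch uses):
    // single-register variant: each ld1 lane insert is a read-modify-write
    // of v0, so every load depends on the previous one
    ld1             {v0.s}[0], [x0], x1
    ld1             {v0.s}[1], [x0], x1
    ld1             {v0.s}[2], [x0], x1
    ld1             {v0.s}[3], [x0], x1
    // patch variant: the upper two rows go into v1, breaking the dependency
    // between the first two and the last two loads
    ld1             {v0.s}[0], [x0], x1
    ld1             {v0.s}[1], [x0], x1
    ld1             {v1.s}[2], [x0], x1
    ld1             {v1.s}[3], [x0], x1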
>
> + ld1 {v1.s}[3], [x0], x1
>
> +.macro filterPixelToShort_32xN h
> +function x265_filterPixelToShort_32x\h\()_neon
> + add x3, x3, x3
> + movi v6.8h, #0xe0, lsl #8
> +.rept \h
> + ld1 {v0.16b-v1.16b}, [x0], x1
> ldp may provide more bandwidth
ld1 could be replaced with ldp + add, like this:
    ldp             q0, q1, [x0]
    add             x0, x0, x1
Here is performance with ldp + add:
convert_p2s[ 32x8] 1.39x 26.62 37.07
convert_p2s[32x16] 1.42x 53.19 75.58
convert_p2s[32x24] 1.41x 80.23 113.11
convert_p2s[32x32] 1.42x 107.08 151.63
convert_p2s[32x64] 1.41x 215.11 303.37
With ld1 performance is higher:
convert_p2s[ 32x8] 1.48x 25.00 37.06
convert_p2s[32x16] 1.49x 50.64 75.56
convert_p2s[32x24] 1.48x 76.46 113.31
convert_p2s[32x32] 1.49x 101.97 151.63
convert_p2s[32x64] 1.48x 205.15 303.31
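For context, one .rept iteration of the 32xN body that these numbers come from
looks roughly like this (a sketch by analogy with the 64xN body quoted below;
only v0, v1 and v6 appear in the quoted 32xN code, the v16-v19 destinations are
my assumption):
    ld1             {v0.16b-v1.16b}, [x0], x1
    // widen the 8-bit pixels to 16 bits and shift left by 6
    ushll           v16.8h, v0.8b, #6
    ushll2          v17.8h, v0.16b, #6
    ushll           v18.8h, v1.8b, #6
    ushll2          v19.8h, v1.16b, #6
    // add the offset 0xe000 (-8192) prepared in v6
    add             v16.8h, v16.8h, v6.8h
    add             v17.8h, v17.8h, v6.8h
    add             v18.8h, v18.8h, v6.8h
    add             v19.8h, v19.8h, v6.8h
    // 32 pixels * 2 bytes = 64 bytes of output per row
    st1             {v16.16b-v19.16b}, [x2], x3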
>
> +.macro filterPixelToShort_64xN h
> +function x265_filterPixelToShort_64x\h\()_neon
> + add x3, x3, x3
> + sub x3, x3, #0x40
> + movi v4.8h, #0xe0, lsl #8
> +.rept \h
> I guess fully unrolling over N is not a good idea, because the code section becomes too large, which makes instruction cache flushes and misses more likely.
Performance is slightly lower with a loop, i.e., with this change:
--- a/source/common/aarch64/ipfilter8.S
+++ b/source/common/aarch64/ipfilter8.S
@@ -173,12 +173,15 @@ filterPixelToShort_32xN 24
filterPixelToShort_32xN 32
filterPixelToShort_32xN 64
-.macro filterPixelToShort_64xN h
+.macro filterPixelToShort_64xN h r
function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
movi v4.8h, #0xe0, lsl #8
-.rept \h
+ mov x9, #\r
+.loop_filterP2S_64x\h:
+ subs x9, x9, #1
+.rept 2
ld1 {v0.16b-v3.16b}, [x0], x1
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
@@ -199,14 +202,15 @@ function x265_filterPixelToShort_64x\h\()_neon
st1 {v16.16b-v19.16b}, [x2], #0x40
st1 {v20.16b-v23.16b}, [x2], x3
.endr
+ bgt .loop_filterP2S_64x\h
ret
endfunc
.endm
-filterPixelToShort_64xN 16
-filterPixelToShort_64xN 32
-filterPixelToShort_64xN 48
-filterPixelToShort_64xN 64
+filterPixelToShort_64xN 16 8
+filterPixelToShort_64xN 32 16
+filterPixelToShort_64xN 48 24
+filterPixelToShort_64xN 64 32
.macro qpel_filter_0_32b
movi v24.8h, #64
With the above change (adding a loop) I get:
convert_p2s[64x16] 1.46x 105.52 154.34
convert_p2s[64x32] 1.47x 212.07 311.71
convert_p2s[64x48] 1.47x 318.75 468.04
convert_p2s[64x64] 1.46x 425.61 622.25
whereas with the fully unrolled version performance is slightly higher:
convert_p2s[64x16] 1.48x 104.14 154.36
convert_p2s[64x32] 1.49x 209.43 312.13
convert_p2s[64x48] 1.48x 315.33 466.37
convert_p2s[64x64] 1.49x 420.45 624.63
I do not have a preference for this one, so I will follow your recommendations.
Please let me know if I need to amend the patch to add the loop.
>
> + ld1 {v0.16b-v3.16b}, [x0], x1
> + ushll v16.8h, v0.8b, #6
> + ushll2 v17.8h, v0.16b, #6
> + ushll v18.8h, v1.8b, #6
> + ushll2 v19.8h, v1.16b, #6
> + ushll v20.8h, v2.8b, #6
> + ushll2 v21.8h, v2.16b, #6
> + ushll v22.8h, v3.8b, #6
> + ushll2 v23.8h, v3.16b, #6
> + add v16.8h, v16.8h, v4.8h
> + add v17.8h, v17.8h, v4.8h
> + add v18.8h, v18.8h, v4.8h
> + add v19.8h, v19.8h, v4.8h
> + add v20.8h, v20.8h, v4.8h
> + add v21.8h, v21.8h, v4.8h
> + add v22.8h, v22.8h, v4.8h
> + add v23.8h, v23.8h, v4.8h
> + st1 {v16.16b-v19.16b}, [x2], #0x40
> ldp may reduce pipeline stalls and provide more bandwidth
ldp is not beneficial as it requires 2 ldp + 2 add instead of 1 ld1:
--- a/source/common/aarch64/ipfilter8.S
+++ b/source/common/aarch64/ipfilter8.S
@@ -178,8 +178,13 @@ function x265_filterPixelToShort_64x\h\()_neon
add x3, x3, x3
sub x3, x3, #0x40
movi v4.8h, #0xe0, lsl #8
+ sub x1, x1, #32
.rept \h
- ld1 {v0.16b-v3.16b}, [x0], x1
+ ldp q0, q1, [x0]
+ add x0, x0, #32
+ ldp q2, q3, [x0]
+ add x0, x0, x1
+
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
ushll v18.8h, v1.8b, #6
This adds overhead to instruction fetch and decode: 4 instructions per row instead of 1, while the CPU back-end issues the same load micro-operations for ldp and ld1.
Here is performance with the above change:
convert_p2s[64x16] 1.43x 108.20 154.47
convert_p2s[64x32] 1.44x 216.43 312.10
convert_p2s[64x48] 1.43x 325.81 466.80
convert_p2s[64x64] 1.44x 433.60 624.30
With ld1 performance is higher:
convert_p2s[64x16] 1.48x 104.14 154.44
convert_p2s[64x32] 1.49x 209.44 312.10
convert_p2s[64x48] 1.48x 314.76 466.79
convert_p2s[64x64] 1.48x 420.30 622.38
Thanks,
Sebastian