[x265] [arm64] port filterPixelToShort

chen chenm003 at 163.com
Thu Jun 24 01:36:31 UTC 2021


Thanks for your response; comments inline.

At 2021-06-24 08:57:20, "Pop, Sebastian" <spop at amazon.com> wrote:

Hi Chen,

 

Thanks for your review!

 

> +function x265_filterPixelToShort_4x4_neon

> +    add             x3, x3, x3

> +    movi            v2.8h, #0xe0, lsl #8

> Does your compiler not handle the constant 0xe000 automatically? It would be more readable.

 

GNU assembler errors with that immediate:

ipfilter8.S:35: Error: immediate value out of range -128 to 255 at operand 2 -- `movi v2.8h,#0xe000'




Looks like an old binutils issue, so we can keep your original version.
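
For reference, movi with the .8h arrangement only encodes an 8-bit immediate plus an optional lsl #8, so the shifted spelling is the canonical form (assuming the comment above is right that newer binutils rewrites #0xe000 into it automatically). Read as int16_t, 0xe000 is -8192, the offset these p2s kernels subtract right after the ushll by #6:

    movi            v2.8h, #0xe0, lsl #8    // every halfword lane = 0xe000 (-8192)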




 

> 

> +    ld1             {v0.s}[0], [x0], x1

> +    ld1             {v0.s}[1], [x0], x1

> +    ld1             {v1.s}[2], [x0], x1

> Why not v0.s?

 

It is slightly faster to use an independent register for the upper part:

 

when using {v0.s}[2] and {v0.s}[3]:

convert_p2s[  4x4]              1.13x    5.35            6.03

 

performance is lower than when using {v1.s}[2] and {v1.s}[3]:

convert_p2s[  4x4]              1.21x    4.99            6.03




Yes, an independent register may be faster here, but we could load into the lower part of the second register and widen with ushll later; writing directly into the high part of a register may create a false register dependency.
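
A sketch of that alternative (one reading of the suggestion, untested): load the last two rows into the low lanes of v1 as well, so both registers are widened with ushll instead of ushll2 and neither load inserts into a live upper half:

    ld1             {v0.s}[0], [x0], x1     // rows 0-1 in the low half of v0
    ld1             {v0.s}[1], [x0], x1
    ld1             {v1.s}[0], [x0], x1     // rows 2-3 in the low half of v1
    ld1             {v1.s}[1], [x0], x1
    ushll           v0.8h, v0.8b, #6        // widen the low 8 bytes of each
    ushll           v1.8h, v1.8b, #6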




> 

> +    ld1             {v1.s}[3], [x0], x1

> 

> +.macro filterPixelToShort_32xN h

> +function x265_filterPixelToShort_32x\h\()_neon

> +    add             x3, x3, x3

> +    movi            v6.8h, #0xe0, lsl #8

> +.rept \h

> +    ld1             {v0.16b-v1.16b}, [x0], x1

> ldp may provide more bandwidth.

 

ld1 could be replaced with ldp + add, like this:

 

        ldp             q0, q1, [x0]

        add             x0, x0, x1

 

With ldp + add:

convert_p2s[ 32x8]              1.39x    26.62           37.07

convert_p2s[32x16]              1.42x    53.19           75.58

convert_p2s[32x24]              1.41x    80.23           113.11

convert_p2s[32x32]              1.42x    107.08          151.63

convert_p2s[32x64]              1.41x    215.11          303.37

 

whereas with ld1 performance is higher:

 

convert_p2s[ 32x8]              1.48x    25.00           37.06

convert_p2s[32x16]              1.49x    50.64           75.56

convert_p2s[32x24]              1.48x    76.46           113.31

convert_p2s[32x32]              1.49x    101.97          151.63

convert_p2s[32x64]              1.48x    205.15          303.31




An ldp immediately followed by a dependent add may cause a pipeline stall or a similar issue; if there is no better choice, we can keep the original version.




> 

> +.macro filterPixelToShort_64xN h

> +function x265_filterPixelToShort_64x\h\()_neon

> +    add             x3, x3, x3

> +    sub             x3, x3, #0x40

> +    movi            v4.8h, #0xe0, lsl #8

> +.rept \h

> I guess fully unrolling N is not a good idea: the code section becomes too large, which is very likely to cause cache flushes and misses.

 

Performance is slightly lower with a loop, i.e., with this change:

 

--- a/source/common/aarch64/ipfilter8.S

+++ b/source/common/aarch64/ipfilter8.S

@@ -173,12 +173,15 @@ filterPixelToShort_32xN 24

filterPixelToShort_32xN 32

filterPixelToShort_32xN 64

 

-.macro filterPixelToShort_64xN h

+.macro filterPixelToShort_64xN h r

function x265_filterPixelToShort_64x\h\()_neon

     add             x3, x3, x3

     sub             x3, x3, #0x40

     movi            v4.8h, #0xe0, lsl #8

-.rept \h

+    mov             x9, #\r

+.loop_filterP2S_64x\h:

+    subs            x9, x9, #1

+.rept 2

     ld1             {v0.16b-v3.16b}, [x0], x1

     ushll           v16.8h, v0.8b, #6

     ushll2          v17.8h, v0.16b, #6

@@ -199,14 +202,15 @@ function x265_filterPixelToShort_64x\h\()_neon

     st1             {v16.16b-v19.16b}, [x2], #0x40

     st1             {v20.16b-v23.16b}, [x2], x3

.endr

+    bgt             .loop_filterP2S_64x\h

     ret

endfunc

.endm

 

-filterPixelToShort_64xN 16

-filterPixelToShort_64xN 32

-filterPixelToShort_64xN 48

-filterPixelToShort_64xN 64

+filterPixelToShort_64xN 16 8

+filterPixelToShort_64xN 32 16

+filterPixelToShort_64xN 48 24

+filterPixelToShort_64xN 64 32

 

.macro qpel_filter_0_32b

     movi            v24.8h, #64

 

With the above change adding a loop, I get:

convert_p2s[64x16]              1.46x    105.52          154.34

convert_p2s[64x32]              1.47x    212.07          311.71

convert_p2s[64x48]              1.47x    318.75          468.04

convert_p2s[64x64]              1.46x    425.61          622.25

 

whereas with the fully unrolled version performance is slightly higher:

convert_p2s[64x16]              1.48x    104.14          154.36

convert_p2s[64x32]              1.49x    209.43          312.13

convert_p2s[64x48]              1.48x    315.33          466.37

convert_p2s[64x64]              1.49x    420.45          624.63

 

I do not have a preference for this one, so I will follow your recommendations.

Please let me know if I need to amend the patch to add the loop.

 

Our testbench runs small pieces of code, so it does not report many cache misses; in a real system, a large function is a big problem.




> 

> +    ld1             {v0.16b-v3.16b}, [x0], x1

> +    ushll           v16.8h, v0.8b, #6

> +    ushll2          v17.8h, v0.16b, #6

> +    ushll           v18.8h, v1.8b, #6

> +    ushll2          v19.8h, v1.16b, #6

> +    ushll           v20.8h, v2.8b, #6

> +    ushll2          v21.8h, v2.16b, #6

> +    ushll           v22.8h, v3.8b, #6

> +    ushll2          v23.8h, v3.16b, #6

> +    add             v16.8h, v16.8h, v4.8h

> +    add             v17.8h, v17.8h, v4.8h

> +    add             v18.8h, v18.8h, v4.8h

> +    add             v19.8h, v19.8h, v4.8h

> +    add             v20.8h, v20.8h, v4.8h

> +    add             v21.8h, v21.8h, v4.8h

> +    add             v22.8h, v22.8h, v4.8h

> +    add             v23.8h, v23.8h, v4.8h

> +    st1             {v16.16b-v19.16b}, [x2], #0x40

> ldp may reduce pipeline stalls and provide more bandwidth.

 

ldp is not beneficial as it requires 2 ldp + 2 add instead of 1 ld1:




ldp can take a constant post-increment, such as:

        ldp             q16, q17, [x2], #0x20
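
Applied to the 64xN loads in the diff below, the post-indexed form would fold the first explicit add; a sketch (ldp post-indexing only accepts an immediate, so the register-stride advance still needs its own add):

    sub             x1, x1, #32             // hoist the stride adjustment
    ldp             q0, q1, [x0], #32       // post-increment folds one add
    ldp             q2, q3, [x0]
    add             x0, x0, x1              // register stride still needs an add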

 

--- a/source/common/aarch64/ipfilter8.S

+++ b/source/common/aarch64/ipfilter8.S

@@ -178,8 +178,13 @@ function x265_filterPixelToShort_64x\h\()_neon

     add             x3, x3, x3

     sub             x3, x3, #0x40

     movi            v4.8h, #0xe0, lsl #8

+    sub             x1, x1, #32

.rept \h

-    ld1             {v0.16b-v3.16b}, [x0], x1

+    ldp             q0, q1, [x0]

+    add             x0, x0, #32

+    ldp             q2, q3, [x0]

+    add             x0, x0, x1

+

     ushll           v16.8h, v0.8b, #6

     ushll2          v17.8h, v0.16b, #6

     ushll           v18.8h, v1.8b, #6

 

This adds overhead to instruction decoding.

The CPU back-end will issue the same loads for ldp and ld1.

 

Here is performance with the above change:

convert_p2s[64x16]              1.43x    108.20          154.47

convert_p2s[64x32]              1.44x    216.43          312.10

convert_p2s[64x48]              1.43x    325.81          466.80

convert_p2s[64x64]              1.44x    433.60          624.30

 

With ld1 performance is higher:

convert_p2s[64x16]              1.48x    104.14          154.44

convert_p2s[64x32]              1.49x    209.44          312.10

convert_p2s[64x48]              1.48x    314.76          466.79

convert_p2s[64x64]              1.48x    420.30          622.38

 

Thanks,

Sebastian