[x265] [arm64] port filterPixelToShort

chen chenm003 at 163.com
Thu Jun 24 00:06:51 UTC 2021


Hi Sebastian,


Thanks for your patch.
I have some comments.



+function x265_filterPixelToShort_4x4_neon

+    add             x3, x3, x3

+    movi            v2.8h, #0xe0, lsl #8

Does your compiler not handle the constant 0xe000 automatically? Writing it that way is more readable.


+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v0.s}[1], [x0], x1
+    ld1             {v1.s}[2], [x0], x1
Why not v0.s?


+    ld1             {v1.s}[3], [x0], x1



+.macro filterPixelToShort_32xN h

+function x265_filterPixelToShort_32x\h\()_neon

+    add             x3, x3, x3

+    movi            v6.8h, #0xe0, lsl #8

+.rept \h

+    ld1             {v0.16b-v1.16b}, [x0], x1

ldp may provide more bandwidth.


+.macro filterPixelToShort_64xN h
+function x265_filterPixelToShort_64x\h\()_neon
+    add             x3, x3, x3
+    sub             x3, x3, #0x40
+    movi            v4.8h, #0xe0, lsl #8
+.rept \h
I guess unrolling N times is not a good idea: the code section becomes too large, which will most probably cause cache flushes and misses.


+    ld1             {v0.16b-v3.16b}, [x0], x1
+    ushll           v16.8h, v0.8b, #6
+    ushll2          v17.8h, v0.16b, #6
+    ushll           v18.8h, v1.8b, #6
+    ushll2          v19.8h, v1.16b, #6
+    ushll           v20.8h, v2.8b, #6
+    ushll2          v21.8h, v2.16b, #6
+    ushll           v22.8h, v3.8b, #6
+    ushll2          v23.8h, v3.16b, #6
+    add             v16.8h, v16.8h, v4.8h
+    add             v17.8h, v17.8h, v4.8h
+    add             v18.8h, v18.8h, v4.8h
+    add             v19.8h, v19.8h, v4.8h
+    add             v20.8h, v20.8h, v4.8h
+    add             v21.8h, v21.8h, v4.8h
+    add             v22.8h, v22.8h, v4.8h
+    add             v23.8h, v23.8h, v4.8h
+    st1             {v16.16b-v19.16b}, [x2], #0x40
ldp may reduce pipeline stalls and provide more bandwidth.


+    st1             {v20.16b-v23.16b}, [x2], x3
+.endr
+    ret
+endfunc
+.endm












 2021-06-24 07:52:22,"Pop, Sebastian" <spop at amazon.com> 

Hi,

 

The attached patch ports filterPixelToShort to arm64.

Tested on graviton2 arm64-linux.

 

convert_p2s[  4x4]              1.21x    4.98            6.03

convert_p2s[  8x8]              2.20x    6.20            13.65

convert_p2s[16x16]              1.54x    25.24           38.94

convert_p2s[32x32]              1.49x    101.99          151.63

convert_p2s[64x64]              1.48x    420.31          622.36

convert_p2s[  8x4]              2.18x    3.05            6.64

convert_p2s[  4x8]              1.91x    6.01            11.49

convert_p2s[ 16x8]              1.47x    12.19           17.92

convert_p2s[ 8x16]              1.95x    13.30           25.94

convert_p2s[32x16]              1.49x    50.63           75.58

convert_p2s[16x32]              1.56x    49.92           77.66

convert_p2s[64x32]              1.49x    209.43          312.13

convert_p2s[32x64]              1.48x    205.16          304.53

convert_p2s[16x12]              1.65x    17.62           29.08

convert_p2s[12x16]              6.22x    24.07           149.61

convert_p2s[ 16x4]              1.60x    5.37            8.59

convert_p2s[ 4x16]              1.75x    13.58           23.73

convert_p2s[32x24]              1.48x    76.47           113.22

convert_p2s[24x32]              2.69x    78.12           210.52

convert_p2s[ 32x8]              1.48x    25.00           37.06

convert_p2s[ 8x32]              1.63x    29.10           47.46

convert_p2s[64x48]              1.48x    314.74          466.77

convert_p2s[64x16]              1.48x    104.13          154.48

convert_p2s[16x64]              1.58x    98.66           155.67

 

Ok to commit?

 

Thanks,

Sebastian
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210624/b2ffa4da/attachment.html>


More information about the x265-devel mailing list