[x265] [arm64] port filterPixelToShort
chen
chenm003 at 163.com
Thu Jun 24 00:06:51 UTC 2021
Hi Sebastian,
Thanks for your patch.
I have some comments.
+function x265_filterPixelToShort_4x4_neon
+ add x3, x3, x3
+ movi v2.8h, #0xe0, lsl #8
Does your compiler not handle the constant 0xe000 automatically? Writing it that way would be more readable.
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v1.s}[2], [x0], x1
Why not v0.s?
+ ld1 {v1.s}[3], [x0], x1
+.macro filterPixelToShort_32xN h
+function x265_filterPixelToShort_32x\h\()_neon
+ add x3, x3, x3
+ movi v6.8h, #0xe0, lsl #8
+.rept \h
+ ld1 {v0.16b-v1.16b}, [x0], x1
ldp may provide more bandwidth.
+.macro filterPixelToShort_64xN h
+function x265_filterPixelToShort_64x\h\()_neon
+ add x3, x3, x3
+ sub x3, x3, #0x40
+ movi v4.8h, #0xe0, lsl #8
+.rept \h
I guess unrolling N times is not a good idea: the code section becomes too large, which will most probably cause cache flushes and misses.
+ ld1 {v0.16b-v3.16b}, [x0], x1
+ ushll v16.8h, v0.8b, #6
+ ushll2 v17.8h, v0.16b, #6
+ ushll v18.8h, v1.8b, #6
+ ushll2 v19.8h, v1.16b, #6
+ ushll v20.8h, v2.8b, #6
+ ushll2 v21.8h, v2.16b, #6
+ ushll v22.8h, v3.8b, #6
+ ushll2 v23.8h, v3.16b, #6
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v4.8h
+ add v18.8h, v18.8h, v4.8h
+ add v19.8h, v19.8h, v4.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ add v22.8h, v22.8h, v4.8h
+ add v23.8h, v23.8h, v4.8h
+ st1 {v16.16b-v19.16b}, [x2], #0x40
ldp may reduce pipeline stalls and provide more bandwidth.
+ st1 {v20.16b-v23.16b}, [x2], x3
+.endr
+ ret
+endfunc
+.endm
2021-06-24 07:52:22,"Pop, Sebastian" <spop at amazon.com>
Hi,
The attached patch ports filterPixelToShort to arm64.
Tested on graviton2 arm64-linux.
convert_p2s[ 4x4] 1.21x 4.98 6.03
convert_p2s[ 8x8] 2.20x 6.20 13.65
convert_p2s[16x16] 1.54x 25.24 38.94
convert_p2s[32x32] 1.49x 101.99 151.63
convert_p2s[64x64] 1.48x 420.31 622.36
convert_p2s[ 8x4] 2.18x 3.05 6.64
convert_p2s[ 4x8] 1.91x 6.01 11.49
convert_p2s[ 16x8] 1.47x 12.19 17.92
convert_p2s[ 8x16] 1.95x 13.30 25.94
convert_p2s[32x16] 1.49x 50.63 75.58
convert_p2s[16x32] 1.56x 49.92 77.66
convert_p2s[64x32] 1.49x 209.43 312.13
convert_p2s[32x64] 1.48x 205.16 304.53
convert_p2s[16x12] 1.65x 17.62 29.08
convert_p2s[12x16] 6.22x 24.07 149.61
convert_p2s[ 16x4] 1.60x 5.37 8.59
convert_p2s[ 4x16] 1.75x 13.58 23.73
convert_p2s[32x24] 1.48x 76.47 113.22
convert_p2s[24x32] 2.69x 78.12 210.52
convert_p2s[ 32x8] 1.48x 25.00 37.06
convert_p2s[ 8x32] 1.63x 29.10 47.46
convert_p2s[64x48] 1.48x 314.74 466.77
convert_p2s[64x16] 1.48x 104.13 154.48
convert_p2s[16x64] 1.58x 98.66 155.67
Ok to commit?
Thanks,
Sebastian
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210624/b2ffa4da/attachment.html>
More information about the x265-devel
mailing list