[x265] [arm64] port scale1D_128to64 and scale2D_64to32

Pop, Sebastian spop at amazon.com
Sat Jul 31 04:14:29 UTC 2021


Hi,

Please let me know if you have ideas on how to make this code faster.
I tried to remove the stall by issuing the loads for the next rows earlier, but performance did not change:

// void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
//
// Each scale2D_1 invocation consumes two 64-byte source rows and stores
// 32 bytes to the destination. The function issues 32 invocations in total
// (2 per loop iteration x 15 iterations, plus 2 after the loop) and loads
// 64 rows (2 before the loop, 4 per iteration, 2 after the loop).
//
// Register use, as read from the code below (argument registers follow the
// AAPCS64 order of the prototype above):
//   x0      dst; advanced by 32 bytes by every scale2D_1 store
//   x1      src; post-incremented by x2 after every row load
//   x2      stride, used as the post-index increment of the loads
//   w12     loop counter
//   v0-v7   row buffer set A (v0-v3 = one row, v4-v7 = the following row)
//   v20-v27 row buffer set B (v20-v23 = one row, v24-v27 = the following row)
// NOTE(review): every row load fetches 64 bytes, so this assumes 8-bit
// pixels - confirm against the build configuration.
function x265_scale2D_64to32_neon
    // 15 loop iterations; the final two row pairs are handled after the loop.
    mov             w12, #15
    // Prologue: fill set A with the first two rows.
    ld1             {v0.16b-v3.16b}, [x1], x2
    ld1             {v4.16b-v7.16b}, [x1], x2
.loop_scale2D:
    sub             w12, w12, #1
    // Load the next two rows into set B before set A is consumed, so the
    // loads are issued ahead of the arithmetic that uses set A.
    ld1             {v20.16b-v23.16b}, [x1], x2
    ld1             {v24.16b-v27.16b}, [x1], x2
    scale2D_1 v0, v1, v2, v3, v4, v5, v6, v7
    // Set A has been consumed (scale2D_1 overwrites its arguments); refill
    // it with the following two rows before set B is consumed.
    ld1             {v0.16b-v3.16b}, [x1], x2
    ld1             {v4.16b-v7.16b}, [x1], x2
    scale2D_1 v20, v21, v22, v23, v24, v25, v26, v27
    cbnz            w12, .loop_scale2D
    // Tail: set A still holds two unprocessed rows from the last iteration;
    // load the last two rows into set B and process both sets.
    ld1             {v20.16b-v23.16b}, [x1], x2
    ld1             {v24.16b-v27.16b}, [x1], x2
    scale2D_1 v0, v1, v2, v3, v4, v5, v6, v7
    scale2D_1 v20, v21, v22, v23, v24, v25, v26, v27
    ret
endfunc

// scale2D_1: reduce two 64-byte rows to one 32-byte row by averaging each
// 2x2 block of bytes with rounding, then store the result at [x0].
//
// Arguments are vector register names (the parameter names v0..v7 are macro
// placeholders, not the registers of the same name):
//   \v0-\v3  the 64 bytes of one row
//   \v4-\v7  the 64 bytes of the following row
// Side effects: \v0-\v7 are all overwritten; x0 is advanced by 32 bytes.
// NOTE(review): GNU as expands a macro only after its .macro definition has
// been seen, so in the assembled file this definition presumably has to
// precede the first scale2D_1 invocation - confirm against the actual patch.
.macro scale2D_1 v0, v1, v2, v3, v4, v5, v6, v7
    // Horizontal step: add adjacent unsigned byte pairs, widening each
    // register in place from 16 bytes to 8 halfword sums.
    uaddlp          \v0\().8h, \v0\().16b
    uaddlp          \v1\().8h, \v1\().16b
    uaddlp          \v2\().8h, \v2\().16b
    uaddlp          \v3\().8h, \v3\().16b
    uaddlp          \v4\().8h, \v4\().16b
    uaddlp          \v5\().8h, \v5\().16b
    uaddlp          \v6\().8h, \v6\().16b
    uaddlp          \v7\().8h, \v7\().16b
    // Vertical step: add the pair sums of the two rows, giving the sum of
    // each 2x2 block (at most 4 * 255 = 1020, which fits in a halfword).
    add             \v0\().8h, \v0\().8h, \v4\().8h
    add             \v1\().8h, \v1\().8h, \v5\().8h
    add             \v2\().8h, \v2\().8h, \v6\().8h
    add             \v3\().8h, \v3\().8h, \v7\().8h
    // Rounding shift right by 2 and narrow back to bytes, i.e.
    // (sum + 2) >> 2. The plain form writes the low 8 bytes of the
    // destination and the "2" form writes the high 8 bytes, so the four
    // halfword registers pack into \v0 and \v1.
    uqrshrn         \v0\().8b, \v0\().8h, #2
    uqrshrn2        \v0\().16b, \v1\().8h, #2
    uqrshrn         \v1\().8b, \v2\().8h, #2
    uqrshrn2        \v1\().16b, \v3\().8h, #2
    // Store the 32 result bytes and post-increment the destination pointer.
    st1             {\v0\().16b-\v1\().16b}, [x0], #32
.endm

The only change I made was to further optimize for code size by re-rolling the loop that had been unrolled 2x.
Performance is unchanged, and the code is half the size.

Sebastian

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210731/2f4efe31/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-arm64-port-scale1D_128to64-and-scale2D_64to32.patch
Type: application/octet-stream
Size: 3198 bytes
Desc: 0001-arm64-port-scale1D_128to64-and-scale2D_64to32.patch
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210731/2f4efe31/attachment.obj>


More information about the x265-devel mailing list