[x265] [arm64] port count_nonzero, blkfill, and copy_{ss, sp, ps}

chen chenm003 at 163.com
Fri Jul 23 02:33:02 UTC 2021


Hello,


Thank you for your patch.




The copy_s* functions look good; my only comment is that the TBL instruction is faster than XTN/XTN2.




The copy_cnt_* functions could be improved further, for example:



+function x265_copy_cnt_4_neon

+    lsl             x2, x2, #1

+    movi            v4.8b, #0

+.rept 2

+    ld1             {v0.8b}, [x1], x2

+    ld1             {v1.8b}, [x1], x2

+    clz             v2.4h, v0.4h

+    clz             v3.4h, v1.4h

+    ushr            v2.4h, v2.4h, #4

+    ushr            v3.4h, v3.4h, #4

+    add             v2.4h, v2.4h, v3.4h
clz+ushr+add is slower than cmeq+add, in terms of both execution throughput and cycle count.



+    add             v4.4h, v4.4h, v2.4h

+    st1             {v0.8b}, [x0], #8

+    st1             {v1.8b}, [x0], #8

+.endr

+    uaddlv          s4, v4.4h

+    fmov            w12, s4

+    neg             w12, w12

+    add             w0, w12, #16
(-w12) + 16 is equal to 16 - w12, so the NEG+ADD pair can be replaced by a single SUB; loading #16 into w0 beforehand may execute in parallel with the FMOV.



+    ret

+endfunc



Regards,
Min Chen

 2021-07-23 04:14:30,"Pop, Sebastian" <spop at amazon.com> 

With the patch attached.

 

 

From: "Pop, Sebastian" <spop at amazon.com>
Date: Thursday, July 22, 2021 at 3:13 PM
To: "x265-devel at videolan.org" <x265-devel at videolan.org>
Subject: [arm64] port count_nonzero, blkfill, and copy_{ss,sp,ps}

 

Hi,

 

the attached patch ports to arm64 the following kernels:

 

count_nonzero[4x4]      19.23x   2.95            56.77

count_nonzero[8x8]      32.07x   7.11            228.15

count_nonzero[16x16]    35.16x   26.09           917.23

count_nonzero[32x32]    37.31x   98.07           3658.49

 

          blkfill[4x4]  31.39x   3.72            116.84

          blkfill[8x8]  85.97x   5.78            497.26

        blkfill[16x16]  102.63x 16.28           1670.56

        blkfill[32x32]  100.07x 62.89           6293.62

 

        copy_ss[  4x4]  16.87x   6.21            104.78

        copy_sp[  4x4]  16.21x   6.34            102.69

        copy_ps[  4x4]  18.06x   5.91            106.69

        copy_ss[  8x8]  51.50x   8.30            427.52

        copy_sp[  8x8]  43.34x   9.32            403.79

        copy_ps[  8x8]  49.00x   8.50            416.36

[i420] copy_ss[  4x4]  15.40x   6.62            101.98

[i420] copy_ps[  4x4]  16.50x   6.26            103.28

[i420] copy_sp[  4x4]  14.14x   6.82            96.48

[i422] copy_ss[  4x8]  25.79x   8.28            213.57

[i422] copy_ps[  4x8]  24.74x   8.62            213.35

[i422] copy_sp[  4x8]  22.01x   9.27            204.03

        copy_ss[16x16]  82.20x   19.79           1626.69

        copy_sp[16x16]  85.13x   18.78           1599.19

        copy_ps[16x16]  72.51x   22.28           1615.58

[i420] copy_ss[  8x8]  49.16x   8.49            417.24

[i420] copy_ps[  8x8]  46.52x   8.71            405.34

[i420] copy_sp[  8x8]  42.68x   9.47            404.13

[i422] copy_ss[ 8x16]  56.55x   14.98           847.42

[i422] copy_ps[ 8x16]  57.71x   15.12           872.39

[i422] copy_sp[ 8x16]  49.76x   16.83           837.44

        copy_ss[32x32]  98.60x   67.47           6652.77

        copy_sp[32x32]  96.31x   65.07           6266.88

        copy_ps[32x32]  77.71x   81.02           6295.59

[i420] copy_ss[16x16]  83.93x   20.52           1722.55

[i420] copy_ps[16x16]  72.66x   22.13           1608.30

[i420] copy_sp[16x16]  85.67x   18.73           1604.77

[i422] copy_ss[16x32]  91.45x   36.56           3343.09

[i422] copy_ps[16x32]  75.73x   42.40           3211.16

[i422] copy_sp[16x32]  91.93x   34.32           3154.89

        copy_ss[64x64]  104.11x          254.52          26498.82

        copy_sp[64x64]  98.81x   252.38          24937.40

        copy_ps[64x64]  80.97x   308.55          24983.04

[i420] copy_ss[32x32]  99.49x   67.40           6706.31

[i420] copy_ps[32x32]  76.50x   81.51           6235.63

[i420] copy_sp[32x32]  97.43x   65.84           6414.64

[i422] copy_ss[32x64]  102.57x          129.82          13315.36

[i422] copy_ps[32x64]  78.95x   159.47          12590.31

[i422] copy_sp[32x64]  99.54x   128.29          12769.10

         copy_cnt[4x4]  13.91x   7.48            104.10

         copy_cnt[8x8]  31.01x   12.69           393.40

       copy_cnt[16x16]  42.88x   36.23           1553.66

       copy_cnt[32x32]  47.43x   129.19          6127.58

 

Ok to commit?

 

Thanks,

Sebastian

 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210723/7efde64e/attachment-0001.html>


More information about the x265-devel mailing list