[x265] [arm64] port addAvg

chen chenm003 at 163.com
Tue Jul 27 14:58:33 UTC 2021


Hi,


I just have a few comments.


+.macro addAvg_start

+    lsl             x3, x3, #1

+    lsl             x4, x4, #1

+    movrel          x11, addAvg_offset

+    ld1             {v30.8h}, [x11]
All of the values in addAvg_offset are 0x40, so why not use DUP instead of loading them from memory?



+    add             v0.8h, v0.8h, v1.8h

+    saddl           v16.4s, v0.4h, v30.4h
Using v0 immediately after the instruction that writes it may cause a pipeline stall.



+    saddl2          v17.4s, v0.8h, v30.8h

+    add             v2.8h, v2.8h, v3.8h

+    saddl           v18.4s, v2.4h, v30.4h

+    saddl2          v19.4s, v2.8h, v30.8h




At 2021-07-27 09:01:32, "Pop, Sebastian" <spop at amazon.com> wrote:

Hi,

the attached patch ports to arm64 the following kernels:

 

             addAvg[  4x4]  22.03x   9.87            217.35

             addAvg[  8x8]  41.06x   21.01           862.77

     [i420]  addAvg[  4x4]  21.07x   10.31           217.20

     [i422]  addAvg[  4x8]  23.19x   17.87           414.44

             addAvg[  8x4]  35.10x   12.46           437.40

     [i420]  addAvg[  4x2]  13.23x   8.01            105.94

             addAvg[  4x8]  23.17x   17.89           414.54

             addAvg[16x16]  50.38x   63.28           3187.50

     [i420]  addAvg[  8x8]  38.47x   21.93           843.59

     [i422]  addAvg[ 8x16]  44.45x   38.55           1713.69

             addAvg[ 16x8]  47.63x   33.70           1605.09

     [i420]  addAvg[  8x4]  34.13x   12.86           439.01

     [i422]  addAvg[  8x8]  39.22x   21.87           857.94

             addAvg[ 8x16]  42.08x   40.88           1720.30

     [i420]  addAvg[  4x8]  23.03x   17.93           413.10

     [i422]  addAvg[ 4x16]  24.58x   32.44           797.45

             addAvg[ 16x4]  44.62x   18.13           809.08

     [i420]  addAvg[  8x2]  28.08x   8.17            229.29

     [i422]  addAvg[  8x4]  34.00x   12.82           435.82

             addAvg[16x12]  50.69x   48.05           2435.74

     [i420]  addAvg[  8x6]  38.48x   17.07           656.91

     [i422]  addAvg[ 8x12]  42.95x   30.00           1288.53

             addAvg[ 4x16]  25.31x   31.73           802.95

             addAvg[12x16]  35.76x   67.70           2421.01

     [i420]  addAvg[  6x8]  19.93x   30.26           603.15

     [i422]  addAvg[ 6x16]  20.47x   57.31           1172.97

             addAvg[32x32]  48.23x   254.84          12291.57

     [i420]  addAvg[16x16]  49.59x   63.82           3164.65

     [i422]  addAvg[16x32]  51.79x   123.15          6377.69

             addAvg[32x16]  49.46x   128.27          6343.50

     [i420]  addAvg[ 16x8]  48.03x   33.75           1620.91

     [i422]  addAvg[16x16]  50.35x   62.86           3164.73

             addAvg[16x32]  51.75x   122.50          6339.62

     [i420]  addAvg[ 8x16]  43.78x   38.62           1690.74

     [i422]  addAvg[ 8x32]  45.53x   72.44           3298.22

             addAvg[ 32x8]  47.93x   65.87           3156.92

     [i420]  addAvg[ 16x4]  43.43x   18.64           809.56

     [i422]  addAvg[ 16x8]  47.47x   33.64           1596.84

             addAvg[32x24]  49.16x   191.04          9392.00

     [i420]  addAvg[16x12]  49.27x   48.68           2398.20

     [i422]  addAvg[16x24]  50.96x   93.21           4750.37

             addAvg[ 8x32]  45.61x   72.32           3298.91

     [i420]  addAvg[ 4x16]  24.65x   32.30           796.37

     [i422]  addAvg[ 4x32]  25.97x   60.57           1572.78

             addAvg[24x32]  46.28x   204.88          9481.85

     [i420]  addAvg[12x16]  35.58x   68.07           2422.33

     [i422]  addAvg[12x32]  37.35x   130.66          4879.55

             addAvg[64x64]  45.30x   1066.50         48309.83

     [i420]  addAvg[32x32]  48.17x   255.22          12293.77

     [i422]  addAvg[32x64]  48.67x   505.28          24591.01

             addAvg[64x32]  45.22x   535.51          24215.25

     [i420]  addAvg[32x16]  48.63x   130.26          6334.18

     [i422]  addAvg[32x32]  48.33x   255.33          12341.31

             addAvg[32x64]  48.88x   504.10          24641.61

     [i420]  addAvg[16x32]  51.87x   123.09          6384.44

     [i422]  addAvg[16x64]  53.21x   242.70          12914.20

             addAvg[64x16]  44.87x   270.22          12125.58

     [i420]  addAvg[ 32x8]  46.57x   66.57           3100.05

     [i422]  addAvg[32x16]  48.76x   129.97          6336.97

             addAvg[64x48]  46.57x   800.90          37301.68

     [i420]  addAvg[32x24]  49.21x   192.49          9473.39

     [i422]  addAvg[32x48]  49.02x   379.97          18627.41

             addAvg[16x64]  53.24x   242.72          12922.55

     [i420]  addAvg[ 8x32]  44.63x   74.53           3326.18

     [i422]  addAvg[ 8x64]  48.12x   138.94          6686.57

             addAvg[48x64]  47.97x   754.41          36187.82

     [i420]  addAvg[24x32]  45.60x   205.26          9360.26

     [i422]  addAvg[24x64]  45.69x   408.96          18684.47

 

Ok to commit?

 

Thanks,

Sebastian

 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20210727/a55c835e/attachment-0001.html>


More information about the x265-devel mailing list