[x264-devel] [Git][videolan/x264][master] aarch64: Use rounded right shifts in dequant
Martin Storsjö (@mstorsjo)
gitlab at videolan.org
Thu Nov 2 21:30:26 UTC 2023
Martin Storsjö pushed to branch master at VideoLAN / x264
Commits:
dc755eab by Martin Storsjö at 2023-11-02T21:26:03+00:00
aarch64: Use rounded right shifts in dequant
Don't manually add in the rounding constant (via a multiply-accumulate
instruction) when we can just do a plain rounded right shift.
Cortex A53 A72 A73
8bpc:
Before:
dequant_4x4_cqm_neon: 515 246 267
dequant_4x4_dc_cqm_neon: 410 265 266
dequant_4x4_dc_flat_neon: 413 271 271
dequant_4x4_flat_neon: 519 254 274
dequant_8x8_cqm_neon: 1555 980 1002
dequant_8x8_flat_neon: 1562 994 1014
After:
dequant_4x4_cqm_neon: 499 246 255
dequant_4x4_dc_cqm_neon: 376 265 255
dequant_4x4_dc_flat_neon: 378 271 260
dequant_4x4_flat_neon: 500 254 262
dequant_8x8_cqm_neon: 1489 900 925
dequant_8x8_flat_neon: 1493 915 938
10bpc:
Before:
dequant_4x4_cqm_neon: 483 275 275
dequant_4x4_dc_cqm_neon: 429 256 261
dequant_4x4_dc_flat_neon: 435 267 267
dequant_4x4_flat_neon: 487 283 288
dequant_8x8_cqm_neon: 1511 1112 1076
dequant_8x8_flat_neon: 1518 1139 1089
After:
dequant_4x4_cqm_neon: 472 255 239
dequant_4x4_dc_cqm_neon: 404 256 232
dequant_4x4_dc_flat_neon: 406 267 234
dequant_4x4_flat_neon: 472 255 239
dequant_8x8_cqm_neon: 1462 922 978
dequant_8x8_flat_neon: 1462 922 978
This makes it around 3% faster on the Cortex A53, around 8% faster
for 8bpc on Cortex A72/A73, and around 10-20% faster for 10bpc
on A72/A73.
- - - - -
1 changed file:
- common/aarch64/quant-a.S
Changes:
=====================================
common/aarch64/quant-a.S
=====================================
@@ -510,10 +510,6 @@ dequant_\size\()_lshift_loop:
dequant_\size\()_rshift:
dup v31.4s, w3
- neg w3, w3
- mov w5, #1
- sub w3, w3, #1
- lsl w5, w5, w3
.ifc \size, 8x8
dequant_\size\()_rshift_loop:
@@ -523,24 +519,20 @@ dequant_\size\()_rshift_loop:
ld1 {v17.4s}, [x1], #16
sqxtn v2.4h, v16.4s
ld1 {v18.4s}, [x1], #16
- dup v16.4s, w5
sqxtn2 v2.8h, v17.4s
ld1 {v19.4s}, [x1], #16
- dup v17.4s, w5
sqxtn v3.4h, v18.4s
ld1 {v0.8h,v1.8h}, [x0]
- dup v18.4s, w5
sqxtn2 v3.8h, v19.4s
- dup v19.4s, w5
- smlal v16.4s, v0.4h, v2.4h
- smlal2 v17.4s, v0.8h, v2.8h
- smlal v18.4s, v1.4h, v3.4h
- smlal2 v19.4s, v1.8h, v3.8h
- sshl v16.4s, v16.4s, v31.4s
- sshl v17.4s, v17.4s, v31.4s
- sshl v18.4s, v18.4s, v31.4s
- sshl v19.4s, v19.4s, v31.4s
+ smull v16.4s, v0.4h, v2.4h
+ smull2 v17.4s, v0.8h, v2.8h
+ smull v18.4s, v1.4h, v3.4h
+ smull2 v19.4s, v1.8h, v3.8h
+ srshl v16.4s, v16.4s, v31.4s
+ srshl v17.4s, v17.4s, v31.4s
+ srshl v18.4s, v18.4s, v31.4s
+ srshl v19.4s, v19.4s, v31.4s
sqxtn v0.4h, v16.4s
sqxtn2 v0.8h, v17.4s
@@ -574,25 +566,17 @@ function dequant_4x4_dc_neon, export=1
dequant_4x4_dc_rshift:
dup v4.8h, w1
dup v3.4s, w3
- neg w3, w3
- mov w5, #1
- sub w3, w3, #1
- lsl w5, w5, w3
- dup v16.4s, w5
- dup v17.4s, w5
ld1 {v0.8h,v1.8h}, [x0]
- dup v18.4s, w5
- dup v19.4s, w5
-
- smlal v16.4s, v0.4h, v4.4h
- smlal2 v17.4s, v0.8h, v4.8h
- smlal v18.4s, v1.4h, v4.4h
- smlal2 v19.4s, v1.8h, v4.8h
- sshl v16.4s, v16.4s, v3.4s
- sshl v17.4s, v17.4s, v3.4s
- sshl v18.4s, v18.4s, v3.4s
- sshl v19.4s, v19.4s, v3.4s
+
+ smull v16.4s, v0.4h, v4.4h
+ smull2 v17.4s, v0.8h, v4.8h
+ smull v18.4s, v1.4h, v4.4h
+ smull2 v19.4s, v1.8h, v4.8h
+ srshl v16.4s, v16.4s, v3.4s
+ srshl v17.4s, v17.4s, v3.4s
+ srshl v18.4s, v18.4s, v3.4s
+ srshl v19.4s, v19.4s, v3.4s
sqxtn v0.4h, v16.4s
sqxtn2 v0.8h, v17.4s
@@ -973,10 +957,6 @@ dequant_\size\()_lshift_loop:
dequant_\size\()_rshift:
dup v31.4s, w3
- neg w3, w3
- mov w5, #1
- sub w3, w3, #1
- lsl w5, w5, w3
.ifc \size, 8x8
dequant_\size\()_rshift_loop:
@@ -985,20 +965,15 @@ dequant_\size\()_rshift_loop:
ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0]
- dup v20.4s, w5
- dup v21.4s, w5
- dup v22.4s, w5
- dup v23.4s, w5
-
- mla v20.4s, v0.4s, v16.4s
- mla v21.4s, v1.4s, v17.4s
- mla v22.4s, v2.4s, v18.4s
- mla v23.4s, v3.4s, v19.4s
+ mul v20.4s, v0.4s, v16.4s
+ mul v21.4s, v1.4s, v17.4s
+ mul v22.4s, v2.4s, v18.4s
+ mul v23.4s, v3.4s, v19.4s
- sshl v16.4s, v20.4s, v31.4s
- sshl v17.4s, v21.4s, v31.4s
- sshl v18.4s, v22.4s, v31.4s
- sshl v19.4s, v23.4s, v31.4s
+ srshl v16.4s, v20.4s, v31.4s
+ srshl v17.4s, v21.4s, v31.4s
+ srshl v18.4s, v22.4s, v31.4s
+ srshl v19.4s, v23.4s, v31.4s
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
.ifc \size, 8x8
@@ -1031,28 +1006,17 @@ dequant_4x4_dc_rshift:
dup v31.4s, w1
dup v30.4s, w3
- neg w3, w3
- mov w5, #1
- sub w3, w3, #1
- lsl w5, w5, w3
-
- dup v16.4s, w5
- dup v17.4s, w5
-
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0]
- dup v18.4s, w5
- dup v19.4s, w5
-
- mla v16.4s, v0.4s, v31.4s
- mla v17.4s, v1.4s, v31.4s
- mla v18.4s, v2.4s, v31.4s
- mla v19.4s, v3.4s, v31.4s
+ mul v16.4s, v0.4s, v31.4s
+ mul v17.4s, v1.4s, v31.4s
+ mul v18.4s, v2.4s, v31.4s
+ mul v19.4s, v3.4s, v31.4s
- sshl v16.4s, v16.4s, v30.4s
- sshl v17.4s, v17.4s, v30.4s
- sshl v18.4s, v18.4s, v30.4s
- sshl v19.4s, v19.4s, v30.4s
+ srshl v16.4s, v16.4s, v30.4s
+ srshl v17.4s, v17.4s, v30.4s
+ srshl v18.4s, v18.4s, v30.4s
+ srshl v19.4s, v19.4s, v30.4s
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0]
ret
View it on GitLab: https://code.videolan.org/videolan/x264/-/commit/dc755eabb9914e29df004243bee95013487753c3
--
View it on GitLab: https://code.videolan.org/videolan/x264/-/commit/dc755eabb9914e29df004243bee95013487753c3
You're receiving this email because of your account on code.videolan.org.
VideoLAN code repository instance
More information about the x264-devel
mailing list