[x265] [PATCH] AArch64: Fixed the inconsistent bitstreams for GCC and CLANG

Fri Sep 6 10:45:47 UTC 2024

Starting from b3ce1c32da303c92241e85bd69298ab9903c0126,
We observed that the video streams generated by x265
compiled with CLANG and GCC, using the same parameters
and input video, are inconsistent.

Encoding logs for CLANG compiled x265:
x265 [info]: frame I:      2, Avg QP:37.79  kb/s: 72220.75
x265 [info]: frame P:     20, Avg QP:37.75  kb/s: 74146.30
x265 [info]: frame B:     76, Avg QP:39.75  kb/s: 67231.66

Encoding logs for GCC complied x265:
x265 [info]: frame I:      2, Avg QP:37.79  kb/s: 72220.75
x265 [info]: frame P:     20, Avg QP:37.75  kb/s: 74022.17
x265 [info]: frame B:     76, Avg QP:39.75  kb/s: 67220.26

This because a macro LOAD_DIFF_16x4_sve2 was introduced in
commit b3ce1c32da303c92241e85bd69298ab9903c0126 for AArch64
(This macro has been renamed to LOAD_DIFF_16x4_sve now) and
is used by SATD kernels. The macro LOAD_DIFF_16x4_sve uses
unprotected callee-saved SVE registers z8 to z15.

Unfortunately, in the caller function MotionEstimate of SATD,
the CLANG generates some code to use s10. Here is a simplified
code snippet:

	fmov  s10, #0.50000000
	bl subpelCompare -> will invoke SATD
	fadd  s0, s0, s10

Thus, the content of s10 used in MotionEstimate is overwritten
by the SATD function, which affects the correctness of subsequent
video encoding.

But we did not observe similar code in GCC to access s8 - s15
in MotionEstimate. At the same time, we can influence CLANG by
modifying -mpu=neoverse-N2, in this case CLANG will not use s10
in MotionEstimate, thereby making the video stream consistent.
However, such compiler dependent behavior is unreliable.

Therefore, in this patch, we avoid using z8 to z15 and extra stack
operations by reusing z0 to z7 in LOAD_DIFF_16x4_sve. Here are the
test results after applying the patch:

Encoding logs for CLANG compiled x265:
x265 [info]: frame I:      2, Avg QP:37.79  kb/s: 72220.75
x265 [info]: frame P:     20, Avg QP:37.75  kb/s: 74022.17
x265 [info]: frame B:     76, Avg QP:39.75  kb/s: 67220.26

Encoding logs for GCC compiled x265:
x265 [info]: frame I:      2, Avg QP:37.79  kb/s: 72220.75
x265 [info]: frame P:     20, Avg QP:37.75  kb/s: 74022.17
x265 [info]: frame B:     76, Avg QP:39.75  kb/s: 67220.26

Change-Id: Ic0e0c0706b99e53b138cceb758ee6ce148130e4b
---
  source/common/aarch64/pixel-util-sve.S | 34 +++++++++++++-------------
  1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/source/common/aarch64/pixel-util-sve.S 
b/source/common/aarch64/pixel-util-sve.S
index 3d073d42e..106ba903a 100644
--- a/source/common/aarch64/pixel-util-sve.S
+++ b/source/common/aarch64/pixel-util-sve.S
@@ -190,27 +190,27 @@ endfunc
      ld1b            {z7.h}, p0/z, [x2, x11]
      add             x0, x0, x1
      add             x2, x2, x3
-    ld1b            {z29.h}, p0/z, [x0]
-    ld1b            {z9.h}, p0/z, [x0, x11]
-    ld1b            {z10.h}, p0/z, [x2]
-    ld1b            {z11.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z12.h}, p0/z, [x0]
-    ld1b            {z13.h}, p0/z, [x0, x11]
-    ld1b            {z14.h}, p0/z, [x2]
-    ld1b            {z15.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-
      sub             \v0\().h, z0.h, z2.h
      sub             \v4\().h, z1.h, z3.h
      sub             \v1\().h, z4.h, z6.h
      sub             \v5\().h, z5.h, z7.h
-    sub             \v2\().h, z29.h, z10.h
-    sub             \v6\().h, z9.h, z11.h
-    sub             \v3\().h, z12.h, z14.h
-    sub             \v7\().h, z13.h, z15.h
+
+    ld1b            {z0.h}, p0/z, [x0]
+    ld1b            {z1.h}, p0/z, [x0, x11]
+    ld1b            {z2.h}, p0/z, [x2]
+    ld1b            {z3.h}, p0/z, [x2, x11]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z4.h}, p0/z, [x0]
+    ld1b            {z5.h}, p0/z, [x0, x11]
+    ld1b            {z6.h}, p0/z, [x2]
+    ld1b            {z7.h}, p0/z, [x2, x11]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             \v2\().h, z0.h, z2.h
+    sub             \v6\().h, z1.h, z3.h
+    sub             \v3\().h, z4.h, z6.h
+    sub             \v7\().h, z5.h, z7.h
  .endm

  // one vertical hadamard pass and two horizontal
-- 
2.34.1
-------------- next part --------------
From 2b01d1cebd89ebdfe34bc92bd44b444d83594110 Mon Sep 17 00:00:00 2001
From: Wei Chen <Wei.Chen at arm.com>
Date: Thu, 5 Sep 2024 11:14:39 +0800
Subject: [PATCH] AArch64: Fixed the inconsistent bitstreams for GCC and CLANG

Starting from b3ce1c32da303c92241e85bd69298ab9903c0126,
We observed that the video streams generated by x265
compiled with CLANG and GCC, using the same parameters
and input video, are inconsistent.

Encoding logs for CLANG compiled x265:
x265 [info]: frame I:      2, Avg QP:37.79  kb/s: 72220.75
x265 [info]: frame P:     20, Avg QP:37.75  kb/s: 74146.30
x265 [info]: frame B:     76, Avg QP:39.75  kb/s: 67231.66

Encoding logs for GCC complied x265:
x265 [info]: frame I:      2, Avg QP:37.79  kb/s: 72220.75
x265 [info]: frame P:     20, Avg QP:37.75  kb/s: 74022.17
x265 [info]: frame B:     76, Avg QP:39.75  kb/s: 67220.26

This because a macro LOAD_DIFF_16x4_sve2 was introduced in
commit b3ce1c32da303c92241e85bd69298ab9903c0126 for AArch64
(This macro has been renamed to LOAD_DIFF_16x4_sve now) and
is used by SATD kernels. The macro LOAD_DIFF_16x4_sve uses
unprotected callee-saved SVE registers z8 to z15.

Unfortunately, in the caller function MotionEstimate of SATD,
the CLANG generates some code to use s10. Here is a simplified
code snippet:

	fmov  s10, #0.50000000
	bl subpelCompare -> will invoke SATD
	fadd  s0, s0, s10

Thus, the content of s10 used in MotionEstimate is overwritten
by the SATD function, which affects the correctness of subsequent
video encoding.

But we did not observe similar code in GCC to access s8 - s15
in MotionEstimate. At the same time, we can influence CLANG by
modifying -mpu=neoverse-N2, in this case CLANG will not use s10
in MotionEstimate, thereby making the video stream consistent.
However, such compiler dependent behavior is unreliable.

Therefore, in this patch, we avoid using z8 to z15 and extra stack
operations by reusing z0 to z7 in LOAD_DIFF_16x4_sve. Here are the
test results after applying the patch:

Encoding logs for CLANG compiled x265:
x265 [info]: frame I:      2, Avg QP:37.79  kb/s: 72220.75
x265 [info]: frame P:     20, Avg QP:37.75  kb/s: 74022.17
x265 [info]: frame B:     76, Avg QP:39.75  kb/s: 67220.26

Encoding logs for GCC compiled x265:
x265 [info]: frame I:      2, Avg QP:37.79  kb/s: 72220.75
x265 [info]: frame P:     20, Avg QP:37.75  kb/s: 74022.17
x265 [info]: frame B:     76, Avg QP:39.75  kb/s: 67220.26

Change-Id: Ic0e0c0706b99e53b138cceb758ee6ce148130e4b
---
 source/common/aarch64/pixel-util-sve.S | 34 +++++++++++++-------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/source/common/aarch64/pixel-util-sve.S b/source/common/aarch64/pixel-util-sve.S
index 3d073d42e..106ba903a 100644
--- a/source/common/aarch64/pixel-util-sve.S
+++ b/source/common/aarch64/pixel-util-sve.S
@@ -190,27 +190,27 @@ endfunc
     ld1b            {z7.h}, p0/z, [x2, x11]
     add             x0, x0, x1
     add             x2, x2, x3
-    ld1b            {z29.h}, p0/z, [x0]
-    ld1b            {z9.h}, p0/z, [x0, x11]
-    ld1b            {z10.h}, p0/z, [x2]
-    ld1b            {z11.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-    ld1b            {z12.h}, p0/z, [x0]
-    ld1b            {z13.h}, p0/z, [x0, x11]
-    ld1b            {z14.h}, p0/z, [x2]
-    ld1b            {z15.h}, p0/z, [x2, x11]
-    add             x0, x0, x1
-    add             x2, x2, x3
-
     sub             \v0\().h, z0.h, z2.h
     sub             \v4\().h, z1.h, z3.h
     sub             \v1\().h, z4.h, z6.h
     sub             \v5\().h, z5.h, z7.h
-    sub             \v2\().h, z29.h, z10.h
-    sub             \v6\().h, z9.h, z11.h
-    sub             \v3\().h, z12.h, z14.h
-    sub             \v7\().h, z13.h, z15.h
+
+    ld1b            {z0.h}, p0/z, [x0]
+    ld1b            {z1.h}, p0/z, [x0, x11]
+    ld1b            {z2.h}, p0/z, [x2]
+    ld1b            {z3.h}, p0/z, [x2, x11]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    ld1b            {z4.h}, p0/z, [x0]
+    ld1b            {z5.h}, p0/z, [x0, x11]
+    ld1b            {z6.h}, p0/z, [x2]
+    ld1b            {z7.h}, p0/z, [x2, x11]
+    add             x0, x0, x1
+    add             x2, x2, x3
+    sub             \v2\().h, z0.h, z2.h
+    sub             \v6\().h, z1.h, z3.h
+    sub             \v3\().h, z4.h, z6.h
+    sub             \v7\().h, z5.h, z7.h
 .endm
 
 // one vertical hadamard pass and two horizontal
-- 
2.34.1