[x265-commits] [x265] asm: active x264 ssim code

Min Chen chenm003 at 163.com
Tue Nov 26 18:18:52 CET 2013


details:   http://hg.videolan.org/x265/rev/63a87d839bed
branches:  
changeset: 5315:63a87d839bed
user:      Min Chen <chenm003 at 163.com>
date:      Tue Nov 26 16:47:55 2013 +0800
description:
asm: active x264 ssim code

Side effect:
Remove ssim_t to avoid a conflict with the x264 asm code:
x265 used int64_t for ssim_t when HIGH_BIT_DEPTH is enabled, but x264 always uses 'int'.
Subject: [x265] fix: in 14bpp mode, maximum shift is 10

details:   http://hg.videolan.org/x265/rev/7b48cda38797
branches:  
changeset: 5316:7b48cda38797
user:      Min Chen <chenm003 at 163.com>
date:      Tue Nov 26 16:50:15 2013 +0800
description:
fix: in 14bpp mode, maximum shift is 10
Subject: [x265] asm: removed unused code in pixel_var module

details:   http://hg.videolan.org/x265/rev/a903be46b40d
branches:  
changeset: 5317:a903be46b40d
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Tue Nov 26 16:01:22 2013 +0550
description:
asm: removed unused code in pixel_var module
Subject: [x265] asm: code for pixel_sse_sp_16xN

details:   http://hg.videolan.org/x265/rev/3791482e68f5
branches:  
changeset: 5318:3791482e68f5
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Tue Nov 26 20:51:28 2013 +0550
description:
asm: code for pixel_sse_sp_16xN
Subject: [x265] asm: assembly code for intra_pred_planar[8x8]

details:   http://hg.videolan.org/x265/rev/13fe158374e7
branches:  
changeset: 5319:13fe158374e7
user:      Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
date:      Tue Nov 26 19:13:32 2013 +0550
description:
asm: assembly code for intra_pred_planar[8x8]
Subject: [x265] asm: Correct number of xmm registers for weight_sp routine.

details:   http://hg.videolan.org/x265/rev/8bc370263676
branches:  
changeset: 5320:8bc370263676
user:      Nabajit Deka <nabajit at multicorewareinc.com>
date:      Tue Nov 26 20:08:02 2013 +0550
description:
asm: Correct number of xmm registers for weight_sp routine.
Subject: [x265] asm: assembly code for dct4x4

details:   http://hg.videolan.org/x265/rev/e9ac715f16cf
branches:  
changeset: 5321:e9ac715f16cf
user:      Nabajit Deka <nabajit at multicorewareinc.com>
date:      Tue Nov 26 22:03:47 2013 +0550
description:
asm: assembly code for dct4x4
Subject: [x265] Adding constant table used for dct4

details:   http://hg.videolan.org/x265/rev/78431cd16bb5
branches:  
changeset: 5322:78431cd16bb5
user:      Nabajit Deka <nabajit at multicorewareinc.com>
date:      Tue Nov 26 22:05:43 2013 +0550
description:
Adding constant table used for dct4
Subject: [x265] Adding dct8.asm and dct8.h to CMakeLists

details:   http://hg.videolan.org/x265/rev/51b6d0c6ecf5
branches:  
changeset: 5323:51b6d0c6ecf5
user:      Nabajit Deka <nabajit at multicorewareinc.com>
date:      Tue Nov 26 22:07:16 2013 +0550
description:
Adding dct8.asm and dct8.h to CMakeLists
Subject: [x265] Enable the new dct4 asm routine.

details:   http://hg.videolan.org/x265/rev/5e3b1d59d8dd
branches:  
changeset: 5324:5e3b1d59d8dd
user:      Nabajit Deka <nabajit at multicorewareinc.com>
date:      Tue Nov 26 22:10:27 2013 +0550
description:
Enable the new dct4 asm routine.
Subject: [x265] Merge

details:   http://hg.videolan.org/x265/rev/ba8e95798860
branches:  
changeset: 5325:ba8e95798860
user:      Steve Borho <steve at borho.org>
date:      Tue Nov 26 10:46:10 2013 -0600
description:
Merge
Subject: [x265] vec: remove dct4 intrinsic primitive

details:   http://hg.videolan.org/x265/rev/543390a8644c
branches:  
changeset: 5326:543390a8644c
user:      Steve Borho <steve at borho.org>
date:      Tue Nov 26 10:47:07 2013 -0600
description:
vec: remove dct4 intrinsic primitive

diffstat:

 source/common/CMakeLists.txt         |    4 +-
 source/common/dct.cpp                |    2 +-
 source/common/pixel.cpp              |   62 ++--
 source/common/primitives.h           |    8 +-
 source/common/vec/dct-sse3.cpp       |   82 ------
 source/common/vec/pixel-sse41.cpp    |   88 ------
 source/common/x86/asm-primitives.cpp |   26 +-
 source/common/x86/const-a.asm        |    1 +
 source/common/x86/dct8.asm           |  130 ++++++++++
 source/common/x86/dct8.h             |   29 ++
 source/common/x86/intrapred.asm      |   68 +++++
 source/common/x86/intrapred.h        |    1 +
 source/common/x86/pixel-a.asm        |  444 +++++++++++++++-------------------
 source/common/x86/pixel-util.asm     |    4 +-
 source/common/x86/pixel.h            |   16 +-
 source/encoder/framefilter.cpp       |    8 +-
 source/encoder/ratecontrol.cpp       |    4 +-
 source/test/pixelharness.cpp         |  107 +++++++-
 source/test/pixelharness.h           |    2 +
 source/x265.h                        |   15 +-
 20 files changed, 607 insertions(+), 494 deletions(-)

diffs (truncated from 1510 to 300 lines):

diff -r 116d91f08fcb -r 543390a8644c source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Tue Nov 26 14:19:27 2013 +0800
+++ b/source/common/CMakeLists.txt	Tue Nov 26 10:47:07 2013 -0600
@@ -112,9 +112,9 @@ if(ENABLE_PRIMITIVES_VEC)
 endif(ENABLE_PRIMITIVES_VEC)
 
 if(ENABLE_PRIMITIVES_ASM)
-    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h)
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h)
     set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm
-               blockcopy8.asm intrapred.asm pixeladd8.asm)
+               blockcopy8.asm intrapred.asm pixeladd8.asm dct8.asm)
     if (NOT X64)
         set(A_SRCS ${A_SRCS} pixel-32.asm)
     endif()
diff -r 116d91f08fcb -r 543390a8644c source/common/dct.cpp
--- a/source/common/dct.cpp	Tue Nov 26 14:19:27 2013 +0800
+++ b/source/common/dct.cpp	Tue Nov 26 10:47:07 2013 -0600
@@ -724,7 +724,7 @@ void dequant_normal_c(const int32_t* qua
     // NOTE: maximum of scale is (72 * 256)
     assert(scale < 32768);
     assert((num % 8) == 0);
-    assert(shift <= 6);
+    assert(shift <= 10);
 
     int add, coeffQ;
 
diff -r 116d91f08fcb -r 543390a8644c source/common/pixel.cpp
--- a/source/common/pixel.cpp	Tue Nov 26 14:19:27 2013 +0800
+++ b/source/common/pixel.cpp	Tue Nov 26 10:47:07 2013 -0600
@@ -618,17 +618,17 @@ void frame_init_lowres_core(pixel *src0,
 }
 
 /* structural similarity metric */
-void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, ssim_t sums[2][4])
+void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4])
 {
     for (int z = 0; z < 2; z++)
     {
-        ssim_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
+        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
         for (int y = 0; y < 4; y++)
         {
             for (int x = 0; x < 4; x++)
             {
-                ssim_t a = pix1[x + y * stride1];
-                ssim_t b = pix2[x + y * stride2];
+                int a = pix1[x + y * stride1];
+                int b = pix2[x + y * stride2];
                 s1 += a;
                 s2 += b;
                 ss += a * a;
@@ -646,19 +646,34 @@ void ssim_4x4x2_core(const pixel *pix1, 
     }
 }
 
-float ssim_end_1(ssim_t s1, ssim_t s2, ssim_t ss, ssim_t s12)
+float ssim_end_1(int s1, int s2, int ss, int s12)
 {
-    static const uint32_t pixelMax = (1 << X265_DEPTH) - 1;
-    static const ssim_t ssim_c1 = (ssim_t)(.01 * .01 * pixelMax * pixelMax * 64 + .5);
-    static const ssim_t ssim_c2 = (ssim_t)(.03 * .03 * pixelMax * pixelMax * 64 * 63 + .5);
-    ssim_t vars = ss * 64 - s1 * s1 - s2 * s2;
-    ssim_t covar = s12 * 64 - s1 * s2;
-
-    return (float)(2 * s1 * s2 + ssim_c1) * (float)(2 * covar + ssim_c2)
-           / ((float)(s1 * s1 + s2 * s2 + ssim_c1) * (float)(vars + ssim_c2));
+/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
+ * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
+ * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
+#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
+#if HIGH_BIT_DEPTH
+#define type float
+    static const float ssim_c1 = (float)(.01*.01*PIXEL_MAX*PIXEL_MAX*64);
+    static const float ssim_c2 = (float)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63);
+#else
+#define type int
+    static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5);
+    static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5);
+#endif
+    type fs1 = s1;
+    type fs2 = s2;
+    type fss = ss;
+    type fs12 = s12;
+    type vars = fss*64 - fs1*fs1 - fs2*fs2;
+    type covar = fs12*64 - fs1*fs2;
+    return (float)(2*fs1*fs2 + ssim_c1) * (float)(2*covar + ssim_c2)
+         / ((float)(fs1*fs1 + fs2*fs2 + ssim_c1) * (float)(vars + ssim_c2));
+#undef type
+#undef PIXEL_MAX
 }
 
-float ssim_end_4(ssim_t sum0[5][4], ssim_t sum1[5][4], int width)
+float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
 {
     float ssim = 0.0;
 
@@ -673,14 +688,14 @@ float ssim_end_4(ssim_t sum0[5][4], ssim
     return ssim;
 }
 
-template<int w, int h>
+template<int size>
 uint64_t pixel_var(pixel *pix, intptr_t i_stride)
 {
     uint32_t sum = 0, sqr = 0;
 
-    for (int y = 0; y < h; y++)
+    for (int y = 0; y < size; y++)
     {
-        for (int x = 0; x < w; x++)
+        for (int x = 0; x < size; x++)
         {
             sum += pix[x];
             sqr += pix[x] * pix[x];
@@ -968,17 +983,8 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.ssim_4x4x2_core = ssim_4x4x2_core;
     p.ssim_end_4 = ssim_end_4;
 
-    p.var[LUMA_8x4] = pixel_var<8, 4>;
-    p.var[LUMA_8x8] = pixel_var<8, 8>;
-    p.var[LUMA_8x16] = pixel_var<8, 16>;
-    p.var[LUMA_8x32] = pixel_var<8, 32>;
-    p.var[LUMA_16x4] = pixel_var<16, 4>;
-    p.var[LUMA_16x8] = pixel_var<16, 8>;
-    p.var[LUMA_16x12] = pixel_var<16, 12>;
-    p.var[LUMA_16x16] = pixel_var<16, 16>;
-    p.var[LUMA_16x32] = pixel_var<16, 32>;
-    p.var[LUMA_16x64] = pixel_var<16, 64>;
-
+    p.var[BLOCK_8x8] = pixel_var<8>;
+    p.var[BLOCK_16x16] = pixel_var<16>;
     p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
 }
 }
diff -r 116d91f08fcb -r 543390a8644c source/common/primitives.h
--- a/source/common/primitives.h	Tue Nov 26 14:19:27 2013 +0800
+++ b/source/common/primitives.h	Tue Nov 26 10:47:07 2013 -0600
@@ -50,13 +50,11 @@ typedef uint16_t pixel;
 typedef uint32_t sum_t;
 typedef uint64_t sum2_t;
 typedef uint64_t pixel4;
-typedef int64_t ssim_t;
 #else
 typedef uint8_t pixel;
 typedef uint16_t sum_t;
 typedef uint32_t sum2_t;
 typedef uint32_t pixel4;
-typedef int32_t ssim_t;
 #endif // if HIGH_BIT_DEPTH
 
 namespace x265 {
@@ -187,8 +185,8 @@ typedef void (*scale_t)(pixel *dst, pixe
 typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc,
                             intptr_t src_stride, intptr_t dst_stride, int width, int height);
 typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX);
-typedef void (*ssim_4x4x2_core_t)(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, ssim_t sums[2][4]);
-typedef float (*ssim_end4_t)(ssim_t sum0[5][4], ssim_t sum1[5][4], int width);
+typedef void (*ssim_4x4x2_core_t)(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4]);
+typedef float (*ssim_end4_t)(int sum0[5][4], int sum1[5][4], int width);
 typedef uint64_t (*var_t)(pixel *pix, intptr_t stride);
 typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src,  intptr_t srcStride, int w, int h);
 
@@ -268,7 +266,7 @@ struct EncoderPrimitives
     calcrecon_t     calcrecon[NUM_SQUARE_BLOCKS];
     transpose_t     transpose[NUM_SQUARE_BLOCKS];
 
-    var_t           var[NUM_LUMA_PARTITIONS];
+    var_t           var[NUM_SQUARE_BLOCKS];
     ssim_4x4x2_core_t ssim_4x4x2_core;
     ssim_end4_t     ssim_end_4;
 
diff -r 116d91f08fcb -r 543390a8644c source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp	Tue Nov 26 14:19:27 2013 +0800
+++ b/source/common/vec/dct-sse3.cpp	Tue Nov 26 10:47:07 2013 -0600
@@ -41,87 +41,6 @@ using namespace x265;
 
 namespace {
 #if !HIGH_BIT_DEPTH
-ALIGN_VAR_32(static const int16_t, tab_dct_4[][8]) =
-{
-    { 64, 64, 64, 64, 64, 64, 64, 64 },
-    { 83, 36, 83, 36, 83, 36, 83, 36 },
-    { 64, -64, 64, -64, 64, -64, 64, -64 },
-    { 36, -83, 36, -83, 36, -83, 36, -83 },
-};
-void dct4(int16_t *src, int32_t *dst, intptr_t stride)
-{
-    // Const
-    __m128i c_1         = _mm_set1_epi32(1);
-    __m128i c_128       = _mm_set1_epi32(128);
-
-    __m128i T20, T21;
-    __m128i T30, T31, T32, T33;
-    __m128i T40, T41, T50, T51, T60, T61, T62, T63, T70, T71, T72, T73;
-    __m128i T50_, T51_;
-
-    __m128i T10  = _mm_loadl_epi64((__m128i*)&src[0 * stride]);
-    __m128i T11  = _mm_loadl_epi64((__m128i*)&src[1 * stride]);
-    __m128i T12  = _mm_loadl_epi64((__m128i*)&src[2 * stride]);
-    __m128i T13  = _mm_loadl_epi64((__m128i*)&src[3 * stride]);
-
-    T20  = _mm_unpacklo_epi64(T10, T11);
-    T21  = _mm_unpacklo_epi64(T12, T13);
-
-    // DCT1
-    T30  = _mm_shuffle_epi32(T20, 0xD8);        // [13 12 03 02 11 10 01 00]
-    T31  = _mm_shuffle_epi32(T21, 0xD8);        // [33 32 23 22 31 30 21 20]
-    T32  = _mm_shufflehi_epi16(T30, 0xB1);      // [12 13 02 03 11 10 01 00]
-    T33  = _mm_shufflehi_epi16(T31, 0xB1);      // [32 33 22 23 31 30 21 20]
-
-    T40  = _mm_unpacklo_epi64(T32, T33);        // [31 30 21 20 11 10 01 00]
-    T41  = _mm_unpackhi_epi64(T32, T33);        // [32 33 22 23 12 13 02 03]
-    T50  = _mm_add_epi16(T40, T41);             // [1+2 0+3]
-    T51  = _mm_sub_epi16(T40, T41);             // [1-2 0-3]
-    T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_4[0])); // [ 64*s12 + 64*s03] = [03 02 01 00]
-    T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_4[1])); // [ 36*d12 + 83*d03] = [13 12 11 10]
-    T62  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_4[2])); // [-64*s12 + 64*s03] = [23 22 21 20]
-    T63  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_4[3])); // [-83*d12 + 36*d03] = [33 32 31 30]
-    T70  = _mm_srai_epi32(_mm_add_epi32(T60, c_1), 1);  // [30 20 10 00]
-    T71  = _mm_srai_epi32(_mm_add_epi32(T61, c_1), 1);  // [31 21 11 01]
-    T72  = _mm_srai_epi32(_mm_add_epi32(T62, c_1), 1);  // [32 22 12 02]
-    T73  = _mm_srai_epi32(_mm_add_epi32(T63, c_1), 1);  // [33 23 13 03]
-
-    // Transpose
-    T20  = _mm_packs_epi32(T70, T71);       // [13 12 11 10 03 02 01 00]
-    T21  = _mm_packs_epi32(T72, T73);       // [33 32 31 30 23 22 21 20]
-
-    T30  = _mm_shuffle_epi32(T20, 0xD8);        // [13 12 03 02 11 10 01 00]
-    T31  = _mm_shuffle_epi32(T21, 0xD8);        // [33 32 23 22 31 30 21 20]
-    T32  = _mm_shufflehi_epi16(T30, 0xB1);      // [12 13 02 03 11 10 01 00]
-    T33  = _mm_shufflehi_epi16(T31, 0xB1);      // [32 33 22 23 31 30 21 20]
-
-    T40  = _mm_unpacklo_epi64(T32, T33);        // [31 30 21 20 11 10 01 00]
-    T41  = _mm_unpackhi_epi64(T32, T33);        // [32 33 22 23 12 13 02 03]
-
-    T50_ = _mm_madd_epi16(T40, _mm_load_si128((__m128i*)tab_dct_4[0]));
-    T51_ = _mm_madd_epi16(T41, _mm_load_si128((__m128i*)tab_dct_4[0]));
-    T60  = _mm_add_epi32(T50_, T51_);
-    T50_ = _mm_madd_epi16(T40, _mm_load_si128((__m128i*)tab_dct_4[1]));
-    T51_ = _mm_madd_epi16(T41, _mm_load_si128((__m128i*)tab_dct_4[1]));
-    T61  = _mm_sub_epi32(T50_, T51_);
-    T50_ = _mm_madd_epi16(T40, _mm_load_si128((__m128i*)tab_dct_4[2]));
-    T51_ = _mm_madd_epi16(T41, _mm_load_si128((__m128i*)tab_dct_4[2]));
-    T62  = _mm_add_epi32(T50_, T51_);
-    T50_ = _mm_madd_epi16(T40, _mm_load_si128((__m128i*)tab_dct_4[3]));
-    T51_ = _mm_madd_epi16(T41, _mm_load_si128((__m128i*)tab_dct_4[3]));
-    T63  = _mm_sub_epi32(T50_, T51_);
-
-    T70  = _mm_srai_epi32(_mm_add_epi32(T60, c_128), 8);  // [30 20 10 00]
-    T71  = _mm_srai_epi32(_mm_add_epi32(T61, c_128), 8);  // [31 21 11 01]
-    T72  = _mm_srai_epi32(_mm_add_epi32(T62, c_128), 8);  // [32 22 12 02]
-    T73  = _mm_srai_epi32(_mm_add_epi32(T63, c_128), 8);  // [33 23 13 03]
-
-    _mm_storeu_si128((__m128i*)&dst[0 * 4], T70);
-    _mm_storeu_si128((__m128i*)&dst[1 * 4], T71);
-    _mm_storeu_si128((__m128i*)&dst[2 * 4], T72);
-    _mm_storeu_si128((__m128i*)&dst[3 * 4], T73);
-}
-
 ALIGN_VAR_32(static const int16_t, tab_idct_4x4[4][8]) =
 {
     { 64,  64, 64,  64, 64,  64, 64,  64 },
@@ -1737,7 +1656,6 @@ namespace x265 {
 void Setup_Vec_DCTPrimitives_sse3(EncoderPrimitives &p)
 {
 #if !HIGH_BIT_DEPTH
-    p.dct[DCT_4x4] = dct4;
     p.idct[IDCT_4x4] = idct4;
     p.idct[IDCT_8x8] = idct8;
     p.idct[IDCT_16x16] = idct16;
diff -r 116d91f08fcb -r 543390a8644c source/common/vec/pixel-sse41.cpp
--- a/source/common/vec/pixel-sse41.cpp	Tue Nov 26 14:19:27 2013 +0800
+++ b/source/common/vec/pixel-sse41.cpp	Tue Nov 26 10:47:07 2013 -0600
@@ -33,94 +33,6 @@ using namespace x265;
 
 namespace {
 #if !HIGH_BIT_DEPTH
-void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
-{
-    __m128i w00, roundoff, ofs, fs, tmpsrc, tmpdst, tmp, sign;
-    int x, y;
-
-    w00 = _mm_set1_epi32(w0);
-    ofs = _mm_set1_epi32(IF_INTERNAL_OFFS);
-    fs = _mm_set1_epi32(offset);
-    roundoff = _mm_set1_epi32(round);
-    for (y = height - 1; y >= 0; y--)
-    {
-        for (x = 0; x <= width - 4; x += 4)
-        {
-            tmpsrc = _mm_loadl_epi64((__m128i*)(src + x));
-            sign = _mm_srai_epi16(tmpsrc, 15);
-            tmpsrc = _mm_unpacklo_epi16(tmpsrc, sign);
-            tmpdst = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_mullo_epi32(w00, _mm_add_epi32(tmpsrc, ofs)), roundoff), shift), fs);
-            *(uint32_t*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(tmpdst, tmpdst), _mm_setzero_si128()));
-        }
-
-        if (width > x)
-        {


More information about the x265-commits mailing list