[x265-commits] [x265] asm: avx2 code for denoiseDct for 10 bpp

Sumalatha at videolan.org Sumalatha at videolan.org
Fri Jun 5 16:23:19 CEST 2015


details:   http://hg.videolan.org/x265/rev/1ea6ca251774
branches:  
changeset: 10569:1ea6ca251774
user:      Sumalatha Polureddy
date:      Thu Jun 04 12:37:07 2015 +0530
description:
asm: avx2 code for denoiseDct for 10 bpp

denoiseDct              13.20x   4118.75         54357.62
Subject: [x265] asm: avx2 code for chroma sse_pp[16x16, 32x32] for i420

details:   http://hg.videolan.org/x265/rev/62adf0ef875f
branches:  
changeset: 10570:62adf0ef875f
user:      Sumalatha Polureddy
date:      Thu Jun 04 15:25:47 2015 +0530
description:
asm: avx2 code for chroma sse_pp[16x16, 32x32] for i420

[i420] sse_pp[16x16]  7.34x    363.66          2668.39
[i420] sse_pp[32x32]  8.76x    1059.07         9281.41
Subject: [x265] asm: chroma_hps[6x8, 6x16] for high bit depth

details:   http://hg.videolan.org/x265/rev/1b7024b575ee
branches:  
changeset: 10571:1b7024b575ee
user:      Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
date:      Thu Jun 04 20:08:24 2015 +0530
description:
asm: chroma_hps[6x8, 6x16] for high bit depth

chroma_hps[ 6x16] - 3.57x   1825c->1716c
chroma_hps[  6x8] - 3.46x   1104c->1005c
Subject: [x265] asm: sse4 version of costCoeffGroupNxN in codeCoeffNxN

details:   http://hg.videolan.org/x265/rev/a15a51503059
branches:  
changeset: 10572:a15a51503059
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jun 04 12:13:05 2015 -0700
description:
asm: sse4 version of costCoeffGroupNxN in codeCoeffNxN
Subject: [x265] asm: reuse costCoeffGroupNxN in 4x4 path

details:   http://hg.videolan.org/x265/rev/5829e02a3dc0
branches:  
changeset: 10573:5829e02a3dc0
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jun 04 12:13:09 2015 -0700
description:
asm: reuse costCoeffGroupNxN in 4x4 path
Subject: [x265] merge ctxIndMap4x4[] into tab_cnt[][]

details:   http://hg.videolan.org/x265/rev/a6fcf4ee8f39
branches:  
changeset: 10574:a6fcf4ee8f39
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jun 04 12:13:12 2015 -0700
description:
merge ctxIndMap4x4[] into tab_cnt[][]
Subject: [x265] merge NxN and 4x4 fast RD path

details:   http://hg.videolan.org/x265/rev/fe5392b12367
branches:  
changeset: 10575:fe5392b12367
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jun 04 12:13:14 2015 -0700
description:
merge NxN and 4x4 fast RD path
Subject: [x265] move tmpCoeff outside fast RD path

details:   http://hg.videolan.org/x265/rev/94814022bb01
branches:  
changeset: 10576:94814022bb01
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jun 04 12:13:17 2015 -0700
description:
move tmpCoeff outside fast RD path
Subject: [x265] asm: AVX2 of sa8d[32x32], 7.6K -> 6.7K cycles

details:   http://hg.videolan.org/x265/rev/3152841223df
branches:  
changeset: 10577:3152841223df
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jun 04 12:13:20 2015 -0700
description:
asm: AVX2 of sa8d[32x32], 7.6K -> 6.7K cycles
Subject: [x265] asm: filterPixelToShort 8-bit and 10-bit sse2

details:   http://hg.videolan.org/x265/rev/c066147e3de4
branches:  
changeset: 10578:c066147e3de4
user:      David T Yuen <dtyx265 at gmail.com>
date:      Thu Jun 04 18:04:06 2015 -0700
description:
asm: filterPixelToShort 8-bit and 10-bit sse2

This replaces the C code for all filterPixelToShort primitives, for both 8-bit and 10-bit.

64-bit

./test/TestBench --testbench interp | grep p2s
convert_p2s[  8x8]		2.82x 	 397.50   	 1119.92
convert_p2s[16x16]		3.04x 	 1347.50  	 4102.63
convert_p2s[32x32]		1.41x 	 5197.50  	 7332.50
convert_p2s[64x64]		1.26x 	 20588.66 	 25962.67
convert_p2s[  8x4]		2.52x 	 229.99   	 580.00
convert_p2s[  4x8]		2.22x 	 279.99   	 622.38
convert_p2s[ 16x8]		2.74x 	 710.00   	 1944.96
convert_p2s[ 8x16]		3.09x 	 730.00   	 2254.97
convert_p2s[32x16]		1.43x 	 2630.20  	 3774.20
convert_p2s[16x32]		3.07x 	 2630.01  	 8064.97
convert_p2s[64x32]		1.28x 	 10307.52 	 13162.50
convert_p2s[32x64]		1.40x 	 10307.60 	 14382.50
convert_p2s[16x12]		3.05x 	 1027.50  	 3134.97
convert_p2s[12x16]		2.78x 	 1115.00  	 3100.73
convert_p2s[ 16x4]		2.56x 	 387.50   	 990.06
convert_p2s[ 4x16]		2.43x 	 490.02   	 1190.04
convert_p2s[32x24]		1.42x 	 3909.22  	 5532.57
convert_p2s[24x32]		3.94x 	 3907.50  	 15387.65
convert_p2s[ 32x8]		1.37x 	 1347.49  	 1848.57
convert_p2s[ 8x32]		3.31x 	 1390.00  	 4596.10
convert_p2s[64x48]		1.27x 	 15455.30 	 19562.58
convert_p2s[48x64]		1.30x 	 15428.06 	 20132.50
convert_p2s[64x16]		1.28x 	 5192.50  	 6669.05
convert_p2s[16x64]		3.05x 	 5197.80  	 15855.29
chroma_p2s[  4x4]		1.89x 	 177.50   	 334.95
chroma_p2s[  8x8]		2.82x 	 397.50   	 1119.83
chroma_p2s[16x16]		3.05x 	 1347.51  	 4105.62
chroma_p2s[32x32]		1.41x 	 5187.50  	 7334.46
chroma_p2s[  4x2]		1.67x 	 115.04   	 192.48
chroma_p2s[  2x4]		1.16x 	 184.99   	 214.98
chroma_p2s[  8x4]		2.55x 	 227.50   	 580.01
chroma_p2s[  4x8]		2.24x 	 277.50   	 622.42
chroma_p2s[ 16x8]		2.75x 	 707.50   	 1945.02
chroma_p2s[ 8x16]		2.92x 	 772.45   	 2254.91
chroma_p2s[32x16]		1.44x 	 2627.50  	 3772.50
chroma_p2s[16x32]		3.07x 	 2627.50  	 8065.07
chroma_p2s[  8x6]		2.74x 	 307.59   	 842.44
chroma_p2s[  6x8]		1.71x 	 507.50   	 870.00
chroma_p2s[  8x2]		2.02x 	 147.50   	 297.50
chroma_p2s[  2x8]		1.12x 	 307.50   	 344.96
chroma_p2s[16x12]		3.05x 	 1027.50  	 3134.97
chroma_p2s[12x16]		2.79x 	 1112.50  	 3100.21
chroma_p2s[ 16x4]		2.56x 	 387.50   	 990.19
chroma_p2s[ 4x16]		2.43x 	 489.99   	 1192.44
chroma_p2s[32x24]		1.42x 	 3907.50  	 5533.21
chroma_p2s[24x32]		3.93x 	 3923.72  	 15427.02
chroma_p2s[ 32x8]		1.37x 	 1347.50  	 1850.34
chroma_p2s[ 8x32]		3.31x 	 1387.50  	 4595.20
chroma_p2s[  4x8]		2.22x 	 277.50   	 617.42
chroma_p2s[ 8x16]		3.09x 	 727.50   	 2247.50
chroma_p2s[16x32]		3.07x 	 2627.50  	 8065.15
chroma_p2s[32x64]		1.40x 	 10307.58 	 14385.50
chroma_p2s[  4x4]		1.87x 	 177.50   	 332.51
chroma_p2s[  2x8]		1.11x 	 307.50   	 342.46
chroma_p2s[  8x8]		2.80x 	 397.50   	 1112.50
chroma_p2s[ 4x16]		2.43x 	 489.99   	 1192.49
chroma_p2s[16x16]		3.05x 	 1347.74  	 4104.98
chroma_p2s[ 8x32]		3.31x 	 1387.50  	 4595.44
chroma_p2s[32x32]		1.41x 	 5197.55  	 7332.50
chroma_p2s[16x64]		3.05x 	 5197.60  	 15855.21
chroma_p2s[ 8x12]		2.92x 	 557.50   	 1627.41
chroma_p2s[ 6x16]		1.79x 	 1002.50  	 1797.47
chroma_p2s[  8x4]		2.55x 	 227.50   	 580.01
chroma_p2s[ 2x16]		1.09x 	 602.49   	 657.49
chroma_p2s[16x24]		3.05x 	 1987.50  	 6054.97
chroma_p2s[12x32]		2.81x 	 2170.00  	 6095.56
chroma_p2s[ 16x8]		2.75x 	 707.50   	 1944.97
chroma_p2s[ 4x32]		3.09x 	 877.50   	 2707.95
chroma_p2s[32x48]		1.40x 	 7757.54  	 10862.72
chroma_p2s[24x64]		3.95x 	 7757.50  	 30663.70
chroma_p2s[32x16]		1.44x 	 2627.50  	 3773.21
chroma_p2s[ 8x64]		3.30x 	 2717.50  	 8955.97
chroma_p2s[  4x4]		1.89x 	 177.50   	 334.94
chroma_p2s[  8x8]		2.82x 	 397.50   	 1119.95
chroma_p2s[16x16]		3.05x 	 1347.50  	 4105.23
chroma_p2s[32x32]		1.41x 	 5197.52  	 7332.50
chroma_p2s[64x64]		1.25x 	 20722.45 	 25962.96
chroma_p2s[  8x4]		2.57x 	 227.50   	 584.01
chroma_p2s[  4x8]		2.23x 	 277.49   	 617.44
chroma_p2s[ 16x8]		2.75x 	 707.57   	 1945.83
chroma_p2s[ 8x16]		3.08x 	 729.99   	 2247.50
chroma_p2s[32x16]		1.44x 	 2627.50  	 3772.50
chroma_p2s[16x32]		3.07x 	 2627.50  	 8064.97
chroma_p2s[64x32]		1.28x 	 10307.86 	 13162.50
chroma_p2s[32x64]		1.40x 	 10307.68 	 14385.40
chroma_p2s[16x12]		3.05x 	 1027.50  	 3135.10
chroma_p2s[12x16]		2.79x 	 1112.50  	 3100.94
chroma_p2s[ 16x4]		2.57x 	 387.50   	 994.88
chroma_p2s[ 4x16]		2.43x 	 489.99   	 1192.44
chroma_p2s[32x24]		1.42x 	 3907.82  	 5532.55
chroma_p2s[24x32]		3.95x 	 3907.50  	 15422.51
chroma_p2s[ 32x8]		1.37x 	 1347.50  	 1849.83
chroma_p2s[ 8x32]		3.31x 	 1387.50  	 4594.97
chroma_p2s[64x48]		1.27x 	 15458.83 	 19562.50
chroma_p2s[48x64]		1.30x 	 15427.81 	 20132.50
chroma_p2s[64x16]		1.28x 	 5187.50  	 6662.50
chroma_p2s[16x64]		3.05x 	 5197.50  	 15855.09

32-bit

./test/TestBench --testbench interp | grep p2s
convert_p2s[  8x8]		2.55x 	 484.99   	 1237.46
convert_p2s[16x16]		2.89x 	 1444.98  	 4174.96
convert_p2s[32x32]		6.70x 	 5295.00  	 35485.83
convert_p2s[64x64]		6.41x 	 20695.04 	 132575.69
convert_p2s[  8x4]		2.06x 	 325.00   	 669.93
convert_p2s[  4x8]		1.93x 	 360.08   	 694.95
convert_p2s[ 16x8]		2.52x 	 805.00   	 2032.47
convert_p2s[ 8x16]		2.83x 	 820.00   	 2319.99
convert_p2s[32x16]		6.55x 	 2725.00  	 17855.75
convert_p2s[16x32]		3.00x 	 2725.00  	 8185.23
convert_p2s[64x32]		6.36x 	 10405.41 	 66205.67
convert_p2s[32x64]		6.82x 	 10404.99 	 70948.87
convert_p2s[16x12]		2.84x 	 1124.99  	 3197.47
convert_p2s[12x16]		2.61x 	 1207.52  	 3155.00
convert_p2s[ 16x4]		2.29x 	 485.00   	 1112.42
convert_p2s[ 4x16]		2.22x 	 564.99   	 1252.46
convert_p2s[32x24]		6.66x 	 4005.00  	 26655.58
convert_p2s[24x32]		6.99x 	 4005.00  	 28003.99
convert_p2s[ 32x8]		6.27x 	 1445.00  	 9054.99
convert_p2s[ 8x32]		3.18x 	 1475.00  	 4687.54
convert_p2s[64x48]		6.40x 	 15525.12 	 99348.35
convert_p2s[48x64]		6.54x 	 15525.42 	 101593.84
convert_p2s[64x16]		6.33x 	 5285.10  	 33457.09
convert_p2s[16x64]		3.01x 	 5295.02  	 15944.96
chroma_p2s[  4x4]		1.64x 	 257.49   	 422.54
chroma_p2s[  8x8]		2.55x 	 484.99   	 1237.48
chroma_p2s[16x16]		2.89x 	 1444.99  	 4174.94
chroma_p2s[32x32]		6.70x 	 5295.02  	 35485.08
chroma_p2s[  4x2]		1.33x 	 212.48   	 282.52
chroma_p2s[  2x4]		0.98x 	 272.49   	 267.50
chroma_p2s[  8x4]		2.06x 	 325.00   	 669.99
chroma_p2s[  4x8]		1.93x 	 360.00   	 694.98
chroma_p2s[ 16x8]		2.52x 	 805.00   	 2032.51
chroma_p2s[ 8x16]		2.83x 	 820.15   	 2319.98
chroma_p2s[32x16]		6.55x 	 2725.00  	 17855.50
chroma_p2s[16x32]		3.00x 	 2725.00  	 8184.98
chroma_p2s[  8x6]		2.33x 	 402.50   	 937.48
chroma_p2s[  6x8]		1.71x 	 585.00   	 1000.03
chroma_p2s[  8x2]		1.62x 	 242.49   	 392.50
chroma_p2s[  2x8]		1.15x 	 375.00   	 432.50
chroma_p2s[16x12]		2.84x 	 1124.99  	 3194.98
chroma_p2s[12x16]		2.63x 	 1200.00  	 3155.18
chroma_p2s[ 16x4]		2.29x 	 485.07   	 1112.51
chroma_p2s[ 4x16]		2.18x 	 562.54   	 1224.98
chroma_p2s[32x24]		6.66x 	 4005.01  	 26660.21
chroma_p2s[24x32]		6.94x 	 4005.05  	 27805.24
chroma_p2s[ 32x8]		6.27x 	 1445.00  	 9054.99
chroma_p2s[ 8x32]		3.18x 	 1475.00  	 4687.52
chroma_p2s[  4x8]		2.01x 	 360.00   	 722.48
chroma_p2s[ 8x16]		2.83x 	 820.00   	 2319.98
chroma_p2s[16x32]		3.00x 	 2725.07  	 8185.31
chroma_p2s[32x64]		6.79x 	 10405.07 	 70686.27
chroma_p2s[  4x4]		1.63x 	 259.98   	 422.48
chroma_p2s[  2x8]		1.12x 	 385.00   	 432.50
chroma_p2s[  8x8]		2.55x 	 485.00   	 1237.48
chroma_p2s[ 4x16]		2.18x 	 562.49   	 1224.98
chroma_p2s[16x16]		2.89x 	 1445.10  	 4174.98
chroma_p2s[ 8x32]		3.18x 	 1475.00  	 4687.56
chroma_p2s[32x32]		6.70x 	 5295.13  	 35485.12
chroma_p2s[16x64]		3.01x 	 5295.00  	 15945.08
chroma_p2s[ 8x12]		2.71x 	 655.09   	 1774.92
chroma_p2s[ 6x16]		1.80x 	 1057.50  	 1899.96
chroma_p2s[  8x4]		2.06x 	 325.00   	 669.99
chroma_p2s[ 2x16]		1.06x 	 659.99   	 697.48
chroma_p2s[16x24]		2.93x 	 2085.00  	 6115.20
chroma_p2s[12x32]		2.72x 	 2250.00  	 6125.25
chroma_p2s[ 16x8]		2.52x 	 805.03   	 2032.57
chroma_p2s[ 4x32]		2.85x 	 957.54   	 2725.27
chroma_p2s[32x48]		6.79x 	 7855.06  	 53302.55
chroma_p2s[24x64]		7.04x 	 7855.01  	 55325.50
chroma_p2s[32x16]		6.55x 	 2725.00  	 17855.12
chroma_p2s[ 8x64]		3.23x 	 2805.32  	 9050.32
chroma_p2s[  4x4]		1.63x 	 259.99   	 422.49
chroma_p2s[  8x8]		2.55x 	 487.49   	 1245.00
chroma_p2s[16x16]		2.89x 	 1445.00  	 4175.20
chroma_p2s[32x32]		6.74x 	 5295.00  	 35670.43
chroma_p2s[64x64]		6.40x 	 20696.68 	 132486.70
chroma_p2s[  8x4]		2.06x 	 325.00   	 669.99
chroma_p2s[  4x8]		1.93x 	 360.03   	 694.99
chroma_p2s[ 16x8]		2.52x 	 805.00   	 2032.49
chroma_p2s[ 8x16]		2.83x 	 820.14   	 2320.16
chroma_p2s[32x16]		6.55x 	 2724.98  	 17855.27
chroma_p2s[16x32]		3.00x 	 2725.10  	 8185.09
chroma_p2s[64x32]		6.39x 	 10405.02 	 66479.45
chroma_p2s[32x64]		6.81x 	 10414.99 	 70945.01
chroma_p2s[16x12]		2.84x 	 1125.00  	 3194.98
chroma_p2s[12x16]		2.63x 	 1200.13  	 3155.00
chroma_p2s[ 16x4]		2.29x 	 485.00   	 1112.48
chroma_p2s[ 4x16]		2.18x 	 562.49   	 1224.98
chroma_p2s[32x24]		6.66x 	 4005.17  	 26655.93
chroma_p2s[24x32]		6.99x 	 4005.01  	 28008.09
chroma_p2s[ 32x8]		6.27x 	 1444.99  	 9060.05
chroma_p2s[ 8x32]		3.18x 	 1474.99  	 4687.52
chroma_p2s[64x48]		6.40x 	 15525.01 	 99400.07
chroma_p2s[48x64]		6.55x 	 15525.09 	 101699.22
chroma_p2s[64x16]		6.28x 	 5285.00  	 33215.04
chroma_p2s[16x64]		3.01x 	 5295.16  	 15944.98

10-bit

./test/TestBench --testbench interp | grep p2s
convert_p2s[  8x8]		3.48x 	 369.50   	 1284.66
convert_p2s[16x16]		3.13x 	 1309.98  	 4095.75
convert_p2s[32x32]		3.63x 	 5150.01  	 18697.50
convert_p2s[64x64]		3.61x 	 20510.21 	 74113.83
convert_p2s[  8x4]		2.80x 	 219.99   	 614.97
convert_p2s[  4x8]		3.22x 	 212.49   	 684.96
convert_p2s[ 16x8]		3.02x 	 669.99   	 2022.54
convert_p2s[ 8x16]		3.51x 	 674.98   	 2366.56
convert_p2s[32x16]		3.67x 	 2589.99  	 9497.50
convert_p2s[16x32]		3.09x 	 2589.98  	 8015.87
convert_p2s[64x32]		3.58x 	 10270.03 	 36767.52
convert_p2s[32x64]		3.61x 	 10271.21 	 37097.50
convert_p2s[16x12]		3.17x 	 990.00   	 3136.46
convert_p2s[12x16]		3.15x 	 987.49   	 3112.45
convert_p2s[ 16x4]		2.86x 	 357.50   	 1022.46
convert_p2s[ 4x16]		3.73x 	 354.95   	 1324.90
convert_p2s[32x24]		3.64x 	 3869.97  	 14097.50
convert_p2s[24x32]		3.63x 	 3872.45  	 14068.67
convert_p2s[ 32x8]		3.60x 	 1309.98  	 4720.05
convert_p2s[ 8x32]		3.79x 	 1314.99  	 4980.84
convert_p2s[64x48]		3.57x 	 15391.05 	 55007.52
convert_p2s[48x64]		3.60x 	 15393.61 	 55424.42
convert_p2s[64x16]		3.61x 	 5150.39  	 18616.89
convert_p2s[16x64]		3.06x 	 5149.98  	 15776.41
chroma_p2s[  4x4]		2.53x 	 137.50   	 347.48
chroma_p2s[  8x8]		3.50x 	 367.50   	 1284.99
chroma_p2s[16x16]		3.13x 	 1307.50  	 4094.98
chroma_p2s[32x32]		3.67x 	 5147.55  	 18867.50
chroma_p2s[  4x2]		2.27x 	 89.29    	 203.13
chroma_p2s[  2x4]		1.51x 	 147.50   	 222.48
chroma_p2s[  8x4]		2.83x 	 217.50   	 615.14
chroma_p2s[  4x8]		3.30x 	 207.61   	 684.99
chroma_p2s[ 16x8]		2.98x 	 670.02   	 1999.99
chroma_p2s[ 8x16]		3.52x 	 672.50   	 2365.32
chroma_p2s[32x16]		3.67x 	 2587.50  	 9497.50
chroma_p2s[16x32]		3.10x 	 2587.50  	 8015.44
chroma_p2s[  8x6]		3.18x 	 297.50   	 944.97
chroma_p2s[  6x8]		2.70x 	 347.52   	 937.57
chroma_p2s[  8x2]		2.25x 	 147.57   	 332.45
chroma_p2s[  2x8]		1.76x 	 217.50   	 382.49
chroma_p2s[16x12]		3.18x 	 987.50   	 3135.74
chroma_p2s[12x16]		3.16x 	 985.00   	 3112.70
chroma_p2s[ 16x4]		2.84x 	 357.50   	 1014.98
chroma_p2s[ 4x16]		3.86x 	 343.33   	 1324.94
chroma_p2s[32x24]		3.65x 	 3867.50  	 14097.50
chroma_p2s[24x32]		3.63x 	 3869.99  	 14065.64
chroma_p2s[ 32x8]		3.61x 	 1307.50  	 4720.05
chroma_p2s[ 8x32]		3.79x 	 1312.50  	 4980.54
chroma_p2s[  4x8]		3.30x 	 207.50   	 684.95
chroma_p2s[ 8x16]		3.52x 	 672.50   	 2365.13
chroma_p2s[16x32]		2.91x 	 2756.49  	 8015.58
chroma_p2s[32x64]		3.65x 	 10267.50 	 37511.57
chroma_p2s[  4x4]		2.53x 	 137.50   	 347.49
chroma_p2s[  2x8]		1.76x 	 217.50   	 382.49
chroma_p2s[  8x8]		3.50x 	 367.50   	 1284.99
chroma_p2s[ 4x16]		3.86x 	 343.33   	 1324.96
chroma_p2s[16x16]		3.13x 	 1307.58  	 4095.93
chroma_p2s[ 8x32]		2.89x 	 1750.02  	 5052.14
chroma_p2s[32x32]		3.63x 	 5147.50  	 18697.50
chroma_p2s[16x64]		3.06x 	 5147.52  	 15775.37
chroma_p2s[ 8x12]		3.65x 	 517.50   	 1887.48
chroma_p2s[ 6x16]		2.90x 	 660.01   	 1912.45
chroma_p2s[  8x4]		2.83x 	 217.50   	 614.98
chroma_p2s[ 2x16]		1.82x 	 376.25   	 684.98
chroma_p2s[16x24]		3.11x 	 1947.58  	 6054.96
chroma_p2s[12x32]		3.08x 	 1945.00  	 5993.06
chroma_p2s[ 16x8]		2.98x 	 670.15   	 1999.99
chroma_p2s[ 4x32]		3.74x 	 741.68   	 2772.84
chroma_p2s[32x48]		3.62x 	 7707.50  	 27897.86
chroma_p2s[24x64]		3.61x 	 7707.50  	 27826.29
chroma_p2s[32x16]		3.67x 	 2587.50  	 9499.59
chroma_p2s[ 8x64]		3.76x 	 2592.50  	 9741.75
chroma_p2s[  4x4]		2.53x 	 137.50   	 347.47
chroma_p2s[  8x8]		3.50x 	 367.50   	 1284.47
chroma_p2s[16x16]		3.13x 	 1307.50  	 4095.38
chroma_p2s[32x32]		3.63x 	 5147.51  	 18697.65
chroma_p2s[64x64]		3.58x 	 20511.86 	 73401.03
chroma_p2s[  8x4]		2.83x 	 217.50   	 615.10
chroma_p2s[  4x8]		3.30x 	 207.50   	 684.95
chroma_p2s[ 16x8]		2.99x 	 670.00   	 1999.99
chroma_p2s[ 8x16]		3.52x 	 672.50   	 2365.32
chroma_p2s[32x16]		3.67x 	 2587.50  	 9497.50
chroma_p2s[16x32]		3.10x 	 2587.53  	 8015.30
chroma_p2s[64x32]		3.58x 	 10267.54 	 36767.52
chroma_p2s[32x64]		3.60x 	 10311.38 	 37097.81
chroma_p2s[16x12]		3.17x 	 987.62   	 3135.36
chroma_p2s[12x16]		3.16x 	 985.00   	 3112.94
chroma_p2s[ 16x4]		2.84x 	 357.50   	 1014.97
chroma_p2s[ 4x16]		3.87x 	 342.50   	 1324.90
chroma_p2s[32x24]		3.67x 	 3867.50  	 14201.55
chroma_p2s[24x32]		3.64x 	 3869.99  	 14068.54
chroma_p2s[ 32x8]		3.58x 	 1307.50  	 4678.90
chroma_p2s[ 8x32]		3.85x 	 1312.54  	 5052.43
chroma_p2s[64x48]		3.57x 	 15387.58 	 55007.52
chroma_p2s[48x64]		3.57x 	 15431.11 	 55017.58
chroma_p2s[64x16]		3.60x 	 5147.50  	 18527.71
chroma_p2s[16x64]		3.06x 	 5148.45  	 15775.25
Subject: [x265] fix Issue #141: TestBench failed on AMD FX8350

details:   http://hg.videolan.org/x265/rev/8300639fac68
branches:  
changeset: 10579:8300639fac68
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jun 04 14:41:58 2015 -0700
description:
fix Issue #141: TestBench failed on AMD FX8350
Subject: [x265] asm: avx2 code for chroma sse_pp[16x32, 32x64] for i422

details:   http://hg.videolan.org/x265/rev/43afbde189f3
branches:  
changeset: 10580:43afbde189f3
user:      Sumalatha Polureddy
date:      Fri Jun 05 11:03:10 2015 +0530
description:
asm: avx2 code for chroma sse_pp[16x32, 32x64] for i422

sse2
[i422] sse_pp[16x32]  5.29x    1030.36         5446.27
[i422] sse_pp[32x64]  4.40x    4091.52         17994.55
avx2
[i422] sse_pp[16x32]  9.07x    599.30          5436.95
[i422] sse_pp[32x64]  9.07x    2009.39         18226.27

diffstat:

 source/common/constants.cpp          |    5 +-
 source/common/constants.h            |    2 +-
 source/common/contexts.h             |    3 +-
 source/common/dct.cpp                |   57 +++
 source/common/primitives.h           |    4 +
 source/common/x86/asm-primitives.cpp |   31 +
 source/common/x86/const-a.asm        |    1 +
 source/common/x86/ipfilter16.asm     |  608 +++++++++++++++++++++++++++++++++++
 source/common/x86/ipfilter8.asm      |  179 ++++++++++
 source/common/x86/ipfilter8.h        |   40 ++
 source/common/x86/pixel-a.asm        |   72 ++++
 source/common/x86/pixel-util.h       |    3 +
 source/common/x86/pixel-util8.asm    |  176 ++++++++++
 source/common/x86/ssd-a.asm          |    2 +
 source/encoder/entropy.cpp           |  161 ++------
 source/test/pixelharness.cpp         |   13 +
 source/test/testbench.cpp            |    2 +-
 17 files changed, 1243 insertions(+), 116 deletions(-)

diffs (truncated from 1703 to 300 lines):

diff -r 093618ce0b26 -r 43afbde189f3 source/common/constants.cpp
--- a/source/common/constants.cpp	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/constants.cpp	Fri Jun 05 11:03:10 2015 +0530
@@ -324,11 +324,12 @@ const uint16_t g_scan8x8[NUM_SCAN_TYPE][
       4,  12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31, 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63 }
 };
 
-ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE][4 * 4]) =
+ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4]) =
 {
     { 0,  4,  1,  8,  5,  2, 12,  9,  6,  3, 13, 10,  7, 14, 11, 15 },
     { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
-    { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 }
+    { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 },
+    { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 }
 };
 
 const uint16_t g_scan16x16[16 * 16] =
diff -r 093618ce0b26 -r 43afbde189f3 source/common/constants.h
--- a/source/common/constants.h	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/constants.h	Fri Jun 05 11:03:10 2015 +0530
@@ -83,7 +83,7 @@ extern const int16_t g_chromaFilter[8][N
 extern const uint16_t* const g_scanOrder[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
 extern const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
 extern const uint16_t g_scan8x8diag[8 * 8];
-extern const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4];
+extern const uint16_t g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4];  // +1 for safe buffer area for codeCoeffNxN assembly optimize, there have up to 15 bytes beyond bound read
 
 extern const uint8_t g_lastCoeffTable[32];
 extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice codes
diff -r 093618ce0b26 -r 43afbde189f3 source/common/contexts.h
--- a/source/common/contexts.h	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/contexts.h	Fri Jun 05 11:03:10 2015 +0530
@@ -102,11 +102,12 @@
 #define OFF_TQUANT_BYPASS_FLAG_CTX (OFF_TRANSFORMSKIP_FLAG_CTX + 2 * NUM_TRANSFORMSKIP_FLAG_CTX)
 #define MAX_OFF_CTX_MOD            (OFF_TQUANT_BYPASS_FLAG_CTX +     NUM_TQUANT_BYPASS_FLAG_CTX)
 
+extern "C" const uint32_t g_entropyStateBits[128];
+
 namespace x265 {
 // private namespace
 
 extern const uint32_t g_entropyBits[128];
-extern const uint32_t g_entropyStateBits[128];
 extern const uint8_t g_nextState[128][2];
 
 #define sbacGetMps(S)            ((S) & 1)
diff -r 093618ce0b26 -r 43afbde189f3 source/common/dct.cpp
--- a/source/common/dct.cpp	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/dct.cpp	Fri Jun 05 11:03:10 2015 +0530
@@ -29,6 +29,7 @@
 
 #include "common.h"
 #include "primitives.h"
+#include "contexts.h"   // costCoeffNxN_c
 
 using namespace x265;
 
@@ -817,6 +818,61 @@ uint32_t findPosFirstLast_c(const int16_
     return ((lastNZPosInCG << 16) | firstNZPosInCG);
 }
 
+
+uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
+{
+    ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
+    uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);
+    uint32_t sum = 0;
+
+    // correct offset to match assembly
+    absCoeff -= numNonZero;
+
+    for (int i = 0; i < MLS_CG_SIZE; i++)
+    {
+        tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]);
+        tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]);
+        tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]);
+        tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]);
+    }
+
+    do
+    {
+        uint32_t blkPos, sig, ctxSig;
+        blkPos = scan[scanPosSigOff];
+        const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
+        sig     = scanFlagMask & 1;
+        scanFlagMask >>= 1;
+        X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");
+        if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)
+        {
+            const uint32_t cnt = tabSigCtx[blkPos] + offset;
+            ctxSig = cnt & posZeroMask;
+
+            //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;
+            //encodeBin(sig, baseCtx[ctxSig]);
+            const uint32_t mstate = baseCtx[ctxSig];
+            const uint32_t mps = mstate & 1;
+            const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];
+            uint32_t nextState = (stateBits >> 24) + mps;
+            if ((mstate ^ sig) == 1)
+                nextState = sig;
+            X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
+            X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
+            baseCtx[ctxSig] = (uint8_t)nextState;
+            sum += stateBits;
+        }
+        assert(numNonZero <= 15);
+        assert(blkPos <= 15);
+        absCoeff[numNonZero] = tmpCoeff[blkPos];
+        numNonZero += sig;
+        scanPosSigOff--;
+    }
+    while(scanPosSigOff >= 0);
+
+    return (sum & 0xFFFFFF);
+}
+
 }  // closing - anonymous file-static namespace
 
 namespace x265 {
@@ -851,5 +907,6 @@ void setupDCTPrimitives_c(EncoderPrimiti
 
     p.scanPosLast = scanPosLast_c;
     p.findPosFirstLast = findPosFirstLast_c;
+    p.costCoeffNxN = costCoeffNxN_c;
 }
 }
diff -r 093618ce0b26 -r 43afbde189f3 source/common/primitives.h
--- a/source/common/primitives.h	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/primitives.h	Fri Jun 05 11:03:10 2015 +0530
@@ -186,6 +186,8 @@ typedef void (*cutree_propagate_cost) (i
 typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
+typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -310,6 +312,8 @@ struct EncoderPrimitives
     scanPosLast_t         scanPosLast;
     findPosFirstLast_t    findPosFirstLast;
 
+    costCoeffNxN_t        costCoeffNxN;
+
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
      * in this array. However we always fill all entries in the array in case
diff -r 093618ce0b26 -r 43afbde189f3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp	Fri Jun 05 11:03:10 2015 +0530
@@ -934,6 +934,20 @@ void setupAssemblyPrimitives(EncoderPrim
         LUMA_VSS_FILTERS(sse2);
 
         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
+
+        ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
+        ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
+        ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
+        ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
+    }
+    if (cpuMask & X265_CPU_SSE3)
+    {
+        ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
+        ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
+        ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -1362,12 +1376,17 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_16x16].sse_pp = x265_pixel_ssd_16x16_avx2;
         p.cu[BLOCK_32x32].sse_pp = x265_pixel_ssd_32x32_avx2;
         p.cu[BLOCK_64x64].sse_pp = x265_pixel_ssd_64x64_avx2;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = x265_pixel_ssd_16x16_avx2;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = x265_pixel_ssd_32x32_avx2;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_16x32_avx2;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_32x64_avx2;
 
         p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
         p.dequant_normal  = x265_dequant_normal_avx2;
         p.dst4x4 = x265_dst4_avx2;
         p.idst4x4 = x265_idst4_avx2;
+        p.denoiseDct = x265_denoise_dct_avx2;
 
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
         p.scale2D_64to32 = x265_scale2D_64to32_avx2;
@@ -1579,6 +1598,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hps = x265_interp_4tap_horiz_ps_24x32_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hps = x265_interp_4tap_horiz_ps_12x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hps = x265_interp_4tap_horiz_ps_6x8_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hps = x265_interp_4tap_horiz_ps_8x8_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hps = x265_interp_4tap_horiz_ps_8x16_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hps = x265_interp_4tap_horiz_ps_8x32_avx2;
@@ -1596,6 +1616,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = x265_interp_4tap_horiz_ps_32x16_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hps = x265_interp_4tap_horiz_ps_12x32_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hps = x265_interp_4tap_horiz_ps_24x64_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_hps = x265_interp_4tap_horiz_ps_6x16_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hps = x265_interp_4tap_horiz_ps_8x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hps = x265_interp_4tap_horiz_ps_8x4_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hps = x265_interp_4tap_horiz_ps_8x16_avx2;
@@ -1678,6 +1699,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = x265_interp_4tap_horiz_pp_64x64_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = x265_interp_4tap_horiz_pp_48x64_avx2;
 
+
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = x265_scanPosLast_avx2_bmi2;
     }
@@ -1851,6 +1873,10 @@ void setupAssemblyPrimitives(EncoderPrim
         p.idst4x4 = x265_idst4_sse2;
 
         p.planecopy_sp = x265_downShift_16_sse2;
+        ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
+        ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
+        ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
+        ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -2048,6 +2074,9 @@ void setupAssemblyPrimitives(EncoderPrim
 
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
         ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
+
+        // TODO: it is passed smoke test, but we need testbench, so temporary disable
+        //p.costCoeffNxN = x265_costCoeffNxN_sse4;
 #endif
     }
     if (cpuMask & X265_CPU_AVX)
@@ -2261,8 +2290,10 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.cu[BLOCK_8x8].sa8d = x265_pixel_sa8d_8x8_avx2;
         p.cu[BLOCK_16x16].sa8d = x265_pixel_sa8d_16x16_avx2;
+        p.cu[BLOCK_32x32].sa8d = x265_pixel_sa8d_32x32_avx2;
         p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = x265_pixel_sa8d_8x8_avx2;
         p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = x265_pixel_sa8d_16x16_avx2;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = x265_pixel_sa8d_32x32_avx2;
 
         p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
         p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
diff -r 093618ce0b26 -r 43afbde189f3 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/x86/const-a.asm	Fri Jun 05 11:03:10 2015 +0530
@@ -80,6 +80,7 @@ const pw_512,               times 16 dw 
 const pw_1023,              times 16 dw 1023
 const pw_1024,              times 16 dw 1024
 const pw_4096,              times 16 dw 4096
+const pw_8192,              times  8 dw 8192
 const pw_00ff,              times 16 dw 0x00ff
 const pw_ff00,              times  8 dw 0xff00
 const pw_2000,              times 16 dw 0x2000
diff -r 093618ce0b26 -r 43afbde189f3 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/x86/ipfilter16.asm	Fri Jun 05 11:03:10 2015 +0530
@@ -564,6 +564,560 @@ cglobal interp_8tap_vert_%1_%2x%3, 5, 7,
     FILTER_VER_LUMA_sse2 ps, 64, 16
     FILTER_VER_LUMA_sse2 ps, 16, 64
 
+%macro FILTERH_W2_4_sse3 2
+    movh        m3,     [r0 + %1]
+    movhps      m3,     [r0 + %1 + 2]
+    pmaddwd     m3,     m0
+    movh        m4,     [r0 + r1 + %1]
+    movhps      m4,     [r0 + r1 + %1 + 2]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301
+    paddd       m3,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4
+    paddd       m3,     m1
+    movh        m5,     [r0 + 2 * r1 + %1]
+    movhps      m5,     [r0 + 2 * r1 + %1 + 2]
+    pmaddwd     m5,     m0
+    movh        m4,     [r0 + r4 + %1]
+    movhps      m4,     [r0 + r4 + %1 + 2]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m5,     m4
+    paddd       m5,     m1
+%ifidn %2, pp
+    psrad       m3,     6
+    psrad       m5,     6
+    packssdw    m3,     m5
+    CLIPW       m3,     m7,     m6
+%else
+    psrad       m3,     2


More information about the x265-commits mailing list