[x265] [PATCH] asm: filterPixelToShort 8-bit and 10-bit sse2

chen chenm003 at 163.com
Fri Jun 5 02:27:33 CEST 2015


The code is correct, but its size could be reduced by replacing PSRLDQ with MOVHLPS.

At 2015-06-05 07:53:02,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1433461939 25200
># Node ID db3d3229113adc9416ca9fd7e33c279d62125bc0
># Parent  fcfba27ecf0b9dac8da123da8cdcac75763496f3
>asm: filterPixelToShort 8-bit and 10-bit sse2
>
>This replaces c code for all of filterPixelToShort for 8 and 10 bit.
>
>64-bit
>
>./test/TestBench --testbench interp | grep p2s
>convert_p2s[  8x8]		2.82x 	 397.50   	 1119.92
>convert_p2s[16x16]		3.04x 	 1347.50  	 4102.63
>convert_p2s[32x32]		1.41x 	 5197.50  	 7332.50
>convert_p2s[64x64]		1.26x 	 20588.66 	 25962.67
>convert_p2s[  8x4]		2.52x 	 229.99   	 580.00
>convert_p2s[  4x8]		2.22x 	 279.99   	 622.38
>convert_p2s[ 16x8]		2.74x 	 710.00   	 1944.96
>convert_p2s[ 8x16]		3.09x 	 730.00   	 2254.97
>convert_p2s[32x16]		1.43x 	 2630.20  	 3774.20
>convert_p2s[16x32]		3.07x 	 2630.01  	 8064.97
>convert_p2s[64x32]		1.28x 	 10307.52 	 13162.50
>convert_p2s[32x64]		1.40x 	 10307.60 	 14382.50
>convert_p2s[16x12]		3.05x 	 1027.50  	 3134.97
>convert_p2s[12x16]		2.78x 	 1115.00  	 3100.73
>convert_p2s[ 16x4]		2.56x 	 387.50   	 990.06
>convert_p2s[ 4x16]		2.43x 	 490.02   	 1190.04
>convert_p2s[32x24]		1.42x 	 3909.22  	 5532.57
>convert_p2s[24x32]		3.94x 	 3907.50  	 15387.65
>convert_p2s[ 32x8]		1.37x 	 1347.49  	 1848.57
>convert_p2s[ 8x32]		3.31x 	 1390.00  	 4596.10
>convert_p2s[64x48]		1.27x 	 15455.30 	 19562.58
>convert_p2s[48x64]		1.30x 	 15428.06 	 20132.50
>convert_p2s[64x16]		1.28x 	 5192.50  	 6669.05
>convert_p2s[16x64]		3.05x 	 5197.80  	 15855.29
>chroma_p2s[  4x4]		1.89x 	 177.50   	 334.95
>chroma_p2s[  8x8]		2.82x 	 397.50   	 1119.83
>chroma_p2s[16x16]		3.05x 	 1347.51  	 4105.62
>chroma_p2s[32x32]		1.41x 	 5187.50  	 7334.46
>chroma_p2s[  4x2]		1.67x 	 115.04   	 192.48
>chroma_p2s[  2x4]		1.16x 	 184.99   	 214.98
>chroma_p2s[  8x4]		2.55x 	 227.50   	 580.01
>chroma_p2s[  4x8]		2.24x 	 277.50   	 622.42
>chroma_p2s[ 16x8]		2.75x 	 707.50   	 1945.02
>chroma_p2s[ 8x16]		2.92x 	 772.45   	 2254.91
>chroma_p2s[32x16]		1.44x 	 2627.50  	 3772.50
>chroma_p2s[16x32]		3.07x 	 2627.50  	 8065.07
>chroma_p2s[  8x6]		2.74x 	 307.59   	 842.44
>chroma_p2s[  6x8]		1.71x 	 507.50   	 870.00
>chroma_p2s[  8x2]		2.02x 	 147.50   	 297.50
>chroma_p2s[  2x8]		1.12x 	 307.50   	 344.96
>chroma_p2s[16x12]		3.05x 	 1027.50  	 3134.97
>chroma_p2s[12x16]		2.79x 	 1112.50  	 3100.21
>chroma_p2s[ 16x4]		2.56x 	 387.50   	 990.19
>chroma_p2s[ 4x16]		2.43x 	 489.99   	 1192.44
>chroma_p2s[32x24]		1.42x 	 3907.50  	 5533.21
>chroma_p2s[24x32]		3.93x 	 3923.72  	 15427.02
>chroma_p2s[ 32x8]		1.37x 	 1347.50  	 1850.34
>chroma_p2s[ 8x32]		3.31x 	 1387.50  	 4595.20
>chroma_p2s[  4x8]		2.22x 	 277.50   	 617.42
>chroma_p2s[ 8x16]		3.09x 	 727.50   	 2247.50
>chroma_p2s[16x32]		3.07x 	 2627.50  	 8065.15
>chroma_p2s[32x64]		1.40x 	 10307.58 	 14385.50
>chroma_p2s[  4x4]		1.87x 	 177.50   	 332.51
>chroma_p2s[  2x8]		1.11x 	 307.50   	 342.46
>chroma_p2s[  8x8]		2.80x 	 397.50   	 1112.50
>chroma_p2s[ 4x16]		2.43x 	 489.99   	 1192.49
>chroma_p2s[16x16]		3.05x 	 1347.74  	 4104.98
>chroma_p2s[ 8x32]		3.31x 	 1387.50  	 4595.44
>chroma_p2s[32x32]		1.41x 	 5197.55  	 7332.50
>chroma_p2s[16x64]		3.05x 	 5197.60  	 15855.21
>chroma_p2s[ 8x12]		2.92x 	 557.50   	 1627.41
>chroma_p2s[ 6x16]		1.79x 	 1002.50  	 1797.47
>chroma_p2s[  8x4]		2.55x 	 227.50   	 580.01
>chroma_p2s[ 2x16]		1.09x 	 602.49   	 657.49
>chroma_p2s[16x24]		3.05x 	 1987.50  	 6054.97
>chroma_p2s[12x32]		2.81x 	 2170.00  	 6095.56
>chroma_p2s[ 16x8]		2.75x 	 707.50   	 1944.97
>chroma_p2s[ 4x32]		3.09x 	 877.50   	 2707.95
>chroma_p2s[32x48]		1.40x 	 7757.54  	 10862.72
>chroma_p2s[24x64]		3.95x 	 7757.50  	 30663.70
>chroma_p2s[32x16]		1.44x 	 2627.50  	 3773.21
>chroma_p2s[ 8x64]		3.30x 	 2717.50  	 8955.97
>chroma_p2s[  4x4]		1.89x 	 177.50   	 334.94
>chroma_p2s[  8x8]		2.82x 	 397.50   	 1119.95
>chroma_p2s[16x16]		3.05x 	 1347.50  	 4105.23
>chroma_p2s[32x32]		1.41x 	 5197.52  	 7332.50
>chroma_p2s[64x64]		1.25x 	 20722.45 	 25962.96
>chroma_p2s[  8x4]		2.57x 	 227.50   	 584.01
>chroma_p2s[  4x8]		2.23x 	 277.49   	 617.44
>chroma_p2s[ 16x8]		2.75x 	 707.57   	 1945.83
>chroma_p2s[ 8x16]		3.08x 	 729.99   	 2247.50
>chroma_p2s[32x16]		1.44x 	 2627.50  	 3772.50
>chroma_p2s[16x32]		3.07x 	 2627.50  	 8064.97
>chroma_p2s[64x32]		1.28x 	 10307.86 	 13162.50
>chroma_p2s[32x64]		1.40x 	 10307.68 	 14385.40
>chroma_p2s[16x12]		3.05x 	 1027.50  	 3135.10
>chroma_p2s[12x16]		2.79x 	 1112.50  	 3100.94
>chroma_p2s[ 16x4]		2.57x 	 387.50   	 994.88
>chroma_p2s[ 4x16]		2.43x 	 489.99   	 1192.44
>chroma_p2s[32x24]		1.42x 	 3907.82  	 5532.55
>chroma_p2s[24x32]		3.95x 	 3907.50  	 15422.51
>chroma_p2s[ 32x8]		1.37x 	 1347.50  	 1849.83
>chroma_p2s[ 8x32]		3.31x 	 1387.50  	 4594.97
>chroma_p2s[64x48]		1.27x 	 15458.83 	 19562.50
>chroma_p2s[48x64]		1.30x 	 15427.81 	 20132.50
>chroma_p2s[64x16]		1.28x 	 5187.50  	 6662.50
>chroma_p2s[16x64]		3.05x 	 5197.50  	 15855.09
>
>32-bit
>
>./test/TestBench --testbench interp | grep p2s
>convert_p2s[  8x8]		2.55x 	 484.99   	 1237.46
>convert_p2s[16x16]		2.89x 	 1444.98  	 4174.96
>convert_p2s[32x32]		6.70x 	 5295.00  	 35485.83
>convert_p2s[64x64]		6.41x 	 20695.04 	 132575.69
>convert_p2s[  8x4]		2.06x 	 325.00   	 669.93
>convert_p2s[  4x8]		1.93x 	 360.08   	 694.95
>convert_p2s[ 16x8]		2.52x 	 805.00   	 2032.47
>convert_p2s[ 8x16]		2.83x 	 820.00   	 2319.99
>convert_p2s[32x16]		6.55x 	 2725.00  	 17855.75
>convert_p2s[16x32]		3.00x 	 2725.00  	 8185.23
>convert_p2s[64x32]		6.36x 	 10405.41 	 66205.67
>convert_p2s[32x64]		6.82x 	 10404.99 	 70948.87
>convert_p2s[16x12]		2.84x 	 1124.99  	 3197.47
>convert_p2s[12x16]		2.61x 	 1207.52  	 3155.00
>convert_p2s[ 16x4]		2.29x 	 485.00   	 1112.42
>convert_p2s[ 4x16]		2.22x 	 564.99   	 1252.46
>convert_p2s[32x24]		6.66x 	 4005.00  	 26655.58
>convert_p2s[24x32]		6.99x 	 4005.00  	 28003.99
>convert_p2s[ 32x8]		6.27x 	 1445.00  	 9054.99
>convert_p2s[ 8x32]		3.18x 	 1475.00  	 4687.54
>convert_p2s[64x48]		6.40x 	 15525.12 	 99348.35
>convert_p2s[48x64]		6.54x 	 15525.42 	 101593.84
>convert_p2s[64x16]		6.33x 	 5285.10  	 33457.09
>convert_p2s[16x64]		3.01x 	 5295.02  	 15944.96
>chroma_p2s[  4x4]		1.64x 	 257.49   	 422.54
>chroma_p2s[  8x8]		2.55x 	 484.99   	 1237.48
>chroma_p2s[16x16]		2.89x 	 1444.99  	 4174.94
>chroma_p2s[32x32]		6.70x 	 5295.02  	 35485.08
>chroma_p2s[  4x2]		1.33x 	 212.48   	 282.52
>chroma_p2s[  2x4]		0.98x 	 272.49   	 267.50
>chroma_p2s[  8x4]		2.06x 	 325.00   	 669.99
>chroma_p2s[  4x8]		1.93x 	 360.00   	 694.98
>chroma_p2s[ 16x8]		2.52x 	 805.00   	 2032.51
>chroma_p2s[ 8x16]		2.83x 	 820.15   	 2319.98
>chroma_p2s[32x16]		6.55x 	 2725.00  	 17855.50
>chroma_p2s[16x32]		3.00x 	 2725.00  	 8184.98
>chroma_p2s[  8x6]		2.33x 	 402.50   	 937.48
>chroma_p2s[  6x8]		1.71x 	 585.00   	 1000.03
>chroma_p2s[  8x2]		1.62x 	 242.49   	 392.50
>chroma_p2s[  2x8]		1.15x 	 375.00   	 432.50
>chroma_p2s[16x12]		2.84x 	 1124.99  	 3194.98
>chroma_p2s[12x16]		2.63x 	 1200.00  	 3155.18
>chroma_p2s[ 16x4]		2.29x 	 485.07   	 1112.51
>chroma_p2s[ 4x16]		2.18x 	 562.54   	 1224.98
>chroma_p2s[32x24]		6.66x 	 4005.01  	 26660.21
>chroma_p2s[24x32]		6.94x 	 4005.05  	 27805.24
>chroma_p2s[ 32x8]		6.27x 	 1445.00  	 9054.99
>chroma_p2s[ 8x32]		3.18x 	 1475.00  	 4687.52
>chroma_p2s[  4x8]		2.01x 	 360.00   	 722.48
>chroma_p2s[ 8x16]		2.83x 	 820.00   	 2319.98
>chroma_p2s[16x32]		3.00x 	 2725.07  	 8185.31
>chroma_p2s[32x64]		6.79x 	 10405.07 	 70686.27
>chroma_p2s[  4x4]		1.63x 	 259.98   	 422.48
>chroma_p2s[  2x8]		1.12x 	 385.00   	 432.50
>chroma_p2s[  8x8]		2.55x 	 485.00   	 1237.48
>chroma_p2s[ 4x16]		2.18x 	 562.49   	 1224.98
>chroma_p2s[16x16]		2.89x 	 1445.10  	 4174.98
>chroma_p2s[ 8x32]		3.18x 	 1475.00  	 4687.56
>chroma_p2s[32x32]		6.70x 	 5295.13  	 35485.12
>chroma_p2s[16x64]		3.01x 	 5295.00  	 15945.08
>chroma_p2s[ 8x12]		2.71x 	 655.09   	 1774.92
>chroma_p2s[ 6x16]		1.80x 	 1057.50  	 1899.96
>chroma_p2s[  8x4]		2.06x 	 325.00   	 669.99
>chroma_p2s[ 2x16]		1.06x 	 659.99   	 697.48
>chroma_p2s[16x24]		2.93x 	 2085.00  	 6115.20
>chroma_p2s[12x32]		2.72x 	 2250.00  	 6125.25
>chroma_p2s[ 16x8]		2.52x 	 805.03   	 2032.57
>chroma_p2s[ 4x32]		2.85x 	 957.54   	 2725.27
>chroma_p2s[32x48]		6.79x 	 7855.06  	 53302.55
>chroma_p2s[24x64]		7.04x 	 7855.01  	 55325.50
>chroma_p2s[32x16]		6.55x 	 2725.00  	 17855.12
>chroma_p2s[ 8x64]		3.23x 	 2805.32  	 9050.32
>chroma_p2s[  4x4]		1.63x 	 259.99   	 422.49
>chroma_p2s[  8x8]		2.55x 	 487.49   	 1245.00
>chroma_p2s[16x16]		2.89x 	 1445.00  	 4175.20
>chroma_p2s[32x32]		6.74x 	 5295.00  	 35670.43
>chroma_p2s[64x64]		6.40x 	 20696.68 	 132486.70
>chroma_p2s[  8x4]		2.06x 	 325.00   	 669.99
>chroma_p2s[  4x8]		1.93x 	 360.03   	 694.99
>chroma_p2s[ 16x8]		2.52x 	 805.00   	 2032.49
>chroma_p2s[ 8x16]		2.83x 	 820.14   	 2320.16
>chroma_p2s[32x16]		6.55x 	 2724.98  	 17855.27
>chroma_p2s[16x32]		3.00x 	 2725.10  	 8185.09
>chroma_p2s[64x32]		6.39x 	 10405.02 	 66479.45
>chroma_p2s[32x64]		6.81x 	 10414.99 	 70945.01
>chroma_p2s[16x12]		2.84x 	 1125.00  	 3194.98
>chroma_p2s[12x16]		2.63x 	 1200.13  	 3155.00
>chroma_p2s[ 16x4]		2.29x 	 485.00   	 1112.48
>chroma_p2s[ 4x16]		2.18x 	 562.49   	 1224.98
>chroma_p2s[32x24]		6.66x 	 4005.17  	 26655.93
>chroma_p2s[24x32]		6.99x 	 4005.01  	 28008.09
>chroma_p2s[ 32x8]		6.27x 	 1444.99  	 9060.05
>chroma_p2s[ 8x32]		3.18x 	 1474.99  	 4687.52
>chroma_p2s[64x48]		6.40x 	 15525.01 	 99400.07
>chroma_p2s[48x64]		6.55x 	 15525.09 	 101699.22
>chroma_p2s[64x16]		6.28x 	 5285.00  	 33215.04
>chroma_p2s[16x64]		3.01x 	 5295.16  	 15944.98
>
>10-bit
>
>./test/TestBench --testbench interp | grep p2s
>convert_p2s[  8x8]		3.50x 	 367.50   	 1284.66
>convert_p2s[16x16]		3.13x 	 1309.98  	 4095.49
>convert_p2s[32x32]		3.63x 	 5150.04  	 18697.50
>convert_p2s[64x64]		3.52x 	 21021.88 	 74018.38
>convert_p2s[  8x4]		2.86x 	 217.78   	 622.48
>convert_p2s[  4x8]		3.23x 	 210.00   	 677.50
>convert_p2s[ 16x8]		3.02x 	 669.99   	 2022.45
>convert_p2s[ 8x16]		3.50x 	 676.67   	 2365.62
>convert_p2s[32x16]		3.67x 	 2589.99  	 9497.50
>convert_p2s[16x32]		3.09x 	 2591.86  	 8015.74
>convert_p2s[64x32]		3.67x 	 10271.36 	 37680.71
>convert_p2s[32x64]		3.51x 	 10678.12 	 37516.15
>convert_p2s[16x12]		3.16x 	 992.47   	 3134.97
>convert_p2s[12x16]		3.14x 	 989.99   	 3112.45
>convert_p2s[ 16x4]		2.84x 	 360.55   	 1022.43
>convert_p2s[ 4x16]		3.79x 	 350.00   	 1324.83
>convert_p2s[32x24]		3.64x 	 3869.98  	 14097.50
>convert_p2s[24x32]		3.63x 	 3871.73  	 14066.42
>convert_p2s[ 32x8]		3.60x 	 1310.04  	 4720.07
>convert_p2s[ 8x32]		3.79x 	 1314.99  	 4980.69
>convert_p2s[64x48]		3.42x 	 16163.33 	 55231.05
>convert_p2s[48x64]		3.52x 	 15806.54 	 55712.53
>convert_p2s[64x16]		3.60x 	 5149.99  	 18527.55
>convert_p2s[16x64]		3.01x 	 5500.02  	 16579.86
>chroma_p2s[  4x4]		2.53x 	 137.50   	 347.44
>chroma_p2s[  8x8]		3.49x 	 367.50   	 1284.14
>chroma_p2s[16x16]		3.13x 	 1307.50  	 4095.64
>chroma_p2s[32x32]		3.65x 	 5147.50  	 18765.20
>chroma_p2s[  4x2]		2.31x 	 87.50    	 202.38
>chroma_p2s[  2x4]		1.43x 	 155.04   	 222.38
>chroma_p2s[  8x4]		2.98x 	 217.50   	 647.48
>chroma_p2s[  4x8]		3.27x 	 207.50   	 677.53
>chroma_p2s[ 16x8]		2.99x 	 670.00   	 1999.98
>chroma_p2s[ 8x16]		3.63x 	 672.67   	 2439.46
>chroma_p2s[32x16]		3.67x 	 2587.50  	 9497.50
>chroma_p2s[16x32]		3.10x 	 2587.50  	 8015.55
>chroma_p2s[  8x6]		3.18x 	 297.50   	 944.99
>chroma_p2s[  6x8]		2.63x 	 355.00   	 935.08
>chroma_p2s[  8x2]		2.25x 	 147.54   	 332.45
>chroma_p2s[  2x8]		1.60x 	 237.50   	 379.99
>chroma_p2s[16x12]		3.18x 	 987.50   	 3135.61
>chroma_p2s[12x16]		3.16x 	 985.00   	 3113.44
>chroma_p2s[ 16x4]		2.84x 	 357.50   	 1014.99
>chroma_p2s[ 4x16]		3.85x 	 343.75   	 1324.97
>chroma_p2s[32x24]		3.65x 	 3867.50  	 14097.50
>chroma_p2s[24x32]		3.64x 	 3867.51  	 14069.03
>chroma_p2s[ 32x8]		3.61x 	 1307.50  	 4720.05
>chroma_p2s[ 8x32]		3.79x 	 1312.50  	 4980.79
>chroma_p2s[  4x8]		3.27x 	 207.50   	 677.50
>chroma_p2s[ 8x16]		3.54x 	 672.66   	 2381.40
>chroma_p2s[16x32]		3.10x 	 2587.52  	 8016.40
>chroma_p2s[32x64]		3.53x 	 10709.11 	 37781.17
>chroma_p2s[  4x4]		2.53x 	 137.50   	 347.41
>chroma_p2s[  2x8]		1.60x 	 237.50   	 380.00
>chroma_p2s[  8x8]		3.50x 	 367.50   	 1284.84
>chroma_p2s[ 4x16]		3.85x 	 343.75   	 1324.97
>chroma_p2s[16x16]		3.13x 	 1307.50  	 4095.19
>chroma_p2s[ 8x32]		3.79x 	 1312.50  	 4980.70
>chroma_p2s[32x32]		3.63x 	 5147.53  	 18697.50
>chroma_p2s[16x64]		3.03x 	 5473.41  	 16578.56
>chroma_p2s[ 8x12]		3.64x 	 518.15   	 1885.51
>chroma_p2s[ 6x16]		2.84x 	 673.01   	 1912.40
>chroma_p2s[  8x4]		2.86x 	 217.50   	 622.61
>chroma_p2s[ 2x16]		1.72x 	 397.50   	 684.98
>chroma_p2s[16x24]		3.11x 	 1947.54  	 6057.70
>chroma_p2s[12x32]		3.08x 	 1945.00  	 5991.46
>chroma_p2s[ 16x8]		2.98x 	 670.00   	 1995.10
>chroma_p2s[ 4x32]		4.17x 	 665.00   	 2775.29
>chroma_p2s[32x48]		3.50x 	 8112.28  	 28426.23
>chroma_p2s[24x64]		3.51x 	 8104.04  	 28433.64
>chroma_p2s[32x16]		3.67x 	 2587.51  	 9497.50
>chroma_p2s[ 8x64]		3.55x 	 2953.69  	 10476.00
>chroma_p2s[  4x4]		2.53x 	 137.50   	 347.44
>chroma_p2s[  8x8]		3.50x 	 367.50   	 1284.63
>chroma_p2s[16x16]		3.13x 	 1307.50  	 4095.76
>chroma_p2s[32x32]		3.63x 	 5147.50  	 18697.50
>chroma_p2s[64x64]		3.50x 	 21082.21 	 73810.80
>chroma_p2s[  8x4]		2.98x 	 217.52   	 647.46
>chroma_p2s[  4x8]		3.27x 	 207.50   	 677.50
>chroma_p2s[ 16x8]		2.98x 	 670.15   	 1999.97
>chroma_p2s[ 8x16]		3.52x 	 672.50   	 2364.99
>chroma_p2s[32x16]		3.67x 	 2587.52  	 9497.50
>chroma_p2s[16x32]		3.10x 	 2587.69  	 8015.21
>chroma_p2s[64x32]		3.58x 	 10267.57 	 36767.66
>chroma_p2s[32x64]		3.50x 	 10673.18 	 37334.83
>chroma_p2s[16x12]		3.18x 	 987.50   	 3136.04
>chroma_p2s[12x16]		3.16x 	 985.04   	 3112.94
>chroma_p2s[ 16x4]		2.84x 	 357.60   	 1015.06
>chroma_p2s[ 4x16]		3.85x 	 343.74   	 1324.92
>chroma_p2s[32x24]		3.65x 	 3867.50  	 14097.50
>chroma_p2s[24x32]		3.64x 	 3867.53  	 14066.04
>chroma_p2s[ 32x8]		3.61x 	 1307.59  	 4720.09
>chroma_p2s[ 8x32]		3.80x 	 1312.50  	 4981.43
>chroma_p2s[64x48]		3.50x 	 15895.79 	 55558.28
>chroma_p2s[48x64]		3.50x 	 15778.97 	 55237.76
>chroma_p2s[64x16]		3.60x 	 5147.58  	 18530.57
>chroma_p2s[16x64]		3.03x 	 5471.98  	 16578.79
>
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 04 16:52:19 2015 -0700
>@@ -934,6 +934,11 @@
>         LUMA_VSS_FILTERS(sse2);
> 
>         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
>+
>+        ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
>+        ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
>+        ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
>+        ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
>     }
>     if (cpuMask & X265_CPU_SSE3)
>     {
>@@ -1860,6 +1865,10 @@
>         p.idst4x4 = x265_idst4_sse2;
> 
>         p.planecopy_sp = x265_downShift_16_sse2;
>+        ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
>+        ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
>+        ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
>+        ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
>     }
>     if (cpuMask & X265_CPU_SSE3)
>     {
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm	Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/const-a.asm	Thu Jun 04 16:52:19 2015 -0700
>@@ -80,6 +80,7 @@
> const pw_1023,              times 16 dw 1023
> const pw_1024,              times 16 dw 1024
> const pw_4096,              times 16 dw 4096
>+const pw_8192,              times  8 dw 8192
> const pw_00ff,              times 16 dw 0x00ff
> const pw_ff00,              times  8 dw 0xff00
> const pw_2000,              times 16 dw 0x2000
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/ipfilter16.asm
>--- a/source/common/x86/ipfilter16.asm	Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/ipfilter16.asm	Thu Jun 04 16:52:19 2015 -0700
>@@ -953,6 +953,171 @@
> FILTER_HOR_CHROMA_sse3 64, 48, ps
> FILTER_HOR_CHROMA_sse3 64, 64, ps
> 
>+%macro FILTER_P2S_2_4_sse2 1
>+    movd        m0,     [r0 + %1]
>+    movd        m2,     [r0 + r1 * 2 + %1]
>+    movhps      m0,     [r0 + r1 + %1]
>+    movhps      m2,     [r0 + r4 + %1]
>+    psllw       m0,     4
>+    psllw       m2,     4
>+    psubw       m0,     m1
>+    psubw       m2,     m1
>+
>+    movd        [r2 + r3 * 0 + %1], m0
>+    movd        [r2 + r3 * 2 + %1], m2
>+    psrldq      m0,     8
>+    psrldq      m2,     8
>+    movd        [r2 + r3 * 1 + %1], m0
>+    movd        [r2 + r5 + %1], m2
>+%endmacro
>+
>+%macro FILTER_P2S_4_4_sse2 1
>+    movh        m0,     [r0 + %1]
>+    movhps      m0,     [r0 + r1 + %1]
>+    psllw       m0,     4
>+    psubw       m0,     m1
>+    movh        [r2 + r3 * 0 + %1], m0
>+    movhps      [r2 + r3 * 1 + %1], m0
>+
>+    movh        m2,     [r0 + r1 * 2 + %1]
>+    movhps      m2,     [r0 + r4 + %1]
>+    psllw       m2,     4
>+    psubw       m2,     m1
>+    movh        [r2 + r3 * 2 + %1], m2
>+    movhps      [r2 + r5 + %1], m2
>+%endmacro
>+
>+%macro FILTER_P2S_4_2_sse2 0
>+    movh        m0,     [r0]
>+    movhps      m0,     [r0 + r1 * 2]
>+    psllw       m0,     4
>+    psubw       m0,     [pw_2000]
>+    movh        [r2 + r3 * 0], m0
>+    movhps      [r2 + r3 * 2], m0
>+%endmacro
>+
>+%macro FILTER_P2S_8_4_sse2 1
>+    movu        m0,     [r0 + %1]
>+    movu        m2,     [r0 + r1 + %1]
>+    psllw       m0,     4
>+    psllw       m2,     4
>+    psubw       m0,     m1
>+    psubw       m2,     m1
>+    movu        [r2 + r3 * 0 + %1], m0
>+    movu        [r2 + r3 * 1 + %1], m2
>+
>+    movu        m3,     [r0 + r1 * 2 + %1]
>+    movu        m4,     [r0 + r4 + %1]
>+    psllw       m3,     4
>+    psllw       m4,     4
>+    psubw       m3,     m1
>+    psubw       m4,     m1
>+    movu        [r2 + r3 * 2 + %1], m3
>+    movu        [r2 + r5 + %1], m4
>+%endmacro
>+
>+%macro FILTER_P2S_8_2_sse2 1
>+    movu        m0,     [r0 + %1]
>+    movu        m2,     [r0 + r1 + %1]
>+    psllw       m0,     4
>+    psllw       m2,     4
>+    psubw       m0,     m1
>+    psubw       m2,     m1
>+    movu        [r2 + r3 * 0 + %1], m0
>+    movu        [r2 + r3 * 1 + %1], m2
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_PIX_TO_SHORT_sse2 2
>+INIT_XMM sse2
>+cglobal filterPixelToShort_%1x%2, 4, 6, 3
>+%if %2 == 2
>+%if %1 == 4
>+    FILTER_P2S_4_2_sse2
>+%elif %1 == 8
>+    add        r1d, r1d
>+    add        r3d, r3d
>+    mova       m1, [pw_2000]
>+    FILTER_P2S_8_2_sse2 0
>+%endif
>+%else
>+    add        r1d, r1d
>+    add        r3d, r3d
>+    mova       m1, [pw_2000]
>+    lea        r4, [r1 * 3]
>+    lea        r5, [r3 * 3]
>+%assign y 1
>+%rep %2/4
>+%assign x 0
>+%rep %1/8
>+    FILTER_P2S_8_4_sse2 x
>+%if %2 == 6
>+    lea         r0,     [r0 + 4 * r1]
>+    lea         r2,     [r2 + 4 * r3]
>+    FILTER_P2S_8_2_sse2 x
>+%endif
>+%assign x x+16
>+%endrep
>+%rep (%1 % 8)/4
>+    FILTER_P2S_4_4_sse2 x
>+%assign x x+8
>+%endrep
>+%rep (%1 % 4)/2
>+    FILTER_P2S_2_4_sse2 x
>+%endrep
>+%if y < %2/4
>+    lea         r0,     [r0 + 4 * r1]
>+    lea         r2,     [r2 + 4 * r3]
>+%assign y y+1
>+%endif
>+%endrep
>+%endif
>+RET
>+%endmacro
>+
>+    FILTER_PIX_TO_SHORT_sse2 2, 4
>+    FILTER_PIX_TO_SHORT_sse2 2, 8
>+    FILTER_PIX_TO_SHORT_sse2 2, 16
>+    FILTER_PIX_TO_SHORT_sse2 4, 2
>+    FILTER_PIX_TO_SHORT_sse2 4, 4
>+    FILTER_PIX_TO_SHORT_sse2 4, 8
>+    FILTER_PIX_TO_SHORT_sse2 4, 16
>+    FILTER_PIX_TO_SHORT_sse2 4, 32
>+    FILTER_PIX_TO_SHORT_sse2 6, 8
>+    FILTER_PIX_TO_SHORT_sse2 6, 16
>+    FILTER_PIX_TO_SHORT_sse2 8, 2
>+    FILTER_PIX_TO_SHORT_sse2 8, 4
>+    FILTER_PIX_TO_SHORT_sse2 8, 6
>+    FILTER_PIX_TO_SHORT_sse2 8, 8
>+    FILTER_PIX_TO_SHORT_sse2 8, 12
>+    FILTER_PIX_TO_SHORT_sse2 8, 16
>+    FILTER_PIX_TO_SHORT_sse2 8, 32
>+    FILTER_PIX_TO_SHORT_sse2 8, 64
>+    FILTER_PIX_TO_SHORT_sse2 12, 16
>+    FILTER_PIX_TO_SHORT_sse2 12, 32
>+    FILTER_PIX_TO_SHORT_sse2 16, 4
>+    FILTER_PIX_TO_SHORT_sse2 16, 8
>+    FILTER_PIX_TO_SHORT_sse2 16, 12
>+    FILTER_PIX_TO_SHORT_sse2 16, 16
>+    FILTER_PIX_TO_SHORT_sse2 16, 24
>+    FILTER_PIX_TO_SHORT_sse2 16, 32
>+    FILTER_PIX_TO_SHORT_sse2 16, 64
>+    FILTER_PIX_TO_SHORT_sse2 24, 32
>+    FILTER_PIX_TO_SHORT_sse2 24, 64
>+    FILTER_PIX_TO_SHORT_sse2 32, 8
>+    FILTER_PIX_TO_SHORT_sse2 32, 16
>+    FILTER_PIX_TO_SHORT_sse2 32, 24
>+    FILTER_PIX_TO_SHORT_sse2 32, 32
>+    FILTER_PIX_TO_SHORT_sse2 32, 48
>+    FILTER_PIX_TO_SHORT_sse2 32, 64
>+    FILTER_PIX_TO_SHORT_sse2 48, 64
>+    FILTER_PIX_TO_SHORT_sse2 64, 16
>+    FILTER_PIX_TO_SHORT_sse2 64, 32
>+    FILTER_PIX_TO_SHORT_sse2 64, 48
>+    FILTER_PIX_TO_SHORT_sse2 64, 64
>+
> ;------------------------------------------------------------------------------------------------------------
> ; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;------------------------------------------------------------------------------------------------------------
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm	Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/ipfilter8.asm	Thu Jun 04 16:52:19 2015 -0700
>@@ -301,6 +301,7 @@
> cextern pw_32
> cextern pw_512
> cextern pw_2000
>+cextern pw_8192
> 
> %macro FILTER_H4_w2_2_sse2 0
>     pxor        m3, m3
>@@ -3283,6 +3284,183 @@
>     FILTER_V4_W16n_H2_sse2 ps, 64, 16
> %endif
> 
>+%macro FILTER_P2S_2_4_sse2 1
>+    movd        m2,     [r0 + %1]
>+    movd        m3,     [r0 + r1 + %1]
>+    punpcklwd   m2,     m3
>+    movd        m3,     [r0 + r1 * 2 + %1]
>+    movd        m4,     [r0 + r4 + %1]
>+    punpcklwd   m3,     m4
>+    punpckldq   m2,     m3
>+    punpcklbw   m2,     m0
>+    psllw       m2,     6
>+    psubw       m2,     m1
>+
>+    movd        [r2 + r3 * 0 + %1 * 2], m2
>+    psrldq      m2,     4
>+    movd        [r2 + r3 * 1 + %1 * 2], m2
>+    psrldq      m2,     4
>+    movd        [r2 + r3 * 2 + %1 * 2], m2
>+    psrldq      m2,     4
>+    movd        [r2 + r5 + %1 * 2], m2
>+%endmacro
>+
>+%macro FILTER_P2S_4_4_sse2 1
>+    movd        m2,     [r0 + %1]
>+    movd        m3,     [r0 + r1 + %1]
>+    movd        m4,     [r0 + r1 * 2 + %1]
>+    movd        m5,     [r0 + r4 + %1]
>+    punpckldq   m2,     m3
>+    punpcklbw   m2,     m0
>+    punpckldq   m4,     m5
>+    punpcklbw   m4,     m0
>+    psllw       m2,     6
>+    psllw       m4,     6
>+    psubw       m2,     m1
>+    psubw       m4,     m1
>+    movh        [r2 + r3 * 0 + %1 * 2], m2
>+    movh        [r2 + r3 * 2 + %1 * 2], m4
>+    movhps      [r2 + r3 * 1 + %1 * 2], m2
>+    movhps      [r2 + r5 + %1 * 2], m4
>+%endmacro
>+
>+%macro FILTER_P2S_4_2_sse2 0
>+    movd        m2,     [r0]
>+    movd        m3,     [r0 + r1]
>+    punpckldq   m2,     m3
>+    punpcklbw   m2,     m0
>+    psllw       m2,     6
>+    psubw       m2,     [pw_8192]
>+    movh        [r2],   m2
>+    movhps      [r2 + r3 * 2], m2
>+%endmacro
>+
>+%macro FILTER_P2S_8_4_sse2 1
>+    movh        m2,     [r0 + %1]
>+    movh        m3,     [r0 + r1 + %1]
>+    movh        m4,     [r0 + r1 * 2 + %1]
>+    movh        m5,     [r0 + r4 + %1]
>+    punpcklbw   m2,     m0
>+    punpcklbw   m3,     m0
>+    punpcklbw   m5,     m0
>+    punpcklbw   m4,     m0
>+    psllw       m2,     6
>+    psllw       m3,     6
>+    psllw       m5,     6
>+    psllw       m4,     6
>+    psubw       m2,     m1
>+    psubw       m3,     m1
>+    psubw       m4,     m1
>+    psubw       m5,     m1
>+    movu        [r2 + r3 * 0 + %1 * 2], m2
>+    movu        [r2 + r3 * 1 + %1 * 2], m3
>+    movu        [r2 + r3 * 2 + %1 * 2], m4
>+    movu        [r2 + r5 + %1 * 2], m5
>+%endmacro
>+
>+%macro FILTER_P2S_8_2_sse2 1
>+    movh        m2,     [r0 + %1]
>+    movh        m3,     [r0 + r1 + %1]
>+    punpcklbw   m2,     m0
>+    punpcklbw   m3,     m0
>+    psllw       m2,     6
>+    psllw       m3,     6
>+    psubw       m2,     m1
>+    psubw       m3,     m1
>+    movu        [r2 + r3 * 0 + %1 * 2], m2
>+    movu        [r2 + r3 * 1 + %1 * 2], m3
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_PIX_TO_SHORT_sse2 2
>+INIT_XMM sse2
>+cglobal filterPixelToShort_%1x%2, 4, 6, 6
>+    pxor        m0,     m0
>+%if %2 == 2
>+%if %1 == 4
>+    FILTER_P2S_4_2_sse2
>+%elif %1 == 8
>+    add        r3d, r3d
>+    mova       m1, [pw_8192]
>+    FILTER_P2S_8_2_sse2 0
>+%endif
>+%else
>+    add        r3d, r3d
>+    mova       m1, [pw_8192]
>+    lea        r4, [r1 * 3]
>+    lea        r5, [r3 * 3]
>+%assign y 1
>+%rep %2/4
>+%assign x 0
>+%rep %1/8
>+    FILTER_P2S_8_4_sse2 x
>+%if %2 == 6
>+    lea         r0,     [r0 + 4 * r1]
>+    lea         r2,     [r2 + 4 * r3]
>+    FILTER_P2S_8_2_sse2 x
>+%endif
>+%assign x x+8
>+%endrep
>+%rep (%1 % 8)/4
>+    FILTER_P2S_4_4_sse2 x
>+%assign x x+4
>+%endrep
>+%rep (%1 % 4)/2
>+    FILTER_P2S_2_4_sse2 x
>+%endrep
>+%if y < %2/4
>+    lea         r0,     [r0 + 4 * r1]
>+    lea         r2,     [r2 + 4 * r3]
>+%assign y y+1
>+%endif
>+%endrep
>+%endif
>+RET
>+%endmacro
>+
>+    FILTER_PIX_TO_SHORT_sse2 2, 4
>+    FILTER_PIX_TO_SHORT_sse2 2, 8
>+    FILTER_PIX_TO_SHORT_sse2 2, 16
>+    FILTER_PIX_TO_SHORT_sse2 4, 2
>+    FILTER_PIX_TO_SHORT_sse2 4, 4
>+    FILTER_PIX_TO_SHORT_sse2 4, 8
>+    FILTER_PIX_TO_SHORT_sse2 4, 16
>+    FILTER_PIX_TO_SHORT_sse2 4, 32
>+    FILTER_PIX_TO_SHORT_sse2 6, 8
>+    FILTER_PIX_TO_SHORT_sse2 6, 16
>+    FILTER_PIX_TO_SHORT_sse2 8, 2
>+    FILTER_PIX_TO_SHORT_sse2 8, 4
>+    FILTER_PIX_TO_SHORT_sse2 8, 6
>+    FILTER_PIX_TO_SHORT_sse2 8, 8
>+    FILTER_PIX_TO_SHORT_sse2 8, 12
>+    FILTER_PIX_TO_SHORT_sse2 8, 16
>+    FILTER_PIX_TO_SHORT_sse2 8, 32
>+    FILTER_PIX_TO_SHORT_sse2 8, 64
>+    FILTER_PIX_TO_SHORT_sse2 12, 16
>+    FILTER_PIX_TO_SHORT_sse2 12, 32
>+    FILTER_PIX_TO_SHORT_sse2 16, 4
>+    FILTER_PIX_TO_SHORT_sse2 16, 8
>+    FILTER_PIX_TO_SHORT_sse2 16, 12
>+    FILTER_PIX_TO_SHORT_sse2 16, 16
>+    FILTER_PIX_TO_SHORT_sse2 16, 24
>+    FILTER_PIX_TO_SHORT_sse2 16, 32
>+    FILTER_PIX_TO_SHORT_sse2 16, 64
>+    FILTER_PIX_TO_SHORT_sse2 24, 32
>+    FILTER_PIX_TO_SHORT_sse2 24, 64
>+    FILTER_PIX_TO_SHORT_sse2 32, 8
>+    FILTER_PIX_TO_SHORT_sse2 32, 16
>+    FILTER_PIX_TO_SHORT_sse2 32, 24
>+    FILTER_PIX_TO_SHORT_sse2 32, 32
>+    FILTER_PIX_TO_SHORT_sse2 32, 48
>+    FILTER_PIX_TO_SHORT_sse2 32, 64
>+    FILTER_PIX_TO_SHORT_sse2 48, 64
>+    FILTER_PIX_TO_SHORT_sse2 64, 16
>+    FILTER_PIX_TO_SHORT_sse2 64, 32
>+    FILTER_PIX_TO_SHORT_sse2 64, 48
>+    FILTER_PIX_TO_SHORT_sse2 64, 64
>+
> %macro FILTER_H4_w2_2 3
>     movh        %2, [srcq - 1]
>     pshufb      %2, %2, Tm0
>@@ -3299,6 +3477,7 @@
>     mov         [dstq + dststrideq], r4w
> %endmacro
> 
>+
> ;-----------------------------------------------------------------------------
> ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;-----------------------------------------------------------------------------
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/ipfilter8.h
>--- a/source/common/x86/ipfilter8.h	Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/ipfilter8.h	Thu Jun 04 16:52:19 2015 -0700
>@@ -965,6 +965,46 @@
> void x265_interp_4tap_vert_ps_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_ps_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_ps_4x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>+void x265_filterPixelToShort_2x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_2x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_2x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x2_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_6x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_6x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x2_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x6_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_12x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_24x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_12x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_24x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_48x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> #ifdef X86_64
> void x265_interp_4tap_vert_pp_6x8_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_pp_6x16_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150605/28eb1dd5/attachment-0001.html>


More information about the x265-devel mailing list