<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div>The code is correct, but the code size could be reduced by replacing PSRLDQ with MOVHLPS.</div><pre><br>At 2015-06-05 07:53:02,dtyx265@gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265@gmail.com>
># Date 1433461939 25200
># Node ID db3d3229113adc9416ca9fd7e33c279d62125bc0
># Parent  fcfba27ecf0b9dac8da123da8cdcac75763496f3
>asm: filterPixelToShort 8-bit and 10-bit sse2
>
>This replaces c code for all of filterPixelToShort for 8 and 10 bit.
>
>64-bit
>
>./test/TestBench --testbench interp | grep p2s
>convert_p2s[  8x8]                2.82x       397.50      1119.92
>convert_p2s[16x16]          3.04x       1347.50          4102.63
>convert_p2s[32x32]          1.41x       5197.50          7332.50
>convert_p2s[64x64]          1.26x       20588.66      25962.67
>convert_p2s[  8x4]                2.52x       229.99      580.00
>convert_p2s[  4x8]                2.22x       279.99      622.38
>convert_p2s[ 16x8]             2.74x       710.00      1944.96
>convert_p2s[ 8x16]             3.09x       730.00      2254.97
>convert_p2s[32x16]          1.43x       2630.20          3774.20
>convert_p2s[16x32]          3.07x       2630.01          8064.97
>convert_p2s[64x32]          1.28x       10307.52      13162.50
>convert_p2s[32x64]          1.40x       10307.60      14382.50
>convert_p2s[16x12]          3.05x       1027.50          3134.97
>convert_p2s[12x16]          2.78x       1115.00          3100.73
>convert_p2s[ 16x4]             2.56x       387.50      990.06
>convert_p2s[ 4x16]             2.43x       490.02      1190.04
>convert_p2s[32x24]          1.42x       3909.22          5532.57
>convert_p2s[24x32]          3.94x       3907.50          15387.65
>convert_p2s[ 32x8]             1.37x       1347.49          1848.57
>convert_p2s[ 8x32]             3.31x       1390.00          4596.10
>convert_p2s[64x48]          1.27x       15455.30      19562.58
>convert_p2s[48x64]          1.30x       15428.06      20132.50
>convert_p2s[64x16]          1.28x       5192.50          6669.05
>convert_p2s[16x64]          3.05x       5197.80          15855.29
>chroma_p2s[  4x4]         1.89x       177.50      334.95
>chroma_p2s[  8x8]         2.82x       397.50      1119.83
>chroma_p2s[16x16]           3.05x       1347.51          4105.62
>chroma_p2s[32x32]           1.41x       5187.50          7334.46
>chroma_p2s[  4x2]         1.67x       115.04      192.48
>chroma_p2s[  2x4]         1.16x       184.99      214.98
>chroma_p2s[  8x4]         2.55x       227.50      580.01
>chroma_p2s[  4x8]         2.24x       277.50      622.42
>chroma_p2s[ 16x8]              2.75x       707.50      1945.02
>chroma_p2s[ 8x16]              2.92x       772.45      2254.91
>chroma_p2s[32x16]           1.44x       2627.50          3772.50
>chroma_p2s[16x32]           3.07x       2627.50          8065.07
>chroma_p2s[  8x6]         2.74x       307.59      842.44
>chroma_p2s[  6x8]         1.71x       507.50      870.00
>chroma_p2s[  8x2]         2.02x       147.50      297.50
>chroma_p2s[  2x8]         1.12x       307.50      344.96
>chroma_p2s[16x12]           3.05x       1027.50          3134.97
>chroma_p2s[12x16]           2.79x       1112.50          3100.21
>chroma_p2s[ 16x4]              2.56x       387.50      990.19
>chroma_p2s[ 4x16]              2.43x       489.99      1192.44
>chroma_p2s[32x24]           1.42x       3907.50          5533.21
>chroma_p2s[24x32]           3.93x       3923.72          15427.02
>chroma_p2s[ 32x8]              1.37x       1347.50          1850.34
>chroma_p2s[ 8x32]              3.31x       1387.50          4595.20
>chroma_p2s[  4x8]         2.22x       277.50      617.42
>chroma_p2s[ 8x16]              3.09x       727.50      2247.50
>chroma_p2s[16x32]           3.07x       2627.50          8065.15
>chroma_p2s[32x64]           1.40x       10307.58      14385.50
>chroma_p2s[  4x4]         1.87x       177.50      332.51
>chroma_p2s[  2x8]         1.11x       307.50      342.46
>chroma_p2s[  8x8]         2.80x       397.50      1112.50
>chroma_p2s[ 4x16]              2.43x       489.99      1192.49
>chroma_p2s[16x16]           3.05x       1347.74          4104.98
>chroma_p2s[ 8x32]              3.31x       1387.50          4595.44
>chroma_p2s[32x32]           1.41x       5197.55          7332.50
>chroma_p2s[16x64]           3.05x       5197.60          15855.21
>chroma_p2s[ 8x12]              2.92x       557.50      1627.41
>chroma_p2s[ 6x16]              1.79x       1002.50          1797.47
>chroma_p2s[  8x4]         2.55x       227.50      580.01
>chroma_p2s[ 2x16]              1.09x       602.49      657.49
>chroma_p2s[16x24]           3.05x       1987.50          6054.97
>chroma_p2s[12x32]           2.81x       2170.00          6095.56
>chroma_p2s[ 16x8]              2.75x       707.50      1944.97
>chroma_p2s[ 4x32]              3.09x       877.50      2707.95
>chroma_p2s[32x48]           1.40x       7757.54          10862.72
>chroma_p2s[24x64]           3.95x       7757.50          30663.70
>chroma_p2s[32x16]           1.44x       2627.50          3773.21
>chroma_p2s[ 8x64]              3.30x       2717.50          8955.97
>chroma_p2s[  4x4]         1.89x       177.50      334.94
>chroma_p2s[  8x8]         2.82x       397.50      1119.95
>chroma_p2s[16x16]           3.05x       1347.50          4105.23
>chroma_p2s[32x32]           1.41x       5197.52          7332.50
>chroma_p2s[64x64]           1.25x       20722.45      25962.96
>chroma_p2s[  8x4]         2.57x       227.50      584.01
>chroma_p2s[  4x8]         2.23x       277.49      617.44
>chroma_p2s[ 16x8]              2.75x       707.57      1945.83
>chroma_p2s[ 8x16]              3.08x       729.99      2247.50
>chroma_p2s[32x16]           1.44x       2627.50          3772.50
>chroma_p2s[16x32]           3.07x       2627.50          8064.97
>chroma_p2s[64x32]           1.28x       10307.86      13162.50
>chroma_p2s[32x64]           1.40x       10307.68      14385.40
>chroma_p2s[16x12]           3.05x       1027.50          3135.10
>chroma_p2s[12x16]           2.79x       1112.50          3100.94
>chroma_p2s[ 16x4]              2.57x       387.50      994.88
>chroma_p2s[ 4x16]              2.43x       489.99      1192.44
>chroma_p2s[32x24]           1.42x       3907.82          5532.55
>chroma_p2s[24x32]           3.95x       3907.50          15422.51
>chroma_p2s[ 32x8]              1.37x       1347.50          1849.83
>chroma_p2s[ 8x32]              3.31x       1387.50          4594.97
>chroma_p2s[64x48]           1.27x       15458.83      19562.50
>chroma_p2s[48x64]           1.30x       15427.81      20132.50
>chroma_p2s[64x16]           1.28x       5187.50          6662.50
>chroma_p2s[16x64]           3.05x       5197.50          15855.09
>
>32-bit
>
>./test/TestBench --testbench interp | grep p2s
>convert_p2s[  8x8]                2.55x       484.99      1237.46
>convert_p2s[16x16]          2.89x       1444.98          4174.96
>convert_p2s[32x32]          6.70x       5295.00          35485.83
>convert_p2s[64x64]          6.41x       20695.04      132575.69
>convert_p2s[  8x4]                2.06x       325.00      669.93
>convert_p2s[  4x8]                1.93x       360.08      694.95
>convert_p2s[ 16x8]             2.52x       805.00      2032.47
>convert_p2s[ 8x16]             2.83x       820.00      2319.99
>convert_p2s[32x16]          6.55x       2725.00          17855.75
>convert_p2s[16x32]          3.00x       2725.00          8185.23
>convert_p2s[64x32]          6.36x       10405.41      66205.67
>convert_p2s[32x64]          6.82x       10404.99      70948.87
>convert_p2s[16x12]          2.84x       1124.99          3197.47
>convert_p2s[12x16]          2.61x       1207.52          3155.00
>convert_p2s[ 16x4]             2.29x       485.00      1112.42
>convert_p2s[ 4x16]             2.22x       564.99      1252.46
>convert_p2s[32x24]          6.66x       4005.00          26655.58
>convert_p2s[24x32]          6.99x       4005.00          28003.99
>convert_p2s[ 32x8]             6.27x       1445.00          9054.99
>convert_p2s[ 8x32]             3.18x       1475.00          4687.54
>convert_p2s[64x48]          6.40x       15525.12      99348.35
>convert_p2s[48x64]          6.54x       15525.42      101593.84
>convert_p2s[64x16]          6.33x       5285.10          33457.09
>convert_p2s[16x64]          3.01x       5295.02          15944.96
>chroma_p2s[  4x4]         1.64x       257.49      422.54
>chroma_p2s[  8x8]         2.55x       484.99      1237.48
>chroma_p2s[16x16]           2.89x       1444.99          4174.94
>chroma_p2s[32x32]           6.70x       5295.02          35485.08
>chroma_p2s[  4x2]         1.33x       212.48      282.52
>chroma_p2s[  2x4]         0.98x       272.49      267.50
>chroma_p2s[  8x4]         2.06x       325.00      669.99
>chroma_p2s[  4x8]         1.93x       360.00      694.98
>chroma_p2s[ 16x8]              2.52x       805.00      2032.51
>chroma_p2s[ 8x16]              2.83x       820.15      2319.98
>chroma_p2s[32x16]           6.55x       2725.00          17855.50
>chroma_p2s[16x32]           3.00x       2725.00          8184.98
>chroma_p2s[  8x6]         2.33x       402.50      937.48
>chroma_p2s[  6x8]         1.71x       585.00      1000.03
>chroma_p2s[  8x2]         1.62x       242.49      392.50
>chroma_p2s[  2x8]         1.15x       375.00      432.50
>chroma_p2s[16x12]           2.84x       1124.99          3194.98
>chroma_p2s[12x16]           2.63x       1200.00          3155.18
>chroma_p2s[ 16x4]              2.29x       485.07      1112.51
>chroma_p2s[ 4x16]              2.18x       562.54      1224.98
>chroma_p2s[32x24]           6.66x       4005.01          26660.21
>chroma_p2s[24x32]           6.94x       4005.05          27805.24
>chroma_p2s[ 32x8]              6.27x       1445.00          9054.99
>chroma_p2s[ 8x32]              3.18x       1475.00          4687.52
>chroma_p2s[  4x8]         2.01x       360.00      722.48
>chroma_p2s[ 8x16]              2.83x       820.00      2319.98
>chroma_p2s[16x32]           3.00x       2725.07          8185.31
>chroma_p2s[32x64]           6.79x       10405.07      70686.27
>chroma_p2s[  4x4]         1.63x       259.98      422.48
>chroma_p2s[  2x8]         1.12x       385.00      432.50
>chroma_p2s[  8x8]         2.55x       485.00      1237.48
>chroma_p2s[ 4x16]              2.18x       562.49      1224.98
>chroma_p2s[16x16]           2.89x       1445.10          4174.98
>chroma_p2s[ 8x32]              3.18x       1475.00          4687.56
>chroma_p2s[32x32]           6.70x       5295.13          35485.12
>chroma_p2s[16x64]           3.01x       5295.00          15945.08
>chroma_p2s[ 8x12]              2.71x       655.09      1774.92
>chroma_p2s[ 6x16]              1.80x       1057.50          1899.96
>chroma_p2s[  8x4]         2.06x       325.00      669.99
>chroma_p2s[ 2x16]              1.06x       659.99      697.48
>chroma_p2s[16x24]           2.93x       2085.00          6115.20
>chroma_p2s[12x32]           2.72x       2250.00          6125.25
>chroma_p2s[ 16x8]              2.52x       805.03      2032.57
>chroma_p2s[ 4x32]              2.85x       957.54      2725.27
>chroma_p2s[32x48]           6.79x       7855.06          53302.55
>chroma_p2s[24x64]           7.04x       7855.01          55325.50
>chroma_p2s[32x16]           6.55x       2725.00          17855.12
>chroma_p2s[ 8x64]              3.23x       2805.32          9050.32
>chroma_p2s[  4x4]         1.63x       259.99      422.49
>chroma_p2s[  8x8]         2.55x       487.49      1245.00
>chroma_p2s[16x16]           2.89x       1445.00          4175.20
>chroma_p2s[32x32]           6.74x       5295.00          35670.43
>chroma_p2s[64x64]           6.40x       20696.68      132486.70
>chroma_p2s[  8x4]         2.06x       325.00      669.99
>chroma_p2s[  4x8]         1.93x       360.03      694.99
>chroma_p2s[ 16x8]              2.52x       805.00      2032.49
>chroma_p2s[ 8x16]              2.83x       820.14      2320.16
>chroma_p2s[32x16]           6.55x       2724.98          17855.27
>chroma_p2s[16x32]           3.00x       2725.10          8185.09
>chroma_p2s[64x32]           6.39x       10405.02      66479.45
>chroma_p2s[32x64]           6.81x       10414.99      70945.01
>chroma_p2s[16x12]           2.84x       1125.00          3194.98
>chroma_p2s[12x16]           2.63x       1200.13          3155.00
>chroma_p2s[ 16x4]              2.29x       485.00      1112.48
>chroma_p2s[ 4x16]              2.18x       562.49      1224.98
>chroma_p2s[32x24]           6.66x       4005.17          26655.93
>chroma_p2s[24x32]           6.99x       4005.01          28008.09
>chroma_p2s[ 32x8]              6.27x       1444.99          9060.05
>chroma_p2s[ 8x32]              3.18x       1474.99          4687.52
>chroma_p2s[64x48]           6.40x       15525.01      99400.07
>chroma_p2s[48x64]           6.55x       15525.09      101699.22
>chroma_p2s[64x16]           6.28x       5285.00          33215.04
>chroma_p2s[16x64]           3.01x       5295.16          15944.98
>
>10-bit
>
>./test/TestBench --testbench interp | grep p2s
>convert_p2s[  8x8]                3.50x       367.50      1284.66
>convert_p2s[16x16]          3.13x       1309.98          4095.49
>convert_p2s[32x32]          3.63x       5150.04          18697.50
>convert_p2s[64x64]          3.52x       21021.88      74018.38
>convert_p2s[  8x4]                2.86x       217.78      622.48
>convert_p2s[  4x8]                3.23x       210.00      677.50
>convert_p2s[ 16x8]             3.02x       669.99      2022.45
>convert_p2s[ 8x16]             3.50x       676.67      2365.62
>convert_p2s[32x16]          3.67x       2589.99          9497.50
>convert_p2s[16x32]          3.09x       2591.86          8015.74
>convert_p2s[64x32]          3.67x       10271.36      37680.71
>convert_p2s[32x64]          3.51x       10678.12      37516.15
>convert_p2s[16x12]          3.16x       992.47      3134.97
>convert_p2s[12x16]          3.14x       989.99      3112.45
>convert_p2s[ 16x4]             2.84x       360.55      1022.43
>convert_p2s[ 4x16]             3.79x       350.00      1324.83
>convert_p2s[32x24]          3.64x       3869.98          14097.50
>convert_p2s[24x32]          3.63x       3871.73          14066.42
>convert_p2s[ 32x8]             3.60x       1310.04          4720.07
>convert_p2s[ 8x32]             3.79x       1314.99          4980.69
>convert_p2s[64x48]          3.42x       16163.33      55231.05
>convert_p2s[48x64]          3.52x       15806.54      55712.53
>convert_p2s[64x16]          3.60x       5149.99          18527.55
>convert_p2s[16x64]          3.01x       5500.02          16579.86
>chroma_p2s[  4x4]         2.53x       137.50      347.44
>chroma_p2s[  8x8]         3.49x       367.50      1284.14
>chroma_p2s[16x16]           3.13x       1307.50          4095.64
>chroma_p2s[32x32]           3.65x       5147.50          18765.20
>chroma_p2s[  4x2]         2.31x       87.50          202.38
>chroma_p2s[  2x4]         1.43x       155.04      222.38
>chroma_p2s[  8x4]         2.98x       217.50      647.48
>chroma_p2s[  4x8]         3.27x       207.50      677.53
>chroma_p2s[ 16x8]              2.99x       670.00      1999.98
>chroma_p2s[ 8x16]              3.63x       672.67      2439.46
>chroma_p2s[32x16]           3.67x       2587.50          9497.50
>chroma_p2s[16x32]           3.10x       2587.50          8015.55
>chroma_p2s[  8x6]         3.18x       297.50      944.99
>chroma_p2s[  6x8]         2.63x       355.00      935.08
>chroma_p2s[  8x2]         2.25x       147.54      332.45
>chroma_p2s[  2x8]         1.60x       237.50      379.99
>chroma_p2s[16x12]           3.18x       987.50      3135.61
>chroma_p2s[12x16]           3.16x       985.00      3113.44
>chroma_p2s[ 16x4]              2.84x       357.50      1014.99
>chroma_p2s[ 4x16]              3.85x       343.75      1324.97
>chroma_p2s[32x24]           3.65x       3867.50          14097.50
>chroma_p2s[24x32]           3.64x       3867.51          14069.03
>chroma_p2s[ 32x8]              3.61x       1307.50          4720.05
>chroma_p2s[ 8x32]              3.79x       1312.50          4980.79
>chroma_p2s[  4x8]         3.27x       207.50      677.50
>chroma_p2s[ 8x16]              3.54x       672.66      2381.40
>chroma_p2s[16x32]           3.10x       2587.52          8016.40
>chroma_p2s[32x64]           3.53x       10709.11      37781.17
>chroma_p2s[  4x4]         2.53x       137.50      347.41
>chroma_p2s[  2x8]         1.60x       237.50      380.00
>chroma_p2s[  8x8]         3.50x       367.50      1284.84
>chroma_p2s[ 4x16]              3.85x       343.75      1324.97
>chroma_p2s[16x16]           3.13x       1307.50          4095.19
>chroma_p2s[ 8x32]              3.79x       1312.50          4980.70
>chroma_p2s[32x32]           3.63x       5147.53          18697.50
>chroma_p2s[16x64]           3.03x       5473.41          16578.56
>chroma_p2s[ 8x12]              3.64x       518.15      1885.51
>chroma_p2s[ 6x16]              2.84x       673.01      1912.40
>chroma_p2s[  8x4]         2.86x       217.50      622.61
>chroma_p2s[ 2x16]              1.72x       397.50      684.98
>chroma_p2s[16x24]           3.11x       1947.54          6057.70
>chroma_p2s[12x32]           3.08x       1945.00          5991.46
>chroma_p2s[ 16x8]              2.98x       670.00      1995.10
>chroma_p2s[ 4x32]              4.17x       665.00      2775.29
>chroma_p2s[32x48]           3.50x       8112.28          28426.23
>chroma_p2s[24x64]           3.51x       8104.04          28433.64
>chroma_p2s[32x16]           3.67x       2587.51          9497.50
>chroma_p2s[ 8x64]              3.55x       2953.69          10476.00
>chroma_p2s[  4x4]         2.53x       137.50      347.44
>chroma_p2s[  8x8]         3.50x       367.50      1284.63
>chroma_p2s[16x16]           3.13x       1307.50          4095.76
>chroma_p2s[32x32]           3.63x       5147.50          18697.50
>chroma_p2s[64x64]           3.50x       21082.21      73810.80
>chroma_p2s[  8x4]         2.98x       217.52      647.46
>chroma_p2s[  4x8]         3.27x       207.50      677.50
>chroma_p2s[ 16x8]              2.98x       670.15      1999.97
>chroma_p2s[ 8x16]              3.52x       672.50      2364.99
>chroma_p2s[32x16]           3.67x       2587.52          9497.50
>chroma_p2s[16x32]           3.10x       2587.69          8015.21
>chroma_p2s[64x32]           3.58x       10267.57      36767.66
>chroma_p2s[32x64]           3.50x       10673.18      37334.83
>chroma_p2s[16x12]           3.18x       987.50      3136.04
>chroma_p2s[12x16]           3.16x       985.04      3112.94
>chroma_p2s[ 16x4]              2.84x       357.60      1015.06
>chroma_p2s[ 4x16]              3.85x       343.74      1324.92
>chroma_p2s[32x24]           3.65x       3867.50          14097.50
>chroma_p2s[24x32]           3.64x       3867.53          14066.04
>chroma_p2s[ 32x8]              3.61x       1307.59          4720.09
>chroma_p2s[ 8x32]              3.80x       1312.50          4981.43
>chroma_p2s[64x48]           3.50x       15895.79      55558.28
>chroma_p2s[48x64]           3.50x       15778.97      55237.76
>chroma_p2s[64x16]           3.60x       5147.58          18530.57
>chroma_p2s[16x64]           3.03x       5471.98          16578.79
>
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp     Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/asm-primitives.cpp     Thu Jun 04 16:52:19 2015 -0700
>@@ -934,6 +934,11 @@
>         LUMA_VSS_FILTERS(sse2);

>         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
>+
>+        ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
>+        ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
>+        ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
>+        ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
>     }
>     if (cpuMask & X265_CPU_SSE3)
>     {
>@@ -1860,6 +1865,10 @@
>         p.idst4x4 = x265_idst4_sse2;

>         p.planecopy_sp = x265_downShift_16_sse2;
>+        ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
>+        ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
>+        ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
>+        ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
>     }
>     if (cpuMask & X265_CPU_SSE3)
>     {
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm    Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/const-a.asm    Thu Jun 04 16:52:19 2015 -0700
>@@ -80,6 +80,7 @@
> const pw_1023,              times 16 dw 1023
> const pw_1024,              times 16 dw 1024
> const pw_4096,              times 16 dw 4096
>+const pw_8192,              times  8 dw 8192
> const pw_00ff,              times 16 dw 0x00ff
> const pw_ff00,              times  8 dw 0xff00
> const pw_2000,              times 16 dw 0x2000
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/ipfilter16.asm
>--- a/source/common/x86/ipfilter16.asm Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/ipfilter16.asm Thu Jun 04 16:52:19 2015 -0700
>@@ -953,6 +953,171 @@
> FILTER_HOR_CHROMA_sse3 64, 48, ps
> FILTER_HOR_CHROMA_sse3 64, 64, ps

>+%macro FILTER_P2S_2_4_sse2 1
>+    movd        m0,     [r0 + %1]
>+    movd        m2,     [r0 + r1 * 2 + %1]
>+    movhps      m0,     [r0 + r1 + %1]
>+    movhps      m2,     [r0 + r4 + %1]
>+    psllw       m0,     4
>+    psllw       m2,     4
>+    psubw       m0,     m1
>+    psubw       m2,     m1
>+
>+    movd        [r2 + r3 * 0 + %1], m0
>+    movd        [r2 + r3 * 2 + %1], m2
>+    psrldq      m0,     8
>+    psrldq      m2,     8
>+    movd        [r2 + r3 * 1 + %1], m0
>+    movd        [r2 + r5 + %1], m2
>+%endmacro
>+
>+%macro FILTER_P2S_4_4_sse2 1
>+    movh        m0,     [r0 + %1]
>+    movhps      m0,     [r0 + r1 + %1]
>+    psllw       m0,     4
>+    psubw       m0,     m1
>+    movh        [r2 + r3 * 0 + %1], m0
>+    movhps      [r2 + r3 * 1 + %1], m0
>+
>+    movh        m2,     [r0 + r1 * 2 + %1]
>+    movhps      m2,     [r0 + r4 + %1]
>+    psllw       m2,     4
>+    psubw       m2,     m1
>+    movh        [r2 + r3 * 2 + %1], m2
>+    movhps      [r2 + r5 + %1], m2
>+%endmacro
>+
>+%macro FILTER_P2S_4_2_sse2 0
>+    movh        m0,     [r0]
>+    movhps      m0,     [r0 + r1 * 2]
>+    psllw       m0,     4
>+    psubw       m0,     [pw_2000]
>+    movh        [r2 + r3 * 0], m0
>+    movhps      [r2 + r3 * 2], m0
>+%endmacro
>+
>+%macro FILTER_P2S_8_4_sse2 1
>+    movu        m0,     [r0 + %1]
>+    movu        m2,     [r0 + r1 + %1]
>+    psllw       m0,     4
>+    psllw       m2,     4
>+    psubw       m0,     m1
>+    psubw       m2,     m1
>+    movu        [r2 + r3 * 0 + %1], m0
>+    movu        [r2 + r3 * 1 + %1], m2
>+
>+    movu        m3,     [r0 + r1 * 2 + %1]
>+    movu        m4,     [r0 + r4 + %1]
>+    psllw       m3,     4
>+    psllw       m4,     4
>+    psubw       m3,     m1
>+    psubw       m4,     m1
>+    movu        [r2 + r3 * 2 + %1], m3
>+    movu        [r2 + r5 + %1], m4
>+%endmacro
>+
>+%macro FILTER_P2S_8_2_sse2 1
>+    movu        m0,     [r0 + %1]
>+    movu        m2,     [r0 + r1 + %1]
>+    psllw       m0,     4
>+    psllw       m2,     4
>+    psubw       m0,     m1
>+    psubw       m2,     m1
>+    movu        [r2 + r3 * 0 + %1], m0
>+    movu        [r2 + r3 * 1 + %1], m2
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_PIX_TO_SHORT_sse2 2
>+INIT_XMM sse2
>+cglobal filterPixelToShort_%1x%2, 4, 6, 3
>+%if %2 == 2
>+%if %1 == 4
>+    FILTER_P2S_4_2_sse2
>+%elif %1 == 8
>+    add        r1d, r1d
>+    add        r3d, r3d
>+    mova       m1, [pw_2000]
>+    FILTER_P2S_8_2_sse2 0
>+%endif
>+%else
>+    add        r1d, r1d
>+    add        r3d, r3d
>+    mova       m1, [pw_2000]
>+    lea        r4, [r1 * 3]
>+    lea        r5, [r3 * 3]
>+%assign y 1
>+%rep %2/4
>+%assign x 0
>+%rep %1/8
>+    FILTER_P2S_8_4_sse2 x
>+%if %2 == 6
>+    lea         r0,     [r0 + 4 * r1]
>+    lea         r2,     [r2 + 4 * r3]
>+    FILTER_P2S_8_2_sse2 x
>+%endif
>+%assign x x+16
>+%endrep
>+%rep (%1 % 8)/4
>+    FILTER_P2S_4_4_sse2 x
>+%assign x x+8
>+%endrep
>+%rep (%1 % 4)/2
>+    FILTER_P2S_2_4_sse2 x
>+%endrep
>+%if y < %2/4
>+    lea         r0,     [r0 + 4 * r1]
>+    lea         r2,     [r2 + 4 * r3]
>+%assign y y+1
>+%endif
>+%endrep
>+%endif
>+RET
>+%endmacro
>+
>+    FILTER_PIX_TO_SHORT_sse2 2, 4
>+    FILTER_PIX_TO_SHORT_sse2 2, 8
>+    FILTER_PIX_TO_SHORT_sse2 2, 16
>+    FILTER_PIX_TO_SHORT_sse2 4, 2
>+    FILTER_PIX_TO_SHORT_sse2 4, 4
>+    FILTER_PIX_TO_SHORT_sse2 4, 8
>+    FILTER_PIX_TO_SHORT_sse2 4, 16
>+    FILTER_PIX_TO_SHORT_sse2 4, 32
>+    FILTER_PIX_TO_SHORT_sse2 6, 8
>+    FILTER_PIX_TO_SHORT_sse2 6, 16
>+    FILTER_PIX_TO_SHORT_sse2 8, 2
>+    FILTER_PIX_TO_SHORT_sse2 8, 4
>+    FILTER_PIX_TO_SHORT_sse2 8, 6
>+    FILTER_PIX_TO_SHORT_sse2 8, 8
>+    FILTER_PIX_TO_SHORT_sse2 8, 12
>+    FILTER_PIX_TO_SHORT_sse2 8, 16
>+    FILTER_PIX_TO_SHORT_sse2 8, 32
>+    FILTER_PIX_TO_SHORT_sse2 8, 64
>+    FILTER_PIX_TO_SHORT_sse2 12, 16
>+    FILTER_PIX_TO_SHORT_sse2 12, 32
>+    FILTER_PIX_TO_SHORT_sse2 16, 4
>+    FILTER_PIX_TO_SHORT_sse2 16, 8
>+    FILTER_PIX_TO_SHORT_sse2 16, 12
>+    FILTER_PIX_TO_SHORT_sse2 16, 16
>+    FILTER_PIX_TO_SHORT_sse2 16, 24
>+    FILTER_PIX_TO_SHORT_sse2 16, 32
>+    FILTER_PIX_TO_SHORT_sse2 16, 64
>+    FILTER_PIX_TO_SHORT_sse2 24, 32
>+    FILTER_PIX_TO_SHORT_sse2 24, 64
>+    FILTER_PIX_TO_SHORT_sse2 32, 8
>+    FILTER_PIX_TO_SHORT_sse2 32, 16
>+    FILTER_PIX_TO_SHORT_sse2 32, 24
>+    FILTER_PIX_TO_SHORT_sse2 32, 32
>+    FILTER_PIX_TO_SHORT_sse2 32, 48
>+    FILTER_PIX_TO_SHORT_sse2 32, 64
>+    FILTER_PIX_TO_SHORT_sse2 48, 64
>+    FILTER_PIX_TO_SHORT_sse2 64, 16
>+    FILTER_PIX_TO_SHORT_sse2 64, 32
>+    FILTER_PIX_TO_SHORT_sse2 64, 48
>+    FILTER_PIX_TO_SHORT_sse2 64, 64
>+
> ;------------------------------------------------------------------------------------------------------------
> ; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;------------------------------------------------------------------------------------------------------------
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm  Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/ipfilter8.asm  Thu Jun 04 16:52:19 2015 -0700
>@@ -301,6 +301,7 @@
> cextern pw_32
> cextern pw_512
> cextern pw_2000
>+cextern pw_8192

> %macro FILTER_H4_w2_2_sse2 0
>     pxor        m3, m3
>@@ -3283,6 +3284,183 @@
>     FILTER_V4_W16n_H2_sse2 ps, 64, 16
> %endif

>+%macro FILTER_P2S_2_4_sse2 1
>+    movd        m2,     [r0 + %1]
>+    movd        m3,     [r0 + r1 + %1]
>+    punpcklwd   m2,     m3
>+    movd        m3,     [r0 + r1 * 2 + %1]
>+    movd        m4,     [r0 + r4 + %1]
>+    punpcklwd   m3,     m4
>+    punpckldq   m2,     m3
>+    punpcklbw   m2,     m0
>+    psllw       m2,     6
>+    psubw       m2,     m1
>+
>+    movd        [r2 + r3 * 0 + %1 * 2], m2
>+    psrldq      m2,     4
>+    movd        [r2 + r3 * 1 + %1 * 2], m2
>+    psrldq      m2,     4
>+    movd        [r2 + r3 * 2 + %1 * 2], m2
>+    psrldq      m2,     4
>+    movd        [r2 + r5 + %1 * 2], m2
>+%endmacro
>+
>+%macro FILTER_P2S_4_4_sse2 1
>+    movd        m2,     [r0 + %1]
>+    movd        m3,     [r0 + r1 + %1]
>+    movd        m4,     [r0 + r1 * 2 + %1]
>+    movd        m5,     [r0 + r4 + %1]
>+    punpckldq   m2,     m3
>+    punpcklbw   m2,     m0
>+    punpckldq   m4,     m5
>+    punpcklbw   m4,     m0
>+    psllw       m2,     6
>+    psllw       m4,     6
>+    psubw       m2,     m1
>+    psubw       m4,     m1
>+    movh        [r2 + r3 * 0 + %1 * 2], m2
>+    movh        [r2 + r3 * 2 + %1 * 2], m4
>+    movhps      [r2 + r3 * 1 + %1 * 2], m2
>+    movhps      [r2 + r5 + %1 * 2], m4
>+%endmacro
>+
>+%macro FILTER_P2S_4_2_sse2 0
>+    movd        m2,     [r0]
>+    movd        m3,     [r0 + r1]
>+    punpckldq   m2,     m3
>+    punpcklbw   m2,     m0
>+    psllw       m2,     6
>+    psubw       m2,     [pw_8192]
>+    movh        [r2],   m2
>+    movhps      [r2 + r3 * 2], m2
>+%endmacro
>+
>+%macro FILTER_P2S_8_4_sse2 1
>+    movh        m2,     [r0 + %1]
>+    movh        m3,     [r0 + r1 + %1]
>+    movh        m4,     [r0 + r1 * 2 + %1]
>+    movh        m5,     [r0 + r4 + %1]
>+    punpcklbw   m2,     m0
>+    punpcklbw   m3,     m0
>+    punpcklbw   m5,     m0
>+    punpcklbw   m4,     m0
>+    psllw       m2,     6
>+    psllw       m3,     6
>+    psllw       m5,     6
>+    psllw       m4,     6
>+    psubw       m2,     m1
>+    psubw       m3,     m1
>+    psubw       m4,     m1
>+    psubw       m5,     m1
>+    movu        [r2 + r3 * 0 + %1 * 2], m2
>+    movu        [r2 + r3 * 1 + %1 * 2], m3
>+    movu        [r2 + r3 * 2 + %1 * 2], m4
>+    movu        [r2 + r5 + %1 * 2], m5
>+%endmacro
>+
>+%macro FILTER_P2S_8_2_sse2 1
>+    movh        m2,     [r0 + %1]
>+    movh        m3,     [r0 + r1 + %1]
>+    punpcklbw   m2,     m0
>+    punpcklbw   m3,     m0
>+    psllw       m2,     6
>+    psllw       m3,     6
>+    psubw       m2,     m1
>+    psubw       m3,     m1
>+    movu        [r2 + r3 * 0 + %1 * 2], m2
>+    movu        [r2 + r3 * 1 + %1 * 2], m3
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_PIX_TO_SHORT_sse2 2
>+INIT_XMM sse2
>+cglobal filterPixelToShort_%1x%2, 4, 6, 6
>+    pxor        m0,     m0
>+%if %2 == 2
>+%if %1 == 4
>+    FILTER_P2S_4_2_sse2
>+%elif %1 == 8
>+    add        r3d, r3d
>+    mova       m1, [pw_8192]
>+    FILTER_P2S_8_2_sse2 0
>+%endif
>+%else
>+    add        r3d, r3d
>+    mova       m1, [pw_8192]
>+    lea        r4, [r1 * 3]
>+    lea        r5, [r3 * 3]
>+%assign y 1
>+%rep %2/4
>+%assign x 0
>+%rep %1/8
>+    FILTER_P2S_8_4_sse2 x
>+%if %2 == 6
>+    lea         r0,     [r0 + 4 * r1]
>+    lea         r2,     [r2 + 4 * r3]
>+    FILTER_P2S_8_2_sse2 x
>+%endif
>+%assign x x+8
>+%endrep
>+%rep (%1 % 8)/4
>+    FILTER_P2S_4_4_sse2 x
>+%assign x x+4
>+%endrep
>+%rep (%1 % 4)/2
>+    FILTER_P2S_2_4_sse2 x
>+%endrep
>+%if y < %2/4
>+    lea         r0,     [r0 + 4 * r1]
>+    lea         r2,     [r2 + 4 * r3]
>+%assign y y+1
>+%endif
>+%endrep
>+%endif
>+RET
>+%endmacro
>+
>+    FILTER_PIX_TO_SHORT_sse2 2, 4
>+    FILTER_PIX_TO_SHORT_sse2 2, 8
>+    FILTER_PIX_TO_SHORT_sse2 2, 16
>+    FILTER_PIX_TO_SHORT_sse2 4, 2
>+    FILTER_PIX_TO_SHORT_sse2 4, 4
>+    FILTER_PIX_TO_SHORT_sse2 4, 8
>+    FILTER_PIX_TO_SHORT_sse2 4, 16
>+    FILTER_PIX_TO_SHORT_sse2 4, 32
>+    FILTER_PIX_TO_SHORT_sse2 6, 8
>+    FILTER_PIX_TO_SHORT_sse2 6, 16
>+    FILTER_PIX_TO_SHORT_sse2 8, 2
>+    FILTER_PIX_TO_SHORT_sse2 8, 4
>+    FILTER_PIX_TO_SHORT_sse2 8, 6
>+    FILTER_PIX_TO_SHORT_sse2 8, 8
>+    FILTER_PIX_TO_SHORT_sse2 8, 12
>+    FILTER_PIX_TO_SHORT_sse2 8, 16
>+    FILTER_PIX_TO_SHORT_sse2 8, 32
>+    FILTER_PIX_TO_SHORT_sse2 8, 64
>+    FILTER_PIX_TO_SHORT_sse2 12, 16
>+    FILTER_PIX_TO_SHORT_sse2 12, 32
>+    FILTER_PIX_TO_SHORT_sse2 16, 4
>+    FILTER_PIX_TO_SHORT_sse2 16, 8
>+    FILTER_PIX_TO_SHORT_sse2 16, 12
>+    FILTER_PIX_TO_SHORT_sse2 16, 16
>+    FILTER_PIX_TO_SHORT_sse2 16, 24
>+    FILTER_PIX_TO_SHORT_sse2 16, 32
>+    FILTER_PIX_TO_SHORT_sse2 16, 64
>+    FILTER_PIX_TO_SHORT_sse2 24, 32
>+    FILTER_PIX_TO_SHORT_sse2 24, 64
>+    FILTER_PIX_TO_SHORT_sse2 32, 8
>+    FILTER_PIX_TO_SHORT_sse2 32, 16
>+    FILTER_PIX_TO_SHORT_sse2 32, 24
>+    FILTER_PIX_TO_SHORT_sse2 32, 32
>+    FILTER_PIX_TO_SHORT_sse2 32, 48
>+    FILTER_PIX_TO_SHORT_sse2 32, 64
>+    FILTER_PIX_TO_SHORT_sse2 48, 64
>+    FILTER_PIX_TO_SHORT_sse2 64, 16
>+    FILTER_PIX_TO_SHORT_sse2 64, 32
>+    FILTER_PIX_TO_SHORT_sse2 64, 48
>+    FILTER_PIX_TO_SHORT_sse2 64, 64
>+
> %macro FILTER_H4_w2_2 3
>     movh        %2, [srcq - 1]
>     pshufb      %2, %2, Tm0
>@@ -3299,6 +3477,7 @@
>     mov         [dstq + dststrideq], r4w
> %endmacro

>+
> ;-----------------------------------------------------------------------------
> ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;-----------------------------------------------------------------------------
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/ipfilter8.h
>--- a/source/common/x86/ipfilter8.h    Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/ipfilter8.h    Thu Jun 04 16:52:19 2015 -0700
>@@ -965,6 +965,46 @@
> void x265_interp_4tap_vert_ps_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_ps_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_ps_4x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>+void x265_filterPixelToShort_2x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_2x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_2x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x2_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_6x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_6x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x2_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x6_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_12x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_24x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_12x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_24x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_48x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> #ifdef X86_64
> void x265_interp_4tap_vert_pp_6x8_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_pp_6x16_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>