[x265] [PATCH] asm: filterPixelToShort 8-bit and 10-bit sse2
chen
chenm003 at 163.com
Fri Jun 5 02:27:33 CEST 2015
The code is correct, but the code size could be reduced by replacing PSRLDQ with MOVHLPS.
At 2015-06-05 07:53:02,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1433461939 25200
># Node ID db3d3229113adc9416ca9fd7e33c279d62125bc0
># Parent fcfba27ecf0b9dac8da123da8cdcac75763496f3
>asm: filterPixelToShort 8-bit and 10-bit sse2
>
>This replaces c code for all of filterPixelToShort for 8 and 10 bit.
>
>64-bit
>
>./test/TestBench --testbench interp | grep p2s
>convert_p2s[ 8x8] 2.82x 397.50 1119.92
>convert_p2s[16x16] 3.04x 1347.50 4102.63
>convert_p2s[32x32] 1.41x 5197.50 7332.50
>convert_p2s[64x64] 1.26x 20588.66 25962.67
>convert_p2s[ 8x4] 2.52x 229.99 580.00
>convert_p2s[ 4x8] 2.22x 279.99 622.38
>convert_p2s[ 16x8] 2.74x 710.00 1944.96
>convert_p2s[ 8x16] 3.09x 730.00 2254.97
>convert_p2s[32x16] 1.43x 2630.20 3774.20
>convert_p2s[16x32] 3.07x 2630.01 8064.97
>convert_p2s[64x32] 1.28x 10307.52 13162.50
>convert_p2s[32x64] 1.40x 10307.60 14382.50
>convert_p2s[16x12] 3.05x 1027.50 3134.97
>convert_p2s[12x16] 2.78x 1115.00 3100.73
>convert_p2s[ 16x4] 2.56x 387.50 990.06
>convert_p2s[ 4x16] 2.43x 490.02 1190.04
>convert_p2s[32x24] 1.42x 3909.22 5532.57
>convert_p2s[24x32] 3.94x 3907.50 15387.65
>convert_p2s[ 32x8] 1.37x 1347.49 1848.57
>convert_p2s[ 8x32] 3.31x 1390.00 4596.10
>convert_p2s[64x48] 1.27x 15455.30 19562.58
>convert_p2s[48x64] 1.30x 15428.06 20132.50
>convert_p2s[64x16] 1.28x 5192.50 6669.05
>convert_p2s[16x64] 3.05x 5197.80 15855.29
>chroma_p2s[ 4x4] 1.89x 177.50 334.95
>chroma_p2s[ 8x8] 2.82x 397.50 1119.83
>chroma_p2s[16x16] 3.05x 1347.51 4105.62
>chroma_p2s[32x32] 1.41x 5187.50 7334.46
>chroma_p2s[ 4x2] 1.67x 115.04 192.48
>chroma_p2s[ 2x4] 1.16x 184.99 214.98
>chroma_p2s[ 8x4] 2.55x 227.50 580.01
>chroma_p2s[ 4x8] 2.24x 277.50 622.42
>chroma_p2s[ 16x8] 2.75x 707.50 1945.02
>chroma_p2s[ 8x16] 2.92x 772.45 2254.91
>chroma_p2s[32x16] 1.44x 2627.50 3772.50
>chroma_p2s[16x32] 3.07x 2627.50 8065.07
>chroma_p2s[ 8x6] 2.74x 307.59 842.44
>chroma_p2s[ 6x8] 1.71x 507.50 870.00
>chroma_p2s[ 8x2] 2.02x 147.50 297.50
>chroma_p2s[ 2x8] 1.12x 307.50 344.96
>chroma_p2s[16x12] 3.05x 1027.50 3134.97
>chroma_p2s[12x16] 2.79x 1112.50 3100.21
>chroma_p2s[ 16x4] 2.56x 387.50 990.19
>chroma_p2s[ 4x16] 2.43x 489.99 1192.44
>chroma_p2s[32x24] 1.42x 3907.50 5533.21
>chroma_p2s[24x32] 3.93x 3923.72 15427.02
>chroma_p2s[ 32x8] 1.37x 1347.50 1850.34
>chroma_p2s[ 8x32] 3.31x 1387.50 4595.20
>chroma_p2s[ 4x8] 2.22x 277.50 617.42
>chroma_p2s[ 8x16] 3.09x 727.50 2247.50
>chroma_p2s[16x32] 3.07x 2627.50 8065.15
>chroma_p2s[32x64] 1.40x 10307.58 14385.50
>chroma_p2s[ 4x4] 1.87x 177.50 332.51
>chroma_p2s[ 2x8] 1.11x 307.50 342.46
>chroma_p2s[ 8x8] 2.80x 397.50 1112.50
>chroma_p2s[ 4x16] 2.43x 489.99 1192.49
>chroma_p2s[16x16] 3.05x 1347.74 4104.98
>chroma_p2s[ 8x32] 3.31x 1387.50 4595.44
>chroma_p2s[32x32] 1.41x 5197.55 7332.50
>chroma_p2s[16x64] 3.05x 5197.60 15855.21
>chroma_p2s[ 8x12] 2.92x 557.50 1627.41
>chroma_p2s[ 6x16] 1.79x 1002.50 1797.47
>chroma_p2s[ 8x4] 2.55x 227.50 580.01
>chroma_p2s[ 2x16] 1.09x 602.49 657.49
>chroma_p2s[16x24] 3.05x 1987.50 6054.97
>chroma_p2s[12x32] 2.81x 2170.00 6095.56
>chroma_p2s[ 16x8] 2.75x 707.50 1944.97
>chroma_p2s[ 4x32] 3.09x 877.50 2707.95
>chroma_p2s[32x48] 1.40x 7757.54 10862.72
>chroma_p2s[24x64] 3.95x 7757.50 30663.70
>chroma_p2s[32x16] 1.44x 2627.50 3773.21
>chroma_p2s[ 8x64] 3.30x 2717.50 8955.97
>chroma_p2s[ 4x4] 1.89x 177.50 334.94
>chroma_p2s[ 8x8] 2.82x 397.50 1119.95
>chroma_p2s[16x16] 3.05x 1347.50 4105.23
>chroma_p2s[32x32] 1.41x 5197.52 7332.50
>chroma_p2s[64x64] 1.25x 20722.45 25962.96
>chroma_p2s[ 8x4] 2.57x 227.50 584.01
>chroma_p2s[ 4x8] 2.23x 277.49 617.44
>chroma_p2s[ 16x8] 2.75x 707.57 1945.83
>chroma_p2s[ 8x16] 3.08x 729.99 2247.50
>chroma_p2s[32x16] 1.44x 2627.50 3772.50
>chroma_p2s[16x32] 3.07x 2627.50 8064.97
>chroma_p2s[64x32] 1.28x 10307.86 13162.50
>chroma_p2s[32x64] 1.40x 10307.68 14385.40
>chroma_p2s[16x12] 3.05x 1027.50 3135.10
>chroma_p2s[12x16] 2.79x 1112.50 3100.94
>chroma_p2s[ 16x4] 2.57x 387.50 994.88
>chroma_p2s[ 4x16] 2.43x 489.99 1192.44
>chroma_p2s[32x24] 1.42x 3907.82 5532.55
>chroma_p2s[24x32] 3.95x 3907.50 15422.51
>chroma_p2s[ 32x8] 1.37x 1347.50 1849.83
>chroma_p2s[ 8x32] 3.31x 1387.50 4594.97
>chroma_p2s[64x48] 1.27x 15458.83 19562.50
>chroma_p2s[48x64] 1.30x 15427.81 20132.50
>chroma_p2s[64x16] 1.28x 5187.50 6662.50
>chroma_p2s[16x64] 3.05x 5197.50 15855.09
>
>32-bit
>
>./test/TestBench --testbench interp | grep p2s
>convert_p2s[ 8x8] 2.55x 484.99 1237.46
>convert_p2s[16x16] 2.89x 1444.98 4174.96
>convert_p2s[32x32] 6.70x 5295.00 35485.83
>convert_p2s[64x64] 6.41x 20695.04 132575.69
>convert_p2s[ 8x4] 2.06x 325.00 669.93
>convert_p2s[ 4x8] 1.93x 360.08 694.95
>convert_p2s[ 16x8] 2.52x 805.00 2032.47
>convert_p2s[ 8x16] 2.83x 820.00 2319.99
>convert_p2s[32x16] 6.55x 2725.00 17855.75
>convert_p2s[16x32] 3.00x 2725.00 8185.23
>convert_p2s[64x32] 6.36x 10405.41 66205.67
>convert_p2s[32x64] 6.82x 10404.99 70948.87
>convert_p2s[16x12] 2.84x 1124.99 3197.47
>convert_p2s[12x16] 2.61x 1207.52 3155.00
>convert_p2s[ 16x4] 2.29x 485.00 1112.42
>convert_p2s[ 4x16] 2.22x 564.99 1252.46
>convert_p2s[32x24] 6.66x 4005.00 26655.58
>convert_p2s[24x32] 6.99x 4005.00 28003.99
>convert_p2s[ 32x8] 6.27x 1445.00 9054.99
>convert_p2s[ 8x32] 3.18x 1475.00 4687.54
>convert_p2s[64x48] 6.40x 15525.12 99348.35
>convert_p2s[48x64] 6.54x 15525.42 101593.84
>convert_p2s[64x16] 6.33x 5285.10 33457.09
>convert_p2s[16x64] 3.01x 5295.02 15944.96
>chroma_p2s[ 4x4] 1.64x 257.49 422.54
>chroma_p2s[ 8x8] 2.55x 484.99 1237.48
>chroma_p2s[16x16] 2.89x 1444.99 4174.94
>chroma_p2s[32x32] 6.70x 5295.02 35485.08
>chroma_p2s[ 4x2] 1.33x 212.48 282.52
>chroma_p2s[ 2x4] 0.98x 272.49 267.50
>chroma_p2s[ 8x4] 2.06x 325.00 669.99
>chroma_p2s[ 4x8] 1.93x 360.00 694.98
>chroma_p2s[ 16x8] 2.52x 805.00 2032.51
>chroma_p2s[ 8x16] 2.83x 820.15 2319.98
>chroma_p2s[32x16] 6.55x 2725.00 17855.50
>chroma_p2s[16x32] 3.00x 2725.00 8184.98
>chroma_p2s[ 8x6] 2.33x 402.50 937.48
>chroma_p2s[ 6x8] 1.71x 585.00 1000.03
>chroma_p2s[ 8x2] 1.62x 242.49 392.50
>chroma_p2s[ 2x8] 1.15x 375.00 432.50
>chroma_p2s[16x12] 2.84x 1124.99 3194.98
>chroma_p2s[12x16] 2.63x 1200.00 3155.18
>chroma_p2s[ 16x4] 2.29x 485.07 1112.51
>chroma_p2s[ 4x16] 2.18x 562.54 1224.98
>chroma_p2s[32x24] 6.66x 4005.01 26660.21
>chroma_p2s[24x32] 6.94x 4005.05 27805.24
>chroma_p2s[ 32x8] 6.27x 1445.00 9054.99
>chroma_p2s[ 8x32] 3.18x 1475.00 4687.52
>chroma_p2s[ 4x8] 2.01x 360.00 722.48
>chroma_p2s[ 8x16] 2.83x 820.00 2319.98
>chroma_p2s[16x32] 3.00x 2725.07 8185.31
>chroma_p2s[32x64] 6.79x 10405.07 70686.27
>chroma_p2s[ 4x4] 1.63x 259.98 422.48
>chroma_p2s[ 2x8] 1.12x 385.00 432.50
>chroma_p2s[ 8x8] 2.55x 485.00 1237.48
>chroma_p2s[ 4x16] 2.18x 562.49 1224.98
>chroma_p2s[16x16] 2.89x 1445.10 4174.98
>chroma_p2s[ 8x32] 3.18x 1475.00 4687.56
>chroma_p2s[32x32] 6.70x 5295.13 35485.12
>chroma_p2s[16x64] 3.01x 5295.00 15945.08
>chroma_p2s[ 8x12] 2.71x 655.09 1774.92
>chroma_p2s[ 6x16] 1.80x 1057.50 1899.96
>chroma_p2s[ 8x4] 2.06x 325.00 669.99
>chroma_p2s[ 2x16] 1.06x 659.99 697.48
>chroma_p2s[16x24] 2.93x 2085.00 6115.20
>chroma_p2s[12x32] 2.72x 2250.00 6125.25
>chroma_p2s[ 16x8] 2.52x 805.03 2032.57
>chroma_p2s[ 4x32] 2.85x 957.54 2725.27
>chroma_p2s[32x48] 6.79x 7855.06 53302.55
>chroma_p2s[24x64] 7.04x 7855.01 55325.50
>chroma_p2s[32x16] 6.55x 2725.00 17855.12
>chroma_p2s[ 8x64] 3.23x 2805.32 9050.32
>chroma_p2s[ 4x4] 1.63x 259.99 422.49
>chroma_p2s[ 8x8] 2.55x 487.49 1245.00
>chroma_p2s[16x16] 2.89x 1445.00 4175.20
>chroma_p2s[32x32] 6.74x 5295.00 35670.43
>chroma_p2s[64x64] 6.40x 20696.68 132486.70
>chroma_p2s[ 8x4] 2.06x 325.00 669.99
>chroma_p2s[ 4x8] 1.93x 360.03 694.99
>chroma_p2s[ 16x8] 2.52x 805.00 2032.49
>chroma_p2s[ 8x16] 2.83x 820.14 2320.16
>chroma_p2s[32x16] 6.55x 2724.98 17855.27
>chroma_p2s[16x32] 3.00x 2725.10 8185.09
>chroma_p2s[64x32] 6.39x 10405.02 66479.45
>chroma_p2s[32x64] 6.81x 10414.99 70945.01
>chroma_p2s[16x12] 2.84x 1125.00 3194.98
>chroma_p2s[12x16] 2.63x 1200.13 3155.00
>chroma_p2s[ 16x4] 2.29x 485.00 1112.48
>chroma_p2s[ 4x16] 2.18x 562.49 1224.98
>chroma_p2s[32x24] 6.66x 4005.17 26655.93
>chroma_p2s[24x32] 6.99x 4005.01 28008.09
>chroma_p2s[ 32x8] 6.27x 1444.99 9060.05
>chroma_p2s[ 8x32] 3.18x 1474.99 4687.52
>chroma_p2s[64x48] 6.40x 15525.01 99400.07
>chroma_p2s[48x64] 6.55x 15525.09 101699.22
>chroma_p2s[64x16] 6.28x 5285.00 33215.04
>chroma_p2s[16x64] 3.01x 5295.16 15944.98
>
>10-bit
>
>./test/TestBench --testbench interp | grep p2s
>convert_p2s[ 8x8] 3.50x 367.50 1284.66
>convert_p2s[16x16] 3.13x 1309.98 4095.49
>convert_p2s[32x32] 3.63x 5150.04 18697.50
>convert_p2s[64x64] 3.52x 21021.88 74018.38
>convert_p2s[ 8x4] 2.86x 217.78 622.48
>convert_p2s[ 4x8] 3.23x 210.00 677.50
>convert_p2s[ 16x8] 3.02x 669.99 2022.45
>convert_p2s[ 8x16] 3.50x 676.67 2365.62
>convert_p2s[32x16] 3.67x 2589.99 9497.50
>convert_p2s[16x32] 3.09x 2591.86 8015.74
>convert_p2s[64x32] 3.67x 10271.36 37680.71
>convert_p2s[32x64] 3.51x 10678.12 37516.15
>convert_p2s[16x12] 3.16x 992.47 3134.97
>convert_p2s[12x16] 3.14x 989.99 3112.45
>convert_p2s[ 16x4] 2.84x 360.55 1022.43
>convert_p2s[ 4x16] 3.79x 350.00 1324.83
>convert_p2s[32x24] 3.64x 3869.98 14097.50
>convert_p2s[24x32] 3.63x 3871.73 14066.42
>convert_p2s[ 32x8] 3.60x 1310.04 4720.07
>convert_p2s[ 8x32] 3.79x 1314.99 4980.69
>convert_p2s[64x48] 3.42x 16163.33 55231.05
>convert_p2s[48x64] 3.52x 15806.54 55712.53
>convert_p2s[64x16] 3.60x 5149.99 18527.55
>convert_p2s[16x64] 3.01x 5500.02 16579.86
>chroma_p2s[ 4x4] 2.53x 137.50 347.44
>chroma_p2s[ 8x8] 3.49x 367.50 1284.14
>chroma_p2s[16x16] 3.13x 1307.50 4095.64
>chroma_p2s[32x32] 3.65x 5147.50 18765.20
>chroma_p2s[ 4x2] 2.31x 87.50 202.38
>chroma_p2s[ 2x4] 1.43x 155.04 222.38
>chroma_p2s[ 8x4] 2.98x 217.50 647.48
>chroma_p2s[ 4x8] 3.27x 207.50 677.53
>chroma_p2s[ 16x8] 2.99x 670.00 1999.98
>chroma_p2s[ 8x16] 3.63x 672.67 2439.46
>chroma_p2s[32x16] 3.67x 2587.50 9497.50
>chroma_p2s[16x32] 3.10x 2587.50 8015.55
>chroma_p2s[ 8x6] 3.18x 297.50 944.99
>chroma_p2s[ 6x8] 2.63x 355.00 935.08
>chroma_p2s[ 8x2] 2.25x 147.54 332.45
>chroma_p2s[ 2x8] 1.60x 237.50 379.99
>chroma_p2s[16x12] 3.18x 987.50 3135.61
>chroma_p2s[12x16] 3.16x 985.00 3113.44
>chroma_p2s[ 16x4] 2.84x 357.50 1014.99
>chroma_p2s[ 4x16] 3.85x 343.75 1324.97
>chroma_p2s[32x24] 3.65x 3867.50 14097.50
>chroma_p2s[24x32] 3.64x 3867.51 14069.03
>chroma_p2s[ 32x8] 3.61x 1307.50 4720.05
>chroma_p2s[ 8x32] 3.79x 1312.50 4980.79
>chroma_p2s[ 4x8] 3.27x 207.50 677.50
>chroma_p2s[ 8x16] 3.54x 672.66 2381.40
>chroma_p2s[16x32] 3.10x 2587.52 8016.40
>chroma_p2s[32x64] 3.53x 10709.11 37781.17
>chroma_p2s[ 4x4] 2.53x 137.50 347.41
>chroma_p2s[ 2x8] 1.60x 237.50 380.00
>chroma_p2s[ 8x8] 3.50x 367.50 1284.84
>chroma_p2s[ 4x16] 3.85x 343.75 1324.97
>chroma_p2s[16x16] 3.13x 1307.50 4095.19
>chroma_p2s[ 8x32] 3.79x 1312.50 4980.70
>chroma_p2s[32x32] 3.63x 5147.53 18697.50
>chroma_p2s[16x64] 3.03x 5473.41 16578.56
>chroma_p2s[ 8x12] 3.64x 518.15 1885.51
>chroma_p2s[ 6x16] 2.84x 673.01 1912.40
>chroma_p2s[ 8x4] 2.86x 217.50 622.61
>chroma_p2s[ 2x16] 1.72x 397.50 684.98
>chroma_p2s[16x24] 3.11x 1947.54 6057.70
>chroma_p2s[12x32] 3.08x 1945.00 5991.46
>chroma_p2s[ 16x8] 2.98x 670.00 1995.10
>chroma_p2s[ 4x32] 4.17x 665.00 2775.29
>chroma_p2s[32x48] 3.50x 8112.28 28426.23
>chroma_p2s[24x64] 3.51x 8104.04 28433.64
>chroma_p2s[32x16] 3.67x 2587.51 9497.50
>chroma_p2s[ 8x64] 3.55x 2953.69 10476.00
>chroma_p2s[ 4x4] 2.53x 137.50 347.44
>chroma_p2s[ 8x8] 3.50x 367.50 1284.63
>chroma_p2s[16x16] 3.13x 1307.50 4095.76
>chroma_p2s[32x32] 3.63x 5147.50 18697.50
>chroma_p2s[64x64] 3.50x 21082.21 73810.80
>chroma_p2s[ 8x4] 2.98x 217.52 647.46
>chroma_p2s[ 4x8] 3.27x 207.50 677.50
>chroma_p2s[ 16x8] 2.98x 670.15 1999.97
>chroma_p2s[ 8x16] 3.52x 672.50 2364.99
>chroma_p2s[32x16] 3.67x 2587.52 9497.50
>chroma_p2s[16x32] 3.10x 2587.69 8015.21
>chroma_p2s[64x32] 3.58x 10267.57 36767.66
>chroma_p2s[32x64] 3.50x 10673.18 37334.83
>chroma_p2s[16x12] 3.18x 987.50 3136.04
>chroma_p2s[12x16] 3.16x 985.04 3112.94
>chroma_p2s[ 16x4] 2.84x 357.60 1015.06
>chroma_p2s[ 4x16] 3.85x 343.74 1324.92
>chroma_p2s[32x24] 3.65x 3867.50 14097.50
>chroma_p2s[24x32] 3.64x 3867.53 14066.04
>chroma_p2s[ 32x8] 3.61x 1307.59 4720.09
>chroma_p2s[ 8x32] 3.80x 1312.50 4981.43
>chroma_p2s[64x48] 3.50x 15895.79 55558.28
>chroma_p2s[48x64] 3.50x 15778.97 55237.76
>chroma_p2s[64x16] 3.60x 5147.58 18530.57
>chroma_p2s[16x64] 3.03x 5471.98 16578.79
>
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/asm-primitives.cpp Thu Jun 04 16:52:19 2015 -0700
>@@ -934,6 +934,11 @@
> LUMA_VSS_FILTERS(sse2);
>
> p.frameInitLowres = x265_frame_init_lowres_core_sse2;
>+
>+ ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
>+ ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
>+ ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
>+ ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
> }
> if (cpuMask & X265_CPU_SSE3)
> {
>@@ -1860,6 +1865,10 @@
> p.idst4x4 = x265_idst4_sse2;
>
> p.planecopy_sp = x265_downShift_16_sse2;
>+ ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
>+ ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
>+ ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
>+ ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
> }
> if (cpuMask & X265_CPU_SSE3)
> {
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/const-a.asm Thu Jun 04 16:52:19 2015 -0700
>@@ -80,6 +80,7 @@
> const pw_1023, times 16 dw 1023
> const pw_1024, times 16 dw 1024
> const pw_4096, times 16 dw 4096
>+const pw_8192, times 8 dw 8192
> const pw_00ff, times 16 dw 0x00ff
> const pw_ff00, times 8 dw 0xff00
> const pw_2000, times 16 dw 0x2000
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/ipfilter16.asm
>--- a/source/common/x86/ipfilter16.asm Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/ipfilter16.asm Thu Jun 04 16:52:19 2015 -0700
>@@ -953,6 +953,171 @@
> FILTER_HOR_CHROMA_sse3 64, 48, ps
> FILTER_HOR_CHROMA_sse3 64, 64, ps
>
>+%macro FILTER_P2S_2_4_sse2 1
>+ movd m0, [r0 + %1]
>+ movd m2, [r0 + r1 * 2 + %1]
>+ movhps m0, [r0 + r1 + %1]
>+ movhps m2, [r0 + r4 + %1]
>+ psllw m0, 4
>+ psllw m2, 4
>+ psubw m0, m1
>+ psubw m2, m1
>+
>+ movd [r2 + r3 * 0 + %1], m0
>+ movd [r2 + r3 * 2 + %1], m2
>+ psrldq m0, 8
>+ psrldq m2, 8
>+ movd [r2 + r3 * 1 + %1], m0
>+ movd [r2 + r5 + %1], m2
>+%endmacro
>+
>+%macro FILTER_P2S_4_4_sse2 1
>+ movh m0, [r0 + %1]
>+ movhps m0, [r0 + r1 + %1]
>+ psllw m0, 4
>+ psubw m0, m1
>+ movh [r2 + r3 * 0 + %1], m0
>+ movhps [r2 + r3 * 1 + %1], m0
>+
>+ movh m2, [r0 + r1 * 2 + %1]
>+ movhps m2, [r0 + r4 + %1]
>+ psllw m2, 4
>+ psubw m2, m1
>+ movh [r2 + r3 * 2 + %1], m2
>+ movhps [r2 + r5 + %1], m2
>+%endmacro
>+
>+%macro FILTER_P2S_4_2_sse2 0
>+ movh m0, [r0]
>+ movhps m0, [r0 + r1 * 2]
>+ psllw m0, 4
>+ psubw m0, [pw_2000]
>+ movh [r2 + r3 * 0], m0
>+ movhps [r2 + r3 * 2], m0
>+%endmacro
>+
>+%macro FILTER_P2S_8_4_sse2 1
>+ movu m0, [r0 + %1]
>+ movu m2, [r0 + r1 + %1]
>+ psllw m0, 4
>+ psllw m2, 4
>+ psubw m0, m1
>+ psubw m2, m1
>+ movu [r2 + r3 * 0 + %1], m0
>+ movu [r2 + r3 * 1 + %1], m2
>+
>+ movu m3, [r0 + r1 * 2 + %1]
>+ movu m4, [r0 + r4 + %1]
>+ psllw m3, 4
>+ psllw m4, 4
>+ psubw m3, m1
>+ psubw m4, m1
>+ movu [r2 + r3 * 2 + %1], m3
>+ movu [r2 + r5 + %1], m4
>+%endmacro
>+
>+%macro FILTER_P2S_8_2_sse2 1
>+ movu m0, [r0 + %1]
>+ movu m2, [r0 + r1 + %1]
>+ psllw m0, 4
>+ psllw m2, 4
>+ psubw m0, m1
>+ psubw m2, m1
>+ movu [r2 + r3 * 0 + %1], m0
>+ movu [r2 + r3 * 1 + %1], m2
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_PIX_TO_SHORT_sse2 2
>+INIT_XMM sse2
>+cglobal filterPixelToShort_%1x%2, 4, 6, 3
>+%if %2 == 2
>+%if %1 == 4
>+ FILTER_P2S_4_2_sse2
>+%elif %1 == 8
>+ add r1d, r1d
>+ add r3d, r3d
>+ mova m1, [pw_2000]
>+ FILTER_P2S_8_2_sse2 0
>+%endif
>+%else
>+ add r1d, r1d
>+ add r3d, r3d
>+ mova m1, [pw_2000]
>+ lea r4, [r1 * 3]
>+ lea r5, [r3 * 3]
>+%assign y 1
>+%rep %2/4
>+%assign x 0
>+%rep %1/8
>+ FILTER_P2S_8_4_sse2 x
>+%if %2 == 6
>+ lea r0, [r0 + 4 * r1]
>+ lea r2, [r2 + 4 * r3]
>+ FILTER_P2S_8_2_sse2 x
>+%endif
>+%assign x x+16
>+%endrep
>+%rep (%1 % 8)/4
>+ FILTER_P2S_4_4_sse2 x
>+%assign x x+8
>+%endrep
>+%rep (%1 % 4)/2
>+ FILTER_P2S_2_4_sse2 x
>+%endrep
>+%if y < %2/4
>+ lea r0, [r0 + 4 * r1]
>+ lea r2, [r2 + 4 * r3]
>+%assign y y+1
>+%endif
>+%endrep
>+%endif
>+RET
>+%endmacro
>+
>+ FILTER_PIX_TO_SHORT_sse2 2, 4
>+ FILTER_PIX_TO_SHORT_sse2 2, 8
>+ FILTER_PIX_TO_SHORT_sse2 2, 16
>+ FILTER_PIX_TO_SHORT_sse2 4, 2
>+ FILTER_PIX_TO_SHORT_sse2 4, 4
>+ FILTER_PIX_TO_SHORT_sse2 4, 8
>+ FILTER_PIX_TO_SHORT_sse2 4, 16
>+ FILTER_PIX_TO_SHORT_sse2 4, 32
>+ FILTER_PIX_TO_SHORT_sse2 6, 8
>+ FILTER_PIX_TO_SHORT_sse2 6, 16
>+ FILTER_PIX_TO_SHORT_sse2 8, 2
>+ FILTER_PIX_TO_SHORT_sse2 8, 4
>+ FILTER_PIX_TO_SHORT_sse2 8, 6
>+ FILTER_PIX_TO_SHORT_sse2 8, 8
>+ FILTER_PIX_TO_SHORT_sse2 8, 12
>+ FILTER_PIX_TO_SHORT_sse2 8, 16
>+ FILTER_PIX_TO_SHORT_sse2 8, 32
>+ FILTER_PIX_TO_SHORT_sse2 8, 64
>+ FILTER_PIX_TO_SHORT_sse2 12, 16
>+ FILTER_PIX_TO_SHORT_sse2 12, 32
>+ FILTER_PIX_TO_SHORT_sse2 16, 4
>+ FILTER_PIX_TO_SHORT_sse2 16, 8
>+ FILTER_PIX_TO_SHORT_sse2 16, 12
>+ FILTER_PIX_TO_SHORT_sse2 16, 16
>+ FILTER_PIX_TO_SHORT_sse2 16, 24
>+ FILTER_PIX_TO_SHORT_sse2 16, 32
>+ FILTER_PIX_TO_SHORT_sse2 16, 64
>+ FILTER_PIX_TO_SHORT_sse2 24, 32
>+ FILTER_PIX_TO_SHORT_sse2 24, 64
>+ FILTER_PIX_TO_SHORT_sse2 32, 8
>+ FILTER_PIX_TO_SHORT_sse2 32, 16
>+ FILTER_PIX_TO_SHORT_sse2 32, 24
>+ FILTER_PIX_TO_SHORT_sse2 32, 32
>+ FILTER_PIX_TO_SHORT_sse2 32, 48
>+ FILTER_PIX_TO_SHORT_sse2 32, 64
>+ FILTER_PIX_TO_SHORT_sse2 48, 64
>+ FILTER_PIX_TO_SHORT_sse2 64, 16
>+ FILTER_PIX_TO_SHORT_sse2 64, 32
>+ FILTER_PIX_TO_SHORT_sse2 64, 48
>+ FILTER_PIX_TO_SHORT_sse2 64, 64
>+
> ;------------------------------------------------------------------------------------------------------------
> ; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;------------------------------------------------------------------------------------------------------------
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/ipfilter8.asm Thu Jun 04 16:52:19 2015 -0700
>@@ -301,6 +301,7 @@
> cextern pw_32
> cextern pw_512
> cextern pw_2000
>+cextern pw_8192
>
> %macro FILTER_H4_w2_2_sse2 0
> pxor m3, m3
>@@ -3283,6 +3284,183 @@
> FILTER_V4_W16n_H2_sse2 ps, 64, 16
> %endif
>
>+%macro FILTER_P2S_2_4_sse2 1
>+ movd m2, [r0 + %1]
>+ movd m3, [r0 + r1 + %1]
>+ punpcklwd m2, m3
>+ movd m3, [r0 + r1 * 2 + %1]
>+ movd m4, [r0 + r4 + %1]
>+ punpcklwd m3, m4
>+ punpckldq m2, m3
>+ punpcklbw m2, m0
>+ psllw m2, 6
>+ psubw m2, m1
>+
>+ movd [r2 + r3 * 0 + %1 * 2], m2
>+ psrldq m2, 4
>+ movd [r2 + r3 * 1 + %1 * 2], m2
>+ psrldq m2, 4
>+ movd [r2 + r3 * 2 + %1 * 2], m2
>+ psrldq m2, 4
>+ movd [r2 + r5 + %1 * 2], m2
>+%endmacro
>+
>+%macro FILTER_P2S_4_4_sse2 1
>+ movd m2, [r0 + %1]
>+ movd m3, [r0 + r1 + %1]
>+ movd m4, [r0 + r1 * 2 + %1]
>+ movd m5, [r0 + r4 + %1]
>+ punpckldq m2, m3
>+ punpcklbw m2, m0
>+ punpckldq m4, m5
>+ punpcklbw m4, m0
>+ psllw m2, 6
>+ psllw m4, 6
>+ psubw m2, m1
>+ psubw m4, m1
>+ movh [r2 + r3 * 0 + %1 * 2], m2
>+ movh [r2 + r3 * 2 + %1 * 2], m4
>+ movhps [r2 + r3 * 1 + %1 * 2], m2
>+ movhps [r2 + r5 + %1 * 2], m4
>+%endmacro
>+
>+%macro FILTER_P2S_4_2_sse2 0
>+ movd m2, [r0]
>+ movd m3, [r0 + r1]
>+ punpckldq m2, m3
>+ punpcklbw m2, m0
>+ psllw m2, 6
>+ psubw m2, [pw_8192]
>+ movh [r2], m2
>+ movhps [r2 + r3 * 2], m2
>+%endmacro
>+
>+%macro FILTER_P2S_8_4_sse2 1
>+ movh m2, [r0 + %1]
>+ movh m3, [r0 + r1 + %1]
>+ movh m4, [r0 + r1 * 2 + %1]
>+ movh m5, [r0 + r4 + %1]
>+ punpcklbw m2, m0
>+ punpcklbw m3, m0
>+ punpcklbw m5, m0
>+ punpcklbw m4, m0
>+ psllw m2, 6
>+ psllw m3, 6
>+ psllw m5, 6
>+ psllw m4, 6
>+ psubw m2, m1
>+ psubw m3, m1
>+ psubw m4, m1
>+ psubw m5, m1
>+ movu [r2 + r3 * 0 + %1 * 2], m2
>+ movu [r2 + r3 * 1 + %1 * 2], m3
>+ movu [r2 + r3 * 2 + %1 * 2], m4
>+ movu [r2 + r5 + %1 * 2], m5
>+%endmacro
>+
>+%macro FILTER_P2S_8_2_sse2 1
>+ movh m2, [r0 + %1]
>+ movh m3, [r0 + r1 + %1]
>+ punpcklbw m2, m0
>+ punpcklbw m3, m0
>+ psllw m2, 6
>+ psllw m3, 6
>+ psubw m2, m1
>+ psubw m3, m1
>+ movu [r2 + r3 * 0 + %1 * 2], m2
>+ movu [r2 + r3 * 1 + %1 * 2], m3
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_PIX_TO_SHORT_sse2 2
>+INIT_XMM sse2
>+cglobal filterPixelToShort_%1x%2, 4, 6, 6
>+ pxor m0, m0
>+%if %2 == 2
>+%if %1 == 4
>+ FILTER_P2S_4_2_sse2
>+%elif %1 == 8
>+ add r3d, r3d
>+ mova m1, [pw_8192]
>+ FILTER_P2S_8_2_sse2 0
>+%endif
>+%else
>+ add r3d, r3d
>+ mova m1, [pw_8192]
>+ lea r4, [r1 * 3]
>+ lea r5, [r3 * 3]
>+%assign y 1
>+%rep %2/4
>+%assign x 0
>+%rep %1/8
>+ FILTER_P2S_8_4_sse2 x
>+%if %2 == 6
>+ lea r0, [r0 + 4 * r1]
>+ lea r2, [r2 + 4 * r3]
>+ FILTER_P2S_8_2_sse2 x
>+%endif
>+%assign x x+8
>+%endrep
>+%rep (%1 % 8)/4
>+ FILTER_P2S_4_4_sse2 x
>+%assign x x+4
>+%endrep
>+%rep (%1 % 4)/2
>+ FILTER_P2S_2_4_sse2 x
>+%endrep
>+%if y < %2/4
>+ lea r0, [r0 + 4 * r1]
>+ lea r2, [r2 + 4 * r3]
>+%assign y y+1
>+%endif
>+%endrep
>+%endif
>+RET
>+%endmacro
>+
>+ FILTER_PIX_TO_SHORT_sse2 2, 4
>+ FILTER_PIX_TO_SHORT_sse2 2, 8
>+ FILTER_PIX_TO_SHORT_sse2 2, 16
>+ FILTER_PIX_TO_SHORT_sse2 4, 2
>+ FILTER_PIX_TO_SHORT_sse2 4, 4
>+ FILTER_PIX_TO_SHORT_sse2 4, 8
>+ FILTER_PIX_TO_SHORT_sse2 4, 16
>+ FILTER_PIX_TO_SHORT_sse2 4, 32
>+ FILTER_PIX_TO_SHORT_sse2 6, 8
>+ FILTER_PIX_TO_SHORT_sse2 6, 16
>+ FILTER_PIX_TO_SHORT_sse2 8, 2
>+ FILTER_PIX_TO_SHORT_sse2 8, 4
>+ FILTER_PIX_TO_SHORT_sse2 8, 6
>+ FILTER_PIX_TO_SHORT_sse2 8, 8
>+ FILTER_PIX_TO_SHORT_sse2 8, 12
>+ FILTER_PIX_TO_SHORT_sse2 8, 16
>+ FILTER_PIX_TO_SHORT_sse2 8, 32
>+ FILTER_PIX_TO_SHORT_sse2 8, 64
>+ FILTER_PIX_TO_SHORT_sse2 12, 16
>+ FILTER_PIX_TO_SHORT_sse2 12, 32
>+ FILTER_PIX_TO_SHORT_sse2 16, 4
>+ FILTER_PIX_TO_SHORT_sse2 16, 8
>+ FILTER_PIX_TO_SHORT_sse2 16, 12
>+ FILTER_PIX_TO_SHORT_sse2 16, 16
>+ FILTER_PIX_TO_SHORT_sse2 16, 24
>+ FILTER_PIX_TO_SHORT_sse2 16, 32
>+ FILTER_PIX_TO_SHORT_sse2 16, 64
>+ FILTER_PIX_TO_SHORT_sse2 24, 32
>+ FILTER_PIX_TO_SHORT_sse2 24, 64
>+ FILTER_PIX_TO_SHORT_sse2 32, 8
>+ FILTER_PIX_TO_SHORT_sse2 32, 16
>+ FILTER_PIX_TO_SHORT_sse2 32, 24
>+ FILTER_PIX_TO_SHORT_sse2 32, 32
>+ FILTER_PIX_TO_SHORT_sse2 32, 48
>+ FILTER_PIX_TO_SHORT_sse2 32, 64
>+ FILTER_PIX_TO_SHORT_sse2 48, 64
>+ FILTER_PIX_TO_SHORT_sse2 64, 16
>+ FILTER_PIX_TO_SHORT_sse2 64, 32
>+ FILTER_PIX_TO_SHORT_sse2 64, 48
>+ FILTER_PIX_TO_SHORT_sse2 64, 64
>+
> %macro FILTER_H4_w2_2 3
> movh %2, [srcq - 1]
> pshufb %2, %2, Tm0
>@@ -3299,6 +3477,7 @@
> mov [dstq + dststrideq], r4w
> %endmacro
>
>+
> ;-----------------------------------------------------------------------------
> ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;-----------------------------------------------------------------------------
>diff -r fcfba27ecf0b -r db3d3229113a source/common/x86/ipfilter8.h
>--- a/source/common/x86/ipfilter8.h Tue Jun 02 08:59:07 2015 -0700
>+++ b/source/common/x86/ipfilter8.h Thu Jun 04 16:52:19 2015 -0700
>@@ -965,6 +965,46 @@
> void x265_interp_4tap_vert_ps_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_ps_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_ps_4x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>+void x265_filterPixelToShort_2x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_2x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_2x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x2_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_4x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_6x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_6x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x2_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x6_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_8x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_12x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_16x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_24x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_32x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_12x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_24x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
>+void x265_filterPixelToShort_48x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> #ifdef X86_64
> void x265_interp_4tap_vert_pp_6x8_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> void x265_interp_4tap_vert_pp_6x16_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150605/28eb1dd5/attachment-0001.html>
More information about the x265-devel
mailing list