[x265-commits] [x265] asm: avx2 code for sad_x3[48x64] for 10 bpp (25339 -> 11730)

Sumalatha at videolan.org Sumalatha at videolan.org
Tue May 19 23:56:25 CEST 2015


details:   http://hg.videolan.org/x265/rev/fd1f061f2229
branches:  
changeset: 10474:fd1f061f2229
user:      Sumalatha Polureddy
date:      Tue May 19 10:40:00 2015 +0530
description:
asm: avx2 code for sad_x3[48x64] for 10 bpp (25339 -> 11730)

sse2
sad_x3[48x64]  2.68x    25339.15        67836.94

avx2
sad_x3[48x64]  5.80x    11730.63        68061.84
Subject: [x265] asm: avx2 code for psyCost_pp 8x8, 16x16, 32x32 & 64x64, improved by more than 40% over the previous asm

details:   http://hg.videolan.org/x265/rev/54b139ef81b6
branches:  
changeset: 10475:54b139ef81b6
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Mon May 18 18:03:19 2015 +0530
description:
asm: avx2 code for psyCost_pp 8x8, 16x16, 32x32 & 64x64, improved by more than 40% over the previous asm
Subject: [x265] asm: avx2 code for sad_x4[16xN] for 10 bpp

details:   http://hg.videolan.org/x265/rev/e44cc3084a48
branches:  
changeset: 10476:e44cc3084a48
user:      Sumalatha Polureddy
date:      Tue May 19 12:31:11 2015 +0530
description:
asm: avx2 code for sad_x4[16xN] for 10 bpp

sse2
sad_x4[ 16x4]  2.80x    976.64          2730.64
sad_x4[ 16x8]  2.97x    1718.50         5111.16
sad_x4[16x12]  3.04x    2475.38         7525.02
sad_x4[16x16]  3.09x    3122.67         9651.31
sad_x4[16x32]  2.83x    6974.52         19741.04
sad_x4[16x64]  3.07x    12935.32        39669.09

avx2
sad_x4[ 16x4]  4.93x    518.46          2555.28
sad_x4[ 16x8]  5.91x    852.26          5038.35
sad_x4[16x12]  6.30x    1185.09         7470.80
sad_x4[16x16]  6.27x    1533.31         9617.03
sad_x4[16x32]  5.82x    3501.26         20373.02
sad_x4[16x64]  6.60x    6106.51         40281.86
Subject: [x265] asm: avx2 code for sad_x4[32xN] for 10 bpp

details:   http://hg.videolan.org/x265/rev/59e1a07c3371
branches:  
changeset: 10477:59e1a07c3371
user:      Sumalatha Polureddy
date:      Tue May 19 12:54:14 2015 +0530
description:
asm: avx2 code for sad_x4[32xN] for 10 bpp

sse2
sad_x4[ 32x8]  2.77x    3007.23         8338.03
sad_x4[32x16]  2.92x    5716.42         16685.54
sad_x4[32x24]  2.66x    9305.07         24790.30
sad_x4[32x32]  2.68x    12034.66        32295.28
sad_x4[32x64]  2.73x    23399.44        63928.18

avx2
sad_x4[ 32x8]  5.82x    1441.76         8388.88
sad_x4[32x16]  6.23x    2692.36         16771.69
sad_x4[32x24]  5.29x    4622.78         24452.15
sad_x4[32x32]  6.14x    5767.10         35437.46
sad_x4[32x64]  5.65x    11114.55        62818.39
Subject: [x265] asm: avx2 10bit code for luma_hpp[8xN]

details:   http://hg.videolan.org/x265/rev/eeb404be221e
branches:  
changeset: 10478:eeb404be221e
user:      Rajesh Paulraj<rajesh at multicorewareinc.com>
date:      Tue May 19 12:24:04 2015 +0530
description:
asm: avx2 10bit code for luma_hpp[8xN]

avx2:
luma_hpp[  8x4]         7.30x    507.64          3706.06
luma_hpp[  8x8]         7.64x    982.30          7503.45
luma_hpp[ 8x16]         7.78x    1898.72         14779.64
luma_hpp[ 8x32]         7.93x    3778.05         29954.26

sse4:
luma_hpp[  8x4]         4.34x    877.69          3806.35
luma_hpp[  8x8]         4.45x    1702.32         7569.03
luma_hpp[ 8x16]         4.44x    3335.36         14812.65
luma_hpp[ 8x32]         4.39x    6785.18         29815.67
Subject: [x265] asm: avx2 10bit code for luma_hpp[16xN]

details:   http://hg.videolan.org/x265/rev/4b1a9f5a3730
branches:  
changeset: 10479:4b1a9f5a3730
user:      Rajesh Paulraj<rajesh at multicorewareinc.com>
date:      Tue May 19 12:37:47 2015 +0530
description:
asm: avx2 10bit code for luma_hpp[16xN]

avx2:
luma_hpp[ 16x4]         7.81x    955.42          7466.10
luma_hpp[ 16x8]         8.23x    1847.69         15211.04
luma_hpp[16x12]         8.35x    2747.88         22941.08
luma_hpp[16x16]         8.64x    3651.71         31536.71
luma_hpp[16x32]         8.17x    7369.20         60188.46
luma_hpp[16x64]         8.49x    14626.30        124106.34

sse4:
luma_hpp[ 16x4]         4.40x    1700.57         7484.82
luma_hpp[ 16x8]         4.60x    3332.67         15319.88
luma_hpp[16x12]         5.14x    4922.31         25296.77
luma_hpp[16x16]         4.56x    6548.32         29836.87
luma_hpp[16x32]         4.65x    12974.12        60339.52
luma_hpp[16x64]         5.03x    25374.62        127527.44
Subject: [x265] asm: avx2 10bit code for luma_hpp[32xN],[64xN]

details:   http://hg.videolan.org/x265/rev/1e5d63d5804a
branches:  
changeset: 10480:1e5d63d5804a
user:      Rajesh Paulraj<rajesh at multicorewareinc.com>
date:      Tue May 19 12:55:49 2015 +0530
description:
asm: avx2 10bit code for luma_hpp[32xN],[64xN]

avx2:
luma_hpp[ 32x8]         8.32x    3627.74         30170.83
luma_hpp[32x16]         8.13x    7919.31         64368.49
luma_hpp[32x32]         7.63x    15286.96        116569.80
luma_hpp[32x24]         7.52x    11697.78        88000.76
luma_hpp[32x64]         7.72x    30244.69        233432.70
luma_hpp[64x16]         7.50x    15080.80        113146.33
luma_hpp[64x32]         8.18x    30264.35        247695.75
luma_hpp[64x48]         7.98x    45546.29        363685.47
luma_hpp[64x64]         8.20x    59435.71        487448.84

sse4:
luma_hpp[ 32x8]         4.99x    6520.85         32525.34
luma_hpp[32x16]         4.94x    13125.30        64830.36
luma_hpp[32x24]         4.61x    19555.56        90103.10
luma_hpp[32x32]         4.53x    25775.25        116649.78
luma_hpp[32x64]         4.59x    52294.73        240239.45
luma_hpp[64x16]         4.48x    26202.95        117409.19
luma_hpp[64x32]         4.53x    51285.01        232277.02
luma_hpp[64x48]         4.30x    80563.75        346052.34
luma_hpp[64x64]         4.58x    106132.23       486381.03
Subject: [x265] asm: avx2 10bit code for luma_hpp[12x16] (5154.47 -> 3632.88)

details:   http://hg.videolan.org/x265/rev/6485be5c9da0
branches:  
changeset: 10481:6485be5c9da0
user:      Rajesh Paulraj<rajesh at multicorewareinc.com>
date:      Tue May 19 13:10:20 2015 +0530
description:
asm: avx2 10bit code for luma_hpp[12x16] (5154.47 -> 3632.88)
Subject: [x265] asm: avx2 10bit code for luma_hpp[24x32] (18855.08 -> 10742.66)

details:   http://hg.videolan.org/x265/rev/3ef399262e9a
branches:  
changeset: 10482:3ef399262e9a
user:      Rajesh Paulraj<rajesh at multicorewareinc.com>
date:      Tue May 19 14:11:35 2015 +0530
description:
asm: avx2 10bit code for luma_hpp[24x32] (18855.08 -> 10742.66)
Subject: [x265] asm: avx2 10bit code for luma_hpp[48x64] (82440.47 -> 44731.61)

details:   http://hg.videolan.org/x265/rev/ef50ef0b7ce8
branches:  
changeset: 10483:ef50ef0b7ce8
user:      Rajesh Paulraj<rajesh at multicorewareinc.com>
date:      Tue May 19 13:59:32 2015 +0530
description:
asm: avx2 10bit code for luma_hpp[48x64] (82440.47 -> 44731.61)
Subject: [x265] asm: avx2 code for sad_x4[64xN] for 10 bpp

details:   http://hg.videolan.org/x265/rev/9e653d60121f
branches:  
changeset: 10484:9e653d60121f
user:      Sumalatha Polureddy
date:      Tue May 19 14:30:03 2015 +0530
description:
asm: avx2 code for sad_x4[64xN] for 10 bpp

sse2
sad_x4[64x16]  2.65x    11016.03        29192.78
sad_x4[64x32]  2.52x    22817.53        57604.71
sad_x4[64x48]  2.68x    32658.78        87513.64
sad_x4[64x64]  2.73x    47608.05        129783.16

avx2
sad_x4[64x16]  6.23x    4761.58         29662.72
sad_x4[64x32]  5.06x    11754.09        59433.72
sad_x4[64x48]  5.69x    15122.64        86068.27
sad_x4[64x64]  5.76x    20597.21        118573.05
Subject: [x265] asm: removed duplicate and redundant constants

details:   http://hg.videolan.org/x265/rev/7732b40d02f8
branches:  
changeset: 10485:7732b40d02f8
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Tue May 19 17:47:36 2015 +0530
description:
asm: removed duplicate and redundant constants
Subject: [x265] search: add lowres MV into search MV candidate list for search ME

details:   http://hg.videolan.org/x265/rev/58309953273e
branches:  
changeset: 10486:58309953273e
user:      Gopu Govindaswamy <gopu at multicorewareinc.com>
date:      Tue May 19 17:04:04 2015 +0530
description:
search: add lowres MV into search MV candidate list for search ME

diffstat:

 source/common/x86/asm-primitives.cpp |   46 ++-
 source/common/x86/const-a.asm        |    9 +-
 source/common/x86/ipfilter16.asm     |  634 +++++++++++++++++++++++++++++++++++
 source/common/x86/mc-a.asm           |    1 -
 source/common/x86/pixel-a.asm        |  245 ++++++++++++-
 source/common/x86/pixel-util8.asm    |    6 +-
 source/common/x86/sad16-a.asm        |   16 +-
 source/encoder/search.cpp            |   41 ++-
 source/encoder/search.h              |    2 +
 9 files changed, 969 insertions(+), 31 deletions(-)

diffs (truncated from 1305 to 300 lines):

diff -r d7b100e51e82 -r 58309953273e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue May 19 17:04:04 2015 +0530
@@ -1226,9 +1226,11 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
         p.cu[BLOCK_32x32].psy_cost_ss = x265_psyCost_ss_32x32_avx2;
         p.cu[BLOCK_64x64].psy_cost_ss = x265_psyCost_ss_64x64_avx2;
-
         p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_avx2;
-
+        p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_avx2;
+        p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_avx2;
+        p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_avx2;
+        p.cu[BLOCK_64x64].psy_cost_pp = x265_psyCost_pp_64x64_avx2;
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
 
@@ -1340,11 +1342,28 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_32x24].sad_x3 = x265_pixel_sad_x3_32x24_avx2;
         p.pu[LUMA_32x32].sad_x3 = x265_pixel_sad_x3_32x32_avx2;
         p.pu[LUMA_32x64].sad_x3 = x265_pixel_sad_x3_32x64_avx2;
+        p.pu[LUMA_48x64].sad_x3 = x265_pixel_sad_x3_48x64_avx2;
         p.pu[LUMA_64x16].sad_x3 = x265_pixel_sad_x3_64x16_avx2;
         p.pu[LUMA_64x32].sad_x3 = x265_pixel_sad_x3_64x32_avx2;
         p.pu[LUMA_64x48].sad_x3 = x265_pixel_sad_x3_64x48_avx2;
         p.pu[LUMA_64x64].sad_x3 = x265_pixel_sad_x3_64x64_avx2;
 
+        p.pu[LUMA_16x4].sad_x4 = x265_pixel_sad_x4_16x4_avx2;
+        p.pu[LUMA_16x8].sad_x4 = x265_pixel_sad_x4_16x8_avx2;
+        p.pu[LUMA_16x12].sad_x4 = x265_pixel_sad_x4_16x12_avx2;
+        p.pu[LUMA_16x16].sad_x4 = x265_pixel_sad_x4_16x16_avx2;
+        p.pu[LUMA_16x32].sad_x4 = x265_pixel_sad_x4_16x32_avx2;
+        p.pu[LUMA_16x64].sad_x4 = x265_pixel_sad_x4_16x64_avx2;
+        p.pu[LUMA_32x8].sad_x4 = x265_pixel_sad_x4_32x8_avx2;
+        p.pu[LUMA_32x16].sad_x4 = x265_pixel_sad_x4_32x16_avx2;
+        p.pu[LUMA_32x24].sad_x4 = x265_pixel_sad_x4_32x24_avx2;
+        p.pu[LUMA_32x32].sad_x4 = x265_pixel_sad_x4_32x32_avx2;
+        p.pu[LUMA_32x64].sad_x4 = x265_pixel_sad_x4_32x64_avx2;
+        p.pu[LUMA_64x16].sad_x4 = x265_pixel_sad_x4_64x16_avx2;
+        p.pu[LUMA_64x32].sad_x4 = x265_pixel_sad_x4_64x32_avx2;
+        p.pu[LUMA_64x48].sad_x4 = x265_pixel_sad_x4_64x48_avx2;
+        p.pu[LUMA_64x64].sad_x4 = x265_pixel_sad_x4_64x64_avx2;
+
         p.pu[LUMA_16x4].convert_p2s = x265_filterPixelToShort_16x4_avx2;
         p.pu[LUMA_16x8].convert_p2s = x265_filterPixelToShort_16x8_avx2;
         p.pu[LUMA_16x12].convert_p2s = x265_filterPixelToShort_16x12_avx2;
@@ -1388,6 +1407,29 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_4x8].luma_hps = x265_interp_8tap_horiz_ps_4x8_avx2;
         p.pu[LUMA_4x16].luma_hps = x265_interp_8tap_horiz_ps_4x16_avx2;
 
+        p.pu[LUMA_8x4].luma_hpp = x265_interp_8tap_horiz_pp_8x4_avx2;
+        p.pu[LUMA_8x8].luma_hpp = x265_interp_8tap_horiz_pp_8x8_avx2;
+        p.pu[LUMA_8x16].luma_hpp = x265_interp_8tap_horiz_pp_8x16_avx2;
+        p.pu[LUMA_8x32].luma_hpp = x265_interp_8tap_horiz_pp_8x32_avx2;
+        p.pu[LUMA_16x4].luma_hpp = x265_interp_8tap_horiz_pp_16x4_avx2;
+        p.pu[LUMA_16x8].luma_hpp = x265_interp_8tap_horiz_pp_16x8_avx2;
+        p.pu[LUMA_16x12].luma_hpp = x265_interp_8tap_horiz_pp_16x12_avx2;
+        p.pu[LUMA_16x16].luma_hpp = x265_interp_8tap_horiz_pp_16x16_avx2;
+        p.pu[LUMA_16x32].luma_hpp = x265_interp_8tap_horiz_pp_16x32_avx2;
+        p.pu[LUMA_16x64].luma_hpp = x265_interp_8tap_horiz_pp_16x64_avx2;
+        p.pu[LUMA_32x8].luma_hpp = x265_interp_8tap_horiz_pp_32x8_avx2;
+        p.pu[LUMA_32x16].luma_hpp = x265_interp_8tap_horiz_pp_32x16_avx2;
+        p.pu[LUMA_32x24].luma_hpp = x265_interp_8tap_horiz_pp_32x24_avx2;
+        p.pu[LUMA_32x32].luma_hpp = x265_interp_8tap_horiz_pp_32x32_avx2;
+        p.pu[LUMA_32x64].luma_hpp = x265_interp_8tap_horiz_pp_32x64_avx2;
+        p.pu[LUMA_64x16].luma_hpp = x265_interp_8tap_horiz_pp_64x16_avx2;
+        p.pu[LUMA_64x32].luma_hpp = x265_interp_8tap_horiz_pp_64x32_avx2;
+        p.pu[LUMA_64x48].luma_hpp = x265_interp_8tap_horiz_pp_64x48_avx2;
+        p.pu[LUMA_64x64].luma_hpp = x265_interp_8tap_horiz_pp_64x64_avx2;
+        p.pu[LUMA_12x16].luma_hpp = x265_interp_8tap_horiz_pp_12x16_avx2;
+        p.pu[LUMA_24x32].luma_hpp = x265_interp_8tap_horiz_pp_24x32_avx2;
+        p.pu[LUMA_48x64].luma_hpp = x265_interp_8tap_horiz_pp_48x64_avx2;
+
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = x265_scanPosLast_avx2_bmi2;
     }
diff -r d7b100e51e82 -r 58309953273e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/const-a.asm	Tue May 19 17:04:04 2015 +0530
@@ -90,7 +90,7 @@ const pw_pixel_max,         times 16 dw 
 const pw_0_15,              times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
 const pw_ppppmmmm,          times  1 dw   1,   1,   1,   1,  -1,  -1,  -1,  -1
 const pw_ppmmppmm,          times  1 dw   1,   1,  -1,  -1,   1,   1,  -1,  -1
-const pw_pmpmpmpm,          times  1 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
+const pw_pmpmpmpm,          times 16 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
 const pw_pmmpzzzz,          times  1 dw   1,  -1,  -1,   1,   0,   0,   0,   0
 const multi_2Row,           times  1 dw   1,   2,   3,   4,   1,   2,   3,   4
 const multiH,               times  1 dw   9,  10,  11,  12,  13,  14,  15,  16
@@ -100,7 +100,9 @@ const multiH2,              times  1 dw 
 const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
 const pw_planar32_mul,      times  1 dw  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,  17,  16
 const pw_FFFFFFFFFFFFFFF0,           dw 0x00
-                            times 7  dw 0xff
+                            times  7 dw 0xff
+const hmul_16p,             times 16 db   1
+                            times  8 db   1,  -1
 
 
 ;; 32-bit constants
@@ -110,7 +112,7 @@ const pd_2,                 times  8 dd 
 const pd_4,                 times  4 dd 4
 const pd_8,                 times  4 dd 8
 const pd_16,                times  4 dd 16
-const pd_32,                times  4 dd 32
+const pd_32,                times  8 dd 32
 const pd_64,                times  4 dd 64
 const pd_128,               times  4 dd 128
 const pd_256,               times  4 dd 256
@@ -122,7 +124,6 @@ const pd_32767,             times  4 dd 
 const pd_n32768,            times  4 dd 0xffff8000
 
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
-const deinterleave_shufd,   times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
 const popcnt_table
 %assign x 0
diff -r d7b100e51e82 -r 58309953273e source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Mon May 18 18:24:08 2015 -0500
+++ b/source/common/x86/ipfilter16.asm	Tue May 19 17:04:04 2015 +0530
@@ -115,6 +115,9 @@ tab_LumaCoeffVer: times 8 dw 0, 0
 
 const interp8_hps_shuf,     dd 0, 4, 1, 5, 2, 6, 3, 7
 
+const interp8_hpp_shuf,     db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+                            db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
+
 SECTION .text
 cextern pd_32
 cextern pw_pixel_max
@@ -859,6 +862,637 @@ FILTER_HOR_LUMA_W24 24, 32, ps
     movhps      [r2 + r3],  m3
 %endmacro
 
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W8 1
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_8x%1, 4,6,8
+    add              r1d, r1d
+    add              r3d, r3d
+    sub              r0, 6
+    mov              r4d, r4m
+    shl              r4d, 4
+%ifdef PIC
+    lea              r5, [tab_LumaCoeff]
+    vpbroadcastq     m0, [r5 + r4]
+    vpbroadcastq     m1, [r5 + r4 + 8]
+%else
+    vpbroadcastq     m0, [tab_LumaCoeff + r4]
+    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]
+%endif
+    mova             m3, [interp8_hpp_shuf]
+    mova             m7, [pd_32]
+    pxor             m2, m2
+
+    ; register map
+    ; m0 , m1 interpolate coeff
+
+    mov              r4d, %1/2
+
+.loop:
+    vbroadcasti128   m4, [r0]
+    vbroadcasti128   m5, [r0 + 8]
+    pshufb           m4, m3
+    pshufb           m5, m3
+
+    pmaddwd          m4, m0
+    pmaddwd          m5, m1
+    paddd            m4, m5
+
+    vbroadcasti128   m5, [r0 + 8]
+    vbroadcasti128   m6, [r0 + 16]
+    pshufb           m5, m3
+    pshufb           m6, m3
+
+    pmaddwd          m5, m0
+    pmaddwd          m6, m1
+    paddd            m5, m6
+
+    phaddd           m4, m5
+    vpermq           m4, m4, q3120
+    paddd            m4, m7
+    psrad            m4, 6
+
+    packusdw         m4, m4
+    vpermq           m4, m4, q2020
+    CLIPW            m4, m2, [pw_pixel_max]
+    movu             [r2], xm4
+
+    vbroadcasti128   m4, [r0 + r1]
+    vbroadcasti128   m5, [r0 + r1 + 8]
+    pshufb           m4, m3
+    pshufb           m5, m3
+
+    pmaddwd          m4, m0
+    pmaddwd          m5, m1
+    paddd            m4, m5
+
+    vbroadcasti128   m5, [r0 + r1 + 8]
+    vbroadcasti128   m6, [r0 + r1 + 16]
+    pshufb           m5, m3
+    pshufb           m6, m3
+
+    pmaddwd          m5, m0
+    pmaddwd          m6, m1
+    paddd            m5, m6
+
+    phaddd           m4, m5
+    vpermq           m4, m4, q3120
+    paddd            m4, m7
+    psrad            m4, 6
+
+    packusdw         m4, m4
+    vpermq           m4, m4, q2020
+    CLIPW            m4, m2, [pw_pixel_max]
+    movu             [r2 + r3], xm4
+
+    lea              r2, [r2 + 2 * r3]
+    lea              r0, [r0 + 2 * r1]
+    dec              r4d
+    jnz              .loop
+    RET
+%endmacro
+FILTER_HOR_LUMA_W8 4
+FILTER_HOR_LUMA_W8 8
+FILTER_HOR_LUMA_W8 16
+FILTER_HOR_LUMA_W8 32
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W16 1
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_16x%1, 4,6,8
+    add              r1d, r1d
+    add              r3d, r3d
+    sub              r0, 6
+    mov              r4d, r4m
+    shl              r4d, 4
+%ifdef PIC
+    lea              r5, [tab_LumaCoeff]
+    vpbroadcastq     m0, [r5 + r4]
+    vpbroadcastq     m1, [r5 + r4 + 8]
+%else
+    vpbroadcastq     m0, [tab_LumaCoeff + r4]
+    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]
+%endif
+    mova             m3, [interp8_hpp_shuf]
+    mova             m7, [pd_32]
+    pxor             m2, m2
+
+    ; register map
+    ; m0 , m1 interpolate coeff
+
+    mov              r4d, %1
+
+.loop:
+    vbroadcasti128   m4, [r0]
+    vbroadcasti128   m5, [r0 + 8]
+    pshufb           m4, m3
+    pshufb           m5, m3
+
+    pmaddwd          m4, m0
+    pmaddwd          m5, m1
+    paddd            m4, m5
+
+    vbroadcasti128   m5, [r0 + 8]
+    vbroadcasti128   m6, [r0 + 16]
+    pshufb           m5, m3
+    pshufb           m6, m3
+
+    pmaddwd          m5, m0
+    pmaddwd          m6, m1
+    paddd            m5, m6
+
+    phaddd           m4, m5
+    vpermq           m4, m4, q3120
+    paddd            m4, m7
+    psrad            m4, 6
+
+    packusdw         m4, m4
+    vpermq           m4, m4, q2020
+    CLIPW            m4, m2, [pw_pixel_max]
+    movu             [r2], xm4
+
+    vbroadcasti128   m4, [r0 + 16]
+    vbroadcasti128   m5, [r0 + 24]
+    pshufb           m4, m3
+    pshufb           m5, m3
+
+    pmaddwd          m4, m0
+    pmaddwd          m5, m1
+    paddd            m4, m5
+
+    vbroadcasti128   m5, [r0 + 24]
+    vbroadcasti128   m6, [r0 + 32]
+    pshufb           m5, m3
+    pshufb           m6, m3
+


More information about the x265-commits mailing list