[x265] [PATCH 1 of 4] asm: fix Main12 Assembly error and disable fault functions, now we are working with assembly up to AVX
Steve Borho
steve at borho.org
Wed Jul 22 19:53:01 CEST 2015
On 07/22, Steve Borho wrote:
> On 07/21, Min Chen wrote:
> > # HG changeset patch
> > # User Min Chen <chenm003 at 163.com>
> > # Date 1437514211 25200
> > # Node ID ab2c34d6ad913369fd8feb84aee10030ffaa0df5
> > # Parent 46152345eb6ff261fd90272f7a0712300d6324c0
> > asm: fix Main12 Assembly error and disable fault functions, now we are work with assembly up to AVX
>
> nice! queued for smoke testing
I'm enabling ASM in my 12bit smoke test builds; we should do the same
for all our automated regression test systems. main12 should be tested
with ASM enabled as a build option, including running the assembly
testbench.
Some example encodes on a Haswell (dual-core) Macbook:
http://privatepaste.com/71b507772b
Note that main12 still needs its lambda tables fixed, that's why the
bitrate is out-of-whack.
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/asm-primitives.cpp
> > --- a/source/common/x86/asm-primitives.cpp Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/asm-primitives.cpp Tue Jul 21 14:30:11 2015 -0700
> > @@ -1043,7 +1043,9 @@
> >
> > // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
> > ALL_LUMA_PU(satd, pixel_satd, ssse3);
> > +#if X265_DEPTH <= 10
> > ASSIGN_SA8D(ssse3);
> > +#endif
> > INTRA_ANG_SSSE3(ssse3);
> >
> > p.dst4x4 = PFX(dst4_ssse3);
> > @@ -1126,14 +1128,18 @@
> >
> > // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
> > ALL_LUMA_PU(satd, pixel_satd, sse4);
> > +#if X265_DEPTH <= 10
> > ASSIGN_SA8D(sse4);
> > +#endif
> >
> > p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
> > p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
> > p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
> > p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
> >
> > +#if X265_DEPTH <= 10
> > ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
> > +#endif
> > ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
> > INTRA_ANG_SSE4_COMMON(sse4);
> > INTRA_ANG_SSE4_HIGH(sse4);
> > @@ -1147,7 +1153,9 @@
> >
> > // TODO: check POPCNT flag!
> > ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
> > +#if X265_DEPTH <= 10
> > ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
> > +#endif
> > ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
> >
> > p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
> > @@ -1184,7 +1192,9 @@
> > p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
> > p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
> > p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
> > +#if X265_DEPTH <= 10
> > ASSIGN_SA8D(avx);
> > +#endif
> > p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
> > p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
> > p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
> > @@ -1292,7 +1302,9 @@
> > {
> > //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
> > ALL_LUMA_PU(satd, pixel_satd, xop);
> > +#if X265_DEPTH <= 10
> > ASSIGN_SA8D(xop);
> > +#endif
> > LUMA_VAR(xop);
> > p.frameInitLowres = PFX(frame_init_lowres_core_xop);
> > }
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/const-a.asm
> > --- a/source/common/x86/const-a.asm Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/const-a.asm Tue Jul 21 14:30:11 2015 -0700
> > @@ -79,6 +79,7 @@
> > const pw_512, times 16 dw 512
> > const pw_1023, times 16 dw 1023
> > const pw_1024, times 16 dw 1024
> > +const pw_2048, times 16 dw 2048
> > const pw_4096, times 16 dw 4096
> > const pw_8192, times 8 dw 8192
> > const pw_00ff, times 16 dw 0x00ff
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/intrapred16.asm
> > --- a/source/common/x86/intrapred16.asm Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/intrapred16.asm Tue Jul 21 14:30:11 2015 -0700
> > @@ -1748,7 +1748,7 @@
> > ; filter top
> > movu m1, [r2]
> > paddw m1, m0
> > - psraw m1, 2
> > + psrlw m1, 2
> > movh [r0], m1 ; overwrite top-left pixel, we will update it later
> >
> > ; filter top-left
> > @@ -1763,7 +1763,7 @@
> > lea r0, [r0 + r1 * 2]
> > movu m1, [r3 + 2]
> > paddw m1, m0
> > - psraw m1, 2
> > + psrlw m1, 2
> > movd r3d, m1
> > mov [r0], r3w
> > shr r3d, 16
> > @@ -1872,7 +1872,7 @@
> > ; filter top
> > movu m0, [r2]
> > paddw m0, m1
> > - psraw m0, 2
> > + psrlw m0, 2
> > movu [r6], m0
> >
> > ; filter top-left
> > @@ -1887,7 +1887,7 @@
> > add r6, r1
> > movu m0, [r3 + 2]
> > paddw m0, m1
> > - psraw m0, 2
> > + psrlw m0, 2
> > pextrw [r6], m0, 0
> > pextrw [r6 + r1], m0, 1
> > pextrw [r6 + r1 * 2], m0, 2
> > @@ -1913,13 +1913,13 @@
> > movu m2, [r2]
> > movu m3, [r2 + 16]
> >
> > - paddw m0, m1
> > + paddw m0, m1 ; dynamic range 13 bits
> > paddw m2, m3
> > - paddw m0, m2
> > - movhlps m1, m0
> > - paddw m0, m1
> > - phaddw m0, m0
> > + paddw m0, m2 ; dynamic range 14 bits
> > + movhlps m1, m0 ; dynamic range 15 bits
> > + paddw m0, m1 ; dynamic range 16 bits
> > pmaddwd m0, [pw_1]
> > + phaddd m0, m0
> >
> > movd r5d, m0
> > add r5d, 16
> > @@ -1983,11 +1983,11 @@
> > ; filter top
> > movu m2, [r2]
> > paddw m2, m1
> > - psraw m2, 2
> > + psrlw m2, 2
> > movu [r6], m2
> > movu m3, [r2 + 16]
> > paddw m3, m1
> > - psraw m3, 2
> > + psrlw m3, 2
> > movu [r6 + 16], m3
> >
> > ; filter top-left
> > @@ -2002,7 +2002,7 @@
> > add r6, r1
> > movu m2, [r3 + 2]
> > paddw m2, m1
> > - psraw m2, 2
> > + psrlw m2, 2
> >
> > pextrw [r6], m2, 0
> > pextrw [r6 + r1], m2, 1
> > @@ -2019,7 +2019,7 @@
> > lea r6, [r6 + r1 * 2]
> > movu m3, [r3 + 18]
> > paddw m3, m1
> > - psraw m3, 2
> > + psrlw m3, 2
> >
> > pextrw [r6], m3, 0
> > pextrw [r6 + r1], m3, 1
> > @@ -2046,21 +2046,21 @@
> > movu m1, [r3 + 16]
> > movu m2, [r3 + 32]
> > movu m3, [r3 + 48]
> > - paddw m0, m1
> > + paddw m0, m1 ; dynamic range 13 bits
> > paddw m2, m3
> > - paddw m0, m2
> > + paddw m0, m2 ; dynamic range 14 bits
> > movu m1, [r2]
> > movu m3, [r2 + 16]
> > movu m4, [r2 + 32]
> > movu m5, [r2 + 48]
> > - paddw m1, m3
> > + paddw m1, m3 ; dynamic range 13 bits
> > paddw m4, m5
> > - paddw m1, m4
> > - paddw m0, m1
> > + paddw m1, m4 ; dynamic range 14 bits
> > + paddw m0, m1 ; dynamic range 15 bits
> > + pmaddwd m0, [pw_1]
> > movhlps m1, m0
> > - paddw m0, m1
> > - phaddw m0, m0
> > - pmaddwd m0, [pw_1]
> > + paddd m0, m1
> > + phaddd m0, m0
> >
> > paddd m0, [pd_32] ; sum = sum + 32
> > psrld m0, 6 ; sum = sum / 64
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/ipfilter16.asm
> > --- a/source/common/x86/ipfilter16.asm Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/ipfilter16.asm Tue Jul 21 14:30:11 2015 -0700
> > @@ -26,6 +26,25 @@
> > %include "x86inc.asm"
> > %include "x86util.asm"
> >
> > +
> > +%define INTERP_OFFSET_PP pd_32
> > +%define INTERP_SHIFT_PP 6
> > +
> > +%if BIT_DEPTH == 10
> > + %define INTERP_SHIFT_PS 2
> > + %define INTERP_OFFSET_PS pd_n32768
> > + %define INTERP_SHIFT_SP 10
> > + %define INTERP_OFFSET_SP pd_524800
> > +%elif BIT_DEPTH == 12
> > + %define INTERP_SHIFT_PS 4
> > + %define INTERP_OFFSET_PS pd_n131072
> > + %define INTERP_SHIFT_SP 8
> > + %define INTERP_OFFSET_SP pd_524416
> > +%else
> > + %error Unsupport bit depth!
> > +%endif
> > +
> > +
> > SECTION_RODATA 32
> >
> > tab_c_32: times 8 dd 32
> > @@ -145,21 +164,9 @@
> > const pb_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
> > db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
> >
> > -%if BIT_DEPTH == 10
> > - %define INTERP_OFFSET_PS pd_n32768
> > - %define INTERP_SHIFT_PS 2
> > - %define INTERP_OFFSET_SP pd_524800
> > - %define INTERP_SHIFT_SP 10
> > -%elif BIT_DEPTH == 12
> > - %define INTERP_OFFSET_PS pd_n131072
> > - %define INTERP_SHIFT_PS 4
> > - %define INTERP_OFFSET_SP pd_524416
> > - %define INTERP_SHIFT_SP 8
> > -%else
> > - %error Unsupport bit depth!
> > -%endif
> >
> > SECTION .text
> > +cextern pd_8
> > cextern pd_32
> > cextern pw_pixel_max
> > cextern pd_524416
> > @@ -503,7 +510,7 @@
> > %endif
> >
> > %ifidn %1,pp
> > - mova m7, [pd_32]
> > + mova m7, [INTERP_OFFSET_PP]
> > %define SHIFT 6
> > %elifidn %1,ps
> > mova m7, [INTERP_OFFSET_PS]
> > @@ -1176,7 +1183,6 @@
> > %macro FILTER_HOR_LUMA_W4 3
> > INIT_XMM sse4
> > cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
> > -
> > mov r4d, r4m
> > sub r0, 6
> > shl r4d, 4
> > @@ -1229,7 +1235,7 @@
> > packusdw m4, m4
> > CLIPW m4, m6, m7
> > %else
> > - psrad m4, 2
> > + psrad m4, INTERP_SHIFT_PS
> > packssdw m4, m4
> > %endif
> >
> > @@ -1287,7 +1293,7 @@
> > mov r4d, %2
> > %ifidn %3, ps
> > cmp r5m, byte 0
> > - je .loopH
> > + je .loopH
> > lea r6, [r1 + 2 * r1]
> > sub r0, r6
> > add r4d, 7
> > @@ -1329,8 +1335,8 @@
> > packusdw m4, m5
> > CLIPW m4, m7, [pw_pixel_max]
> > %else
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m4, m5
> > %endif
> >
> > @@ -1340,7 +1346,7 @@
> > add r2, r3
> >
> > dec r4d
> > - jnz .loopH
> > + jnz .loopH
> > RET
> > %endmacro
> >
> > @@ -1380,7 +1386,7 @@
> > mova m0, [tab_LumaCoeff + r4]
> > %endif
> > %ifidn %3, pp
> > - mova m1, [pd_32]
> > + mova m1, [INTERP_OFFSET_PP]
> > %else
> > mova m1, [INTERP_OFFSET_PS]
> > %endif
> > @@ -1425,14 +1431,14 @@
> > phaddd m5, m6
> > paddd m5, m1
> > %ifidn %3, pp
> > - psrad m4, 6
> > - psrad m5, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m4, m5
> > pxor m5, m5
> > CLIPW m4, m5, [pw_pixel_max]
> > %else
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m4, m5
> > %endif
> >
> > @@ -1453,12 +1459,12 @@
> > phaddd m4, m5
> > paddd m4, m1
> > %ifidn %3, pp
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> > packusdw m4, m4
> > pxor m5, m5
> > CLIPW m4, m5, [pw_pixel_max]
> > %else
> > - psrad m4, 2
> > + psrad m4, INTERP_SHIFT_PS
> > packssdw m4, m4
> > %endif
> >
> > @@ -1550,14 +1556,14 @@
> > phaddd m5, m6
> > paddd m5, m1
> > %ifidn %3, pp
> > - psrad m4, 6
> > - psrad m5, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m4, m5
> > pxor m5, m5
> > CLIPW m4, m5, [pw_pixel_max]
> > %else
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m4, m5
> > %endif
> > movu [r2 + x], m4
> > @@ -1591,14 +1597,14 @@
> > phaddd m5, m6
> > paddd m5, m1
> > %ifidn %3, pp
> > - psrad m4, 6
> > - psrad m5, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m4, m5
> > pxor m5, m5
> > CLIPW m4, m5, [pw_pixel_max]
> > %else
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m4, m5
> > %endif
> > movu [r2 + 16 + x], m4
> > @@ -1743,14 +1749,14 @@
> > phaddd m5, m6
> > paddd m5, m1
> > %ifidn %3, pp
> > - psrad m4, 6
> > - psrad m5, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m4, m5
> > pxor m5, m5
> > CLIPW m4, m5, [pw_pixel_max]
> > %else
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m4, m5
> > %endif
> > movu [r2], m4
> > @@ -1784,14 +1790,14 @@
> > phaddd m5, m6
> > paddd m5, m1
> > %ifidn %3, pp
> > - psrad m4, 6
> > - psrad m5, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m4, m5
> > pxor m5, m5
> > CLIPW m4, m5, [pw_pixel_max]
> > %else
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m4, m5
> > %endif
> > movu [r2 + 16], m4
> > @@ -1825,14 +1831,14 @@
> > phaddd m5, m6
> > paddd m5, m1
> > %ifidn %3, pp
> > - psrad m4, 6
> > - psrad m5, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m4, m5
> > pxor m5, m5
> > CLIPW m4, m5, [pw_pixel_max]
> > %else
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m4, m5
> > %endif
> > movu [r2 + 32], m4
> > @@ -1865,11 +1871,11 @@
> > phaddd m3, m4
> > paddd m3, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > + psrad m3, INTERP_SHIFT_PP
> > packusdw m3, m3
> > CLIPW m3, m7, m6
> > %else
> > - psrad m3, 2
> > + psrad m3, INTERP_SHIFT_PS
> > packssdw m3, m3
> > %endif
> > movd [r2], m3
> > @@ -1895,13 +1901,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m7, m6
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2], m3
> > @@ -1950,7 +1956,7 @@
> > phaddd m4, m4
> > vpermq m4, m4, q3120
> > paddd m4, m6
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -1969,7 +1975,7 @@
> > phaddd m4, m4
> > vpermq m4, m4, q3120
> > paddd m4, m6
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2036,7 +2042,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2064,7 +2070,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2132,7 +2138,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2160,7 +2166,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2232,7 +2238,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2260,7 +2266,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2335,7 +2341,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2363,7 +2369,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2425,7 +2431,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2453,7 +2459,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2481,7 +2487,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2545,7 +2551,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2573,7 +2579,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2601,7 +2607,7 @@
> > phaddd m4, m5
> > vpermq m4, m4, q3120
> > paddd m4, m7
> > - psrad m4, 6
> > + psrad m4, INTERP_SHIFT_PP
> >
> > packusdw m4, m4
> > vpermq m4, m4, q2020
> > @@ -2644,32 +2650,32 @@
> > mova m1, [INTERP_OFFSET_PS]
> > cmp r5m, byte 0
> > je .skip
> > - sub r0, r1
> > - movu m3, [r0]
> > - pshufb m3, m3, m2
> > - pmaddwd m3, m0
> > -
> > - %if %1 == 4
> > - movu m4, [r0 + 4]
> > - pshufb m4, m4, m2
> > - pmaddwd m4, m0
> > - phaddd m3, m4
> > - %else
> > - phaddd m3, m3
> > - %endif
> > -
> > - paddd m3, m1
> > - psrad m3, INTERP_SHIFT_PS
> > - packssdw m3, m3
> > -
> > - %if %1 == 2
> > - movd [r2], m3
> > - %else
> > - movh [r2], m3
> > - %endif
> > -
> > - add r0, r1
> > - add r2, r3
> > + sub r0, r1
> > + movu m3, [r0]
> > + pshufb m3, m3, m2
> > + pmaddwd m3, m0
> > +
> > + %if %1 == 4
> > + movu m4, [r0 + 4]
> > + pshufb m4, m4, m2
> > + pmaddwd m4, m0
> > + phaddd m3, m4
> > + %else
> > + phaddd m3, m3
> > + %endif
> > +
> > + paddd m3, m1
> > + psrad m3, INTERP_SHIFT_PS
> > + packssdw m3, m3
> > +
> > + %if %1 == 2
> > + movd [r2], m3
> > + %else
> > + movh [r2], m3
> > + %endif
> > +
> > + add r0, r1
> > + add r2, r3
> > FILTER_W%1_2 %3
> > lea r0, [r0 + 2 * r1]
> > lea r2, [r2 + 2 * r3]
> > @@ -2689,7 +2695,6 @@
> > lea r2, [r2 + 2 * r3]
> > FILTER_W%1_2 %3
> > %endrep
> > -
> > RET
> > %endmacro
> >
> > @@ -2729,13 +2734,13 @@
> > phaddd m4, m4
> > paddd m4, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m4, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m4, INTERP_SHIFT_PP
> > packusdw m3, m4
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m4, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m4, INTERP_SHIFT_PS
> > packssdw m3, m4
> > %endif
> > movh [r2], m3
> > @@ -2769,13 +2774,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2], m3
> > @@ -2809,13 +2814,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2], m3
> > @@ -2831,11 +2836,11 @@
> > paddd m3, m1
> >
> > %ifidn %1, pp
> > - psrad m3, 6
> > + psrad m3, INTERP_SHIFT_PP
> > packusdw m3, m3
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > + psrad m3, INTERP_SHIFT_PS
> > packssdw m3, m3
> > %endif
> > movh [r2 + 16], m3
> > @@ -2868,13 +2873,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2], m3
> > @@ -2898,13 +2903,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2 + 16], m3
> > @@ -2938,13 +2943,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2], m3
> > @@ -2968,13 +2973,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2 + 16], m3
> > @@ -2998,13 +3003,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2 + 32], m3
> > @@ -3038,13 +3043,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2], m3
> > @@ -3068,13 +3073,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2 + 16], m3
> > @@ -3098,13 +3103,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2 + 32], m3
> > @@ -3128,13 +3133,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2 + 48], m3
> > @@ -3168,13 +3173,13 @@
> > phaddd m5, m4
> > paddd m5, m1
> > %ifidn %1, pp
> > - psrad m3, 6
> > - psrad m5, 6
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > packusdw m3, m5
> > CLIPW m3, m6, m7
> > %else
> > - psrad m3, 2
> > - psrad m5, 2
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > packssdw m3, m5
> > %endif
> > movh [r2 + %2], m3
> > @@ -3408,7 +3413,7 @@
> > pmaddwd m4, m0
> > phaddd m3, m4
> > paddd m3, m2
> > - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
> > + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
> >
> > packusdw m3, m3
> > vpermq m3, m3, q2020
> > @@ -3426,7 +3431,7 @@
> > pmaddwd m4, m0
> > phaddd m3, m4
> > paddd m3, m2
> > - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
> > + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
> >
> > packusdw m3, m3
> > vpermq m3, m3, q2020
> > @@ -3474,7 +3479,7 @@
> > pmaddwd m4, m0
> > phaddd m3, m4
> > paddd m3, m2
> > - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
> > + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
> >
> > packusdw m3, m3
> > vpermq m3, m3,q2020
> > @@ -3491,7 +3496,7 @@
> > pmaddwd m4, m0
> > phaddd m3, m4
> > paddd m3, m2
> > - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
> > + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
> >
> > packusdw m3, m3
> > vpermq m3, m3,q2020
> > @@ -4089,7 +4094,7 @@
> > %ifnidn %3, ps
> > mova m7, [pw_pixel_max]
> > %ifidn %3, pp
> > - mova m6, [tab_c_32]
> > + mova m6, [INTERP_OFFSET_PP]
> > %else
> > mova m6, [INTERP_OFFSET_SP]
> > %endif
> > @@ -4129,10 +4134,10 @@
> > paddd m2, m6
> > paddd m3, m6
> > %ifidn %3, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > %else
> > psrad m0, INTERP_SHIFT_SP
> > psrad m1, INTERP_SHIFT_SP
> > @@ -4344,9 +4349,9 @@
> > pxor m7, m7
> > mova m6, [pw_pixel_max]
> > %ifidn %2, pp
> > - mova m5, [tab_c_32]
> > + mova m5, [INTERP_OFFSET_PP]
> > %else
> > - mova m5, [tab_c_524800]
> > + mova m5, [INTERP_OFFSET_SP]
> > %endif
> > %else
> > mova m5, [INTERP_OFFSET_PS]
> > @@ -4362,18 +4367,18 @@
> > %elifidn %2, ps
> > paddd m0, m5
> > paddd m2, m5
> > - psrad m0, 2
> > - psrad m2, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > packssdw m0, m2
> > %else
> > paddd m0, m5
> > paddd m2, m5
> > %ifidn %2, pp
> > - psrad m0, 6
> > - psrad m2, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > %else
> > - psrad m0, 10
> > - psrad m2, 10
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > %endif
> > packusdw m0, m2
> > CLIPW m0, m7, m6
> > @@ -4389,7 +4394,6 @@
> >
> > dec r4d
> > jnz .loopH
> > -
> > RET
> > %endmacro
> >
> > @@ -4417,7 +4421,6 @@
> > %macro FILTER_VER_CHROMA_W4 3
> > INIT_XMM sse4
> > cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3
> > -
> > add r1d, r1d
> > add r3d, r3d
> > sub r0, r1
> > @@ -4439,9 +4442,9 @@
> > pxor m6, m6
> > mova m5, [pw_pixel_max]
> > %ifidn %2, pp
> > - mova m4, [tab_c_32]
> > + mova m4, [INTERP_OFFSET_PP]
> > %else
> > - mova m4, [tab_c_524800]
> > + mova m4, [INTERP_OFFSET_SP]
> > %endif
> > %else
> > mova m4, [INTERP_OFFSET_PS]
> > @@ -4479,18 +4482,18 @@
> > %elifidn %2, ps
> > paddd m0, m4
> > paddd m1, m4
> > - psrad m0, 2
> > - psrad m1, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > packssdw m0, m1
> > %else
> > paddd m0, m4
> > paddd m1, m4
> > %ifidn %2, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > %else
> > - psrad m0, 10
> > - psrad m1, 10
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > %endif
> > packusdw m0, m1
> > CLIPW m0, m6, m5
> > @@ -4504,7 +4507,6 @@
> > dec r4d
> > jnz .loop
> > %endif
> > -
> > RET
> > %endmacro
> >
> > @@ -4524,7 +4526,6 @@
> > %macro FILTER_VER_CHROMA_W6 3
> > INIT_XMM sse4
> > cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3
> > -
> > add r1d, r1d
> > add r3d, r3d
> > sub r0, r1
> > @@ -4543,9 +4544,9 @@
> > %ifnidn %2, ps
> > mova m7, [pw_pixel_max]
> > %ifidn %2, pp
> > - mova m6, [tab_c_32]
> > + mova m6, [INTERP_OFFSET_PP]
> > %else
> > - mova m6, [tab_c_524800]
> > + mova m6, [INTERP_OFFSET_SP]
> > %endif
> > %else
> > mova m6, [INTERP_OFFSET_PS]
> > @@ -4568,10 +4569,10 @@
> > paddd m1, m6
> > paddd m2, m6
> > paddd m3, m6
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -4581,15 +4582,15 @@
> > paddd m2, m6
> > paddd m3, m6
> > %ifidn %2, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > %else
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > %endif
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -4616,18 +4617,18 @@
> > %elifidn %2, ps
> > paddd m0, m6
> > paddd m2, m6
> > - psrad m0, 2
> > - psrad m2, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > packssdw m0, m2
> > %else
> > paddd m0, m6
> > paddd m2, m6
> > %ifidn %2, pp
> > - psrad m0, 6
> > - psrad m2, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > %else
> > - psrad m0, 10
> > - psrad m2, 10
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > %endif
> > packusdw m0, m2
> > CLIPW m0, m5, m7
> > @@ -4644,7 +4645,6 @@
> >
> > dec r4d
> > jnz .loopH
> > -
> > RET
> > %endmacro
> >
> > @@ -4712,7 +4712,7 @@
> > mov r4d, %2/2
> >
> > %ifidn %3, pp
> > - mova m7, [tab_c_32]
> > + mova m7, [INTERP_OFFSET_PP]
> > %elifidn %3, sp
> > mova m7, [INTERP_OFFSET_SP]
> > %elifidn %3, ps
> > @@ -4748,10 +4748,10 @@
> > paddd m2, m7
> > paddd m3, m7
> > %ifidn %3, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > %else
> > psrad m0, INTERP_SHIFT_SP
> > psrad m1, INTERP_SHIFT_SP
> > @@ -4772,7 +4772,6 @@
> >
> > dec r4d
> > jnz .loopH
> > -
> > RET
> > %endmacro
> >
> > @@ -4868,9 +4867,9 @@
> > mov r6d, %1/4
> >
> > %ifidn %2,pp
> > - vbroadcasti128 m8, [pd_32]
> > + vbroadcasti128 m8, [INTERP_OFFSET_PP]
> > %elifidn %2, sp
> > - mova m8, [pd_524800]
> > + mova m8, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m8, [INTERP_OFFSET_PS]
> > %endif
> > @@ -4934,20 +4933,20 @@
> > paddd m2, m8
> > paddd m3, m8
> > %ifidn %2,pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > %elifidn %2, sp
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > -%else
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -5012,9 +5011,9 @@
> > mov r4d, %1/2
> >
> > %ifidn %2, pp
> > - mova m7, [tab_c_32]
> > + mova m7, [INTERP_OFFSET_PP]
> > %elifidn %2, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %elifidn %2, ps
> > mova m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -5034,10 +5033,10 @@
> > paddd m1, m7
> > paddd m2, m7
> > paddd m3, m7
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5047,15 +5046,15 @@
> > paddd m2, m7
> > paddd m3, m7
> > %ifidn %2, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > -%else
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > +%else
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > %endif
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5184,9 +5183,9 @@
> > mov r4d, %1/2
> >
> > %ifidn %2, pp
> > - mova m7, [tab_c_32]
> > + mova m7, [INTERP_OFFSET_PP]
> > %elifidn %2, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %elifidn %2, ps
> > mova m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -5213,18 +5212,18 @@
> > paddd m1, m7
> > paddd m2, m7
> > paddd m3, m7
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > paddd m8, m7
> > paddd m9, m7
> > paddd m10, m7
> > paddd m11, m7
> > - psrad m8, 2
> > - psrad m9, 2
> > - psrad m10, 2
> > - psrad m11, 2
> > + psrad m8, INTERP_SHIFT_PS
> > + psrad m9, INTERP_SHIFT_PS
> > + psrad m10, INTERP_SHIFT_PS
> > + psrad m11, INTERP_SHIFT_PS
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5240,23 +5239,23 @@
> > paddd m10, m7
> > paddd m11, m7
> > %ifidn %2, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > - psrad m8, 6
> > - psrad m9, 6
> > - psrad m10, 6
> > - psrad m11, 6
> > -%else
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > - psrad m8, 10
> > - psrad m9, 10
> > - psrad m10, 10
> > - psrad m11, 10
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m8, INTERP_SHIFT_PP
> > + psrad m9, INTERP_SHIFT_PP
> > + psrad m10, INTERP_SHIFT_PP
> > + psrad m11, INTERP_SHIFT_PP
> > +%else
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > + psrad m8, INTERP_SHIFT_SP
> > + psrad m9, INTERP_SHIFT_SP
> > + psrad m10, INTERP_SHIFT_SP
> > + psrad m11, INTERP_SHIFT_SP
> > %endif
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5326,9 +5325,9 @@
> > mov r4d, %1/2
> >
> > %ifidn %2, pp
> > - mova m7, [tab_c_32]
> > + mova m7, [INTERP_OFFSET_PP]
> > %elifidn %2, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %elifidn %2, ps
> > mova m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -5380,10 +5379,10 @@
> > paddd m1, m7
> > paddd m2, m7
> > paddd m3, m7
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5393,15 +5392,15 @@
> > paddd m2, m7
> > paddd m3, m7
> > %ifidn %2, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > -%else
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > +%else
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > %endif
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5457,9 +5456,9 @@
> > mov r4d, %1/2
> >
> > %ifidn %2, pp
> > - mova m7, [tab_c_32]
> > + mova m7, [INTERP_OFFSET_PP]
> > %elifidn %2, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %elifidn %2, ps
> > mova m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -5479,10 +5478,10 @@
> > paddd m1, m7
> > paddd m2, m7
> > paddd m3, m7
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5492,15 +5491,15 @@
> > paddd m2, m7
> > paddd m3, m7
> > %ifidn %2, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > -%else
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > +%else
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > %endif
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5610,9 +5609,9 @@
> > mov r4d, %1/2
> >
> > %ifidn %2, pp
> > - mova m7, [tab_c_32]
> > + mova m7, [INTERP_OFFSET_PP]
> > %elifidn %2, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %elifidn %2, ps
> > mova m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -5639,18 +5638,18 @@
> > paddd m1, m7
> > paddd m2, m7
> > paddd m3, m7
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > paddd m8, m7
> > paddd m9, m7
> > paddd m10, m7
> > paddd m11, m7
> > - psrad m8, 2
> > - psrad m9, 2
> > - psrad m10, 2
> > - psrad m11, 2
> > + psrad m8, INTERP_SHIFT_PS
> > + psrad m9, INTERP_SHIFT_PS
> > + psrad m10, INTERP_SHIFT_PS
> > + psrad m11, INTERP_SHIFT_PS
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5666,23 +5665,23 @@
> > paddd m10, m7
> > paddd m11, m7
> > %ifidn %2, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > - psrad m8, 6
> > - psrad m9, 6
> > - psrad m10, 6
> > - psrad m11, 6
> > -%else
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > - psrad m8, 10
> > - psrad m9, 10
> > - psrad m10, 10
> > - psrad m11, 10
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m8, INTERP_SHIFT_PP
> > + psrad m9, INTERP_SHIFT_PP
> > + psrad m10, INTERP_SHIFT_PP
> > + psrad m11, INTERP_SHIFT_PP
> > +%else
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > + psrad m8, INTERP_SHIFT_SP
> > + psrad m9, INTERP_SHIFT_SP
> > + psrad m10, INTERP_SHIFT_SP
> > + psrad m11, INTERP_SHIFT_SP
> > %endif
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5733,9 +5732,9 @@
> > mov r4d, 32
> >
> > %ifidn %1, pp
> > - mova m7, [tab_c_32]
> > + mova m7, [INTERP_OFFSET_PP]
> > %elifidn %1, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %elifidn %1, ps
> > mova m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -5787,10 +5786,10 @@
> > paddd m1, m7
> > paddd m2, m7
> > paddd m3, m7
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5800,15 +5799,15 @@
> > paddd m2, m7
> > paddd m3, m7
> > %ifidn %1, pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > -%else
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > +%else
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > %endif
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -5827,6 +5826,7 @@
> > jnz .loopH
> > RET
> > %endmacro
> > +
> > FILTER_VER_CHROMA_W16_48x64_avx2 pp, 8
> > FILTER_VER_CHROMA_W16_48x64_avx2 ps, 8
> > FILTER_VER_CHROMA_W16_48x64_avx2 ss, 7
> > @@ -5834,7 +5834,6 @@
> >
> > INIT_XMM sse2
> > cglobal chroma_p2s, 3, 7, 3
> > -
> > ; load width and height
> > mov r3d, r3m
> > mov r4d, r4m
> > @@ -5850,11 +5849,11 @@
> > lea r6, [r0 + r5 * 2]
> >
> > movu m0, [r6]
> > - psllw m0, 4
> > + psllw m0, (14 - BIT_DEPTH)
> > paddw m0, m2
> >
> > movu m1, [r6 + r1]
> > - psllw m1, 4
> > + psllw m1, (14 - BIT_DEPTH)
> > paddw m1, m2
> >
> > add r5d, 8
> > @@ -5887,7 +5886,6 @@
> >
> > sub r4d, 2
> > jnz .loopH
> > -
> > RET
> >
> > %macro PROCESS_LUMA_VER_W4_4R 0
> > @@ -5975,7 +5973,7 @@
> > lea r6, [tab_LumaCoeffV + r4]
> > %endif
> >
> > - mova m7, [pd_32]
> > + mova m7, [INTERP_OFFSET_PP]
> >
> > mov dword [rsp], %2/4
> > .loopH:
> > @@ -5988,10 +5986,10 @@
> > paddd m2, m7
> > paddd m3, m7
> >
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -6017,7 +6015,6 @@
> >
> > dec dword [rsp]
> > jnz .loopH
> > -
> > RET
> > %endmacro
> >
> > @@ -6126,14 +6123,14 @@
> > paddd m0, m6
> > paddd m2, m6
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m2, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m2, 10
> > -%else
> > - psrad m0, 2
> > - psrad m2, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -6294,20 +6291,20 @@
> > paddd m2, m11
> > paddd m3, m11
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > -%else
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -6365,20 +6362,20 @@
> > paddd m6, m11
> > paddd m7, m11
> > %ifidn %1,pp
> > - psrad m4, 6
> > - psrad m5, 6
> > - psrad m6, 6
> > - psrad m7, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > + psrad m6, INTERP_SHIFT_PP
> > + psrad m7, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m4, 10
> > - psrad m5, 10
> > - psrad m6, 10
> > - psrad m7, 10
> > -%else
> > - psrad m4, 2
> > - psrad m5, 2
> > - psrad m6, 2
> > - psrad m7, 2
> > + psrad m4, INTERP_SHIFT_SP
> > + psrad m5, INTERP_SHIFT_SP
> > + psrad m6, INTERP_SHIFT_SP
> > + psrad m7, INTERP_SHIFT_SP
> > +%else
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > + psrad m6, INTERP_SHIFT_PS
> > + psrad m7, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -6538,26 +6535,26 @@
> > paddd m4, m14
> > paddd m5, m14
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > - psrad m4, 6
> > - psrad m5, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > - psrad m4, 10
> > - psrad m5, 10
> > -%else
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > + psrad m4, INTERP_SHIFT_SP
> > + psrad m5, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -6620,14 +6617,14 @@
> > paddd m6, m14
> > paddd m7, m14
> > %ifidn %1,pp
> > - psrad m6, 6
> > - psrad m7, 6
> > + psrad m6, INTERP_SHIFT_PP
> > + psrad m7, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m6, 10
> > - psrad m7, 10
> > -%else
> > - psrad m6, 2
> > - psrad m7, 2
> > + psrad m6, INTERP_SHIFT_SP
> > + psrad m7, INTERP_SHIFT_SP
> > +%else
> > + psrad m6, INTERP_SHIFT_PS
> > + psrad m7, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -6734,32 +6731,32 @@
> > paddd m0, m14
> > paddd m1, m14
> > %ifidn %1,pp
> > - psrad m8, 6
> > - psrad m9, 6
> > - psrad m10, 6
> > - psrad m11, 6
> > - psrad m12, 6
> > - psrad m13, 6
> > - psrad m0, 6
> > - psrad m1, 6
> > + psrad m8, INTERP_SHIFT_PP
> > + psrad m9, INTERP_SHIFT_PP
> > + psrad m10, INTERP_SHIFT_PP
> > + psrad m11, INTERP_SHIFT_PP
> > + psrad m12, INTERP_SHIFT_PP
> > + psrad m13, INTERP_SHIFT_PP
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m8, 10
> > - psrad m9, 10
> > - psrad m10, 10
> > - psrad m11, 10
> > - psrad m12, 10
> > - psrad m13, 10
> > - psrad m0, 10
> > - psrad m1, 10
> > -%else
> > - psrad m8, 2
> > - psrad m9, 2
> > - psrad m10, 2
> > - psrad m11, 2
> > - psrad m12, 2
> > - psrad m13, 2
> > - psrad m0, 2
> > - psrad m1, 2
> > + psrad m8, INTERP_SHIFT_SP
> > + psrad m9, INTERP_SHIFT_SP
> > + psrad m10, INTERP_SHIFT_SP
> > + psrad m11, INTERP_SHIFT_SP
> > + psrad m12, INTERP_SHIFT_SP
> > + psrad m13, INTERP_SHIFT_SP
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > +%else
> > + psrad m8, INTERP_SHIFT_PS
> > + psrad m9, INTERP_SHIFT_PS
> > + psrad m10, INTERP_SHIFT_PS
> > + psrad m11, INTERP_SHIFT_PS
> > + psrad m12, INTERP_SHIFT_PS
> > + psrad m13, INTERP_SHIFT_PS
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -6819,7 +6816,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m14, [pd_32]
> > %elifidn %1, sp
> > - mova m14, [pd_524800]
> > + mova m14, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m14, [INTERP_OFFSET_PS]
> > %endif
> > @@ -6870,7 +6867,7 @@
> > %ifidn %3,pp
> > vbroadcasti128 m14, [pd_32]
> > %elifidn %3, sp
> > - mova m14, [pd_524800]
> > + mova m14, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m14, [INTERP_OFFSET_PS]
> > %endif
> > @@ -6953,7 +6950,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m14, [pd_32]
> > %elifidn %1, sp
> > - mova m14, [pd_524800]
> > + mova m14, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m14, [INTERP_OFFSET_PS]
> > %endif
> > @@ -7089,26 +7086,26 @@
> > paddd m4, m14
> > paddd m5, m14
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > - psrad m4, 6
> > - psrad m5, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > - psrad m4, 10
> > - psrad m5, 10
> > -%else
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > + psrad m4, INTERP_SHIFT_SP
> > + psrad m5, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -7171,14 +7168,14 @@
> > paddd m6, m14
> > paddd m7, m14
> > %ifidn %1,pp
> > - psrad m6, 6
> > - psrad m7, 6
> > + psrad m6, INTERP_SHIFT_PP
> > + psrad m7, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m6, 10
> > - psrad m7, 10
> > -%else
> > - psrad m6, 2
> > - psrad m7, 2
> > + psrad m6, INTERP_SHIFT_SP
> > + psrad m7, INTERP_SHIFT_SP
> > +%else
> > + psrad m6, INTERP_SHIFT_PS
> > + psrad m7, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -7285,32 +7282,32 @@
> > paddd m0, m14
> > paddd m1, m14
> > %ifidn %1,pp
> > - psrad m8, 6
> > - psrad m9, 6
> > - psrad m10, 6
> > - psrad m11, 6
> > - psrad m12, 6
> > - psrad m13, 6
> > - psrad m0, 6
> > - psrad m1, 6
> > + psrad m8, INTERP_SHIFT_PP
> > + psrad m9, INTERP_SHIFT_PP
> > + psrad m10, INTERP_SHIFT_PP
> > + psrad m11, INTERP_SHIFT_PP
> > + psrad m12, INTERP_SHIFT_PP
> > + psrad m13, INTERP_SHIFT_PP
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m8, 10
> > - psrad m9, 10
> > - psrad m10, 10
> > - psrad m11, 10
> > - psrad m12, 10
> > - psrad m13, 10
> > - psrad m0, 10
> > - psrad m1, 10
> > -%else
> > - psrad m8, 2
> > - psrad m9, 2
> > - psrad m10, 2
> > - psrad m11, 2
> > - psrad m12, 2
> > - psrad m13, 2
> > - psrad m0, 2
> > - psrad m1, 2
> > + psrad m8, INTERP_SHIFT_SP
> > + psrad m9, INTERP_SHIFT_SP
> > + psrad m10, INTERP_SHIFT_SP
> > + psrad m11, INTERP_SHIFT_SP
> > + psrad m12, INTERP_SHIFT_SP
> > + psrad m13, INTERP_SHIFT_SP
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > +%else
> > + psrad m8, INTERP_SHIFT_PS
> > + psrad m9, INTERP_SHIFT_PS
> > + psrad m10, INTERP_SHIFT_PS
> > + psrad m11, INTERP_SHIFT_PS
> > + psrad m12, INTERP_SHIFT_PS
> > + psrad m13, INTERP_SHIFT_PS
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -7485,26 +7482,26 @@
> > paddd m4, m11
> > paddd m5, m11
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > - psrad m4, 6
> > - psrad m5, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > - psrad m4, 10
> > - psrad m5, 10
> > -%else
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > - psrad m4, 2
> > - psrad m5, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > + psrad m4, INTERP_SHIFT_SP
> > + psrad m5, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -7556,14 +7553,14 @@
> > paddd m6, m11
> > paddd m7, m11
> > %ifidn %1,pp
> > - psrad m6, 6
> > - psrad m7, 6
> > + psrad m6, INTERP_SHIFT_PP
> > + psrad m7, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m6, 10
> > - psrad m7, 10
> > -%else
> > - psrad m6, 2
> > - psrad m7, 2
> > + psrad m6, INTERP_SHIFT_SP
> > + psrad m7, INTERP_SHIFT_SP
> > +%else
> > + psrad m6, INTERP_SHIFT_PS
> > + psrad m7, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -7600,7 +7597,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m11, [pd_32]
> > %elifidn %1, sp
> > - mova m11, [pd_524800]
> > + mova m11, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m11, [INTERP_OFFSET_PS]
> > %endif
> > @@ -7647,7 +7644,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m14, [pd_32]
> > %elifidn %1, sp
> > - mova m14, [pd_524800]
> > + mova m14, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m14, [INTERP_OFFSET_PS]
> > %endif
> > @@ -7765,20 +7762,20 @@
> > paddd m2, m7
> > paddd m3, m7
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > -%else
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -7801,7 +7798,7 @@
> >
> > %macro FILTER_VER_LUMA_AVX2_16x4 1
> > INIT_YMM avx2
> > -cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize
> > +cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0-gprsize
> > mov r4d, r4m
> > shl r4d, 7
> > add r1d, r1d
> > @@ -7819,7 +7816,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m7, [pd_32]
> > %elifidn %1, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -7864,7 +7861,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m7, [pd_32]
> > %elifidn %1, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -7904,7 +7901,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m14, [pd_32]
> > %elifidn %1, sp
> > - mova m14, [pd_524800]
> > + mova m14, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m14, [INTERP_OFFSET_PS]
> > %endif
> > @@ -8014,20 +8011,20 @@
> > paddd m2, m14
> > paddd m3, m14
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m1, 6
> > - psrad m2, 6
> > - psrad m3, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > -%else
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -8105,20 +8102,20 @@
> > paddd m6, m14
> > paddd m7, m14
> > %ifidn %1,pp
> > - psrad m4, 6
> > - psrad m5, 6
> > - psrad m6, 6
> > - psrad m7, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > + psrad m6, INTERP_SHIFT_PP
> > + psrad m7, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m4, 10
> > - psrad m5, 10
> > - psrad m6, 10
> > - psrad m7, 10
> > -%else
> > - psrad m4, 2
> > - psrad m5, 2
> > - psrad m6, 2
> > - psrad m7, 2
> > + psrad m4, INTERP_SHIFT_SP
> > + psrad m5, INTERP_SHIFT_SP
> > + psrad m6, INTERP_SHIFT_SP
> > + psrad m7, INTERP_SHIFT_SP
> > +%else
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > + psrad m6, INTERP_SHIFT_PS
> > + psrad m7, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -8182,20 +8179,20 @@
> > paddd m10, m14
> > paddd m11, m14
> > %ifidn %1,pp
> > - psrad m8, 6
> > - psrad m9, 6
> > - psrad m10, 6
> > - psrad m11, 6
> > + psrad m8, INTERP_SHIFT_PP
> > + psrad m9, INTERP_SHIFT_PP
> > + psrad m10, INTERP_SHIFT_PP
> > + psrad m11, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m8, 10
> > - psrad m9, 10
> > - psrad m10, 10
> > - psrad m11, 10
> > -%else
> > - psrad m8, 2
> > - psrad m9, 2
> > - psrad m10, 2
> > - psrad m11, 2
> > + psrad m8, INTERP_SHIFT_SP
> > + psrad m9, INTERP_SHIFT_SP
> > + psrad m10, INTERP_SHIFT_SP
> > + psrad m11, INTERP_SHIFT_SP
> > +%else
> > + psrad m8, INTERP_SHIFT_PS
> > + psrad m9, INTERP_SHIFT_PS
> > + psrad m10, INTERP_SHIFT_PS
> > + psrad m11, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -8251,7 +8248,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m7, [pd_32]
> > %elifidn %1, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -8315,14 +8312,14 @@
> > paddd m0, m7
> > paddd m2, m7
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m2, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m2, 10
> > -%else
> > - psrad m0, 2
> > - psrad m2, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -8366,14 +8363,14 @@
> > paddd m4, m7
> > paddd m1, m7
> > %ifidn %1,pp
> > - psrad m4, 6
> > - psrad m1, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m4, 10
> > - psrad m1, 10
> > -%else
> > - psrad m4, 2
> > - psrad m1, 2
> > + psrad m4, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > +%else
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -8458,14 +8455,14 @@
> > paddd m0, m7
> > paddd m2, m7
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m2, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m2, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m2, 10
> > -%else
> > - psrad m0, 2
> > - psrad m2, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -8516,14 +8513,14 @@
> > paddd m4, m7
> > paddd m1, m7
> > %ifidn %1,pp
> > - psrad m4, 6
> > - psrad m1, 6
> > + psrad m4, INTERP_SHIFT_PP
> > + psrad m1, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m4, 10
> > - psrad m1, 10
> > -%else
> > - psrad m4, 2
> > - psrad m1, 2
> > + psrad m4, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > +%else
> > + psrad m4, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -8574,14 +8571,14 @@
> > paddd m6, m7
> > paddd m5, m7
> > %ifidn %1,pp
> > - psrad m6, 6
> > - psrad m5, 6
> > + psrad m6, INTERP_SHIFT_PP
> > + psrad m5, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m6, 10
> > - psrad m5, 10
> > -%else
> > - psrad m6, 2
> > - psrad m5, 2
> > + psrad m6, INTERP_SHIFT_SP
> > + psrad m5, INTERP_SHIFT_SP
> > +%else
> > + psrad m6, INTERP_SHIFT_PS
> > + psrad m5, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -8625,14 +8622,14 @@
> > paddd m0, m7
> > paddd m3, m7
> > %ifidn %1,pp
> > - psrad m0, 6
> > - psrad m3, 6
> > + psrad m0, INTERP_SHIFT_PP
> > + psrad m3, INTERP_SHIFT_PP
> > %elifidn %1, sp
> > - psrad m0, 10
> > - psrad m3, 10
> > -%else
> > - psrad m0, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> > +%else
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> > %endif
> > %endif
> >
> > @@ -8671,7 +8668,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m7, [pd_32]
> > %elifidn %1, sp
> > - mova m7, [pd_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m7, [INTERP_OFFSET_PS]
> > %endif
> > @@ -8706,7 +8703,7 @@
> > %ifidn %1,pp
> > vbroadcasti128 m14, [pd_32]
> > %elifidn %1, sp
> > - mova m14, [pd_524800]
> > + mova m14, [INTERP_OFFSET_SP]
> > %else
> > vbroadcasti128 m14, [INTERP_OFFSET_PS]
> > %endif
> > @@ -8758,10 +8755,10 @@
> > paddd m2, m7
> > paddd m3, m7
> >
> > - psrad m0, 2
> > - psrad m1, 2
> > - psrad m2, 2
> > - psrad m3, 2
> > + psrad m0, INTERP_SHIFT_PS
> > + psrad m1, INTERP_SHIFT_PS
> > + psrad m2, INTERP_SHIFT_PS
> > + psrad m3, INTERP_SHIFT_PS
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -8784,7 +8781,6 @@
> >
> > dec dword [rsp]
> > jnz .loopH
> > -
> > RET
> > %endmacro
> >
> > @@ -8837,7 +8833,7 @@
> > lea r6, [tab_LumaCoeffV + r4]
> > %endif
> >
> > - mova m7, [tab_c_524800]
> > + mova m7, [INTERP_OFFSET_SP]
> >
> > mov dword [rsp], %2/4
> > .loopH:
> > @@ -8850,10 +8846,10 @@
> > paddd m2, m7
> > paddd m3, m7
> >
> > - psrad m0, 10
> > - psrad m1, 10
> > - psrad m2, 10
> > - psrad m3, 10
> > + psrad m0, INTERP_SHIFT_SP
> > + psrad m1, INTERP_SHIFT_SP
> > + psrad m2, INTERP_SHIFT_SP
> > + psrad m3, INTERP_SHIFT_SP
> >
> > packssdw m0, m1
> > packssdw m2, m3
> > @@ -8879,7 +8875,6 @@
> >
> > dec dword [rsp]
> > jnz .loopH
> > -
> > RET
> > %endmacro
> >
> > @@ -8963,7 +8958,6 @@
> >
> > dec dword [rsp]
> > jnz .loopH
> > -
> > RET
> > %endmacro
> >
> > @@ -9011,7 +9005,7 @@
> > %rep %1/4
> > movd m0, [r0]
> > movhps m0, [r0 + r1]
> > - psllw m0, 4
> > + psllw m0, (14 - BIT_DEPTH)
> > psubw m0, m1
> >
> > movd [r2 + r3 * 0], m0
> > @@ -9019,7 +9013,7 @@
> >
> > movd m0, [r0 + r1 * 2]
> > movhps m0, [r0 + r4]
> > - psllw m0, 4
> > + psllw m0, (14 - BIT_DEPTH)
> > psubw m0, m1
> >
> > movd [r2 + r3 * 2], m0
> > @@ -10293,14 +10287,13 @@
> > mov r4d, r4m
> > add r1d, r1d
> > add r3d, r3d
> > -%ifdef PIC
> > -
> > +
> > +%ifdef PIC
> > lea r6, [tab_LumaCoeff]
> > - lea r4 , [r4 * 8]
> > + lea r4, [r4 * 8]
> > vbroadcasti128 m0, [r6 + r4 * 2]
> > -
> > -%else
> > - lea r4 , [r4 * 8]
> > +%else
> > + lea r4, [r4 * 8]
> > vbroadcasti128 m0, [tab_LumaCoeff + r4 * 2]
> > %endif
> >
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/loopfilter.asm
> > --- a/source/common/x86/loopfilter.asm Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/loopfilter.asm Tue Jul 21 14:30:11 2015 -0700
> > @@ -39,7 +39,7 @@
> > cextern pb_128
> > cextern pb_2
> > cextern pw_2
> > -cextern pw_1023
> > +cextern pw_pixel_max
> > cextern pb_movemask
> > cextern pw_1
> > cextern hmul_16p
> > @@ -81,7 +81,7 @@
> > palignr m2, m3, m5, 15
> > por m2, m0
> >
> > - mova m4, [pw_1023]
> > + mova m4, [pw_pixel_max]
> > psignb m2, [pb_128] ; m2 = signLeft
> > pxor m0, m0
> > palignr m0, m3, 15
> > @@ -127,7 +127,7 @@
> > palignr m2, m3, m5, 15
> > por m2, m0
> >
> > - mova m4, [pw_1023]
> > + mova m4, [pw_pixel_max]
> > psignb m2, [pb_128] ; m2 = signLeft
> > pxor m0, m0
> > palignr m0, m3, 15
> > @@ -249,7 +249,7 @@
> > neg r1b
> > movd xm1, r1d
> > vinserti128 m0, m0, xm1, 1
> > - mova m5, [pw_1023]
> > + mova m5, [pw_pixel_max]
> > mov r1d, r4m
> > add r1d, r1d
> > shr r2d, 4
> > @@ -402,8 +402,8 @@
> >
> > pmaxsw m7, m0
> > pmaxsw m5, m0
> > - pminsw m7, [pw_1023]
> > - pminsw m5, [pw_1023]
> > + pminsw m7, [pw_pixel_max]
> > + pminsw m5, [pw_pixel_max]
> >
> > movu [r0], m7
> > movu [r0 + 16], m5
> > @@ -468,7 +468,7 @@
> > mov r4d, r4m
> > mova m4, [pb_2]
> > shr r4d, 4
> > - mova m0, [pw_1023]
> > + mova m0, [pw_pixel_max]
> > .loop
> > movu m5, [r0]
> > movu m3, [r0 + r3]
> > @@ -559,7 +559,7 @@
> > add r3d, r3d
> > mov r4d, r4m
> > pxor m0, m0 ; m0 = 0
> > - mova m6, [pw_1023]
> > + mova m6, [pw_pixel_max]
> > mov r5d, r4d
> > shr r4d, 4
> > mov r6, r0
> > @@ -736,7 +736,7 @@
> > cglobal saoCuOrgE1_2Rows, 4,5,8
> > add r3d, r3d
> > mov r4d, r4m
> > - mova m4, [pw_1023]
> > + mova m4, [pw_pixel_max]
> > vbroadcasti128 m6, [r2] ; m6 = m_iOffsetEo
> > shr r4d, 4
> > .loop
> > @@ -884,8 +884,8 @@
> > paddw m5, m4
> > pmaxsw m7, m0
> > pmaxsw m5, m0
> > - pminsw m7, [pw_1023]
> > - pminsw m5, [pw_1023]
> > + pminsw m7, [pw_pixel_max]
> > + pminsw m5, [pw_pixel_max]
> > movu [r0], m7
> > movu [r0 + 16], m5
> >
> > @@ -960,7 +960,7 @@
> > movq xm4, [r0 + r4 * 2]
> > movhps xm4, [r1 + r4]
> > vbroadcasti128 m5, [r3]
> > - mova m6, [pw_1023]
> > + mova m6, [pw_pixel_max]
> > .loop
> > movu m1, [r0]
> > movu m3, [r0 + r5 + 2]
> > @@ -1086,8 +1086,8 @@
> > paddw m7, m6
> > pmaxsw m1, m0
> > pmaxsw m7, m0
> > - pminsw m1, [pw_1023]
> > - pminsw m7, [pw_1023]
> > + pminsw m1, [pw_pixel_max]
> > + pminsw m7, [pw_pixel_max]
> > movu [r0], m1
> > movu [r0 + 32], m7
> >
> > @@ -1212,8 +1212,8 @@
> > paddw m5, m4
> > pmaxsw m7, m0
> > pmaxsw m5, m0
> > - pminsw m7, [pw_1023]
> > - pminsw m5, [pw_1023]
> > + pminsw m7, [pw_pixel_max]
> > + pminsw m5, [pw_pixel_max]
> > movu [r0], m7
> > movu [r0 + 16], m5
> >
> > @@ -1333,7 +1333,7 @@
> > paddw m1, m3
> > pxor m0, m0
> > pmaxsw m1, m0
> > - pminsw m1, [pw_1023]
> > + pminsw m1, [pw_pixel_max]
> > movu [r0], m1
> >
> > psubb xm0, xm2
> > @@ -1461,8 +1461,8 @@
> > pxor m0, m0
> > pmaxsw m1, m0
> > pmaxsw m7, m0
> > - pminsw m1, [pw_1023]
> > - pminsw m7, [pw_1023]
> > + pminsw m1, [pw_pixel_max]
> > + pminsw m7, [pw_pixel_max]
> > movu [r0], m1
> > movu [r0 + 32], m7
> >
> > @@ -1565,8 +1565,8 @@
> > .loopW
> > movu m2, [r0 + r6]
> > movu m5, [r0 + r6 + 16]
> > - psrlw m0, m2, 5
> > - psrlw m6, m5, 5
> > + psrlw m0, m2, (BIT_DEPTH - 5)
> > + psrlw m6, m5, (BIT_DEPTH - 5)
> > packuswb m0, m6
> > pand m0, [pb_31] ; m0 = [index]
> >
> > @@ -1584,8 +1584,8 @@
> > paddw m5, m6
> > pmaxsw m2, m7
> > pmaxsw m5, m7
> > - pminsw m2, [pw_1023]
> > - pminsw m5, [pw_1023]
> > + pminsw m2, [pw_pixel_max]
> > + pminsw m5, [pw_pixel_max]
> >
> > movu [r0 + r6], m2
> > movu [r0 + r6 + 16], m5
> > @@ -1656,7 +1656,7 @@
> > sub r1d, r2d
> > sub r1d, r2d
> > shr r2d, 4
> > - mova m7, [pw_1023]
> > + mova m7, [pw_pixel_max]
> >
> > mov r6d, r3d
> > shr r3d, 1
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/mc-a.asm
> > --- a/source/common/x86/mc-a.asm Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/mc-a.asm Tue Jul 21 14:30:11 2015 -0700
> > @@ -32,6 +32,19 @@
> > %include "x86inc.asm"
> > %include "x86util.asm"
> >
> > +%if BIT_DEPTH==8
> > + %define ADDAVG_FACTOR 256
> > + %define ADDAVG_ROUND 128
> > +%elif BIT_DEPTH==10
> > + %define ADDAVG_FACTOR 1024
> > + %define ADDAVG_ROUND 512
> > +%elif BIT_DEPTH==12
> > + %define ADDAVG_FACTOR 4096
> > + %define ADDAVG_ROUND 2048
> > +%else
> > + %error Unsupport bit depth!
> > +%endif
> > +
> > SECTION_RODATA 32
> >
> > ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
> > @@ -54,6 +67,8 @@
> > cextern pw_512
> > cextern pw_1023
> > cextern pw_1024
> > +cextern pw_2048
> > +cextern pw_4096
> > cextern pw_00ff
> > cextern pw_pixel_max
> > cextern pd_32
> > @@ -92,23 +107,24 @@
> > punpcklqdq m1, m2
> > punpcklqdq m3, m5
> > paddw m1, m3
> > - pmulhrsw m1, [pw_1024]
> > - paddw m1, [pw_512]
> > + pmulhrsw m1, [pw_ %+ ADDAVG_FACTOR]
> > + paddw m1, [pw_ %+ ADDAVG_ROUND]
> >
> > pxor m0, m0
> > pmaxsw m1, m0
> > - pminsw m1, [pw_1023]
> > + pminsw m1, [pw_pixel_max]
> > movd [r2], m1
> > pextrd [r2 + r5], m1, 1
> > lea r2, [r2 + 2 * r5]
> > pextrd [r2], m1, 2
> > pextrd [r2 + r5], m1, 3
> > -
> > RET
> > +
> > +
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> > cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m0, [pw_512]
> > + mova m0, [pw_ %+ ADDAVG_ROUND]
> > pxor m7, m7
> > add r3, r3
> > add r4, r4
> > @@ -136,11 +152,11 @@
> > punpcklqdq m1, m2
> > punpcklqdq m3, m5
> > paddw m1, m3
> > - pmulhrsw m1, [pw_1024]
> > + pmulhrsw m1, [pw_ %+ ADDAVG_FACTOR]
> > paddw m1, m0
> >
> > pmaxsw m1, m7
> > - pminsw m1, [pw_1023]
> > + pminsw m1, [pw_pixel_max]
> > movd [r2], m1
> > pextrd [r2 + r5], m1, 1
> > lea r2, [r2 + 2 * r5]
> > @@ -156,8 +172,8 @@
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> > cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m6, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m6, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > mov r6d, 16/4
> > add r3, r3
> > add r4, r4
> > @@ -183,7 +199,7 @@
> > punpcklqdq m3, m5
> > paddw m1, m3
> > pmulhrsw m1, m7
> > - paddw m1, [pw_512]
> > + paddw m1, [pw_ %+ ADDAVG_ROUND]
> > pxor m0, m0
> > pmaxsw m1, m0
> > pminsw m1, m6
> > @@ -213,21 +229,21 @@
> > punpcklqdq m0, m1
> > punpcklqdq m2, m3
> > paddw m0, m2
> > - pmulhrsw m0, [pw_1024]
> > - paddw m0, [pw_512]
> > + pmulhrsw m0, [pw_ %+ ADDAVG_FACTOR]
> > + paddw m0, [pw_ %+ ADDAVG_ROUND]
> >
> > pxor m6, m6
> > pmaxsw m0, m6
> > - pminsw m0, [pw_1023]
> > + pminsw m0, [pw_pixel_max]
> > movh [r2], m0
> > movhps [r2 + r5], m0
> > RET
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> > cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -264,9 +280,9 @@
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> > cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > mov r6d, 16/2
> > add r3, r3
> > @@ -300,9 +316,9 @@
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> > cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -331,9 +347,9 @@
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> > cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -370,9 +386,9 @@
> > %macro ADDAVG_W4_H4 1
> > INIT_XMM sse4
> > cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -420,9 +436,9 @@
> > %macro ADDAVG_W8_H4 1
> > INIT_XMM sse4
> > cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -470,9 +486,9 @@
> > %macro ADDAVG_W12_H4 1
> > INIT_XMM sse4
> > cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -532,9 +548,9 @@
> > %macro ADDAVG_W16_H4 1
> > INIT_XMM sse4
> > cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -601,9 +617,9 @@
> > %macro ADDAVG_W24_H2 2
> > INIT_XMM sse4
> > cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -683,9 +699,9 @@
> > %macro ADDAVG_W32_H2 1
> > INIT_XMM sse4
> > cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -787,9 +803,9 @@
> > %macro ADDAVG_W48_H2 1
> > INIT_XMM sse4
> > cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -921,9 +937,9 @@
> > %macro ADDAVG_W64_H1 1
> > INIT_XMM sse4
> > cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m7, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m7, [pw_ %+ ADDAVG_FACTOR]
> > pxor m6, m6
> > add r3, r3
> > add r4, r4
> > @@ -1029,19 +1045,19 @@
> >
> > paddw m0, m1
> > pxor m1, m1
> > - pmulhrsw m0, [pw_1024]
> > - paddw m0, [pw_512]
> > + pmulhrsw m0, [pw_ %+ ADDAVG_FACTOR]
> > + paddw m0, [pw_ %+ ADDAVG_ROUND]
> > pmaxsw m0, m1
> > - pminsw m0, [pw_1023]
> > + pminsw m0, [pw_pixel_max]
> > vextracti128 xm1, m0, 1
> > movu [r2], xm0
> > movu [r2 + r5 * 2], xm1
> > RET
> >
> > cglobal addAvg_8x6, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m3, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m3, [pw_ %+ ADDAVG_FACTOR]
> > pxor m1, m1
> > add r3d, r3d
> > add r4d, r4d
> > @@ -1100,9 +1116,9 @@
> >
> > %macro ADDAVG_W8_H4_AVX2 1
> > cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m3, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m3, [pw_ %+ ADDAVG_FACTOR]
> > pxor m1, m1
> > add r3d, r3d
> > add r4d, r4d
> > @@ -1159,9 +1175,9 @@
> > ADDAVG_W8_H4_AVX2 64
> >
> > cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m3, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m3, [pw_ %+ ADDAVG_FACTOR]
> > pxor m1, m1
> > add r3, r3
> > add r4, r4
> > @@ -1201,8 +1217,8 @@
> > RET
> >
> > cglobal addAvg_12x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > paddw m3, m4, m4
> > pxor m1, m1
> > add r3, r3
> > @@ -1244,9 +1260,9 @@
> >
> > %macro ADDAVG_W16_H4_AVX2 1
> > cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m3, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m3, [pw_ %+ ADDAVG_FACTOR]
> > pxor m2, m2
> > add r3, r3
> > add r4, r4
> > @@ -1291,9 +1307,9 @@
> > ADDAVG_W16_H4_AVX2 64
> >
> > cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m3, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m3, [pw_ %+ ADDAVG_FACTOR]
> > pxor m1, m1
> > add r3, r3
> > add r4, r4
> > @@ -1347,8 +1363,8 @@
> > RET
> >
> > cglobal addAvg_24x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > paddw m3, m4, m4
> > pxor m1, m1
> > add r3, r3
> > @@ -1404,9 +1420,9 @@
> >
> > %macro ADDAVG_W32_H2_AVX2 1
> > cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m3, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m3, [pw_ %+ ADDAVG_FACTOR]
> > pxor m2, m2
> > add r3, r3
> > add r4, r4
> > @@ -1468,9 +1484,9 @@
> > ADDAVG_W32_H2_AVX2 64
> >
> > cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m3, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m3, [pw_ %+ ADDAVG_FACTOR]
> > pxor m2, m2
> > add r3, r3
> > add r4, r4
> > @@ -1543,9 +1559,9 @@
> >
> > %macro ADDAVG_W64_H1_AVX2 1
> > cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > - mova m4, [pw_512]
> > - mova m5, [pw_1023]
> > - mova m3, [pw_1024]
> > + mova m4, [pw_ %+ ADDAVG_ROUND]
> > + mova m5, [pw_pixel_max]
> > + mova m3, [pw_ %+ ADDAVG_FACTOR]
> > pxor m2, m2
> > add r3d, r3d
> > add r4d, r4d
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/pixel-util8.asm
> > --- a/source/common/x86/pixel-util8.asm Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/pixel-util8.asm Tue Jul 21 14:30:11 2015 -0700
> > @@ -879,8 +879,8 @@
> > %if HIGH_BIT_DEPTH
> > cmp r3d, 32767
> > jle .skip
> > - shr r3d, 2
> > - sub r4d, 2
> > + shr r3d, (BIT_DEPTH - 8)
> > + sub r4d, (BIT_DEPTH - 8)
> > .skip:
> > %endif
> > movd m0, r4d ; m0 = shift
> > @@ -1273,13 +1273,7 @@
> > INIT_XMM sse4
> > cglobal weight_pp, 4,7,7
> > %define correction (14 - BIT_DEPTH)
> > -%if BIT_DEPTH == 10
> > - mova m6, [pw_1023]
> > -%elif BIT_DEPTH == 12
> > - mova m6, [pw_3fff]
> > -%else
> > - %error Unsupported BIT_DEPTH!
> > -%endif
> > + mova m6, [pw_pixel_max]
> > mov r6d, r6m
> > mov r4d, r4m
> > mov r5d, r5m
> > @@ -1423,7 +1417,7 @@
> > movd xm1, r7m
> > vpbroadcastd m2, r8m
> > mova m5, [pw_1]
> > - mova m6, [pw_1023]
> > + mova m6, [pw_pixel_max]
> > add r2d, r2d
> > add r3d, r3d
> > sub r2d, r3d
> > @@ -1516,13 +1510,7 @@
> > %if HIGH_BIT_DEPTH
> > INIT_XMM sse4
> > cglobal weight_sp, 6,7,8
> > -%if BIT_DEPTH == 10
> > - mova m1, [pw_1023]
> > -%elif BIT_DEPTH == 12
> > - mova m1, [pw_3fff]
> > -%else
> > - %error Unsupported BIT_DEPTH!
> > -%endif
> > + mova m1, [pw_pixel_max]
> > mova m2, [pw_1]
> > mov r6d, r7m
> > shl r6d, 16
> > @@ -1681,7 +1669,7 @@
> > %if HIGH_BIT_DEPTH
> > INIT_YMM avx2
> > cglobal weight_sp, 6,7,9
> > - mova m1, [pw_1023]
> > + mova m1, [pw_pixel_max]
> > mova m2, [pw_1]
> > mov r6d, r7m
> > shl r6d, 16
> >
> >
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
> >
>
> --
> Steve Borho
--
Steve Borho
More information about the x265-devel
mailing list