[x265] [PATCH 1 of 4] asm: fix Main12 assembly errors and disable faulty functions; assembly now works up to AVX
Steve Borho
steve at borho.org
Wed Jul 22 18:25:56 CEST 2015
On 07/21, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1437514211 25200
> # Node ID ab2c34d6ad913369fd8feb84aee10030ffaa0df5
> # Parent 46152345eb6ff261fd90272f7a0712300d6324c0
> asm: fix Main12 assembly errors and disable faulty functions; assembly now works up to AVX
nice! queued for smoke testing
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/asm-primitives.cpp Tue Jul 21 14:30:11 2015 -0700
> @@ -1043,7 +1043,9 @@
>
> // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
> ALL_LUMA_PU(satd, pixel_satd, ssse3);
> +#if X265_DEPTH <= 10
> ASSIGN_SA8D(ssse3);
> +#endif
> INTRA_ANG_SSSE3(ssse3);
>
> p.dst4x4 = PFX(dst4_ssse3);
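
The new X265_DEPTH guards act as a build-time kill switch: for Main12 the
guarded assignments are compiled out and those primitives stay on the C
reference code until the SIMD is validated for 12-bit. A minimal sketch of
the pattern (illustrative only; the real assignments go through the
ASSIGN_SA8D macro):

    // sa8d SIMD assumes 10-bit dynamic range in 16-bit lanes,
    // so leave the C primitive in place for X265_DEPTH == 12
    #if X265_DEPTH <= 10
        p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_ssse3);
    #endif
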
> @@ -1126,14 +1128,18 @@
>
> // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
> ALL_LUMA_PU(satd, pixel_satd, sse4);
> +#if X265_DEPTH <= 10
> ASSIGN_SA8D(sse4);
> +#endif
>
> p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
> p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
> p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
> p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
>
> +#if X265_DEPTH <= 10
> ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
> +#endif
> ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
> INTRA_ANG_SSE4_COMMON(sse4);
> INTRA_ANG_SSE4_HIGH(sse4);
> @@ -1147,7 +1153,9 @@
>
> // TODO: check POPCNT flag!
> ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
> +#if X265_DEPTH <= 10
> ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
> +#endif
> ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
>
> p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
> @@ -1184,7 +1192,9 @@
> p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
> p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
> p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
> +#if X265_DEPTH <= 10
> ASSIGN_SA8D(avx);
> +#endif
> p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
> p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
> p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
> @@ -1292,7 +1302,9 @@
> {
> //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
> ALL_LUMA_PU(satd, pixel_satd, xop);
> +#if X265_DEPTH <= 10
> ASSIGN_SA8D(xop);
> +#endif
> LUMA_VAR(xop);
> p.frameInitLowres = PFX(frame_init_lowres_core_xop);
> }
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/const-a.asm Tue Jul 21 14:30:11 2015 -0700
> @@ -79,6 +79,7 @@
> const pw_512, times 16 dw 512
> const pw_1023, times 16 dw 1023
> const pw_1024, times 16 dw 1024
> +const pw_2048, times 16 dw 2048
> const pw_4096, times 16 dw 4096
> const pw_8192, times 8 dw 8192
> const pw_00ff, times 16 dw 0x00ff
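
pw_2048 is the 12-bit addAvg rounding constant (2048 = 1 << (12 - 1)); the
matching factor pw_4096 (1 << 12) already exists a line above. The mc-a.asm
hunk later in this patch selects these via ADDAVG_ROUND/ADDAVG_FACTOR instead
of the hard-coded 10-bit pw_512/pw_1024 pair.
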
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/intrapred16.asm
> --- a/source/common/x86/intrapred16.asm Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/intrapred16.asm Tue Jul 21 14:30:11 2015 -0700
> @@ -1748,7 +1748,7 @@
> ; filter top
> movu m1, [r2]
> paddw m1, m0
> - psraw m1, 2
> + psrlw m1, 2
> movh [r0], m1 ; overwrite top-left pixel, we will update it later
>
> ; filter top-left
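
The psraw -> psrlw changes swap an arithmetic right shift for a logical one.
These filter sums are unsigned, so the two only differ once an intermediate
sets bit 15, where psraw would sign-extend and corrupt the result, e.g.:

    0x8002 >> 2 : psraw -> 0xE000 (sign-extended), psrlw -> 0x2000 (correct)

a headroom concern the larger 12-bit intermediate values can now reach.
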
> @@ -1763,7 +1763,7 @@
> lea r0, [r0 + r1 * 2]
> movu m1, [r3 + 2]
> paddw m1, m0
> - psraw m1, 2
> + psrlw m1, 2
> movd r3d, m1
> mov [r0], r3w
> shr r3d, 16
> @@ -1872,7 +1872,7 @@
> ; filter top
> movu m0, [r2]
> paddw m0, m1
> - psraw m0, 2
> + psrlw m0, 2
> movu [r6], m0
>
> ; filter top-left
> @@ -1887,7 +1887,7 @@
> add r6, r1
> movu m0, [r3 + 2]
> paddw m0, m1
> - psraw m0, 2
> + psrlw m0, 2
> pextrw [r6], m0, 0
> pextrw [r6 + r1], m0, 1
> pextrw [r6 + r1 * 2], m0, 2
> @@ -1913,13 +1913,13 @@
> movu m2, [r2]
> movu m3, [r2 + 16]
>
> - paddw m0, m1
> + paddw m0, m1 ; dynamic range 13 bits
> paddw m2, m3
> - paddw m0, m2
> - movhlps m1, m0
> - paddw m0, m1
> - phaddw m0, m0
> + paddw m0, m2 ; dynamic range 14 bits
> + movhlps m1, m0 ; dynamic range 15 bits
> + paddw m0, m1 ; dynamic range 16 bits
> pmaddwd m0, [pw_1]
> + phaddd m0, m0
>
> movd r5d, m0
> add r5d, 16
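
This hunk fixes a genuine overflow in the DC sum: per the new dynamic-range
comments, the running 16-bit word sums already reach the signed-word limit
at 12-bit depth (8 x 4095 = 32760, just under 32768), so the old phaddw,
which would produce sums of sixteen pixels (up to 65520), could wrap. The
fix widens first - pmaddwd against pw_1 sums adjacent word pairs into 32-bit
lanes - and finishes the horizontal reduction with phaddd.
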
> @@ -1983,11 +1983,11 @@
> ; filter top
> movu m2, [r2]
> paddw m2, m1
> - psraw m2, 2
> + psrlw m2, 2
> movu [r6], m2
> movu m3, [r2 + 16]
> paddw m3, m1
> - psraw m3, 2
> + psrlw m3, 2
> movu [r6 + 16], m3
>
> ; filter top-left
> @@ -2002,7 +2002,7 @@
> add r6, r1
> movu m2, [r3 + 2]
> paddw m2, m1
> - psraw m2, 2
> + psrlw m2, 2
>
> pextrw [r6], m2, 0
> pextrw [r6 + r1], m2, 1
> @@ -2019,7 +2019,7 @@
> lea r6, [r6 + r1 * 2]
> movu m3, [r3 + 18]
> paddw m3, m1
> - psraw m3, 2
> + psrlw m3, 2
>
> pextrw [r6], m3, 0
> pextrw [r6 + r1], m3, 1
> @@ -2046,21 +2046,21 @@
> movu m1, [r3 + 16]
> movu m2, [r3 + 32]
> movu m3, [r3 + 48]
> - paddw m0, m1
> + paddw m0, m1 ; dynamic range 13 bits
> paddw m2, m3
> - paddw m0, m2
> + paddw m0, m2 ; dynamic range 14 bits
> movu m1, [r2]
> movu m3, [r2 + 16]
> movu m4, [r2 + 32]
> movu m5, [r2 + 48]
> - paddw m1, m3
> + paddw m1, m3 ; dynamic range 13 bits
> paddw m4, m5
> - paddw m1, m4
> - paddw m0, m1
> + paddw m1, m4 ; dynamic range 14 bits
> + paddw m0, m1 ; dynamic range 15 bits
> + pmaddwd m0, [pw_1]
> movhlps m1, m0
> - paddw m0, m1
> - phaddw m0, m0
> - pmaddwd m0, [pw_1]
> + paddd m0, m1
> + phaddd m0, m0
>
> paddd m0, [pd_32] ; sum = sum + 32
> psrld m0, 6 ; sum = sum / 64
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/ipfilter16.asm
> --- a/source/common/x86/ipfilter16.asm Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/ipfilter16.asm Tue Jul 21 14:30:11 2015 -0700
> @@ -26,6 +26,25 @@
> %include "x86inc.asm"
> %include "x86util.asm"
>
> +
> +%define INTERP_OFFSET_PP pd_32
> +%define INTERP_SHIFT_PP 6
> +
> +%if BIT_DEPTH == 10
> + %define INTERP_SHIFT_PS 2
> + %define INTERP_OFFSET_PS pd_n32768
> + %define INTERP_SHIFT_SP 10
> + %define INTERP_OFFSET_SP pd_524800
> +%elif BIT_DEPTH == 12
> + %define INTERP_SHIFT_PS 4
> + %define INTERP_OFFSET_PS pd_n131072
> + %define INTERP_SHIFT_SP 8
> + %define INTERP_OFFSET_SP pd_524416
> +%else
> + %error Unsupported bit depth!
> +%endif
> +
> +
> SECTION_RODATA 32
>
> tab_c_32: times 8 dd 32
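
Hoisting these %defines above SECTION_RODATA lets the data section reference
them as well. For reference, the constants follow the usual HEVC
interpolation arithmetic (worked out here, not stated in the patch):

    INTERP_SHIFT_PS  = BIT_DEPTH - 8                     ; 2 / 4
    INTERP_OFFSET_PS = -(8192 << INTERP_SHIFT_PS)        ; -32768 / -131072
    INTERP_SHIFT_SP  = 6 + (14 - BIT_DEPTH)              ; 10 / 8
    INTERP_OFFSET_SP = (8192 << 6) + (1 << (INTERP_SHIFT_SP - 1))
                                                         ; 524800 / 524416

with 8192 = 1 << 13 being the 14-bit internal-precision offset, and each
pair giving the 10-bit / 12-bit values. INTERP_OFFSET_PP stays pd_32 with
shift 6 because the pp rounding does not depend on bit depth.
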
> @@ -145,21 +164,9 @@
> const pb_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
> db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
>
> -%if BIT_DEPTH == 10
> - %define INTERP_OFFSET_PS pd_n32768
> - %define INTERP_SHIFT_PS 2
> - %define INTERP_OFFSET_SP pd_524800
> - %define INTERP_SHIFT_SP 10
> -%elif BIT_DEPTH == 12
> - %define INTERP_OFFSET_PS pd_n131072
> - %define INTERP_SHIFT_PS 4
> - %define INTERP_OFFSET_SP pd_524416
> - %define INTERP_SHIFT_SP 8
> -%else
> - %error Unsupport bit depth!
> -%endif
>
> SECTION .text
> +cextern pd_8
> cextern pd_32
> cextern pw_pixel_max
> cextern pd_524416
> @@ -503,7 +510,7 @@
> %endif
>
> %ifidn %1,pp
> - mova m7, [pd_32]
> + mova m7, [INTERP_OFFSET_PP]
> %define SHIFT 6
> %elifidn %1,ps
> mova m7, [INTERP_OFFSET_PS]
> @@ -1176,7 +1183,6 @@
> %macro FILTER_HOR_LUMA_W4 3
> INIT_XMM sse4
> cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
> -
> mov r4d, r4m
> sub r0, 6
> shl r4d, 4
> @@ -1229,7 +1235,7 @@
> packusdw m4, m4
> CLIPW m4, m6, m7
> %else
> - psrad m4, 2
> + psrad m4, INTERP_SHIFT_PS
> packssdw m4, m4
> %endif
>
> @@ -1287,7 +1293,7 @@
> mov r4d, %2
> %ifidn %3, ps
> cmp r5m, byte 0
> - je .loopH
> + je .loopH
> lea r6, [r1 + 2 * r1]
> sub r0, r6
> add r4d, 7
> @@ -1329,8 +1335,8 @@
> packusdw m4, m5
> CLIPW m4, m7, [pw_pixel_max]
> %else
> - psrad m4, 2
> - psrad m5, 2
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m4, m5
> %endif
>
> @@ -1340,7 +1346,7 @@
> add r2, r3
>
> dec r4d
> - jnz .loopH
> + jnz .loopH
> RET
> %endmacro
>
> @@ -1380,7 +1386,7 @@
> mova m0, [tab_LumaCoeff + r4]
> %endif
> %ifidn %3, pp
> - mova m1, [pd_32]
> + mova m1, [INTERP_OFFSET_PP]
> %else
> mova m1, [INTERP_OFFSET_PS]
> %endif
> @@ -1425,14 +1431,14 @@
> phaddd m5, m6
> paddd m5, m1
> %ifidn %3, pp
> - psrad m4, 6
> - psrad m5, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m4, m5
> pxor m5, m5
> CLIPW m4, m5, [pw_pixel_max]
> %else
> - psrad m4, 2
> - psrad m5, 2
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m4, m5
> %endif
>
> @@ -1453,12 +1459,12 @@
> phaddd m4, m5
> paddd m4, m1
> %ifidn %3, pp
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
> packusdw m4, m4
> pxor m5, m5
> CLIPW m4, m5, [pw_pixel_max]
> %else
> - psrad m4, 2
> + psrad m4, INTERP_SHIFT_PS
> packssdw m4, m4
> %endif
>
> @@ -1550,14 +1556,14 @@
> phaddd m5, m6
> paddd m5, m1
> %ifidn %3, pp
> - psrad m4, 6
> - psrad m5, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m4, m5
> pxor m5, m5
> CLIPW m4, m5, [pw_pixel_max]
> %else
> - psrad m4, 2
> - psrad m5, 2
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m4, m5
> %endif
> movu [r2 + x], m4
> @@ -1591,14 +1597,14 @@
> phaddd m5, m6
> paddd m5, m1
> %ifidn %3, pp
> - psrad m4, 6
> - psrad m5, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m4, m5
> pxor m5, m5
> CLIPW m4, m5, [pw_pixel_max]
> %else
> - psrad m4, 2
> - psrad m5, 2
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m4, m5
> %endif
> movu [r2 + 16 + x], m4
> @@ -1743,14 +1749,14 @@
> phaddd m5, m6
> paddd m5, m1
> %ifidn %3, pp
> - psrad m4, 6
> - psrad m5, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m4, m5
> pxor m5, m5
> CLIPW m4, m5, [pw_pixel_max]
> %else
> - psrad m4, 2
> - psrad m5, 2
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m4, m5
> %endif
> movu [r2], m4
> @@ -1784,14 +1790,14 @@
> phaddd m5, m6
> paddd m5, m1
> %ifidn %3, pp
> - psrad m4, 6
> - psrad m5, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m4, m5
> pxor m5, m5
> CLIPW m4, m5, [pw_pixel_max]
> %else
> - psrad m4, 2
> - psrad m5, 2
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m4, m5
> %endif
> movu [r2 + 16], m4
> @@ -1825,14 +1831,14 @@
> phaddd m5, m6
> paddd m5, m1
> %ifidn %3, pp
> - psrad m4, 6
> - psrad m5, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m4, m5
> pxor m5, m5
> CLIPW m4, m5, [pw_pixel_max]
> %else
> - psrad m4, 2
> - psrad m5, 2
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m4, m5
> %endif
> movu [r2 + 32], m4
> @@ -1865,11 +1871,11 @@
> phaddd m3, m4
> paddd m3, m1
> %ifidn %1, pp
> - psrad m3, 6
> + psrad m3, INTERP_SHIFT_PP
> packusdw m3, m3
> CLIPW m3, m7, m6
> %else
> - psrad m3, 2
> + psrad m3, INTERP_SHIFT_PS
> packssdw m3, m3
> %endif
> movd [r2], m3
> @@ -1895,13 +1901,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m7, m6
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2], m3
> @@ -1950,7 +1956,7 @@
> phaddd m4, m4
> vpermq m4, m4, q3120
> paddd m4, m6
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -1969,7 +1975,7 @@
> phaddd m4, m4
> vpermq m4, m4, q3120
> paddd m4, m6
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2036,7 +2042,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2064,7 +2070,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2132,7 +2138,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2160,7 +2166,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2232,7 +2238,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2260,7 +2266,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2335,7 +2341,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2363,7 +2369,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2425,7 +2431,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2453,7 +2459,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2481,7 +2487,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2545,7 +2551,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2573,7 +2579,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2601,7 +2607,7 @@
> phaddd m4, m5
> vpermq m4, m4, q3120
> paddd m4, m7
> - psrad m4, 6
> + psrad m4, INTERP_SHIFT_PP
>
> packusdw m4, m4
> vpermq m4, m4, q2020
> @@ -2644,32 +2650,32 @@
> mova m1, [INTERP_OFFSET_PS]
> cmp r5m, byte 0
> je .skip
> - sub r0, r1
> - movu m3, [r0]
> - pshufb m3, m3, m2
> - pmaddwd m3, m0
> -
> - %if %1 == 4
> - movu m4, [r0 + 4]
> - pshufb m4, m4, m2
> - pmaddwd m4, m0
> - phaddd m3, m4
> - %else
> - phaddd m3, m3
> - %endif
> -
> - paddd m3, m1
> - psrad m3, INTERP_SHIFT_PS
> - packssdw m3, m3
> -
> - %if %1 == 2
> - movd [r2], m3
> - %else
> - movh [r2], m3
> - %endif
> -
> - add r0, r1
> - add r2, r3
> + sub r0, r1
> + movu m3, [r0]
> + pshufb m3, m3, m2
> + pmaddwd m3, m0
> +
> + %if %1 == 4
> + movu m4, [r0 + 4]
> + pshufb m4, m4, m2
> + pmaddwd m4, m0
> + phaddd m3, m4
> + %else
> + phaddd m3, m3
> + %endif
> +
> + paddd m3, m1
> + psrad m3, INTERP_SHIFT_PS
> + packssdw m3, m3
> +
> + %if %1 == 2
> + movd [r2], m3
> + %else
> + movh [r2], m3
> + %endif
> +
> + add r0, r1
> + add r2, r3
> FILTER_W%1_2 %3
> lea r0, [r0 + 2 * r1]
> lea r2, [r2 + 2 * r3]
> @@ -2689,7 +2695,6 @@
> lea r2, [r2 + 2 * r3]
> FILTER_W%1_2 %3
> %endrep
> -
> RET
> %endmacro
>
> @@ -2729,13 +2734,13 @@
> phaddd m4, m4
> paddd m4, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m4, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m4, INTERP_SHIFT_PP
> packusdw m3, m4
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m4, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m4, INTERP_SHIFT_PS
> packssdw m3, m4
> %endif
> movh [r2], m3
> @@ -2769,13 +2774,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2], m3
> @@ -2809,13 +2814,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2], m3
> @@ -2831,11 +2836,11 @@
> paddd m3, m1
>
> %ifidn %1, pp
> - psrad m3, 6
> + psrad m3, INTERP_SHIFT_PP
> packusdw m3, m3
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> + psrad m3, INTERP_SHIFT_PS
> packssdw m3, m3
> %endif
> movh [r2 + 16], m3
> @@ -2868,13 +2873,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2], m3
> @@ -2898,13 +2903,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2 + 16], m3
> @@ -2938,13 +2943,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2], m3
> @@ -2968,13 +2973,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2 + 16], m3
> @@ -2998,13 +3003,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2 + 32], m3
> @@ -3038,13 +3043,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2], m3
> @@ -3068,13 +3073,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2 + 16], m3
> @@ -3098,13 +3103,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2 + 32], m3
> @@ -3128,13 +3133,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2 + 48], m3
> @@ -3168,13 +3173,13 @@
> phaddd m5, m4
> paddd m5, m1
> %ifidn %1, pp
> - psrad m3, 6
> - psrad m5, 6
> + psrad m3, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> packusdw m3, m5
> CLIPW m3, m6, m7
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2 + %2], m3
> @@ -3408,7 +3413,7 @@
> pmaddwd m4, m0
> phaddd m3, m4
> paddd m3, m2
> - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
> + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
>
> packusdw m3, m3
> vpermq m3, m3, q2020
> @@ -3426,7 +3431,7 @@
> pmaddwd m4, m0
> phaddd m3, m4
> paddd m3, m2
> - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
> + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
>
> packusdw m3, m3
> vpermq m3, m3, q2020
> @@ -3474,7 +3479,7 @@
> pmaddwd m4, m0
> phaddd m3, m4
> paddd m3, m2
> - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
> + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
>
> packusdw m3, m3
> vpermq m3, m3,q2020
> @@ -3491,7 +3496,7 @@
> pmaddwd m4, m0
> phaddd m3, m4
> paddd m3, m2
> - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0]
> + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0]
>
> packusdw m3, m3
> vpermq m3, m3,q2020
> @@ -4089,7 +4094,7 @@
> %ifnidn %3, ps
> mova m7, [pw_pixel_max]
> %ifidn %3, pp
> - mova m6, [tab_c_32]
> + mova m6, [INTERP_OFFSET_PP]
> %else
> mova m6, [INTERP_OFFSET_SP]
> %endif
> @@ -4129,10 +4134,10 @@
> paddd m2, m6
> paddd m3, m6
> %ifidn %3, pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> %else
> psrad m0, INTERP_SHIFT_SP
> psrad m1, INTERP_SHIFT_SP
> @@ -4344,9 +4349,9 @@
> pxor m7, m7
> mova m6, [pw_pixel_max]
> %ifidn %2, pp
> - mova m5, [tab_c_32]
> + mova m5, [INTERP_OFFSET_PP]
> %else
> - mova m5, [tab_c_524800]
> + mova m5, [INTERP_OFFSET_SP]
> %endif
> %else
> mova m5, [INTERP_OFFSET_PS]
> @@ -4362,18 +4367,18 @@
> %elifidn %2, ps
> paddd m0, m5
> paddd m2, m5
> - psrad m0, 2
> - psrad m2, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> packssdw m0, m2
> %else
> paddd m0, m5
> paddd m2, m5
> %ifidn %2, pp
> - psrad m0, 6
> - psrad m2, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> %else
> - psrad m0, 10
> - psrad m2, 10
> + psrad m0, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> %endif
> packusdw m0, m2
> CLIPW m0, m7, m6
> @@ -4389,7 +4394,6 @@
>
> dec r4d
> jnz .loopH
> -
> RET
> %endmacro
>
> @@ -4417,7 +4421,6 @@
> %macro FILTER_VER_CHROMA_W4 3
> INIT_XMM sse4
> cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3
> -
> add r1d, r1d
> add r3d, r3d
> sub r0, r1
> @@ -4439,9 +4442,9 @@
> pxor m6, m6
> mova m5, [pw_pixel_max]
> %ifidn %2, pp
> - mova m4, [tab_c_32]
> + mova m4, [INTERP_OFFSET_PP]
> %else
> - mova m4, [tab_c_524800]
> + mova m4, [INTERP_OFFSET_SP]
> %endif
> %else
> mova m4, [INTERP_OFFSET_PS]
> @@ -4479,18 +4482,18 @@
> %elifidn %2, ps
> paddd m0, m4
> paddd m1, m4
> - psrad m0, 2
> - psrad m1, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> packssdw m0, m1
> %else
> paddd m0, m4
> paddd m1, m4
> %ifidn %2, pp
> - psrad m0, 6
> - psrad m1, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> %else
> - psrad m0, 10
> - psrad m1, 10
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> %endif
> packusdw m0, m1
> CLIPW m0, m6, m5
> @@ -4504,7 +4507,6 @@
> dec r4d
> jnz .loop
> %endif
> -
> RET
> %endmacro
>
> @@ -4524,7 +4526,6 @@
> %macro FILTER_VER_CHROMA_W6 3
> INIT_XMM sse4
> cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3
> -
> add r1d, r1d
> add r3d, r3d
> sub r0, r1
> @@ -4543,9 +4544,9 @@
> %ifnidn %2, ps
> mova m7, [pw_pixel_max]
> %ifidn %2, pp
> - mova m6, [tab_c_32]
> + mova m6, [INTERP_OFFSET_PP]
> %else
> - mova m6, [tab_c_524800]
> + mova m6, [INTERP_OFFSET_SP]
> %endif
> %else
> mova m6, [INTERP_OFFSET_PS]
> @@ -4568,10 +4569,10 @@
> paddd m1, m6
> paddd m2, m6
> paddd m3, m6
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -4581,15 +4582,15 @@
> paddd m2, m6
> paddd m3, m6
> %ifidn %2, pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> %else
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> %endif
> packssdw m0, m1
> packssdw m2, m3
> @@ -4616,18 +4617,18 @@
> %elifidn %2, ps
> paddd m0, m6
> paddd m2, m6
> - psrad m0, 2
> - psrad m2, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> packssdw m0, m2
> %else
> paddd m0, m6
> paddd m2, m6
> %ifidn %2, pp
> - psrad m0, 6
> - psrad m2, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> %else
> - psrad m0, 10
> - psrad m2, 10
> + psrad m0, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> %endif
> packusdw m0, m2
> CLIPW m0, m5, m7
> @@ -4644,7 +4645,6 @@
>
> dec r4d
> jnz .loopH
> -
> RET
> %endmacro
>
> @@ -4712,7 +4712,7 @@
> mov r4d, %2/2
>
> %ifidn %3, pp
> - mova m7, [tab_c_32]
> + mova m7, [INTERP_OFFSET_PP]
> %elifidn %3, sp
> mova m7, [INTERP_OFFSET_SP]
> %elifidn %3, ps
> @@ -4748,10 +4748,10 @@
> paddd m2, m7
> paddd m3, m7
> %ifidn %3, pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> %else
> psrad m0, INTERP_SHIFT_SP
> psrad m1, INTERP_SHIFT_SP
> @@ -4772,7 +4772,6 @@
>
> dec r4d
> jnz .loopH
> -
> RET
> %endmacro
>
> @@ -4868,9 +4867,9 @@
> mov r6d, %1/4
>
> %ifidn %2,pp
> - vbroadcasti128 m8, [pd_32]
> + vbroadcasti128 m8, [INTERP_OFFSET_PP]
> %elifidn %2, sp
> - mova m8, [pd_524800]
> + mova m8, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m8, [INTERP_OFFSET_PS]
> %endif
> @@ -4934,20 +4933,20 @@
> paddd m2, m8
> paddd m3, m8
> %ifidn %2,pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> %elifidn %2, sp
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> -%else
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -5012,9 +5011,9 @@
> mov r4d, %1/2
>
> %ifidn %2, pp
> - mova m7, [tab_c_32]
> + mova m7, [INTERP_OFFSET_PP]
> %elifidn %2, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %elifidn %2, ps
> mova m7, [INTERP_OFFSET_PS]
> %endif
> @@ -5034,10 +5033,10 @@
> paddd m1, m7
> paddd m2, m7
> paddd m3, m7
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -5047,15 +5046,15 @@
> paddd m2, m7
> paddd m3, m7
> %ifidn %2, pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> -%else
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> +%else
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> %endif
> packssdw m0, m1
> packssdw m2, m3
> @@ -5184,9 +5183,9 @@
> mov r4d, %1/2
>
> %ifidn %2, pp
> - mova m7, [tab_c_32]
> + mova m7, [INTERP_OFFSET_PP]
> %elifidn %2, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %elifidn %2, ps
> mova m7, [INTERP_OFFSET_PS]
> %endif
> @@ -5213,18 +5212,18 @@
> paddd m1, m7
> paddd m2, m7
> paddd m3, m7
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> paddd m8, m7
> paddd m9, m7
> paddd m10, m7
> paddd m11, m7
> - psrad m8, 2
> - psrad m9, 2
> - psrad m10, 2
> - psrad m11, 2
> + psrad m8, INTERP_SHIFT_PS
> + psrad m9, INTERP_SHIFT_PS
> + psrad m10, INTERP_SHIFT_PS
> + psrad m11, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -5240,23 +5239,23 @@
> paddd m10, m7
> paddd m11, m7
> %ifidn %2, pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> - psrad m8, 6
> - psrad m9, 6
> - psrad m10, 6
> - psrad m11, 6
> -%else
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> - psrad m8, 10
> - psrad m9, 10
> - psrad m10, 10
> - psrad m11, 10
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> + psrad m8, INTERP_SHIFT_PP
> + psrad m9, INTERP_SHIFT_PP
> + psrad m10, INTERP_SHIFT_PP
> + psrad m11, INTERP_SHIFT_PP
> +%else
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> + psrad m8, INTERP_SHIFT_SP
> + psrad m9, INTERP_SHIFT_SP
> + psrad m10, INTERP_SHIFT_SP
> + psrad m11, INTERP_SHIFT_SP
> %endif
> packssdw m0, m1
> packssdw m2, m3
> @@ -5326,9 +5325,9 @@
> mov r4d, %1/2
>
> %ifidn %2, pp
> - mova m7, [tab_c_32]
> + mova m7, [INTERP_OFFSET_PP]
> %elifidn %2, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %elifidn %2, ps
> mova m7, [INTERP_OFFSET_PS]
> %endif
> @@ -5380,10 +5379,10 @@
> paddd m1, m7
> paddd m2, m7
> paddd m3, m7
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -5393,15 +5392,15 @@
> paddd m2, m7
> paddd m3, m7
> %ifidn %2, pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> -%else
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> +%else
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> %endif
> packssdw m0, m1
> packssdw m2, m3
> @@ -5457,9 +5456,9 @@
> mov r4d, %1/2
>
> %ifidn %2, pp
> - mova m7, [tab_c_32]
> + mova m7, [INTERP_OFFSET_PP]
> %elifidn %2, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %elifidn %2, ps
> mova m7, [INTERP_OFFSET_PS]
> %endif
> @@ -5479,10 +5478,10 @@
> paddd m1, m7
> paddd m2, m7
> paddd m3, m7
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -5492,15 +5491,15 @@
> paddd m2, m7
> paddd m3, m7
> %ifidn %2, pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> -%else
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> +%else
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> %endif
> packssdw m0, m1
> packssdw m2, m3
> @@ -5610,9 +5609,9 @@
> mov r4d, %1/2
>
> %ifidn %2, pp
> - mova m7, [tab_c_32]
> + mova m7, [INTERP_OFFSET_PP]
> %elifidn %2, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %elifidn %2, ps
> mova m7, [INTERP_OFFSET_PS]
> %endif
> @@ -5639,18 +5638,18 @@
> paddd m1, m7
> paddd m2, m7
> paddd m3, m7
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> paddd m8, m7
> paddd m9, m7
> paddd m10, m7
> paddd m11, m7
> - psrad m8, 2
> - psrad m9, 2
> - psrad m10, 2
> - psrad m11, 2
> + psrad m8, INTERP_SHIFT_PS
> + psrad m9, INTERP_SHIFT_PS
> + psrad m10, INTERP_SHIFT_PS
> + psrad m11, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -5666,23 +5665,23 @@
> paddd m10, m7
> paddd m11, m7
> %ifidn %2, pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> - psrad m8, 6
> - psrad m9, 6
> - psrad m10, 6
> - psrad m11, 6
> -%else
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> - psrad m8, 10
> - psrad m9, 10
> - psrad m10, 10
> - psrad m11, 10
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> + psrad m8, INTERP_SHIFT_PP
> + psrad m9, INTERP_SHIFT_PP
> + psrad m10, INTERP_SHIFT_PP
> + psrad m11, INTERP_SHIFT_PP
> +%else
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> + psrad m8, INTERP_SHIFT_SP
> + psrad m9, INTERP_SHIFT_SP
> + psrad m10, INTERP_SHIFT_SP
> + psrad m11, INTERP_SHIFT_SP
> %endif
> packssdw m0, m1
> packssdw m2, m3
> @@ -5733,9 +5732,9 @@
> mov r4d, 32
>
> %ifidn %1, pp
> - mova m7, [tab_c_32]
> + mova m7, [INTERP_OFFSET_PP]
> %elifidn %1, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %elifidn %1, ps
> mova m7, [INTERP_OFFSET_PS]
> %endif
> @@ -5787,10 +5786,10 @@
> paddd m1, m7
> paddd m2, m7
> paddd m3, m7
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -5800,15 +5799,15 @@
> paddd m2, m7
> paddd m3, m7
> %ifidn %1, pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> -%else
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> +%else
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> %endif
> packssdw m0, m1
> packssdw m2, m3
> @@ -5827,6 +5826,7 @@
> jnz .loopH
> RET
> %endmacro
> +
> FILTER_VER_CHROMA_W16_48x64_avx2 pp, 8
> FILTER_VER_CHROMA_W16_48x64_avx2 ps, 8
> FILTER_VER_CHROMA_W16_48x64_avx2 ss, 7
> @@ -5834,7 +5834,6 @@
>
> INIT_XMM sse2
> cglobal chroma_p2s, 3, 7, 3
> -
> ; load width and height
> mov r3d, r3m
> mov r4d, r4m
> @@ -5850,11 +5849,11 @@
> lea r6, [r0 + r5 * 2]
>
> movu m0, [r6]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> paddw m0, m2
>
> movu m1, [r6 + r1]
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> paddw m1, m2
>
> add r5d, 8
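
The p2s conversion scales pixels up to the 14-bit internal precision, so the
shift is 14 - BIT_DEPTH: the old hard-coded psllw by 4 was the 10-bit case
(14 - 10 = 4), and Main12 needs 2.
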
> @@ -5887,7 +5886,6 @@
>
> sub r4d, 2
> jnz .loopH
> -
> RET
>
> %macro PROCESS_LUMA_VER_W4_4R 0
> @@ -5975,7 +5973,7 @@
> lea r6, [tab_LumaCoeffV + r4]
> %endif
>
> - mova m7, [pd_32]
> + mova m7, [INTERP_OFFSET_PP]
>
> mov dword [rsp], %2/4
> .loopH:
> @@ -5988,10 +5986,10 @@
> paddd m2, m7
> paddd m3, m7
>
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -6017,7 +6015,6 @@
>
> dec dword [rsp]
> jnz .loopH
> -
> RET
> %endmacro
>
> @@ -6126,14 +6123,14 @@
> paddd m0, m6
> paddd m2, m6
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m2, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m2, 10
> -%else
> - psrad m0, 2
> - psrad m2, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -6294,20 +6291,20 @@
> paddd m2, m11
> paddd m3, m11
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> -%else
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -6365,20 +6362,20 @@
> paddd m6, m11
> paddd m7, m11
> %ifidn %1,pp
> - psrad m4, 6
> - psrad m5, 6
> - psrad m6, 6
> - psrad m7, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> + psrad m6, INTERP_SHIFT_PP
> + psrad m7, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m4, 10
> - psrad m5, 10
> - psrad m6, 10
> - psrad m7, 10
> -%else
> - psrad m4, 2
> - psrad m5, 2
> - psrad m6, 2
> - psrad m7, 2
> + psrad m4, INTERP_SHIFT_SP
> + psrad m5, INTERP_SHIFT_SP
> + psrad m6, INTERP_SHIFT_SP
> + psrad m7, INTERP_SHIFT_SP
> +%else
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> + psrad m6, INTERP_SHIFT_PS
> + psrad m7, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -6538,26 +6535,26 @@
> paddd m4, m14
> paddd m5, m14
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> - psrad m4, 6
> - psrad m5, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> - psrad m4, 10
> - psrad m5, 10
> -%else
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> - psrad m4, 2
> - psrad m5, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> + psrad m4, INTERP_SHIFT_SP
> + psrad m5, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -6620,14 +6617,14 @@
> paddd m6, m14
> paddd m7, m14
> %ifidn %1,pp
> - psrad m6, 6
> - psrad m7, 6
> + psrad m6, INTERP_SHIFT_PP
> + psrad m7, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m6, 10
> - psrad m7, 10
> -%else
> - psrad m6, 2
> - psrad m7, 2
> + psrad m6, INTERP_SHIFT_SP
> + psrad m7, INTERP_SHIFT_SP
> +%else
> + psrad m6, INTERP_SHIFT_PS
> + psrad m7, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -6734,32 +6731,32 @@
> paddd m0, m14
> paddd m1, m14
> %ifidn %1,pp
> - psrad m8, 6
> - psrad m9, 6
> - psrad m10, 6
> - psrad m11, 6
> - psrad m12, 6
> - psrad m13, 6
> - psrad m0, 6
> - psrad m1, 6
> + psrad m8, INTERP_SHIFT_PP
> + psrad m9, INTERP_SHIFT_PP
> + psrad m10, INTERP_SHIFT_PP
> + psrad m11, INTERP_SHIFT_PP
> + psrad m12, INTERP_SHIFT_PP
> + psrad m13, INTERP_SHIFT_PP
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m8, 10
> - psrad m9, 10
> - psrad m10, 10
> - psrad m11, 10
> - psrad m12, 10
> - psrad m13, 10
> - psrad m0, 10
> - psrad m1, 10
> -%else
> - psrad m8, 2
> - psrad m9, 2
> - psrad m10, 2
> - psrad m11, 2
> - psrad m12, 2
> - psrad m13, 2
> - psrad m0, 2
> - psrad m1, 2
> + psrad m8, INTERP_SHIFT_SP
> + psrad m9, INTERP_SHIFT_SP
> + psrad m10, INTERP_SHIFT_SP
> + psrad m11, INTERP_SHIFT_SP
> + psrad m12, INTERP_SHIFT_SP
> + psrad m13, INTERP_SHIFT_SP
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> +%else
> + psrad m8, INTERP_SHIFT_PS
> + psrad m9, INTERP_SHIFT_PS
> + psrad m10, INTERP_SHIFT_PS
> + psrad m11, INTERP_SHIFT_PS
> + psrad m12, INTERP_SHIFT_PS
> + psrad m13, INTERP_SHIFT_PS
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -6819,7 +6816,7 @@
> %ifidn %1,pp
> vbroadcasti128 m14, [pd_32]
> %elifidn %1, sp
> - mova m14, [pd_524800]
> + mova m14, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m14, [INTERP_OFFSET_PS]
> %endif
> @@ -6870,7 +6867,7 @@
> %ifidn %3,pp
> vbroadcasti128 m14, [pd_32]
> %elifidn %3, sp
> - mova m14, [pd_524800]
> + mova m14, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m14, [INTERP_OFFSET_PS]
> %endif
> @@ -6953,7 +6950,7 @@
> %ifidn %1,pp
> vbroadcasti128 m14, [pd_32]
> %elifidn %1, sp
> - mova m14, [pd_524800]
> + mova m14, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m14, [INTERP_OFFSET_PS]
> %endif
> @@ -7089,26 +7086,26 @@
> paddd m4, m14
> paddd m5, m14
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> - psrad m4, 6
> - psrad m5, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> - psrad m4, 10
> - psrad m5, 10
> -%else
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> - psrad m4, 2
> - psrad m5, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> + psrad m4, INTERP_SHIFT_SP
> + psrad m5, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -7171,14 +7168,14 @@
> paddd m6, m14
> paddd m7, m14
> %ifidn %1,pp
> - psrad m6, 6
> - psrad m7, 6
> + psrad m6, INTERP_SHIFT_PP
> + psrad m7, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m6, 10
> - psrad m7, 10
> -%else
> - psrad m6, 2
> - psrad m7, 2
> + psrad m6, INTERP_SHIFT_SP
> + psrad m7, INTERP_SHIFT_SP
> +%else
> + psrad m6, INTERP_SHIFT_PS
> + psrad m7, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -7285,32 +7282,32 @@
> paddd m0, m14
> paddd m1, m14
> %ifidn %1,pp
> - psrad m8, 6
> - psrad m9, 6
> - psrad m10, 6
> - psrad m11, 6
> - psrad m12, 6
> - psrad m13, 6
> - psrad m0, 6
> - psrad m1, 6
> + psrad m8, INTERP_SHIFT_PP
> + psrad m9, INTERP_SHIFT_PP
> + psrad m10, INTERP_SHIFT_PP
> + psrad m11, INTERP_SHIFT_PP
> + psrad m12, INTERP_SHIFT_PP
> + psrad m13, INTERP_SHIFT_PP
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m8, 10
> - psrad m9, 10
> - psrad m10, 10
> - psrad m11, 10
> - psrad m12, 10
> - psrad m13, 10
> - psrad m0, 10
> - psrad m1, 10
> -%else
> - psrad m8, 2
> - psrad m9, 2
> - psrad m10, 2
> - psrad m11, 2
> - psrad m12, 2
> - psrad m13, 2
> - psrad m0, 2
> - psrad m1, 2
> + psrad m8, INTERP_SHIFT_SP
> + psrad m9, INTERP_SHIFT_SP
> + psrad m10, INTERP_SHIFT_SP
> + psrad m11, INTERP_SHIFT_SP
> + psrad m12, INTERP_SHIFT_SP
> + psrad m13, INTERP_SHIFT_SP
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> +%else
> + psrad m8, INTERP_SHIFT_PS
> + psrad m9, INTERP_SHIFT_PS
> + psrad m10, INTERP_SHIFT_PS
> + psrad m11, INTERP_SHIFT_PS
> + psrad m12, INTERP_SHIFT_PS
> + psrad m13, INTERP_SHIFT_PS
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -7485,26 +7482,26 @@
> paddd m4, m11
> paddd m5, m11
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> - psrad m4, 6
> - psrad m5, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> - psrad m4, 10
> - psrad m5, 10
> -%else
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> - psrad m4, 2
> - psrad m5, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> + psrad m4, INTERP_SHIFT_SP
> + psrad m5, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -7556,14 +7553,14 @@
> paddd m6, m11
> paddd m7, m11
> %ifidn %1,pp
> - psrad m6, 6
> - psrad m7, 6
> + psrad m6, INTERP_SHIFT_PP
> + psrad m7, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m6, 10
> - psrad m7, 10
> -%else
> - psrad m6, 2
> - psrad m7, 2
> + psrad m6, INTERP_SHIFT_SP
> + psrad m7, INTERP_SHIFT_SP
> +%else
> + psrad m6, INTERP_SHIFT_PS
> + psrad m7, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -7600,7 +7597,7 @@
> %ifidn %1,pp
> vbroadcasti128 m11, [pd_32]
> %elifidn %1, sp
> - mova m11, [pd_524800]
> + mova m11, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m11, [INTERP_OFFSET_PS]
> %endif
> @@ -7647,7 +7644,7 @@
> %ifidn %1,pp
> vbroadcasti128 m14, [pd_32]
> %elifidn %1, sp
> - mova m14, [pd_524800]
> + mova m14, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m14, [INTERP_OFFSET_PS]
> %endif
> @@ -7765,20 +7762,20 @@
> paddd m2, m7
> paddd m3, m7
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> -%else
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -7801,7 +7798,7 @@
>
> %macro FILTER_VER_LUMA_AVX2_16x4 1
> INIT_YMM avx2
> -cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize
> +cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0-gprsize
> mov r4d, r4m
> shl r4d, 7
> add r1d, r1d
> @@ -7819,7 +7816,7 @@
> %ifidn %1,pp
> vbroadcasti128 m7, [pd_32]
> %elifidn %1, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m7, [INTERP_OFFSET_PS]
> %endif
> @@ -7864,7 +7861,7 @@
> %ifidn %1,pp
> vbroadcasti128 m7, [pd_32]
> %elifidn %1, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m7, [INTERP_OFFSET_PS]
> %endif
> @@ -7904,7 +7901,7 @@
> %ifidn %1,pp
> vbroadcasti128 m14, [pd_32]
> %elifidn %1, sp
> - mova m14, [pd_524800]
> + mova m14, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m14, [INTERP_OFFSET_PS]
> %endif
> @@ -8014,20 +8011,20 @@
> paddd m2, m14
> paddd m3, m14
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m1, 6
> - psrad m2, 6
> - psrad m3, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> -%else
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -8105,20 +8102,20 @@
> paddd m6, m14
> paddd m7, m14
> %ifidn %1,pp
> - psrad m4, 6
> - psrad m5, 6
> - psrad m6, 6
> - psrad m7, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> + psrad m6, INTERP_SHIFT_PP
> + psrad m7, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m4, 10
> - psrad m5, 10
> - psrad m6, 10
> - psrad m7, 10
> -%else
> - psrad m4, 2
> - psrad m5, 2
> - psrad m6, 2
> - psrad m7, 2
> + psrad m4, INTERP_SHIFT_SP
> + psrad m5, INTERP_SHIFT_SP
> + psrad m6, INTERP_SHIFT_SP
> + psrad m7, INTERP_SHIFT_SP
> +%else
> + psrad m4, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> + psrad m6, INTERP_SHIFT_PS
> + psrad m7, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -8182,20 +8179,20 @@
> paddd m10, m14
> paddd m11, m14
> %ifidn %1,pp
> - psrad m8, 6
> - psrad m9, 6
> - psrad m10, 6
> - psrad m11, 6
> + psrad m8, INTERP_SHIFT_PP
> + psrad m9, INTERP_SHIFT_PP
> + psrad m10, INTERP_SHIFT_PP
> + psrad m11, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m8, 10
> - psrad m9, 10
> - psrad m10, 10
> - psrad m11, 10
> -%else
> - psrad m8, 2
> - psrad m9, 2
> - psrad m10, 2
> - psrad m11, 2
> + psrad m8, INTERP_SHIFT_SP
> + psrad m9, INTERP_SHIFT_SP
> + psrad m10, INTERP_SHIFT_SP
> + psrad m11, INTERP_SHIFT_SP
> +%else
> + psrad m8, INTERP_SHIFT_PS
> + psrad m9, INTERP_SHIFT_PS
> + psrad m10, INTERP_SHIFT_PS
> + psrad m11, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -8251,7 +8248,7 @@
> %ifidn %1,pp
> vbroadcasti128 m7, [pd_32]
> %elifidn %1, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m7, [INTERP_OFFSET_PS]
> %endif
> @@ -8315,14 +8312,14 @@
> paddd m0, m7
> paddd m2, m7
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m2, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m2, 10
> -%else
> - psrad m0, 2
> - psrad m2, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -8366,14 +8363,14 @@
> paddd m4, m7
> paddd m1, m7
> %ifidn %1,pp
> - psrad m4, 6
> - psrad m1, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m4, 10
> - psrad m1, 10
> -%else
> - psrad m4, 2
> - psrad m1, 2
> + psrad m4, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> +%else
> + psrad m4, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -8458,14 +8455,14 @@
> paddd m0, m7
> paddd m2, m7
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m2, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m2, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m2, 10
> -%else
> - psrad m0, 2
> - psrad m2, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -8516,14 +8513,14 @@
> paddd m4, m7
> paddd m1, m7
> %ifidn %1,pp
> - psrad m4, 6
> - psrad m1, 6
> + psrad m4, INTERP_SHIFT_PP
> + psrad m1, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m4, 10
> - psrad m1, 10
> -%else
> - psrad m4, 2
> - psrad m1, 2
> + psrad m4, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> +%else
> + psrad m4, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -8574,14 +8571,14 @@
> paddd m6, m7
> paddd m5, m7
> %ifidn %1,pp
> - psrad m6, 6
> - psrad m5, 6
> + psrad m6, INTERP_SHIFT_PP
> + psrad m5, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m6, 10
> - psrad m5, 10
> -%else
> - psrad m6, 2
> - psrad m5, 2
> + psrad m6, INTERP_SHIFT_SP
> + psrad m5, INTERP_SHIFT_SP
> +%else
> + psrad m6, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -8625,14 +8622,14 @@
> paddd m0, m7
> paddd m3, m7
> %ifidn %1,pp
> - psrad m0, 6
> - psrad m3, 6
> + psrad m0, INTERP_SHIFT_PP
> + psrad m3, INTERP_SHIFT_PP
> %elifidn %1, sp
> - psrad m0, 10
> - psrad m3, 10
> -%else
> - psrad m0, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> +%else
> + psrad m0, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
> %endif
> %endif
>
> @@ -8671,7 +8668,7 @@
> %ifidn %1,pp
> vbroadcasti128 m7, [pd_32]
> %elifidn %1, sp
> - mova m7, [pd_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m7, [INTERP_OFFSET_PS]
> %endif
> @@ -8706,7 +8703,7 @@
> %ifidn %1,pp
> vbroadcasti128 m14, [pd_32]
> %elifidn %1, sp
> - mova m14, [pd_524800]
> + mova m14, [INTERP_OFFSET_SP]
> %else
> vbroadcasti128 m14, [INTERP_OFFSET_PS]
> %endif
> @@ -8758,10 +8755,10 @@
> paddd m2, m7
> paddd m3, m7
>
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -8784,7 +8781,6 @@
>
> dec dword [rsp]
> jnz .loopH
> -
> RET
> %endmacro
>
> @@ -8837,7 +8833,7 @@
> lea r6, [tab_LumaCoeffV + r4]
> %endif
>
> - mova m7, [tab_c_524800]
> + mova m7, [INTERP_OFFSET_SP]
>
> mov dword [rsp], %2/4
> .loopH:
> @@ -8850,10 +8846,10 @@
> paddd m2, m7
> paddd m3, m7
>
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -8879,7 +8875,6 @@
>
> dec dword [rsp]
> jnz .loopH
> -
> RET
> %endmacro
>
> @@ -8963,7 +8958,6 @@
>
> dec dword [rsp]
> jnz .loopH
> -
> RET
> %endmacro
>
> @@ -9011,7 +9005,7 @@
> %rep %1/4
> movd m0, [r0]
> movhps m0, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m1
>
> movd [r2 + r3 * 0], m0
> @@ -9019,7 +9013,7 @@
>
> movd m0, [r0 + r1 * 2]
> movhps m0, [r0 + r4]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m1
>
> movd [r2 + r3 * 2], m0
> @@ -10293,14 +10287,13 @@
> mov r4d, r4m
> add r1d, r1d
> add r3d, r3d
> -%ifdef PIC
> -
> +
> +%ifdef PIC
> lea r6, [tab_LumaCoeff]
> - lea r4 , [r4 * 8]
> + lea r4, [r4 * 8]
> vbroadcasti128 m0, [r6 + r4 * 2]
> -
> -%else
> - lea r4 , [r4 * 8]
> +%else
> + lea r4, [r4 * 8]
> vbroadcasti128 m0, [tab_LumaCoeff + r4 * 2]
> %endif
>
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/loopfilter.asm Tue Jul 21 14:30:11 2015 -0700
> @@ -39,7 +39,7 @@
> cextern pb_128
> cextern pb_2
> cextern pw_2
> -cextern pw_1023
> +cextern pw_pixel_max
> cextern pb_movemask
> cextern pw_1
> cextern hmul_16p
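
pw_1023 hard-codes the 10-bit maximum; pw_pixel_max is (1 << BIT_DEPTH) - 1,
i.e. 4095 for Main12, so the SAO clamps below now track the build depth.
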
> @@ -81,7 +81,7 @@
> palignr m2, m3, m5, 15
> por m2, m0
>
> - mova m4, [pw_1023]
> + mova m4, [pw_pixel_max]
> psignb m2, [pb_128] ; m2 = signLeft
> pxor m0, m0
> palignr m0, m3, 15
> @@ -127,7 +127,7 @@
> palignr m2, m3, m5, 15
> por m2, m0
>
> - mova m4, [pw_1023]
> + mova m4, [pw_pixel_max]
> psignb m2, [pb_128] ; m2 = signLeft
> pxor m0, m0
> palignr m0, m3, 15
> @@ -249,7 +249,7 @@
> neg r1b
> movd xm1, r1d
> vinserti128 m0, m0, xm1, 1
> - mova m5, [pw_1023]
> + mova m5, [pw_pixel_max]
> mov r1d, r4m
> add r1d, r1d
> shr r2d, 4
> @@ -402,8 +402,8 @@
>
> pmaxsw m7, m0
> pmaxsw m5, m0
> - pminsw m7, [pw_1023]
> - pminsw m5, [pw_1023]
> + pminsw m7, [pw_pixel_max]
> + pminsw m5, [pw_pixel_max]
>
> movu [r0], m7
> movu [r0 + 16], m5
> @@ -468,7 +468,7 @@
> mov r4d, r4m
> mova m4, [pb_2]
> shr r4d, 4
> - mova m0, [pw_1023]
> + mova m0, [pw_pixel_max]
> .loop
> movu m5, [r0]
> movu m3, [r0 + r3]
> @@ -559,7 +559,7 @@
> add r3d, r3d
> mov r4d, r4m
> pxor m0, m0 ; m0 = 0
> - mova m6, [pw_1023]
> + mova m6, [pw_pixel_max]
> mov r5d, r4d
> shr r4d, 4
> mov r6, r0
> @@ -736,7 +736,7 @@
> cglobal saoCuOrgE1_2Rows, 4,5,8
> add r3d, r3d
> mov r4d, r4m
> - mova m4, [pw_1023]
> + mova m4, [pw_pixel_max]
> vbroadcasti128 m6, [r2] ; m6 = m_iOffsetEo
> shr r4d, 4
> .loop
> @@ -884,8 +884,8 @@
> paddw m5, m4
> pmaxsw m7, m0
> pmaxsw m5, m0
> - pminsw m7, [pw_1023]
> - pminsw m5, [pw_1023]
> + pminsw m7, [pw_pixel_max]
> + pminsw m5, [pw_pixel_max]
> movu [r0], m7
> movu [r0 + 16], m5
>
> @@ -960,7 +960,7 @@
> movq xm4, [r0 + r4 * 2]
> movhps xm4, [r1 + r4]
> vbroadcasti128 m5, [r3]
> - mova m6, [pw_1023]
> + mova m6, [pw_pixel_max]
> .loop
> movu m1, [r0]
> movu m3, [r0 + r5 + 2]
> @@ -1086,8 +1086,8 @@
> paddw m7, m6
> pmaxsw m1, m0
> pmaxsw m7, m0
> - pminsw m1, [pw_1023]
> - pminsw m7, [pw_1023]
> + pminsw m1, [pw_pixel_max]
> + pminsw m7, [pw_pixel_max]
> movu [r0], m1
> movu [r0 + 32], m7
>
> @@ -1212,8 +1212,8 @@
> paddw m5, m4
> pmaxsw m7, m0
> pmaxsw m5, m0
> - pminsw m7, [pw_1023]
> - pminsw m5, [pw_1023]
> + pminsw m7, [pw_pixel_max]
> + pminsw m5, [pw_pixel_max]
> movu [r0], m7
> movu [r0 + 16], m5
>
> @@ -1333,7 +1333,7 @@
> paddw m1, m3
> pxor m0, m0
> pmaxsw m1, m0
> - pminsw m1, [pw_1023]
> + pminsw m1, [pw_pixel_max]
> movu [r0], m1
>
> psubb xm0, xm2
> @@ -1461,8 +1461,8 @@
> pxor m0, m0
> pmaxsw m1, m0
> pmaxsw m7, m0
> - pminsw m1, [pw_1023]
> - pminsw m7, [pw_1023]
> + pminsw m1, [pw_pixel_max]
> + pminsw m7, [pw_pixel_max]
> movu [r0], m1
> movu [r0 + 32], m7
>
> @@ -1565,8 +1565,8 @@
> .loopW
> movu m2, [r0 + r6]
> movu m5, [r0 + r6 + 16]
> - psrlw m0, m2, 5
> - psrlw m6, m5, 5
> + psrlw m0, m2, (BIT_DEPTH - 5)
> + psrlw m6, m5, (BIT_DEPTH - 5)
> packuswb m0, m6
> pand m0, [pb_31] ; m0 = [index]
>
> @@ -1584,8 +1584,8 @@
> paddw m5, m6
> pmaxsw m2, m7
> pmaxsw m5, m7
> - pminsw m2, [pw_1023]
> - pminsw m5, [pw_1023]
> + pminsw m2, [pw_pixel_max]
> + pminsw m5, [pw_pixel_max]
>
> movu [r0 + r6], m2
> movu [r0 + r6 + 16], m5
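
Band-offset SAO classifies each pixel by its top five bits (32 bands), so
the band-index shift is BIT_DEPTH - 5: the old psrlw by 5 was again the
10-bit case, and Main12 needs 7.
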
> @@ -1656,7 +1656,7 @@
> sub r1d, r2d
> sub r1d, r2d
> shr r2d, 4
> - mova m7, [pw_1023]
> + mova m7, [pw_pixel_max]
>
> mov r6d, r3d
> shr r3d, 1
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/mc-a.asm
> --- a/source/common/x86/mc-a.asm Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/mc-a.asm Tue Jul 21 14:30:11 2015 -0700
> @@ -32,6 +32,19 @@
> %include "x86inc.asm"
> %include "x86util.asm"
>
> +%if BIT_DEPTH==8
> + %define ADDAVG_FACTOR 256
> + %define ADDAVG_ROUND 128
> +%elif BIT_DEPTH==10
> + %define ADDAVG_FACTOR 1024
> + %define ADDAVG_ROUND 512
> +%elif BIT_DEPTH==12
> + %define ADDAVG_FACTOR 4096
> + %define ADDAVG_ROUND 2048
> +%else
> + %error Unsupported bit depth!
> +%endif
> +
> SECTION_RODATA 32
>
> ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
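
the constants check out against the interpolation precision, assuming
IF_INTERNAL_PREC is still 14 (so IF_INTERNAL_OFFS = 8192): addAvg
shifts by IF_INTERNAL_PREC + 1 - BIT_DEPTH, pmulhrsw implements that
rounded shift as a multiply by 32768 >> shift (= ADDAVG_FACTOR), and
the bias added back in is (2 * IF_INTERNAL_OFFS) >> shift
(= ADDAVG_ROUND). a scalar sketch of what each lane computes, under
those assumptions (illustrative only, not the real primitive):

    static inline int addAvgScalar(int s0, int s1, int bitDepth)
    {
        int shift    = 14 + 1 - bitDepth;      /* 7 / 5 / 3 for 8/10/12 */
        int factor   = 32768 >> shift;         /* ADDAVG_FACTOR */
        int round    = (2 * 8192) >> shift;    /* ADDAVG_ROUND  */
        int v        = ((s0 + s1) * factor + (1 << 14)) >> 15; /* pmulhrsw */
        v           += round;                                  /* paddw    */
        int pixelMax = (1 << bitDepth) - 1;                    /* pw_pixel_max */
        return v < 0 ? 0 : (v > pixelMax ? pixelMax : v);      /* pmaxsw/pminsw */
    }

and the %+ token-paste below turns [pw_ %+ ADDAVG_ROUND] into
[pw_512] or [pw_2048] at assembly time for the Main10/Main12 builds,
which is why pw_2048 is new in const-a.asm and pw_4096 picks up a
cextern here.
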
> @@ -54,6 +67,8 @@
> cextern pw_512
> cextern pw_1023
> cextern pw_1024
> +cextern pw_2048
> +cextern pw_4096
> cextern pw_00ff
> cextern pw_pixel_max
> cextern pd_32
> @@ -92,23 +107,24 @@
> punpcklqdq m1, m2
> punpcklqdq m3, m5
> paddw m1, m3
> - pmulhrsw m1, [pw_1024]
> - paddw m1, [pw_512]
> + pmulhrsw m1, [pw_ %+ ADDAVG_FACTOR]
> + paddw m1, [pw_ %+ ADDAVG_ROUND]
>
> pxor m0, m0
> pmaxsw m1, m0
> - pminsw m1, [pw_1023]
> + pminsw m1, [pw_pixel_max]
> movd [r2], m1
> pextrd [r2 + r5], m1, 1
> lea r2, [r2 + 2 * r5]
> pextrd [r2], m1, 2
> pextrd [r2 + r5], m1, 3
> -
> RET
> +
> +
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m0, [pw_512]
> + mova m0, [pw_ %+ ADDAVG_ROUND]
> pxor m7, m7
> add r3, r3
> add r4, r4
> @@ -136,11 +152,11 @@
> punpcklqdq m1, m2
> punpcklqdq m3, m5
> paddw m1, m3
> - pmulhrsw m1, [pw_1024]
> + pmulhrsw m1, [pw_ %+ ADDAVG_FACTOR]
> paddw m1, m0
>
> pmaxsw m1, m7
> - pminsw m1, [pw_1023]
> + pminsw m1, [pw_pixel_max]
> movd [r2], m1
> pextrd [r2 + r5], m1, 1
> lea r2, [r2 + 2 * r5]
> @@ -156,8 +172,8 @@
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m6, [pw_1023]
> - mova m7, [pw_1024]
> + mova m6, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> mov r6d, 16/4
> add r3, r3
> add r4, r4
> @@ -183,7 +199,7 @@
> punpcklqdq m3, m5
> paddw m1, m3
> pmulhrsw m1, m7
> - paddw m1, [pw_512]
> + paddw m1, [pw_ %+ ADDAVG_ROUND]
> pxor m0, m0
> pmaxsw m1, m0
> pminsw m1, m6
> @@ -213,21 +229,21 @@
> punpcklqdq m0, m1
> punpcklqdq m2, m3
> paddw m0, m2
> - pmulhrsw m0, [pw_1024]
> - paddw m0, [pw_512]
> + pmulhrsw m0, [pw_ %+ ADDAVG_FACTOR]
> + paddw m0, [pw_ %+ ADDAVG_ROUND]
>
> pxor m6, m6
> pmaxsw m0, m6
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> movh [r2], m0
> movhps [r2 + r5], m0
> RET
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -264,9 +280,9 @@
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> mov r6d, 16/2
> add r3, r3
> @@ -300,9 +316,9 @@
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -331,9 +347,9 @@
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -370,9 +386,9 @@
> %macro ADDAVG_W4_H4 1
> INIT_XMM sse4
> cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -420,9 +436,9 @@
> %macro ADDAVG_W8_H4 1
> INIT_XMM sse4
> cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -470,9 +486,9 @@
> %macro ADDAVG_W12_H4 1
> INIT_XMM sse4
> cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -532,9 +548,9 @@
> %macro ADDAVG_W16_H4 1
> INIT_XMM sse4
> cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -601,9 +617,9 @@
> %macro ADDAVG_W24_H2 2
> INIT_XMM sse4
> cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -683,9 +699,9 @@
> %macro ADDAVG_W32_H2 1
> INIT_XMM sse4
> cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -787,9 +803,9 @@
> %macro ADDAVG_W48_H2 1
> INIT_XMM sse4
> cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -921,9 +937,9 @@
> %macro ADDAVG_W64_H1 1
> INIT_XMM sse4
> cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m7, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m7, [pw_ %+ ADDAVG_FACTOR]
> pxor m6, m6
> add r3, r3
> add r4, r4
> @@ -1029,19 +1045,19 @@
>
> paddw m0, m1
> pxor m1, m1
> - pmulhrsw m0, [pw_1024]
> - paddw m0, [pw_512]
> + pmulhrsw m0, [pw_ %+ ADDAVG_FACTOR]
> + paddw m0, [pw_ %+ ADDAVG_ROUND]
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> vextracti128 xm1, m0, 1
> movu [r2], xm0
> movu [r2 + r5 * 2], xm1
> RET
>
> cglobal addAvg_8x6, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m3, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m3, [pw_ %+ ADDAVG_FACTOR]
> pxor m1, m1
> add r3d, r3d
> add r4d, r4d
> @@ -1100,9 +1116,9 @@
>
> %macro ADDAVG_W8_H4_AVX2 1
> cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m3, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m3, [pw_ %+ ADDAVG_FACTOR]
> pxor m1, m1
> add r3d, r3d
> add r4d, r4d
> @@ -1159,9 +1175,9 @@
> ADDAVG_W8_H4_AVX2 64
>
> cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m3, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m3, [pw_ %+ ADDAVG_FACTOR]
> pxor m1, m1
> add r3, r3
> add r4, r4
> @@ -1201,8 +1217,8 @@
> RET
>
> cglobal addAvg_12x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> paddw m3, m4, m4
> pxor m1, m1
> add r3, r3
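
nice touch that addAvg_12x32 (and addAvg_24x64 below) skip the extra
load: ADDAVG_FACTOR is exactly 2 * ADDAVG_ROUND at every depth, so
paddw m3, m4, m4 still synthesizes the correct factor from the round
constant.
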
> @@ -1244,9 +1260,9 @@
>
> %macro ADDAVG_W16_H4_AVX2 1
> cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m3, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m3, [pw_ %+ ADDAVG_FACTOR]
> pxor m2, m2
> add r3, r3
> add r4, r4
> @@ -1291,9 +1307,9 @@
> ADDAVG_W16_H4_AVX2 64
>
> cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m3, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m3, [pw_ %+ ADDAVG_FACTOR]
> pxor m1, m1
> add r3, r3
> add r4, r4
> @@ -1347,8 +1363,8 @@
> RET
>
> cglobal addAvg_24x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> paddw m3, m4, m4
> pxor m1, m1
> add r3, r3
> @@ -1404,9 +1420,9 @@
>
> %macro ADDAVG_W32_H2_AVX2 1
> cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m3, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m3, [pw_ %+ ADDAVG_FACTOR]
> pxor m2, m2
> add r3, r3
> add r4, r4
> @@ -1468,9 +1484,9 @@
> ADDAVG_W32_H2_AVX2 64
>
> cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m3, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m3, [pw_ %+ ADDAVG_FACTOR]
> pxor m2, m2
> add r3, r3
> add r4, r4
> @@ -1543,9 +1559,9 @@
>
> %macro ADDAVG_W64_H1_AVX2 1
> cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> - mova m4, [pw_512]
> - mova m5, [pw_1023]
> - mova m3, [pw_1024]
> + mova m4, [pw_ %+ ADDAVG_ROUND]
> + mova m5, [pw_pixel_max]
> + mova m3, [pw_ %+ ADDAVG_FACTOR]
> pxor m2, m2
> add r3d, r3d
> add r4d, r4d
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/pixel-util8.asm Tue Jul 21 14:30:11 2015 -0700
> @@ -879,8 +879,8 @@
> %if HIGH_BIT_DEPTH
> cmp r3d, 32767
> jle .skip
> - shr r3d, 2
> - sub r4d, 2
> + shr r3d, (BIT_DEPTH - 8)
> + sub r4d, (BIT_DEPTH - 8)
> .skip:
> %endif
> movd m0, r4d ; m0 = shift
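
if I'm reading this right it's dequant's word-overflow guard (the
scale has to stay within int16 range for the 16-bit multiplies), and
the compensation now tracks the depth: a Main12 build reduces by 4
instead of the old hard-coded 2. in scalar terms (sketch of the guard
only):

    if (scale > 32767)              /* keep scale in int16 range */
    {
        scale >>= (BIT_DEPTH - 8);  /* 2 at 10-bit, 4 at 12-bit */
        shift  -= (BIT_DEPTH - 8);  /* compensate in the shift  */
    }
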
> @@ -1273,13 +1273,7 @@
> INIT_XMM sse4
> cglobal weight_pp, 4,7,7
> %define correction (14 - BIT_DEPTH)
> -%if BIT_DEPTH == 10
> - mova m6, [pw_1023]
> -%elif BIT_DEPTH == 12
> - mova m6, [pw_3fff]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> + mova m6, [pw_pixel_max]
> mov r6d, r6m
> mov r4d, r4m
> mov r5d, r5m
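
assuming pw_3fff is literally 0x3fff, this also fixes a real Main12
bug: 0x3fff is 16383, not the 12-bit pixel max (0x0fff = 4095), so the
old clamp could let out-of-range samples through. pw_pixel_max is
correct by construction at every depth, and the %error fallback can go
with it.
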
> @@ -1423,7 +1417,7 @@
> movd xm1, r7m
> vpbroadcastd m2, r8m
> mova m5, [pw_1]
> - mova m6, [pw_1023]
> + mova m6, [pw_pixel_max]
> add r2d, r2d
> add r3d, r3d
> sub r2d, r3d
> @@ -1516,13 +1510,7 @@
> %if HIGH_BIT_DEPTH
> INIT_XMM sse4
> cglobal weight_sp, 6,7,8
> -%if BIT_DEPTH == 10
> - mova m1, [pw_1023]
> -%elif BIT_DEPTH == 12
> - mova m1, [pw_3fff]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> + mova m1, [pw_pixel_max]
> mova m2, [pw_1]
> mov r6d, r7m
> shl r6d, 16
> @@ -1681,7 +1669,7 @@
> %if HIGH_BIT_DEPTH
> INIT_YMM avx2
> cglobal weight_sp, 6,7,9
> - mova m1, [pw_1023]
> + mova m1, [pw_pixel_max]
> mova m2, [pw_1]
> mov r6d, r7m
> shl r6d, 16
>
>
--
Steve Borho