[x265] [PATCH 1 of 4] asm: fix Main12 assembly errors and disable faulty functions; assembly now works up to AVX

Steve Borho steve at borho.org
Wed Jul 22 18:25:56 CEST 2015


On 07/21, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1437514211 25200
> # Node ID ab2c34d6ad913369fd8feb84aee10030ffaa0df5
> # Parent  46152345eb6ff261fd90272f7a0712300d6324c0
> asm: fix Main12 assembly errors and disable faulty functions; assembly now works up to AVX

nice!  queued for smoke testing

> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/asm-primitives.cpp	Tue Jul 21 14:30:11 2015 -0700
> @@ -1043,7 +1043,9 @@
>  
>          // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
>          ALL_LUMA_PU(satd, pixel_satd, ssse3);
> +#if X265_DEPTH <= 10
>          ASSIGN_SA8D(ssse3);
> +#endif
>          INTRA_ANG_SSSE3(ssse3);
>  
>          p.dst4x4 = PFX(dst4_ssse3);
> @@ -1126,14 +1128,18 @@
>  
>          // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
>          ALL_LUMA_PU(satd, pixel_satd, sse4);
> +#if X265_DEPTH <= 10
>          ASSIGN_SA8D(sse4);
> +#endif
>  
>          p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
>          p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
>          p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
>          p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
>  
> +#if X265_DEPTH <= 10
>          ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
> +#endif
>          ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
>          INTRA_ANG_SSE4_COMMON(sse4);
>          INTRA_ANG_SSE4_HIGH(sse4);
> @@ -1147,7 +1153,9 @@
>  
>          // TODO: check POPCNT flag!
>          ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
> +#if X265_DEPTH <= 10
>          ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
> +#endif
>          ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
>  
>          p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
> @@ -1184,7 +1192,9 @@
>          p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
>          p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
>          p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
> +#if X265_DEPTH <= 10
>          ASSIGN_SA8D(avx);
> +#endif
>          p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
>          p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
>          p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
> @@ -1292,7 +1302,9 @@
>      {
>          //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
>          ALL_LUMA_PU(satd, pixel_satd, xop);
> +#if X265_DEPTH <= 10
>          ASSIGN_SA8D(xop);
> +#endif
>          LUMA_VAR(xop);
>          p.frameInitLowres = PFX(frame_init_lowres_core_xop);
>      }
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm	Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/const-a.asm	Tue Jul 21 14:30:11 2015 -0700
> @@ -79,6 +79,7 @@
>  const pw_512,               times 16 dw 512
>  const pw_1023,              times 16 dw 1023
>  const pw_1024,              times 16 dw 1024
> +const pw_2048,              times 16 dw 2048
>  const pw_4096,              times 16 dw 4096
>  const pw_8192,              times  8 dw 8192
>  const pw_00ff,              times 16 dw 0x00ff
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/intrapred16.asm
> --- a/source/common/x86/intrapred16.asm	Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/intrapred16.asm	Tue Jul 21 14:30:11 2015 -0700
> @@ -1748,7 +1748,7 @@
>      ; filter top
>      movu        m1,             [r2]
>      paddw       m1,             m0
> -    psraw       m1,             2
> +    psrlw       m1,             2
>      movh        [r0],           m1             ; overwrite top-left pixel, we will update it later
>  
>      ; filter top-left
> @@ -1763,7 +1763,7 @@
>      lea         r0,             [r0 + r1 * 2]
>      movu        m1,             [r3 + 2]
>      paddw       m1,             m0
> -    psraw       m1,             2
> +    psrlw       m1,             2
>      movd        r3d,            m1
>      mov         [r0],           r3w
>      shr         r3d,            16
> @@ -1872,7 +1872,7 @@
>      ; filter top
>      movu            m0,            [r2]
>      paddw           m0,            m1
> -    psraw           m0,            2
> +    psrlw           m0,            2
>      movu            [r6],          m0
>  
>      ; filter top-left
> @@ -1887,7 +1887,7 @@
>      add             r6,            r1
>      movu            m0,            [r3 + 2]
>      paddw           m0,            m1
> -    psraw           m0,            2
> +    psrlw           m0,            2
>      pextrw          [r6],          m0, 0
>      pextrw          [r6 + r1],     m0, 1
>      pextrw          [r6 + r1 * 2], m0, 2
> @@ -1913,13 +1913,13 @@
>      movu            m2,                  [r2]
>      movu            m3,                  [r2 + 16]
>  
> -    paddw           m0,                  m1
> +    paddw           m0,                  m1                     ; dynamic range 13 bits
>      paddw           m2,                  m3
> -    paddw           m0,                  m2
> -    movhlps         m1,                  m0
> -    paddw           m0,                  m1
> -    phaddw          m0,                  m0
> +    paddw           m0,                  m2                     ; dynamic range 14 bits
> +    movhlps         m1,                  m0                     ; dynamic range 15 bits
> +    paddw           m0,                  m1                     ; dynamic range 16 bits
>      pmaddwd         m0,                  [pw_1]
> +    phaddd          m0,                  m0
>  
>      movd            r5d,                 m0
>      add             r5d,                 16
> @@ -1983,11 +1983,11 @@
>      ; filter top
>      movu            m2,                  [r2]
>      paddw           m2,                  m1
> -    psraw           m2,                  2
> +    psrlw           m2,                  2
>      movu            [r6],                m2
>      movu            m3,                  [r2 + 16]
>      paddw           m3,                  m1
> -    psraw           m3,                  2
> +    psrlw           m3,                  2
>      movu            [r6 + 16],           m3
>  
>      ; filter top-left
> @@ -2002,7 +2002,7 @@
>      add             r6,                  r1
>      movu            m2,                  [r3 + 2]
>      paddw           m2,                  m1
> -    psraw           m2,                  2
> +    psrlw           m2,                  2
>  
>      pextrw          [r6],                m2, 0
>      pextrw          [r6 + r1],           m2, 1
> @@ -2019,7 +2019,7 @@
>      lea             r6,                  [r6 + r1 * 2]
>      movu            m3,                  [r3 + 18]
>      paddw           m3,                  m1
> -    psraw           m3,                  2
> +    psrlw           m3,                  2
>  
>      pextrw          [r6],                m3, 0
>      pextrw          [r6 + r1],           m3, 1
> @@ -2046,21 +2046,21 @@
>      movu            m1,                  [r3 + 16]
>      movu            m2,                  [r3 + 32]
>      movu            m3,                  [r3 + 48]
> -    paddw           m0,                  m1
> +    paddw           m0,                  m1             ; dynamic range 13 bits
>      paddw           m2,                  m3
> -    paddw           m0,                  m2
> +    paddw           m0,                  m2             ; dynamic range 14 bits
>      movu            m1,                  [r2]
>      movu            m3,                  [r2 + 16]
>      movu            m4,                  [r2 + 32]
>      movu            m5,                  [r2 + 48]
> -    paddw           m1,                  m3
> +    paddw           m1,                  m3             ; dynamic range 13 bits
>      paddw           m4,                  m5
> -    paddw           m1,                  m4
> -    paddw           m0,                  m1
> +    paddw           m1,                  m4             ; dynamic range 14 bits
> +    paddw           m0,                  m1             ; dynamic range 15 bits
> +    pmaddwd         m0,                  [pw_1]
>      movhlps         m1,                  m0
> -    paddw           m0,                  m1
> -    phaddw          m0,                  m0
> -    pmaddwd         m0,                  [pw_1]
> +    paddd           m0,                  m1
> +    phaddd          m0,                  m0
>  
>      paddd           m0,                  [pd_32]     ; sum = sum + 32
>      psrld           m0,                  6           ; sum = sum / 64
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/ipfilter16.asm
> --- a/source/common/x86/ipfilter16.asm	Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/ipfilter16.asm	Tue Jul 21 14:30:11 2015 -0700
> @@ -26,6 +26,25 @@
>  %include "x86inc.asm"
>  %include "x86util.asm"
>  
> +
> +%define INTERP_OFFSET_PP        pd_32
> +%define INTERP_SHIFT_PP         6
> +
> +%if BIT_DEPTH == 10
> +    %define INTERP_SHIFT_PS         2
> +    %define INTERP_OFFSET_PS        pd_n32768
> +    %define INTERP_SHIFT_SP         10
> +    %define INTERP_OFFSET_SP        pd_524800
> +%elif BIT_DEPTH == 12
> +    %define INTERP_SHIFT_PS         4
> +    %define INTERP_OFFSET_PS        pd_n131072
> +    %define INTERP_SHIFT_SP         8
> +    %define INTERP_OFFSET_SP        pd_524416
> +%else
> +    %error Unsupport bit depth!
> +%endif
> +
> +
>  SECTION_RODATA 32
>  
>  tab_c_32:         times 8 dd 32
> @@ -145,21 +164,9 @@
>  const pb_shuf,  db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
>                  db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
>  
> -%if BIT_DEPTH == 10
> -    %define INTERP_OFFSET_PS        pd_n32768
> -    %define INTERP_SHIFT_PS         2
> -    %define INTERP_OFFSET_SP        pd_524800
> -    %define INTERP_SHIFT_SP         10
> -%elif BIT_DEPTH == 12
> -    %define INTERP_OFFSET_PS        pd_n131072
> -    %define INTERP_SHIFT_PS         4
> -    %define INTERP_OFFSET_SP        pd_524416
> -    %define INTERP_SHIFT_SP         8
> -%else
> -    %error Unsupport bit depth!
> -%endif
>  
>  SECTION .text
> +cextern pd_8
>  cextern pd_32
>  cextern pw_pixel_max
>  cextern pd_524416
> @@ -503,7 +510,7 @@
>  %endif
>  
>  %ifidn %1,pp
> -    mova      m7, [pd_32]
> +    mova      m7, [INTERP_OFFSET_PP]
>  %define SHIFT 6
>  %elifidn %1,ps
>      mova      m7, [INTERP_OFFSET_PS]
> @@ -1176,7 +1183,6 @@
>  %macro FILTER_HOR_LUMA_W4 3
>  INIT_XMM sse4
>  cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
> -
>      mov         r4d, r4m
>      sub         r0, 6
>      shl         r4d, 4
> @@ -1229,7 +1235,7 @@
>      packusdw    m4, m4
>      CLIPW       m4, m6, m7
>  %else
> -    psrad       m4, 2
> +    psrad       m4, INTERP_SHIFT_PS
>      packssdw    m4, m4
>  %endif
>  
> @@ -1287,7 +1293,7 @@
>      mov         r4d, %2
>  %ifidn %3, ps
>      cmp         r5m, byte 0
> -    je          .loopH
> +    je         .loopH
>      lea         r6, [r1 + 2 * r1]
>      sub         r0, r6
>      add         r4d, 7
> @@ -1329,8 +1335,8 @@
>      packusdw    m4, m5
>      CLIPW       m4, m7, [pw_pixel_max]
>  %else
> -    psrad       m4, 2
> -    psrad       m5, 2
> +    psrad       m4, INTERP_SHIFT_PS
> +    psrad       m5, INTERP_SHIFT_PS
>      packssdw    m4, m5
>  %endif
>  
> @@ -1340,7 +1346,7 @@
>      add         r2, r3
>  
>      dec         r4d
> -    jnz         .loopH
> +    jnz        .loopH
>      RET
>  %endmacro
>  
> @@ -1380,7 +1386,7 @@
>      mova        m0, [tab_LumaCoeff + r4]
>  %endif
>  %ifidn %3, pp
> -    mova        m1, [pd_32]
> +    mova        m1, [INTERP_OFFSET_PP]
>  %else
>      mova        m1, [INTERP_OFFSET_PS]
>  %endif
> @@ -1425,14 +1431,14 @@
>      phaddd      m5, m6
>      paddd       m5, m1
>  %ifidn %3, pp
> -    psrad       m4, 6
> -    psrad       m5, 6
> +    psrad       m4, INTERP_SHIFT_PP
> +    psrad       m5, INTERP_SHIFT_PP
>      packusdw    m4, m5
>      pxor        m5, m5
>      CLIPW       m4, m5, [pw_pixel_max]
>  %else
> -    psrad       m4, 2
> -    psrad       m5, 2
> +    psrad       m4, INTERP_SHIFT_PS
> +    psrad       m5, INTERP_SHIFT_PS
>      packssdw    m4, m5
>  %endif
>  
> @@ -1453,12 +1459,12 @@
>      phaddd      m4, m5
>      paddd       m4, m1
>  %ifidn %3, pp
> -    psrad       m4, 6
> +    psrad       m4, INTERP_SHIFT_PP
>      packusdw    m4, m4
>      pxor        m5, m5
>      CLIPW       m4, m5, [pw_pixel_max]
>  %else
> -    psrad       m4, 2
> +    psrad       m4, INTERP_SHIFT_PS
>      packssdw    m4, m4
>  %endif
>  
> @@ -1550,14 +1556,14 @@
>      phaddd      m5, m6
>      paddd       m5, m1
>  %ifidn %3, pp
> -    psrad       m4, 6
> -    psrad       m5, 6
> +    psrad       m4, INTERP_SHIFT_PP
> +    psrad       m5, INTERP_SHIFT_PP
>      packusdw    m4, m5
>      pxor        m5, m5
>      CLIPW       m4, m5, [pw_pixel_max]
>  %else
> -    psrad       m4, 2
> -    psrad       m5, 2
> +    psrad       m4, INTERP_SHIFT_PS
> +    psrad       m5, INTERP_SHIFT_PS
>      packssdw    m4, m5
>  %endif
>      movu        [r2 + x], m4
> @@ -1591,14 +1597,14 @@
>      phaddd      m5, m6
>      paddd       m5, m1
>  %ifidn %3, pp
> -    psrad       m4, 6
> -    psrad       m5, 6
> +    psrad       m4, INTERP_SHIFT_PP
> +    psrad       m5, INTERP_SHIFT_PP
>      packusdw    m4, m5
>      pxor        m5, m5
>      CLIPW       m4, m5, [pw_pixel_max]
>  %else
> -    psrad       m4, 2
> -    psrad       m5, 2
> +    psrad       m4, INTERP_SHIFT_PS
> +    psrad       m5, INTERP_SHIFT_PS
>      packssdw    m4, m5
>  %endif
>      movu        [r2 + 16 + x], m4
> @@ -1743,14 +1749,14 @@
>      phaddd      m5, m6
>      paddd       m5, m1
>  %ifidn %3, pp
> -    psrad       m4, 6
> -    psrad       m5, 6
> +    psrad       m4, INTERP_SHIFT_PP
> +    psrad       m5, INTERP_SHIFT_PP
>      packusdw    m4, m5
>      pxor        m5, m5
>      CLIPW       m4, m5, [pw_pixel_max]
>  %else
> -    psrad       m4, 2
> -    psrad       m5, 2
> +    psrad       m4, INTERP_SHIFT_PS
> +    psrad       m5, INTERP_SHIFT_PS
>      packssdw    m4, m5
>  %endif
>      movu        [r2], m4
> @@ -1784,14 +1790,14 @@
>      phaddd      m5, m6
>      paddd       m5, m1
>  %ifidn %3, pp
> -    psrad       m4, 6
> -    psrad       m5, 6
> +    psrad       m4, INTERP_SHIFT_PP
> +    psrad       m5, INTERP_SHIFT_PP
>      packusdw    m4, m5
>      pxor        m5, m5
>      CLIPW       m4, m5, [pw_pixel_max]
>  %else
> -    psrad       m4, 2
> -    psrad       m5, 2
> +    psrad       m4, INTERP_SHIFT_PS
> +    psrad       m5, INTERP_SHIFT_PS
>      packssdw    m4, m5
>  %endif
>      movu        [r2 + 16], m4
> @@ -1825,14 +1831,14 @@
>      phaddd      m5, m6
>      paddd       m5, m1
>  %ifidn %3, pp
> -    psrad       m4, 6
> -    psrad       m5, 6
> +    psrad       m4, INTERP_SHIFT_PP
> +    psrad       m5, INTERP_SHIFT_PP
>      packusdw    m4, m5
>      pxor        m5, m5
>      CLIPW       m4, m5, [pw_pixel_max]
>  %else
> -    psrad       m4, 2
> -    psrad       m5, 2
> +    psrad       m4, INTERP_SHIFT_PS
> +    psrad       m5, INTERP_SHIFT_PS
>      packssdw    m4, m5
>  %endif
>      movu        [r2 + 32], m4
> @@ -1865,11 +1871,11 @@
>      phaddd      m3,         m4
>      paddd       m3,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> +    psrad       m3,         INTERP_SHIFT_PP
>      packusdw    m3,         m3
>      CLIPW       m3,         m7,    m6
>  %else
> -    psrad       m3,         2
> +    psrad       m3,         INTERP_SHIFT_PS
>      packssdw    m3,         m3
>  %endif
>      movd        [r2],       m3
> @@ -1895,13 +1901,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m7,    m6
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2],       m3
> @@ -1950,7 +1956,7 @@
>      phaddd           m4, m4
>      vpermq           m4, m4, q3120
>      paddd            m4, m6
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -1969,7 +1975,7 @@
>      phaddd           m4, m4
>      vpermq           m4, m4, q3120
>      paddd            m4, m6
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2036,7 +2042,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2064,7 +2070,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2132,7 +2138,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2160,7 +2166,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2232,7 +2238,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2260,7 +2266,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2335,7 +2341,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2363,7 +2369,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2425,7 +2431,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2453,7 +2459,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2481,7 +2487,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2545,7 +2551,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2573,7 +2579,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2601,7 +2607,7 @@
>      phaddd           m4, m5
>      vpermq           m4, m4, q3120
>      paddd            m4, m7
> -    psrad            m4, 6
> +    psrad            m4, INTERP_SHIFT_PP
>  
>      packusdw         m4, m4
>      vpermq           m4, m4, q2020
> @@ -2644,32 +2650,32 @@
>      mova        m1,       [INTERP_OFFSET_PS]
>      cmp         r5m, byte 0
>      je          .skip
> -    sub         r0, r1
> -    movu        m3,         [r0]
> -    pshufb      m3,         m3, m2
> -    pmaddwd     m3,         m0
> -
> -    %if %1 == 4
> -        movu        m4,         [r0 + 4]
> -        pshufb      m4,         m4, m2
> -        pmaddwd     m4,         m0
> -        phaddd      m3,         m4
> -    %else
> -        phaddd      m3,         m3
> -    %endif
> -
> -    paddd       m3,         m1
> -    psrad       m3,         INTERP_SHIFT_PS
> -    packssdw    m3,         m3
> -
> -    %if %1 == 2
> -        movd        [r2],       m3
> -    %else
> -        movh        [r2],       m3
> -    %endif
> -
> -    add         r0, r1
> -    add         r2, r3
> +    sub         r0,       r1
> +    movu        m3,       [r0]
> +    pshufb      m3,       m3, m2
> +    pmaddwd     m3,       m0
> +
> +  %if %1 == 4
> +    movu        m4,       [r0 + 4]
> +    pshufb      m4,       m4, m2
> +    pmaddwd     m4,       m0
> +    phaddd      m3,       m4
> +  %else
> +    phaddd      m3,       m3
> +  %endif
> +
> +    paddd       m3,       m1
> +    psrad       m3,       INTERP_SHIFT_PS
> +    packssdw    m3,       m3
> +
> +  %if %1 == 2
> +    movd        [r2],     m3
> +  %else
> +    movh        [r2],     m3
> +  %endif
> +
> +    add         r0,       r1
> +    add         r2,       r3
>      FILTER_W%1_2 %3
>      lea         r0,       [r0 + 2 * r1]
>      lea         r2,       [r2 + 2 * r3]
> @@ -2689,7 +2695,6 @@
>      lea         r2,       [r2 + 2 * r3]
>      FILTER_W%1_2 %3
>  %endrep
> -
>      RET
>  %endmacro
>  
> @@ -2729,13 +2734,13 @@
>      phaddd      m4,         m4
>      paddd       m4,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m4,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m4,         INTERP_SHIFT_PP
>      packusdw    m3,         m4
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m4,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m4,         INTERP_SHIFT_PS
>      packssdw    m3,         m4
>  %endif
>      movh        [r2],       m3
> @@ -2769,13 +2774,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2],       m3
> @@ -2809,13 +2814,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2],       m3
> @@ -2831,11 +2836,11 @@
>      paddd       m3,         m1
>  
>  %ifidn %1, pp
> -    psrad       m3,         6
> +    psrad       m3,         INTERP_SHIFT_PP
>      packusdw    m3,         m3
>      CLIPW       m3,         m6, m7
>  %else
> -    psrad       m3,         2
> +    psrad       m3,         INTERP_SHIFT_PS
>      packssdw    m3,         m3
>  %endif
>      movh        [r2 + 16],  m3
> @@ -2868,13 +2873,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2],       m3
> @@ -2898,13 +2903,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2 + 16],  m3
> @@ -2938,13 +2943,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2],       m3
> @@ -2968,13 +2973,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2 + 16],  m3
> @@ -2998,13 +3003,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2 + 32],  m3
> @@ -3038,13 +3043,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2],       m3
> @@ -3068,13 +3073,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2 + 16],  m3
> @@ -3098,13 +3103,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2 + 32],  m3
> @@ -3128,13 +3133,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2 + 48],  m3
> @@ -3168,13 +3173,13 @@
>      phaddd      m5,         m4
>      paddd       m5,         m1
>  %ifidn %1, pp
> -    psrad       m3,         6
> -    psrad       m5,         6
> +    psrad       m3,         INTERP_SHIFT_PP
> +    psrad       m5,         INTERP_SHIFT_PP
>      packusdw    m3,         m5
>      CLIPW       m3,         m6,    m7
>  %else
> -    psrad       m3,         2
> -    psrad       m5,         2
> +    psrad       m3,         INTERP_SHIFT_PS
> +    psrad       m5,         INTERP_SHIFT_PS
>      packssdw    m3,         m5
>  %endif
>      movh        [r2 + %2],       m3
> @@ -3408,7 +3413,7 @@
>      pmaddwd         m4, m0
>      phaddd          m3, m4
>      paddd           m3, m2
> -    psrad           m3, 6                         ; m3 = DWORD[7 6 3 2 5 4 1 0]
> +    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
>  
>      packusdw        m3, m3
>      vpermq          m3, m3, q2020
> @@ -3426,7 +3431,7 @@
>      pmaddwd         m4, m0
>      phaddd          m3, m4
>      paddd           m3, m2
> -    psrad           m3, 6                         ; m3 = DWORD[7 6 3 2 5 4 1 0]
> +    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
>  
>      packusdw        m3, m3
>      vpermq          m3, m3, q2020
> @@ -3474,7 +3479,7 @@
>      pmaddwd         m4, m0
>      phaddd          m3, m4
>      paddd           m3, m2
> -    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
> +    psrad           m3, INTERP_SHIFT_PP          ; m3 = DWORD[7 6 3 2 5 4 1 0]
>  
>      packusdw        m3, m3
>      vpermq          m3, m3,q2020
> @@ -3491,7 +3496,7 @@
>      pmaddwd         m4, m0
>      phaddd          m3, m4
>      paddd           m3, m2
> -    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
> +    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
>  
>      packusdw        m3, m3
>      vpermq          m3, m3,q2020
> @@ -4089,7 +4094,7 @@
>      %ifnidn %3, ps
>          mova      m7, [pw_pixel_max]
>          %ifidn %3, pp
> -            mova      m6, [tab_c_32]
> +            mova      m6, [INTERP_OFFSET_PP]
>          %else
>              mova      m6, [INTERP_OFFSET_SP]
>          %endif
> @@ -4129,10 +4134,10 @@
>      paddd     m2, m6
>      paddd     m3, m6
>      %ifidn %3, pp
> -        psrad     m0, 6
> -        psrad     m1, 6
> -        psrad     m2, 6
> -        psrad     m3, 6
> +        psrad     m0, INTERP_SHIFT_PP
> +        psrad     m1, INTERP_SHIFT_PP
> +        psrad     m2, INTERP_SHIFT_PP
> +        psrad     m3, INTERP_SHIFT_PP
>      %else
>          psrad     m0, INTERP_SHIFT_SP
>          psrad     m1, INTERP_SHIFT_SP
> @@ -4344,9 +4349,9 @@
>          pxor      m7, m7
>          mova      m6, [pw_pixel_max]
>          %ifidn %2, pp
> -            mova      m5, [tab_c_32]
> +            mova      m5, [INTERP_OFFSET_PP]
>          %else
> -            mova      m5, [tab_c_524800]
> +            mova      m5, [INTERP_OFFSET_SP]
>          %endif
>      %else
>          mova      m5, [INTERP_OFFSET_PS]
> @@ -4362,18 +4367,18 @@
>  %elifidn %2, ps
>      paddd     m0, m5
>      paddd     m2, m5
> -    psrad     m0, 2
> -    psrad     m2, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
>      packssdw  m0, m2
>  %else
>      paddd     m0, m5
>      paddd     m2, m5
>      %ifidn %2, pp
> -        psrad     m0, 6
> -        psrad     m2, 6
> +        psrad     m0, INTERP_SHIFT_PP
> +        psrad     m2, INTERP_SHIFT_PP
>      %else
> -        psrad     m0, 10
> -        psrad     m2, 10
> +        psrad     m0, INTERP_SHIFT_SP
> +        psrad     m2, INTERP_SHIFT_SP
>      %endif
>      packusdw  m0, m2
>      CLIPW     m0, m7,    m6
> @@ -4389,7 +4394,6 @@
>  
>      dec       r4d
>      jnz       .loopH
> -
>      RET
>  %endmacro
>  
> @@ -4417,7 +4421,6 @@
>  %macro FILTER_VER_CHROMA_W4 3
>  INIT_XMM sse4
>  cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3
> -
>      add        r1d, r1d
>      add        r3d, r3d
>      sub        r0, r1
> @@ -4439,9 +4442,9 @@
>          pxor      m6, m6
>          mova      m5, [pw_pixel_max]
>          %ifidn %2, pp
> -            mova      m4, [tab_c_32]
> +            mova      m4, [INTERP_OFFSET_PP]
>          %else
> -            mova      m4, [tab_c_524800]
> +            mova      m4, [INTERP_OFFSET_SP]
>          %endif
>      %else
>          mova      m4, [INTERP_OFFSET_PS]
> @@ -4479,18 +4482,18 @@
>  %elifidn %2, ps
>      paddd     m0, m4
>      paddd     m1, m4
> -    psrad     m0, 2
> -    psrad     m1, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
>      packssdw  m0, m1
>  %else
>      paddd     m0, m4
>      paddd     m1, m4
>      %ifidn %2, pp
> -        psrad     m0, 6
> -        psrad     m1, 6
> +        psrad     m0, INTERP_SHIFT_PP
> +        psrad     m1, INTERP_SHIFT_PP
>      %else
> -        psrad     m0, 10
> -        psrad     m1, 10
> +        psrad     m0, INTERP_SHIFT_SP
> +        psrad     m1, INTERP_SHIFT_SP
>      %endif
>      packusdw  m0, m1
>      CLIPW     m0, m6,    m5
> @@ -4504,7 +4507,6 @@
>      dec        r4d
>      jnz        .loop
>  %endif
> -
>      RET
>  %endmacro
>  
> @@ -4524,7 +4526,6 @@
>  %macro FILTER_VER_CHROMA_W6 3
>  INIT_XMM sse4
>  cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3
> -
>      add       r1d, r1d
>      add       r3d, r3d
>      sub       r0, r1
> @@ -4543,9 +4544,9 @@
>      %ifnidn %2, ps
>          mova      m7, [pw_pixel_max]
>          %ifidn %2, pp
> -            mova      m6, [tab_c_32]
> +            mova      m6, [INTERP_OFFSET_PP]
>          %else
> -            mova      m6, [tab_c_524800]
> +            mova      m6, [INTERP_OFFSET_SP]
>          %endif
>      %else
>          mova      m6, [INTERP_OFFSET_PS]
> @@ -4568,10 +4569,10 @@
>      paddd     m1, m6
>      paddd     m2, m6
>      paddd     m3, m6
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -4581,15 +4582,15 @@
>      paddd     m2, m6
>      paddd     m3, m6
>      %ifidn %2, pp
> -        psrad     m0, 6
> -        psrad     m1, 6
> -        psrad     m2, 6
> -        psrad     m3, 6
> +        psrad     m0, INTERP_SHIFT_PP
> +        psrad     m1, INTERP_SHIFT_PP
> +        psrad     m2, INTERP_SHIFT_PP
> +        psrad     m3, INTERP_SHIFT_PP
>      %else
> -        psrad     m0, 10
> -        psrad     m1, 10
> -        psrad     m2, 10
> -        psrad     m3, 10
> +        psrad     m0, INTERP_SHIFT_SP
> +        psrad     m1, INTERP_SHIFT_SP
> +        psrad     m2, INTERP_SHIFT_SP
> +        psrad     m3, INTERP_SHIFT_SP
>      %endif
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -4616,18 +4617,18 @@
>  %elifidn %2, ps
>      paddd     m0, m6
>      paddd     m2, m6
> -    psrad     m0, 2
> -    psrad     m2, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
>      packssdw  m0, m2
>  %else
>      paddd     m0, m6
>      paddd     m2, m6
>      %ifidn %2, pp
> -        psrad     m0, 6
> -        psrad     m2, 6
> +        psrad     m0, INTERP_SHIFT_PP
> +        psrad     m2, INTERP_SHIFT_PP
>      %else
> -        psrad     m0, 10
> -        psrad     m2, 10
> +        psrad     m0, INTERP_SHIFT_SP
> +        psrad     m2, INTERP_SHIFT_SP
>      %endif
>      packusdw  m0, m2
>      CLIPW     m0, m5,    m7
> @@ -4644,7 +4645,6 @@
>  
>      dec       r4d
>      jnz       .loopH
> -
>      RET
>  %endmacro
>  
> @@ -4712,7 +4712,7 @@
>      mov       r4d, %2/2
>  
>  %ifidn %3, pp
> -    mova      m7, [tab_c_32]
> +    mova      m7, [INTERP_OFFSET_PP]
>  %elifidn %3, sp
>      mova      m7, [INTERP_OFFSET_SP]
>  %elifidn %3, ps
> @@ -4748,10 +4748,10 @@
>      paddd     m2, m7
>      paddd     m3, m7
>      %ifidn %3, pp
> -        psrad     m0, 6
> -        psrad     m1, 6
> -        psrad     m2, 6
> -        psrad     m3, 6
> +        psrad     m0, INTERP_SHIFT_PP
> +        psrad     m1, INTERP_SHIFT_PP
> +        psrad     m2, INTERP_SHIFT_PP
> +        psrad     m3, INTERP_SHIFT_PP
>      %else
>          psrad     m0, INTERP_SHIFT_SP
>          psrad     m1, INTERP_SHIFT_SP
> @@ -4772,7 +4772,6 @@
>  
>      dec       r4d
>      jnz       .loopH
> -
>      RET
>  %endmacro
>  
> @@ -4868,9 +4867,9 @@
>      mov             r6d, %1/4
>  
>  %ifidn %2,pp
> -    vbroadcasti128  m8, [pd_32]
> +    vbroadcasti128  m8, [INTERP_OFFSET_PP]
>  %elifidn %2, sp
> -    mova            m8, [pd_524800]
> +    mova            m8, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m8, [INTERP_OFFSET_PS]
>  %endif
> @@ -4934,20 +4933,20 @@
>      paddd           m2, m8
>      paddd           m3, m8
>  %ifidn %2,pp
> -    psrad           m0, 6
> -    psrad           m1, 6
> -    psrad           m2, 6
> -    psrad           m3, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
> +    psrad           m3, INTERP_SHIFT_PP
>  %elifidn %2, sp
> -    psrad           m0, 10
> -    psrad           m1, 10
> -    psrad           m2, 10
> -    psrad           m3, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m1, 2
> -    psrad           m2, 2
> -    psrad           m3, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +    psrad           m3, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
> +    psrad           m3, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -5012,9 +5011,9 @@
>      mov       r4d, %1/2
>  
>  %ifidn %2, pp
> -    mova      m7, [tab_c_32]
> +    mova      m7, [INTERP_OFFSET_PP]
>  %elifidn %2, sp
> -    mova      m7, [pd_524800]
> +    mova      m7, [INTERP_OFFSET_SP]
>  %elifidn %2, ps
>      mova      m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -5034,10 +5033,10 @@
>      paddd     m1, m7
>      paddd     m2, m7
>      paddd     m3, m7
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5047,15 +5046,15 @@
>      paddd     m2, m7
>      paddd     m3, m7
>   %ifidn %2, pp
> -    psrad     m0, 6
> -    psrad     m1, 6
> -    psrad     m2, 6
> -    psrad     m3, 6
> -%else
> -    psrad     m0, 10
> -    psrad     m1, 10
> -    psrad     m2, 10
> -    psrad     m3, 10
> +    psrad     m0, INTERP_SHIFT_PP
> +    psrad     m1, INTERP_SHIFT_PP
> +    psrad     m2, INTERP_SHIFT_PP
> +    psrad     m3, INTERP_SHIFT_PP
> +%else
> +    psrad     m0, INTERP_SHIFT_SP
> +    psrad     m1, INTERP_SHIFT_SP
> +    psrad     m2, INTERP_SHIFT_SP
> +    psrad     m3, INTERP_SHIFT_SP
>  %endif
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5184,9 +5183,9 @@
>      mov       r4d, %1/2
>  
>  %ifidn %2, pp
> -    mova      m7, [tab_c_32]
> +    mova      m7, [INTERP_OFFSET_PP]
>  %elifidn %2, sp
> -    mova      m7, [pd_524800]
> +    mova      m7, [INTERP_OFFSET_SP]
>  %elifidn %2, ps
>      mova      m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -5213,18 +5212,18 @@
>      paddd     m1, m7
>      paddd     m2, m7
>      paddd     m3, m7
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>      paddd     m8, m7
>      paddd     m9, m7
>      paddd     m10, m7
>      paddd     m11, m7
> -    psrad     m8, 2
> -    psrad     m9, 2
> -    psrad     m10, 2
> -    psrad     m11, 2
> +    psrad     m8, INTERP_SHIFT_PS
> +    psrad     m9, INTERP_SHIFT_PS
> +    psrad     m10, INTERP_SHIFT_PS
> +    psrad     m11, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5240,23 +5239,23 @@
>      paddd     m10, m7
>      paddd     m11, m7
>   %ifidn %2, pp
> -    psrad     m0, 6
> -    psrad     m1, 6
> -    psrad     m2, 6
> -    psrad     m3, 6
> -    psrad     m8, 6
> -    psrad     m9, 6
> -    psrad     m10, 6
> -    psrad     m11, 6
> -%else
> -    psrad     m0, 10
> -    psrad     m1, 10
> -    psrad     m2, 10
> -    psrad     m3, 10
> -    psrad     m8, 10
> -    psrad     m9, 10
> -    psrad     m10, 10
> -    psrad     m11, 10
> +    psrad     m0, INTERP_SHIFT_PP
> +    psrad     m1, INTERP_SHIFT_PP
> +    psrad     m2, INTERP_SHIFT_PP
> +    psrad     m3, INTERP_SHIFT_PP
> +    psrad     m8, INTERP_SHIFT_PP
> +    psrad     m9, INTERP_SHIFT_PP
> +    psrad     m10, INTERP_SHIFT_PP
> +    psrad     m11, INTERP_SHIFT_PP
> +%else
> +    psrad     m0, INTERP_SHIFT_SP
> +    psrad     m1, INTERP_SHIFT_SP
> +    psrad     m2, INTERP_SHIFT_SP
> +    psrad     m3, INTERP_SHIFT_SP
> +    psrad     m8, INTERP_SHIFT_SP
> +    psrad     m9, INTERP_SHIFT_SP
> +    psrad     m10, INTERP_SHIFT_SP
> +    psrad     m11, INTERP_SHIFT_SP
>  %endif
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5326,9 +5325,9 @@
>      mov       r4d, %1/2
>  
>  %ifidn %2, pp
> -    mova      m7, [tab_c_32]
> +    mova      m7, [INTERP_OFFSET_PP]
>  %elifidn %2, sp
> -    mova      m7, [pd_524800]
> +    mova      m7, [INTERP_OFFSET_SP]
>  %elifidn %2, ps
>      mova      m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -5380,10 +5379,10 @@
>      paddd     m1, m7
>      paddd     m2, m7
>      paddd     m3, m7
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5393,15 +5392,15 @@
>      paddd     m2, m7
>      paddd     m3, m7
>  %ifidn %2, pp
> -    psrad     m0, 6
> -    psrad     m1, 6
> -    psrad     m2, 6
> -    psrad     m3, 6
> -%else
> -    psrad     m0, 10
> -    psrad     m1, 10
> -    psrad     m2, 10
> -    psrad     m3, 10
> +    psrad     m0, INTERP_SHIFT_PP
> +    psrad     m1, INTERP_SHIFT_PP
> +    psrad     m2, INTERP_SHIFT_PP
> +    psrad     m3, INTERP_SHIFT_PP
> +%else
> +    psrad     m0, INTERP_SHIFT_SP
> +    psrad     m1, INTERP_SHIFT_SP
> +    psrad     m2, INTERP_SHIFT_SP
> +    psrad     m3, INTERP_SHIFT_SP
>  %endif
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5457,9 +5456,9 @@
>      mov       r4d, %1/2
>  
>  %ifidn %2, pp
> -    mova      m7, [tab_c_32]
> +    mova      m7, [INTERP_OFFSET_PP]
>  %elifidn %2, sp
> -    mova      m7, [pd_524800]
> +    mova      m7, [INTERP_OFFSET_SP]
>  %elifidn %2, ps
>      mova      m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -5479,10 +5478,10 @@
>      paddd     m1, m7
>      paddd     m2, m7
>      paddd     m3, m7
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5492,15 +5491,15 @@
>      paddd     m2, m7
>      paddd     m3, m7
>   %ifidn %2, pp
> -    psrad     m0, 6
> -    psrad     m1, 6
> -    psrad     m2, 6
> -    psrad     m3, 6
> -%else
> -    psrad     m0, 10
> -    psrad     m1, 10
> -    psrad     m2, 10
> -    psrad     m3, 10
> +    psrad     m0, INTERP_SHIFT_PP
> +    psrad     m1, INTERP_SHIFT_PP
> +    psrad     m2, INTERP_SHIFT_PP
> +    psrad     m3, INTERP_SHIFT_PP
> +%else
> +    psrad     m0, INTERP_SHIFT_SP
> +    psrad     m1, INTERP_SHIFT_SP
> +    psrad     m2, INTERP_SHIFT_SP
> +    psrad     m3, INTERP_SHIFT_SP
>  %endif
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5610,9 +5609,9 @@
>      mov       r4d, %1/2
>  
>  %ifidn %2, pp
> -    mova      m7, [tab_c_32]
> +    mova      m7, [INTERP_OFFSET_PP]
>  %elifidn %2, sp
> -    mova      m7, [pd_524800]
> +    mova      m7, [INTERP_OFFSET_SP]
>  %elifidn %2, ps
>      mova      m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -5639,18 +5638,18 @@
>      paddd     m1, m7
>      paddd     m2, m7
>      paddd     m3, m7
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>      paddd     m8, m7
>      paddd     m9, m7
>      paddd     m10, m7
>      paddd     m11, m7
> -    psrad     m8, 2
> -    psrad     m9, 2
> -    psrad     m10, 2
> -    psrad     m11, 2
> +    psrad     m8, INTERP_SHIFT_PS
> +    psrad     m9, INTERP_SHIFT_PS
> +    psrad     m10, INTERP_SHIFT_PS
> +    psrad     m11, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5666,23 +5665,23 @@
>      paddd     m10, m7
>      paddd     m11, m7
>   %ifidn %2, pp
> -    psrad     m0, 6
> -    psrad     m1, 6
> -    psrad     m2, 6
> -    psrad     m3, 6
> -    psrad     m8, 6
> -    psrad     m9, 6
> -    psrad     m10, 6
> -    psrad     m11, 6
> -%else
> -    psrad     m0, 10
> -    psrad     m1, 10
> -    psrad     m2, 10
> -    psrad     m3, 10
> -    psrad     m8, 10
> -    psrad     m9, 10
> -    psrad     m10, 10
> -    psrad     m11, 10
> +    psrad     m0, INTERP_SHIFT_PP
> +    psrad     m1, INTERP_SHIFT_PP
> +    psrad     m2, INTERP_SHIFT_PP
> +    psrad     m3, INTERP_SHIFT_PP
> +    psrad     m8, INTERP_SHIFT_PP
> +    psrad     m9, INTERP_SHIFT_PP
> +    psrad     m10, INTERP_SHIFT_PP
> +    psrad     m11, INTERP_SHIFT_PP
> +%else
> +    psrad     m0, INTERP_SHIFT_SP
> +    psrad     m1, INTERP_SHIFT_SP
> +    psrad     m2, INTERP_SHIFT_SP
> +    psrad     m3, INTERP_SHIFT_SP
> +    psrad     m8, INTERP_SHIFT_SP
> +    psrad     m9, INTERP_SHIFT_SP
> +    psrad     m10, INTERP_SHIFT_SP
> +    psrad     m11, INTERP_SHIFT_SP
>  %endif
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5733,9 +5732,9 @@
>      mov       r4d, 32
>  
>  %ifidn %1, pp
> -    mova      m7, [tab_c_32]
> +    mova      m7, [INTERP_OFFSET_PP]
>  %elifidn %1, sp
> -    mova      m7, [pd_524800]
> +    mova      m7, [INTERP_OFFSET_SP]
>  %elifidn %1, ps
>      mova      m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -5787,10 +5786,10 @@
>      paddd     m1, m7
>      paddd     m2, m7
>      paddd     m3, m7
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5800,15 +5799,15 @@
>      paddd     m2, m7
>      paddd     m3, m7
>  %ifidn %1, pp
> -    psrad     m0, 6
> -    psrad     m1, 6
> -    psrad     m2, 6
> -    psrad     m3, 6
> -%else
> -    psrad     m0, 10
> -    psrad     m1, 10
> -    psrad     m2, 10
> -    psrad     m3, 10
> +    psrad     m0, INTERP_SHIFT_PP
> +    psrad     m1, INTERP_SHIFT_PP
> +    psrad     m2, INTERP_SHIFT_PP
> +    psrad     m3, INTERP_SHIFT_PP
> +%else
> +    psrad     m0, INTERP_SHIFT_SP
> +    psrad     m1, INTERP_SHIFT_SP
> +    psrad     m2, INTERP_SHIFT_SP
> +    psrad     m3, INTERP_SHIFT_SP
>  %endif
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5827,6 +5826,7 @@
>      jnz       .loopH
>      RET
>  %endmacro
> +
>      FILTER_VER_CHROMA_W16_48x64_avx2 pp, 8
>      FILTER_VER_CHROMA_W16_48x64_avx2 ps, 8
>      FILTER_VER_CHROMA_W16_48x64_avx2 ss, 7
> @@ -5834,7 +5834,6 @@
>  
>  INIT_XMM sse2
>  cglobal chroma_p2s, 3, 7, 3
> -
>      ; load width and height
>      mov         r3d, r3m
>      mov         r4d, r4m
> @@ -5850,11 +5849,11 @@
>      lea         r6, [r0 + r5 * 2]
>  
>      movu        m0, [r6]
> -    psllw       m0, 4
> +    psllw       m0, (14 - BIT_DEPTH)
>      paddw       m0, m2
>  
>      movu        m1, [r6 + r1]
> -    psllw       m1, 4
> +    psllw       m1, (14 - BIT_DEPTH)
>      paddw       m1, m2
>  
>      add         r5d, 8
> @@ -5887,7 +5886,6 @@
>  
>      sub         r4d, 2
>      jnz         .loopH
> -
>      RET
>  
>  %macro PROCESS_LUMA_VER_W4_4R 0
> @@ -5975,7 +5973,7 @@
>      lea       r6, [tab_LumaCoeffV + r4]
>  %endif
>  
> -    mova      m7, [pd_32]
> +    mova      m7, [INTERP_OFFSET_PP]
>  
>      mov       dword [rsp], %2/4
>  .loopH:
> @@ -5988,10 +5986,10 @@
>      paddd     m2, m7
>      paddd     m3, m7
>  
> -    psrad     m0, 6
> -    psrad     m1, 6
> -    psrad     m2, 6
> -    psrad     m3, 6
> +    psrad     m0, INTERP_SHIFT_PP
> +    psrad     m1, INTERP_SHIFT_PP
> +    psrad     m2, INTERP_SHIFT_PP
> +    psrad     m3, INTERP_SHIFT_PP
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -6017,7 +6015,6 @@
>  
>      dec       dword [rsp]
>      jnz       .loopH
> -
>      RET
>  %endmacro
>  
> @@ -6126,14 +6123,14 @@
>      paddd           m0, m6
>      paddd           m2, m6
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m2, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m2, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m2, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -6294,20 +6291,20 @@
>      paddd           m2, m11
>      paddd           m3, m11
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m1, 6
> -    psrad           m2, 6
> -    psrad           m3, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
> +    psrad           m3, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m1, 10
> -    psrad           m2, 10
> -    psrad           m3, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m1, 2
> -    psrad           m2, 2
> -    psrad           m3, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +    psrad           m3, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
> +    psrad           m3, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -6365,20 +6362,20 @@
>      paddd           m6, m11
>      paddd           m7, m11
>  %ifidn %1,pp
> -    psrad           m4, 6
> -    psrad           m5, 6
> -    psrad           m6, 6
> -    psrad           m7, 6
> +    psrad           m4, INTERP_SHIFT_PP
> +    psrad           m5, INTERP_SHIFT_PP
> +    psrad           m6, INTERP_SHIFT_PP
> +    psrad           m7, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m4, 10
> -    psrad           m5, 10
> -    psrad           m6, 10
> -    psrad           m7, 10
> -%else
> -    psrad           m4, 2
> -    psrad           m5, 2
> -    psrad           m6, 2
> -    psrad           m7, 2
> +    psrad           m4, INTERP_SHIFT_SP
> +    psrad           m5, INTERP_SHIFT_SP
> +    psrad           m6, INTERP_SHIFT_SP
> +    psrad           m7, INTERP_SHIFT_SP
> +%else
> +    psrad           m4, INTERP_SHIFT_PS
> +    psrad           m5, INTERP_SHIFT_PS
> +    psrad           m6, INTERP_SHIFT_PS
> +    psrad           m7, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -6538,26 +6535,26 @@
>      paddd           m4, m14
>      paddd           m5, m14
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m1, 6
> -    psrad           m2, 6
> -    psrad           m3, 6
> -    psrad           m4, 6
> -    psrad           m5, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
> +    psrad           m3, INTERP_SHIFT_PP
> +    psrad           m4, INTERP_SHIFT_PP
> +    psrad           m5, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m1, 10
> -    psrad           m2, 10
> -    psrad           m3, 10
> -    psrad           m4, 10
> -    psrad           m5, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m1, 2
> -    psrad           m2, 2
> -    psrad           m3, 2
> -    psrad           m4, 2
> -    psrad           m5, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +    psrad           m3, INTERP_SHIFT_SP
> +    psrad           m4, INTERP_SHIFT_SP
> +    psrad           m5, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
> +    psrad           m3, INTERP_SHIFT_PS
> +    psrad           m4, INTERP_SHIFT_PS
> +    psrad           m5, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -6620,14 +6617,14 @@
>      paddd           m6, m14
>      paddd           m7, m14
>  %ifidn %1,pp
> -    psrad           m6, 6
> -    psrad           m7, 6
> +    psrad           m6, INTERP_SHIFT_PP
> +    psrad           m7, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m6, 10
> -    psrad           m7, 10
> -%else
> -    psrad           m6, 2
> -    psrad           m7, 2
> +    psrad           m6, INTERP_SHIFT_SP
> +    psrad           m7, INTERP_SHIFT_SP
> +%else
> +    psrad           m6, INTERP_SHIFT_PS
> +    psrad           m7, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -6734,32 +6731,32 @@
>      paddd           m0, m14
>      paddd           m1, m14
>  %ifidn %1,pp
> -    psrad           m8, 6
> -    psrad           m9, 6
> -    psrad           m10, 6
> -    psrad           m11, 6
> -    psrad           m12, 6
> -    psrad           m13, 6
> -    psrad           m0, 6
> -    psrad           m1, 6
> +    psrad           m8, INTERP_SHIFT_PP
> +    psrad           m9, INTERP_SHIFT_PP
> +    psrad           m10, INTERP_SHIFT_PP
> +    psrad           m11, INTERP_SHIFT_PP
> +    psrad           m12, INTERP_SHIFT_PP
> +    psrad           m13, INTERP_SHIFT_PP
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m8, 10
> -    psrad           m9, 10
> -    psrad           m10, 10
> -    psrad           m11, 10
> -    psrad           m12, 10
> -    psrad           m13, 10
> -    psrad           m0, 10
> -    psrad           m1, 10
> -%else
> -    psrad           m8, 2
> -    psrad           m9, 2
> -    psrad           m10, 2
> -    psrad           m11, 2
> -    psrad           m12, 2
> -    psrad           m13, 2
> -    psrad           m0, 2
> -    psrad           m1, 2
> +    psrad           m8, INTERP_SHIFT_SP
> +    psrad           m9, INTERP_SHIFT_SP
> +    psrad           m10, INTERP_SHIFT_SP
> +    psrad           m11, INTERP_SHIFT_SP
> +    psrad           m12, INTERP_SHIFT_SP
> +    psrad           m13, INTERP_SHIFT_SP
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +%else
> +    psrad           m8, INTERP_SHIFT_PS
> +    psrad           m9, INTERP_SHIFT_PS
> +    psrad           m10, INTERP_SHIFT_PS
> +    psrad           m11, INTERP_SHIFT_PS
> +    psrad           m12, INTERP_SHIFT_PS
> +    psrad           m13, INTERP_SHIFT_PS
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -6819,7 +6816,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m14, [pd_32]
>  %elifidn %1, sp
> -    mova            m14, [pd_524800]
> +    mova            m14, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m14, [INTERP_OFFSET_PS]
>  %endif
> @@ -6870,7 +6867,7 @@
>  %ifidn %3,pp
>      vbroadcasti128  m14, [pd_32]
>  %elifidn %3, sp
> -    mova            m14, [pd_524800]
> +    mova            m14, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m14, [INTERP_OFFSET_PS]
>  %endif
> @@ -6953,7 +6950,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m14, [pd_32]
>  %elifidn %1, sp
> -    mova            m14, [pd_524800]
> +    mova            m14, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m14, [INTERP_OFFSET_PS]
>  %endif
> @@ -7089,26 +7086,26 @@
>      paddd           m4, m14
>      paddd           m5, m14
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m1, 6
> -    psrad           m2, 6
> -    psrad           m3, 6
> -    psrad           m4, 6
> -    psrad           m5, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
> +    psrad           m3, INTERP_SHIFT_PP
> +    psrad           m4, INTERP_SHIFT_PP
> +    psrad           m5, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m1, 10
> -    psrad           m2, 10
> -    psrad           m3, 10
> -    psrad           m4, 10
> -    psrad           m5, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m1, 2
> -    psrad           m2, 2
> -    psrad           m3, 2
> -    psrad           m4, 2
> -    psrad           m5, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +    psrad           m3, INTERP_SHIFT_SP
> +    psrad           m4, INTERP_SHIFT_SP
> +    psrad           m5, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
> +    psrad           m3, INTERP_SHIFT_PS
> +    psrad           m4, INTERP_SHIFT_PS
> +    psrad           m5, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -7171,14 +7168,14 @@
>      paddd           m6, m14
>      paddd           m7, m14
>  %ifidn %1,pp
> -    psrad           m6, 6
> -    psrad           m7, 6
> +    psrad           m6, INTERP_SHIFT_PP
> +    psrad           m7, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m6, 10
> -    psrad           m7, 10
> -%else
> -    psrad           m6, 2
> -    psrad           m7, 2
> +    psrad           m6, INTERP_SHIFT_SP
> +    psrad           m7, INTERP_SHIFT_SP
> +%else
> +    psrad           m6, INTERP_SHIFT_PS
> +    psrad           m7, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -7285,32 +7282,32 @@
>      paddd           m0, m14
>      paddd           m1, m14
>  %ifidn %1,pp
> -    psrad           m8, 6
> -    psrad           m9, 6
> -    psrad           m10, 6
> -    psrad           m11, 6
> -    psrad           m12, 6
> -    psrad           m13, 6
> -    psrad           m0, 6
> -    psrad           m1, 6
> +    psrad           m8, INTERP_SHIFT_PP
> +    psrad           m9, INTERP_SHIFT_PP
> +    psrad           m10, INTERP_SHIFT_PP
> +    psrad           m11, INTERP_SHIFT_PP
> +    psrad           m12, INTERP_SHIFT_PP
> +    psrad           m13, INTERP_SHIFT_PP
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m8, 10
> -    psrad           m9, 10
> -    psrad           m10, 10
> -    psrad           m11, 10
> -    psrad           m12, 10
> -    psrad           m13, 10
> -    psrad           m0, 10
> -    psrad           m1, 10
> -%else
> -    psrad           m8, 2
> -    psrad           m9, 2
> -    psrad           m10, 2
> -    psrad           m11, 2
> -    psrad           m12, 2
> -    psrad           m13, 2
> -    psrad           m0, 2
> -    psrad           m1, 2
> +    psrad           m8, INTERP_SHIFT_SP
> +    psrad           m9, INTERP_SHIFT_SP
> +    psrad           m10, INTERP_SHIFT_SP
> +    psrad           m11, INTERP_SHIFT_SP
> +    psrad           m12, INTERP_SHIFT_SP
> +    psrad           m13, INTERP_SHIFT_SP
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +%else
> +    psrad           m8, INTERP_SHIFT_PS
> +    psrad           m9, INTERP_SHIFT_PS
> +    psrad           m10, INTERP_SHIFT_PS
> +    psrad           m11, INTERP_SHIFT_PS
> +    psrad           m12, INTERP_SHIFT_PS
> +    psrad           m13, INTERP_SHIFT_PS
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -7485,26 +7482,26 @@
>      paddd           m4, m11
>      paddd           m5, m11
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m1, 6
> -    psrad           m2, 6
> -    psrad           m3, 6
> -    psrad           m4, 6
> -    psrad           m5, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
> +    psrad           m3, INTERP_SHIFT_PP
> +    psrad           m4, INTERP_SHIFT_PP
> +    psrad           m5, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m1, 10
> -    psrad           m2, 10
> -    psrad           m3, 10
> -    psrad           m4, 10
> -    psrad           m5, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m1, 2
> -    psrad           m2, 2
> -    psrad           m3, 2
> -    psrad           m4, 2
> -    psrad           m5, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +    psrad           m3, INTERP_SHIFT_SP
> +    psrad           m4, INTERP_SHIFT_SP
> +    psrad           m5, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
> +    psrad           m3, INTERP_SHIFT_PS
> +    psrad           m4, INTERP_SHIFT_PS
> +    psrad           m5, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -7556,14 +7553,14 @@
>      paddd           m6, m11
>      paddd           m7, m11
>  %ifidn %1,pp
> -    psrad           m6, 6
> -    psrad           m7, 6
> +    psrad           m6, INTERP_SHIFT_PP
> +    psrad           m7, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m6, 10
> -    psrad           m7, 10
> -%else
> -    psrad           m6, 2
> -    psrad           m7, 2
> +    psrad           m6, INTERP_SHIFT_SP
> +    psrad           m7, INTERP_SHIFT_SP
> +%else
> +    psrad           m6, INTERP_SHIFT_PS
> +    psrad           m7, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -7600,7 +7597,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m11, [pd_32]
>  %elifidn %1, sp
> -    mova            m11, [pd_524800]
> +    mova            m11, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m11, [INTERP_OFFSET_PS]
>  %endif
> @@ -7647,7 +7644,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m14, [pd_32]
>  %elifidn %1, sp
> -    mova            m14, [pd_524800]
> +    mova            m14, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m14, [INTERP_OFFSET_PS]
>  %endif
> @@ -7765,20 +7762,20 @@
>      paddd           m2, m7
>      paddd           m3, m7
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m1, 6
> -    psrad           m2, 6
> -    psrad           m3, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
> +    psrad           m3, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m1, 10
> -    psrad           m2, 10
> -    psrad           m3, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m1, 2
> -    psrad           m2, 2
> -    psrad           m3, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +    psrad           m3, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
> +    psrad           m3, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -7801,7 +7798,7 @@
>  
>  %macro FILTER_VER_LUMA_AVX2_16x4 1
>  INIT_YMM avx2
> -cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize
> +cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0-gprsize
>      mov             r4d, r4m
>      shl             r4d, 7
>      add             r1d, r1d
> @@ -7819,7 +7816,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m7, [pd_32]
>  %elifidn %1, sp
> -    mova            m7, [pd_524800]
> +    mova            m7, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -7864,7 +7861,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m7, [pd_32]
>  %elifidn %1, sp
> -    mova            m7, [pd_524800]
> +    mova            m7, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -7904,7 +7901,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m14, [pd_32]
>  %elifidn %1, sp
> -    mova            m14, [pd_524800]
> +    mova            m14, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m14, [INTERP_OFFSET_PS]
>  %endif
> @@ -8014,20 +8011,20 @@
>      paddd           m2, m14
>      paddd           m3, m14
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m1, 6
> -    psrad           m2, 6
> -    psrad           m3, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
> +    psrad           m3, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m1, 10
> -    psrad           m2, 10
> -    psrad           m3, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m1, 2
> -    psrad           m2, 2
> -    psrad           m3, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +    psrad           m3, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
> +    psrad           m3, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -8105,20 +8102,20 @@
>      paddd           m6, m14
>      paddd           m7, m14
>  %ifidn %1,pp
> -    psrad           m4, 6
> -    psrad           m5, 6
> -    psrad           m6, 6
> -    psrad           m7, 6
> +    psrad           m4, INTERP_SHIFT_PP
> +    psrad           m5, INTERP_SHIFT_PP
> +    psrad           m6, INTERP_SHIFT_PP
> +    psrad           m7, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m4, 10
> -    psrad           m5, 10
> -    psrad           m6, 10
> -    psrad           m7, 10
> -%else
> -    psrad           m4, 2
> -    psrad           m5, 2
> -    psrad           m6, 2
> -    psrad           m7, 2
> +    psrad           m4, INTERP_SHIFT_SP
> +    psrad           m5, INTERP_SHIFT_SP
> +    psrad           m6, INTERP_SHIFT_SP
> +    psrad           m7, INTERP_SHIFT_SP
> +%else
> +    psrad           m4, INTERP_SHIFT_PS
> +    psrad           m5, INTERP_SHIFT_PS
> +    psrad           m6, INTERP_SHIFT_PS
> +    psrad           m7, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -8182,20 +8179,20 @@
>      paddd           m10, m14
>      paddd           m11, m14
>  %ifidn %1,pp
> -    psrad           m8, 6
> -    psrad           m9, 6
> -    psrad           m10, 6
> -    psrad           m11, 6
> +    psrad           m8, INTERP_SHIFT_PP
> +    psrad           m9, INTERP_SHIFT_PP
> +    psrad           m10, INTERP_SHIFT_PP
> +    psrad           m11, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m8, 10
> -    psrad           m9, 10
> -    psrad           m10, 10
> -    psrad           m11, 10
> -%else
> -    psrad           m8, 2
> -    psrad           m9, 2
> -    psrad           m10, 2
> -    psrad           m11, 2
> +    psrad           m8, INTERP_SHIFT_SP
> +    psrad           m9, INTERP_SHIFT_SP
> +    psrad           m10, INTERP_SHIFT_SP
> +    psrad           m11, INTERP_SHIFT_SP
> +%else
> +    psrad           m8, INTERP_SHIFT_PS
> +    psrad           m9, INTERP_SHIFT_PS
> +    psrad           m10, INTERP_SHIFT_PS
> +    psrad           m11, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -8251,7 +8248,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m7, [pd_32]
>  %elifidn %1, sp
> -    mova            m7, [pd_524800]
> +    mova            m7, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -8315,14 +8312,14 @@
>      paddd           m0, m7
>      paddd           m2, m7
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m2, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m2, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m2, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -8366,14 +8363,14 @@
>      paddd           m4, m7
>      paddd           m1, m7
>  %ifidn %1,pp
> -    psrad           m4, 6
> -    psrad           m1, 6
> +    psrad           m4, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m4, 10
> -    psrad           m1, 10
> -%else
> -    psrad           m4, 2
> -    psrad           m1, 2
> +    psrad           m4, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +%else
> +    psrad           m4, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -8458,14 +8455,14 @@
>      paddd           m0, m7
>      paddd           m2, m7
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m2, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m2, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m2, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m2, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m2, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m2, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -8516,14 +8513,14 @@
>      paddd           m4, m7
>      paddd           m1, m7
>  %ifidn %1,pp
> -    psrad           m4, 6
> -    psrad           m1, 6
> +    psrad           m4, INTERP_SHIFT_PP
> +    psrad           m1, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m4, 10
> -    psrad           m1, 10
> -%else
> -    psrad           m4, 2
> -    psrad           m1, 2
> +    psrad           m4, INTERP_SHIFT_SP
> +    psrad           m1, INTERP_SHIFT_SP
> +%else
> +    psrad           m4, INTERP_SHIFT_PS
> +    psrad           m1, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -8574,14 +8571,14 @@
>      paddd           m6, m7
>      paddd           m5, m7
>  %ifidn %1,pp
> -    psrad           m6, 6
> -    psrad           m5, 6
> +    psrad           m6, INTERP_SHIFT_PP
> +    psrad           m5, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m6, 10
> -    psrad           m5, 10
> -%else
> -    psrad           m6, 2
> -    psrad           m5, 2
> +    psrad           m6, INTERP_SHIFT_SP
> +    psrad           m5, INTERP_SHIFT_SP
> +%else
> +    psrad           m6, INTERP_SHIFT_PS
> +    psrad           m5, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -8625,14 +8622,14 @@
>      paddd           m0, m7
>      paddd           m3, m7
>  %ifidn %1,pp
> -    psrad           m0, 6
> -    psrad           m3, 6
> +    psrad           m0, INTERP_SHIFT_PP
> +    psrad           m3, INTERP_SHIFT_PP
>  %elifidn %1, sp
> -    psrad           m0, 10
> -    psrad           m3, 10
> -%else
> -    psrad           m0, 2
> -    psrad           m3, 2
> +    psrad           m0, INTERP_SHIFT_SP
> +    psrad           m3, INTERP_SHIFT_SP
> +%else
> +    psrad           m0, INTERP_SHIFT_PS
> +    psrad           m3, INTERP_SHIFT_PS
>  %endif
>  %endif
>  
> @@ -8671,7 +8668,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m7, [pd_32]
>  %elifidn %1, sp
> -    mova            m7, [pd_524800]
> +    mova            m7, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -8706,7 +8703,7 @@
>  %ifidn %1,pp
>      vbroadcasti128  m14, [pd_32]
>  %elifidn %1, sp
> -    mova            m14, [pd_524800]
> +    mova            m14, [INTERP_OFFSET_SP]
>  %else
>      vbroadcasti128  m14, [INTERP_OFFSET_PS]
>  %endif
> @@ -8758,10 +8755,10 @@
>      paddd     m2, m7
>      paddd     m3, m7
>  
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -8784,7 +8781,6 @@
>  
>      dec       dword [rsp]
>      jnz       .loopH
> -
>      RET
>  %endmacro
>  
> @@ -8837,7 +8833,7 @@
>      lea       r6, [tab_LumaCoeffV + r4]
>  %endif
>  
> -    mova      m7, [tab_c_524800]
> +    mova      m7, [INTERP_OFFSET_SP]
>  
>      mov       dword [rsp], %2/4
>  .loopH:
> @@ -8850,10 +8846,10 @@
>      paddd     m2, m7
>      paddd     m3, m7
>  
> -    psrad     m0, 10
> -    psrad     m1, 10
> -    psrad     m2, 10
> -    psrad     m3, 10
> +    psrad     m0, INTERP_SHIFT_SP
> +    psrad     m1, INTERP_SHIFT_SP
> +    psrad     m2, INTERP_SHIFT_SP
> +    psrad     m3, INTERP_SHIFT_SP
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -8879,7 +8875,6 @@
>  
>      dec       dword [rsp]
>      jnz       .loopH
> -
>      RET
>  %endmacro
>  
> @@ -8963,7 +8958,6 @@
>  
>      dec        dword [rsp]
>      jnz        .loopH
> -
>      RET
>  %endmacro
>  
> @@ -9011,7 +9005,7 @@
>  %rep %1/4
>      movd       m0, [r0]
>      movhps     m0, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m1
>  
>      movd       [r2 + r3 * 0], m0
> @@ -9019,7 +9013,7 @@
>  
>      movd       m0, [r0 + r1 * 2]
>      movhps     m0, [r0 + r4]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m1
>  
>      movd       [r2 + r3 * 2], m0
> @@ -10293,14 +10287,13 @@
>      mov                         r4d,               r4m
>      add                         r1d,               r1d
>      add                         r3d,               r3d
> -%ifdef PIC
> -
> +
> +%ifdef PIC
>      lea                         r6,                [tab_LumaCoeff]
> -    lea                         r4 ,               [r4 * 8]
> +    lea                         r4,                [r4 * 8]
>      vbroadcasti128              m0,                [r6 + r4 * 2]
> -
> -%else
> -    lea                         r4 ,                [r4 * 8]
> +%else
> +    lea                         r4,                [r4 * 8]
>      vbroadcasti128              m0,                [tab_LumaCoeff + r4 * 2]
>  %endif
>  
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm	Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/loopfilter.asm	Tue Jul 21 14:30:11 2015 -0700
> @@ -39,7 +39,7 @@
>  cextern pb_128
>  cextern pb_2
>  cextern pw_2
> -cextern pw_1023
> +cextern pw_pixel_max
>  cextern pb_movemask
>  cextern pw_1
>  cextern hmul_16p
> @@ -81,7 +81,7 @@
>      palignr     m2, m3, m5, 15
>      por         m2, m0
>  
> -    mova        m4, [pw_1023]
> +    mova        m4, [pw_pixel_max]
>      psignb      m2, [pb_128]                ; m2 = signLeft
>      pxor        m0, m0
>      palignr     m0, m3, 15
> @@ -127,7 +127,7 @@
>      palignr     m2, m3, m5, 15
>      por         m2, m0
>  
> -    mova        m4, [pw_1023]
> +    mova        m4, [pw_pixel_max]
>      psignb      m2, [pb_128]                ; m2 = signLeft
>      pxor        m0, m0
>      palignr     m0, m3, 15
> @@ -249,7 +249,7 @@
>      neg             r1b
>      movd            xm1, r1d
>      vinserti128     m0, m0, xm1, 1
> -    mova            m5, [pw_1023]
> +    mova            m5, [pw_pixel_max]
>      mov             r1d, r4m
>      add             r1d, r1d
>      shr             r2d, 4
> @@ -402,8 +402,8 @@
>  
>      pmaxsw      m7, m0
>      pmaxsw      m5, m0
> -    pminsw      m7, [pw_1023]
> -    pminsw      m5, [pw_1023]
> +    pminsw      m7, [pw_pixel_max]
> +    pminsw      m5, [pw_pixel_max]
>  
>      movu        [r0], m7
>      movu        [r0 + 16],  m5
> @@ -468,7 +468,7 @@
>      mov         r4d, r4m
>      mova        m4, [pb_2]
>      shr         r4d, 4
> -    mova        m0, [pw_1023]
> +    mova        m0, [pw_pixel_max]
>  .loop
>      movu        m5, [r0]
>      movu        m3, [r0 + r3]
> @@ -559,7 +559,7 @@
>      add         r3d, r3d
>      mov         r4d, r4m
>      pxor        m0, m0                      ; m0 = 0
> -    mova        m6, [pw_1023]
> +    mova        m6, [pw_pixel_max]
>      mov         r5d, r4d
>      shr         r4d, 4
>      mov         r6, r0
> @@ -736,7 +736,7 @@
>  cglobal saoCuOrgE1_2Rows, 4,5,8
>      add             r3d, r3d
>      mov             r4d, r4m
> -    mova            m4, [pw_1023]
> +    mova            m4, [pw_pixel_max]
>      vbroadcasti128  m6, [r2]                ; m6 = m_iOffsetEo
>      shr             r4d, 4
>  .loop
> @@ -884,8 +884,8 @@
>      paddw       m5, m4
>      pmaxsw      m7, m0
>      pmaxsw      m5, m0
> -    pminsw      m7, [pw_1023]
> -    pminsw      m5, [pw_1023]
> +    pminsw      m7, [pw_pixel_max]
> +    pminsw      m5, [pw_pixel_max]
>      movu        [r0], m7
>      movu        [r0 + 16], m5
>  
> @@ -960,7 +960,7 @@
>      movq            xm4, [r0 + r4 * 2]
>      movhps          xm4, [r1 + r4]
>      vbroadcasti128  m5, [r3]
> -    mova            m6, [pw_1023]
> +    mova            m6, [pw_pixel_max]
>  .loop
>      movu            m1, [r0]
>      movu            m3, [r0 + r5 + 2]
> @@ -1086,8 +1086,8 @@
>      paddw           m7, m6
>      pmaxsw          m1, m0
>      pmaxsw          m7, m0
> -    pminsw          m1, [pw_1023]
> -    pminsw          m7, [pw_1023]
> +    pminsw          m1, [pw_pixel_max]
> +    pminsw          m7, [pw_pixel_max]
>      movu            [r0], m1
>      movu            [r0 + 32], m7
>  
> @@ -1212,8 +1212,8 @@
>      paddw           m5, m4
>      pmaxsw          m7, m0
>      pmaxsw          m5, m0
> -    pminsw          m7, [pw_1023]
> -    pminsw          m5, [pw_1023]
> +    pminsw          m7, [pw_pixel_max]
> +    pminsw          m5, [pw_pixel_max]
>      movu            [r0], m7
>      movu            [r0 + 16], m5
>  
> @@ -1333,7 +1333,7 @@
>      paddw           m1, m3
>      pxor            m0, m0
>      pmaxsw          m1, m0
> -    pminsw          m1, [pw_1023]
> +    pminsw          m1, [pw_pixel_max]
>      movu            [r0], m1
>  
>      psubb           xm0, xm2
> @@ -1461,8 +1461,8 @@
>      pxor            m0, m0
>      pmaxsw          m1, m0
>      pmaxsw          m7, m0
> -    pminsw          m1, [pw_1023]
> -    pminsw          m7, [pw_1023]
> +    pminsw          m1, [pw_pixel_max]
> +    pminsw          m7, [pw_pixel_max]
>      movu            [r0], m1
>      movu            [r0 + 32], m7
>  
> @@ -1565,8 +1565,8 @@
>  .loopW
>      movu        m2, [r0 + r6]
>      movu        m5, [r0 + r6 + 16]
> -    psrlw       m0, m2, 5
> -    psrlw       m6, m5, 5
> +    psrlw       m0, m2, (BIT_DEPTH - 5)
> +    psrlw       m6, m5, (BIT_DEPTH - 5)
>      packuswb    m0, m6
>      pand        m0, [pb_31]         ; m0 = [index]
>  
> @@ -1584,8 +1584,8 @@
>      paddw       m5, m6
>      pmaxsw      m2, m7
>      pmaxsw      m5, m7
> -    pminsw      m2, [pw_1023]
> -    pminsw      m5, [pw_1023]
> +    pminsw      m2, [pw_pixel_max]
> +    pminsw      m5, [pw_pixel_max]
>  
>      movu        [r0 + r6], m2
>      movu        [r0 + r6 + 16], m5
> @@ -1656,7 +1656,7 @@
>      sub             r1d, r2d
>      sub             r1d, r2d
>      shr             r2d, 4
> -    mova            m7, [pw_1023]
> +    mova            m7, [pw_pixel_max]
>  
>      mov             r6d, r3d
>      shr             r3d, 1
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/mc-a.asm
> --- a/source/common/x86/mc-a.asm	Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/mc-a.asm	Tue Jul 21 14:30:11 2015 -0700
> @@ -32,6 +32,19 @@
>  %include "x86inc.asm"
>  %include "x86util.asm"
>  
> +%if BIT_DEPTH==8
> +    %define ADDAVG_FACTOR       256
> +    %define ADDAVG_ROUND        128
> +%elif BIT_DEPTH==10
> +    %define ADDAVG_FACTOR       1024
> +    %define ADDAVG_ROUND        512
> +%elif BIT_DEPTH==12
> +    %define ADDAVG_FACTOR       4096
> +    %define ADDAVG_ROUND        2048
> +%else
> +    %error Unsupported bit depth!
> +%endif
> +
>  SECTION_RODATA 32
>  
>  ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
> @@ -54,6 +67,8 @@
>  cextern pw_512
>  cextern pw_1023
>  cextern pw_1024
> +cextern pw_2048
> +cextern pw_4096
>  cextern pw_00ff
>  cextern pw_pixel_max
>  cextern pd_32
> @@ -92,23 +107,24 @@
>      punpcklqdq    m1,          m2
>      punpcklqdq    m3,          m5
>      paddw         m1,          m3
> -    pmulhrsw      m1,          [pw_1024]
> -    paddw         m1,          [pw_512]
> +    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
> +    paddw         m1,          [pw_ %+ ADDAVG_ROUND]
>  
>      pxor          m0,          m0
>      pmaxsw        m1,          m0
> -    pminsw        m1,          [pw_1023]
> +    pminsw        m1,          [pw_pixel_max]
>      movd          [r2],        m1
>      pextrd        [r2 + r5],   m1, 1
>      lea           r2,          [r2 + 2 * r5]
>      pextrd        [r2],        m1, 2
>      pextrd        [r2 + r5],   m1, 3
> -
>      RET
> +
> +
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova          m0,          [pw_512]
> +    mova          m0,          [pw_ %+ ADDAVG_ROUND]
>      pxor          m7,          m7
>      add           r3,          r3
>      add           r4,          r4
> @@ -136,11 +152,11 @@
>      punpcklqdq    m1,          m2
>      punpcklqdq    m3,          m5
>      paddw         m1,          m3
> -    pmulhrsw      m1,          [pw_1024]
> +    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
>      paddw         m1,          m0
>  
>      pmaxsw        m1,          m7
> -    pminsw        m1,          [pw_1023]
> +    pminsw        m1,          [pw_pixel_max]
>      movd          [r2],        m1
>      pextrd        [r2 + r5],   m1, 1
>      lea           r2,          [r2 + 2 * r5]
> @@ -156,8 +172,8 @@
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m6,         [pw_1023]
> -    mova        m7,         [pw_1024]
> +    mova        m6,         [pw_pixel_max]
> +    mova        m7,         [pw_ %+ ADDAVG_FACTOR]
>      mov         r6d,        16/4
>      add         r3,         r3
>      add         r4,         r4
> @@ -183,7 +199,7 @@
>      punpcklqdq  m3,         m5
>      paddw       m1,         m3
>      pmulhrsw    m1,         m7
> -    paddw       m1,         [pw_512]
> +    paddw       m1,         [pw_ %+ ADDAVG_ROUND]
>      pxor        m0,         m0
>      pmaxsw      m1,         m0
>      pminsw      m1,         m6
> @@ -213,21 +229,21 @@
>      punpcklqdq     m0,          m1
>      punpcklqdq     m2,          m3
>      paddw          m0,          m2
> -    pmulhrsw       m0,          [pw_1024]
> -    paddw          m0,          [pw_512]
> +    pmulhrsw       m0,          [pw_ %+ ADDAVG_FACTOR]
> +    paddw          m0,          [pw_ %+ ADDAVG_ROUND]
>  
>      pxor           m6,          m6
>      pmaxsw         m0,          m6
> -    pminsw         m0,          [pw_1023]
> +    pminsw         m0,          [pw_pixel_max]
>      movh           [r2],        m0
>      movhps         [r2 + r5],   m0
>      RET
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,             [pw_512]
> -    mova        m5,             [pw_1023]
> -    mova        m7,             [pw_1024]
> +    mova        m4,             [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,             [pw_pixel_max]
> +    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,             m6
>      add         r3,             r3
>      add         r4,             r4
> @@ -264,9 +280,9 @@
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,             [pw_512]
> -    mova        m5,             [pw_1023]
> -    mova        m7,             [pw_1024]
> +    mova        m4,             [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,             [pw_pixel_max]
> +    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,             m6
>      mov         r6d,            16/2
>      add         r3,             r3
> @@ -300,9 +316,9 @@
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,          [pw_512]
> -    mova        m5,          [pw_1023]
> -    mova        m7,          [pw_1024]
> +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,          [pw_pixel_max]
> +    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,          m6
>      add         r3,          r3
>      add         r4,          r4
> @@ -331,9 +347,9 @@
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,          [pw_512]
> -    mova        m5,          [pw_1023]
> -    mova        m7,          [pw_1024]
> +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,          [pw_pixel_max]
> +    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,          m6
>      add         r3,          r3
>      add         r4,          r4
> @@ -370,9 +386,9 @@
>  %macro ADDAVG_W4_H4 1
>  INIT_XMM sse4
>  cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova           m4,          [pw_512]
> -    mova           m5,          [pw_1023]
> -    mova           m7,          [pw_1024]
> +    mova           m4,          [pw_ %+ ADDAVG_ROUND]
> +    mova           m5,          [pw_pixel_max]
> +    mova           m7,          [pw_ %+ ADDAVG_FACTOR]
>      pxor           m6,          m6
>      add            r3,          r3
>      add            r4,          r4
> @@ -420,9 +436,9 @@
>  %macro ADDAVG_W8_H4 1
>  INIT_XMM sse4
>  cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,          [pw_512]
> -    mova        m5,          [pw_1023]
> -    mova        m7,          [pw_1024]
> +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,          [pw_pixel_max]
> +    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,          m6
>      add         r3,          r3
>      add         r4,          r4
> @@ -470,9 +486,9 @@
>  %macro ADDAVG_W12_H4 1
>  INIT_XMM sse4
>  cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova           m4,             [pw_512]
> -    mova           m5,             [pw_1023]
> -    mova           m7,             [pw_1024]
> +    mova           m4,             [pw_ %+ ADDAVG_ROUND]
> +    mova           m5,             [pw_pixel_max]
> +    mova           m7,             [pw_ %+ ADDAVG_FACTOR]
>      pxor           m6,             m6
>      add            r3,             r3
>      add            r4,             r4
> @@ -532,9 +548,9 @@
>  %macro ADDAVG_W16_H4 1
>  INIT_XMM sse4
>  cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m7,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,              m6
>      add         r3,              r3
>      add         r4,              r4
> @@ -601,9 +617,9 @@
>  %macro ADDAVG_W24_H2 2
>  INIT_XMM sse4
>  cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m7,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,              m6
>      add         r3,              r3
>      add         r4,              r4
> @@ -683,9 +699,9 @@
>  %macro ADDAVG_W32_H2 1
>  INIT_XMM sse4
>  cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m7,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,              m6
>      add         r3,              r3
>      add         r4,              r4
> @@ -787,9 +803,9 @@
>  %macro ADDAVG_W48_H2 1
>  INIT_XMM sse4
>  cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m7,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,              m6
>      add         r3,              r3
>      add         r4,              r4
> @@ -921,9 +937,9 @@
>  %macro ADDAVG_W64_H1 1
>  INIT_XMM sse4
>  cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m7,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m6,              m6
>      add         r3,              r3
>      add         r4,              r4
> @@ -1029,19 +1045,19 @@
>  
>      paddw       m0,          m1
>      pxor        m1,          m1
> -    pmulhrsw    m0,          [pw_1024]
> -    paddw       m0,          [pw_512]
> +    pmulhrsw    m0,          [pw_ %+ ADDAVG_FACTOR]
> +    paddw       m0,          [pw_ %+ ADDAVG_ROUND]
>      pmaxsw      m0,          m1
> -    pminsw      m0,          [pw_1023]
> +    pminsw      m0,          [pw_pixel_max]
>      vextracti128 xm1,        m0, 1
>      movu        [r2],        xm0
>      movu        [r2 + r5 * 2], xm1
>      RET
>  
>  cglobal addAvg_8x6, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,          [pw_512]
> -    mova        m5,          [pw_1023]
> -    mova        m3,          [pw_1024]
> +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,          [pw_pixel_max]
> +    mova        m3,          [pw_ %+ ADDAVG_FACTOR]
>      pxor        m1,          m1
>      add         r3d,         r3d
>      add         r4d,         r4d
> @@ -1100,9 +1116,9 @@
>  
>  %macro ADDAVG_W8_H4_AVX2 1
>  cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,          [pw_512]
> -    mova        m5,          [pw_1023]
> -    mova        m3,          [pw_1024]
> +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,          [pw_pixel_max]
> +    mova        m3,          [pw_ %+ ADDAVG_FACTOR]
>      pxor        m1,          m1
>      add         r3d,         r3d
>      add         r4d,         r4d
> @@ -1159,9 +1175,9 @@
>  ADDAVG_W8_H4_AVX2 64
>  
>  cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova           m4,             [pw_512]
> -    mova           m5,             [pw_1023]
> -    mova           m3,             [pw_1024]
> +    mova           m4,             [pw_ %+ ADDAVG_ROUND]
> +    mova           m5,             [pw_pixel_max]
> +    mova           m3,             [pw_ %+ ADDAVG_FACTOR]
>      pxor           m1,             m1
>      add            r3,             r3
>      add            r4,             r4
> @@ -1201,8 +1217,8 @@
>      RET
>  
>  cglobal addAvg_12x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova           m4,             [pw_512]
> -    mova           m5,             [pw_1023]
> +    mova           m4,             [pw_ %+ ADDAVG_ROUND]
> +    mova           m5,             [pw_pixel_max]
>      paddw          m3,             m4,  m4
>      pxor           m1,             m1
>      add            r3,             r3
> @@ -1244,9 +1260,9 @@
>  
>  %macro ADDAVG_W16_H4_AVX2 1
>  cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m3,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m2,              m2
>      add         r3,              r3
>      add         r4,              r4
> @@ -1291,9 +1307,9 @@
>  ADDAVG_W16_H4_AVX2 64
>  
>  cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m3,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m1,              m1
>      add         r3,              r3
>      add         r4,              r4
> @@ -1347,8 +1363,8 @@
>      RET
>  
>  cglobal addAvg_24x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
>      paddw       m3,              m4,  m4
>      pxor        m1,              m1
>      add         r3,              r3
> @@ -1404,9 +1420,9 @@
>  
>  %macro ADDAVG_W32_H2_AVX2 1
>  cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m3,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m2,              m2
>      add         r3,              r3
>      add         r4,              r4
> @@ -1468,9 +1484,9 @@
>  ADDAVG_W32_H2_AVX2 64
>  
>  cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m3,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m2,              m2
>      add         r3,              r3
>      add         r4,              r4
> @@ -1543,9 +1559,9 @@
>  
>  %macro ADDAVG_W64_H1_AVX2 1
>  cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> -    mova        m4,              [pw_512]
> -    mova        m5,              [pw_1023]
> -    mova        m3,              [pw_1024]
> +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> +    mova        m5,              [pw_pixel_max]
> +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
>      pxor        m2,              m2
>      add         r3d,             r3d
>      add         r4d,             r4d
> diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm	Mon Jul 20 17:18:54 2015 -0700
> +++ b/source/common/x86/pixel-util8.asm	Tue Jul 21 14:30:11 2015 -0700
> @@ -879,8 +879,8 @@
>  %if HIGH_BIT_DEPTH
>      cmp         r3d, 32767
>      jle         .skip
> -    shr         r3d, 2
> -    sub         r4d, 2
> +    shr         r3d, (BIT_DEPTH - 8)
> +    sub         r4d, (BIT_DEPTH - 8)
>  .skip:
>  %endif
>      movd        m0, r4d             ; m0 = shift
> @@ -1273,13 +1273,7 @@
>  INIT_XMM sse4
>  cglobal weight_pp, 4,7,7
>  %define correction      (14 - BIT_DEPTH)
> -%if BIT_DEPTH == 10
> -    mova        m6, [pw_1023]
> -%elif BIT_DEPTH == 12
> -    mova        m6, [pw_3fff]
> -%else
> -  %error Unsupported BIT_DEPTH!
> -%endif
> +    mova        m6, [pw_pixel_max]
>      mov         r6d, r6m
>      mov         r4d, r4m
>      mov         r5d, r5m
> @@ -1423,7 +1417,7 @@
>      movd         xm1, r7m
>      vpbroadcastd m2, r8m
>      mova         m5, [pw_1]
> -    mova         m6, [pw_1023]
> +    mova         m6, [pw_pixel_max]
>      add         r2d, r2d
>      add         r3d, r3d
>      sub          r2d, r3d
> @@ -1516,13 +1510,7 @@
>  %if HIGH_BIT_DEPTH
>  INIT_XMM sse4
>  cglobal weight_sp, 6,7,8
> -%if BIT_DEPTH == 10
> -    mova        m1, [pw_1023]
> -%elif BIT_DEPTH == 12
> -    mova        m1, [pw_3fff]
> -%else
> -  %error Unsupported BIT_DEPTH!
> -%endif
> +    mova        m1, [pw_pixel_max]
>      mova        m2, [pw_1]
>      mov         r6d, r7m
>      shl         r6d, 16
> @@ -1681,7 +1669,7 @@
>  %if HIGH_BIT_DEPTH
>  INIT_YMM avx2
>  cglobal weight_sp, 6,7,9
> -    mova                      m1, [pw_1023]
> +    mova                      m1, [pw_pixel_max]
>      mova                      m2, [pw_1]
>      mov                       r6d, r7m
>      shl                       r6d, 16
> 
> 
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
> 

-- 
Steve Borho


More information about the x265-devel mailing list