[x265] [PATCH 1 of 4] asm: fix Main12 Assembly error and disable fault functions, now we are work with assembly up to AVX

Steve Borho steve at borho.org
Wed Jul 22 19:53:01 CEST 2015


On 07/22, Steve Borho wrote:
> On 07/21, Min Chen wrote:
> > # HG changeset patch
> > # User Min Chen <chenm003 at 163.com>
> > # Date 1437514211 25200
> > # Node ID ab2c34d6ad913369fd8feb84aee10030ffaa0df5
> > # Parent  46152345eb6ff261fd90272f7a0712300d6324c0
> > asm: fix Main12 Assembly error and disable fault functions, now we are work with assembly up to AVX
> 
> nice!  queued for smoke testing

I'm enabling ASM in my 12-bit smoke test builds; we should do the same
for all our automated regression test systems. Main12 should be tested
with ASM enabled as a build option, including running the assembly
testbench.

Some example encodes on a Haswell (dual-core) MacBook:

http://privatepaste.com/71b507772b

Note that main12 still needs its lambda tables fixed; that is why the
bitrate is out of whack.

> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/asm-primitives.cpp
> > --- a/source/common/x86/asm-primitives.cpp	Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/asm-primitives.cpp	Tue Jul 21 14:30:11 2015 -0700
> > @@ -1043,7 +1043,9 @@
> >  
> >          // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
> >          ALL_LUMA_PU(satd, pixel_satd, ssse3);
> > +#if X265_DEPTH <= 10
> >          ASSIGN_SA8D(ssse3);
> > +#endif
> >          INTRA_ANG_SSSE3(ssse3);
> >  
> >          p.dst4x4 = PFX(dst4_ssse3);
> > @@ -1126,14 +1128,18 @@
> >  
> >          // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
> >          ALL_LUMA_PU(satd, pixel_satd, sse4);
> > +#if X265_DEPTH <= 10
> >          ASSIGN_SA8D(sse4);
> > +#endif
> >  
> >          p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
> >          p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
> >          p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
> >          p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
> >  
> > +#if X265_DEPTH <= 10
> >          ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
> > +#endif
> >          ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
> >          INTRA_ANG_SSE4_COMMON(sse4);
> >          INTRA_ANG_SSE4_HIGH(sse4);
> > @@ -1147,7 +1153,9 @@
> >  
> >          // TODO: check POPCNT flag!
> >          ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
> > +#if X265_DEPTH <= 10
> >          ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
> > +#endif
> >          ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
> >  
> >          p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
> > @@ -1184,7 +1192,9 @@
> >          p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
> >          p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
> >          p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
> > +#if X265_DEPTH <= 10
> >          ASSIGN_SA8D(avx);
> > +#endif
> >          p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
> >          p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
> >          p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
> > @@ -1292,7 +1302,9 @@
> >      {
> >          //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
> >          ALL_LUMA_PU(satd, pixel_satd, xop);
> > +#if X265_DEPTH <= 10
> >          ASSIGN_SA8D(xop);
> > +#endif
> >          LUMA_VAR(xop);
> >          p.frameInitLowres = PFX(frame_init_lowres_core_xop);
> >      }
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/const-a.asm
> > --- a/source/common/x86/const-a.asm	Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/const-a.asm	Tue Jul 21 14:30:11 2015 -0700
> > @@ -79,6 +79,7 @@
> >  const pw_512,               times 16 dw 512
> >  const pw_1023,              times 16 dw 1023
> >  const pw_1024,              times 16 dw 1024
> > +const pw_2048,              times 16 dw 2048
> >  const pw_4096,              times 16 dw 4096
> >  const pw_8192,              times  8 dw 8192
> >  const pw_00ff,              times 16 dw 0x00ff
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/intrapred16.asm
> > --- a/source/common/x86/intrapred16.asm	Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/intrapred16.asm	Tue Jul 21 14:30:11 2015 -0700
> > @@ -1748,7 +1748,7 @@
> >      ; filter top
> >      movu        m1,             [r2]
> >      paddw       m1,             m0
> > -    psraw       m1,             2
> > +    psrlw       m1,             2
> >      movh        [r0],           m1             ; overwrite top-left pixel, we will update it later
> >  
> >      ; filter top-left
> > @@ -1763,7 +1763,7 @@
> >      lea         r0,             [r0 + r1 * 2]
> >      movu        m1,             [r3 + 2]
> >      paddw       m1,             m0
> > -    psraw       m1,             2
> > +    psrlw       m1,             2
> >      movd        r3d,            m1
> >      mov         [r0],           r3w
> >      shr         r3d,            16
> > @@ -1872,7 +1872,7 @@
> >      ; filter top
> >      movu            m0,            [r2]
> >      paddw           m0,            m1
> > -    psraw           m0,            2
> > +    psrlw           m0,            2
> >      movu            [r6],          m0
> >  
> >      ; filter top-left
> > @@ -1887,7 +1887,7 @@
> >      add             r6,            r1
> >      movu            m0,            [r3 + 2]
> >      paddw           m0,            m1
> > -    psraw           m0,            2
> > +    psrlw           m0,            2
> >      pextrw          [r6],          m0, 0
> >      pextrw          [r6 + r1],     m0, 1
> >      pextrw          [r6 + r1 * 2], m0, 2
> > @@ -1913,13 +1913,13 @@
> >      movu            m2,                  [r2]
> >      movu            m3,                  [r2 + 16]
> >  
> > -    paddw           m0,                  m1
> > +    paddw           m0,                  m1                     ; dynamic range 13 bits
> >      paddw           m2,                  m3
> > -    paddw           m0,                  m2
> > -    movhlps         m1,                  m0
> > -    paddw           m0,                  m1
> > -    phaddw          m0,                  m0
> > +    paddw           m0,                  m2                     ; dynamic range 14 bits
> > +    movhlps         m1,                  m0                     ; dynamic range 15 bits
> > +    paddw           m0,                  m1                     ; dynamic range 16 bits
> >      pmaddwd         m0,                  [pw_1]
> > +    phaddd          m0,                  m0
> >  
> >      movd            r5d,                 m0
> >      add             r5d,                 16
> > @@ -1983,11 +1983,11 @@
> >      ; filter top
> >      movu            m2,                  [r2]
> >      paddw           m2,                  m1
> > -    psraw           m2,                  2
> > +    psrlw           m2,                  2
> >      movu            [r6],                m2
> >      movu            m3,                  [r2 + 16]
> >      paddw           m3,                  m1
> > -    psraw           m3,                  2
> > +    psrlw           m3,                  2
> >      movu            [r6 + 16],           m3
> >  
> >      ; filter top-left
> > @@ -2002,7 +2002,7 @@
> >      add             r6,                  r1
> >      movu            m2,                  [r3 + 2]
> >      paddw           m2,                  m1
> > -    psraw           m2,                  2
> > +    psrlw           m2,                  2
> >  
> >      pextrw          [r6],                m2, 0
> >      pextrw          [r6 + r1],           m2, 1
> > @@ -2019,7 +2019,7 @@
> >      lea             r6,                  [r6 + r1 * 2]
> >      movu            m3,                  [r3 + 18]
> >      paddw           m3,                  m1
> > -    psraw           m3,                  2
> > +    psrlw           m3,                  2
> >  
> >      pextrw          [r6],                m3, 0
> >      pextrw          [r6 + r1],           m3, 1
> > @@ -2046,21 +2046,21 @@
> >      movu            m1,                  [r3 + 16]
> >      movu            m2,                  [r3 + 32]
> >      movu            m3,                  [r3 + 48]
> > -    paddw           m0,                  m1
> > +    paddw           m0,                  m1             ; dynamic range 13 bits
> >      paddw           m2,                  m3
> > -    paddw           m0,                  m2
> > +    paddw           m0,                  m2             ; dynamic range 14 bits
> >      movu            m1,                  [r2]
> >      movu            m3,                  [r2 + 16]
> >      movu            m4,                  [r2 + 32]
> >      movu            m5,                  [r2 + 48]
> > -    paddw           m1,                  m3
> > +    paddw           m1,                  m3             ; dynamic range 13 bits
> >      paddw           m4,                  m5
> > -    paddw           m1,                  m4
> > -    paddw           m0,                  m1
> > +    paddw           m1,                  m4             ; dynamic range 14 bits
> > +    paddw           m0,                  m1             ; dynamic range 15 bits
> > +    pmaddwd         m0,                  [pw_1]
> >      movhlps         m1,                  m0
> > -    paddw           m0,                  m1
> > -    phaddw          m0,                  m0
> > -    pmaddwd         m0,                  [pw_1]
> > +    paddd           m0,                  m1
> > +    phaddd          m0,                  m0
> >  
> >      paddd           m0,                  [pd_32]     ; sum = sum + 32
> >      psrld           m0,                  6           ; sum = sum / 64
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/ipfilter16.asm
> > --- a/source/common/x86/ipfilter16.asm	Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/ipfilter16.asm	Tue Jul 21 14:30:11 2015 -0700
> > @@ -26,6 +26,25 @@
> >  %include "x86inc.asm"
> >  %include "x86util.asm"
> >  
> > +
> > +%define INTERP_OFFSET_PP        pd_32
> > +%define INTERP_SHIFT_PP         6
> > +
> > +%if BIT_DEPTH == 10
> > +    %define INTERP_SHIFT_PS         2
> > +    %define INTERP_OFFSET_PS        pd_n32768
> > +    %define INTERP_SHIFT_SP         10
> > +    %define INTERP_OFFSET_SP        pd_524800
> > +%elif BIT_DEPTH == 12
> > +    %define INTERP_SHIFT_PS         4
> > +    %define INTERP_OFFSET_PS        pd_n131072
> > +    %define INTERP_SHIFT_SP         8
> > +    %define INTERP_OFFSET_SP        pd_524416
> > +%else
> > +    %error Unsupport bit depth!
> > +%endif
> > +
> > +
> >  SECTION_RODATA 32
> >  
> >  tab_c_32:         times 8 dd 32
> > @@ -145,21 +164,9 @@
> >  const pb_shuf,  db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
> >                  db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
> >  
> > -%if BIT_DEPTH == 10
> > -    %define INTERP_OFFSET_PS        pd_n32768
> > -    %define INTERP_SHIFT_PS         2
> > -    %define INTERP_OFFSET_SP        pd_524800
> > -    %define INTERP_SHIFT_SP         10
> > -%elif BIT_DEPTH == 12
> > -    %define INTERP_OFFSET_PS        pd_n131072
> > -    %define INTERP_SHIFT_PS         4
> > -    %define INTERP_OFFSET_SP        pd_524416
> > -    %define INTERP_SHIFT_SP         8
> > -%else
> > -    %error Unsupport bit depth!
> > -%endif
> >  
> >  SECTION .text
> > +cextern pd_8
> >  cextern pd_32
> >  cextern pw_pixel_max
> >  cextern pd_524416
> > @@ -503,7 +510,7 @@
> >  %endif
> >  
> >  %ifidn %1,pp
> > -    mova      m7, [pd_32]
> > +    mova      m7, [INTERP_OFFSET_PP]
> >  %define SHIFT 6
> >  %elifidn %1,ps
> >      mova      m7, [INTERP_OFFSET_PS]
> > @@ -1176,7 +1183,6 @@
> >  %macro FILTER_HOR_LUMA_W4 3
> >  INIT_XMM sse4
> >  cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
> > -
> >      mov         r4d, r4m
> >      sub         r0, 6
> >      shl         r4d, 4
> > @@ -1229,7 +1235,7 @@
> >      packusdw    m4, m4
> >      CLIPW       m4, m6, m7
> >  %else
> > -    psrad       m4, 2
> > +    psrad       m4, INTERP_SHIFT_PS
> >      packssdw    m4, m4
> >  %endif
> >  
> > @@ -1287,7 +1293,7 @@
> >      mov         r4d, %2
> >  %ifidn %3, ps
> >      cmp         r5m, byte 0
> > -    je          .loopH
> > +    je         .loopH
> >      lea         r6, [r1 + 2 * r1]
> >      sub         r0, r6
> >      add         r4d, 7
> > @@ -1329,8 +1335,8 @@
> >      packusdw    m4, m5
> >      CLIPW       m4, m7, [pw_pixel_max]
> >  %else
> > -    psrad       m4, 2
> > -    psrad       m5, 2
> > +    psrad       m4, INTERP_SHIFT_PS
> > +    psrad       m5, INTERP_SHIFT_PS
> >      packssdw    m4, m5
> >  %endif
> >  
> > @@ -1340,7 +1346,7 @@
> >      add         r2, r3
> >  
> >      dec         r4d
> > -    jnz         .loopH
> > +    jnz        .loopH
> >      RET
> >  %endmacro
> >  
> > @@ -1380,7 +1386,7 @@
> >      mova        m0, [tab_LumaCoeff + r4]
> >  %endif
> >  %ifidn %3, pp
> > -    mova        m1, [pd_32]
> > +    mova        m1, [INTERP_OFFSET_PP]
> >  %else
> >      mova        m1, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -1425,14 +1431,14 @@
> >      phaddd      m5, m6
> >      paddd       m5, m1
> >  %ifidn %3, pp
> > -    psrad       m4, 6
> > -    psrad       m5, 6
> > +    psrad       m4, INTERP_SHIFT_PP
> > +    psrad       m5, INTERP_SHIFT_PP
> >      packusdw    m4, m5
> >      pxor        m5, m5
> >      CLIPW       m4, m5, [pw_pixel_max]
> >  %else
> > -    psrad       m4, 2
> > -    psrad       m5, 2
> > +    psrad       m4, INTERP_SHIFT_PS
> > +    psrad       m5, INTERP_SHIFT_PS
> >      packssdw    m4, m5
> >  %endif
> >  
> > @@ -1453,12 +1459,12 @@
> >      phaddd      m4, m5
> >      paddd       m4, m1
> >  %ifidn %3, pp
> > -    psrad       m4, 6
> > +    psrad       m4, INTERP_SHIFT_PP
> >      packusdw    m4, m4
> >      pxor        m5, m5
> >      CLIPW       m4, m5, [pw_pixel_max]
> >  %else
> > -    psrad       m4, 2
> > +    psrad       m4, INTERP_SHIFT_PS
> >      packssdw    m4, m4
> >  %endif
> >  
> > @@ -1550,14 +1556,14 @@
> >      phaddd      m5, m6
> >      paddd       m5, m1
> >  %ifidn %3, pp
> > -    psrad       m4, 6
> > -    psrad       m5, 6
> > +    psrad       m4, INTERP_SHIFT_PP
> > +    psrad       m5, INTERP_SHIFT_PP
> >      packusdw    m4, m5
> >      pxor        m5, m5
> >      CLIPW       m4, m5, [pw_pixel_max]
> >  %else
> > -    psrad       m4, 2
> > -    psrad       m5, 2
> > +    psrad       m4, INTERP_SHIFT_PS
> > +    psrad       m5, INTERP_SHIFT_PS
> >      packssdw    m4, m5
> >  %endif
> >      movu        [r2 + x], m4
> > @@ -1591,14 +1597,14 @@
> >      phaddd      m5, m6
> >      paddd       m5, m1
> >  %ifidn %3, pp
> > -    psrad       m4, 6
> > -    psrad       m5, 6
> > +    psrad       m4, INTERP_SHIFT_PP
> > +    psrad       m5, INTERP_SHIFT_PP
> >      packusdw    m4, m5
> >      pxor        m5, m5
> >      CLIPW       m4, m5, [pw_pixel_max]
> >  %else
> > -    psrad       m4, 2
> > -    psrad       m5, 2
> > +    psrad       m4, INTERP_SHIFT_PS
> > +    psrad       m5, INTERP_SHIFT_PS
> >      packssdw    m4, m5
> >  %endif
> >      movu        [r2 + 16 + x], m4
> > @@ -1743,14 +1749,14 @@
> >      phaddd      m5, m6
> >      paddd       m5, m1
> >  %ifidn %3, pp
> > -    psrad       m4, 6
> > -    psrad       m5, 6
> > +    psrad       m4, INTERP_SHIFT_PP
> > +    psrad       m5, INTERP_SHIFT_PP
> >      packusdw    m4, m5
> >      pxor        m5, m5
> >      CLIPW       m4, m5, [pw_pixel_max]
> >  %else
> > -    psrad       m4, 2
> > -    psrad       m5, 2
> > +    psrad       m4, INTERP_SHIFT_PS
> > +    psrad       m5, INTERP_SHIFT_PS
> >      packssdw    m4, m5
> >  %endif
> >      movu        [r2], m4
> > @@ -1784,14 +1790,14 @@
> >      phaddd      m5, m6
> >      paddd       m5, m1
> >  %ifidn %3, pp
> > -    psrad       m4, 6
> > -    psrad       m5, 6
> > +    psrad       m4, INTERP_SHIFT_PP
> > +    psrad       m5, INTERP_SHIFT_PP
> >      packusdw    m4, m5
> >      pxor        m5, m5
> >      CLIPW       m4, m5, [pw_pixel_max]
> >  %else
> > -    psrad       m4, 2
> > -    psrad       m5, 2
> > +    psrad       m4, INTERP_SHIFT_PS
> > +    psrad       m5, INTERP_SHIFT_PS
> >      packssdw    m4, m5
> >  %endif
> >      movu        [r2 + 16], m4
> > @@ -1825,14 +1831,14 @@
> >      phaddd      m5, m6
> >      paddd       m5, m1
> >  %ifidn %3, pp
> > -    psrad       m4, 6
> > -    psrad       m5, 6
> > +    psrad       m4, INTERP_SHIFT_PP
> > +    psrad       m5, INTERP_SHIFT_PP
> >      packusdw    m4, m5
> >      pxor        m5, m5
> >      CLIPW       m4, m5, [pw_pixel_max]
> >  %else
> > -    psrad       m4, 2
> > -    psrad       m5, 2
> > +    psrad       m4, INTERP_SHIFT_PS
> > +    psrad       m5, INTERP_SHIFT_PS
> >      packssdw    m4, m5
> >  %endif
> >      movu        [r2 + 32], m4
> > @@ -1865,11 +1871,11 @@
> >      phaddd      m3,         m4
> >      paddd       m3,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> >      packusdw    m3,         m3
> >      CLIPW       m3,         m7,    m6
> >  %else
> > -    psrad       m3,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> >      packssdw    m3,         m3
> >  %endif
> >      movd        [r2],       m3
> > @@ -1895,13 +1901,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m7,    m6
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2],       m3
> > @@ -1950,7 +1956,7 @@
> >      phaddd           m4, m4
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m6
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -1969,7 +1975,7 @@
> >      phaddd           m4, m4
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m6
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2036,7 +2042,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2064,7 +2070,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2132,7 +2138,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2160,7 +2166,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2232,7 +2238,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2260,7 +2266,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2335,7 +2341,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2363,7 +2369,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2425,7 +2431,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2453,7 +2459,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2481,7 +2487,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2545,7 +2551,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2573,7 +2579,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2601,7 +2607,7 @@
> >      phaddd           m4, m5
> >      vpermq           m4, m4, q3120
> >      paddd            m4, m7
> > -    psrad            m4, 6
> > +    psrad            m4, INTERP_SHIFT_PP
> >  
> >      packusdw         m4, m4
> >      vpermq           m4, m4, q2020
> > @@ -2644,32 +2650,32 @@
> >      mova        m1,       [INTERP_OFFSET_PS]
> >      cmp         r5m, byte 0
> >      je          .skip
> > -    sub         r0, r1
> > -    movu        m3,         [r0]
> > -    pshufb      m3,         m3, m2
> > -    pmaddwd     m3,         m0
> > -
> > -    %if %1 == 4
> > -        movu        m4,         [r0 + 4]
> > -        pshufb      m4,         m4, m2
> > -        pmaddwd     m4,         m0
> > -        phaddd      m3,         m4
> > -    %else
> > -        phaddd      m3,         m3
> > -    %endif
> > -
> > -    paddd       m3,         m1
> > -    psrad       m3,         INTERP_SHIFT_PS
> > -    packssdw    m3,         m3
> > -
> > -    %if %1 == 2
> > -        movd        [r2],       m3
> > -    %else
> > -        movh        [r2],       m3
> > -    %endif
> > -
> > -    add         r0, r1
> > -    add         r2, r3
> > +    sub         r0,       r1
> > +    movu        m3,       [r0]
> > +    pshufb      m3,       m3, m2
> > +    pmaddwd     m3,       m0
> > +
> > +  %if %1 == 4
> > +    movu        m4,       [r0 + 4]
> > +    pshufb      m4,       m4, m2
> > +    pmaddwd     m4,       m0
> > +    phaddd      m3,       m4
> > +  %else
> > +    phaddd      m3,       m3
> > +  %endif
> > +
> > +    paddd       m3,       m1
> > +    psrad       m3,       INTERP_SHIFT_PS
> > +    packssdw    m3,       m3
> > +
> > +  %if %1 == 2
> > +    movd        [r2],     m3
> > +  %else
> > +    movh        [r2],     m3
> > +  %endif
> > +
> > +    add         r0,       r1
> > +    add         r2,       r3
> >      FILTER_W%1_2 %3
> >      lea         r0,       [r0 + 2 * r1]
> >      lea         r2,       [r2 + 2 * r3]
> > @@ -2689,7 +2695,6 @@
> >      lea         r2,       [r2 + 2 * r3]
> >      FILTER_W%1_2 %3
> >  %endrep
> > -
> >      RET
> >  %endmacro
> >  
> > @@ -2729,13 +2734,13 @@
> >      phaddd      m4,         m4
> >      paddd       m4,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m4,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m4,         INTERP_SHIFT_PP
> >      packusdw    m3,         m4
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m4,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m4,         INTERP_SHIFT_PS
> >      packssdw    m3,         m4
> >  %endif
> >      movh        [r2],       m3
> > @@ -2769,13 +2774,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2],       m3
> > @@ -2809,13 +2814,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2],       m3
> > @@ -2831,11 +2836,11 @@
> >      paddd       m3,         m1
> >  
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> >      packusdw    m3,         m3
> >      CLIPW       m3,         m6, m7
> >  %else
> > -    psrad       m3,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> >      packssdw    m3,         m3
> >  %endif
> >      movh        [r2 + 16],  m3
> > @@ -2868,13 +2873,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2],       m3
> > @@ -2898,13 +2903,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2 + 16],  m3
> > @@ -2938,13 +2943,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2],       m3
> > @@ -2968,13 +2973,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2 + 16],  m3
> > @@ -2998,13 +3003,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2 + 32],  m3
> > @@ -3038,13 +3043,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2],       m3
> > @@ -3068,13 +3073,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2 + 16],  m3
> > @@ -3098,13 +3103,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2 + 32],  m3
> > @@ -3128,13 +3133,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2 + 48],  m3
> > @@ -3168,13 +3173,13 @@
> >      phaddd      m5,         m4
> >      paddd       m5,         m1
> >  %ifidn %1, pp
> > -    psrad       m3,         6
> > -    psrad       m5,         6
> > +    psrad       m3,         INTERP_SHIFT_PP
> > +    psrad       m5,         INTERP_SHIFT_PP
> >      packusdw    m3,         m5
> >      CLIPW       m3,         m6,    m7
> >  %else
> > -    psrad       m3,         2
> > -    psrad       m5,         2
> > +    psrad       m3,         INTERP_SHIFT_PS
> > +    psrad       m5,         INTERP_SHIFT_PS
> >      packssdw    m3,         m5
> >  %endif
> >      movh        [r2 + %2],       m3
> > @@ -3408,7 +3413,7 @@
> >      pmaddwd         m4, m0
> >      phaddd          m3, m4
> >      paddd           m3, m2
> > -    psrad           m3, 6                         ; m3 = DWORD[7 6 3 2 5 4 1 0]
> > +    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
> >  
> >      packusdw        m3, m3
> >      vpermq          m3, m3, q2020
> > @@ -3426,7 +3431,7 @@
> >      pmaddwd         m4, m0
> >      phaddd          m3, m4
> >      paddd           m3, m2
> > -    psrad           m3, 6                         ; m3 = DWORD[7 6 3 2 5 4 1 0]
> > +    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
> >  
> >      packusdw        m3, m3
> >      vpermq          m3, m3, q2020
> > @@ -3474,7 +3479,7 @@
> >      pmaddwd         m4, m0
> >      phaddd          m3, m4
> >      paddd           m3, m2
> > -    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
> > +    psrad           m3, INTERP_SHIFT_PP          ; m3 = DWORD[7 6 3 2 5 4 1 0]
> >  
> >      packusdw        m3, m3
> >      vpermq          m3, m3,q2020
> > @@ -3491,7 +3496,7 @@
> >      pmaddwd         m4, m0
> >      phaddd          m3, m4
> >      paddd           m3, m2
> > -    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
> > +    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
> >  
> >      packusdw        m3, m3
> >      vpermq          m3, m3,q2020
> > @@ -4089,7 +4094,7 @@
> >      %ifnidn %3, ps
> >          mova      m7, [pw_pixel_max]
> >          %ifidn %3, pp
> > -            mova      m6, [tab_c_32]
> > +            mova      m6, [INTERP_OFFSET_PP]
> >          %else
> >              mova      m6, [INTERP_OFFSET_SP]
> >          %endif
> > @@ -4129,10 +4134,10 @@
> >      paddd     m2, m6
> >      paddd     m3, m6
> >      %ifidn %3, pp
> > -        psrad     m0, 6
> > -        psrad     m1, 6
> > -        psrad     m2, 6
> > -        psrad     m3, 6
> > +        psrad     m0, INTERP_SHIFT_PP
> > +        psrad     m1, INTERP_SHIFT_PP
> > +        psrad     m2, INTERP_SHIFT_PP
> > +        psrad     m3, INTERP_SHIFT_PP
> >      %else
> >          psrad     m0, INTERP_SHIFT_SP
> >          psrad     m1, INTERP_SHIFT_SP
> > @@ -4344,9 +4349,9 @@
> >          pxor      m7, m7
> >          mova      m6, [pw_pixel_max]
> >          %ifidn %2, pp
> > -            mova      m5, [tab_c_32]
> > +            mova      m5, [INTERP_OFFSET_PP]
> >          %else
> > -            mova      m5, [tab_c_524800]
> > +            mova      m5, [INTERP_OFFSET_SP]
> >          %endif
> >      %else
> >          mova      m5, [INTERP_OFFSET_PS]
> > @@ -4362,18 +4367,18 @@
> >  %elifidn %2, ps
> >      paddd     m0, m5
> >      paddd     m2, m5
> > -    psrad     m0, 2
> > -    psrad     m2, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> >      packssdw  m0, m2
> >  %else
> >      paddd     m0, m5
> >      paddd     m2, m5
> >      %ifidn %2, pp
> > -        psrad     m0, 6
> > -        psrad     m2, 6
> > +        psrad     m0, INTERP_SHIFT_PP
> > +        psrad     m2, INTERP_SHIFT_PP
> >      %else
> > -        psrad     m0, 10
> > -        psrad     m2, 10
> > +        psrad     m0, INTERP_SHIFT_SP
> > +        psrad     m2, INTERP_SHIFT_SP
> >      %endif
> >      packusdw  m0, m2
> >      CLIPW     m0, m7,    m6
> > @@ -4389,7 +4394,6 @@
> >  
> >      dec       r4d
> >      jnz       .loopH
> > -
> >      RET
> >  %endmacro
> >  
> > @@ -4417,7 +4421,6 @@
> >  %macro FILTER_VER_CHROMA_W4 3
> >  INIT_XMM sse4
> >  cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3
> > -
> >      add        r1d, r1d
> >      add        r3d, r3d
> >      sub        r0, r1
> > @@ -4439,9 +4442,9 @@
> >          pxor      m6, m6
> >          mova      m5, [pw_pixel_max]
> >          %ifidn %2, pp
> > -            mova      m4, [tab_c_32]
> > +            mova      m4, [INTERP_OFFSET_PP]
> >          %else
> > -            mova      m4, [tab_c_524800]
> > +            mova      m4, [INTERP_OFFSET_SP]
> >          %endif
> >      %else
> >          mova      m4, [INTERP_OFFSET_PS]
> > @@ -4479,18 +4482,18 @@
> >  %elifidn %2, ps
> >      paddd     m0, m4
> >      paddd     m1, m4
> > -    psrad     m0, 2
> > -    psrad     m1, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m1, INTERP_SHIFT_PS
> >      packssdw  m0, m1
> >  %else
> >      paddd     m0, m4
> >      paddd     m1, m4
> >      %ifidn %2, pp
> > -        psrad     m0, 6
> > -        psrad     m1, 6
> > +        psrad     m0, INTERP_SHIFT_PP
> > +        psrad     m1, INTERP_SHIFT_PP
> >      %else
> > -        psrad     m0, 10
> > -        psrad     m1, 10
> > +        psrad     m0, INTERP_SHIFT_SP
> > +        psrad     m1, INTERP_SHIFT_SP
> >      %endif
> >      packusdw  m0, m1
> >      CLIPW     m0, m6,    m5
> > @@ -4504,7 +4507,6 @@
> >      dec        r4d
> >      jnz        .loop
> >  %endif
> > -
> >      RET
> >  %endmacro
> >  
> > @@ -4524,7 +4526,6 @@
> >  %macro FILTER_VER_CHROMA_W6 3
> >  INIT_XMM sse4
> >  cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3
> > -
> >      add       r1d, r1d
> >      add       r3d, r3d
> >      sub       r0, r1
> > @@ -4543,9 +4544,9 @@
> >      %ifnidn %2, ps
> >          mova      m7, [pw_pixel_max]
> >          %ifidn %2, pp
> > -            mova      m6, [tab_c_32]
> > +            mova      m6, [INTERP_OFFSET_PP]
> >          %else
> > -            mova      m6, [tab_c_524800]
> > +            mova      m6, [INTERP_OFFSET_SP]
> >          %endif
> >      %else
> >          mova      m6, [INTERP_OFFSET_PS]
> > @@ -4568,10 +4569,10 @@
> >      paddd     m1, m6
> >      paddd     m2, m6
> >      paddd     m3, m6
> > -    psrad     m0, 2
> > -    psrad     m1, 2
> > -    psrad     m2, 2
> > -    psrad     m3, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m1, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> > +    psrad     m3, INTERP_SHIFT_PS
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -4581,15 +4582,15 @@
> >      paddd     m2, m6
> >      paddd     m3, m6
> >      %ifidn %2, pp
> > -        psrad     m0, 6
> > -        psrad     m1, 6
> > -        psrad     m2, 6
> > -        psrad     m3, 6
> > +        psrad     m0, INTERP_SHIFT_PP
> > +        psrad     m1, INTERP_SHIFT_PP
> > +        psrad     m2, INTERP_SHIFT_PP
> > +        psrad     m3, INTERP_SHIFT_PP
> >      %else
> > -        psrad     m0, 10
> > -        psrad     m1, 10
> > -        psrad     m2, 10
> > -        psrad     m3, 10
> > +        psrad     m0, INTERP_SHIFT_SP
> > +        psrad     m1, INTERP_SHIFT_SP
> > +        psrad     m2, INTERP_SHIFT_SP
> > +        psrad     m3, INTERP_SHIFT_SP
> >      %endif
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -4616,18 +4617,18 @@
> >  %elifidn %2, ps
> >      paddd     m0, m6
> >      paddd     m2, m6
> > -    psrad     m0, 2
> > -    psrad     m2, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> >      packssdw  m0, m2
> >  %else
> >      paddd     m0, m6
> >      paddd     m2, m6
> >      %ifidn %2, pp
> > -        psrad     m0, 6
> > -        psrad     m2, 6
> > +        psrad     m0, INTERP_SHIFT_PP
> > +        psrad     m2, INTERP_SHIFT_PP
> >      %else
> > -        psrad     m0, 10
> > -        psrad     m2, 10
> > +        psrad     m0, INTERP_SHIFT_SP
> > +        psrad     m2, INTERP_SHIFT_SP
> >      %endif
> >      packusdw  m0, m2
> >      CLIPW     m0, m5,    m7
> > @@ -4644,7 +4645,6 @@
> >  
> >      dec       r4d
> >      jnz       .loopH
> > -
> >      RET
> >  %endmacro
> >  
> > @@ -4712,7 +4712,7 @@
> >      mov       r4d, %2/2
> >  
> >  %ifidn %3, pp
> > -    mova      m7, [tab_c_32]
> > +    mova      m7, [INTERP_OFFSET_PP]
> >  %elifidn %3, sp
> >      mova      m7, [INTERP_OFFSET_SP]
> >  %elifidn %3, ps
> > @@ -4748,10 +4748,10 @@
> >      paddd     m2, m7
> >      paddd     m3, m7
> >      %ifidn %3, pp
> > -        psrad     m0, 6
> > -        psrad     m1, 6
> > -        psrad     m2, 6
> > -        psrad     m3, 6
> > +        psrad     m0, INTERP_SHIFT_PP
> > +        psrad     m1, INTERP_SHIFT_PP
> > +        psrad     m2, INTERP_SHIFT_PP
> > +        psrad     m3, INTERP_SHIFT_PP
> >      %else
> >          psrad     m0, INTERP_SHIFT_SP
> >          psrad     m1, INTERP_SHIFT_SP
> > @@ -4772,7 +4772,6 @@
> >  
> >      dec       r4d
> >      jnz       .loopH
> > -
> >      RET
> >  %endmacro
> >  
> > @@ -4868,9 +4867,9 @@
> >      mov             r6d, %1/4
> >  
> >  %ifidn %2,pp
> > -    vbroadcasti128  m8, [pd_32]
> > +    vbroadcasti128  m8, [INTERP_OFFSET_PP]
> >  %elifidn %2, sp
> > -    mova            m8, [pd_524800]
> > +    mova            m8, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m8, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -4934,20 +4933,20 @@
> >      paddd           m2, m8
> >      paddd           m3, m8
> >  %ifidn %2,pp
> > -    psrad           m0, 6
> > -    psrad           m1, 6
> > -    psrad           m2, 6
> > -    psrad           m3, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> > +    psrad           m3, INTERP_SHIFT_PP
> >  %elifidn %2, sp
> > -    psrad           m0, 10
> > -    psrad           m1, 10
> > -    psrad           m2, 10
> > -    psrad           m3, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m1, 2
> > -    psrad           m2, 2
> > -    psrad           m3, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +    psrad           m3, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> > +    psrad           m3, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -5012,9 +5011,9 @@
> >      mov       r4d, %1/2
> >  
> >  %ifidn %2, pp
> > -    mova      m7, [tab_c_32]
> > +    mova      m7, [INTERP_OFFSET_PP]
> >  %elifidn %2, sp
> > -    mova      m7, [pd_524800]
> > +    mova      m7, [INTERP_OFFSET_SP]
> >  %elifidn %2, ps
> >      mova      m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -5034,10 +5033,10 @@
> >      paddd     m1, m7
> >      paddd     m2, m7
> >      paddd     m3, m7
> > -    psrad     m0, 2
> > -    psrad     m1, 2
> > -    psrad     m2, 2
> > -    psrad     m3, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m1, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> > +    psrad     m3, INTERP_SHIFT_PS
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5047,15 +5046,15 @@
> >      paddd     m2, m7
> >      paddd     m3, m7
> >   %ifidn %2, pp
> > -    psrad     m0, 6
> > -    psrad     m1, 6
> > -    psrad     m2, 6
> > -    psrad     m3, 6
> > -%else
> > -    psrad     m0, 10
> > -    psrad     m1, 10
> > -    psrad     m2, 10
> > -    psrad     m3, 10
> > +    psrad     m0, INTERP_SHIFT_PP
> > +    psrad     m1, INTERP_SHIFT_PP
> > +    psrad     m2, INTERP_SHIFT_PP
> > +    psrad     m3, INTERP_SHIFT_PP
> > +%else
> > +    psrad     m0, INTERP_SHIFT_SP
> > +    psrad     m1, INTERP_SHIFT_SP
> > +    psrad     m2, INTERP_SHIFT_SP
> > +    psrad     m3, INTERP_SHIFT_SP
> >  %endif
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5184,9 +5183,9 @@
> >      mov       r4d, %1/2
> >  
> >  %ifidn %2, pp
> > -    mova      m7, [tab_c_32]
> > +    mova      m7, [INTERP_OFFSET_PP]
> >  %elifidn %2, sp
> > -    mova      m7, [pd_524800]
> > +    mova      m7, [INTERP_OFFSET_SP]
> >  %elifidn %2, ps
> >      mova      m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -5213,18 +5212,18 @@
> >      paddd     m1, m7
> >      paddd     m2, m7
> >      paddd     m3, m7
> > -    psrad     m0, 2
> > -    psrad     m1, 2
> > -    psrad     m2, 2
> > -    psrad     m3, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m1, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> > +    psrad     m3, INTERP_SHIFT_PS
> >      paddd     m8, m7
> >      paddd     m9, m7
> >      paddd     m10, m7
> >      paddd     m11, m7
> > -    psrad     m8, 2
> > -    psrad     m9, 2
> > -    psrad     m10, 2
> > -    psrad     m11, 2
> > +    psrad     m8, INTERP_SHIFT_PS
> > +    psrad     m9, INTERP_SHIFT_PS
> > +    psrad     m10, INTERP_SHIFT_PS
> > +    psrad     m11, INTERP_SHIFT_PS
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5240,23 +5239,23 @@
> >      paddd     m10, m7
> >      paddd     m11, m7
> >   %ifidn %2, pp
> > -    psrad     m0, 6
> > -    psrad     m1, 6
> > -    psrad     m2, 6
> > -    psrad     m3, 6
> > -    psrad     m8, 6
> > -    psrad     m9, 6
> > -    psrad     m10, 6
> > -    psrad     m11, 6
> > -%else
> > -    psrad     m0, 10
> > -    psrad     m1, 10
> > -    psrad     m2, 10
> > -    psrad     m3, 10
> > -    psrad     m8, 10
> > -    psrad     m9, 10
> > -    psrad     m10, 10
> > -    psrad     m11, 10
> > +    psrad     m0, INTERP_SHIFT_PP
> > +    psrad     m1, INTERP_SHIFT_PP
> > +    psrad     m2, INTERP_SHIFT_PP
> > +    psrad     m3, INTERP_SHIFT_PP
> > +    psrad     m8, INTERP_SHIFT_PP
> > +    psrad     m9, INTERP_SHIFT_PP
> > +    psrad     m10, INTERP_SHIFT_PP
> > +    psrad     m11, INTERP_SHIFT_PP
> > +%else
> > +    psrad     m0, INTERP_SHIFT_SP
> > +    psrad     m1, INTERP_SHIFT_SP
> > +    psrad     m2, INTERP_SHIFT_SP
> > +    psrad     m3, INTERP_SHIFT_SP
> > +    psrad     m8, INTERP_SHIFT_SP
> > +    psrad     m9, INTERP_SHIFT_SP
> > +    psrad     m10, INTERP_SHIFT_SP
> > +    psrad     m11, INTERP_SHIFT_SP
> >  %endif
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5326,9 +5325,9 @@
> >      mov       r4d, %1/2
> >  
> >  %ifidn %2, pp
> > -    mova      m7, [tab_c_32]
> > +    mova      m7, [INTERP_OFFSET_PP]
> >  %elifidn %2, sp
> > -    mova      m7, [pd_524800]
> > +    mova      m7, [INTERP_OFFSET_SP]
> >  %elifidn %2, ps
> >      mova      m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -5380,10 +5379,10 @@
> >      paddd     m1, m7
> >      paddd     m2, m7
> >      paddd     m3, m7
> > -    psrad     m0, 2
> > -    psrad     m1, 2
> > -    psrad     m2, 2
> > -    psrad     m3, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m1, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> > +    psrad     m3, INTERP_SHIFT_PS
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5393,15 +5392,15 @@
> >      paddd     m2, m7
> >      paddd     m3, m7
> >  %ifidn %2, pp
> > -    psrad     m0, 6
> > -    psrad     m1, 6
> > -    psrad     m2, 6
> > -    psrad     m3, 6
> > -%else
> > -    psrad     m0, 10
> > -    psrad     m1, 10
> > -    psrad     m2, 10
> > -    psrad     m3, 10
> > +    psrad     m0, INTERP_SHIFT_PP
> > +    psrad     m1, INTERP_SHIFT_PP
> > +    psrad     m2, INTERP_SHIFT_PP
> > +    psrad     m3, INTERP_SHIFT_PP
> > +%else
> > +    psrad     m0, INTERP_SHIFT_SP
> > +    psrad     m1, INTERP_SHIFT_SP
> > +    psrad     m2, INTERP_SHIFT_SP
> > +    psrad     m3, INTERP_SHIFT_SP
> >  %endif
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5457,9 +5456,9 @@
> >      mov       r4d, %1/2
> >  
> >  %ifidn %2, pp
> > -    mova      m7, [tab_c_32]
> > +    mova      m7, [INTERP_OFFSET_PP]
> >  %elifidn %2, sp
> > -    mova      m7, [pd_524800]
> > +    mova      m7, [INTERP_OFFSET_SP]
> >  %elifidn %2, ps
> >      mova      m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -5479,10 +5478,10 @@
> >      paddd     m1, m7
> >      paddd     m2, m7
> >      paddd     m3, m7
> > -    psrad     m0, 2
> > -    psrad     m1, 2
> > -    psrad     m2, 2
> > -    psrad     m3, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m1, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> > +    psrad     m3, INTERP_SHIFT_PS
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5492,15 +5491,15 @@
> >      paddd     m2, m7
> >      paddd     m3, m7
> >   %ifidn %2, pp
> > -    psrad     m0, 6
> > -    psrad     m1, 6
> > -    psrad     m2, 6
> > -    psrad     m3, 6
> > -%else
> > -    psrad     m0, 10
> > -    psrad     m1, 10
> > -    psrad     m2, 10
> > -    psrad     m3, 10
> > +    psrad     m0, INTERP_SHIFT_PP
> > +    psrad     m1, INTERP_SHIFT_PP
> > +    psrad     m2, INTERP_SHIFT_PP
> > +    psrad     m3, INTERP_SHIFT_PP
> > +%else
> > +    psrad     m0, INTERP_SHIFT_SP
> > +    psrad     m1, INTERP_SHIFT_SP
> > +    psrad     m2, INTERP_SHIFT_SP
> > +    psrad     m3, INTERP_SHIFT_SP
> >  %endif
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5610,9 +5609,9 @@
> >      mov       r4d, %1/2
> >  
> >  %ifidn %2, pp
> > -    mova      m7, [tab_c_32]
> > +    mova      m7, [INTERP_OFFSET_PP]
> >  %elifidn %2, sp
> > -    mova      m7, [pd_524800]
> > +    mova      m7, [INTERP_OFFSET_SP]
> >  %elifidn %2, ps
> >      mova      m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -5639,18 +5638,18 @@
> >      paddd     m1, m7
> >      paddd     m2, m7
> >      paddd     m3, m7
> > -    psrad     m0, 2
> > -    psrad     m1, 2
> > -    psrad     m2, 2
> > -    psrad     m3, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m1, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> > +    psrad     m3, INTERP_SHIFT_PS
> >      paddd     m8, m7
> >      paddd     m9, m7
> >      paddd     m10, m7
> >      paddd     m11, m7
> > -    psrad     m8, 2
> > -    psrad     m9, 2
> > -    psrad     m10, 2
> > -    psrad     m11, 2
> > +    psrad     m8, INTERP_SHIFT_PS
> > +    psrad     m9, INTERP_SHIFT_PS
> > +    psrad     m10, INTERP_SHIFT_PS
> > +    psrad     m11, INTERP_SHIFT_PS
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5666,23 +5665,23 @@
> >      paddd     m10, m7
> >      paddd     m11, m7
> >   %ifidn %2, pp
> > -    psrad     m0, 6
> > -    psrad     m1, 6
> > -    psrad     m2, 6
> > -    psrad     m3, 6
> > -    psrad     m8, 6
> > -    psrad     m9, 6
> > -    psrad     m10, 6
> > -    psrad     m11, 6
> > -%else
> > -    psrad     m0, 10
> > -    psrad     m1, 10
> > -    psrad     m2, 10
> > -    psrad     m3, 10
> > -    psrad     m8, 10
> > -    psrad     m9, 10
> > -    psrad     m10, 10
> > -    psrad     m11, 10
> > +    psrad     m0, INTERP_SHIFT_PP
> > +    psrad     m1, INTERP_SHIFT_PP
> > +    psrad     m2, INTERP_SHIFT_PP
> > +    psrad     m3, INTERP_SHIFT_PP
> > +    psrad     m8, INTERP_SHIFT_PP
> > +    psrad     m9, INTERP_SHIFT_PP
> > +    psrad     m10, INTERP_SHIFT_PP
> > +    psrad     m11, INTERP_SHIFT_PP
> > +%else
> > +    psrad     m0, INTERP_SHIFT_SP
> > +    psrad     m1, INTERP_SHIFT_SP
> > +    psrad     m2, INTERP_SHIFT_SP
> > +    psrad     m3, INTERP_SHIFT_SP
> > +    psrad     m8, INTERP_SHIFT_SP
> > +    psrad     m9, INTERP_SHIFT_SP
> > +    psrad     m10, INTERP_SHIFT_SP
> > +    psrad     m11, INTERP_SHIFT_SP
> >  %endif
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5733,9 +5732,9 @@
> >      mov       r4d, 32
> >  
> >  %ifidn %1, pp
> > -    mova      m7, [tab_c_32]
> > +    mova      m7, [INTERP_OFFSET_PP]
> >  %elifidn %1, sp
> > -    mova      m7, [pd_524800]
> > +    mova      m7, [INTERP_OFFSET_SP]
> >  %elifidn %1, ps
> >      mova      m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -5787,10 +5786,10 @@
> >      paddd     m1, m7
> >      paddd     m2, m7
> >      paddd     m3, m7
> > -    psrad     m0, 2
> > -    psrad     m1, 2
> > -    psrad     m2, 2
> > -    psrad     m3, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m1, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> > +    psrad     m3, INTERP_SHIFT_PS
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5800,15 +5799,15 @@
> >      paddd     m2, m7
> >      paddd     m3, m7
> >  %ifidn %1, pp
> > -    psrad     m0, 6
> > -    psrad     m1, 6
> > -    psrad     m2, 6
> > -    psrad     m3, 6
> > -%else
> > -    psrad     m0, 10
> > -    psrad     m1, 10
> > -    psrad     m2, 10
> > -    psrad     m3, 10
> > +    psrad     m0, INTERP_SHIFT_PP
> > +    psrad     m1, INTERP_SHIFT_PP
> > +    psrad     m2, INTERP_SHIFT_PP
> > +    psrad     m3, INTERP_SHIFT_PP
> > +%else
> > +    psrad     m0, INTERP_SHIFT_SP
> > +    psrad     m1, INTERP_SHIFT_SP
> > +    psrad     m2, INTERP_SHIFT_SP
> > +    psrad     m3, INTERP_SHIFT_SP
> >  %endif
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -5827,6 +5826,7 @@
> >      jnz       .loopH
> >      RET
> >  %endmacro
> > +
> >      FILTER_VER_CHROMA_W16_48x64_avx2 pp, 8
> >      FILTER_VER_CHROMA_W16_48x64_avx2 ps, 8
> >      FILTER_VER_CHROMA_W16_48x64_avx2 ss, 7
> > @@ -5834,7 +5834,6 @@
> >  
> >  INIT_XMM sse2
> >  cglobal chroma_p2s, 3, 7, 3
> > -
> >      ; load width and height
> >      mov         r3d, r3m
> >      mov         r4d, r4m
> > @@ -5850,11 +5849,11 @@
> >      lea         r6, [r0 + r5 * 2]
> >  
> >      movu        m0, [r6]
> > -    psllw       m0, 4
> > +    psllw       m0, (14 - BIT_DEPTH)
> >      paddw       m0, m2
> >  
> >      movu        m1, [r6 + r1]
> > -    psllw       m1, 4
> > +    psllw       m1, (14 - BIT_DEPTH)
> >      paddw       m1, m2
> >  
> >      add         r5d, 8
> > @@ -5887,7 +5886,6 @@
> >  
> >      sub         r4d, 2
> >      jnz         .loopH
> > -
> >      RET
> >  
> >  %macro PROCESS_LUMA_VER_W4_4R 0
> > @@ -5975,7 +5973,7 @@
> >      lea       r6, [tab_LumaCoeffV + r4]
> >  %endif
> >  
> > -    mova      m7, [pd_32]
> > +    mova      m7, [INTERP_OFFSET_PP]
> >  
> >      mov       dword [rsp], %2/4
> >  .loopH:
> > @@ -5988,10 +5986,10 @@
> >      paddd     m2, m7
> >      paddd     m3, m7
> >  
> > -    psrad     m0, 6
> > -    psrad     m1, 6
> > -    psrad     m2, 6
> > -    psrad     m3, 6
> > +    psrad     m0, INTERP_SHIFT_PP
> > +    psrad     m1, INTERP_SHIFT_PP
> > +    psrad     m2, INTERP_SHIFT_PP
> > +    psrad     m3, INTERP_SHIFT_PP
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -6017,7 +6015,6 @@
> >  
> >      dec       dword [rsp]
> >      jnz       .loopH
> > -
> >      RET
> >  %endmacro
> >  
> > @@ -6126,14 +6123,14 @@
> >      paddd           m0, m6
> >      paddd           m2, m6
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m2, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m2, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m2, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -6294,20 +6291,20 @@
> >      paddd           m2, m11
> >      paddd           m3, m11
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m1, 6
> > -    psrad           m2, 6
> > -    psrad           m3, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> > +    psrad           m3, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m1, 10
> > -    psrad           m2, 10
> > -    psrad           m3, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m1, 2
> > -    psrad           m2, 2
> > -    psrad           m3, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +    psrad           m3, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> > +    psrad           m3, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -6365,20 +6362,20 @@
> >      paddd           m6, m11
> >      paddd           m7, m11
> >  %ifidn %1,pp
> > -    psrad           m4, 6
> > -    psrad           m5, 6
> > -    psrad           m6, 6
> > -    psrad           m7, 6
> > +    psrad           m4, INTERP_SHIFT_PP
> > +    psrad           m5, INTERP_SHIFT_PP
> > +    psrad           m6, INTERP_SHIFT_PP
> > +    psrad           m7, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m4, 10
> > -    psrad           m5, 10
> > -    psrad           m6, 10
> > -    psrad           m7, 10
> > -%else
> > -    psrad           m4, 2
> > -    psrad           m5, 2
> > -    psrad           m6, 2
> > -    psrad           m7, 2
> > +    psrad           m4, INTERP_SHIFT_SP
> > +    psrad           m5, INTERP_SHIFT_SP
> > +    psrad           m6, INTERP_SHIFT_SP
> > +    psrad           m7, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m4, INTERP_SHIFT_PS
> > +    psrad           m5, INTERP_SHIFT_PS
> > +    psrad           m6, INTERP_SHIFT_PS
> > +    psrad           m7, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -6538,26 +6535,26 @@
> >      paddd           m4, m14
> >      paddd           m5, m14
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m1, 6
> > -    psrad           m2, 6
> > -    psrad           m3, 6
> > -    psrad           m4, 6
> > -    psrad           m5, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> > +    psrad           m3, INTERP_SHIFT_PP
> > +    psrad           m4, INTERP_SHIFT_PP
> > +    psrad           m5, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m1, 10
> > -    psrad           m2, 10
> > -    psrad           m3, 10
> > -    psrad           m4, 10
> > -    psrad           m5, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m1, 2
> > -    psrad           m2, 2
> > -    psrad           m3, 2
> > -    psrad           m4, 2
> > -    psrad           m5, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +    psrad           m3, INTERP_SHIFT_SP
> > +    psrad           m4, INTERP_SHIFT_SP
> > +    psrad           m5, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> > +    psrad           m3, INTERP_SHIFT_PS
> > +    psrad           m4, INTERP_SHIFT_PS
> > +    psrad           m5, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -6620,14 +6617,14 @@
> >      paddd           m6, m14
> >      paddd           m7, m14
> >  %ifidn %1,pp
> > -    psrad           m6, 6
> > -    psrad           m7, 6
> > +    psrad           m6, INTERP_SHIFT_PP
> > +    psrad           m7, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m6, 10
> > -    psrad           m7, 10
> > -%else
> > -    psrad           m6, 2
> > -    psrad           m7, 2
> > +    psrad           m6, INTERP_SHIFT_SP
> > +    psrad           m7, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m6, INTERP_SHIFT_PS
> > +    psrad           m7, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -6734,32 +6731,32 @@
> >      paddd           m0, m14
> >      paddd           m1, m14
> >  %ifidn %1,pp
> > -    psrad           m8, 6
> > -    psrad           m9, 6
> > -    psrad           m10, 6
> > -    psrad           m11, 6
> > -    psrad           m12, 6
> > -    psrad           m13, 6
> > -    psrad           m0, 6
> > -    psrad           m1, 6
> > +    psrad           m8, INTERP_SHIFT_PP
> > +    psrad           m9, INTERP_SHIFT_PP
> > +    psrad           m10, INTERP_SHIFT_PP
> > +    psrad           m11, INTERP_SHIFT_PP
> > +    psrad           m12, INTERP_SHIFT_PP
> > +    psrad           m13, INTERP_SHIFT_PP
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m8, 10
> > -    psrad           m9, 10
> > -    psrad           m10, 10
> > -    psrad           m11, 10
> > -    psrad           m12, 10
> > -    psrad           m13, 10
> > -    psrad           m0, 10
> > -    psrad           m1, 10
> > -%else
> > -    psrad           m8, 2
> > -    psrad           m9, 2
> > -    psrad           m10, 2
> > -    psrad           m11, 2
> > -    psrad           m12, 2
> > -    psrad           m13, 2
> > -    psrad           m0, 2
> > -    psrad           m1, 2
> > +    psrad           m8, INTERP_SHIFT_SP
> > +    psrad           m9, INTERP_SHIFT_SP
> > +    psrad           m10, INTERP_SHIFT_SP
> > +    psrad           m11, INTERP_SHIFT_SP
> > +    psrad           m12, INTERP_SHIFT_SP
> > +    psrad           m13, INTERP_SHIFT_SP
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m8, INTERP_SHIFT_PS
> > +    psrad           m9, INTERP_SHIFT_PS
> > +    psrad           m10, INTERP_SHIFT_PS
> > +    psrad           m11, INTERP_SHIFT_PS
> > +    psrad           m12, INTERP_SHIFT_PS
> > +    psrad           m13, INTERP_SHIFT_PS
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -6819,7 +6816,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m14, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m14, [pd_524800]
> > +    mova            m14, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m14, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -6870,7 +6867,7 @@
> >  %ifidn %3,pp
> >      vbroadcasti128  m14, [pd_32]
> >  %elifidn %3, sp
> > -    mova            m14, [pd_524800]
> > +    mova            m14, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m14, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -6953,7 +6950,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m14, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m14, [pd_524800]
> > +    mova            m14, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m14, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -7089,26 +7086,26 @@
> >      paddd           m4, m14
> >      paddd           m5, m14
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m1, 6
> > -    psrad           m2, 6
> > -    psrad           m3, 6
> > -    psrad           m4, 6
> > -    psrad           m5, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> > +    psrad           m3, INTERP_SHIFT_PP
> > +    psrad           m4, INTERP_SHIFT_PP
> > +    psrad           m5, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m1, 10
> > -    psrad           m2, 10
> > -    psrad           m3, 10
> > -    psrad           m4, 10
> > -    psrad           m5, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m1, 2
> > -    psrad           m2, 2
> > -    psrad           m3, 2
> > -    psrad           m4, 2
> > -    psrad           m5, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +    psrad           m3, INTERP_SHIFT_SP
> > +    psrad           m4, INTERP_SHIFT_SP
> > +    psrad           m5, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> > +    psrad           m3, INTERP_SHIFT_PS
> > +    psrad           m4, INTERP_SHIFT_PS
> > +    psrad           m5, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -7171,14 +7168,14 @@
> >      paddd           m6, m14
> >      paddd           m7, m14
> >  %ifidn %1,pp
> > -    psrad           m6, 6
> > -    psrad           m7, 6
> > +    psrad           m6, INTERP_SHIFT_PP
> > +    psrad           m7, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m6, 10
> > -    psrad           m7, 10
> > -%else
> > -    psrad           m6, 2
> > -    psrad           m7, 2
> > +    psrad           m6, INTERP_SHIFT_SP
> > +    psrad           m7, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m6, INTERP_SHIFT_PS
> > +    psrad           m7, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -7285,32 +7282,32 @@
> >      paddd           m0, m14
> >      paddd           m1, m14
> >  %ifidn %1,pp
> > -    psrad           m8, 6
> > -    psrad           m9, 6
> > -    psrad           m10, 6
> > -    psrad           m11, 6
> > -    psrad           m12, 6
> > -    psrad           m13, 6
> > -    psrad           m0, 6
> > -    psrad           m1, 6
> > +    psrad           m8, INTERP_SHIFT_PP
> > +    psrad           m9, INTERP_SHIFT_PP
> > +    psrad           m10, INTERP_SHIFT_PP
> > +    psrad           m11, INTERP_SHIFT_PP
> > +    psrad           m12, INTERP_SHIFT_PP
> > +    psrad           m13, INTERP_SHIFT_PP
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m8, 10
> > -    psrad           m9, 10
> > -    psrad           m10, 10
> > -    psrad           m11, 10
> > -    psrad           m12, 10
> > -    psrad           m13, 10
> > -    psrad           m0, 10
> > -    psrad           m1, 10
> > -%else
> > -    psrad           m8, 2
> > -    psrad           m9, 2
> > -    psrad           m10, 2
> > -    psrad           m11, 2
> > -    psrad           m12, 2
> > -    psrad           m13, 2
> > -    psrad           m0, 2
> > -    psrad           m1, 2
> > +    psrad           m8, INTERP_SHIFT_SP
> > +    psrad           m9, INTERP_SHIFT_SP
> > +    psrad           m10, INTERP_SHIFT_SP
> > +    psrad           m11, INTERP_SHIFT_SP
> > +    psrad           m12, INTERP_SHIFT_SP
> > +    psrad           m13, INTERP_SHIFT_SP
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m8, INTERP_SHIFT_PS
> > +    psrad           m9, INTERP_SHIFT_PS
> > +    psrad           m10, INTERP_SHIFT_PS
> > +    psrad           m11, INTERP_SHIFT_PS
> > +    psrad           m12, INTERP_SHIFT_PS
> > +    psrad           m13, INTERP_SHIFT_PS
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -7485,26 +7482,26 @@
> >      paddd           m4, m11
> >      paddd           m5, m11
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m1, 6
> > -    psrad           m2, 6
> > -    psrad           m3, 6
> > -    psrad           m4, 6
> > -    psrad           m5, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> > +    psrad           m3, INTERP_SHIFT_PP
> > +    psrad           m4, INTERP_SHIFT_PP
> > +    psrad           m5, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m1, 10
> > -    psrad           m2, 10
> > -    psrad           m3, 10
> > -    psrad           m4, 10
> > -    psrad           m5, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m1, 2
> > -    psrad           m2, 2
> > -    psrad           m3, 2
> > -    psrad           m4, 2
> > -    psrad           m5, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +    psrad           m3, INTERP_SHIFT_SP
> > +    psrad           m4, INTERP_SHIFT_SP
> > +    psrad           m5, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> > +    psrad           m3, INTERP_SHIFT_PS
> > +    psrad           m4, INTERP_SHIFT_PS
> > +    psrad           m5, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -7556,14 +7553,14 @@
> >      paddd           m6, m11
> >      paddd           m7, m11
> >  %ifidn %1,pp
> > -    psrad           m6, 6
> > -    psrad           m7, 6
> > +    psrad           m6, INTERP_SHIFT_PP
> > +    psrad           m7, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m6, 10
> > -    psrad           m7, 10
> > -%else
> > -    psrad           m6, 2
> > -    psrad           m7, 2
> > +    psrad           m6, INTERP_SHIFT_SP
> > +    psrad           m7, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m6, INTERP_SHIFT_PS
> > +    psrad           m7, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -7600,7 +7597,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m11, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m11, [pd_524800]
> > +    mova            m11, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m11, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -7647,7 +7644,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m14, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m14, [pd_524800]
> > +    mova            m14, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m14, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -7765,20 +7762,20 @@
> >      paddd           m2, m7
> >      paddd           m3, m7
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m1, 6
> > -    psrad           m2, 6
> > -    psrad           m3, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> > +    psrad           m3, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m1, 10
> > -    psrad           m2, 10
> > -    psrad           m3, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m1, 2
> > -    psrad           m2, 2
> > -    psrad           m3, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +    psrad           m3, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> > +    psrad           m3, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -7801,7 +7798,7 @@
> >  
> >  %macro FILTER_VER_LUMA_AVX2_16x4 1
> >  INIT_YMM avx2
> > -cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize
> > +cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0-gprsize
> >      mov             r4d, r4m
> >      shl             r4d, 7
> >      add             r1d, r1d
> > @@ -7819,7 +7816,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m7, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m7, [pd_524800]
> > +    mova            m7, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -7864,7 +7861,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m7, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m7, [pd_524800]
> > +    mova            m7, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -7904,7 +7901,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m14, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m14, [pd_524800]
> > +    mova            m14, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m14, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -8014,20 +8011,20 @@
> >      paddd           m2, m14
> >      paddd           m3, m14
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m1, 6
> > -    psrad           m2, 6
> > -    psrad           m3, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> > +    psrad           m3, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m1, 10
> > -    psrad           m2, 10
> > -    psrad           m3, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m1, 2
> > -    psrad           m2, 2
> > -    psrad           m3, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +    psrad           m3, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> > +    psrad           m3, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -8105,20 +8102,20 @@
> >      paddd           m6, m14
> >      paddd           m7, m14
> >  %ifidn %1,pp
> > -    psrad           m4, 6
> > -    psrad           m5, 6
> > -    psrad           m6, 6
> > -    psrad           m7, 6
> > +    psrad           m4, INTERP_SHIFT_PP
> > +    psrad           m5, INTERP_SHIFT_PP
> > +    psrad           m6, INTERP_SHIFT_PP
> > +    psrad           m7, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m4, 10
> > -    psrad           m5, 10
> > -    psrad           m6, 10
> > -    psrad           m7, 10
> > -%else
> > -    psrad           m4, 2
> > -    psrad           m5, 2
> > -    psrad           m6, 2
> > -    psrad           m7, 2
> > +    psrad           m4, INTERP_SHIFT_SP
> > +    psrad           m5, INTERP_SHIFT_SP
> > +    psrad           m6, INTERP_SHIFT_SP
> > +    psrad           m7, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m4, INTERP_SHIFT_PS
> > +    psrad           m5, INTERP_SHIFT_PS
> > +    psrad           m6, INTERP_SHIFT_PS
> > +    psrad           m7, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -8182,20 +8179,20 @@
> >      paddd           m10, m14
> >      paddd           m11, m14
> >  %ifidn %1,pp
> > -    psrad           m8, 6
> > -    psrad           m9, 6
> > -    psrad           m10, 6
> > -    psrad           m11, 6
> > +    psrad           m8, INTERP_SHIFT_PP
> > +    psrad           m9, INTERP_SHIFT_PP
> > +    psrad           m10, INTERP_SHIFT_PP
> > +    psrad           m11, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m8, 10
> > -    psrad           m9, 10
> > -    psrad           m10, 10
> > -    psrad           m11, 10
> > -%else
> > -    psrad           m8, 2
> > -    psrad           m9, 2
> > -    psrad           m10, 2
> > -    psrad           m11, 2
> > +    psrad           m8, INTERP_SHIFT_SP
> > +    psrad           m9, INTERP_SHIFT_SP
> > +    psrad           m10, INTERP_SHIFT_SP
> > +    psrad           m11, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m8, INTERP_SHIFT_PS
> > +    psrad           m9, INTERP_SHIFT_PS
> > +    psrad           m10, INTERP_SHIFT_PS
> > +    psrad           m11, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -8251,7 +8248,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m7, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m7, [pd_524800]
> > +    mova            m7, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -8315,14 +8312,14 @@
> >      paddd           m0, m7
> >      paddd           m2, m7
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m2, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m2, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m2, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -8366,14 +8363,14 @@
> >      paddd           m4, m7
> >      paddd           m1, m7
> >  %ifidn %1,pp
> > -    psrad           m4, 6
> > -    psrad           m1, 6
> > +    psrad           m4, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m4, 10
> > -    psrad           m1, 10
> > -%else
> > -    psrad           m4, 2
> > -    psrad           m1, 2
> > +    psrad           m4, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m4, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -8458,14 +8455,14 @@
> >      paddd           m0, m7
> >      paddd           m2, m7
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m2, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m2, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m2, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m2, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m2, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m2, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -8516,14 +8513,14 @@
> >      paddd           m4, m7
> >      paddd           m1, m7
> >  %ifidn %1,pp
> > -    psrad           m4, 6
> > -    psrad           m1, 6
> > +    psrad           m4, INTERP_SHIFT_PP
> > +    psrad           m1, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m4, 10
> > -    psrad           m1, 10
> > -%else
> > -    psrad           m4, 2
> > -    psrad           m1, 2
> > +    psrad           m4, INTERP_SHIFT_SP
> > +    psrad           m1, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m4, INTERP_SHIFT_PS
> > +    psrad           m1, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -8574,14 +8571,14 @@
> >      paddd           m6, m7
> >      paddd           m5, m7
> >  %ifidn %1,pp
> > -    psrad           m6, 6
> > -    psrad           m5, 6
> > +    psrad           m6, INTERP_SHIFT_PP
> > +    psrad           m5, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m6, 10
> > -    psrad           m5, 10
> > -%else
> > -    psrad           m6, 2
> > -    psrad           m5, 2
> > +    psrad           m6, INTERP_SHIFT_SP
> > +    psrad           m5, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m6, INTERP_SHIFT_PS
> > +    psrad           m5, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -8625,14 +8622,14 @@
> >      paddd           m0, m7
> >      paddd           m3, m7
> >  %ifidn %1,pp
> > -    psrad           m0, 6
> > -    psrad           m3, 6
> > +    psrad           m0, INTERP_SHIFT_PP
> > +    psrad           m3, INTERP_SHIFT_PP
> >  %elifidn %1, sp
> > -    psrad           m0, 10
> > -    psrad           m3, 10
> > -%else
> > -    psrad           m0, 2
> > -    psrad           m3, 2
> > +    psrad           m0, INTERP_SHIFT_SP
> > +    psrad           m3, INTERP_SHIFT_SP
> > +%else
> > +    psrad           m0, INTERP_SHIFT_PS
> > +    psrad           m3, INTERP_SHIFT_PS
> >  %endif
> >  %endif
> >  
> > @@ -8671,7 +8668,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m7, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m7, [pd_524800]
> > +    mova            m7, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m7, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -8706,7 +8703,7 @@
> >  %ifidn %1,pp
> >      vbroadcasti128  m14, [pd_32]
> >  %elifidn %1, sp
> > -    mova            m14, [pd_524800]
> > +    mova            m14, [INTERP_OFFSET_SP]
> >  %else
> >      vbroadcasti128  m14, [INTERP_OFFSET_PS]
> >  %endif
> > @@ -8758,10 +8755,10 @@
> >      paddd     m2, m7
> >      paddd     m3, m7
> >  
> > -    psrad     m0, 2
> > -    psrad     m1, 2
> > -    psrad     m2, 2
> > -    psrad     m3, 2
> > +    psrad     m0, INTERP_SHIFT_PS
> > +    psrad     m1, INTERP_SHIFT_PS
> > +    psrad     m2, INTERP_SHIFT_PS
> > +    psrad     m3, INTERP_SHIFT_PS
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -8784,7 +8781,6 @@
> >  
> >      dec       dword [rsp]
> >      jnz       .loopH
> > -
> >      RET
> >  %endmacro
> >  
> > @@ -8837,7 +8833,7 @@
> >      lea       r6, [tab_LumaCoeffV + r4]
> >  %endif
> >  
> > -    mova      m7, [tab_c_524800]
> > +    mova      m7, [INTERP_OFFSET_SP]
> >  
> >      mov       dword [rsp], %2/4
> >  .loopH:
> > @@ -8850,10 +8846,10 @@
> >      paddd     m2, m7
> >      paddd     m3, m7
> >  
> > -    psrad     m0, 10
> > -    psrad     m1, 10
> > -    psrad     m2, 10
> > -    psrad     m3, 10
> > +    psrad     m0, INTERP_SHIFT_SP
> > +    psrad     m1, INTERP_SHIFT_SP
> > +    psrad     m2, INTERP_SHIFT_SP
> > +    psrad     m3, INTERP_SHIFT_SP
> >  
> >      packssdw  m0, m1
> >      packssdw  m2, m3
> > @@ -8879,7 +8875,6 @@
> >  
> >      dec       dword [rsp]
> >      jnz       .loopH
> > -
> >      RET
> >  %endmacro
> >  
> > @@ -8963,7 +8958,6 @@
> >  
> >      dec        dword [rsp]
> >      jnz        .loopH
> > -
> >      RET
> >  %endmacro
> >  
> > @@ -9011,7 +9005,7 @@
> >  %rep %1/4
> >      movd       m0, [r0]
> >      movhps     m0, [r0 + r1]
> > -    psllw      m0, 4
> > +    psllw      m0, (14 - BIT_DEPTH)
> >      psubw      m0, m1
> >  
> >      movd       [r2 + r3 * 0], m0
> > @@ -9019,7 +9013,7 @@
> >  
> >      movd       m0, [r0 + r1 * 2]
> >      movhps     m0, [r0 + r4]
> > -    psllw      m0, 4
> > +    psllw      m0, (14 - BIT_DEPTH)
> >      psubw      m0, m1
> >  
> >      movd       [r2 + r3 * 2], m0
> > @@ -10293,14 +10287,13 @@
> >      mov                         r4d,               r4m
> >      add                         r1d,               r1d
> >      add                         r3d,               r3d
> > -%ifdef PIC
> > -
> > +
> > +%ifdef PIC
> >      lea                         r6,                [tab_LumaCoeff]
> > -    lea                         r4 ,               [r4 * 8]
> > +    lea                         r4,                [r4 * 8]
> >      vbroadcasti128              m0,                [r6 + r4 * 2]
> > -
> > -%else
> > -    lea                         r4 ,                [r4 * 8]
> > +%else
> > +    lea                         r4,                [r4 * 8]
> >      vbroadcasti128              m0,                [tab_LumaCoeff + r4 * 2]
> >  %endif
> >  
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/loopfilter.asm
> > --- a/source/common/x86/loopfilter.asm	Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/loopfilter.asm	Tue Jul 21 14:30:11 2015 -0700
> > @@ -39,7 +39,7 @@
> >  cextern pb_128
> >  cextern pb_2
> >  cextern pw_2
> > -cextern pw_1023
> > +cextern pw_pixel_max
> >  cextern pb_movemask
> >  cextern pw_1
> >  cextern hmul_16p
> > @@ -81,7 +81,7 @@
> >      palignr     m2, m3, m5, 15
> >      por         m2, m0
> >  
> > -    mova        m4, [pw_1023]
> > +    mova        m4, [pw_pixel_max]
> >      psignb      m2, [pb_128]                ; m2 = signLeft
> >      pxor        m0, m0
> >      palignr     m0, m3, 15
> > @@ -127,7 +127,7 @@
> >      palignr     m2, m3, m5, 15
> >      por         m2, m0
> >  
> > -    mova        m4, [pw_1023]
> > +    mova        m4, [pw_pixel_max]
> >      psignb      m2, [pb_128]                ; m2 = signLeft
> >      pxor        m0, m0
> >      palignr     m0, m3, 15
> > @@ -249,7 +249,7 @@
> >      neg             r1b
> >      movd            xm1, r1d
> >      vinserti128     m0, m0, xm1, 1
> > -    mova            m5, [pw_1023]
> > +    mova            m5, [pw_pixel_max]
> >      mov             r1d, r4m
> >      add             r1d, r1d
> >      shr             r2d, 4
> > @@ -402,8 +402,8 @@
> >  
> >      pmaxsw      m7, m0
> >      pmaxsw      m5, m0
> > -    pminsw      m7, [pw_1023]
> > -    pminsw      m5, [pw_1023]
> > +    pminsw      m7, [pw_pixel_max]
> > +    pminsw      m5, [pw_pixel_max]
> >  
> >      movu        [r0], m7
> >      movu        [r0 + 16],  m5
> > @@ -468,7 +468,7 @@
> >      mov         r4d, r4m
> >      mova        m4, [pb_2]
> >      shr         r4d, 4
> > -    mova        m0, [pw_1023]
> > +    mova        m0, [pw_pixel_max]
> >  .loop
> >      movu        m5, [r0]
> >      movu        m3, [r0 + r3]
> > @@ -559,7 +559,7 @@
> >      add         r3d, r3d
> >      mov         r4d, r4m
> >      pxor        m0, m0                      ; m0 = 0
> > -    mova        m6, [pw_1023]
> > +    mova        m6, [pw_pixel_max]
> >      mov         r5d, r4d
> >      shr         r4d, 4
> >      mov         r6, r0
> > @@ -736,7 +736,7 @@
> >  cglobal saoCuOrgE1_2Rows, 4,5,8
> >      add             r3d, r3d
> >      mov             r4d, r4m
> > -    mova            m4, [pw_1023]
> > +    mova            m4, [pw_pixel_max]
> >      vbroadcasti128  m6, [r2]                ; m6 = m_iOffsetEo
> >      shr             r4d, 4
> >  .loop
> > @@ -884,8 +884,8 @@
> >      paddw       m5, m4
> >      pmaxsw      m7, m0
> >      pmaxsw      m5, m0
> > -    pminsw      m7, [pw_1023]
> > -    pminsw      m5, [pw_1023]
> > +    pminsw      m7, [pw_pixel_max]
> > +    pminsw      m5, [pw_pixel_max]
> >      movu        [r0], m7
> >      movu        [r0 + 16], m5
> >  
> > @@ -960,7 +960,7 @@
> >      movq            xm4, [r0 + r4 * 2]
> >      movhps          xm4, [r1 + r4]
> >      vbroadcasti128  m5, [r3]
> > -    mova            m6, [pw_1023]
> > +    mova            m6, [pw_pixel_max]
> >  .loop
> >      movu            m1, [r0]
> >      movu            m3, [r0 + r5 + 2]
> > @@ -1086,8 +1086,8 @@
> >      paddw           m7, m6
> >      pmaxsw          m1, m0
> >      pmaxsw          m7, m0
> > -    pminsw          m1, [pw_1023]
> > -    pminsw          m7, [pw_1023]
> > +    pminsw          m1, [pw_pixel_max]
> > +    pminsw          m7, [pw_pixel_max]
> >      movu            [r0], m1
> >      movu            [r0 + 32], m7
> >  
> > @@ -1212,8 +1212,8 @@
> >      paddw           m5, m4
> >      pmaxsw          m7, m0
> >      pmaxsw          m5, m0
> > -    pminsw          m7, [pw_1023]
> > -    pminsw          m5, [pw_1023]
> > +    pminsw          m7, [pw_pixel_max]
> > +    pminsw          m5, [pw_pixel_max]
> >      movu            [r0], m7
> >      movu            [r0 + 16], m5
> >  
> > @@ -1333,7 +1333,7 @@
> >      paddw           m1, m3
> >      pxor            m0, m0
> >      pmaxsw          m1, m0
> > -    pminsw          m1, [pw_1023]
> > +    pminsw          m1, [pw_pixel_max]
> >      movu            [r0], m1
> >  
> >      psubb           xm0, xm2
> > @@ -1461,8 +1461,8 @@
> >      pxor            m0, m0
> >      pmaxsw          m1, m0
> >      pmaxsw          m7, m0
> > -    pminsw          m1, [pw_1023]
> > -    pminsw          m7, [pw_1023]
> > +    pminsw          m1, [pw_pixel_max]
> > +    pminsw          m7, [pw_pixel_max]
> >      movu            [r0], m1
> >      movu            [r0 + 32], m7
> >  
> > @@ -1565,8 +1565,8 @@
> >  .loopW
> >      movu        m2, [r0 + r6]
> >      movu        m5, [r0 + r6 + 16]
> > -    psrlw       m0, m2, 5
> > -    psrlw       m6, m5, 5
> > +    psrlw       m0, m2, (BIT_DEPTH - 5)
> > +    psrlw       m6, m5, (BIT_DEPTH - 5)
> >      packuswb    m0, m6
> >      pand        m0, [pb_31]         ; m0 = [index]
> >  
> > @@ -1584,8 +1584,8 @@
> >      paddw       m5, m6
> >      pmaxsw      m2, m7
> >      pmaxsw      m5, m7
> > -    pminsw      m2, [pw_1023]
> > -    pminsw      m5, [pw_1023]
> > +    pminsw      m2, [pw_pixel_max]
> > +    pminsw      m5, [pw_pixel_max]
> >  
> >      movu        [r0 + r6], m2
> >      movu        [r0 + r6 + 16], m5
> > @@ -1656,7 +1656,7 @@
> >      sub             r1d, r2d
> >      sub             r1d, r2d
> >      shr             r2d, 4
> > -    mova            m7, [pw_1023]
> > +    mova            m7, [pw_pixel_max]
> >  
> >      mov             r6d, r3d
> >      shr             r3d, 1
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/mc-a.asm
> > --- a/source/common/x86/mc-a.asm	Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/mc-a.asm	Tue Jul 21 14:30:11 2015 -0700
> > @@ -32,6 +32,19 @@
> >  %include "x86inc.asm"
> >  %include "x86util.asm"
> >  
> > +%if BIT_DEPTH==8
> > +    %define ADDAVG_FACTOR       256
> > +    %define ADDAVG_ROUND        128
> > +%elif BIT_DEPTH==10
> > +    %define ADDAVG_FACTOR       1024
> > +    %define ADDAVG_ROUND        512
> > +%elif BIT_DEPTH==12
> > +    %define ADDAVG_FACTOR       4096
> > +    %define ADDAVG_ROUND        2048
> > +%else
> > +    %error Unsupport bit depth!
> > +%endif
> > +
> >  SECTION_RODATA 32
> >  
> >  ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
> > @@ -54,6 +67,8 @@
> >  cextern pw_512
> >  cextern pw_1023
> >  cextern pw_1024
> > +cextern pw_2048
> > +cextern pw_4096
> >  cextern pw_00ff
> >  cextern pw_pixel_max
> >  cextern pd_32
> > @@ -92,23 +107,24 @@
> >      punpcklqdq    m1,          m2
> >      punpcklqdq    m3,          m5
> >      paddw         m1,          m3
> > -    pmulhrsw      m1,          [pw_1024]
> > -    paddw         m1,          [pw_512]
> > +    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
> > +    paddw         m1,          [pw_ %+ ADDAVG_ROUND]
> >  
> >      pxor          m0,          m0
> >      pmaxsw        m1,          m0
> > -    pminsw        m1,          [pw_1023]
> > +    pminsw        m1,          [pw_pixel_max]
> >      movd          [r2],        m1
> >      pextrd        [r2 + r5],   m1, 1
> >      lea           r2,          [r2 + 2 * r5]
> >      pextrd        [r2],        m1, 2
> >      pextrd        [r2 + r5],   m1, 3
> > -
> >      RET
> > +
> > +
> >  ;-----------------------------------------------------------------------------
> >  INIT_XMM sse4
> >  cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova          m0,          [pw_512]
> > +    mova          m0,          [pw_ %+ ADDAVG_ROUND]
> >      pxor          m7,          m7
> >      add           r3,          r3
> >      add           r4,          r4
> > @@ -136,11 +152,11 @@
> >      punpcklqdq    m1,          m2
> >      punpcklqdq    m3,          m5
> >      paddw         m1,          m3
> > -    pmulhrsw      m1,          [pw_1024]
> > +    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
> >      paddw         m1,          m0
> >  
> >      pmaxsw        m1,          m7
> > -    pminsw        m1,          [pw_1023]
> > +    pminsw        m1,          [pw_pixel_max]
> >      movd          [r2],        m1
> >      pextrd        [r2 + r5],   m1, 1
> >      lea           r2,          [r2 + 2 * r5]
> > @@ -156,8 +172,8 @@
> >  ;-----------------------------------------------------------------------------
> >  INIT_XMM sse4
> >  cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m6,         [pw_1023]
> > -    mova        m7,         [pw_1024]
> > +    mova        m6,         [pw_pixel_max]
> > +    mova        m7,         [pw_ %+ ADDAVG_FACTOR]
> >      mov         r6d,        16/4
> >      add         r3,         r3
> >      add         r4,         r4
> > @@ -183,7 +199,7 @@
> >      punpcklqdq  m3,         m5
> >      paddw       m1,         m3
> >      pmulhrsw    m1,         m7
> > -    paddw       m1,         [pw_512]
> > +    paddw       m1,         [pw_ %+ ADDAVG_ROUND]
> >      pxor        m0,         m0
> >      pmaxsw      m1,         m0
> >      pminsw      m1,         m6
> > @@ -213,21 +229,21 @@
> >      punpcklqdq     m0,          m1
> >      punpcklqdq     m2,          m3
> >      paddw          m0,          m2
> > -    pmulhrsw       m0,          [pw_1024]
> > -    paddw          m0,          [pw_512]
> > +    pmulhrsw       m0,          [pw_ %+ ADDAVG_FACTOR]
> > +    paddw          m0,          [pw_ %+ ADDAVG_ROUND]
> >  
> >      pxor           m6,          m6
> >      pmaxsw         m0,          m6
> > -    pminsw         m0,          [pw_1023]
> > +    pminsw         m0,          [pw_pixel_max]
> >      movh           [r2],        m0
> >      movhps         [r2 + r5],   m0
> >      RET
> >  ;-----------------------------------------------------------------------------
> >  INIT_XMM sse4
> >  cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,             [pw_512]
> > -    mova        m5,             [pw_1023]
> > -    mova        m7,             [pw_1024]
> > +    mova        m4,             [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,             [pw_pixel_max]
> > +    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,             m6
> >      add         r3,             r3
> >      add         r4,             r4
> > @@ -264,9 +280,9 @@
> >  ;-----------------------------------------------------------------------------
> >  INIT_XMM sse4
> >  cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,             [pw_512]
> > -    mova        m5,             [pw_1023]
> > -    mova        m7,             [pw_1024]
> > +    mova        m4,             [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,             [pw_pixel_max]
> > +    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,             m6
> >      mov         r6d,            16/2
> >      add         r3,             r3
> > @@ -300,9 +316,9 @@
> >  ;-----------------------------------------------------------------------------
> >  INIT_XMM sse4
> >  cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,          [pw_512]
> > -    mova        m5,          [pw_1023]
> > -    mova        m7,          [pw_1024]
> > +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,          [pw_pixel_max]
> > +    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,          m6
> >      add         r3,          r3
> >      add         r4,          r4
> > @@ -331,9 +347,9 @@
> >  ;-----------------------------------------------------------------------------
> >  INIT_XMM sse4
> >  cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,          [pw_512]
> > -    mova        m5,          [pw_1023]
> > -    mova        m7,          [pw_1024]
> > +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,          [pw_pixel_max]
> > +    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,          m6
> >      add         r3,          r3
> >      add         r4,          r4
> > @@ -370,9 +386,9 @@
> >  %macro ADDAVG_W4_H4 1
> >  INIT_XMM sse4
> >  cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova           m4,          [pw_512]
> > -    mova           m5,          [pw_1023]
> > -    mova           m7,          [pw_1024]
> > +    mova           m4,          [pw_ %+ ADDAVG_ROUND]
> > +    mova           m5,          [pw_pixel_max]
> > +    mova           m7,          [pw_ %+ ADDAVG_FACTOR]
> >      pxor           m6,          m6
> >      add            r3,          r3
> >      add            r4,          r4
> > @@ -420,9 +436,9 @@
> >  %macro ADDAVG_W8_H4 1
> >  INIT_XMM sse4
> >  cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,          [pw_512]
> > -    mova        m5,          [pw_1023]
> > -    mova        m7,          [pw_1024]
> > +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,          [pw_pixel_max]
> > +    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,          m6
> >      add         r3,          r3
> >      add         r4,          r4
> > @@ -470,9 +486,9 @@
> >  %macro ADDAVG_W12_H4 1
> >  INIT_XMM sse4
> >  cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova           m4,             [pw_512]
> > -    mova           m5,             [pw_1023]
> > -    mova           m7,             [pw_1024]
> > +    mova           m4,             [pw_ %+ ADDAVG_ROUND]
> > +    mova           m5,             [pw_pixel_max]
> > +    mova           m7,             [pw_ %+ ADDAVG_FACTOR]
> >      pxor           m6,             m6
> >      add            r3,             r3
> >      add            r4,             r4
> > @@ -532,9 +548,9 @@
> >  %macro ADDAVG_W16_H4 1
> >  INIT_XMM sse4
> >  cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m7,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,              m6
> >      add         r3,              r3
> >      add         r4,              r4
> > @@ -601,9 +617,9 @@
> >  %macro ADDAVG_W24_H2 2
> >  INIT_XMM sse4
> >  cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m7,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,              m6
> >      add         r3,              r3
> >      add         r4,              r4
> > @@ -683,9 +699,9 @@
> >  %macro ADDAVG_W32_H2 1
> >  INIT_XMM sse4
> >  cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m7,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,              m6
> >      add         r3,              r3
> >      add         r4,              r4
> > @@ -787,9 +803,9 @@
> >  %macro ADDAVG_W48_H2 1
> >  INIT_XMM sse4
> >  cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m7,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,              m6
> >      add         r3,              r3
> >      add         r4,              r4
> > @@ -921,9 +937,9 @@
> >  %macro ADDAVG_W64_H1 1
> >  INIT_XMM sse4
> >  cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m7,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m6,              m6
> >      add         r3,              r3
> >      add         r4,              r4
> > @@ -1029,19 +1045,19 @@
> >  
> >      paddw       m0,          m1
> >      pxor        m1,          m1
> > -    pmulhrsw    m0,          [pw_1024]
> > -    paddw       m0,          [pw_512]
> > +    pmulhrsw    m0,          [pw_ %+ ADDAVG_FACTOR]
> > +    paddw       m0,          [pw_ %+ ADDAVG_ROUND]
> >      pmaxsw      m0,          m1
> > -    pminsw      m0,          [pw_1023]
> > +    pminsw      m0,          [pw_pixel_max]
> >      vextracti128 xm1,        m0, 1
> >      movu        [r2],        xm0
> >      movu        [r2 + r5 * 2], xm1
> >      RET
> >  
> >  cglobal addAvg_8x6, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,          [pw_512]
> > -    mova        m5,          [pw_1023]
> > -    mova        m3,          [pw_1024]
> > +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,          [pw_pixel_max]
> > +    mova        m3,          [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m1,          m1
> >      add         r3d,         r3d
> >      add         r4d,         r4d
> > @@ -1100,9 +1116,9 @@
> >  
> >  %macro ADDAVG_W8_H4_AVX2 1
> >  cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,          [pw_512]
> > -    mova        m5,          [pw_1023]
> > -    mova        m3,          [pw_1024]
> > +    mova        m4,          [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,          [pw_pixel_max]
> > +    mova        m3,          [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m1,          m1
> >      add         r3d,         r3d
> >      add         r4d,         r4d
> > @@ -1159,9 +1175,9 @@
> >  ADDAVG_W8_H4_AVX2 64
> >  
> >  cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova           m4,             [pw_512]
> > -    mova           m5,             [pw_1023]
> > -    mova           m3,             [pw_1024]
> > +    mova           m4,             [pw_ %+ ADDAVG_ROUND]
> > +    mova           m5,             [pw_pixel_max]
> > +    mova           m3,             [pw_ %+ ADDAVG_FACTOR]
> >      pxor           m1,             m1
> >      add            r3,             r3
> >      add            r4,             r4
> > @@ -1201,8 +1217,8 @@
> >      RET
> >  
> >  cglobal addAvg_12x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova           m4,             [pw_512]
> > -    mova           m5,             [pw_1023]
> > +    mova           m4,             [pw_ %+ ADDAVG_ROUND]
> > +    mova           m5,             [pw_pixel_max]
> >      paddw          m3,             m4,  m4
> >      pxor           m1,             m1
> >      add            r3,             r3
> > @@ -1244,9 +1260,9 @@
> >  
> >  %macro ADDAVG_W16_H4_AVX2 1
> >  cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m3,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m2,              m2
> >      add         r3,              r3
> >      add         r4,              r4
> > @@ -1291,9 +1307,9 @@
> >  ADDAVG_W16_H4_AVX2 64
> >  
> >  cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m3,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m1,              m1
> >      add         r3,              r3
> >      add         r4,              r4
> > @@ -1347,8 +1363,8 @@
> >      RET
> >  
> >  cglobal addAvg_24x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> >      paddw       m3,              m4,  m4
> >      pxor        m1,              m1
> >      add         r3,              r3
> > @@ -1404,9 +1420,9 @@
> >  
> >  %macro ADDAVG_W32_H2_AVX2 1
> >  cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m3,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m2,              m2
> >      add         r3,              r3
> >      add         r4,              r4
> > @@ -1468,9 +1484,9 @@
> >  ADDAVG_W32_H2_AVX2 64
> >  
> >  cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m3,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m2,              m2
> >      add         r3,              r3
> >      add         r4,              r4
> > @@ -1543,9 +1559,9 @@
> >  
> >  %macro ADDAVG_W64_H1_AVX2 1
> >  cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
> > -    mova        m4,              [pw_512]
> > -    mova        m5,              [pw_1023]
> > -    mova        m3,              [pw_1024]
> > +    mova        m4,              [pw_ %+ ADDAVG_ROUND]
> > +    mova        m5,              [pw_pixel_max]
> > +    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
> >      pxor        m2,              m2
> >      add         r3d,             r3d
> >      add         r4d,             r4d
> > diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/pixel-util8.asm
> > --- a/source/common/x86/pixel-util8.asm	Mon Jul 20 17:18:54 2015 -0700
> > +++ b/source/common/x86/pixel-util8.asm	Tue Jul 21 14:30:11 2015 -0700
> > @@ -879,8 +879,8 @@
> >  %if HIGH_BIT_DEPTH
> >      cmp         r3d, 32767
> >      jle         .skip
> > -    shr         r3d, 2
> > -    sub         r4d, 2
> > +    shr         r3d, (BIT_DEPTH - 8)
> > +    sub         r4d, (BIT_DEPTH - 8)
> >  .skip:
> >  %endif
> >      movd        m0, r4d             ; m0 = shift
> > @@ -1273,13 +1273,7 @@
> >  INIT_XMM sse4
> >  cglobal weight_pp, 4,7,7
> >  %define correction      (14 - BIT_DEPTH)
> > -%if BIT_DEPTH == 10
> > -    mova        m6, [pw_1023]
> > -%elif BIT_DEPTH == 12
> > -    mova        m6, [pw_3fff]
> > -%else
> > -  %error Unsupported BIT_DEPTH!
> > -%endif
> > +    mova        m6, [pw_pixel_max]
> >      mov         r6d, r6m
> >      mov         r4d, r4m
> >      mov         r5d, r5m
> > @@ -1423,7 +1417,7 @@
> >      movd         xm1, r7m
> >      vpbroadcastd m2, r8m
> >      mova         m5, [pw_1]
> > -    mova         m6, [pw_1023]
> > +    mova         m6, [pw_pixel_max]
> >      add         r2d, r2d
> >      add         r3d, r3d
> >      sub          r2d, r3d
> > @@ -1516,13 +1510,7 @@
> >  %if HIGH_BIT_DEPTH
> >  INIT_XMM sse4
> >  cglobal weight_sp, 6,7,8
> > -%if BIT_DEPTH == 10
> > -    mova        m1, [pw_1023]
> > -%elif BIT_DEPTH == 12
> > -    mova        m1, [pw_3fff]
> > -%else
> > -  %error Unsupported BIT_DEPTH!
> > -%endif
> > +    mova        m1, [pw_pixel_max]
> >      mova        m2, [pw_1]
> >      mov         r6d, r7m
> >      shl         r6d, 16
> > @@ -1681,7 +1669,7 @@
> >  %if HIGH_BIT_DEPTH
> >  INIT_YMM avx2
> >  cglobal weight_sp, 6,7,9
> > -    mova                      m1, [pw_1023]
> > +    mova                      m1, [pw_pixel_max]
> >      mova                      m2, [pw_1]
> >      mov                       r6d, r7m
> >      shl                       r6d, 16
> > 
> > 
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
> > 
> 
> -- 
> Steve Borho

-- 
Steve Borho


More information about the x265-devel mailing list