[x265] [PATCH 1 of 4] asm: fix Main12 assembly errors and disable faulty functions; we now work with assembly up to AVX

Min Chen chenm003 at 163.com
Wed Jul 22 01:20:02 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1437514211 25200
# Node ID ab2c34d6ad913369fd8feb84aee10030ffaa0df5
# Parent  46152345eb6ff261fd90272f7a0712300d6324c0
asm: fix Main12 assembly errors and disable faulty functions; we now work with assembly up to AVX
---
 source/common/x86/asm-primitives.cpp |   12 +
 source/common/x86/const-a.asm        |    1 +
 source/common/x86/intrapred16.asm    |   42 +-
 source/common/x86/ipfilter16.asm     | 1199 +++++++++++++++++-----------------
 source/common/x86/loopfilter.asm     |   48 +-
 source/common/x86/mc-a.asm           |  176 +++---
 source/common/x86/pixel-util8.asm    |   24 +-
 7 files changed, 756 insertions(+), 746 deletions(-)

diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 21 14:30:11 2015 -0700
@@ -1043,7 +1043,9 @@
 
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
         ALL_LUMA_PU(satd, pixel_satd, ssse3);
+#if X265_DEPTH <= 10
         ASSIGN_SA8D(ssse3);
+#endif
         INTRA_ANG_SSSE3(ssse3);
 
         p.dst4x4 = PFX(dst4_ssse3);
@@ -1126,14 +1128,18 @@
 
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
         ALL_LUMA_PU(satd, pixel_satd, sse4);
+#if X265_DEPTH <= 10
         ASSIGN_SA8D(sse4);
+#endif
 
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
         p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
         p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
         p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
 
+#if X265_DEPTH <= 10
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
+#endif
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4_HIGH(sse4);
@@ -1147,7 +1153,9 @@
 
         // TODO: check POPCNT flag!
         ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
+#if X265_DEPTH <= 10
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
+#endif
         ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
@@ -1184,7 +1192,9 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
+#if X265_DEPTH <= 10
         ASSIGN_SA8D(avx);
+#endif
         p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
         p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
@@ -1292,7 +1302,9 @@
     {
         //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
         ALL_LUMA_PU(satd, pixel_satd, xop);
+#if X265_DEPTH <= 10
         ASSIGN_SA8D(xop);
+#endif
         LUMA_VAR(xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
     }
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/const-a.asm	Tue Jul 21 14:30:11 2015 -0700
@@ -79,6 +79,7 @@
 const pw_512,               times 16 dw 512
 const pw_1023,              times 16 dw 1023
 const pw_1024,              times 16 dw 1024
+const pw_2048,              times 16 dw 2048
 const pw_4096,              times 16 dw 4096
 const pw_8192,              times  8 dw 8192
 const pw_00ff,              times 16 dw 0x00ff
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/intrapred16.asm	Tue Jul 21 14:30:11 2015 -0700
@@ -1748,7 +1748,7 @@
     ; filter top
     movu        m1,             [r2]
     paddw       m1,             m0
-    psraw       m1,             2
+    psrlw       m1,             2
     movh        [r0],           m1             ; overwrite top-left pixel, we will update it later
 
     ; filter top-left
@@ -1763,7 +1763,7 @@
     lea         r0,             [r0 + r1 * 2]
     movu        m1,             [r3 + 2]
     paddw       m1,             m0
-    psraw       m1,             2
+    psrlw       m1,             2
     movd        r3d,            m1
     mov         [r0],           r3w
     shr         r3d,            16
@@ -1872,7 +1872,7 @@
     ; filter top
     movu            m0,            [r2]
     paddw           m0,            m1
-    psraw           m0,            2
+    psrlw           m0,            2
     movu            [r6],          m0
 
     ; filter top-left
@@ -1887,7 +1887,7 @@
     add             r6,            r1
     movu            m0,            [r3 + 2]
     paddw           m0,            m1
-    psraw           m0,            2
+    psrlw           m0,            2
     pextrw          [r6],          m0, 0
     pextrw          [r6 + r1],     m0, 1
     pextrw          [r6 + r1 * 2], m0, 2
@@ -1913,13 +1913,13 @@
     movu            m2,                  [r2]
     movu            m3,                  [r2 + 16]
 
-    paddw           m0,                  m1
+    paddw           m0,                  m1                     ; dynamic range 13 bits
     paddw           m2,                  m3
-    paddw           m0,                  m2
-    movhlps         m1,                  m0
-    paddw           m0,                  m1
-    phaddw          m0,                  m0
+    paddw           m0,                  m2                     ; dynamic range 14 bits
+    movhlps         m1,                  m0                     ; dynamic range 15 bits
+    paddw           m0,                  m1                     ; dynamic range 16 bits
     pmaddwd         m0,                  [pw_1]
+    phaddd          m0,                  m0
 
     movd            r5d,                 m0
     add             r5d,                 16
@@ -1983,11 +1983,11 @@
     ; filter top
     movu            m2,                  [r2]
     paddw           m2,                  m1
-    psraw           m2,                  2
+    psrlw           m2,                  2
     movu            [r6],                m2
     movu            m3,                  [r2 + 16]
     paddw           m3,                  m1
-    psraw           m3,                  2
+    psrlw           m3,                  2
     movu            [r6 + 16],           m3
 
     ; filter top-left
@@ -2002,7 +2002,7 @@
     add             r6,                  r1
     movu            m2,                  [r3 + 2]
     paddw           m2,                  m1
-    psraw           m2,                  2
+    psrlw           m2,                  2
 
     pextrw          [r6],                m2, 0
     pextrw          [r6 + r1],           m2, 1
@@ -2019,7 +2019,7 @@
     lea             r6,                  [r6 + r1 * 2]
     movu            m3,                  [r3 + 18]
     paddw           m3,                  m1
-    psraw           m3,                  2
+    psrlw           m3,                  2
 
     pextrw          [r6],                m3, 0
     pextrw          [r6 + r1],           m3, 1
@@ -2046,21 +2046,21 @@
     movu            m1,                  [r3 + 16]
     movu            m2,                  [r3 + 32]
     movu            m3,                  [r3 + 48]
-    paddw           m0,                  m1
+    paddw           m0,                  m1             ; dynamic range 13 bits
     paddw           m2,                  m3
-    paddw           m0,                  m2
+    paddw           m0,                  m2             ; dynamic range 14 bits
     movu            m1,                  [r2]
     movu            m3,                  [r2 + 16]
     movu            m4,                  [r2 + 32]
     movu            m5,                  [r2 + 48]
-    paddw           m1,                  m3
+    paddw           m1,                  m3             ; dynamic range 13 bits
     paddw           m4,                  m5
-    paddw           m1,                  m4
-    paddw           m0,                  m1
+    paddw           m1,                  m4             ; dynamic range 14 bits
+    paddw           m0,                  m1             ; dynamic range 15 bits
+    pmaddwd         m0,                  [pw_1]
     movhlps         m1,                  m0
-    paddw           m0,                  m1
-    phaddw          m0,                  m0
-    pmaddwd         m0,                  [pw_1]
+    paddd           m0,                  m1
+    phaddd          m0,                  m0
 
     paddd           m0,                  [pd_32]     ; sum = sum + 32
     psrld           m0,                  6           ; sum = sum / 64
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/ipfilter16.asm	Tue Jul 21 14:30:11 2015 -0700
@@ -26,6 +26,25 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
+
+%define INTERP_OFFSET_PP        pd_32
+%define INTERP_SHIFT_PP         6
+
+%if BIT_DEPTH == 10
+    %define INTERP_SHIFT_PS         2
+    %define INTERP_OFFSET_PS        pd_n32768
+    %define INTERP_SHIFT_SP         10
+    %define INTERP_OFFSET_SP        pd_524800
+%elif BIT_DEPTH == 12
+    %define INTERP_SHIFT_PS         4
+    %define INTERP_OFFSET_PS        pd_n131072
+    %define INTERP_SHIFT_SP         8
+    %define INTERP_OFFSET_SP        pd_524416
+%else
+    %error Unsupport bit depth!
+%endif
+
+
 SECTION_RODATA 32
 
 tab_c_32:         times 8 dd 32
@@ -145,21 +164,9 @@
 const pb_shuf,  db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
                 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
 
-%if BIT_DEPTH == 10
-    %define INTERP_OFFSET_PS        pd_n32768
-    %define INTERP_SHIFT_PS         2
-    %define INTERP_OFFSET_SP        pd_524800
-    %define INTERP_SHIFT_SP         10
-%elif BIT_DEPTH == 12
-    %define INTERP_OFFSET_PS        pd_n131072
-    %define INTERP_SHIFT_PS         4
-    %define INTERP_OFFSET_SP        pd_524416
-    %define INTERP_SHIFT_SP         8
-%else
-    %error Unsupport bit depth!
-%endif
 
 SECTION .text
+cextern pd_8
 cextern pd_32
 cextern pw_pixel_max
 cextern pd_524416
@@ -503,7 +510,7 @@
 %endif
 
 %ifidn %1,pp
-    mova      m7, [pd_32]
+    mova      m7, [INTERP_OFFSET_PP]
 %define SHIFT 6
 %elifidn %1,ps
     mova      m7, [INTERP_OFFSET_PS]
@@ -1176,7 +1183,6 @@
 %macro FILTER_HOR_LUMA_W4 3
 INIT_XMM sse4
 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
-
     mov         r4d, r4m
     sub         r0, 6
     shl         r4d, 4
@@ -1229,7 +1235,7 @@
     packusdw    m4, m4
     CLIPW       m4, m6, m7
 %else
-    psrad       m4, 2
+    psrad       m4, INTERP_SHIFT_PS
     packssdw    m4, m4
 %endif
 
@@ -1287,7 +1293,7 @@
     mov         r4d, %2
 %ifidn %3, ps
     cmp         r5m, byte 0
-    je          .loopH
+    je         .loopH
     lea         r6, [r1 + 2 * r1]
     sub         r0, r6
     add         r4d, 7
@@ -1329,8 +1335,8 @@
     packusdw    m4, m5
     CLIPW       m4, m7, [pw_pixel_max]
 %else
-    psrad       m4, 2
-    psrad       m5, 2
+    psrad       m4, INTERP_SHIFT_PS
+    psrad       m5, INTERP_SHIFT_PS
     packssdw    m4, m5
 %endif
 
@@ -1340,7 +1346,7 @@
     add         r2, r3
 
     dec         r4d
-    jnz         .loopH
+    jnz        .loopH
     RET
 %endmacro
 
@@ -1380,7 +1386,7 @@
     mova        m0, [tab_LumaCoeff + r4]
 %endif
 %ifidn %3, pp
-    mova        m1, [pd_32]
+    mova        m1, [INTERP_OFFSET_PP]
 %else
     mova        m1, [INTERP_OFFSET_PS]
 %endif
@@ -1425,14 +1431,14 @@
     phaddd      m5, m6
     paddd       m5, m1
 %ifidn %3, pp
-    psrad       m4, 6
-    psrad       m5, 6
+    psrad       m4, INTERP_SHIFT_PP
+    psrad       m5, INTERP_SHIFT_PP
     packusdw    m4, m5
     pxor        m5, m5
     CLIPW       m4, m5, [pw_pixel_max]
 %else
-    psrad       m4, 2
-    psrad       m5, 2
+    psrad       m4, INTERP_SHIFT_PS
+    psrad       m5, INTERP_SHIFT_PS
     packssdw    m4, m5
 %endif
 
@@ -1453,12 +1459,12 @@
     phaddd      m4, m5
     paddd       m4, m1
 %ifidn %3, pp
-    psrad       m4, 6
+    psrad       m4, INTERP_SHIFT_PP
     packusdw    m4, m4
     pxor        m5, m5
     CLIPW       m4, m5, [pw_pixel_max]
 %else
-    psrad       m4, 2
+    psrad       m4, INTERP_SHIFT_PS
     packssdw    m4, m4
 %endif
 
@@ -1550,14 +1556,14 @@
     phaddd      m5, m6
     paddd       m5, m1
 %ifidn %3, pp
-    psrad       m4, 6
-    psrad       m5, 6
+    psrad       m4, INTERP_SHIFT_PP
+    psrad       m5, INTERP_SHIFT_PP
     packusdw    m4, m5
     pxor        m5, m5
     CLIPW       m4, m5, [pw_pixel_max]
 %else
-    psrad       m4, 2
-    psrad       m5, 2
+    psrad       m4, INTERP_SHIFT_PS
+    psrad       m5, INTERP_SHIFT_PS
     packssdw    m4, m5
 %endif
     movu        [r2 + x], m4
@@ -1591,14 +1597,14 @@
     phaddd      m5, m6
     paddd       m5, m1
 %ifidn %3, pp
-    psrad       m4, 6
-    psrad       m5, 6
+    psrad       m4, INTERP_SHIFT_PP
+    psrad       m5, INTERP_SHIFT_PP
     packusdw    m4, m5
     pxor        m5, m5
     CLIPW       m4, m5, [pw_pixel_max]
 %else
-    psrad       m4, 2
-    psrad       m5, 2
+    psrad       m4, INTERP_SHIFT_PS
+    psrad       m5, INTERP_SHIFT_PS
     packssdw    m4, m5
 %endif
     movu        [r2 + 16 + x], m4
@@ -1743,14 +1749,14 @@
     phaddd      m5, m6
     paddd       m5, m1
 %ifidn %3, pp
-    psrad       m4, 6
-    psrad       m5, 6
+    psrad       m4, INTERP_SHIFT_PP
+    psrad       m5, INTERP_SHIFT_PP
     packusdw    m4, m5
     pxor        m5, m5
     CLIPW       m4, m5, [pw_pixel_max]
 %else
-    psrad       m4, 2
-    psrad       m5, 2
+    psrad       m4, INTERP_SHIFT_PS
+    psrad       m5, INTERP_SHIFT_PS
     packssdw    m4, m5
 %endif
     movu        [r2], m4
@@ -1784,14 +1790,14 @@
     phaddd      m5, m6
     paddd       m5, m1
 %ifidn %3, pp
-    psrad       m4, 6
-    psrad       m5, 6
+    psrad       m4, INTERP_SHIFT_PP
+    psrad       m5, INTERP_SHIFT_PP
     packusdw    m4, m5
     pxor        m5, m5
     CLIPW       m4, m5, [pw_pixel_max]
 %else
-    psrad       m4, 2
-    psrad       m5, 2
+    psrad       m4, INTERP_SHIFT_PS
+    psrad       m5, INTERP_SHIFT_PS
     packssdw    m4, m5
 %endif
     movu        [r2 + 16], m4
@@ -1825,14 +1831,14 @@
     phaddd      m5, m6
     paddd       m5, m1
 %ifidn %3, pp
-    psrad       m4, 6
-    psrad       m5, 6
+    psrad       m4, INTERP_SHIFT_PP
+    psrad       m5, INTERP_SHIFT_PP
     packusdw    m4, m5
     pxor        m5, m5
     CLIPW       m4, m5, [pw_pixel_max]
 %else
-    psrad       m4, 2
-    psrad       m5, 2
+    psrad       m4, INTERP_SHIFT_PS
+    psrad       m5, INTERP_SHIFT_PS
     packssdw    m4, m5
 %endif
     movu        [r2 + 32], m4
@@ -1865,11 +1871,11 @@
     phaddd      m3,         m4
     paddd       m3,         m1
 %ifidn %1, pp
-    psrad       m3,         6
+    psrad       m3,         INTERP_SHIFT_PP
     packusdw    m3,         m3
     CLIPW       m3,         m7,    m6
 %else
-    psrad       m3,         2
+    psrad       m3,         INTERP_SHIFT_PS
     packssdw    m3,         m3
 %endif
     movd        [r2],       m3
@@ -1895,13 +1901,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m7,    m6
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2],       m3
@@ -1950,7 +1956,7 @@
     phaddd           m4, m4
     vpermq           m4, m4, q3120
     paddd            m4, m6
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -1969,7 +1975,7 @@
     phaddd           m4, m4
     vpermq           m4, m4, q3120
     paddd            m4, m6
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2036,7 +2042,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2064,7 +2070,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2132,7 +2138,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2160,7 +2166,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2232,7 +2238,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2260,7 +2266,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2335,7 +2341,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2363,7 +2369,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2425,7 +2431,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2453,7 +2459,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2481,7 +2487,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2545,7 +2551,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2573,7 +2579,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2601,7 +2607,7 @@
     phaddd           m4, m5
     vpermq           m4, m4, q3120
     paddd            m4, m7
-    psrad            m4, 6
+    psrad            m4, INTERP_SHIFT_PP
 
     packusdw         m4, m4
     vpermq           m4, m4, q2020
@@ -2644,32 +2650,32 @@
     mova        m1,       [INTERP_OFFSET_PS]
     cmp         r5m, byte 0
     je          .skip
-    sub         r0, r1
-    movu        m3,         [r0]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-
-    %if %1 == 4
-        movu        m4,         [r0 + 4]
-        pshufb      m4,         m4, m2
-        pmaddwd     m4,         m0
-        phaddd      m3,         m4
-    %else
-        phaddd      m3,         m3
-    %endif
-
-    paddd       m3,         m1
-    psrad       m3,         INTERP_SHIFT_PS
-    packssdw    m3,         m3
-
-    %if %1 == 2
-        movd        [r2],       m3
-    %else
-        movh        [r2],       m3
-    %endif
-
-    add         r0, r1
-    add         r2, r3
+    sub         r0,       r1
+    movu        m3,       [r0]
+    pshufb      m3,       m3, m2
+    pmaddwd     m3,       m0
+
+  %if %1 == 4
+    movu        m4,       [r0 + 4]
+    pshufb      m4,       m4, m2
+    pmaddwd     m4,       m0
+    phaddd      m3,       m4
+  %else
+    phaddd      m3,       m3
+  %endif
+
+    paddd       m3,       m1
+    psrad       m3,       INTERP_SHIFT_PS
+    packssdw    m3,       m3
+
+  %if %1 == 2
+    movd        [r2],     m3
+  %else
+    movh        [r2],     m3
+  %endif
+
+    add         r0,       r1
+    add         r2,       r3
     FILTER_W%1_2 %3
     lea         r0,       [r0 + 2 * r1]
     lea         r2,       [r2 + 2 * r3]
@@ -2689,7 +2695,6 @@
     lea         r2,       [r2 + 2 * r3]
     FILTER_W%1_2 %3
 %endrep
-
     RET
 %endmacro
 
@@ -2729,13 +2734,13 @@
     phaddd      m4,         m4
     paddd       m4,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m4,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m4,         INTERP_SHIFT_PP
     packusdw    m3,         m4
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m4,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m4,         INTERP_SHIFT_PS
     packssdw    m3,         m4
 %endif
     movh        [r2],       m3
@@ -2769,13 +2774,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2],       m3
@@ -2809,13 +2814,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2],       m3
@@ -2831,11 +2836,11 @@
     paddd       m3,         m1
 
 %ifidn %1, pp
-    psrad       m3,         6
+    psrad       m3,         INTERP_SHIFT_PP
     packusdw    m3,         m3
     CLIPW       m3,         m6, m7
 %else
-    psrad       m3,         2
+    psrad       m3,         INTERP_SHIFT_PS
     packssdw    m3,         m3
 %endif
     movh        [r2 + 16],  m3
@@ -2868,13 +2873,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2],       m3
@@ -2898,13 +2903,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2 + 16],  m3
@@ -2938,13 +2943,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2],       m3
@@ -2968,13 +2973,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2 + 16],  m3
@@ -2998,13 +3003,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2 + 32],  m3
@@ -3038,13 +3043,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2],       m3
@@ -3068,13 +3073,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2 + 16],  m3
@@ -3098,13 +3103,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2 + 32],  m3
@@ -3128,13 +3133,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2 + 48],  m3
@@ -3168,13 +3173,13 @@
     phaddd      m5,         m4
     paddd       m5,         m1
 %ifidn %1, pp
-    psrad       m3,         6
-    psrad       m5,         6
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
     packusdw    m3,         m5
     CLIPW       m3,         m6,    m7
 %else
-    psrad       m3,         2
-    psrad       m5,         2
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
     packssdw    m3,         m5
 %endif
     movh        [r2 + %2],       m3
@@ -3408,7 +3413,7 @@
     pmaddwd         m4, m0
     phaddd          m3, m4
     paddd           m3, m2
-    psrad           m3, 6                         ; m3 = DWORD[7 6 3 2 5 4 1 0]
+    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
 
     packusdw        m3, m3
     vpermq          m3, m3, q2020
@@ -3426,7 +3431,7 @@
     pmaddwd         m4, m0
     phaddd          m3, m4
     paddd           m3, m2
-    psrad           m3, 6                         ; m3 = DWORD[7 6 3 2 5 4 1 0]
+    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
 
     packusdw        m3, m3
     vpermq          m3, m3, q2020
@@ -3474,7 +3479,7 @@
     pmaddwd         m4, m0
     phaddd          m3, m4
     paddd           m3, m2
-    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
+    psrad           m3, INTERP_SHIFT_PP          ; m3 = DWORD[7 6 3 2 5 4 1 0]
 
     packusdw        m3, m3
     vpermq          m3, m3,q2020
@@ -3491,7 +3496,7 @@
     pmaddwd         m4, m0
     phaddd          m3, m4
     paddd           m3, m2
-    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
+    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
 
     packusdw        m3, m3
     vpermq          m3, m3,q2020
@@ -4089,7 +4094,7 @@
     %ifnidn %3, ps
         mova      m7, [pw_pixel_max]
         %ifidn %3, pp
-            mova      m6, [tab_c_32]
+            mova      m6, [INTERP_OFFSET_PP]
         %else
             mova      m6, [INTERP_OFFSET_SP]
         %endif
@@ -4129,10 +4134,10 @@
     paddd     m2, m6
     paddd     m3, m6
     %ifidn %3, pp
-        psrad     m0, 6
-        psrad     m1, 6
-        psrad     m2, 6
-        psrad     m3, 6
+        psrad     m0, INTERP_SHIFT_PP
+        psrad     m1, INTERP_SHIFT_PP
+        psrad     m2, INTERP_SHIFT_PP
+        psrad     m3, INTERP_SHIFT_PP
     %else
         psrad     m0, INTERP_SHIFT_SP
         psrad     m1, INTERP_SHIFT_SP
@@ -4344,9 +4349,9 @@
         pxor      m7, m7
         mova      m6, [pw_pixel_max]
         %ifidn %2, pp
-            mova      m5, [tab_c_32]
+            mova      m5, [INTERP_OFFSET_PP]
         %else
-            mova      m5, [tab_c_524800]
+            mova      m5, [INTERP_OFFSET_SP]
         %endif
     %else
         mova      m5, [INTERP_OFFSET_PS]
@@ -4362,18 +4367,18 @@
 %elifidn %2, ps
     paddd     m0, m5
     paddd     m2, m5
-    psrad     m0, 2
-    psrad     m2, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
     packssdw  m0, m2
 %else
     paddd     m0, m5
     paddd     m2, m5
     %ifidn %2, pp
-        psrad     m0, 6
-        psrad     m2, 6
+        psrad     m0, INTERP_SHIFT_PP
+        psrad     m2, INTERP_SHIFT_PP
     %else
-        psrad     m0, 10
-        psrad     m2, 10
+        psrad     m0, INTERP_SHIFT_SP
+        psrad     m2, INTERP_SHIFT_SP
     %endif
     packusdw  m0, m2
     CLIPW     m0, m7,    m6
@@ -4389,7 +4394,6 @@
 
     dec       r4d
     jnz       .loopH
-
     RET
 %endmacro
 
@@ -4417,7 +4421,6 @@
 %macro FILTER_VER_CHROMA_W4 3
 INIT_XMM sse4
 cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3
-
     add        r1d, r1d
     add        r3d, r3d
     sub        r0, r1
@@ -4439,9 +4442,9 @@
         pxor      m6, m6
         mova      m5, [pw_pixel_max]
         %ifidn %2, pp
-            mova      m4, [tab_c_32]
+            mova      m4, [INTERP_OFFSET_PP]
         %else
-            mova      m4, [tab_c_524800]
+            mova      m4, [INTERP_OFFSET_SP]
         %endif
     %else
         mova      m4, [INTERP_OFFSET_PS]
@@ -4479,18 +4482,18 @@
 %elifidn %2, ps
     paddd     m0, m4
     paddd     m1, m4
-    psrad     m0, 2
-    psrad     m1, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m1, INTERP_SHIFT_PS
     packssdw  m0, m1
 %else
     paddd     m0, m4
     paddd     m1, m4
     %ifidn %2, pp
-        psrad     m0, 6
-        psrad     m1, 6
+        psrad     m0, INTERP_SHIFT_PP
+        psrad     m1, INTERP_SHIFT_PP
     %else
-        psrad     m0, 10
-        psrad     m1, 10
+        psrad     m0, INTERP_SHIFT_SP
+        psrad     m1, INTERP_SHIFT_SP
     %endif
     packusdw  m0, m1
     CLIPW     m0, m6,    m5
@@ -4504,7 +4507,6 @@
     dec        r4d
     jnz        .loop
 %endif
-
     RET
 %endmacro
 
@@ -4524,7 +4526,6 @@
 %macro FILTER_VER_CHROMA_W6 3
 INIT_XMM sse4
 cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3
-
     add       r1d, r1d
     add       r3d, r3d
     sub       r0, r1
@@ -4543,9 +4544,9 @@
     %ifnidn %2, ps
         mova      m7, [pw_pixel_max]
         %ifidn %2, pp
-            mova      m6, [tab_c_32]
+            mova      m6, [INTERP_OFFSET_PP]
         %else
-            mova      m6, [tab_c_524800]
+            mova      m6, [INTERP_OFFSET_SP]
         %endif
     %else
         mova      m6, [INTERP_OFFSET_PS]
@@ -4568,10 +4569,10 @@
     paddd     m1, m6
     paddd     m2, m6
     paddd     m3, m6
-    psrad     m0, 2
-    psrad     m1, 2
-    psrad     m2, 2
-    psrad     m3, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m1, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
+    psrad     m3, INTERP_SHIFT_PS
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -4581,15 +4582,15 @@
     paddd     m2, m6
     paddd     m3, m6
     %ifidn %2, pp
-        psrad     m0, 6
-        psrad     m1, 6
-        psrad     m2, 6
-        psrad     m3, 6
+        psrad     m0, INTERP_SHIFT_PP
+        psrad     m1, INTERP_SHIFT_PP
+        psrad     m2, INTERP_SHIFT_PP
+        psrad     m3, INTERP_SHIFT_PP
     %else
-        psrad     m0, 10
-        psrad     m1, 10
-        psrad     m2, 10
-        psrad     m3, 10
+        psrad     m0, INTERP_SHIFT_SP
+        psrad     m1, INTERP_SHIFT_SP
+        psrad     m2, INTERP_SHIFT_SP
+        psrad     m3, INTERP_SHIFT_SP
     %endif
     packssdw  m0, m1
     packssdw  m2, m3
@@ -4616,18 +4617,18 @@
 %elifidn %2, ps
     paddd     m0, m6
     paddd     m2, m6
-    psrad     m0, 2
-    psrad     m2, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
     packssdw  m0, m2
 %else
     paddd     m0, m6
     paddd     m2, m6
     %ifidn %2, pp
-        psrad     m0, 6
-        psrad     m2, 6
+        psrad     m0, INTERP_SHIFT_PP
+        psrad     m2, INTERP_SHIFT_PP
     %else
-        psrad     m0, 10
-        psrad     m2, 10
+        psrad     m0, INTERP_SHIFT_SP
+        psrad     m2, INTERP_SHIFT_SP
     %endif
     packusdw  m0, m2
     CLIPW     m0, m5,    m7
@@ -4644,7 +4645,6 @@
 
     dec       r4d
     jnz       .loopH
-
     RET
 %endmacro
 
@@ -4712,7 +4712,7 @@
     mov       r4d, %2/2
 
 %ifidn %3, pp
-    mova      m7, [tab_c_32]
+    mova      m7, [INTERP_OFFSET_PP]
 %elifidn %3, sp
     mova      m7, [INTERP_OFFSET_SP]
 %elifidn %3, ps
@@ -4748,10 +4748,10 @@
     paddd     m2, m7
     paddd     m3, m7
     %ifidn %3, pp
-        psrad     m0, 6
-        psrad     m1, 6
-        psrad     m2, 6
-        psrad     m3, 6
+        psrad     m0, INTERP_SHIFT_PP
+        psrad     m1, INTERP_SHIFT_PP
+        psrad     m2, INTERP_SHIFT_PP
+        psrad     m3, INTERP_SHIFT_PP
     %else
         psrad     m0, INTERP_SHIFT_SP
         psrad     m1, INTERP_SHIFT_SP
@@ -4772,7 +4772,6 @@
 
     dec       r4d
     jnz       .loopH
-
     RET
 %endmacro
 
@@ -4868,9 +4867,9 @@
     mov             r6d, %1/4
 
 %ifidn %2,pp
-    vbroadcasti128  m8, [pd_32]
+    vbroadcasti128  m8, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova            m8, [pd_524800]
+    mova            m8, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m8, [INTERP_OFFSET_PS]
 %endif
@@ -4934,20 +4933,20 @@
     paddd           m2, m8
     paddd           m3, m8
 %ifidn %2,pp
-    psrad           m0, 6
-    psrad           m1, 6
-    psrad           m2, 6
-    psrad           m3, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
+    psrad           m3, INTERP_SHIFT_PP
 %elifidn %2, sp
-    psrad           m0, 10
-    psrad           m1, 10
-    psrad           m2, 10
-    psrad           m3, 10
-%else
-    psrad           m0, 2
-    psrad           m1, 2
-    psrad           m2, 2
-    psrad           m3, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+    psrad           m3, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
+    psrad           m3, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -5012,9 +5011,9 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [tab_c_32]
+    mova      m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [pd_524800]
+    mova      m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
     mova      m7, [INTERP_OFFSET_PS]
 %endif
@@ -5034,10 +5033,10 @@
     paddd     m1, m7
     paddd     m2, m7
     paddd     m3, m7
-    psrad     m0, 2
-    psrad     m1, 2
-    psrad     m2, 2
-    psrad     m3, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m1, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
+    psrad     m3, INTERP_SHIFT_PS
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5047,15 +5046,15 @@
     paddd     m2, m7
     paddd     m3, m7
  %ifidn %2, pp
-    psrad     m0, 6
-    psrad     m1, 6
-    psrad     m2, 6
-    psrad     m3, 6
-%else
-    psrad     m0, 10
-    psrad     m1, 10
-    psrad     m2, 10
-    psrad     m3, 10
+    psrad     m0, INTERP_SHIFT_PP
+    psrad     m1, INTERP_SHIFT_PP
+    psrad     m2, INTERP_SHIFT_PP
+    psrad     m3, INTERP_SHIFT_PP
+%else
+    psrad     m0, INTERP_SHIFT_SP
+    psrad     m1, INTERP_SHIFT_SP
+    psrad     m2, INTERP_SHIFT_SP
+    psrad     m3, INTERP_SHIFT_SP
 %endif
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5184,9 +5183,9 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [tab_c_32]
+    mova      m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [pd_524800]
+    mova      m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
     mova      m7, [INTERP_OFFSET_PS]
 %endif
@@ -5213,18 +5212,18 @@
     paddd     m1, m7
     paddd     m2, m7
     paddd     m3, m7
-    psrad     m0, 2
-    psrad     m1, 2
-    psrad     m2, 2
-    psrad     m3, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m1, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
+    psrad     m3, INTERP_SHIFT_PS
     paddd     m8, m7
     paddd     m9, m7
     paddd     m10, m7
     paddd     m11, m7
-    psrad     m8, 2
-    psrad     m9, 2
-    psrad     m10, 2
-    psrad     m11, 2
+    psrad     m8, INTERP_SHIFT_PS
+    psrad     m9, INTERP_SHIFT_PS
+    psrad     m10, INTERP_SHIFT_PS
+    psrad     m11, INTERP_SHIFT_PS
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5240,23 +5239,23 @@
     paddd     m10, m7
     paddd     m11, m7
  %ifidn %2, pp
-    psrad     m0, 6
-    psrad     m1, 6
-    psrad     m2, 6
-    psrad     m3, 6
-    psrad     m8, 6
-    psrad     m9, 6
-    psrad     m10, 6
-    psrad     m11, 6
-%else
-    psrad     m0, 10
-    psrad     m1, 10
-    psrad     m2, 10
-    psrad     m3, 10
-    psrad     m8, 10
-    psrad     m9, 10
-    psrad     m10, 10
-    psrad     m11, 10
+    psrad     m0, INTERP_SHIFT_PP
+    psrad     m1, INTERP_SHIFT_PP
+    psrad     m2, INTERP_SHIFT_PP
+    psrad     m3, INTERP_SHIFT_PP
+    psrad     m8, INTERP_SHIFT_PP
+    psrad     m9, INTERP_SHIFT_PP
+    psrad     m10, INTERP_SHIFT_PP
+    psrad     m11, INTERP_SHIFT_PP
+%else
+    psrad     m0, INTERP_SHIFT_SP
+    psrad     m1, INTERP_SHIFT_SP
+    psrad     m2, INTERP_SHIFT_SP
+    psrad     m3, INTERP_SHIFT_SP
+    psrad     m8, INTERP_SHIFT_SP
+    psrad     m9, INTERP_SHIFT_SP
+    psrad     m10, INTERP_SHIFT_SP
+    psrad     m11, INTERP_SHIFT_SP
 %endif
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5326,9 +5325,9 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [tab_c_32]
+    mova      m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [pd_524800]
+    mova      m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
     mova      m7, [INTERP_OFFSET_PS]
 %endif
@@ -5380,10 +5379,10 @@
     paddd     m1, m7
     paddd     m2, m7
     paddd     m3, m7
-    psrad     m0, 2
-    psrad     m1, 2
-    psrad     m2, 2
-    psrad     m3, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m1, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
+    psrad     m3, INTERP_SHIFT_PS
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5393,15 +5392,15 @@
     paddd     m2, m7
     paddd     m3, m7
 %ifidn %2, pp
-    psrad     m0, 6
-    psrad     m1, 6
-    psrad     m2, 6
-    psrad     m3, 6
-%else
-    psrad     m0, 10
-    psrad     m1, 10
-    psrad     m2, 10
-    psrad     m3, 10
+    psrad     m0, INTERP_SHIFT_PP
+    psrad     m1, INTERP_SHIFT_PP
+    psrad     m2, INTERP_SHIFT_PP
+    psrad     m3, INTERP_SHIFT_PP
+%else
+    psrad     m0, INTERP_SHIFT_SP
+    psrad     m1, INTERP_SHIFT_SP
+    psrad     m2, INTERP_SHIFT_SP
+    psrad     m3, INTERP_SHIFT_SP
 %endif
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5457,9 +5456,9 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [tab_c_32]
+    mova      m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [pd_524800]
+    mova      m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
     mova      m7, [INTERP_OFFSET_PS]
 %endif
@@ -5479,10 +5478,10 @@
     paddd     m1, m7
     paddd     m2, m7
     paddd     m3, m7
-    psrad     m0, 2
-    psrad     m1, 2
-    psrad     m2, 2
-    psrad     m3, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m1, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
+    psrad     m3, INTERP_SHIFT_PS
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5492,15 +5491,15 @@
     paddd     m2, m7
     paddd     m3, m7
  %ifidn %2, pp
-    psrad     m0, 6
-    psrad     m1, 6
-    psrad     m2, 6
-    psrad     m3, 6
-%else
-    psrad     m0, 10
-    psrad     m1, 10
-    psrad     m2, 10
-    psrad     m3, 10
+    psrad     m0, INTERP_SHIFT_PP
+    psrad     m1, INTERP_SHIFT_PP
+    psrad     m2, INTERP_SHIFT_PP
+    psrad     m3, INTERP_SHIFT_PP
+%else
+    psrad     m0, INTERP_SHIFT_SP
+    psrad     m1, INTERP_SHIFT_SP
+    psrad     m2, INTERP_SHIFT_SP
+    psrad     m3, INTERP_SHIFT_SP
 %endif
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5610,9 +5609,9 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [tab_c_32]
+    mova      m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [pd_524800]
+    mova      m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
     mova      m7, [INTERP_OFFSET_PS]
 %endif
@@ -5639,18 +5638,18 @@
     paddd     m1, m7
     paddd     m2, m7
     paddd     m3, m7
-    psrad     m0, 2
-    psrad     m1, 2
-    psrad     m2, 2
-    psrad     m3, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m1, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
+    psrad     m3, INTERP_SHIFT_PS
     paddd     m8, m7
     paddd     m9, m7
     paddd     m10, m7
     paddd     m11, m7
-    psrad     m8, 2
-    psrad     m9, 2
-    psrad     m10, 2
-    psrad     m11, 2
+    psrad     m8, INTERP_SHIFT_PS
+    psrad     m9, INTERP_SHIFT_PS
+    psrad     m10, INTERP_SHIFT_PS
+    psrad     m11, INTERP_SHIFT_PS
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5666,23 +5665,23 @@
     paddd     m10, m7
     paddd     m11, m7
  %ifidn %2, pp
-    psrad     m0, 6
-    psrad     m1, 6
-    psrad     m2, 6
-    psrad     m3, 6
-    psrad     m8, 6
-    psrad     m9, 6
-    psrad     m10, 6
-    psrad     m11, 6
-%else
-    psrad     m0, 10
-    psrad     m1, 10
-    psrad     m2, 10
-    psrad     m3, 10
-    psrad     m8, 10
-    psrad     m9, 10
-    psrad     m10, 10
-    psrad     m11, 10
+    psrad     m0, INTERP_SHIFT_PP
+    psrad     m1, INTERP_SHIFT_PP
+    psrad     m2, INTERP_SHIFT_PP
+    psrad     m3, INTERP_SHIFT_PP
+    psrad     m8, INTERP_SHIFT_PP
+    psrad     m9, INTERP_SHIFT_PP
+    psrad     m10, INTERP_SHIFT_PP
+    psrad     m11, INTERP_SHIFT_PP
+%else
+    psrad     m0, INTERP_SHIFT_SP
+    psrad     m1, INTERP_SHIFT_SP
+    psrad     m2, INTERP_SHIFT_SP
+    psrad     m3, INTERP_SHIFT_SP
+    psrad     m8, INTERP_SHIFT_SP
+    psrad     m9, INTERP_SHIFT_SP
+    psrad     m10, INTERP_SHIFT_SP
+    psrad     m11, INTERP_SHIFT_SP
 %endif
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5733,9 +5732,9 @@
     mov       r4d, 32
 
 %ifidn %1, pp
-    mova      m7, [tab_c_32]
+    mova      m7, [INTERP_OFFSET_PP]
 %elifidn %1, sp
-    mova      m7, [pd_524800]
+    mova      m7, [INTERP_OFFSET_SP]
 %elifidn %1, ps
     mova      m7, [INTERP_OFFSET_PS]
 %endif
@@ -5787,10 +5786,10 @@
     paddd     m1, m7
     paddd     m2, m7
     paddd     m3, m7
-    psrad     m0, 2
-    psrad     m1, 2
-    psrad     m2, 2
-    psrad     m3, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m1, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
+    psrad     m3, INTERP_SHIFT_PS
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5800,15 +5799,15 @@
     paddd     m2, m7
     paddd     m3, m7
 %ifidn %1, pp
-    psrad     m0, 6
-    psrad     m1, 6
-    psrad     m2, 6
-    psrad     m3, 6
-%else
-    psrad     m0, 10
-    psrad     m1, 10
-    psrad     m2, 10
-    psrad     m3, 10
+    psrad     m0, INTERP_SHIFT_PP
+    psrad     m1, INTERP_SHIFT_PP
+    psrad     m2, INTERP_SHIFT_PP
+    psrad     m3, INTERP_SHIFT_PP
+%else
+    psrad     m0, INTERP_SHIFT_SP
+    psrad     m1, INTERP_SHIFT_SP
+    psrad     m2, INTERP_SHIFT_SP
+    psrad     m3, INTERP_SHIFT_SP
 %endif
     packssdw  m0, m1
     packssdw  m2, m3
@@ -5827,6 +5826,7 @@
     jnz       .loopH
     RET
 %endmacro
+
     FILTER_VER_CHROMA_W16_48x64_avx2 pp, 8
     FILTER_VER_CHROMA_W16_48x64_avx2 ps, 8
     FILTER_VER_CHROMA_W16_48x64_avx2 ss, 7
@@ -5834,7 +5834,6 @@
 
 INIT_XMM sse2
 cglobal chroma_p2s, 3, 7, 3
-
     ; load width and height
     mov         r3d, r3m
     mov         r4d, r4m
@@ -5850,11 +5849,11 @@
     lea         r6, [r0 + r5 * 2]
 
     movu        m0, [r6]
-    psllw       m0, 4
+    psllw       m0, (14 - BIT_DEPTH)
     paddw       m0, m2
 
     movu        m1, [r6 + r1]
-    psllw       m1, 4
+    psllw       m1, (14 - BIT_DEPTH)
     paddw       m1, m2
 
     add         r5d, 8
@@ -5887,7 +5886,6 @@
 
     sub         r4d, 2
     jnz         .loopH
-
     RET
 
 %macro PROCESS_LUMA_VER_W4_4R 0
@@ -5975,7 +5973,7 @@
     lea       r6, [tab_LumaCoeffV + r4]
 %endif
 
-    mova      m7, [pd_32]
+    mova      m7, [INTERP_OFFSET_PP]
 
     mov       dword [rsp], %2/4
 .loopH:
@@ -5988,10 +5986,10 @@
     paddd     m2, m7
     paddd     m3, m7
 
-    psrad     m0, 6
-    psrad     m1, 6
-    psrad     m2, 6
-    psrad     m3, 6
+    psrad     m0, INTERP_SHIFT_PP
+    psrad     m1, INTERP_SHIFT_PP
+    psrad     m2, INTERP_SHIFT_PP
+    psrad     m3, INTERP_SHIFT_PP
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -6017,7 +6015,6 @@
 
     dec       dword [rsp]
     jnz       .loopH
-
     RET
 %endmacro
 
@@ -6126,14 +6123,14 @@
     paddd           m0, m6
     paddd           m2, m6
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m2, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m2, 10
-%else
-    psrad           m0, 2
-    psrad           m2, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -6294,20 +6291,20 @@
     paddd           m2, m11
     paddd           m3, m11
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m1, 6
-    psrad           m2, 6
-    psrad           m3, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
+    psrad           m3, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m1, 10
-    psrad           m2, 10
-    psrad           m3, 10
-%else
-    psrad           m0, 2
-    psrad           m1, 2
-    psrad           m2, 2
-    psrad           m3, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+    psrad           m3, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
+    psrad           m3, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -6365,20 +6362,20 @@
     paddd           m6, m11
     paddd           m7, m11
 %ifidn %1,pp
-    psrad           m4, 6
-    psrad           m5, 6
-    psrad           m6, 6
-    psrad           m7, 6
+    psrad           m4, INTERP_SHIFT_PP
+    psrad           m5, INTERP_SHIFT_PP
+    psrad           m6, INTERP_SHIFT_PP
+    psrad           m7, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m4, 10
-    psrad           m5, 10
-    psrad           m6, 10
-    psrad           m7, 10
-%else
-    psrad           m4, 2
-    psrad           m5, 2
-    psrad           m6, 2
-    psrad           m7, 2
+    psrad           m4, INTERP_SHIFT_SP
+    psrad           m5, INTERP_SHIFT_SP
+    psrad           m6, INTERP_SHIFT_SP
+    psrad           m7, INTERP_SHIFT_SP
+%else
+    psrad           m4, INTERP_SHIFT_PS
+    psrad           m5, INTERP_SHIFT_PS
+    psrad           m6, INTERP_SHIFT_PS
+    psrad           m7, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -6538,26 +6535,26 @@
     paddd           m4, m14
     paddd           m5, m14
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m1, 6
-    psrad           m2, 6
-    psrad           m3, 6
-    psrad           m4, 6
-    psrad           m5, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
+    psrad           m3, INTERP_SHIFT_PP
+    psrad           m4, INTERP_SHIFT_PP
+    psrad           m5, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m1, 10
-    psrad           m2, 10
-    psrad           m3, 10
-    psrad           m4, 10
-    psrad           m5, 10
-%else
-    psrad           m0, 2
-    psrad           m1, 2
-    psrad           m2, 2
-    psrad           m3, 2
-    psrad           m4, 2
-    psrad           m5, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+    psrad           m3, INTERP_SHIFT_SP
+    psrad           m4, INTERP_SHIFT_SP
+    psrad           m5, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
+    psrad           m3, INTERP_SHIFT_PS
+    psrad           m4, INTERP_SHIFT_PS
+    psrad           m5, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -6620,14 +6617,14 @@
     paddd           m6, m14
     paddd           m7, m14
 %ifidn %1,pp
-    psrad           m6, 6
-    psrad           m7, 6
+    psrad           m6, INTERP_SHIFT_PP
+    psrad           m7, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m6, 10
-    psrad           m7, 10
-%else
-    psrad           m6, 2
-    psrad           m7, 2
+    psrad           m6, INTERP_SHIFT_SP
+    psrad           m7, INTERP_SHIFT_SP
+%else
+    psrad           m6, INTERP_SHIFT_PS
+    psrad           m7, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -6734,32 +6731,32 @@
     paddd           m0, m14
     paddd           m1, m14
 %ifidn %1,pp
-    psrad           m8, 6
-    psrad           m9, 6
-    psrad           m10, 6
-    psrad           m11, 6
-    psrad           m12, 6
-    psrad           m13, 6
-    psrad           m0, 6
-    psrad           m1, 6
+    psrad           m8, INTERP_SHIFT_PP
+    psrad           m9, INTERP_SHIFT_PP
+    psrad           m10, INTERP_SHIFT_PP
+    psrad           m11, INTERP_SHIFT_PP
+    psrad           m12, INTERP_SHIFT_PP
+    psrad           m13, INTERP_SHIFT_PP
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m8, 10
-    psrad           m9, 10
-    psrad           m10, 10
-    psrad           m11, 10
-    psrad           m12, 10
-    psrad           m13, 10
-    psrad           m0, 10
-    psrad           m1, 10
-%else
-    psrad           m8, 2
-    psrad           m9, 2
-    psrad           m10, 2
-    psrad           m11, 2
-    psrad           m12, 2
-    psrad           m13, 2
-    psrad           m0, 2
-    psrad           m1, 2
+    psrad           m8, INTERP_SHIFT_SP
+    psrad           m9, INTERP_SHIFT_SP
+    psrad           m10, INTERP_SHIFT_SP
+    psrad           m11, INTERP_SHIFT_SP
+    psrad           m12, INTERP_SHIFT_SP
+    psrad           m13, INTERP_SHIFT_SP
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+%else
+    psrad           m8, INTERP_SHIFT_PS
+    psrad           m9, INTERP_SHIFT_PS
+    psrad           m10, INTERP_SHIFT_PS
+    psrad           m11, INTERP_SHIFT_PS
+    psrad           m12, INTERP_SHIFT_PS
+    psrad           m13, INTERP_SHIFT_PS
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -6819,7 +6816,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [pd_524800]
+    mova            m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -6870,7 +6867,7 @@
 %ifidn %3,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %3, sp
-    mova            m14, [pd_524800]
+    mova            m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -6953,7 +6950,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [pd_524800]
+    mova            m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -7089,26 +7086,26 @@
     paddd           m4, m14
     paddd           m5, m14
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m1, 6
-    psrad           m2, 6
-    psrad           m3, 6
-    psrad           m4, 6
-    psrad           m5, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
+    psrad           m3, INTERP_SHIFT_PP
+    psrad           m4, INTERP_SHIFT_PP
+    psrad           m5, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m1, 10
-    psrad           m2, 10
-    psrad           m3, 10
-    psrad           m4, 10
-    psrad           m5, 10
-%else
-    psrad           m0, 2
-    psrad           m1, 2
-    psrad           m2, 2
-    psrad           m3, 2
-    psrad           m4, 2
-    psrad           m5, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+    psrad           m3, INTERP_SHIFT_SP
+    psrad           m4, INTERP_SHIFT_SP
+    psrad           m5, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
+    psrad           m3, INTERP_SHIFT_PS
+    psrad           m4, INTERP_SHIFT_PS
+    psrad           m5, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -7171,14 +7168,14 @@
     paddd           m6, m14
     paddd           m7, m14
 %ifidn %1,pp
-    psrad           m6, 6
-    psrad           m7, 6
+    psrad           m6, INTERP_SHIFT_PP
+    psrad           m7, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m6, 10
-    psrad           m7, 10
-%else
-    psrad           m6, 2
-    psrad           m7, 2
+    psrad           m6, INTERP_SHIFT_SP
+    psrad           m7, INTERP_SHIFT_SP
+%else
+    psrad           m6, INTERP_SHIFT_PS
+    psrad           m7, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -7285,32 +7282,32 @@
     paddd           m0, m14
     paddd           m1, m14
 %ifidn %1,pp
-    psrad           m8, 6
-    psrad           m9, 6
-    psrad           m10, 6
-    psrad           m11, 6
-    psrad           m12, 6
-    psrad           m13, 6
-    psrad           m0, 6
-    psrad           m1, 6
+    psrad           m8, INTERP_SHIFT_PP
+    psrad           m9, INTERP_SHIFT_PP
+    psrad           m10, INTERP_SHIFT_PP
+    psrad           m11, INTERP_SHIFT_PP
+    psrad           m12, INTERP_SHIFT_PP
+    psrad           m13, INTERP_SHIFT_PP
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m8, 10
-    psrad           m9, 10
-    psrad           m10, 10
-    psrad           m11, 10
-    psrad           m12, 10
-    psrad           m13, 10
-    psrad           m0, 10
-    psrad           m1, 10
-%else
-    psrad           m8, 2
-    psrad           m9, 2
-    psrad           m10, 2
-    psrad           m11, 2
-    psrad           m12, 2
-    psrad           m13, 2
-    psrad           m0, 2
-    psrad           m1, 2
+    psrad           m8, INTERP_SHIFT_SP
+    psrad           m9, INTERP_SHIFT_SP
+    psrad           m10, INTERP_SHIFT_SP
+    psrad           m11, INTERP_SHIFT_SP
+    psrad           m12, INTERP_SHIFT_SP
+    psrad           m13, INTERP_SHIFT_SP
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+%else
+    psrad           m8, INTERP_SHIFT_PS
+    psrad           m9, INTERP_SHIFT_PS
+    psrad           m10, INTERP_SHIFT_PS
+    psrad           m11, INTERP_SHIFT_PS
+    psrad           m12, INTERP_SHIFT_PS
+    psrad           m13, INTERP_SHIFT_PS
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -7485,26 +7482,26 @@
     paddd           m4, m11
     paddd           m5, m11
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m1, 6
-    psrad           m2, 6
-    psrad           m3, 6
-    psrad           m4, 6
-    psrad           m5, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
+    psrad           m3, INTERP_SHIFT_PP
+    psrad           m4, INTERP_SHIFT_PP
+    psrad           m5, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m1, 10
-    psrad           m2, 10
-    psrad           m3, 10
-    psrad           m4, 10
-    psrad           m5, 10
-%else
-    psrad           m0, 2
-    psrad           m1, 2
-    psrad           m2, 2
-    psrad           m3, 2
-    psrad           m4, 2
-    psrad           m5, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+    psrad           m3, INTERP_SHIFT_SP
+    psrad           m4, INTERP_SHIFT_SP
+    psrad           m5, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
+    psrad           m3, INTERP_SHIFT_PS
+    psrad           m4, INTERP_SHIFT_PS
+    psrad           m5, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -7556,14 +7553,14 @@
     paddd           m6, m11
     paddd           m7, m11
 %ifidn %1,pp
-    psrad           m6, 6
-    psrad           m7, 6
+    psrad           m6, INTERP_SHIFT_PP
+    psrad           m7, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m6, 10
-    psrad           m7, 10
-%else
-    psrad           m6, 2
-    psrad           m7, 2
+    psrad           m6, INTERP_SHIFT_SP
+    psrad           m7, INTERP_SHIFT_SP
+%else
+    psrad           m6, INTERP_SHIFT_PS
+    psrad           m7, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -7600,7 +7597,7 @@
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
 %elifidn %1, sp
-    mova            m11, [pd_524800]
+    mova            m11, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
 %endif
@@ -7647,7 +7644,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [pd_524800]
+    mova            m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -7765,20 +7762,20 @@
     paddd           m2, m7
     paddd           m3, m7
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m1, 6
-    psrad           m2, 6
-    psrad           m3, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
+    psrad           m3, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m1, 10
-    psrad           m2, 10
-    psrad           m3, 10
-%else
-    psrad           m0, 2
-    psrad           m1, 2
-    psrad           m2, 2
-    psrad           m3, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+    psrad           m3, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
+    psrad           m3, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -7801,7 +7798,7 @@
 
 %macro FILTER_VER_LUMA_AVX2_16x4 1
 INIT_YMM avx2
-cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize
+cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0-gprsize
     mov             r4d, r4m
     shl             r4d, 7
     add             r1d, r1d
@@ -7819,7 +7816,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [pd_524800]
+    mova            m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -7864,7 +7861,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [pd_524800]
+    mova            m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -7904,7 +7901,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [pd_524800]
+    mova            m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -8014,20 +8011,20 @@
     paddd           m2, m14
     paddd           m3, m14
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m1, 6
-    psrad           m2, 6
-    psrad           m3, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
+    psrad           m3, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m1, 10
-    psrad           m2, 10
-    psrad           m3, 10
-%else
-    psrad           m0, 2
-    psrad           m1, 2
-    psrad           m2, 2
-    psrad           m3, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+    psrad           m3, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
+    psrad           m3, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -8105,20 +8102,20 @@
     paddd           m6, m14
     paddd           m7, m14
 %ifidn %1,pp
-    psrad           m4, 6
-    psrad           m5, 6
-    psrad           m6, 6
-    psrad           m7, 6
+    psrad           m4, INTERP_SHIFT_PP
+    psrad           m5, INTERP_SHIFT_PP
+    psrad           m6, INTERP_SHIFT_PP
+    psrad           m7, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m4, 10
-    psrad           m5, 10
-    psrad           m6, 10
-    psrad           m7, 10
-%else
-    psrad           m4, 2
-    psrad           m5, 2
-    psrad           m6, 2
-    psrad           m7, 2
+    psrad           m4, INTERP_SHIFT_SP
+    psrad           m5, INTERP_SHIFT_SP
+    psrad           m6, INTERP_SHIFT_SP
+    psrad           m7, INTERP_SHIFT_SP
+%else
+    psrad           m4, INTERP_SHIFT_PS
+    psrad           m5, INTERP_SHIFT_PS
+    psrad           m6, INTERP_SHIFT_PS
+    psrad           m7, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -8182,20 +8179,20 @@
     paddd           m10, m14
     paddd           m11, m14
 %ifidn %1,pp
-    psrad           m8, 6
-    psrad           m9, 6
-    psrad           m10, 6
-    psrad           m11, 6
+    psrad           m8, INTERP_SHIFT_PP
+    psrad           m9, INTERP_SHIFT_PP
+    psrad           m10, INTERP_SHIFT_PP
+    psrad           m11, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m8, 10
-    psrad           m9, 10
-    psrad           m10, 10
-    psrad           m11, 10
-%else
-    psrad           m8, 2
-    psrad           m9, 2
-    psrad           m10, 2
-    psrad           m11, 2
+    psrad           m8, INTERP_SHIFT_SP
+    psrad           m9, INTERP_SHIFT_SP
+    psrad           m10, INTERP_SHIFT_SP
+    psrad           m11, INTERP_SHIFT_SP
+%else
+    psrad           m8, INTERP_SHIFT_PS
+    psrad           m9, INTERP_SHIFT_PS
+    psrad           m10, INTERP_SHIFT_PS
+    psrad           m11, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -8251,7 +8248,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [pd_524800]
+    mova            m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -8315,14 +8312,14 @@
     paddd           m0, m7
     paddd           m2, m7
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m2, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m2, 10
-%else
-    psrad           m0, 2
-    psrad           m2, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -8366,14 +8363,14 @@
     paddd           m4, m7
     paddd           m1, m7
 %ifidn %1,pp
-    psrad           m4, 6
-    psrad           m1, 6
+    psrad           m4, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m4, 10
-    psrad           m1, 10
-%else
-    psrad           m4, 2
-    psrad           m1, 2
+    psrad           m4, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+%else
+    psrad           m4, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -8458,14 +8455,14 @@
     paddd           m0, m7
     paddd           m2, m7
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m2, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m2, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m2, 10
-%else
-    psrad           m0, 2
-    psrad           m2, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m2, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m2, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -8516,14 +8513,14 @@
     paddd           m4, m7
     paddd           m1, m7
 %ifidn %1,pp
-    psrad           m4, 6
-    psrad           m1, 6
+    psrad           m4, INTERP_SHIFT_PP
+    psrad           m1, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m4, 10
-    psrad           m1, 10
-%else
-    psrad           m4, 2
-    psrad           m1, 2
+    psrad           m4, INTERP_SHIFT_SP
+    psrad           m1, INTERP_SHIFT_SP
+%else
+    psrad           m4, INTERP_SHIFT_PS
+    psrad           m1, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -8574,14 +8571,14 @@
     paddd           m6, m7
     paddd           m5, m7
 %ifidn %1,pp
-    psrad           m6, 6
-    psrad           m5, 6
+    psrad           m6, INTERP_SHIFT_PP
+    psrad           m5, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m6, 10
-    psrad           m5, 10
-%else
-    psrad           m6, 2
-    psrad           m5, 2
+    psrad           m6, INTERP_SHIFT_SP
+    psrad           m5, INTERP_SHIFT_SP
+%else
+    psrad           m6, INTERP_SHIFT_PS
+    psrad           m5, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -8625,14 +8622,14 @@
     paddd           m0, m7
     paddd           m3, m7
 %ifidn %1,pp
-    psrad           m0, 6
-    psrad           m3, 6
+    psrad           m0, INTERP_SHIFT_PP
+    psrad           m3, INTERP_SHIFT_PP
 %elifidn %1, sp
-    psrad           m0, 10
-    psrad           m3, 10
-%else
-    psrad           m0, 2
-    psrad           m3, 2
+    psrad           m0, INTERP_SHIFT_SP
+    psrad           m3, INTERP_SHIFT_SP
+%else
+    psrad           m0, INTERP_SHIFT_PS
+    psrad           m3, INTERP_SHIFT_PS
 %endif
 %endif
 
@@ -8671,7 +8668,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [pd_524800]
+    mova            m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -8706,7 +8703,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [pd_524800]
+    mova            m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -8758,10 +8755,10 @@
     paddd     m2, m7
     paddd     m3, m7
 
-    psrad     m0, 2
-    psrad     m1, 2
-    psrad     m2, 2
-    psrad     m3, 2
+    psrad     m0, INTERP_SHIFT_PS
+    psrad     m1, INTERP_SHIFT_PS
+    psrad     m2, INTERP_SHIFT_PS
+    psrad     m3, INTERP_SHIFT_PS
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -8784,7 +8781,6 @@
 
     dec       dword [rsp]
     jnz       .loopH
-
     RET
 %endmacro
 
@@ -8837,7 +8833,7 @@
     lea       r6, [tab_LumaCoeffV + r4]
 %endif
 
-    mova      m7, [tab_c_524800]
+    mova      m7, [INTERP_OFFSET_SP]
 
     mov       dword [rsp], %2/4
 .loopH:
@@ -8850,10 +8846,10 @@
     paddd     m2, m7
     paddd     m3, m7
 
-    psrad     m0, 10
-    psrad     m1, 10
-    psrad     m2, 10
-    psrad     m3, 10
+    psrad     m0, INTERP_SHIFT_SP
+    psrad     m1, INTERP_SHIFT_SP
+    psrad     m2, INTERP_SHIFT_SP
+    psrad     m3, INTERP_SHIFT_SP
 
     packssdw  m0, m1
     packssdw  m2, m3
@@ -8879,7 +8875,6 @@
 
     dec       dword [rsp]
     jnz       .loopH
-
     RET
 %endmacro
 
@@ -8963,7 +8958,6 @@
 
     dec        dword [rsp]
     jnz        .loopH
-
     RET
 %endmacro
 
@@ -9011,7 +9005,7 @@
 %rep %1/4
     movd       m0, [r0]
     movhps     m0, [r0 + r1]
-    psllw      m0, 4
+    psllw      m0, (14 - BIT_DEPTH)
     psubw      m0, m1
 
     movd       [r2 + r3 * 0], m0
@@ -9019,7 +9013,7 @@
 
     movd       m0, [r0 + r1 * 2]
     movhps     m0, [r0 + r4]
-    psllw      m0, 4
+    psllw      m0, (14 - BIT_DEPTH)
     psubw      m0, m1
 
     movd       [r2 + r3 * 2], m0
@@ -10293,14 +10287,13 @@
     mov                         r4d,               r4m
     add                         r1d,               r1d
     add                         r3d,               r3d
-%ifdef PIC
-
+
+%ifdef PIC
     lea                         r6,                [tab_LumaCoeff]
-    lea                         r4 ,               [r4 * 8]
+    lea                         r4,                [r4 * 8]
     vbroadcasti128              m0,                [r6 + r4 * 2]
-
-%else
-    lea                         r4 ,                [r4 * 8]
+%else
+    lea                         r4,                [r4 * 8]
     vbroadcasti128              m0,                [tab_LumaCoeff + r4 * 2]
 %endif
 
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/loopfilter.asm	Tue Jul 21 14:30:11 2015 -0700
@@ -39,7 +39,7 @@
 cextern pb_128
 cextern pb_2
 cextern pw_2
-cextern pw_1023
+cextern pw_pixel_max
 cextern pb_movemask
 cextern pw_1
 cextern hmul_16p
@@ -81,7 +81,7 @@
     palignr     m2, m3, m5, 15
     por         m2, m0
 
-    mova        m4, [pw_1023]
+    mova        m4, [pw_pixel_max]
     psignb      m2, [pb_128]                ; m2 = signLeft
     pxor        m0, m0
     palignr     m0, m3, 15
@@ -127,7 +127,7 @@
     palignr     m2, m3, m5, 15
     por         m2, m0
 
-    mova        m4, [pw_1023]
+    mova        m4, [pw_pixel_max]
     psignb      m2, [pb_128]                ; m2 = signLeft
     pxor        m0, m0
     palignr     m0, m3, 15
@@ -249,7 +249,7 @@
     neg             r1b
     movd            xm1, r1d
     vinserti128     m0, m0, xm1, 1
-    mova            m5, [pw_1023]
+    mova            m5, [pw_pixel_max]
     mov             r1d, r4m
     add             r1d, r1d
     shr             r2d, 4
@@ -402,8 +402,8 @@
 
     pmaxsw      m7, m0
     pmaxsw      m5, m0
-    pminsw      m7, [pw_1023]
-    pminsw      m5, [pw_1023]
+    pminsw      m7, [pw_pixel_max]
+    pminsw      m5, [pw_pixel_max]
 
     movu        [r0], m7
     movu        [r0 + 16],  m5
@@ -468,7 +468,7 @@
     mov         r4d, r4m
     mova        m4, [pb_2]
     shr         r4d, 4
-    mova        m0, [pw_1023]
+    mova        m0, [pw_pixel_max]
 .loop
     movu        m5, [r0]
     movu        m3, [r0 + r3]
@@ -559,7 +559,7 @@
     add         r3d, r3d
     mov         r4d, r4m
     pxor        m0, m0                      ; m0 = 0
-    mova        m6, [pw_1023]
+    mova        m6, [pw_pixel_max]
     mov         r5d, r4d
     shr         r4d, 4
     mov         r6, r0
@@ -736,7 +736,7 @@
 cglobal saoCuOrgE1_2Rows, 4,5,8
     add             r3d, r3d
     mov             r4d, r4m
-    mova            m4, [pw_1023]
+    mova            m4, [pw_pixel_max]
     vbroadcasti128  m6, [r2]                ; m6 = m_iOffsetEo
     shr             r4d, 4
 .loop
@@ -884,8 +884,8 @@
     paddw       m5, m4
     pmaxsw      m7, m0
     pmaxsw      m5, m0
-    pminsw      m7, [pw_1023]
-    pminsw      m5, [pw_1023]
+    pminsw      m7, [pw_pixel_max]
+    pminsw      m5, [pw_pixel_max]
     movu        [r0], m7
     movu        [r0 + 16], m5
 
@@ -960,7 +960,7 @@
     movq            xm4, [r0 + r4 * 2]
     movhps          xm4, [r1 + r4]
     vbroadcasti128  m5, [r3]
-    mova            m6, [pw_1023]
+    mova            m6, [pw_pixel_max]
 .loop
     movu            m1, [r0]
     movu            m3, [r0 + r5 + 2]
@@ -1086,8 +1086,8 @@
     paddw           m7, m6
     pmaxsw          m1, m0
     pmaxsw          m7, m0
-    pminsw          m1, [pw_1023]
-    pminsw          m7, [pw_1023]
+    pminsw          m1, [pw_pixel_max]
+    pminsw          m7, [pw_pixel_max]
     movu            [r0], m1
     movu            [r0 + 32], m7
 
@@ -1212,8 +1212,8 @@
     paddw           m5, m4
     pmaxsw          m7, m0
     pmaxsw          m5, m0
-    pminsw          m7, [pw_1023]
-    pminsw          m5, [pw_1023]
+    pminsw          m7, [pw_pixel_max]
+    pminsw          m5, [pw_pixel_max]
     movu            [r0], m7
     movu            [r0 + 16], m5
 
@@ -1333,7 +1333,7 @@
     paddw           m1, m3
     pxor            m0, m0
     pmaxsw          m1, m0
-    pminsw          m1, [pw_1023]
+    pminsw          m1, [pw_pixel_max]
     movu            [r0], m1
 
     psubb           xm0, xm2
@@ -1461,8 +1461,8 @@
     pxor            m0, m0
     pmaxsw          m1, m0
     pmaxsw          m7, m0
-    pminsw          m1, [pw_1023]
-    pminsw          m7, [pw_1023]
+    pminsw          m1, [pw_pixel_max]
+    pminsw          m7, [pw_pixel_max]
     movu            [r0], m1
     movu            [r0 + 32], m7
 
@@ -1565,8 +1565,8 @@
 .loopW
     movu        m2, [r0 + r6]
     movu        m5, [r0 + r6 + 16]
-    psrlw       m0, m2, 5
-    psrlw       m6, m5, 5
+    psrlw       m0, m2, (BIT_DEPTH - 5)
+    psrlw       m6, m5, (BIT_DEPTH - 5)
     packuswb    m0, m6
     pand        m0, [pb_31]         ; m0 = [index]
 
@@ -1584,8 +1584,8 @@
     paddw       m5, m6
     pmaxsw      m2, m7
     pmaxsw      m5, m7
-    pminsw      m2, [pw_1023]
-    pminsw      m5, [pw_1023]
+    pminsw      m2, [pw_pixel_max]
+    pminsw      m5, [pw_pixel_max]
 
     movu        [r0 + r6], m2
     movu        [r0 + r6 + 16], m5
@@ -1656,7 +1656,7 @@
     sub             r1d, r2d
     sub             r1d, r2d
     shr             r2d, 4
-    mova            m7, [pw_1023]
+    mova            m7, [pw_pixel_max]
 
     mov             r6d, r3d
     shr             r3d, 1
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/mc-a.asm	Tue Jul 21 14:30:11 2015 -0700
@@ -32,6 +32,19 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
+%if BIT_DEPTH==8
+    %define ADDAVG_FACTOR       256
+    %define ADDAVG_ROUND        128
+%elif BIT_DEPTH==10
+    %define ADDAVG_FACTOR       1024
+    %define ADDAVG_ROUND        512
+%elif BIT_DEPTH==12
+    %define ADDAVG_FACTOR       4096
+    %define ADDAVG_ROUND        2048
+%else
+    %error Unsupport bit depth!
+%endif
+
 SECTION_RODATA 32
 
 ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
@@ -54,6 +67,8 @@
 cextern pw_512
 cextern pw_1023
 cextern pw_1024
+cextern pw_2048
+cextern pw_4096
 cextern pw_00ff
 cextern pw_pixel_max
 cextern pd_32
@@ -92,23 +107,24 @@
     punpcklqdq    m1,          m2
     punpcklqdq    m3,          m5
     paddw         m1,          m3
-    pmulhrsw      m1,          [pw_1024]
-    paddw         m1,          [pw_512]
+    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
+    paddw         m1,          [pw_ %+ ADDAVG_ROUND]
 
     pxor          m0,          m0
     pmaxsw        m1,          m0
-    pminsw        m1,          [pw_1023]
+    pminsw        m1,          [pw_pixel_max]
     movd          [r2],        m1
     pextrd        [r2 + r5],   m1, 1
     lea           r2,          [r2 + 2 * r5]
     pextrd        [r2],        m1, 2
     pextrd        [r2 + r5],   m1, 3
-
     RET
+
+
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova          m0,          [pw_512]
+    mova          m0,          [pw_ %+ ADDAVG_ROUND]
     pxor          m7,          m7
     add           r3,          r3
     add           r4,          r4
@@ -136,11 +152,11 @@
     punpcklqdq    m1,          m2
     punpcklqdq    m3,          m5
     paddw         m1,          m3
-    pmulhrsw      m1,          [pw_1024]
+    pmulhrsw      m1,          [pw_ %+ ADDAVG_FACTOR]
     paddw         m1,          m0
 
     pmaxsw        m1,          m7
-    pminsw        m1,          [pw_1023]
+    pminsw        m1,          [pw_pixel_max]
     movd          [r2],        m1
     pextrd        [r2 + r5],   m1, 1
     lea           r2,          [r2 + 2 * r5]
@@ -156,8 +172,8 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m6,         [pw_1023]
-    mova        m7,         [pw_1024]
+    mova        m6,         [pw_pixel_max]
+    mova        m7,         [pw_ %+ ADDAVG_FACTOR]
     mov         r6d,        16/4
     add         r3,         r3
     add         r4,         r4
@@ -183,7 +199,7 @@
     punpcklqdq  m3,         m5
     paddw       m1,         m3
     pmulhrsw    m1,         m7
-    paddw       m1,         [pw_512]
+    paddw       m1,         [pw_ %+ ADDAVG_ROUND]
     pxor        m0,         m0
     pmaxsw      m1,         m0
     pminsw      m1,         m6
@@ -213,21 +229,21 @@
     punpcklqdq     m0,          m1
     punpcklqdq     m2,          m3
     paddw          m0,          m2
-    pmulhrsw       m0,          [pw_1024]
-    paddw          m0,          [pw_512]
+    pmulhrsw       m0,          [pw_ %+ ADDAVG_FACTOR]
+    paddw          m0,          [pw_ %+ ADDAVG_ROUND]
 
     pxor           m6,          m6
     pmaxsw         m0,          m6
-    pminsw         m0,          [pw_1023]
+    pminsw         m0,          [pw_pixel_max]
     movh           [r2],        m0
     movhps         [r2 + r5],   m0
     RET
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,             [pw_512]
-    mova        m5,             [pw_1023]
-    mova        m7,             [pw_1024]
+    mova        m4,             [pw_ %+ ADDAVG_ROUND]
+    mova        m5,             [pw_pixel_max]
+    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,             m6
     add         r3,             r3
     add         r4,             r4
@@ -264,9 +280,9 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,             [pw_512]
-    mova        m5,             [pw_1023]
-    mova        m7,             [pw_1024]
+    mova        m4,             [pw_ %+ ADDAVG_ROUND]
+    mova        m5,             [pw_pixel_max]
+    mova        m7,             [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,             m6
     mov         r6d,            16/2
     add         r3,             r3
@@ -300,9 +316,9 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_512]
-    mova        m5,          [pw_1023]
-    mova        m7,          [pw_1024]
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
+    mova        m5,          [pw_pixel_max]
+    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -331,9 +347,9 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_512]
-    mova        m5,          [pw_1023]
-    mova        m7,          [pw_1024]
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
+    mova        m5,          [pw_pixel_max]
+    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -370,9 +386,9 @@
 %macro ADDAVG_W4_H4 1
 INIT_XMM sse4
 cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova           m4,          [pw_512]
-    mova           m5,          [pw_1023]
-    mova           m7,          [pw_1024]
+    mova           m4,          [pw_ %+ ADDAVG_ROUND]
+    mova           m5,          [pw_pixel_max]
+    mova           m7,          [pw_ %+ ADDAVG_FACTOR]
     pxor           m6,          m6
     add            r3,          r3
     add            r4,          r4
@@ -420,9 +436,9 @@
 %macro ADDAVG_W8_H4 1
 INIT_XMM sse4
 cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_512]
-    mova        m5,          [pw_1023]
-    mova        m7,          [pw_1024]
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
+    mova        m5,          [pw_pixel_max]
+    mova        m7,          [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -470,9 +486,9 @@
 %macro ADDAVG_W12_H4 1
 INIT_XMM sse4
 cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova           m4,             [pw_512]
-    mova           m5,             [pw_1023]
-    mova           m7,             [pw_1024]
+    mova           m4,             [pw_ %+ ADDAVG_ROUND]
+    mova           m5,             [pw_pixel_max]
+    mova           m7,             [pw_ %+ ADDAVG_FACTOR]
     pxor           m6,             m6
     add            r3,             r3
     add            r4,             r4
@@ -532,9 +548,9 @@
 %macro ADDAVG_W16_H4 1
 INIT_XMM sse4
 cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m7,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,              m6
     add         r3,              r3
     add         r4,              r4
@@ -601,9 +617,9 @@
 %macro ADDAVG_W24_H2 2
 INIT_XMM sse4
 cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m7,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,              m6
     add         r3,              r3
     add         r4,              r4
@@ -683,9 +699,9 @@
 %macro ADDAVG_W32_H2 1
 INIT_XMM sse4
 cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m7,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,              m6
     add         r3,              r3
     add         r4,              r4
@@ -787,9 +803,9 @@
 %macro ADDAVG_W48_H2 1
 INIT_XMM sse4
 cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m7,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,              m6
     add         r3,              r3
     add         r4,              r4
@@ -921,9 +937,9 @@
 %macro ADDAVG_W64_H1 1
 INIT_XMM sse4
 cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m7,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m7,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m6,              m6
     add         r3,              r3
     add         r4,              r4
@@ -1029,19 +1045,19 @@
 
     paddw       m0,          m1
     pxor        m1,          m1
-    pmulhrsw    m0,          [pw_1024]
-    paddw       m0,          [pw_512]
+    pmulhrsw    m0,          [pw_ %+ ADDAVG_FACTOR]
+    paddw       m0,          [pw_ %+ ADDAVG_ROUND]
     pmaxsw      m0,          m1
-    pminsw      m0,          [pw_1023]
+    pminsw      m0,          [pw_pixel_max]
     vextracti128 xm1,        m0, 1
     movu        [r2],        xm0
     movu        [r2 + r5 * 2], xm1
     RET
 
 cglobal addAvg_8x6, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_512]
-    mova        m5,          [pw_1023]
-    mova        m3,          [pw_1024]
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
+    mova        m5,          [pw_pixel_max]
+    mova        m3,          [pw_ %+ ADDAVG_FACTOR]
     pxor        m1,          m1
     add         r3d,         r3d
     add         r4d,         r4d
@@ -1100,9 +1116,9 @@
 
 %macro ADDAVG_W8_H4_AVX2 1
 cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_512]
-    mova        m5,          [pw_1023]
-    mova        m3,          [pw_1024]
+    mova        m4,          [pw_ %+ ADDAVG_ROUND]
+    mova        m5,          [pw_pixel_max]
+    mova        m3,          [pw_ %+ ADDAVG_FACTOR]
     pxor        m1,          m1
     add         r3d,         r3d
     add         r4d,         r4d
@@ -1159,9 +1175,9 @@
 ADDAVG_W8_H4_AVX2 64
 
 cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova           m4,             [pw_512]
-    mova           m5,             [pw_1023]
-    mova           m3,             [pw_1024]
+    mova           m4,             [pw_ %+ ADDAVG_ROUND]
+    mova           m5,             [pw_pixel_max]
+    mova           m3,             [pw_ %+ ADDAVG_FACTOR]
     pxor           m1,             m1
     add            r3,             r3
     add            r4,             r4
@@ -1201,8 +1217,8 @@
     RET
 
 cglobal addAvg_12x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova           m4,             [pw_512]
-    mova           m5,             [pw_1023]
+    mova           m4,             [pw_ %+ ADDAVG_ROUND]
+    mova           m5,             [pw_pixel_max]
     paddw          m3,             m4,  m4
     pxor           m1,             m1
     add            r3,             r3
@@ -1244,9 +1260,9 @@
 
 %macro ADDAVG_W16_H4_AVX2 1
 cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m3,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m2,              m2
     add         r3,              r3
     add         r4,              r4
@@ -1291,9 +1307,9 @@
 ADDAVG_W16_H4_AVX2 64
 
 cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m3,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m1,              m1
     add         r3,              r3
     add         r4,              r4
@@ -1347,8 +1363,8 @@
     RET
 
 cglobal addAvg_24x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
     paddw       m3,              m4,  m4
     pxor        m1,              m1
     add         r3,              r3
@@ -1404,9 +1420,9 @@
 
 %macro ADDAVG_W32_H2_AVX2 1
 cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m3,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m2,              m2
     add         r3,              r3
     add         r4,              r4
@@ -1468,9 +1484,9 @@
 ADDAVG_W32_H2_AVX2 64
 
 cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m3,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m2,              m2
     add         r3,              r3
     add         r4,              r4
@@ -1543,9 +1559,9 @@
 
 %macro ADDAVG_W64_H1_AVX2 1
 cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,              [pw_512]
-    mova        m5,              [pw_1023]
-    mova        m3,              [pw_1024]
+    mova        m4,              [pw_ %+ ADDAVG_ROUND]
+    mova        m5,              [pw_pixel_max]
+    mova        m3,              [pw_ %+ ADDAVG_FACTOR]
     pxor        m2,              m2
     add         r3d,             r3d
     add         r4d,             r4d
diff -r 46152345eb6f -r ab2c34d6ad91 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Jul 20 17:18:54 2015 -0700
+++ b/source/common/x86/pixel-util8.asm	Tue Jul 21 14:30:11 2015 -0700
@@ -879,8 +879,8 @@
 %if HIGH_BIT_DEPTH
     cmp         r3d, 32767
     jle         .skip
-    shr         r3d, 2
-    sub         r4d, 2
+    shr         r3d, (BIT_DEPTH - 8)
+    sub         r4d, (BIT_DEPTH - 8)
 .skip:
 %endif
     movd        m0, r4d             ; m0 = shift
@@ -1273,13 +1273,7 @@
 INIT_XMM sse4
 cglobal weight_pp, 4,7,7
 %define correction      (14 - BIT_DEPTH)
-%if BIT_DEPTH == 10
-    mova        m6, [pw_1023]
-%elif BIT_DEPTH == 12
-    mova        m6, [pw_3fff]
-%else
-  %error Unsupported BIT_DEPTH!
-%endif
+    mova        m6, [pw_pixel_max]
     mov         r6d, r6m
     mov         r4d, r4m
     mov         r5d, r5m
@@ -1423,7 +1417,7 @@
     movd         xm1, r7m
     vpbroadcastd m2, r8m
     mova         m5, [pw_1]
-    mova         m6, [pw_1023]
+    mova         m6, [pw_pixel_max]
     add         r2d, r2d
     add         r3d, r3d
     sub          r2d, r3d
@@ -1516,13 +1510,7 @@
 %if HIGH_BIT_DEPTH
 INIT_XMM sse4
 cglobal weight_sp, 6,7,8
-%if BIT_DEPTH == 10
-    mova        m1, [pw_1023]
-%elif BIT_DEPTH == 12
-    mova        m1, [pw_3fff]
-%else
-  %error Unsupported BIT_DEPTH!
-%endif
+    mova        m1, [pw_pixel_max]
     mova        m2, [pw_1]
     mov         r6d, r7m
     shl         r6d, 16
@@ -1681,7 +1669,7 @@
 %if HIGH_BIT_DEPTH
 INIT_YMM avx2
 cglobal weight_sp, 6,7,9
-    mova                      m1, [pw_1023]
+    mova                      m1, [pw_pixel_max]
     mova                      m2, [pw_1]
     mov                       r6d, r7m
     shl                       r6d, 16




More information about the x265-devel mailing list