[x265] [PATCH 1 of 5] asm: fix Main12 assembly up to SSSE3

Steve Borho steve at borho.org
Sat Jul 11 19:35:18 CEST 2015


On 07/10, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1436569064 25200
> # Node ID 8f60362f8555c11a14301737c0301fd6e9303448
> # Parent  7b3e1372bb28830ef0ab44cd652ecbe823573675
> asm: fix Main12 assembly up to SSSE3

Series queued, along with some Main12 and multilib testbench fixes.
On Mac I'm not making it up to SSSE3 yet:

$ ./test/TestBench --cpu SSSE3
Using random seed 55A152EF 12bit
Testing primitives: SSE2
sad[  8x8]: failed!

> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/vec/dct-sse3.cpp
> --- a/source/common/vec/dct-sse3.cpp	Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/vec/dct-sse3.cpp	Fri Jul 10 15:57:44 2015 -0700
> @@ -38,13 +38,8 @@
>  #define SHIFT1  7
>  #define ADD1    64
>  
> -#if HIGH_BIT_DEPTH
> -#define SHIFT2  10
> -#define ADD2    512
> -#else
> -#define SHIFT2  12
> -#define ADD2    2048
> -#endif
> +#define SHIFT2  (12 - (X265_DEPTH - 8))
> +#define ADD2    (1 << ((SHIFT2) - 1))
>  
>  ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
>  {
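
For reference, the unified macros reproduce the constants this hunk deletes
and extend them to Main12; the removed HIGH_BIT_DEPTH branch applied the
10-bit values (shift 10, round 512) to 12-bit builds as well, which is the
bug being fixed. A minimal standalone sanity check (plain C, not x265 code;
the helper names are mine and the real X265_DEPTH comes from the build
system):

    #include <stdio.h>

    /* IDCT second-pass shift/round, generalized over bit depth:
     * SHIFT2 = 12 - (depth - 8), ADD2 = 1 << (SHIFT2 - 1) */
    static int shift2(int depth) { return 12 - (depth - 8); }
    static int add2(int depth)   { return 1 << (shift2(depth) - 1); }

    int main(void)
    {
        for (int depth = 8; depth <= 12; depth += 2)
            printf("%2d-bit: SHIFT2 = %2d  ADD2 = %4d\n",
                   depth, shift2(depth), add2(depth));
        /* prints 12/2048 (8-bit), 10/512 (10-bit), 8/128 (12-bit) */
        return 0;
    }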
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/vec/dct-ssse3.cpp
> --- a/source/common/vec/dct-ssse3.cpp	Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/vec/dct-ssse3.cpp	Fri Jul 10 15:57:44 2015 -0700
> @@ -34,6 +34,18 @@
>  #include <pmmintrin.h> // SSE3
>  #include <tmmintrin.h> // SSSE3
>  
> +#define DCT16_SHIFT1  (3 + X265_DEPTH - 8)
> +#define DCT16_ADD1    (1 << ((DCT16_SHIFT1) - 1))
> +
> +#define DCT16_SHIFT2  10
> +#define DCT16_ADD2    (1 << ((DCT16_SHIFT2) - 1))
> +
> +#define DCT32_SHIFT1  (DCT16_SHIFT1 + 1)
> +#define DCT32_ADD1    (1 << ((DCT32_SHIFT1) - 1))
> +
> +#define DCT32_SHIFT2  (DCT16_SHIFT2 + 1)
> +#define DCT32_ADD2    (1 << ((DCT32_SHIFT2) - 1))
> +
>  using namespace X265_NS;
>  
>  ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
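
If I'm reading these right, they are the standard HEVC forward-transform
shifts: first pass log2(N) + bitDepth - 9, second pass log2(N) + 6
(depth-independent), with every round term equal to 1 << (shift - 1). A
quick check of the N=16 and N=32 cases (standalone C with my own helper
names, formulas per the spec, not x265 code):

    #include <assert.h>

    /* HEVC forward-transform shift schedule */
    static int fwd_shift1(int log2N, int depth) { return log2N + depth - 9; }
    static int fwd_shift2(int log2N)            { return log2N + 6; }

    int main(void)
    {
        for (int depth = 8; depth <= 12; depth += 2) {
            assert(fwd_shift1(4, depth) == 3 + depth - 8);        /* DCT16_SHIFT1 */
            assert(fwd_shift1(5, depth) == (3 + depth - 8) + 1);  /* DCT32_SHIFT1 */
        }
        assert(fwd_shift2(4) == 10);   /* DCT16_SHIFT2 */
        assert(fwd_shift2(5) == 11);   /* DCT32_SHIFT2 */
        return 0;
    }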
> @@ -100,20 +112,9 @@
>  
>  static void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
>  {
> -#if HIGH_BIT_DEPTH
> -#define SHIFT1  5
> -#define ADD1    16
> -#else
> -#define SHIFT1  3
> -#define ADD1    4
> -#endif
> -
> -#define SHIFT2  10
> -#define ADD2    512
> -
>      // Const
> -    __m128i c_4     = _mm_set1_epi32(ADD1);
> -    __m128i c_512   = _mm_set1_epi32(ADD2);
> +    __m128i c_4     = _mm_set1_epi32(DCT16_ADD1);
> +    __m128i c_512   = _mm_set1_epi32(DCT16_ADD2);
>  
>      int i;
>  
> @@ -201,29 +202,29 @@
>  
>          T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
>          T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
> -        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> -        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> +        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> +        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
>          T70  = _mm_packs_epi32(T60, T61);
>          _mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
>  
>          T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
>          T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
> -        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> -        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> +        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> +        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
>          T70  = _mm_packs_epi32(T60, T61);
>          _mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
>  
>          T60  = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
>          T61  = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
> -        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> -        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> +        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> +        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
>          T70  = _mm_packs_epi32(T60, T61);
>          _mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
>  
>          T60  = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
>          T61  = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
> -        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> -        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> +        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> +        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
>          T70  = _mm_packs_epi32(T60, T61);
>          _mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
>  
> @@ -233,8 +234,8 @@
>          T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
>          T60  = _mm_hadd_epi32(T60, T61);
>          T61  = _mm_hadd_epi32(T62, T63);
> -        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> -        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> +        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> +        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
>          T70  = _mm_packs_epi32(T60, T61);
>          _mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
>  
> @@ -244,8 +245,8 @@
>          T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
>          T60  = _mm_hadd_epi32(T60, T61);
>          T61  = _mm_hadd_epi32(T62, T63);
> -        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> -        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> +        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> +        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
>          T70  = _mm_packs_epi32(T60, T61);
>          _mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
>  
> @@ -255,8 +256,8 @@
>          T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
>          T60  = _mm_hadd_epi32(T60, T61);
>          T61  = _mm_hadd_epi32(T62, T63);
> -        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> -        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> +        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> +        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
>          T70  = _mm_packs_epi32(T60, T61);
>          _mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);
>  
> @@ -266,8 +267,8 @@
>          T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
>          T60  = _mm_hadd_epi32(T60, T61);
>          T61  = _mm_hadd_epi32(T62, T63);
> -        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> -        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> +        T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> +        T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
>          T70  = _mm_packs_epi32(T60, T61);
>          _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);
>  
> @@ -286,8 +287,8 @@
>      T63  = _mm_hadd_epi32(T66, T67); \
>      T60  = _mm_hadd_epi32(T60, T61); \
>      T61  = _mm_hadd_epi32(T62, T63); \
> -    T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1); \
> -    T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1); \
> +    T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1); \
> +    T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1); \
>      T70  = _mm_packs_epi32(T60, T61); \
>      _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
>  
> @@ -351,8 +352,8 @@
>  
>          T40  = _mm_hadd_epi32(T30, T31);
>          T41  = _mm_hsub_epi32(T30, T31);
> -        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> -        T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), SHIFT2);
> +        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
> +        T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), DCT16_SHIFT2);
>          T40  = _mm_packs_epi32(T40, T40);
>          T41  = _mm_packs_epi32(T41, T41);
>          _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
> @@ -376,7 +377,7 @@
>          T31  = _mm_hadd_epi32(T32, T33);
>  
>          T40  = _mm_hadd_epi32(T30, T31);
> -        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> +        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
>          T40  = _mm_packs_epi32(T40, T40);
>          _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
>  
> @@ -398,7 +399,7 @@
>          T31  = _mm_hadd_epi32(T32, T33);
>  
>          T40  = _mm_hadd_epi32(T30, T31);
> -        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> +        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
>          T40  = _mm_packs_epi32(T40, T40);
>          _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
>  
> @@ -420,7 +421,7 @@
>          T31  = _mm_hadd_epi32(T32, T33);
>  
>          T40  = _mm_hadd_epi32(T30, T31);
> -        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> +        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
>          T40  = _mm_packs_epi32(T40, T40);
>          _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
>  
> @@ -442,7 +443,7 @@
>          T31  = _mm_hadd_epi32(T32, T33);
>  
>          T40  = _mm_hadd_epi32(T30, T31);
> -        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> +        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
>          T40  = _mm_packs_epi32(T40, T40);
>          _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
>  
> @@ -464,7 +465,7 @@
>          T31  = _mm_hadd_epi32(T32, T33);
>  
>          T40  = _mm_hadd_epi32(T30, T31);
> -        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> +        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
>          T40  = _mm_packs_epi32(T40, T40);
>          _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
>  
> @@ -486,7 +487,7 @@
>          T31  = _mm_hadd_epi32(T32, T33);
>  
>          T40  = _mm_hadd_epi32(T30, T31);
> -        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> +        T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
>          T40  = _mm_packs_epi32(T40, T40);
>          _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
>  
> @@ -509,7 +510,7 @@
>      T31  = _mm_hadd_epi32(T32, T33); \
>          \
>      T40  = _mm_hadd_epi32(T30, T31); \
> -    T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2); \
> +    T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2); \
>      T40  = _mm_packs_epi32(T40, T40); \
>      _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
>  
> @@ -523,10 +524,6 @@
>          MAKE_ODD(28, 15);
>  #undef MAKE_ODD
>      }
> -#undef SHIFT1
> -#undef ADD1
> -#undef SHIFT2
> -#undef ADD2
>  }
>  
>  ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) =
> @@ -681,20 +678,9 @@
>  
>  static void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
>  {
> -#if HIGH_BIT_DEPTH
> -#define SHIFT1  6
> -#define ADD1    32
> -#else
> -#define SHIFT1  4
> -#define ADD1    8
> -#endif
> -
> -#define SHIFT2  11
> -#define ADD2    1024
> -
>      // Const
> -    __m128i c_8     = _mm_set1_epi32(ADD1);
> -    __m128i c_1024  = _mm_set1_epi32(ADD2);
> +    __m128i c_8     = _mm_set1_epi32(DCT32_ADD1);
> +    __m128i c_1024  = _mm_set1_epi32(DCT32_ADD2);
>  
>      int i;
>  
> @@ -839,15 +825,15 @@
>  
>          T50  = _mm_hadd_epi32(T40, T41);
>          T51  = _mm_hadd_epi32(T42, T43);
> -        T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
> -        T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
> +        T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
> +        T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
>          T60  = _mm_packs_epi32(T50, T51);
>          im[0][i] = T60;
>  
>          T50  = _mm_hsub_epi32(T40, T41);
>          T51  = _mm_hsub_epi32(T42, T43);
> -        T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
> -        T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
> +        T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
> +        T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
>          T60  = _mm_packs_epi32(T50, T51);
>          im[16][i] = T60;
>  
> @@ -867,8 +853,8 @@
>  
>          T50  = _mm_hadd_epi32(T40, T41);
>          T51  = _mm_hadd_epi32(T42, T43);
> -        T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
> -        T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
> +        T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
> +        T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
>          T60  = _mm_packs_epi32(T50, T51);
>          im[8][i] = T60;
>  
> @@ -888,8 +874,8 @@
>  
>          T50  = _mm_hadd_epi32(T40, T41);
>          T51  = _mm_hadd_epi32(T42, T43);
> -        T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
> -        T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
> +        T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
> +        T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
>          T60  = _mm_packs_epi32(T50, T51);
>          im[24][i] = T60;
>  
> @@ -910,8 +896,8 @@
>          \
>      T50  = _mm_hadd_epi32(T40, T41); \
>      T51  = _mm_hadd_epi32(T42, T43); \
> -    T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
> -    T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
> +    T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1); \
> +    T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1); \
>      T60  = _mm_packs_epi32(T50, T51); \
>      im[(dstPos)][i] = T60;
>  
> @@ -973,8 +959,8 @@
>          \
>      T50  = _mm_hadd_epi32(T50, T51); \
>      T51  = _mm_hadd_epi32(T52, T53); \
> -    T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
> -    T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
> +    T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1); \
> +    T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1); \
>      T60  = _mm_packs_epi32(T50, T51); \
>      im[(dstPos)][i] = T60;
>  
> @@ -1082,7 +1068,7 @@
>          \
>      T60  = _mm_hadd_epi32(T60, T61); \
>          \
> -    T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), SHIFT2); \
> +    T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), DCT32_SHIFT2); \
>      T60  = _mm_packs_epi32(T60, T60); \
>      _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
>  
> @@ -1124,10 +1110,6 @@
>          MAKE_ODD(158, 159, 160, 161, 31);
>  #undef MAKE_ODD
>      }
> -#undef SHIFT1
> -#undef ADD1
> -#undef SHIFT2
> -#undef ADD2
>  }
>  
>  namespace X265_NS {
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm	Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/x86/const-a.asm	Fri Jul 10 15:57:44 2015 -0700
> @@ -125,6 +125,7 @@
>  const pd_2048,              times  4 dd 2048
>  const pd_ffff,              times  4 dd 0xffff
>  const pd_32767,             times  4 dd 32767
> +const pd_524416,            times  4 dd 524416
>  const pd_n32768,            times  8 dd 0xffff8000
>  const pd_n131072,           times  4 dd 0xfffe0000
>  
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm	Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/x86/dct8.asm	Fri Jul 10 15:57:44 2015 -0700
> @@ -332,23 +332,48 @@
>  cextern pd_2048
>  cextern pw_ppppmmmm
>  cextern trans8_shuf
> +
> +
> +%if BIT_DEPTH == 12
> +    %define     DCT4_SHIFT          5
> +    %define     DCT4_ROUND          16
> +    %define    IDCT_SHIFT           8
> +    %define    IDCT_ROUND           128
> +    %define     DST4_SHIFT          5
> +    %define     DST4_ROUND          16
> +    %define     DCT8_SHIFT1         6
> +    %define     DCT8_ROUND1         32
> +%elif BIT_DEPTH == 10
> +    %define     DCT4_SHIFT          3
> +    %define     DCT4_ROUND          4
> +    %define    IDCT_SHIFT           10
> +    %define    IDCT_ROUND           512
> +    %define     DST4_SHIFT          3
> +    %define     DST4_ROUND          4
> +    %define     DCT8_SHIFT1         4
> +    %define     DCT8_ROUND1         8
> +%elif BIT_DEPTH == 8
> +    %define     DCT4_SHIFT          1
> +    %define     DCT4_ROUND          1
> +    %define    IDCT_SHIFT           12
> +    %define    IDCT_ROUND           2048
> +    %define     DST4_SHIFT          1
> +    %define     DST4_ROUND          1
> +    %define     DCT8_SHIFT1         2
> +    %define     DCT8_ROUND1         2
> +%else
> +    %error Unsupported BIT_DEPTH!
> +%endif
> +
> +%define         DCT8_ROUND2         256
> +%define         DCT8_SHIFT2         9
> +
>  ;------------------------------------------------------
>  ;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
>  ;------------------------------------------------------
>  INIT_XMM sse2
>  cglobal dct4, 3, 4, 8
> -%if BIT_DEPTH == 12
> -  %define       DCT_SHIFT 5
> -  mova          m7, [pd_16]
> -%elif BIT_DEPTH == 10
> -  %define       DCT_SHIFT 3
> -  mova          m7, [pd_4]
> -%elif BIT_DEPTH == 8
> -  %define       DCT_SHIFT 1
> -  mova          m7, [pd_1]
> -%else
> -  %error Unsupported BIT_DEPTH!
> -%endif
> +    mova        m7, [pd_ %+ DCT4_ROUND]
>      add         r2d, r2d
>      lea         r3, [tab_dct4]
>  
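
Nice use of NASM's %+ token-pasting operator here: with DCT4_ROUND defined
as 16/4/1 per depth, [pd_ %+ DCT4_ROUND] assembles to [pd_16], [pd_4] or
[pd_1], so the single define table above replaces a per-primitive %if
ladder. And if I'm reading the table right, everything in it follows one
pattern: the forward shifts grow by (BIT_DEPTH - 8), each round constant is
1 << (shift - 1), and IDCT_SHIFT is 20 - BIT_DEPTH (12/10/8 for 8/10/12-bit
input).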
> @@ -375,19 +400,19 @@
>      psubw       m2, m0
>      pmaddwd     m0, m1, m4
>      paddd       m0, m7
> -    psrad       m0, DCT_SHIFT
> +    psrad       m0, DCT4_SHIFT
>      pmaddwd     m3, m2, m5
>      paddd       m3, m7
> -    psrad       m3, DCT_SHIFT
> +    psrad       m3, DCT4_SHIFT
>      packssdw    m0, m3
>      pshufd      m0, m0, 0xD8
>      pshufhw     m0, m0, 0xB1
>      pmaddwd     m1, m6
>      paddd       m1, m7
> -    psrad       m1, DCT_SHIFT
> +    psrad       m1, DCT4_SHIFT
>      pmaddwd     m2, [r3 + 3 * 16]
>      paddd       m2, m7
> -    psrad       m2, DCT_SHIFT
> +    psrad       m2, DCT4_SHIFT
>      packssdw    m1, m2
>      pshufd      m1, m1, 0xD8
>      pshufhw     m1, m1, 0xB1
> @@ -434,18 +459,7 @@
>  ; - r2:     source stride
>  INIT_YMM avx2
>  cglobal dct4, 3, 4, 8, src, dst, srcStride
> -%if BIT_DEPTH == 12
> -    %define DCT_SHIFT 5
> -    vbroadcasti128 m7, [pd_16]
> -%elif BIT_DEPTH == 10
> -    %define DCT_SHIFT 3
> -    vbroadcasti128 m7, [pd_4]
> -%elif BIT_DEPTH == 8
> -    %define DCT_SHIFT 1
> -    vbroadcasti128 m7, [pd_1]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> +    vbroadcasti128  m7, [pd_ %+ DCT4_ROUND]
>      add             r2d, r2d
>      lea             r3, [avx2_dct4]
>  
> @@ -467,11 +481,11 @@
>  
>      pmaddwd         m2, m5
>      paddd           m2, m7
> -    psrad           m2, DCT_SHIFT
> +    psrad           m2, DCT4_SHIFT
>  
>      pmaddwd         m0, m6
>      paddd           m0, m7
> -    psrad           m0, DCT_SHIFT
> +    psrad           m0, DCT4_SHIFT
>  
>      packssdw        m2, m0
>      pshufb          m2, m4
> @@ -499,33 +513,19 @@
>  ;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  ;-------------------------------------------------------
>  INIT_XMM sse2
> -cglobal idct4, 3, 4, 7
> -%if BIT_DEPTH == 12
> -  %define IDCT4_OFFSET  [pd_128]
> -  %define IDCT4_SHIFT   8
> -%elif BIT_DEPTH == 10
> -  %define IDCT4_OFFSET  [pd_512]
> -  %define IDCT4_SHIFT   10
> -%elif BIT_DEPTH == 8
> -  %define IDCT4_OFFSET  [pd_2048]
> -  %define IDCT4_SHIFT   12
> -%else
> -  %error Unsupported BIT_DEPTH!
> -%endif
> +cglobal idct4, 3, 4, 6
>      add         r2d, r2d
>      lea         r3, [tab_dct4]
>  
> -    mova        m6, [pd_64]
> -
>      movu        m0, [r0 + 0 * 16]
>      movu        m1, [r0 + 1 * 16]
>  
>      punpcklwd   m2, m0, m1
>      pmaddwd     m3, m2, [r3 + 0 * 16]       ; m3 = E1
> -    paddd       m3, m6
> +    paddd       m3, [pd_64]
>  
>      pmaddwd     m2, [r3 + 2 * 16]           ; m2 = E2
> -    paddd       m2, m6
> +    paddd       m2, [pd_64]
>  
>      punpckhwd   m0, m1
>      pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
> @@ -549,29 +549,27 @@
>      punpcklwd   m0, m1, m4                  ; m0 = m128iA
>      punpckhwd   m1, m4                      ; m1 = m128iD
>  
> -    mova        m6, IDCT4_OFFSET
> -
>      punpcklwd   m2, m0, m1
>      pmaddwd     m3, m2, [r3 + 0 * 16]
> -    paddd       m3, m6                      ; m3 = E1
> +    paddd       m3, [pd_ %+ IDCT_ROUND]     ; m3 = E1
>  
>      pmaddwd     m2, [r3 + 2 * 16]
> -    paddd       m2, m6                      ; m2 = E2
> +    paddd       m2, [pd_ %+ IDCT_ROUND]     ; m2 = E2
>  
>      punpckhwd   m0, m1
>      pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
>      pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2
>  
>      paddd       m4, m3, m1
> -    psrad       m4, IDCT4_SHIFT             ; m4 = m128iA
> +    psrad       m4, IDCT_SHIFT              ; m4 = m128iA
>      paddd       m5, m2, m0
> -    psrad       m5, IDCT4_SHIFT
> +    psrad       m5, IDCT_SHIFT
>      packssdw    m4, m5                      ; m4 = m128iA
>  
>      psubd       m2, m0
> -    psrad       m2, IDCT4_SHIFT
> +    psrad       m2, IDCT_SHIFT
>      psubd       m3, m1
> -    psrad       m3, IDCT4_SHIFT
> +    psrad       m3, IDCT_SHIFT
>      packssdw    m2, m3                      ; m2 = m128iD
>  
>      punpcklwd   m1, m4, m2
> @@ -585,7 +583,6 @@
>      movlps      [r1 + 2 * r2], m1
>      lea         r1, [r1 + 2 * r2]
>      movhps      [r1 + r2], m1
> -
>      RET
>  
>  ;------------------------------------------------------
> @@ -606,18 +603,7 @@
>    %define       coef3   [r3 + 3 * 16]
>  %endif ; ARCH_X86_64
>  
> -%if BIT_DEPTH == 12
> -    %define       DST_SHIFT 5
> -    mova          m5, [pd_16]
> -%elif BIT_DEPTH == 10
> -    %define       DST_SHIFT 3
> -    mova          m5, [pd_4]
> -%elif BIT_DEPTH == 8
> -    %define       DST_SHIFT 1
> -    mova          m5, [pd_1]
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> +    mova        m5, [pd_ %+ DST4_ROUND]
>      add         r2d, r2d
>      lea         r3, [tab_dst4]
>  %if ARCH_X86_64
> @@ -641,7 +627,7 @@
>      pshufd      m3, m3, q3120
>      punpcklqdq  m2, m3
>      paddd       m2, m5
> -    psrad       m2, DST_SHIFT
> +    psrad       m2, DST4_SHIFT
>      pmaddwd     m3, m0, coef1
>      pmaddwd     m4, m1, coef1
>      pshufd      m6, m4, q2301
> @@ -652,7 +638,7 @@
>      pshufd      m3, m3, q3120
>      punpcklqdq  m3, m4
>      paddd       m3, m5
> -    psrad       m3, DST_SHIFT
> +    psrad       m3, DST4_SHIFT
>      packssdw    m2, m3                       ; m2 = T70
>      pmaddwd     m3, m0, coef2
>      pmaddwd     m4, m1, coef2
> @@ -664,7 +650,7 @@
>      pshufd      m3, m3, q3120
>      punpcklqdq  m3, m4
>      paddd       m3, m5
> -    psrad       m3, DST_SHIFT
> +    psrad       m3, DST4_SHIFT
>      pmaddwd     m0, coef3
>      pmaddwd     m1, coef3
>      pshufd      m6, m0, q2301
> @@ -675,7 +661,7 @@
>      pshufd      m1, m1, q3120
>      punpcklqdq  m0, m1
>      paddd       m0, m5
> -    psrad       m0, DST_SHIFT
> +    psrad       m0, DST4_SHIFT
>      packssdw    m3, m0                       ; m3 = T71
>      mova        m5, [pd_128]
>  
> @@ -730,7 +716,6 @@
>      psrad       m2, 8
>      packssdw    m0, m2
>      movu        [r1 + 1 * 16], m0
> -
>      RET
>  
>  ;------------------------------------------------------
> @@ -749,13 +734,7 @@
>  %define         coef0   m6
>  %define         coef1   m7
>  
> -%if BIT_DEPTH == 8
> -  %define       DST_SHIFT 1
> -  mova          m5, [pd_1]
> -%elif BIT_DEPTH == 10
> -  %define       DST_SHIFT 3
> -  mova          m5, [pd_4]
> -%endif
> +    mova        m5, [pd_ %+ DST4_ROUND]
>      add         r2d, r2d
>      lea         r3, [tab_dst4]
>      mova        coef0, [r3 + 0 * 16]
> @@ -775,23 +754,23 @@
>      pmaddwd     m3, m1, coef0
>      phaddd      m2, m3
>      paddd       m2, m5
> -    psrad       m2, DST_SHIFT
> +    psrad       m2, DST4_SHIFT
>      pmaddwd     m3, m0, coef1
>      pmaddwd     m4, m1, coef1
>      phaddd      m3, m4
>      paddd       m3, m5
> -    psrad       m3, DST_SHIFT
> +    psrad       m3, DST4_SHIFT
>      packssdw    m2, m3                       ; m2 = T70
>      pmaddwd     m3, m0, coef2
>      pmaddwd     m4, m1, coef2
>      phaddd      m3, m4
>      paddd       m3, m5
> -    psrad       m3, DST_SHIFT
> +    psrad       m3, DST4_SHIFT
>      pmaddwd     m0, coef3
>      pmaddwd     m1, coef3
>      phaddd      m0, m1
>      paddd       m0, m5
> -    psrad       m0, DST_SHIFT
> +    psrad       m0, DST4_SHIFT
>      packssdw    m3, m0                       ; m3 = T71
>      mova        m5, [pd_128]
>  
> @@ -822,7 +801,6 @@
>      psrad       m2, 8
>      packssdw    m0, m2
>      movu        [r1 + 1 * 16], m0
> -
>      RET
>  
>  ;------------------------------------------------------------------
> @@ -830,13 +808,7 @@
>  ;------------------------------------------------------------------
>  INIT_YMM avx2
>  cglobal dst4, 3, 4, 6
> -%if BIT_DEPTH == 8
> -  %define       DST_SHIFT 1
> -  vpbroadcastd  m5, [pd_1]
> -%elif BIT_DEPTH == 10
> -  %define       DST_SHIFT 3
> -  vpbroadcastd  m5, [pd_4]
> -%endif
> +    vbroadcasti128 m5, [pd_ %+ DST4_ROUND]
>      mova        m4, [trans8_shuf]
>      add         r2d, r2d
>      lea         r3, [pw_dst4_tab]
> @@ -853,12 +825,12 @@
>      pmaddwd     m1, m0, [r3 + 1 * 32]
>      phaddd      m2, m1
>      paddd       m2, m5
> -    psrad       m2, DST_SHIFT
> +    psrad       m2, DST4_SHIFT
>      pmaddwd     m3, m0, [r3 + 2 * 32]
>      pmaddwd     m1, m0, [r3 + 3 * 32]
>      phaddd      m3, m1
>      paddd       m3, m5
> -    psrad       m3, DST_SHIFT
> +    psrad       m3, DST4_SHIFT
>      packssdw    m2, m3
>      vpermd      m2, m4, m2
>  
> @@ -883,18 +855,7 @@
>  ;-------------------------------------------------------
>  INIT_XMM sse2
>  cglobal idst4, 3, 4, 7
> -%if BIT_DEPTH == 12
> -    mova m6,            [pd_128]
> -  %define IDCT4_SHIFT   8
> -%elif BIT_DEPTH == 10
> -    mova m6,            [pd_512]
> -  %define IDCT4_SHIFT   10
> -%elif BIT_DEPTH == 8
> -    mova m6,            [pd_2048]
> -  %define IDCT4_SHIFT   12
> -%else
> -  %error Unsupported BIT_DEPTH!
> -%endif
> +    mova        m6, [pd_ %+ IDCT_ROUND]
>      add         r2d, r2d
>      lea         r3, [tab_idst4]
>      mova        m5, [pd_64]
> @@ -942,23 +903,23 @@
>      pmaddwd     m3, m2, [r3 + 1 * 16]
>      paddd       m0, m3
>      paddd       m0, m6
> -    psrad       m0, IDCT4_SHIFT             ; m0 = S0
> +    psrad       m0, IDCT_SHIFT              ; m0 = S0
>      pmaddwd     m3, m1, [r3 + 2 * 16]
>      pmaddwd     m4, m2, [r3 + 3 * 16]
>      paddd       m3, m4
>      paddd       m3, m6
> -    psrad       m3, IDCT4_SHIFT             ; m3 = S8
> +    psrad       m3, IDCT_SHIFT              ; m3 = S8
>      packssdw    m0, m3                      ; m0 = m128iA
>      pmaddwd     m3, m1, [r3 + 4 * 16]
>      pmaddwd     m4, m2, [r3 + 5 * 16]
>      paddd       m3, m4
>      paddd       m3, m6
> -    psrad       m3, IDCT4_SHIFT             ; m3 = S0
> +    psrad       m3, IDCT_SHIFT              ; m3 = S0
>      pmaddwd     m1, [r3 + 6 * 16]
>      pmaddwd     m2, [r3 + 7 * 16]
>      paddd       m1, m2
>      paddd       m1, m6
> -    psrad       m1, IDCT4_SHIFT             ; m1 = S8
> +    psrad       m1, IDCT_SHIFT              ; m1 = S8
>      packssdw    m3, m1                      ; m3 = m128iD
>      punpcklwd   m1, m0, m3
>      punpckhwd   m0, m3
> @@ -978,18 +939,7 @@
>  ;-----------------------------------------------------------------
>  INIT_YMM avx2
>  cglobal idst4, 3, 4, 6
> -%if BIT_DEPTH == 12
> -    vpbroadcastd    m4,     [pd_256]
> -    %define IDCT4_SHIFT     8
> -%elif BIT_DEPTH == 10
> -    vpbroadcastd    m4,     [pd_512]
> -    %define IDCT4_SHIFT     10
> -%elif BIT_DEPTH == 8
> -    vpbroadcastd    m4,     [pd_2048]
> -    %define IDCT4_SHIFT     12
> -%else
> -  %error Unsupported BIT_DEPTH!
> -%endif
> +    vbroadcasti128 m4, [pd_ %+ IDCT_ROUND]
>      add         r2d, r2d
>      lea         r3, [pw_idst4_tab]
>  
> @@ -1030,12 +980,12 @@
>      pmaddwd     m3, m2, [r3 + 1 * 32]
>      paddd       m0, m3
>      paddd       m0, m4
> -    psrad       m0, IDCT4_SHIFT
> +    psrad       m0, IDCT_SHIFT
>      pmaddwd     m3, m1, [r3 + 2 * 32]
>      pmaddwd     m2, m2, [r3 + 3 * 32]
>      paddd       m3, m2
>      paddd       m3, m4
> -    psrad       m3, IDCT4_SHIFT
> +    psrad       m3, IDCT_SHIFT
>  
>      packssdw    m0, m3
>      pshufb      m1, m0, [pb_idst4_shuf]
> @@ -1066,20 +1016,6 @@
>      ; ...
>      ; Row6[4-7] Row7[4-7]
>      ;------------------------
> -%if BIT_DEPTH == 12
> -  %define       DCT_SHIFT1 6
> -  %define       DCT_ADD1 [pd_32]
> -%elif BIT_DEPTH == 10
> -  %define       DCT_SHIFT1 4
> -  %define       DCT_ADD1 [pd_8]
> -%elif BIT_DEPTH == 8
> -  %define       DCT_SHIFT1 2
> -  %define       DCT_ADD1 [pd_2]
> -%else
> -  %error Unsupported BIT_DEPTH!
> -%endif
> -%define         DCT_ADD2 [pd_256]
> -%define         DCT_SHIFT2 9
>  
>      add         r2, r2
>      lea         r3, [r2 * 3]
> @@ -1125,8 +1061,8 @@
>      punpckhqdq  m7, m5
>      punpcklqdq  m1, m5
>      paddd       m1, m7
> -    paddd       m1, DCT_ADD1
> -    psrad       m1, DCT_SHIFT1
> +    paddd       m1, [pd_ %+ DCT8_ROUND1]
> +    psrad       m1, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m1, m1, 0x1B
>    %endif
> @@ -1140,8 +1076,8 @@
>      punpckhqdq  m7, m5
>      punpcklqdq  m1, m5
>      paddd       m1, m7
> -    paddd       m1, DCT_ADD1
> -    psrad       m1, DCT_SHIFT1
> +    paddd       m1, [pd_ %+ DCT8_ROUND1]
> +    psrad       m1, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m1, m1, 0x1B
>    %endif
> @@ -1155,8 +1091,8 @@
>      punpckhqdq  m7, m5
>      punpcklqdq  m1, m5
>      paddd       m1, m7
> -    paddd       m1, DCT_ADD1
> -    psrad       m1, DCT_SHIFT1
> +    paddd       m1, [pd_ %+ DCT8_ROUND1]
> +    psrad       m1, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m1, m1, 0x1B
>    %endif
> @@ -1170,8 +1106,8 @@
>      punpckhqdq  m7, m0
>      punpcklqdq  m4, m0
>      paddd       m4, m7
> -    paddd       m4, DCT_ADD1
> -    psrad       m4, DCT_SHIFT1
> +    paddd       m4, [pd_ %+ DCT8_ROUND1]
> +    psrad       m4, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m4, m4, 0x1B
>    %endif
> @@ -1189,29 +1125,29 @@
>      pshuflw     m2, m2, 0xD8
>      pshufhw     m2, m2, 0xD8
>      pmaddwd     m3, m0, [r4 + 0*16]
> -    paddd       m3, DCT_ADD1
> -    psrad       m3, DCT_SHIFT1
> +    paddd       m3, [pd_ %+ DCT8_ROUND1]
> +    psrad       m3, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m3, m3, 0x1B
>    %endif
>      mova        [r5 + 0*2*mmsize], m3 ; Row 0
>      pmaddwd     m0, [r4 + 2*16]
> -    paddd       m0, DCT_ADD1
> -    psrad       m0, DCT_SHIFT1
> +    paddd       m0, [pd_ %+ DCT8_ROUND1]
> +    psrad       m0, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m0, m0, 0x1B
>    %endif
>      mova        [r5 + 4*2*mmsize], m0 ; Row 4
>      pmaddwd     m3, m2, [r4 + 1*16]
> -    paddd       m3, DCT_ADD1
> -    psrad       m3, DCT_SHIFT1
> +    paddd       m3, [pd_ %+ DCT8_ROUND1]
> +    psrad       m3, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m3, m3, 0x1B
>    %endif
>      mova        [r5 + 2*2*mmsize], m3 ; Row 2
>      pmaddwd     m2, [r4 + 3*16]
> -    paddd       m2, DCT_ADD1
> -    psrad       m2, DCT_SHIFT1
> +    paddd       m2, [pd_ %+ DCT8_ROUND1]
> +    psrad       m2, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m2, m2, 0x1B
>    %endif
> @@ -1271,16 +1207,16 @@
>      punpckhqdq  m7, m5
>      punpcklqdq  m3, m5
>      paddd       m3, m7                  ; m3 = [Row2 Row0]
> -    paddd       m3, DCT_ADD2
> -    psrad       m3, DCT_SHIFT2
> +    paddd       m3, [pd_ %+ DCT8_ROUND2]
> +    psrad       m3, DCT8_SHIFT2
>      pshufd      m4, m4, 0xD8
>      pshufd      m2, m2, 0xD8
>      mova        m7, m4
>      punpckhqdq  m7, m2
>      punpcklqdq  m4, m2
>      psubd       m4, m7                  ; m4 = [Row6 Row4]
> -    paddd       m4, DCT_ADD2
> -    psrad       m4, DCT_SHIFT2
> +    paddd       m4, [pd_ %+ DCT8_ROUND2]
> +    psrad       m4, DCT8_SHIFT2
>  
>      packssdw    m3, m3
>      movd        [r1 + 0*mmsize], m3
> @@ -1341,8 +1277,8 @@
>      punpckhqdq  m7, m4
>      punpcklqdq  m2, m4
>      paddd       m2, m7                  ; m2 = [Row3 Row1]
> -    paddd       m2, DCT_ADD2
> -    psrad       m2, DCT_SHIFT2
> +    paddd       m2, [pd_ %+ DCT8_ROUND2]
> +    psrad       m2, DCT8_SHIFT2
>  
>      packssdw    m2, m2
>      movd        [r1 + 1*mmsize], m2
> @@ -1397,8 +1333,8 @@
>      punpckhqdq  m7, m4
>      punpcklqdq  m2, m4
>      paddd       m2, m7                  ; m2 = [Row7 Row5]
> -    paddd       m2, DCT_ADD2
> -    psrad       m2, DCT_SHIFT2
> +    paddd       m2, [pd_ %+ DCT8_ROUND2]
> +    psrad       m2, DCT8_SHIFT2
>  
>      packssdw    m2, m2
>      movd        [r1 + 5*mmsize], m2
> @@ -1412,10 +1348,6 @@
>  %endrep
>  
>      RET
> -%undef IDCT_SHIFT1
> -%undef IDCT_ADD1
> -%undef IDCT_SHIFT2
> -%undef IDCT_ADD2
>  
>  ;-------------------------------------------------------
>  ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
> @@ -1432,18 +1364,7 @@
>      ; ...
>      ; Row6[4-7] Row7[4-7]
>      ;------------------------
> -%if BIT_DEPTH == 12
> -  %define       DCT_SHIFT 6
> -  mova          m6, [pd_16]
> -%elif BIT_DEPTH == 10
> -  %define       DCT_SHIFT 4
> -  mova          m6, [pd_8]
> -%elif BIT_DEPTH == 8
> -  %define       DCT_SHIFT 2
> -  mova          m6, [pd_2]
> -%else
> -  %error Unsupported BIT_DEPTH!
> -%endif
> +    mova        m6, [pd_ %+ DCT8_ROUND1]
>  
>      add         r2, r2
>      lea         r3, [r2 * 3]
> @@ -1485,7 +1406,7 @@
>      pmaddwd     m5, m0, [r4 + 0*16]
>      phaddd      m1, m5
>      paddd       m1, m6
> -    psrad       m1, DCT_SHIFT
> +    psrad       m1, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m1, m1, 0x1B
>    %endif
> @@ -1495,7 +1416,7 @@
>      pmaddwd     m5, m0, [r4 + 1*16]
>      phaddd      m1, m5
>      paddd       m1, m6
> -    psrad       m1, DCT_SHIFT
> +    psrad       m1, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m1, m1, 0x1B
>    %endif
> @@ -1505,7 +1426,7 @@
>      pmaddwd     m5, m0, [r4 + 2*16]
>      phaddd      m1, m5
>      paddd       m1, m6
> -    psrad       m1, DCT_SHIFT
> +    psrad       m1, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m1, m1, 0x1B
>    %endif
> @@ -1515,7 +1436,7 @@
>      pmaddwd     m0, [r4 + 3*16]
>      phaddd      m4, m0
>      paddd       m4, m6
> -    psrad       m4, DCT_SHIFT
> +    psrad       m4, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m4, m4, 0x1B
>    %endif
> @@ -1530,28 +1451,28 @@
>      pshufb      m2, [pb_unpackhlw1]
>      pmaddwd     m3, m0, [r4 + 0*16]
>      paddd       m3, m6
> -    psrad       m3, DCT_SHIFT
> +    psrad       m3, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m3, m3, 0x1B
>    %endif
>      mova        [r5 + 0*2*mmsize], m3 ; Row 0
>      pmaddwd     m0, [r4 + 2*16]
>      paddd       m0, m6
> -    psrad       m0, DCT_SHIFT
> +    psrad       m0, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m0, m0, 0x1B
>    %endif
>      mova        [r5 + 4*2*mmsize], m0 ; Row 4
>      pmaddwd     m3, m2, [r4 + 1*16]
>      paddd       m3, m6
> -    psrad       m3, DCT_SHIFT
> +    psrad       m3, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m3, m3, 0x1B
>    %endif
>      mova        [r5 + 2*2*mmsize], m3 ; Row 2
>      pmaddwd     m2, [r4 + 3*16]
>      paddd       m2, m6
> -    psrad       m2, DCT_SHIFT
> +    psrad       m2, DCT8_SHIFT1
>    %if x == 1
>      pshufd      m2, m2, 0x1B
>    %endif
> @@ -1649,19 +1570,6 @@
>  ;-------------------------------------------------------
>  %if ARCH_X86_64
>  INIT_XMM sse2
> -%if BIT_DEPTH == 12
> -    %define     IDCT_SHIFT 8
> -    %define     IDCT_ADD pd_128
> -%elif BIT_DEPTH == 10
> -    %define     IDCT_SHIFT 10
> -    %define     IDCT_ADD pd_512
> -%elif BIT_DEPTH == 8
> -    %define     IDCT_SHIFT 12
> -    %define     IDCT_ADD pd_2048
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
> -
>  cglobal idct8, 3, 6, 16, 0-5*mmsize
>      mova        m9, [r0 + 1 * mmsize]
>      mova        m1, [r0 + 3 * mmsize]
> @@ -1911,18 +1819,19 @@
>      psubd       m10, m2
>      mova        m2, m4
>      pmaddwd     m12, [tab_dct4 + 3 * mmsize]
> -    paddd       m0, [IDCT_ADD]
> -    paddd       m1, [IDCT_ADD]
> -    paddd       m8, [IDCT_ADD]
> -    paddd       m10, [IDCT_ADD]
> +    mova        m15, [pd_ %+ IDCT_ROUND]
> +    paddd       m0, m15
> +    paddd       m1, m15
> +    paddd       m8, m15
> +    paddd       m10, m15
>      paddd       m2, m13
>      paddd       m3, m12
> -    paddd       m2, [IDCT_ADD]
> -    paddd       m3, [IDCT_ADD]
> +    paddd       m2, m15
> +    paddd       m3, m15
>      psubd       m4, m13
>      psubd       m6, m12
> -    paddd       m4, [IDCT_ADD]
> -    paddd       m6, [IDCT_ADD]
> +    paddd       m4, m15
> +    paddd       m6, m15
>      mova        m15, [rsp + 4 * mmsize]
>      mova        m12, m8
>      psubd       m8, m7
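
If I follow the register state here, m15 is dead until it is restored from
[rsp + 4 * mmsize] just below, so caching the rounding constant in it turns
eight memory-operand paddd instructions into register-register ones.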
> @@ -2018,16 +1927,12 @@
>      movq        [r1 + r3 * 2 + 8], m8
>      movhps      [r1 + r0 + 8], m8
>      RET
> -
> -%undef IDCT_SHIFT
> -%undef IDCT_ADD
>  %endif
>  
>  ;-------------------------------------------------------
>  ; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  ;-------------------------------------------------------
>  INIT_XMM ssse3
> -
>  cglobal patial_butterfly_inverse_internal_pass1
>      movh        m0, [r0]
>      movhps      m0, [r0 + 2 * 16]
> @@ -2119,15 +2024,6 @@
>      ret
>  
>  %macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
> -%if BIT_DEPTH == 12
> -    %define     IDCT_SHIFT 8
> -%elif BIT_DEPTH == 10
> -    %define     IDCT_SHIFT 10
> -%elif BIT_DEPTH == 8
> -    %define     IDCT_SHIFT 12
> -%else
> -    %error Unsupported BIT_DEPTH!
> -%endif
>      pshufb      m4, %1, [pb_idct8even]
>      pmaddwd     m4, [tab_idct8_1]
>      phsubd      m5, m4
> @@ -2149,11 +2045,10 @@
>      pshufd      m4, m4, 0x1B
>  
>      packssdw    %1, m4
> -%undef IDCT_SHIFT
>  %endmacro
>  
> +INIT_XMM ssse3
>  cglobal patial_butterfly_inverse_internal_pass2
> -
>      mova        m0, [r5]
>      PARTIAL_BUTTERFLY_PROCESS_ROW m0
>      movu        [r1], m0
> @@ -2169,9 +2064,9 @@
>      mova        m3, [r5 + 48]
>      PARTIAL_BUTTERFLY_PROCESS_ROW m3
>      movu        [r1 + r3], m3
> -
>      ret
>  
> +INIT_XMM ssse3
>  cglobal idct8, 3,7,8 ;,0-16*mmsize
>      ; alignment stack to 64-bytes
>      mov         r5, rsp
> @@ -2190,15 +2085,7 @@
>  
>      call        patial_butterfly_inverse_internal_pass1
>  
> -%if BIT_DEPTH == 12
> -    mova        m6, [pd_256]
> -%elif BIT_DEPTH == 10
> -    mova        m6, [pd_512]
> -%elif BIT_DEPTH == 8
> -    mova        m6, [pd_2048]
> -%else
> -  %error Unsupported BIT_DEPTH!
> -%endif
> +    mova        m6, [pd_ %+ IDCT_ROUND]
>      add         r2, r2
>      lea         r3, [r2 * 3]
>      lea         r4, [tab_idct8_2]
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/intrapred16.asm
> --- a/source/common/x86/intrapred16.asm	Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/x86/intrapred16.asm	Fri Jul 10 15:57:44 2015 -0700
> @@ -109,11 +109,11 @@
>  cextern pw_16
>  cextern pw_31
>  cextern pw_32
> -cextern pw_1023
>  cextern pd_16
>  cextern pd_31
>  cextern pd_32
>  cextern pw_4096
> +cextern pw_pixel_max
>  cextern multiL
>  cextern multiH
>  cextern multiH2
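
Good catch on the clamp constant: pw_1023 hard-codes the 10-bit ceiling,
while pw_pixel_max is (1 << BIT_DEPTH) - 1 (255/1023/4095), so the
pmaxsw/pminsw clipping below stays correct in Main12 builds.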
> @@ -1228,11 +1228,11 @@
>  
>      punpcklwd   m0,             m0      ;[4 4 3 3 2 2 1 1]
>      pshufd      m1,             m0, 0xFA
> -    add         r1,             r1
> +    add         r1d,            r1d
>      pshufd      m0,             m0, 0x50
>      movhps      [r0 + r1],      m0
>      movh        [r0 + r1 * 2],  m1
> -    lea         r1,             [r1 * 3]
> +    lea         r1d,            [r1 * 3]
>      movhps      [r0 + r1],      m1
>  
>      cmp         r4m,            byte 0
> @@ -1247,7 +1247,7 @@
>      paddw       m0,             m1
>      pxor        m1,             m1
>      pmaxsw      m0,             m1
> -    pminsw      m0,             [pw_1023]
> +    pminsw      m0,             [pw_pixel_max]
>  .quit:
>      movh        [r0],           m0
>      RET
> @@ -1583,7 +1583,7 @@
>      paddw       m0,             m1
>      pxor        m1,             m1
>      pmaxsw      m0,             m1
> -    pminsw      m0,             [pw_1023]
> +    pminsw      m0,             [pw_pixel_max]
>  
>      movh        r2,             m0
>      mov         [r0],           r2w
> @@ -2756,7 +2756,7 @@
>      paddw       m0,             m1
>      pxor        m1,             m1
>      pmaxsw      m0,             m1
> -    pminsw      m0,             [pw_1023]
> +    pminsw      m0,             [pw_pixel_max]
>  .quit:
>      movh        [r0],           m0
>      RET
> @@ -2785,7 +2785,7 @@
>      paddw       m0,             m1
>      pxor        m1,             m1
>      pmaxsw      m0,             m1
> -    pminsw      m0,             [pw_1023]
> +    pminsw      m0,             [pw_pixel_max]
>  
>      pextrw      [r0],           m0, 0
>      pextrw      [r0 + r1],      m0, 1
> @@ -4002,7 +4002,7 @@
>      paddw       m0,             m1
>      pxor        m1,             m1
>      pmaxsw      m0,             m1
> -    pminsw      m0,             [pw_1023]
> +    pminsw      m0,             [pw_pixel_max]
>  .quit:
>      movu        [r0],           m0
>      RET
> @@ -5874,7 +5874,7 @@
>      paddw       m0,             m1
>      pxor        m1,             m1
>      pmaxsw      m0,             m1
> -    pminsw      m0,             [pw_1023]
> +    pminsw      m0,             [pw_pixel_max]
>      pextrw      [r0],          m0, 0
>      pextrw      [r0 + r1],     m0, 1
>      pextrw      [r0 + r1 * 2], m0, 2
> @@ -10287,9 +10287,9 @@
>      paddw       m0,                     m1
>      pxor        m1,                     m1
>      pmaxsw      m0,                     m1
> -    pminsw      m0,                     [pw_1023]
> +    pminsw      m0,                     [pw_pixel_max]
>      pmaxsw      m3,                     m1
> -    pminsw      m3,                     [pw_1023]
> +    pminsw      m3,                     [pw_pixel_max]
>  .quit:
>      movu        [r0],                   m0
>      movu        [r0 + 16],              m3
> @@ -10359,9 +10359,9 @@
>      paddw       m0,                 m1
>      pxor        m1,                 m1
>      pmaxsw      m0,                 m1
> -    pminsw      m0,                 [pw_1023]
> +    pminsw      m0,                 [pw_pixel_max]
>      pmaxsw      m3,                 m1
> -    pminsw      m3,                 [pw_1023]
> +    pminsw      m3,                 [pw_pixel_max]
>      pextrw      [r0],               m0, 0
>      pextrw      [r0 + r1],          m0, 1
>      pextrw      [r0 + r1 * 2],      m0, 2
> @@ -12952,7 +12952,7 @@
>      paddw           m0, m1
>      pxor            m1, m1
>      pmaxsw          m0, m1
> -    pminsw          m0, [pw_1023]
> +    pminsw          m0, [pw_pixel_max]
>  .quit:
>      movu            [r0], m0
>      RET
> @@ -12999,7 +12999,7 @@
>      paddw       m0,                 m1
>      pxor        m1,                 m1
>      pmaxsw      m0,                 m1
> -    pminsw      m0,                 [pw_1023]
> +    pminsw      m0,                 [pw_pixel_max]
>      pextrw      [r0],               xm0, 0
>      pextrw      [r0 + r1],          xm0, 1
>      pextrw      [r0 + r1 * 2],      xm0, 2
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/ipfilter16.asm
> --- a/source/common/x86/ipfilter16.asm	Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/x86/ipfilter16.asm	Fri Jul 10 15:57:44 2015 -0700
> @@ -53,7 +53,7 @@
>                           times 8 dw -4, 54
>                           times 8 dw 16, -2
>  
> -                         times 8 dw -6, 46 
> +                         times 8 dw -6, 46
>                           times 8 dw 28, -4
>  
>                           times 8 dw -4, 36
> @@ -147,15 +147,22 @@
>  
>  %if BIT_DEPTH == 10
>      %define INTERP_OFFSET_PS        pd_n32768
> +    %define INTERP_SHIFT_PS         2
> +    %define INTERP_OFFSET_SP        pd_524800
> +    %define INTERP_SHIFT_SP         10
>  %elif BIT_DEPTH == 12
>      %define INTERP_OFFSET_PS        pd_n131072
> -%else
> -%error Unsupport bit depth!
> +    %define INTERP_SHIFT_PS         4
> +    %define INTERP_OFFSET_SP        pd_524416
> +    %define INTERP_SHIFT_SP         8
> +%else
> +    %error Unsupport bit depth!
>  %endif
>  
>  SECTION .text
>  cextern pd_32
>  cextern pw_pixel_max
> +cextern pd_524416
>  cextern pd_n32768
>  cextern pd_n131072
>  cextern pw_2000
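
These shifts and offsets all derive from x265's interpolation precision
constants (IF_INTERNAL_PREC = 14, IF_FILTER_PREC = 6): the PS shift is
BIT_DEPTH - 8, the SP shift is 20 - BIT_DEPTH, and each offset folds the
internal-offset handling into the rounding term. A standalone check of the
magic numbers (plain C; the defines mirror what I believe x265's common
headers use, so treat the exact names as assumptions):

    #include <assert.h>

    #define IF_INTERNAL_PREC 14
    #define IF_FILTER_PREC   6
    #define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1))  /* 8192 */

    int main(void)
    {
        /* ps offset = -(IF_INTERNAL_OFFS << shift), shift = BIT_DEPTH - 8 */
        assert(-(IF_INTERNAL_OFFS << 2) == -32768);   /* 10-bit: pd_n32768  */
        assert(-(IF_INTERNAL_OFFS << 4) == -131072);  /* 12-bit: pd_n131072 */

        /* sp offset = (IF_INTERNAL_OFFS << IF_FILTER_PREC) + (1 << (shift - 1)),
         * shift = 20 - BIT_DEPTH */
        assert((IF_INTERNAL_OFFS << IF_FILTER_PREC) + (1 << 9) == 524800); /* 10-bit */
        assert((IF_INTERNAL_OFFS << IF_FILTER_PREC) + (1 << 7) == 524416); /* 12-bit */
        return 0;
    }

(nit: while touching this block, "Unsupport bit depth!" could become
"Unsupported bit depth!")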
> @@ -644,8 +651,8 @@
>      packssdw    m3,     m5
>      CLIPW       m3,     m7,     m6
>  %else
> -    psrad       m3,     2
> -    psrad       m5,     2
> +    psrad       m3,     INTERP_SHIFT_PS
> +    psrad       m5,     INTERP_SHIFT_PS
>      packssdw    m3,     m5
>  %endif
>      movd        [r2 + %1], m3
> @@ -682,8 +689,8 @@
>      pshufd      m5,     m5,     q3120
>      paddd       m5,     m1
>  
> -    psrad       m3,     2
> -    psrad       m5,     2
> +    psrad       m3,     INTERP_SHIFT_PS
> +    psrad       m5,     INTERP_SHIFT_PS
>      packssdw    m3,     m5
>  
>      movd        [r2 + %1], m3
> @@ -729,8 +736,8 @@
>      packssdw    m3,     m5
>      CLIPW       m3,     m7,     m6
>  %else
> -    psrad       m3,     2
> -    psrad       m5,     2
> +    psrad       m3,     INTERP_SHIFT_PS
> +    psrad       m5,     INTERP_SHIFT_PS
>      packssdw    m3,     m5
>  %endif
>      movh        [r2 + %1], m3
> @@ -753,7 +760,7 @@
>      punpcklqdq  m3,     m4
>      paddd       m3,     m1
>  
> -    psrad       m3,     2
> +    psrad       m3,     INTERP_SHIFT_PS
>      packssdw    m3,     m3
>      movh        [r2 + r3 * 2 + %1], m3
>  %endmacro
> @@ -794,8 +801,8 @@
>      packssdw    m3,     m5
>      CLIPW       m3,     m7,     m6
>  %else
> -    psrad       m3,     2
> -    psrad       m5,     2
> +    psrad       m3,     INTERP_SHIFT_PS
> +    psrad       m5,     INTERP_SHIFT_PS
>      packssdw    m3,     m5
>  %endif
>      movdqu      [r2 + %1], m3
> @@ -905,7 +912,7 @@
>  %endif ;z < y
>  %endrep
>  
> -RET
> +    RET
>  %endmacro
>  
>  ;-----------------------------------------------------------------------------
> @@ -1183,7 +1190,7 @@
>      mova        m0, [tab_LumaCoeff + r4]
>  %endif
>  
> -%ifidn %3, pp 
> +%ifidn %3, pp
>      mova        m1, [pd_32]
>      pxor        m6, m6
>      mova        m7, [pw_pixel_max]
> @@ -1270,7 +1277,7 @@
>      mova        m0, [tab_LumaCoeff + r4]
>  %endif
>  
> -%ifidn %3, pp 
> +%ifidn %3, pp
>      mova        m1, [pd_32]
>      pxor        m7, m7
>  %else
> @@ -1316,7 +1323,7 @@
>      phaddd      m6, m3
>      phaddd      m5, m6
>      paddd       m5, m1
> -%ifidn %3, pp 
> +%ifidn %3, pp
>      psrad       m4, 6
>      psrad       m5, 6
>      packusdw    m4, m5
> @@ -1372,7 +1379,7 @@
>  %else
>      mova        m0, [tab_LumaCoeff + r4]
>  %endif
> -%ifidn %3, pp 
> +%ifidn %3, pp
>      mova        m1, [pd_32]
>  %else
>      mova        m1, [INTERP_OFFSET_PS]
> @@ -1417,131 +1424,6 @@
>      phaddd      m6, m7
>      phaddd      m5, m6
>      paddd       m5, m1
> -%ifidn %3, pp 
> -    psrad       m4, 6
> -    psrad       m5, 6
> -    packusdw    m4, m5
> -    pxor        m5, m5
> -    CLIPW       m4, m5, [pw_pixel_max]
> -%else
> -    psrad       m4, 2
> -    psrad       m5, 2
> -    packssdw    m4, m5
> -%endif
> -
> -    movu        [r2], m4
> -
> -    movu        m2, [r0 + 32]                ; m2 = src[16-23]
> -
> -    pmaddwd     m4, m3, m0                   ; m3 = src[8-15]
> -    palignr     m5, m2, m3, 2                ; m5 = src[9-16]
> -    pmaddwd     m5, m0
> -    phaddd      m4, m5
> -
> -    palignr     m5, m2, m3, 4                ; m5 = src[10-17]
> -    pmaddwd     m5, m0
> -    palignr     m2, m3, 6                    ; m2 = src[11-18]
> -    pmaddwd     m2, m0
> -    phaddd      m5, m2
> -    phaddd      m4, m5
> -    paddd       m4, m1
> -%ifidn %3, pp 
> -    psrad       m4, 6
> -    packusdw    m4, m4
> -    pxor        m5, m5
> -    CLIPW       m4, m5, [pw_pixel_max]
> -%else
> -    psrad       m4, 2
> -    packssdw    m4, m4
> -%endif
> -
> -    movh        [r2 + 16], m4
> -
> -    add         r0, r1
> -    add         r2, r3
> -
> -    dec         r4d
> -    jnz         .loopH
> -    RET
> -%endmacro
> -
> -;-------------------------------------------------------------------------------------------------------------
> -; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
> -;-------------------------------------------------------------------------------------------------------------
> -FILTER_HOR_LUMA_W12 12, 16, pp
> -
> -;----------------------------------------------------------------------------------------------------------------------------
> -; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> -;----------------------------------------------------------------------------------------------------------------------------
> -FILTER_HOR_LUMA_W12 12, 16, ps
> -
> -;--------------------------------------------------------------------------------------------------------------
> -; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> -;--------------------------------------------------------------------------------------------------------------
> -%macro FILTER_HOR_LUMA_W16 3
> -INIT_XMM sse4
> -cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
> -
> -    add         r1, r1
> -    add         r3, r3
> -    mov         r4d, r4m
> -    sub         r0, 6
> -    shl         r4d, 4
> -
> -%ifdef PIC
> -    lea         r6, [tab_LumaCoeff]
> -    mova        m0, [r6 + r4]
> -%else
> -    mova        m0, [tab_LumaCoeff + r4]
> -%endif
> -
> -%ifidn %3, pp 
> -    mova        m1, [pd_32]
> -%else
> -    mova        m1, [INTERP_OFFSET_PS]
> -%endif
> -
> -    mov         r4d, %2
> -%ifidn %3, ps
> -    cmp         r5m, byte 0
> -    je          .loopH
> -    lea         r6, [r1 + 2 * r1]
> -    sub         r0, r6
> -    add         r4d, 7
> -%endif
> -
> -.loopH:
> -%assign x 0
> -%rep %1 / 16
> -    movu        m2, [r0 + x]                 ; m2 = src[0-7]
> -    movu        m3, [r0 + 16 + x]            ; m3 = src[8-15]
> -
> -    pmaddwd     m4, m2, m0
> -    palignr     m5, m3, m2, 2                ; m5 = src[1-8]
> -    pmaddwd     m5, m0
> -    phaddd      m4, m5
> -
> -    palignr     m5, m3, m2, 4                ; m5 = src[2-9]
> -    pmaddwd     m5, m0
> -    palignr     m6, m3, m2, 6                ; m6 = src[3-10]
> -    pmaddwd     m6, m0
> -    phaddd      m5, m6
> -    phaddd      m4, m5
> -    paddd       m4, m1
> -
> -    palignr     m5, m3, m2, 8                ; m5 = src[4-11]
> -    pmaddwd     m5, m0
> -    palignr     m6, m3, m2, 10               ; m6 = src[5-12]
> -    pmaddwd     m6, m0
> -    phaddd      m5, m6
> -
> -    palignr     m6, m3, m2, 12               ; m6 = src[6-13]
> -    pmaddwd     m6, m0
> -    palignr     m7, m3, m2, 14               ; m2 = src[7-14]
> -    pmaddwd     m7, m0
> -    phaddd      m6, m7
> -    phaddd      m5, m6
> -    paddd       m5, m1
>  %ifidn %3, pp
>      psrad       m4, 6
>      psrad       m5, 6
> @@ -1553,6 +1435,131 @@
>      psrad       m5, 2
>      packssdw    m4, m5
>  %endif
> +
> +    movu        [r2], m4
> +
> +    movu        m2, [r0 + 32]                ; m2 = src[16-23]
> +
> +    pmaddwd     m4, m3, m0                   ; m3 = src[8-15]
> +    palignr     m5, m2, m3, 2                ; m5 = src[9-16]
> +    pmaddwd     m5, m0
> +    phaddd      m4, m5
> +
> +    palignr     m5, m2, m3, 4                ; m5 = src[10-17]
> +    pmaddwd     m5, m0
> +    palignr     m2, m3, 6                    ; m2 = src[11-18]
> +    pmaddwd     m2, m0
> +    phaddd      m5, m2
> +    phaddd      m4, m5
> +    paddd       m4, m1
> +%ifidn %3, pp
> +    psrad       m4, 6
> +    packusdw    m4, m4
> +    pxor        m5, m5
> +    CLIPW       m4, m5, [pw_pixel_max]
> +%else
> +    psrad       m4, 2
> +    packssdw    m4, m4
> +%endif
> +
> +    movh        [r2 + 16], m4
> +
> +    add         r0, r1
> +    add         r2, r3
> +
> +    dec         r4d
> +    jnz         .loopH
> +    RET
> +%endmacro
> +
> +;-------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
> +;-------------------------------------------------------------------------------------------------------------
> +FILTER_HOR_LUMA_W12 12, 16, pp
> +
> +;----------------------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +;----------------------------------------------------------------------------------------------------------------------------
> +FILTER_HOR_LUMA_W12 12, 16, ps
> +
> +;--------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> +;--------------------------------------------------------------------------------------------------------------
> +%macro FILTER_HOR_LUMA_W16 3
> +INIT_XMM sse4
> +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
> +
> +    add         r1, r1
> +    add         r3, r3
> +    mov         r4d, r4m
> +    sub         r0, 6
> +    shl         r4d, 4
> +
> +%ifdef PIC
> +    lea         r6, [tab_LumaCoeff]
> +    mova        m0, [r6 + r4]
> +%else
> +    mova        m0, [tab_LumaCoeff + r4]
> +%endif
> +
> +%ifidn %3, pp
> +    mova        m1, [pd_32]
> +%else
> +    mova        m1, [INTERP_OFFSET_PS]
> +%endif
> +
> +    mov         r4d, %2
> +%ifidn %3, ps
> +    cmp         r5m, byte 0
> +    je          .loopH
> +    lea         r6, [r1 + 2 * r1]
> +    sub         r0, r6
> +    add         r4d, 7
> +%endif
> +
> +.loopH:
> +%assign x 0
> +%rep %1 / 16
> +    movu        m2, [r0 + x]                 ; m2 = src[0-7]
> +    movu        m3, [r0 + 16 + x]            ; m3 = src[8-15]
> +
> +    pmaddwd     m4, m2, m0
> +    palignr     m5, m3, m2, 2                ; m5 = src[1-8]
> +    pmaddwd     m5, m0
> +    phaddd      m4, m5
> +
> +    palignr     m5, m3, m2, 4                ; m5 = src[2-9]
> +    pmaddwd     m5, m0
> +    palignr     m6, m3, m2, 6                ; m6 = src[3-10]
> +    pmaddwd     m6, m0
> +    phaddd      m5, m6
> +    phaddd      m4, m5
> +    paddd       m4, m1
> +
> +    palignr     m5, m3, m2, 8                ; m5 = src[4-11]
> +    pmaddwd     m5, m0
> +    palignr     m6, m3, m2, 10               ; m6 = src[5-12]
> +    pmaddwd     m6, m0
> +    phaddd      m5, m6
> +
> +    palignr     m6, m3, m2, 12               ; m6 = src[6-13]
> +    pmaddwd     m6, m0
> +    palignr     m7, m3, m2, 14               ; m2 = src[7-14]
> +    pmaddwd     m7, m0
> +    phaddd      m6, m7
> +    phaddd      m5, m6
> +    paddd       m5, m1
> +%ifidn %3, pp
> +    psrad       m4, 6
> +    psrad       m5, 6
> +    packusdw    m4, m5
> +    pxor        m5, m5
> +    CLIPW       m4, m5, [pw_pixel_max]
> +%else
> +    psrad       m4, 2
> +    psrad       m5, 2
> +    packssdw    m4, m5
> +%endif
>      movu        [r2 + x], m4
>  
>      movu        m2, [r0 + 32 + x]            ; m2 = src[16-23]
> @@ -1583,7 +1590,7 @@
>      phaddd      m6, m2
>      phaddd      m5, m6
>      paddd       m5, m1
> -%ifidn %3, pp 
> +%ifidn %3, pp
>      psrad       m4, 6
>      psrad       m5, 6
>      packusdw    m4, m5
> @@ -1690,7 +1697,7 @@
>  %else
>      mova        m0, [tab_LumaCoeff + r4]
>  %endif
> -%ifidn %3, pp 
> +%ifidn %3, pp
>      mova        m1, [pd_32]
>  %else
>      mova        m1, [INTERP_OFFSET_PS]
> @@ -1735,7 +1742,7 @@
>      phaddd      m6, m7
>      phaddd      m5, m6
>      paddd       m5, m1
> -%ifidn %3, pp 
> +%ifidn %3, pp
>      psrad       m4, 6
>      psrad       m5, 6
>      packusdw    m4, m5
> @@ -1776,7 +1783,7 @@
>      phaddd      m6, m7
>      phaddd      m5, m6
>      paddd       m5, m1
> -%ifidn %3, pp 
> +%ifidn %3, pp
>      psrad       m4, 6
>      psrad       m5, 6
>      packusdw    m4, m5
> @@ -1817,7 +1824,7 @@
>      phaddd      m6, m7
>      phaddd      m5, m6
>      paddd       m5, m1
> -%ifidn %3, pp 
> +%ifidn %3, pp
>      psrad       m4, 6
>      psrad       m5, 6
>      packusdw    m4, m5
> @@ -2652,7 +2659,7 @@
>      %endif
>  
>      paddd       m3,         m1
> -    psrad       m3,         2
> +    psrad       m3,         INTERP_SHIFT_PS
>      packssdw    m3,         m3
>  
>      %if %1 == 2
> @@ -2683,7 +2690,7 @@
>      FILTER_W%1_2 %3
>  %endrep
>  
> -RET
> +    RET
>  %endmacro
>  
>  FILTER_CHROMA_H 2, 4, pp, 6, 8, 5
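
The psrad 2 -> INTERP_SHIFT_PS substitutions here and below are the
meat of the fix; the hardcoded 2 was only right at 10 bits. As C
restatements (MY_* are hypothetical names mirroring my reading of the
defines, not the real asm constants):

    /* ps path: shift from filter precision to 14-bit internal, per depth */
    #define MY_INTERP_SHIFT_PS(d)   ((d) - 8)              /* 8:0 10:2 12:4 */
    #define MY_INTERP_OFFSET_PS(d)  (-(8192 << ((d) - 8)))
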
> @@ -4084,7 +4091,7 @@
>          %ifidn %3, pp
>              mova      m6, [tab_c_32]
>          %else
> -            mova      m6, [tab_c_524800]
> +            mova      m6, [INTERP_OFFSET_SP]
>          %endif
>      %else
>          mova      m6, [INTERP_OFFSET_PS]
> @@ -4109,10 +4116,10 @@
>      paddd     m1, m6
>      paddd     m2, m6
>      paddd     m3, m6
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -4127,10 +4134,10 @@
>          psrad     m2, 6
>          psrad     m3, 6
>      %else
> -        psrad     m0, 10
> -        psrad     m1, 10
> -        psrad     m2, 10
> -        psrad     m3, 10
> +        psrad     m0, INTERP_SHIFT_SP
> +        psrad     m1, INTERP_SHIFT_SP
> +        psrad     m2, INTERP_SHIFT_SP
> +        psrad     m3, INTERP_SHIFT_SP
>      %endif
>      packssdw  m0, m1
>      packssdw  m2, m3
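
Likewise for the sp path: tab_c_524800 was the 10-bit offset baked into
a table name. If I have the formulas right (finish_sp is a hypothetical
scalar helper, not from the patch):

    #include <stdint.h>

    static int16_t finish_sp(int32_t sum, int bitDepth)
    {
        int shift = 6 + (14 - bitDepth);       /* 10-bit: 10, 12-bit: 8 */
        int32_t offs = (1 << (shift - 1)) + (8192 << 6);
        /* 10-bit: 512 + 524288 = 524800, hence the old tab_c_524800 */
        return (int16_t)((sum + offs) >> shift);   /* CLIPW still follows */
    }
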
> @@ -4707,7 +4714,7 @@
>  %ifidn %3, pp
>      mova      m7, [tab_c_32]
>  %elifidn %3, sp
> -    mova      m7, [tab_c_524800]
> +    mova      m7, [INTERP_OFFSET_SP]
>  %elifidn %3, ps
>      mova      m7, [INTERP_OFFSET_PS]
>  %endif
> @@ -4728,10 +4735,10 @@
>      paddd     m1, m7
>      paddd     m2, m7
>      paddd     m3, m7
> -    psrad     m0, 2
> -    psrad     m1, 2
> -    psrad     m2, 2
> -    psrad     m3, 2
> +    psrad     m0, INTERP_SHIFT_PS
> +    psrad     m1, INTERP_SHIFT_PS
> +    psrad     m2, INTERP_SHIFT_PS
> +    psrad     m3, INTERP_SHIFT_PS
>  
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -4746,10 +4753,10 @@
>          psrad     m2, 6
>          psrad     m3, 6
>      %else
> -        psrad     m0, 10
> -        psrad     m1, 10
> -        psrad     m2, 10
> -        psrad     m3, 10
> +        psrad     m0, INTERP_SHIFT_SP
> +        psrad     m1, INTERP_SHIFT_SP
> +        psrad     m2, INTERP_SHIFT_SP
> +        psrad     m3, INTERP_SHIFT_SP
>      %endif
>      packssdw  m0, m1
>      packssdw  m2, m3
> @@ -5587,7 +5594,7 @@
>  ;-----------------------------------------------------------------------------------------------------------------
>  %macro FILTER_VER_CHROMA_W16_24xN_avx2 3
>  INIT_YMM avx2
> -%if ARCH_X86_64 
> +%if ARCH_X86_64
>  cglobal interp_4tap_vert_%2_24x%1, 5, 7, %3
>      add       r1d, r1d
>      add       r3d, r3d
> @@ -8628,7 +8635,7 @@
>      psrad           m3, 2
>  %endif
>  %endif
> -  
> +
>      packssdw        m0, m3
>  %ifidn %1,pp
>      CLIPW           m0, m1, [pw_pixel_max]
> @@ -9045,14 +9052,14 @@
>  %rep %1/4
>      movh       m0, [r0]
>      movhps     m0, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m1
>      movh       [r2 + r3 * 0], m0
>      movhps     [r2 + r3 * 1], m0
>  
>      movh       m0, [r0 + r1 * 2]
>      movhps     m0, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m1
>      movh       [r2 + r3 * 2], m0
>      movhps     [r2 + r4], m0
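
All of the psllw 4 -> psllw (14 - BIT_DEPTH) changes from here down are
the same pixel-to-short conversion: scale input pixels up to the 14-bit
internal precision, then subtract the internal offset (pw_2000 ==
0x2000 == 8192). A plain-C sketch of one row (p2s_row is a hypothetical
name):

    #include <stdint.h>

    static void p2s_row(const uint16_t *src, int16_t *dst,
                        int width, int bitDepth)
    {
        int shift = 14 - bitDepth;   /* the old hardcoded 4 == 10-bit case */
        for (int x = 0; x < width; x++)
            dst[x] = (int16_t)((src[x] << shift) - 0x2000);  /* pw_2000 */
    }
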
> @@ -9078,11 +9085,10 @@
>  
>      movh       m0, [r0]
>      movhps     m0, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, [pw_2000]
>      movh       [r2 + r3 * 0], m0
>      movhps     [r2 + r3 * 1], m0
> -
>      RET
>  
>  ;-----------------------------------------------------------------------------
> @@ -9106,9 +9112,9 @@
>  .loop
>      movu       m0, [r0]
>      movu       m1, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movh       [r2 + r3 * 0], m0
> @@ -9118,9 +9124,9 @@
>  
>      movu       m0, [r0 + r1 * 2]
>      movu       m1, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movh       [r2 + r3 * 2], m0
> @@ -9158,22 +9164,22 @@
>  
>  .loop
>      movu       m0, [r0]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m1
>      movu       [r2 + r3 * 0], m0
>  
>      movu       m0, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m1
>      movu       [r2 + r3 * 1], m0
>  
>      movu       m0, [r0 + r1 * 2]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m1
>      movu       [r2 + r3 * 2], m0
>  
>      movu       m0, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m1
>      movu       [r2 + r4], m0
>  
> @@ -9203,14 +9209,13 @@
>      movu       m0, [r0]
>      movu       m1, [r0 + r1]
>  
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, [pw_2000]
> -    psllw      m1, 4
>      psubw      m1, [pw_2000]
>  
>      movu       [r2 + r3 * 0], m0
>      movu       [r2 + r3 * 1], m1
> -
>      RET
>  
>  ;-----------------------------------------------------------------------------
> @@ -9232,11 +9237,11 @@
>      movu       m1, [r0 + r1]
>      movu       m2, [r0 + r1 * 2]
>  
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m3
> -    psllw      m1, 4
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m1, m3
> -    psllw      m2, 4
> +    psllw      m2, (14 - BIT_DEPTH)
>      psubw      m2, m3
>  
>      movu       [r2 + r3 * 0], m0
> @@ -9247,18 +9252,17 @@
>      movu       m1, [r0 + r1 * 4]
>      movu       m2, [r0 + r5 ]
>  
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m3
> -    psllw      m1, 4
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m1, m3
> -    psllw      m2, 4
> +    psllw      m2, (14 - BIT_DEPTH)
>      psubw      m2, m3
>  
>      movu       [r2 + r6], m0
>      movu       [r2 + r3 * 4], m1
>      lea        r2, [r2 + r3 * 4]
>      movu       [r2 + r3], m2
> -
>      RET
>  
>  ;-----------------------------------------------------------------------------
> @@ -9282,9 +9286,9 @@
>  .loop
>      movu       m0, [r0]
>      movu       m1, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0], m0
> @@ -9292,9 +9296,9 @@
>  
>      movu       m0, [r0 + r1 * 2]
>      movu       m1, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2], m0
> @@ -9302,9 +9306,9 @@
>  
>      movu       m0, [r0 + 16]
>      movu       m1, [r0 + r1 + 16]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0 + 16], m0
> @@ -9312,9 +9316,9 @@
>  
>      movu       m0, [r0 + r1 * 2 + 16]
>      movu       m1, [r0 + r5 + 16]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2 + 16], m0
> @@ -9356,9 +9360,9 @@
>  .loop
>      movu       m0, [r0]
>      movu       m1, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0], m0
> @@ -9366,9 +9370,9 @@
>  
>      movu       m0, [r0 + r1 * 2]
>      movu       m1, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2], m0
> @@ -9412,13 +9416,13 @@
>      movu       m1, [r0 + r1]
>      movu       m2, [r0 + r1 * 2]
>      movu       m3, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0], m0
> @@ -9430,13 +9434,13 @@
>      movu       m1, [r0 + r1 + 16]
>      movu       m2, [r0 + r1 * 2 + 16]
>      movu       m3, [r0 + r5 + 16]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 16], m0
> @@ -9448,13 +9452,13 @@
>      movu       m1, [r0 + r1 + 32]
>      movu       m2, [r0 + r1 * 2 + 32]
>      movu       m3, [r0 + r5 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 32], m0
> @@ -9466,13 +9470,13 @@
>      movu       m1, [r0 + r1 + 48]
>      movu       m2, [r0 + r1 * 2 + 48]
>      movu       m3, [r0 + r5 + 48]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 48], m0
> @@ -9515,9 +9519,9 @@
>  .loop
>      movu       m0, [r0]
>      movu       m1, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0], m0
> @@ -9525,9 +9529,9 @@
>  
>      movu       m0, [r0 + r1 * 2]
>      movu       m1, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2], m0
> @@ -9535,9 +9539,9 @@
>  
>      movu       m0, [r0 + 32]
>      movu       m1, [r0 + r1 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0 + 32], m0
> @@ -9545,9 +9549,9 @@
>  
>      movu       m0, [r0 + r1 * 2 + 32]
>      movu       m1, [r0 + r5 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2 + 32], m0
> @@ -9590,13 +9594,13 @@
>      movu       m1, [r0 + r1]
>      movu       m2, [r0 + r1 * 2]
>      movu       m3, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0], m0
> @@ -9608,13 +9612,13 @@
>      movu       m1, [r0 + r1 + 16]
>      movu       m2, [r0 + r1 * 2 + 16]
>      movu       m3, [r0 + r5 + 16]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 16], m0
> @@ -9626,13 +9630,13 @@
>      movu       m1, [r0 + r1 + 32]
>      movu       m2, [r0 + r1 * 2 + 32]
>      movu       m3, [r0 + r5 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 32], m0
> @@ -9644,13 +9648,13 @@
>      movu       m1, [r0 + r1 + 48]
>      movu       m2, [r0 + r1 * 2 + 48]
>      movu       m3, [r0 + r5 + 48]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 48], m0
> @@ -9662,13 +9666,13 @@
>      movu       m1, [r0 + r1 + 64]
>      movu       m2, [r0 + r1 * 2 + 64]
>      movu       m3, [r0 + r5 + 64]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 64], m0
> @@ -9680,13 +9684,13 @@
>      movu       m1, [r0 + r1 + 80]
>      movu       m2, [r0 + r1 * 2 + 80]
>      movu       m3, [r0 + r5 + 80]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 80], m0
> @@ -9698,13 +9702,13 @@
>      movu       m1, [r0 + r1 + 96]
>      movu       m2, [r0 + r1 * 2 + 96]
>      movu       m3, [r0 + r5 + 96]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 96], m0
> @@ -9716,13 +9720,13 @@
>      movu       m1, [r0 + r1 + 112]
>      movu       m2, [r0 + r1 * 2 + 112]
>      movu       m3, [r0 + r5 + 112]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 112], m0
> @@ -9763,9 +9767,9 @@
>  .loop
>      movu       m0, [r0]
>      movu       m1, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0], m0
> @@ -9773,9 +9777,9 @@
>  
>      movu       m0, [r0 + r1 * 2]
>      movu       m1, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2], m0
> @@ -9783,9 +9787,9 @@
>  
>      movu       m0, [r0 + 32]
>      movu       m1, [r0 + r1 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0 + 32], m0
> @@ -9793,9 +9797,9 @@
>  
>      movu       m0, [r0 + r1 * 2 + 32]
>      movu       m1, [r0 + r5 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2 + 32], m0
> @@ -9803,9 +9807,9 @@
>  
>      movu       m0, [r0 + 64]
>      movu       m1, [r0 + r1 + 64]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0 + 64], m0
> @@ -9813,9 +9817,9 @@
>  
>      movu       m0, [r0 + r1 * 2 + 64]
>      movu       m1, [r0 + r5 + 64]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2 + 64], m0
> @@ -9823,9 +9827,9 @@
>  
>      movu       m0, [r0 + 96]
>      movu       m1, [r0 + r1 + 96]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0 + 96], m0
> @@ -9833,9 +9837,9 @@
>  
>      movu       m0, [r0 + r1 * 2 + 96]
>      movu       m1, [r0 + r5 + 96]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2 + 96], m0
> @@ -9876,13 +9880,13 @@
>      movu       m1, [r0 + r1]
>      movu       m2, [r0 + r1 * 2]
>      movu       m3, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0], m0
> @@ -9894,13 +9898,13 @@
>      movu       m1, [r0 + r1 + 16]
>      movu       m2, [r0 + r1 * 2 + 16]
>      movu       m3, [r0 + r5 + 16]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 16], m0
> @@ -9912,13 +9916,13 @@
>      movu       m1, [r0 + r1 + 32]
>      movu       m2, [r0 + r1 * 2 + 32]
>      movu       m3, [r0 + r5 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 32], m0
> @@ -9957,36 +9961,36 @@
>  .loop
>      movu       m0, [r0]
>      movu       m1, [r0 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>      movu       [r2 + r3 * 0], m0
>      movu       [r2 + r3 * 0 + 32], xm1
>  
>      movu       m0, [r0 + r1]
>      movu       m1, [r0 + r1 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>      movu       [r2 + r3 * 1], m0
>      movu       [r2 + r3 * 1 + 32], xm1
>  
>      movu       m0, [r0 + r1 * 2]
>      movu       m1, [r0 + r1 * 2 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>      movu       [r2 + r3 * 2], m0
>      movu       [r2 + r3 * 2 + 32], xm1
>  
>      movu       m0, [r0 + r5]
>      movu       m1, [r0 + r5 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>      movu       [r2 + r4], m0
>      movu       [r2 + r4 + 32], xm1
> @@ -10022,9 +10026,9 @@
>  .loop
>      movu       m0, [r0]
>      movu       m1, [r0 + r1]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 0], m0
> @@ -10032,9 +10036,9 @@
>  
>      movu       m0, [r0 + r1 * 2]
>      movu       m1, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
>      psubw      m0, m2
> -    psllw      m1, 4
>      psubw      m1, m2
>  
>      movu       [r2 + r3 * 2], m0
> @@ -10042,7 +10046,7 @@
>  
>      movh       m0, [r0 + 16]
>      movhps     m0, [r0 + r1 + 16]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m2
>  
>      movh       [r2 + r3 * 0 + 16], m0
> @@ -10050,7 +10054,7 @@
>  
>      movh       m0, [r0 + r1 * 2 + 16]
>      movhps     m0, [r0 + r5 + 16]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
>      psubw      m0, m2
>  
>      movh       [r2 + r3 * 2 + 16], m0
> @@ -10088,13 +10092,13 @@
>      movu       m1, [r0 + r1]
>      movu       m2, [r0 + r1 * 2]
>      movu       m3, [r0 + r5]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0], m0
> @@ -10106,13 +10110,13 @@
>      movu       m1, [r0 + r1 + 16]
>      movu       m2, [r0 + r1 * 2 + 16]
>      movu       m3, [r0 + r5 + 16]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 16], m0
> @@ -10124,13 +10128,13 @@
>      movu       m1, [r0 + r1 + 32]
>      movu       m2, [r0 + r1 * 2 + 32]
>      movu       m3, [r0 + r5 + 32]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 32], m0
> @@ -10142,13 +10146,13 @@
>      movu       m1, [r0 + r1 + 48]
>      movu       m2, [r0 + r1 * 2 + 48]
>      movu       m3, [r0 + r5 + 48]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 48], m0
> @@ -10160,13 +10164,13 @@
>      movu       m1, [r0 + r1 + 64]
>      movu       m2, [r0 + r1 * 2 + 64]
>      movu       m3, [r0 + r5 + 64]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 64], m0
> @@ -10178,13 +10182,13 @@
>      movu       m1, [r0 + r1 + 80]
>      movu       m2, [r0 + r1 * 2 + 80]
>      movu       m3, [r0 + r5 + 80]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
> +    psllw      m3, (14 - BIT_DEPTH)
>      psubw      m0, m4
> -    psllw      m1, 4
>      psubw      m1, m4
> -    psllw      m2, 4
>      psubw      m2, m4
> -    psllw      m3, 4
>      psubw      m3, m4
>  
>      movu       [r2 + r3 * 0 + 80], m0
> @@ -10220,11 +10224,11 @@
>      movu       m0, [r0]
>      movu       m1, [r0 + 32]
>      movu       m2, [r0 + 64]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
>      psubw      m0, m3
> -    psllw      m1, 4
>      psubw      m1, m3
> -    psllw      m2, 4
>      psubw      m2, m3
>      movu       [r2 + r3 * 0], m0
>      movu       [r2 + r3 * 0 + 32], m1
> @@ -10233,11 +10237,11 @@
>      movu       m0, [r0 + r1]
>      movu       m1, [r0 + r1 + 32]
>      movu       m2, [r0 + r1 + 64]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
>      psubw      m0, m3
> -    psllw      m1, 4
>      psubw      m1, m3
> -    psllw      m2, 4
>      psubw      m2, m3
>      movu       [r2 + r3 * 1], m0
>      movu       [r2 + r3 * 1 + 32], m1
> @@ -10246,11 +10250,11 @@
>      movu       m0, [r0 + r1 * 2]
>      movu       m1, [r0 + r1 * 2 + 32]
>      movu       m2, [r0 + r1 * 2 + 64]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
>      psubw      m0, m3
> -    psllw      m1, 4
>      psubw      m1, m3
> -    psllw      m2, 4
>      psubw      m2, m3
>      movu       [r2 + r3 * 2], m0
>      movu       [r2 + r3 * 2 + 32], m1
> @@ -10259,11 +10263,11 @@
>      movu       m0, [r0 + r5]
>      movu       m1, [r0 + r5 + 32]
>      movu       m2, [r0 + r5 + 64]
> -    psllw      m0, 4
> +    psllw      m0, (14 - BIT_DEPTH)
> +    psllw      m1, (14 - BIT_DEPTH)
> +    psllw      m2, (14 - BIT_DEPTH)
>      psubw      m0, m3
> -    psllw      m1, 4
>      psubw      m1, m3
> -    psllw      m2, 4
>      psubw      m2, m3
>      movu       [r2 + r4], m0
>      movu       [r2 + r4 + 32], m1
> @@ -10797,7 +10801,7 @@
>      pmaddwd             m6, m0
>      pmaddwd             m5, m1
>      paddd               m6, m5
> -   
> +
>      phaddd              m6, m6
>      vpermq              m6, m6, q3120
>      paddd               xm6, xm2
> @@ -12115,7 +12119,7 @@
>  %endmacro
>  
>  FILTER_VER_CHROMA_AVX2_4x4 pp, 1, 6
> -FILTER_VER_CHROMA_AVX2_4x4 ps, 0, 2 
> +FILTER_VER_CHROMA_AVX2_4x4 ps, 0, 2
>  FILTER_VER_CHROMA_AVX2_4x4 sp, 1, 10
>  FILTER_VER_CHROMA_AVX2_4x4 ss, 0, 6
>  
> 

-- 
Steve Borho

