[x265] [PATCH 1 of 5] asm: fix Main12 assembly up to SSSE3
Steve Borho
steve at borho.org
Sat Jul 11 19:35:18 CEST 2015
On 07/10, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1436569064 25200
> # Node ID 8f60362f8555c11a14301737c0301fd6e9303448
> # Parent 7b3e1372bb28830ef0ab44cd652ecbe823573675
> asm: fix Main12 assembly up to SSSE3
Series queued, along with some Main12 and multilib testbench fixes.
On Mac, I'm not making it up to SSSE3 yet:
$ ./test/TestBench --cpu SSSE3
Using random seed 55A152EF 12bit
Testing primitives: SSE2
sad[ 8x8]: failed!
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/vec/dct-sse3.cpp
> --- a/source/common/vec/dct-sse3.cpp Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/vec/dct-sse3.cpp Fri Jul 10 15:57:44 2015 -0700
> @@ -38,13 +38,8 @@
> #define SHIFT1 7
> #define ADD1 64
>
> -#if HIGH_BIT_DEPTH
> -#define SHIFT2 10
> -#define ADD2 512
> -#else
> -#define SHIFT2 12
> -#define ADD2 2048
> -#endif
> +#define SHIFT2 (12 - (X265_DEPTH - 8))
> +#define ADD2 (1 << ((SHIFT2) - 1))
>
> ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
> {
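
For the record, the generic SHIFT2/ADD2 reduce to the old hard-coded pairs at
8- and 10-bit and give 8/128 for Main12. A quick standalone check (plain C,
just a sketch, not part of the patch):

    #include <assert.h>
    #define SHIFT2(depth) (12 - ((depth) - 8))
    #define ADD2(depth)   (1 << (SHIFT2(depth) - 1))

    int main(void)
    {
        assert(SHIFT2(8)  == 12 && ADD2(8)  == 2048); /* old non-HBD values   */
        assert(SHIFT2(10) == 10 && ADD2(10) == 512);  /* old HIGH_BIT_DEPTH   */
        assert(SHIFT2(12) ==  8 && ADD2(12) == 128);  /* new Main12 values    */
        return 0;
    }
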
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/vec/dct-ssse3.cpp
> --- a/source/common/vec/dct-ssse3.cpp Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/vec/dct-ssse3.cpp Fri Jul 10 15:57:44 2015 -0700
> @@ -34,6 +34,18 @@
> #include <pmmintrin.h> // SSE3
> #include <tmmintrin.h> // SSSE3
>
> +#define DCT16_SHIFT1 (3 + X265_DEPTH - 8)
> +#define DCT16_ADD1 (1 << ((DCT16_SHIFT1) - 1))
> +
> +#define DCT16_SHIFT2 10
> +#define DCT16_ADD2 (1 << ((DCT16_SHIFT2) - 1))
> +
> +#define DCT32_SHIFT1 (DCT16_SHIFT1 + 1)
> +#define DCT32_ADD1 (1 << ((DCT32_SHIFT1) - 1))
> +
> +#define DCT32_SHIFT2 (DCT16_SHIFT2 + 1)
> +#define DCT32_ADD2 (1 << ((DCT32_SHIFT2) - 1))
> +
> using namespace X265_NS;
>
> ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
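
Same pattern on the forward side: DCT16_SHIFT1/DCT32_SHIFT1 reproduce the old
3/5 and 4/6 at 8- and 10-bit and extend to 7/8 for Main12, while the
second-pass shifts stay fixed at 10/11.
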
> @@ -100,20 +112,9 @@
>
> static void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
> {
> -#if HIGH_BIT_DEPTH
> -#define SHIFT1 5
> -#define ADD1 16
> -#else
> -#define SHIFT1 3
> -#define ADD1 4
> -#endif
> -
> -#define SHIFT2 10
> -#define ADD2 512
> -
> // Const
> - __m128i c_4 = _mm_set1_epi32(ADD1);
> - __m128i c_512 = _mm_set1_epi32(ADD2);
> + __m128i c_4 = _mm_set1_epi32(DCT16_ADD1);
> + __m128i c_512 = _mm_set1_epi32(DCT16_ADD2);
>
> int i;
>
> @@ -201,29 +202,29 @@
>
> T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
> T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> - T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
> T70 = _mm_packs_epi32(T60, T61);
> _mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
>
> T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
> T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> - T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
> T70 = _mm_packs_epi32(T60, T61);
> _mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
>
> T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
> T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> - T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
> T70 = _mm_packs_epi32(T60, T61);
> _mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
>
> T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
> T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> - T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
> T70 = _mm_packs_epi32(T60, T61);
> _mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
>
> @@ -233,8 +234,8 @@
> T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
> T60 = _mm_hadd_epi32(T60, T61);
> T61 = _mm_hadd_epi32(T62, T63);
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> - T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
> T70 = _mm_packs_epi32(T60, T61);
> _mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
>
> @@ -244,8 +245,8 @@
> T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
> T60 = _mm_hadd_epi32(T60, T61);
> T61 = _mm_hadd_epi32(T62, T63);
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> - T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
> T70 = _mm_packs_epi32(T60, T61);
> _mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
>
> @@ -255,8 +256,8 @@
> T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
> T60 = _mm_hadd_epi32(T60, T61);
> T61 = _mm_hadd_epi32(T62, T63);
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> - T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
> T70 = _mm_packs_epi32(T60, T61);
> _mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);
>
> @@ -266,8 +267,8 @@
> T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
> T60 = _mm_hadd_epi32(T60, T61);
> T61 = _mm_hadd_epi32(T62, T63);
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
> - T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
> + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
> T70 = _mm_packs_epi32(T60, T61);
> _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);
>
> @@ -286,8 +287,8 @@
> T63 = _mm_hadd_epi32(T66, T67); \
> T60 = _mm_hadd_epi32(T60, T61); \
> T61 = _mm_hadd_epi32(T62, T63); \
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1); \
> - T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1); \
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1); \
> + T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1); \
> T70 = _mm_packs_epi32(T60, T61); \
> _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
>
> @@ -351,8 +352,8 @@
>
> T40 = _mm_hadd_epi32(T30, T31);
> T41 = _mm_hsub_epi32(T30, T31);
> - T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> - T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), SHIFT2);
> + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
> + T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), DCT16_SHIFT2);
> T40 = _mm_packs_epi32(T40, T40);
> T41 = _mm_packs_epi32(T41, T41);
> _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
> @@ -376,7 +377,7 @@
> T31 = _mm_hadd_epi32(T32, T33);
>
> T40 = _mm_hadd_epi32(T30, T31);
> - T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
> T40 = _mm_packs_epi32(T40, T40);
> _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
>
> @@ -398,7 +399,7 @@
> T31 = _mm_hadd_epi32(T32, T33);
>
> T40 = _mm_hadd_epi32(T30, T31);
> - T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
> T40 = _mm_packs_epi32(T40, T40);
> _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
>
> @@ -420,7 +421,7 @@
> T31 = _mm_hadd_epi32(T32, T33);
>
> T40 = _mm_hadd_epi32(T30, T31);
> - T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
> T40 = _mm_packs_epi32(T40, T40);
> _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
>
> @@ -442,7 +443,7 @@
> T31 = _mm_hadd_epi32(T32, T33);
>
> T40 = _mm_hadd_epi32(T30, T31);
> - T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
> T40 = _mm_packs_epi32(T40, T40);
> _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
>
> @@ -464,7 +465,7 @@
> T31 = _mm_hadd_epi32(T32, T33);
>
> T40 = _mm_hadd_epi32(T30, T31);
> - T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
> T40 = _mm_packs_epi32(T40, T40);
> _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
>
> @@ -486,7 +487,7 @@
> T31 = _mm_hadd_epi32(T32, T33);
>
> T40 = _mm_hadd_epi32(T30, T31);
> - T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
> + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
> T40 = _mm_packs_epi32(T40, T40);
> _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
>
> @@ -509,7 +510,7 @@
> T31 = _mm_hadd_epi32(T32, T33); \
> \
> T40 = _mm_hadd_epi32(T30, T31); \
> - T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2); \
> + T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2); \
> T40 = _mm_packs_epi32(T40, T40); \
> _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
>
> @@ -523,10 +524,6 @@
> MAKE_ODD(28, 15);
> #undef MAKE_ODD
> }
> -#undef SHIFT1
> -#undef ADD1
> -#undef SHIFT2
> -#undef ADD2
> }
>
> ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) =
> @@ -681,20 +678,9 @@
>
> static void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
> {
> -#if HIGH_BIT_DEPTH
> -#define SHIFT1 6
> -#define ADD1 32
> -#else
> -#define SHIFT1 4
> -#define ADD1 8
> -#endif
> -
> -#define SHIFT2 11
> -#define ADD2 1024
> -
> // Const
> - __m128i c_8 = _mm_set1_epi32(ADD1);
> - __m128i c_1024 = _mm_set1_epi32(ADD2);
> + __m128i c_8 = _mm_set1_epi32(DCT32_ADD1);
> + __m128i c_1024 = _mm_set1_epi32(DCT32_ADD2);
>
> int i;
>
> @@ -839,15 +825,15 @@
>
> T50 = _mm_hadd_epi32(T40, T41);
> T51 = _mm_hadd_epi32(T42, T43);
> - T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
> - T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
> + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
> + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
> T60 = _mm_packs_epi32(T50, T51);
> im[0][i] = T60;
>
> T50 = _mm_hsub_epi32(T40, T41);
> T51 = _mm_hsub_epi32(T42, T43);
> - T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
> - T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
> + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
> + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
> T60 = _mm_packs_epi32(T50, T51);
> im[16][i] = T60;
>
> @@ -867,8 +853,8 @@
>
> T50 = _mm_hadd_epi32(T40, T41);
> T51 = _mm_hadd_epi32(T42, T43);
> - T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
> - T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
> + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
> + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
> T60 = _mm_packs_epi32(T50, T51);
> im[8][i] = T60;
>
> @@ -888,8 +874,8 @@
>
> T50 = _mm_hadd_epi32(T40, T41);
> T51 = _mm_hadd_epi32(T42, T43);
> - T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
> - T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
> + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
> + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
> T60 = _mm_packs_epi32(T50, T51);
> im[24][i] = T60;
>
> @@ -910,8 +896,8 @@
> \
> T50 = _mm_hadd_epi32(T40, T41); \
> T51 = _mm_hadd_epi32(T42, T43); \
> - T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
> - T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
> + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1); \
> + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1); \
> T60 = _mm_packs_epi32(T50, T51); \
> im[(dstPos)][i] = T60;
>
> @@ -973,8 +959,8 @@
> \
> T50 = _mm_hadd_epi32(T50, T51); \
> T51 = _mm_hadd_epi32(T52, T53); \
> - T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
> - T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
> + T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1); \
> + T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1); \
> T60 = _mm_packs_epi32(T50, T51); \
> im[(dstPos)][i] = T60;
>
> @@ -1082,7 +1068,7 @@
> \
> T60 = _mm_hadd_epi32(T60, T61); \
> \
> - T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), SHIFT2); \
> + T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), DCT32_SHIFT2); \
> T60 = _mm_packs_epi32(T60, T60); \
> _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
>
> @@ -1124,10 +1110,6 @@
> MAKE_ODD(158, 159, 160, 161, 31);
> #undef MAKE_ODD
> }
> -#undef SHIFT1
> -#undef ADD1
> -#undef SHIFT2
> -#undef ADD2
> }
>
> namespace X265_NS {
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/x86/const-a.asm Fri Jul 10 15:57:44 2015 -0700
> @@ -125,6 +125,7 @@
> const pd_2048, times 4 dd 2048
> const pd_ffff, times 4 dd 0xffff
> const pd_32767, times 4 dd 32767
> +const pd_524416, times 4 dd 524416
> const pd_n32768, times 8 dd 0xffff8000
> const pd_n131072, times 4 dd 0xfffe0000
>
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/x86/dct8.asm Fri Jul 10 15:57:44 2015 -0700
> @@ -332,23 +332,48 @@
> cextern pd_2048
> cextern pw_ppppmmmm
> cextern trans8_shuf
> +
> +
> +%if BIT_DEPTH == 12
> + %define DCT4_SHIFT 5
> + %define DCT4_ROUND 16
> + %define IDCT_SHIFT 8
> + %define IDCT_ROUND 128
> + %define DST4_SHIFT 5
> + %define DST4_ROUND 16
> + %define DCT8_SHIFT1 6
> + %define DCT8_ROUND1 32
> +%elif BIT_DEPTH == 10
> + %define DCT4_SHIFT 3
> + %define DCT4_ROUND 4
> + %define IDCT_SHIFT 10
> + %define IDCT_ROUND 512
> + %define DST4_SHIFT 3
> + %define DST4_ROUND 4
> + %define DCT8_SHIFT1 4
> + %define DCT8_ROUND1 8
> +%elif BIT_DEPTH == 8
> + %define DCT4_SHIFT 1
> + %define DCT4_ROUND 1
> + %define IDCT_SHIFT 12
> + %define IDCT_ROUND 2048
> + %define DST4_SHIFT 1
> + %define DST4_ROUND 1
> + %define DCT8_SHIFT1 2
> + %define DCT8_ROUND1 2
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> +%define DCT8_ROUND2 256
> +%define DCT8_SHIFT2 9
> +
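
Nice to see the per-BIT_DEPTH constants collected into one table. If I'm
reading it right, every row follows ROUND = 1 << (SHIFT - 1), with the forward
shifts growing by (BIT_DEPTH - 8) and IDCT_SHIFT = 20 - BIT_DEPTH. A quick
standalone check (plain C, hypothetical macro names, just a sketch):

    #include <assert.h>
    /* mirror the table above; these helpers are not part of the patch */
    #define DCT4_SHIFT(b)  (1 + ((b) - 8))
    #define DCT8_SHIFT1(b) (2 + ((b) - 8))
    #define IDCT_SHIFT(b)  (20 - (b))
    #define ROUND(s)       (1 << ((s) - 1))

    int main(void)
    {
        assert(DCT4_SHIFT(12)  == 5  && ROUND(DCT4_SHIFT(12))  == 16);
        assert(DCT8_SHIFT1(10) == 4  && ROUND(DCT8_SHIFT1(10)) == 8);
        assert(IDCT_SHIFT(8)   == 12 && ROUND(IDCT_SHIFT(8))   == 2048);
        return 0;
    }
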
> ;------------------------------------------------------
> ;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
> ;------------------------------------------------------
> INIT_XMM sse2
> cglobal dct4, 3, 4, 8
> -%if BIT_DEPTH == 12
> - %define DCT_SHIFT 5
> - mova m7, [pd_16]
> -%elif BIT_DEPTH == 10
> - %define DCT_SHIFT 3
> - mova m7, [pd_4]
> -%elif BIT_DEPTH == 8
> - %define DCT_SHIFT 1
> - mova m7, [pd_1]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> + mova m7, [pd_ %+ DCT4_ROUND]
> add r2d, r2d
> lea r3, [tab_dct4]
>
> @@ -375,19 +400,19 @@
> psubw m2, m0
> pmaddwd m0, m1, m4
> paddd m0, m7
> - psrad m0, DCT_SHIFT
> + psrad m0, DCT4_SHIFT
> pmaddwd m3, m2, m5
> paddd m3, m7
> - psrad m3, DCT_SHIFT
> + psrad m3, DCT4_SHIFT
> packssdw m0, m3
> pshufd m0, m0, 0xD8
> pshufhw m0, m0, 0xB1
> pmaddwd m1, m6
> paddd m1, m7
> - psrad m1, DCT_SHIFT
> + psrad m1, DCT4_SHIFT
> pmaddwd m2, [r3 + 3 * 16]
> paddd m2, m7
> - psrad m2, DCT_SHIFT
> + psrad m2, DCT4_SHIFT
> packssdw m1, m2
> pshufd m1, m1, 0xD8
> pshufhw m1, m1, 0xB1
> @@ -434,18 +459,7 @@
> ; - r2: source stride
> INIT_YMM avx2
> cglobal dct4, 3, 4, 8, src, dst, srcStride
> -%if BIT_DEPTH == 12
> - %define DCT_SHIFT 5
> - vbroadcasti128 m7, [pd_16]
> -%elif BIT_DEPTH == 10
> - %define DCT_SHIFT 3
> - vbroadcasti128 m7, [pd_4]
> -%elif BIT_DEPTH == 8
> - %define DCT_SHIFT 1
> - vbroadcasti128 m7, [pd_1]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> + vbroadcasti128 m7, [pd_ %+ DCT4_ROUND]
> add r2d, r2d
> lea r3, [avx2_dct4]
>
> @@ -467,11 +481,11 @@
>
> pmaddwd m2, m5
> paddd m2, m7
> - psrad m2, DCT_SHIFT
> + psrad m2, DCT4_SHIFT
>
> pmaddwd m0, m6
> paddd m0, m7
> - psrad m0, DCT_SHIFT
> + psrad m0, DCT4_SHIFT
>
> packssdw m2, m0
> pshufb m2, m4
> @@ -499,33 +513,19 @@
> ;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
> ;-------------------------------------------------------
> INIT_XMM sse2
> -cglobal idct4, 3, 4, 7
> -%if BIT_DEPTH == 12
> - %define IDCT4_OFFSET [pd_128]
> - %define IDCT4_SHIFT 8
> -%elif BIT_DEPTH == 10
> - %define IDCT4_OFFSET [pd_512]
> - %define IDCT4_SHIFT 10
> -%elif BIT_DEPTH == 8
> - %define IDCT4_OFFSET [pd_2048]
> - %define IDCT4_SHIFT 12
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> +cglobal idct4, 3, 4, 6
> add r2d, r2d
> lea r3, [tab_dct4]
>
> - mova m6, [pd_64]
> -
> movu m0, [r0 + 0 * 16]
> movu m1, [r0 + 1 * 16]
>
> punpcklwd m2, m0, m1
> pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1
> - paddd m3, m6
> + paddd m3, [pd_64]
>
> pmaddwd m2, [r3 + 2 * 16] ; m2 = E2
> - paddd m2, m6
> + paddd m2, [pd_64]
>
> punpckhwd m0, m1
> pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
> @@ -549,29 +549,27 @@
> punpcklwd m0, m1, m4 ; m0 = m128iA
> punpckhwd m1, m4 ; m1 = m128iD
>
> - mova m6, IDCT4_OFFSET
> -
> punpcklwd m2, m0, m1
> pmaddwd m3, m2, [r3 + 0 * 16]
> - paddd m3, m6 ; m3 = E1
> + paddd m3, [pd_ %+ IDCT_ROUND] ; m3 = E1
>
> pmaddwd m2, [r3 + 2 * 16]
> - paddd m2, m6 ; m2 = E2
> + paddd m2, [pd_ %+ IDCT_ROUND] ; m2 = E2
>
> punpckhwd m0, m1
> pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
> pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
>
> paddd m4, m3, m1
> - psrad m4, IDCT4_SHIFT ; m4 = m128iA
> + psrad m4, IDCT_SHIFT ; m4 = m128iA
> paddd m5, m2, m0
> - psrad m5, IDCT4_SHIFT
> + psrad m5, IDCT_SHIFT
> packssdw m4, m5 ; m4 = m128iA
>
> psubd m2, m0
> - psrad m2, IDCT4_SHIFT
> + psrad m2, IDCT_SHIFT
> psubd m3, m1
> - psrad m3, IDCT4_SHIFT
> + psrad m3, IDCT_SHIFT
> packssdw m2, m3 ; m2 = m128iD
>
> punpcklwd m1, m4, m2
> @@ -585,7 +583,6 @@
> movlps [r1 + 2 * r2], m1
> lea r1, [r1 + 2 * r2]
> movhps [r1 + r2], m1
> -
> RET
>
> ;------------------------------------------------------
> @@ -606,18 +603,7 @@
> %define coef3 [r3 + 3 * 16]
> %endif ; ARCH_X86_64
>
> -%if BIT_DEPTH == 12
> - %define DST_SHIFT 5
> - mova m5, [pd_16]
> -%elif BIT_DEPTH == 10
> - %define DST_SHIFT 3
> - mova m5, [pd_4]
> -%elif BIT_DEPTH == 8
> - %define DST_SHIFT 1
> - mova m5, [pd_1]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> + mova m5, [pd_ %+ DST4_ROUND]
> add r2d, r2d
> lea r3, [tab_dst4]
> %if ARCH_X86_64
> @@ -641,7 +627,7 @@
> pshufd m3, m3, q3120
> punpcklqdq m2, m3
> paddd m2, m5
> - psrad m2, DST_SHIFT
> + psrad m2, DST4_SHIFT
> pmaddwd m3, m0, coef1
> pmaddwd m4, m1, coef1
> pshufd m6, m4, q2301
> @@ -652,7 +638,7 @@
> pshufd m3, m3, q3120
> punpcklqdq m3, m4
> paddd m3, m5
> - psrad m3, DST_SHIFT
> + psrad m3, DST4_SHIFT
> packssdw m2, m3 ; m2 = T70
> pmaddwd m3, m0, coef2
> pmaddwd m4, m1, coef2
> @@ -664,7 +650,7 @@
> pshufd m3, m3, q3120
> punpcklqdq m3, m4
> paddd m3, m5
> - psrad m3, DST_SHIFT
> + psrad m3, DST4_SHIFT
> pmaddwd m0, coef3
> pmaddwd m1, coef3
> pshufd m6, m0, q2301
> @@ -675,7 +661,7 @@
> pshufd m1, m1, q3120
> punpcklqdq m0, m1
> paddd m0, m5
> - psrad m0, DST_SHIFT
> + psrad m0, DST4_SHIFT
> packssdw m3, m0 ; m3 = T71
> mova m5, [pd_128]
>
> @@ -730,7 +716,6 @@
> psrad m2, 8
> packssdw m0, m2
> movu [r1 + 1 * 16], m0
> -
> RET
>
> ;------------------------------------------------------
> @@ -749,13 +734,7 @@
> %define coef0 m6
> %define coef1 m7
>
> -%if BIT_DEPTH == 8
> - %define DST_SHIFT 1
> - mova m5, [pd_1]
> -%elif BIT_DEPTH == 10
> - %define DST_SHIFT 3
> - mova m5, [pd_4]
> -%endif
> + mova m5, [pd_ %+ DST4_ROUND]
> add r2d, r2d
> lea r3, [tab_dst4]
> mova coef0, [r3 + 0 * 16]
> @@ -775,23 +754,23 @@
> pmaddwd m3, m1, coef0
> phaddd m2, m3
> paddd m2, m5
> - psrad m2, DST_SHIFT
> + psrad m2, DST4_SHIFT
> pmaddwd m3, m0, coef1
> pmaddwd m4, m1, coef1
> phaddd m3, m4
> paddd m3, m5
> - psrad m3, DST_SHIFT
> + psrad m3, DST4_SHIFT
> packssdw m2, m3 ; m2 = T70
> pmaddwd m3, m0, coef2
> pmaddwd m4, m1, coef2
> phaddd m3, m4
> paddd m3, m5
> - psrad m3, DST_SHIFT
> + psrad m3, DST4_SHIFT
> pmaddwd m0, coef3
> pmaddwd m1, coef3
> phaddd m0, m1
> paddd m0, m5
> - psrad m0, DST_SHIFT
> + psrad m0, DST4_SHIFT
> packssdw m3, m0 ; m3 = T71
> mova m5, [pd_128]
>
> @@ -822,7 +801,6 @@
> psrad m2, 8
> packssdw m0, m2
> movu [r1 + 1 * 16], m0
> -
> RET
>
> ;------------------------------------------------------------------
> @@ -830,13 +808,7 @@
> ;------------------------------------------------------------------
> INIT_YMM avx2
> cglobal dst4, 3, 4, 6
> -%if BIT_DEPTH == 8
> - %define DST_SHIFT 1
> - vpbroadcastd m5, [pd_1]
> -%elif BIT_DEPTH == 10
> - %define DST_SHIFT 3
> - vpbroadcastd m5, [pd_4]
> -%endif
> + vbroadcasti128 m5, [pd_ %+ DST4_ROUND]
> mova m4, [trans8_shuf]
> add r2d, r2d
> lea r3, [pw_dst4_tab]
> @@ -853,12 +825,12 @@
> pmaddwd m1, m0, [r3 + 1 * 32]
> phaddd m2, m1
> paddd m2, m5
> - psrad m2, DST_SHIFT
> + psrad m2, DST4_SHIFT
> pmaddwd m3, m0, [r3 + 2 * 32]
> pmaddwd m1, m0, [r3 + 3 * 32]
> phaddd m3, m1
> paddd m3, m5
> - psrad m3, DST_SHIFT
> + psrad m3, DST4_SHIFT
> packssdw m2, m3
> vpermd m2, m4, m2
>
> @@ -883,18 +855,7 @@
> ;-------------------------------------------------------
> INIT_XMM sse2
> cglobal idst4, 3, 4, 7
> -%if BIT_DEPTH == 12
> - mova m6, [pd_128]
> - %define IDCT4_SHIFT 8
> -%elif BIT_DEPTH == 10
> - mova m6, [pd_512]
> - %define IDCT4_SHIFT 10
> -%elif BIT_DEPTH == 8
> - mova m6, [pd_2048]
> - %define IDCT4_SHIFT 12
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> + mova m6, [pd_ %+ IDCT_ROUND]
> add r2d, r2d
> lea r3, [tab_idst4]
> mova m5, [pd_64]
> @@ -942,23 +903,23 @@
> pmaddwd m3, m2, [r3 + 1 * 16]
> paddd m0, m3
> paddd m0, m6
> - psrad m0, IDCT4_SHIFT ; m0 = S0
> + psrad m0, IDCT_SHIFT ; m0 = S0
> pmaddwd m3, m1, [r3 + 2 * 16]
> pmaddwd m4, m2, [r3 + 3 * 16]
> paddd m3, m4
> paddd m3, m6
> - psrad m3, IDCT4_SHIFT ; m3 = S8
> + psrad m3, IDCT_SHIFT ; m3 = S8
> packssdw m0, m3 ; m0 = m128iA
> pmaddwd m3, m1, [r3 + 4 * 16]
> pmaddwd m4, m2, [r3 + 5 * 16]
> paddd m3, m4
> paddd m3, m6
> - psrad m3, IDCT4_SHIFT ; m3 = S0
> + psrad m3, IDCT_SHIFT ; m3 = S0
> pmaddwd m1, [r3 + 6 * 16]
> pmaddwd m2, [r3 + 7 * 16]
> paddd m1, m2
> paddd m1, m6
> - psrad m1, IDCT4_SHIFT ; m1 = S8
> + psrad m1, IDCT_SHIFT ; m1 = S8
> packssdw m3, m1 ; m3 = m128iD
> punpcklwd m1, m0, m3
> punpckhwd m0, m3
> @@ -978,18 +939,7 @@
> ;-----------------------------------------------------------------
> INIT_YMM avx2
> cglobal idst4, 3, 4, 6
> -%if BIT_DEPTH == 12
> - vpbroadcastd m4, [pd_256]
> - %define IDCT4_SHIFT 8
> -%elif BIT_DEPTH == 10
> - vpbroadcastd m4, [pd_512]
> - %define IDCT4_SHIFT 10
> -%elif BIT_DEPTH == 8
> - vpbroadcastd m4, [pd_2048]
> - %define IDCT4_SHIFT 12
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> + vbroadcasti128 m4, [pd_ %+ IDCT_ROUND]
> add r2d, r2d
> lea r3, [pw_idst4_tab]
>
> @@ -1030,12 +980,12 @@
> pmaddwd m3, m2, [r3 + 1 * 32]
> paddd m0, m3
> paddd m0, m4
> - psrad m0, IDCT4_SHIFT
> + psrad m0, IDCT_SHIFT
> pmaddwd m3, m1, [r3 + 2 * 32]
> pmaddwd m2, m2, [r3 + 3 * 32]
> paddd m3, m2
> paddd m3, m4
> - psrad m3, IDCT4_SHIFT
> + psrad m3, IDCT_SHIFT
>
> packssdw m0, m3
> pshufb m1, m0, [pb_idst4_shuf]
> @@ -1066,20 +1016,6 @@
> ; ...
> ; Row6[4-7] Row7[4-7]
> ;------------------------
> -%if BIT_DEPTH == 12
> - %define DCT_SHIFT1 6
> - %define DCT_ADD1 [pd_32]
> -%elif BIT_DEPTH == 10
> - %define DCT_SHIFT1 4
> - %define DCT_ADD1 [pd_8]
> -%elif BIT_DEPTH == 8
> - %define DCT_SHIFT1 2
> - %define DCT_ADD1 [pd_2]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -%define DCT_ADD2 [pd_256]
> -%define DCT_SHIFT2 9
>
> add r2, r2
> lea r3, [r2 * 3]
> @@ -1125,8 +1061,8 @@
> punpckhqdq m7, m5
> punpcklqdq m1, m5
> paddd m1, m7
> - paddd m1, DCT_ADD1
> - psrad m1, DCT_SHIFT1
> + paddd m1, [pd_ %+ DCT8_ROUND1]
> + psrad m1, DCT8_SHIFT1
> %if x == 1
> pshufd m1, m1, 0x1B
> %endif
> @@ -1140,8 +1076,8 @@
> punpckhqdq m7, m5
> punpcklqdq m1, m5
> paddd m1, m7
> - paddd m1, DCT_ADD1
> - psrad m1, DCT_SHIFT1
> + paddd m1, [pd_ %+ DCT8_ROUND1]
> + psrad m1, DCT8_SHIFT1
> %if x == 1
> pshufd m1, m1, 0x1B
> %endif
> @@ -1155,8 +1091,8 @@
> punpckhqdq m7, m5
> punpcklqdq m1, m5
> paddd m1, m7
> - paddd m1, DCT_ADD1
> - psrad m1, DCT_SHIFT1
> + paddd m1, [pd_ %+ DCT8_ROUND1]
> + psrad m1, DCT8_SHIFT1
> %if x == 1
> pshufd m1, m1, 0x1B
> %endif
> @@ -1170,8 +1106,8 @@
> punpckhqdq m7, m0
> punpcklqdq m4, m0
> paddd m4, m7
> - paddd m4, DCT_ADD1
> - psrad m4, DCT_SHIFT1
> + paddd m4, [pd_ %+ DCT8_ROUND1]
> + psrad m4, DCT8_SHIFT1
> %if x == 1
> pshufd m4, m4, 0x1B
> %endif
> @@ -1189,29 +1125,29 @@
> pshuflw m2, m2, 0xD8
> pshufhw m2, m2, 0xD8
> pmaddwd m3, m0, [r4 + 0*16]
> - paddd m3, DCT_ADD1
> - psrad m3, DCT_SHIFT1
> + paddd m3, [pd_ %+ DCT8_ROUND1]
> + psrad m3, DCT8_SHIFT1
> %if x == 1
> pshufd m3, m3, 0x1B
> %endif
> mova [r5 + 0*2*mmsize], m3 ; Row 0
> pmaddwd m0, [r4 + 2*16]
> - paddd m0, DCT_ADD1
> - psrad m0, DCT_SHIFT1
> + paddd m0, [pd_ %+ DCT8_ROUND1]
> + psrad m0, DCT8_SHIFT1
> %if x == 1
> pshufd m0, m0, 0x1B
> %endif
> mova [r5 + 4*2*mmsize], m0 ; Row 4
> pmaddwd m3, m2, [r4 + 1*16]
> - paddd m3, DCT_ADD1
> - psrad m3, DCT_SHIFT1
> + paddd m3, [pd_ %+ DCT8_ROUND1]
> + psrad m3, DCT8_SHIFT1
> %if x == 1
> pshufd m3, m3, 0x1B
> %endif
> mova [r5 + 2*2*mmsize], m3 ; Row 2
> pmaddwd m2, [r4 + 3*16]
> - paddd m2, DCT_ADD1
> - psrad m2, DCT_SHIFT1
> + paddd m2, [pd_ %+ DCT8_ROUND1]
> + psrad m2, DCT8_SHIFT1
> %if x == 1
> pshufd m2, m2, 0x1B
> %endif
> @@ -1271,16 +1207,16 @@
> punpckhqdq m7, m5
> punpcklqdq m3, m5
> paddd m3, m7 ; m3 = [Row2 Row0]
> - paddd m3, DCT_ADD2
> - psrad m3, DCT_SHIFT2
> + paddd m3, [pd_ %+ DCT8_ROUND2]
> + psrad m3, DCT8_SHIFT2
> pshufd m4, m4, 0xD8
> pshufd m2, m2, 0xD8
> mova m7, m4
> punpckhqdq m7, m2
> punpcklqdq m4, m2
> psubd m4, m7 ; m4 = [Row6 Row4]
> - paddd m4, DCT_ADD2
> - psrad m4, DCT_SHIFT2
> + paddd m4, [pd_ %+ DCT8_ROUND2]
> + psrad m4, DCT8_SHIFT2
>
> packssdw m3, m3
> movd [r1 + 0*mmsize], m3
> @@ -1341,8 +1277,8 @@
> punpckhqdq m7, m4
> punpcklqdq m2, m4
> paddd m2, m7 ; m2 = [Row3 Row1]
> - paddd m2, DCT_ADD2
> - psrad m2, DCT_SHIFT2
> + paddd m2, [pd_ %+ DCT8_ROUND2]
> + psrad m2, DCT8_SHIFT2
>
> packssdw m2, m2
> movd [r1 + 1*mmsize], m2
> @@ -1397,8 +1333,8 @@
> punpckhqdq m7, m4
> punpcklqdq m2, m4
> paddd m2, m7 ; m2 = [Row7 Row5]
> - paddd m2, DCT_ADD2
> - psrad m2, DCT_SHIFT2
> + paddd m2, [pd_ %+ DCT8_ROUND2]
> + psrad m2, DCT8_SHIFT2
>
> packssdw m2, m2
> movd [r1 + 5*mmsize], m2
> @@ -1412,10 +1348,6 @@
> %endrep
>
> RET
> -%undef IDCT_SHIFT1
> -%undef IDCT_ADD1
> -%undef IDCT_SHIFT2
> -%undef IDCT_ADD2
>
> ;-------------------------------------------------------
> ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
> @@ -1432,18 +1364,7 @@
> ; ...
> ; Row6[4-7] Row7[4-7]
> ;------------------------
> -%if BIT_DEPTH == 12
> - %define DCT_SHIFT 6
> - mova m6, [pd_16]
> -%elif BIT_DEPTH == 10
> - %define DCT_SHIFT 4
> - mova m6, [pd_8]
> -%elif BIT_DEPTH == 8
> - %define DCT_SHIFT 2
> - mova m6, [pd_2]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> + mova m6, [pd_ %+ DCT8_ROUND1]
>
> add r2, r2
> lea r3, [r2 * 3]
> @@ -1485,7 +1406,7 @@
> pmaddwd m5, m0, [r4 + 0*16]
> phaddd m1, m5
> paddd m1, m6
> - psrad m1, DCT_SHIFT
> + psrad m1, DCT8_SHIFT1
> %if x == 1
> pshufd m1, m1, 0x1B
> %endif
> @@ -1495,7 +1416,7 @@
> pmaddwd m5, m0, [r4 + 1*16]
> phaddd m1, m5
> paddd m1, m6
> - psrad m1, DCT_SHIFT
> + psrad m1, DCT8_SHIFT1
> %if x == 1
> pshufd m1, m1, 0x1B
> %endif
> @@ -1505,7 +1426,7 @@
> pmaddwd m5, m0, [r4 + 2*16]
> phaddd m1, m5
> paddd m1, m6
> - psrad m1, DCT_SHIFT
> + psrad m1, DCT8_SHIFT1
> %if x == 1
> pshufd m1, m1, 0x1B
> %endif
> @@ -1515,7 +1436,7 @@
> pmaddwd m0, [r4 + 3*16]
> phaddd m4, m0
> paddd m4, m6
> - psrad m4, DCT_SHIFT
> + psrad m4, DCT8_SHIFT1
> %if x == 1
> pshufd m4, m4, 0x1B
> %endif
> @@ -1530,28 +1451,28 @@
> pshufb m2, [pb_unpackhlw1]
> pmaddwd m3, m0, [r4 + 0*16]
> paddd m3, m6
> - psrad m3, DCT_SHIFT
> + psrad m3, DCT8_SHIFT1
> %if x == 1
> pshufd m3, m3, 0x1B
> %endif
> mova [r5 + 0*2*mmsize], m3 ; Row 0
> pmaddwd m0, [r4 + 2*16]
> paddd m0, m6
> - psrad m0, DCT_SHIFT
> + psrad m0, DCT8_SHIFT1
> %if x == 1
> pshufd m0, m0, 0x1B
> %endif
> mova [r5 + 4*2*mmsize], m0 ; Row 4
> pmaddwd m3, m2, [r4 + 1*16]
> paddd m3, m6
> - psrad m3, DCT_SHIFT
> + psrad m3, DCT8_SHIFT1
> %if x == 1
> pshufd m3, m3, 0x1B
> %endif
> mova [r5 + 2*2*mmsize], m3 ; Row 2
> pmaddwd m2, [r4 + 3*16]
> paddd m2, m6
> - psrad m2, DCT_SHIFT
> + psrad m2, DCT8_SHIFT1
> %if x == 1
> pshufd m2, m2, 0x1B
> %endif
> @@ -1649,19 +1570,6 @@
> ;-------------------------------------------------------
> %if ARCH_X86_64
> INIT_XMM sse2
> -%if BIT_DEPTH == 12
> - %define IDCT_SHIFT 8
> - %define IDCT_ADD pd_128
> -%elif BIT_DEPTH == 10
> - %define IDCT_SHIFT 10
> - %define IDCT_ADD pd_512
> -%elif BIT_DEPTH == 8
> - %define IDCT_SHIFT 12
> - %define IDCT_ADD pd_2048
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> -
> cglobal idct8, 3, 6, 16, 0-5*mmsize
> mova m9, [r0 + 1 * mmsize]
> mova m1, [r0 + 3 * mmsize]
> @@ -1911,18 +1819,19 @@
> psubd m10, m2
> mova m2, m4
> pmaddwd m12, [tab_dct4 + 3 * mmsize]
> - paddd m0, [IDCT_ADD]
> - paddd m1, [IDCT_ADD]
> - paddd m8, [IDCT_ADD]
> - paddd m10, [IDCT_ADD]
> + mova m15, [pd_ %+ IDCT_ROUND]
> + paddd m0, m15
> + paddd m1, m15
> + paddd m8, m15
> + paddd m10, m15
> paddd m2, m13
> paddd m3, m12
> - paddd m2, [IDCT_ADD]
> - paddd m3, [IDCT_ADD]
> + paddd m2, m15
> + paddd m3, m15
> psubd m4, m13
> psubd m6, m12
> - paddd m4, [IDCT_ADD]
> - paddd m6, [IDCT_ADD]
> + paddd m4, m15
> + paddd m6, m15
> mova m15, [rsp + 4 * mmsize]
> mova m12, m8
> psubd m8, m7
> @@ -2018,16 +1927,12 @@
> movq [r1 + r3 * 2 + 8], m8
> movhps [r1 + r0 + 8], m8
> RET
> -
> -%undef IDCT_SHIFT
> -%undef IDCT_ADD
> %endif
>
> ;-------------------------------------------------------
> ; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
> ;-------------------------------------------------------
> INIT_XMM ssse3
> -
> cglobal patial_butterfly_inverse_internal_pass1
> movh m0, [r0]
> movhps m0, [r0 + 2 * 16]
> @@ -2119,15 +2024,6 @@
> ret
>
> %macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
> -%if BIT_DEPTH == 12
> - %define IDCT_SHIFT 8
> -%elif BIT_DEPTH == 10
> - %define IDCT_SHIFT 10
> -%elif BIT_DEPTH == 8
> - %define IDCT_SHIFT 12
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> pshufb m4, %1, [pb_idct8even]
> pmaddwd m4, [tab_idct8_1]
> phsubd m5, m4
> @@ -2149,11 +2045,10 @@
> pshufd m4, m4, 0x1B
>
> packssdw %1, m4
> -%undef IDCT_SHIFT
> %endmacro
>
> +INIT_XMM ssse3
> cglobal patial_butterfly_inverse_internal_pass2
> -
> mova m0, [r5]
> PARTIAL_BUTTERFLY_PROCESS_ROW m0
> movu [r1], m0
> @@ -2169,9 +2064,9 @@
> mova m3, [r5 + 48]
> PARTIAL_BUTTERFLY_PROCESS_ROW m3
> movu [r1 + r3], m3
> -
> ret
>
> +INIT_XMM ssse3
> cglobal idct8, 3,7,8 ;,0-16*mmsize
> ; alignment stack to 64-bytes
> mov r5, rsp
> @@ -2190,15 +2085,7 @@
>
> call patial_butterfly_inverse_internal_pass1
>
> -%if BIT_DEPTH == 12
> - mova m6, [pd_256]
> -%elif BIT_DEPTH == 10
> - mova m6, [pd_512]
> -%elif BIT_DEPTH == 8
> - mova m6, [pd_2048]
> -%else
> - %error Unsupported BIT_DEPTH!
> -%endif
> + mova m6, [pd_ %+ IDCT_ROUND]
> add r2, r2
> lea r3, [r2 * 3]
> lea r4, [tab_idct8_2]
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/intrapred16.asm
> --- a/source/common/x86/intrapred16.asm Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/x86/intrapred16.asm Fri Jul 10 15:57:44 2015 -0700
> @@ -109,11 +109,11 @@
> cextern pw_16
> cextern pw_31
> cextern pw_32
> -cextern pw_1023
> cextern pd_16
> cextern pd_31
> cextern pd_32
> cextern pw_4096
> +cextern pw_pixel_max
> cextern multiL
> cextern multiH
> cextern multiH2
> @@ -1228,11 +1228,11 @@
>
> punpcklwd m0, m0 ;[4 4 3 3 2 2 1 1]
> pshufd m1, m0, 0xFA
> - add r1, r1
> + add r1d, r1d
> pshufd m0, m0, 0x50
> movhps [r0 + r1], m0
> movh [r0 + r1 * 2], m1
> - lea r1, [r1 * 3]
> + lea r1d, [r1 * 3]
> movhps [r0 + r1], m1
>
> cmp r4m, byte 0
> @@ -1247,7 +1247,7 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> .quit:
> movh [r0], m0
> RET
> @@ -1583,7 +1583,7 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
>
> movh r2, m0
> mov [r0], r2w
> @@ -2756,7 +2756,7 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> .quit:
> movh [r0], m0
> RET
> @@ -2785,7 +2785,7 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
>
> pextrw [r0], m0, 0
> pextrw [r0 + r1], m0, 1
> @@ -4002,7 +4002,7 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> .quit:
> movu [r0], m0
> RET
> @@ -5874,7 +5874,7 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> pextrw [r0], m0, 0
> pextrw [r0 + r1], m0, 1
> pextrw [r0 + r1 * 2], m0, 2
> @@ -10287,9 +10287,9 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> pmaxsw m3, m1
> - pminsw m3, [pw_1023]
> + pminsw m3, [pw_pixel_max]
> .quit:
> movu [r0], m0
> movu [r0 + 16], m3
> @@ -10359,9 +10359,9 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> pmaxsw m3, m1
> - pminsw m3, [pw_1023]
> + pminsw m3, [pw_pixel_max]
> pextrw [r0], m0, 0
> pextrw [r0 + r1], m0, 1
> pextrw [r0 + r1 * 2], m0, 2
> @@ -12952,7 +12952,7 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> .quit:
> movu [r0], m0
> RET
> @@ -12999,7 +12999,7 @@
> paddw m0, m1
> pxor m1, m1
> pmaxsw m0, m1
> - pminsw m0, [pw_1023]
> + pminsw m0, [pw_pixel_max]
> pextrw [r0], xm0, 0
> pextrw [r0 + r1], xm0, 1
> pextrw [r0 + r1 * 2], xm0, 2
> diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/ipfilter16.asm
> --- a/source/common/x86/ipfilter16.asm Thu Jul 09 17:47:46 2015 -0700
> +++ b/source/common/x86/ipfilter16.asm Fri Jul 10 15:57:44 2015 -0700
> @@ -53,7 +53,7 @@
> times 8 dw -4, 54
> times 8 dw 16, -2
>
> - times 8 dw -6, 46
> + times 8 dw -6, 46
> times 8 dw 28, -4
>
> times 8 dw -4, 36
> @@ -147,15 +147,22 @@
>
> %if BIT_DEPTH == 10
> %define INTERP_OFFSET_PS pd_n32768
> + %define INTERP_SHIFT_PS 2
> + %define INTERP_OFFSET_SP pd_524800
> + %define INTERP_SHIFT_SP 10
> %elif BIT_DEPTH == 12
> %define INTERP_OFFSET_PS pd_n131072
> -%else
> -%error Unsupport bit depth!
> + %define INTERP_SHIFT_PS 4
> + %define INTERP_OFFSET_SP pd_524416
> + %define INTERP_SHIFT_SP 8
> +%else
> + %error Unsupport bit depth!
> %endif
>
> SECTION .text
> cextern pd_32
> cextern pw_pixel_max
> +cextern pd_524416
> cextern pd_n32768
> cextern pd_n131072
> cextern pw_2000
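
The new SP offsets decompose the way I'd expect, if I'm reading the filter
precision right: pd_524800 = (8192 << 6) + (1 << 9) for the 10-bit shift of 10,
and pd_524416 = (8192 << 6) + (1 << 7) for the 12-bit shift of 8, so only the
rounding term tracks INTERP_SHIFT_SP while the internal-offset term stays
fixed.
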
> @@ -644,8 +651,8 @@
> packssdw m3, m5
> CLIPW m3, m7, m6
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movd [r2 + %1], m3
> @@ -682,8 +689,8 @@
> pshufd m5, m5, q3120
> paddd m5, m1
>
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
>
> movd [r2 + %1], m3
> @@ -729,8 +736,8 @@
> packssdw m3, m5
> CLIPW m3, m7, m6
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movh [r2 + %1], m3
> @@ -753,7 +760,7 @@
> punpcklqdq m3, m4
> paddd m3, m1
>
> - psrad m3, 2
> + psrad m3, INTERP_SHIFT_PS
> packssdw m3, m3
> movh [r2 + r3 * 2 + %1], m3
> %endmacro
> @@ -794,8 +801,8 @@
> packssdw m3, m5
> CLIPW m3, m7, m6
> %else
> - psrad m3, 2
> - psrad m5, 2
> + psrad m3, INTERP_SHIFT_PS
> + psrad m5, INTERP_SHIFT_PS
> packssdw m3, m5
> %endif
> movdqu [r2 + %1], m3
> @@ -905,7 +912,7 @@
> %endif ;z < y
> %endrep
>
> -RET
> + RET
> %endmacro
>
> ;-----------------------------------------------------------------------------
> @@ -1183,7 +1190,7 @@
> mova m0, [tab_LumaCoeff + r4]
> %endif
>
> -%ifidn %3, pp
> +%ifidn %3, pp
> mova m1, [pd_32]
> pxor m6, m6
> mova m7, [pw_pixel_max]
> @@ -1270,7 +1277,7 @@
> mova m0, [tab_LumaCoeff + r4]
> %endif
>
> -%ifidn %3, pp
> +%ifidn %3, pp
> mova m1, [pd_32]
> pxor m7, m7
> %else
> @@ -1316,7 +1323,7 @@
> phaddd m6, m3
> phaddd m5, m6
> paddd m5, m1
> -%ifidn %3, pp
> +%ifidn %3, pp
> psrad m4, 6
> psrad m5, 6
> packusdw m4, m5
> @@ -1372,7 +1379,7 @@
> %else
> mova m0, [tab_LumaCoeff + r4]
> %endif
> -%ifidn %3, pp
> +%ifidn %3, pp
> mova m1, [pd_32]
> %else
> mova m1, [INTERP_OFFSET_PS]
> @@ -1417,131 +1424,6 @@
> phaddd m6, m7
> phaddd m5, m6
> paddd m5, m1
> -%ifidn %3, pp
> - psrad m4, 6
> - psrad m5, 6
> - packusdw m4, m5
> - pxor m5, m5
> - CLIPW m4, m5, [pw_pixel_max]
> -%else
> - psrad m4, 2
> - psrad m5, 2
> - packssdw m4, m5
> -%endif
> -
> - movu [r2], m4
> -
> - movu m2, [r0 + 32] ; m2 = src[16-23]
> -
> - pmaddwd m4, m3, m0 ; m3 = src[8-15]
> - palignr m5, m2, m3, 2 ; m5 = src[9-16]
> - pmaddwd m5, m0
> - phaddd m4, m5
> -
> - palignr m5, m2, m3, 4 ; m5 = src[10-17]
> - pmaddwd m5, m0
> - palignr m2, m3, 6 ; m2 = src[11-18]
> - pmaddwd m2, m0
> - phaddd m5, m2
> - phaddd m4, m5
> - paddd m4, m1
> -%ifidn %3, pp
> - psrad m4, 6
> - packusdw m4, m4
> - pxor m5, m5
> - CLIPW m4, m5, [pw_pixel_max]
> -%else
> - psrad m4, 2
> - packssdw m4, m4
> -%endif
> -
> - movh [r2 + 16], m4
> -
> - add r0, r1
> - add r2, r3
> -
> - dec r4d
> - jnz .loopH
> - RET
> -%endmacro
> -
> -;-------------------------------------------------------------------------------------------------------------
> -; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
> -;-------------------------------------------------------------------------------------------------------------
> -FILTER_HOR_LUMA_W12 12, 16, pp
> -
> -;----------------------------------------------------------------------------------------------------------------------------
> -; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> -;----------------------------------------------------------------------------------------------------------------------------
> -FILTER_HOR_LUMA_W12 12, 16, ps
> -
> -;--------------------------------------------------------------------------------------------------------------
> -; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> -;--------------------------------------------------------------------------------------------------------------
> -%macro FILTER_HOR_LUMA_W16 3
> -INIT_XMM sse4
> -cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
> -
> - add r1, r1
> - add r3, r3
> - mov r4d, r4m
> - sub r0, 6
> - shl r4d, 4
> -
> -%ifdef PIC
> - lea r6, [tab_LumaCoeff]
> - mova m0, [r6 + r4]
> -%else
> - mova m0, [tab_LumaCoeff + r4]
> -%endif
> -
> -%ifidn %3, pp
> - mova m1, [pd_32]
> -%else
> - mova m1, [INTERP_OFFSET_PS]
> -%endif
> -
> - mov r4d, %2
> -%ifidn %3, ps
> - cmp r5m, byte 0
> - je .loopH
> - lea r6, [r1 + 2 * r1]
> - sub r0, r6
> - add r4d, 7
> -%endif
> -
> -.loopH:
> -%assign x 0
> -%rep %1 / 16
> - movu m2, [r0 + x] ; m2 = src[0-7]
> - movu m3, [r0 + 16 + x] ; m3 = src[8-15]
> -
> - pmaddwd m4, m2, m0
> - palignr m5, m3, m2, 2 ; m5 = src[1-8]
> - pmaddwd m5, m0
> - phaddd m4, m5
> -
> - palignr m5, m3, m2, 4 ; m5 = src[2-9]
> - pmaddwd m5, m0
> - palignr m6, m3, m2, 6 ; m6 = src[3-10]
> - pmaddwd m6, m0
> - phaddd m5, m6
> - phaddd m4, m5
> - paddd m4, m1
> -
> - palignr m5, m3, m2, 8 ; m5 = src[4-11]
> - pmaddwd m5, m0
> - palignr m6, m3, m2, 10 ; m6 = src[5-12]
> - pmaddwd m6, m0
> - phaddd m5, m6
> -
> - palignr m6, m3, m2, 12 ; m6 = src[6-13]
> - pmaddwd m6, m0
> - palignr m7, m3, m2, 14 ; m2 = src[7-14]
> - pmaddwd m7, m0
> - phaddd m6, m7
> - phaddd m5, m6
> - paddd m5, m1
> %ifidn %3, pp
> psrad m4, 6
> psrad m5, 6
> @@ -1553,6 +1435,131 @@
> psrad m5, 2
> packssdw m4, m5
> %endif
> +
> + movu [r2], m4
> +
> + movu m2, [r0 + 32] ; m2 = src[16-23]
> +
> + pmaddwd m4, m3, m0 ; m3 = src[8-15]
> + palignr m5, m2, m3, 2 ; m5 = src[9-16]
> + pmaddwd m5, m0
> + phaddd m4, m5
> +
> + palignr m5, m2, m3, 4 ; m5 = src[10-17]
> + pmaddwd m5, m0
> + palignr m2, m3, 6 ; m2 = src[11-18]
> + pmaddwd m2, m0
> + phaddd m5, m2
> + phaddd m4, m5
> + paddd m4, m1
> +%ifidn %3, pp
> + psrad m4, 6
> + packusdw m4, m4
> + pxor m5, m5
> + CLIPW m4, m5, [pw_pixel_max]
> +%else
> + psrad m4, 2
> + packssdw m4, m4
> +%endif
> +
> + movh [r2 + 16], m4
> +
> + add r0, r1
> + add r2, r3
> +
> + dec r4d
> + jnz .loopH
> + RET
> +%endmacro
> +
> +;-------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
> +;-------------------------------------------------------------------------------------------------------------
> +FILTER_HOR_LUMA_W12 12, 16, pp
> +
> +;----------------------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +;----------------------------------------------------------------------------------------------------------------------------
> +FILTER_HOR_LUMA_W12 12, 16, ps
> +
> +;--------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> +;--------------------------------------------------------------------------------------------------------------
> +%macro FILTER_HOR_LUMA_W16 3
> +INIT_XMM sse4
> +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
> +
> + add r1, r1
> + add r3, r3
> + mov r4d, r4m
> + sub r0, 6
> + shl r4d, 4
> +
> +%ifdef PIC
> + lea r6, [tab_LumaCoeff]
> + mova m0, [r6 + r4]
> +%else
> + mova m0, [tab_LumaCoeff + r4]
> +%endif
> +
> +%ifidn %3, pp
> + mova m1, [pd_32]
> +%else
> + mova m1, [INTERP_OFFSET_PS]
> +%endif
> +
> + mov r4d, %2
> +%ifidn %3, ps
> + cmp r5m, byte 0
> + je .loopH
> + lea r6, [r1 + 2 * r1]
> + sub r0, r6
> + add r4d, 7
> +%endif
> +
> +.loopH:
> +%assign x 0
> +%rep %1 / 16
> + movu m2, [r0 + x] ; m2 = src[0-7]
> + movu m3, [r0 + 16 + x] ; m3 = src[8-15]
> +
> + pmaddwd m4, m2, m0
> + palignr m5, m3, m2, 2 ; m5 = src[1-8]
> + pmaddwd m5, m0
> + phaddd m4, m5
> +
> + palignr m5, m3, m2, 4 ; m5 = src[2-9]
> + pmaddwd m5, m0
> + palignr m6, m3, m2, 6 ; m6 = src[3-10]
> + pmaddwd m6, m0
> + phaddd m5, m6
> + phaddd m4, m5
> + paddd m4, m1
> +
> + palignr m5, m3, m2, 8 ; m5 = src[4-11]
> + pmaddwd m5, m0
> + palignr m6, m3, m2, 10 ; m6 = src[5-12]
> + pmaddwd m6, m0
> + phaddd m5, m6
> +
> + palignr m6, m3, m2, 12 ; m6 = src[6-13]
> + pmaddwd m6, m0
> + palignr m7, m3, m2, 14 ; m2 = src[7-14]
> + pmaddwd m7, m0
> + phaddd m6, m7
> + phaddd m5, m6
> + paddd m5, m1
> +%ifidn %3, pp
> + psrad m4, 6
> + psrad m5, 6
> + packusdw m4, m5
> + pxor m5, m5
> + CLIPW m4, m5, [pw_pixel_max]
> +%else
> + psrad m4, 2
> + psrad m5, 2
> + packssdw m4, m5
> +%endif
> movu [r2 + x], m4
>
> movu m2, [r0 + 32 + x] ; m2 = src[16-23]
> @@ -1583,7 +1590,7 @@
> phaddd m6, m2
> phaddd m5, m6
> paddd m5, m1
> -%ifidn %3, pp
> +%ifidn %3, pp
> psrad m4, 6
> psrad m5, 6
> packusdw m4, m5
> @@ -1690,7 +1697,7 @@
> %else
> mova m0, [tab_LumaCoeff + r4]
> %endif
> -%ifidn %3, pp
> +%ifidn %3, pp
> mova m1, [pd_32]
> %else
> mova m1, [INTERP_OFFSET_PS]
> @@ -1735,7 +1742,7 @@
> phaddd m6, m7
> phaddd m5, m6
> paddd m5, m1
> -%ifidn %3, pp
> +%ifidn %3, pp
> psrad m4, 6
> psrad m5, 6
> packusdw m4, m5
> @@ -1776,7 +1783,7 @@
> phaddd m6, m7
> phaddd m5, m6
> paddd m5, m1
> -%ifidn %3, pp
> +%ifidn %3, pp
> psrad m4, 6
> psrad m5, 6
> packusdw m4, m5
> @@ -1817,7 +1824,7 @@
> phaddd m6, m7
> phaddd m5, m6
> paddd m5, m1
> -%ifidn %3, pp
> +%ifidn %3, pp
> psrad m4, 6
> psrad m5, 6
> packusdw m4, m5
> @@ -2652,7 +2659,7 @@
> %endif
>
> paddd m3, m1
> - psrad m3, 2
> + psrad m3, INTERP_SHIFT_PS
> packssdw m3, m3
>
> %if %1 == 2
> @@ -2683,7 +2690,7 @@
> FILTER_W%1_2 %3
> %endrep
>
> -RET
> + RET
> %endmacro
>
> FILTER_CHROMA_H 2, 4, pp, 6, 8, 5
> @@ -4084,7 +4091,7 @@
> %ifidn %3, pp
> mova m6, [tab_c_32]
> %else
> - mova m6, [tab_c_524800]
> + mova m6, [INTERP_OFFSET_SP]
> %endif
> %else
> mova m6, [INTERP_OFFSET_PS]
> @@ -4109,10 +4116,10 @@
> paddd m1, m6
> paddd m2, m6
> paddd m3, m6
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -4127,10 +4134,10 @@
> psrad m2, 6
> psrad m3, 6
> %else
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> %endif
> packssdw m0, m1
> packssdw m2, m3
> @@ -4707,7 +4714,7 @@
> %ifidn %3, pp
> mova m7, [tab_c_32]
> %elifidn %3, sp
> - mova m7, [tab_c_524800]
> + mova m7, [INTERP_OFFSET_SP]
> %elifidn %3, ps
> mova m7, [INTERP_OFFSET_PS]
> %endif
> @@ -4728,10 +4735,10 @@
> paddd m1, m7
> paddd m2, m7
> paddd m3, m7
> - psrad m0, 2
> - psrad m1, 2
> - psrad m2, 2
> - psrad m3, 2
> + psrad m0, INTERP_SHIFT_PS
> + psrad m1, INTERP_SHIFT_PS
> + psrad m2, INTERP_SHIFT_PS
> + psrad m3, INTERP_SHIFT_PS
>
> packssdw m0, m1
> packssdw m2, m3
> @@ -4746,10 +4753,10 @@
> psrad m2, 6
> psrad m3, 6
> %else
> - psrad m0, 10
> - psrad m1, 10
> - psrad m2, 10
> - psrad m3, 10
> + psrad m0, INTERP_SHIFT_SP
> + psrad m1, INTERP_SHIFT_SP
> + psrad m2, INTERP_SHIFT_SP
> + psrad m3, INTERP_SHIFT_SP
> %endif
> packssdw m0, m1
> packssdw m2, m3
> @@ -5587,7 +5594,7 @@
> ;-----------------------------------------------------------------------------------------------------------------
> %macro FILTER_VER_CHROMA_W16_24xN_avx2 3
> INIT_YMM avx2
> -%if ARCH_X86_64
> +%if ARCH_X86_64
> cglobal interp_4tap_vert_%2_24x%1, 5, 7, %3
> add r1d, r1d
> add r3d, r3d
> @@ -8628,7 +8635,7 @@
> psrad m3, 2
> %endif
> %endif
> -
> +
> packssdw m0, m3
> %ifidn %1,pp
> CLIPW m0, m1, [pw_pixel_max]
> @@ -9045,14 +9052,14 @@
> %rep %1/4
> movh m0, [r0]
> movhps m0, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m1
> movh [r2 + r3 * 0], m0
> movhps [r2 + r3 * 1], m0
>
> movh m0, [r0 + r1 * 2]
> movhps m0, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m1
> movh [r2 + r3 * 2], m0
> movhps [r2 + r4], m0
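
The psllw change reads as keeping the P2S intermediates at a fixed 14-bit
internal precision, so the shift stays 4 for 10-bit and drops to 2 for Main12,
which lines up with the pw_2000 (1 << 13) offset subtracted afterwards.
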
> @@ -9078,11 +9085,10 @@
>
> movh m0, [r0]
> movhps m0, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, [pw_2000]
> movh [r2 + r3 * 0], m0
> movhps [r2 + r3 * 1], m0
> -
> RET
>
> ;-----------------------------------------------------------------------------
> @@ -9106,9 +9112,9 @@
> .loop
> movu m0, [r0]
> movu m1, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movh [r2 + r3 * 0], m0
> @@ -9118,9 +9124,9 @@
>
> movu m0, [r0 + r1 * 2]
> movu m1, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movh [r2 + r3 * 2], m0
> @@ -9158,22 +9164,22 @@
>
> .loop
> movu m0, [r0]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m1
> movu [r2 + r3 * 0], m0
>
> movu m0, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m1
> movu [r2 + r3 * 1], m0
>
> movu m0, [r0 + r1 * 2]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m1
> movu [r2 + r3 * 2], m0
>
> movu m0, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m1
> movu [r2 + r4], m0
>
> @@ -9203,14 +9209,13 @@
> movu m0, [r0]
> movu m1, [r0 + r1]
>
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, [pw_2000]
> - psllw m1, 4
> psubw m1, [pw_2000]
>
> movu [r2 + r3 * 0], m0
> movu [r2 + r3 * 1], m1
> -
> RET
>
> ;-----------------------------------------------------------------------------
> @@ -9232,11 +9237,11 @@
> movu m1, [r0 + r1]
> movu m2, [r0 + r1 * 2]
>
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m3
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> psubw m1, m3
> - psllw m2, 4
> + psllw m2, (14 - BIT_DEPTH)
> psubw m2, m3
>
> movu [r2 + r3 * 0], m0
> @@ -9247,18 +9252,17 @@
> movu m1, [r0 + r1 * 4]
> movu m2, [r0 + r5 ]
>
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m3
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> psubw m1, m3
> - psllw m2, 4
> + psllw m2, (14 - BIT_DEPTH)
> psubw m2, m3
>
> movu [r2 + r6], m0
> movu [r2 + r3 * 4], m1
> lea r2, [r2 + r3 * 4]
> movu [r2 + r3], m2
> -
> RET
>
> ;-----------------------------------------------------------------------------
> @@ -9282,9 +9286,9 @@
> .loop
> movu m0, [r0]
> movu m1, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> psubw m1, m2
>
> movu [r2 + r3 * 0], m0
> @@ -9292,9 +9296,9 @@
>
> movu m0, [r0 + r1 * 2]
> movu m1, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> psubw m1, m2
>
> movu [r2 + r3 * 2], m0
> @@ -9302,9 +9306,9 @@
>
> movu m0, [r0 + 16]
> movu m1, [r0 + r1 + 16]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> psubw m1, m2
>
> movu [r2 + r3 * 0 + 16], m0
> @@ -9312,9 +9316,9 @@
>
> movu m0, [r0 + r1 * 2 + 16]
> movu m1, [r0 + r5 + 16]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> psubw m1, m2
>
> movu [r2 + r3 * 2 + 16], m0
> @@ -9356,9 +9360,9 @@
> .loop
> movu m0, [r0]
> movu m1, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> psubw m1, m2
>
> movu [r2 + r3 * 0], m0
> @@ -9366,9 +9370,9 @@
>
> movu m0, [r0 + r1 * 2]
> movu m1, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> psubw m1, m2
>
> movu [r2 + r3 * 2], m0
> @@ -9412,13 +9416,13 @@
> movu m1, [r0 + r1]
> movu m2, [r0 + r1 * 2]
> movu m3, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0], m0
> @@ -9430,13 +9434,13 @@
> movu m1, [r0 + r1 + 16]
> movu m2, [r0 + r1 * 2 + 16]
> movu m3, [r0 + r5 + 16]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 16], m0
> @@ -9448,13 +9452,13 @@
> movu m1, [r0 + r1 + 32]
> movu m2, [r0 + r1 * 2 + 32]
> movu m3, [r0 + r5 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 32], m0
> @@ -9466,13 +9470,13 @@
> movu m1, [r0 + r1 + 48]
> movu m2, [r0 + r1 * 2 + 48]
> movu m3, [r0 + r5 + 48]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 48], m0
> @@ -9515,9 +9519,9 @@
> .loop
> movu m0, [r0]
> movu m1, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> + psllw m1, (14 - BIT_DEPTH)
> psubw m1, m2
>
> movu [r2 + r3 * 0], m0
> @@ -9525,9 +9529,9 @@
>
> movu m0, [r0 + r1 * 2]
> movu m1, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 2], m0
> @@ -9535,9 +9539,9 @@
>
> movu m0, [r0 + 32]
> movu m1, [r0 + r1 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 0 + 32], m0
> @@ -9545,9 +9549,9 @@
>
> movu m0, [r0 + r1 * 2 + 32]
> movu m1, [r0 + r5 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 2 + 32], m0
> @@ -9590,13 +9594,13 @@
> movu m1, [r0 + r1]
> movu m2, [r0 + r1 * 2]
> movu m3, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0], m0
> @@ -9608,13 +9612,13 @@
> movu m1, [r0 + r1 + 16]
> movu m2, [r0 + r1 * 2 + 16]
> movu m3, [r0 + r5 + 16]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 16], m0
> @@ -9626,13 +9630,13 @@
> movu m1, [r0 + r1 + 32]
> movu m2, [r0 + r1 * 2 + 32]
> movu m3, [r0 + r5 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 32], m0
> @@ -9644,13 +9648,13 @@
> movu m1, [r0 + r1 + 48]
> movu m2, [r0 + r1 * 2 + 48]
> movu m3, [r0 + r5 + 48]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 48], m0
> @@ -9662,13 +9666,13 @@
> movu m1, [r0 + r1 + 64]
> movu m2, [r0 + r1 * 2 + 64]
> movu m3, [r0 + r5 + 64]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 64], m0
> @@ -9680,13 +9684,13 @@
> movu m1, [r0 + r1 + 80]
> movu m2, [r0 + r1 * 2 + 80]
> movu m3, [r0 + r5 + 80]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 80], m0
> @@ -9698,13 +9702,13 @@
> movu m1, [r0 + r1 + 96]
> movu m2, [r0 + r1 * 2 + 96]
> movu m3, [r0 + r5 + 96]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 96], m0
> @@ -9716,13 +9720,13 @@
> movu m1, [r0 + r1 + 112]
> movu m2, [r0 + r1 * 2 + 112]
> movu m3, [r0 + r5 + 112]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 112], m0
> @@ -9763,9 +9767,9 @@
> .loop
> movu m0, [r0]
> movu m1, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 0], m0
> @@ -9773,9 +9777,9 @@
>
> movu m0, [r0 + r1 * 2]
> movu m1, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 2], m0
> @@ -9783,9 +9787,9 @@
>
> movu m0, [r0 + 32]
> movu m1, [r0 + r1 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 0 + 32], m0
> @@ -9793,9 +9797,9 @@
>
> movu m0, [r0 + r1 * 2 + 32]
> movu m1, [r0 + r5 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 2 + 32], m0
> @@ -9803,9 +9807,9 @@
>
> movu m0, [r0 + 64]
> movu m1, [r0 + r1 + 64]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 0 + 64], m0
> @@ -9813,9 +9817,9 @@
>
> movu m0, [r0 + r1 * 2 + 64]
> movu m1, [r0 + r5 + 64]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 2 + 64], m0
> @@ -9823,9 +9827,9 @@
>
> movu m0, [r0 + 96]
> movu m1, [r0 + r1 + 96]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 0 + 96], m0
> @@ -9833,9 +9837,9 @@
>
> movu m0, [r0 + r1 * 2 + 96]
> movu m1, [r0 + r5 + 96]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 2 + 96], m0
> @@ -9876,13 +9880,13 @@
> movu m1, [r0 + r1]
> movu m2, [r0 + r1 * 2]
> movu m3, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0], m0
> @@ -9894,13 +9898,13 @@
> movu m1, [r0 + r1 + 16]
> movu m2, [r0 + r1 * 2 + 16]
> movu m3, [r0 + r5 + 16]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 16], m0
> @@ -9912,13 +9916,13 @@
> movu m1, [r0 + r1 + 32]
> movu m2, [r0 + r1 * 2 + 32]
> movu m3, [r0 + r5 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 32], m0
> @@ -9957,36 +9961,36 @@
> .loop
> movu m0, [r0]
> movu m1, [r0 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
> movu [r2 + r3 * 0], m0
> movu [r2 + r3 * 0 + 32], xm1
>
> movu m0, [r0 + r1]
> movu m1, [r0 + r1 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
> movu [r2 + r3 * 1], m0
> movu [r2 + r3 * 1 + 32], xm1
>
> movu m0, [r0 + r1 * 2]
> movu m1, [r0 + r1 * 2 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
> movu [r2 + r3 * 2], m0
> movu [r2 + r3 * 2 + 32], xm1
>
> movu m0, [r0 + r5]
> movu m1, [r0 + r5 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
> movu [r2 + r4], m0
> movu [r2 + r4 + 32], xm1
> @@ -10022,9 +10026,9 @@
> .loop
> movu m0, [r0]
> movu m1, [r0 + r1]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 0], m0
> @@ -10032,9 +10036,9 @@
>
> movu m0, [r0 + r1 * 2]
> movu m1, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> psubw m0, m2
> - psllw m1, 4
> psubw m1, m2
>
> movu [r2 + r3 * 2], m0
> @@ -10042,7 +10046,7 @@
>
> movh m0, [r0 + 16]
> movhps m0, [r0 + r1 + 16]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m2
>
> movh [r2 + r3 * 0 + 16], m0
> @@ -10050,7 +10054,7 @@
>
> movh m0, [r0 + r1 * 2 + 16]
> movhps m0, [r0 + r5 + 16]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> psubw m0, m2
>
> movh [r2 + r3 * 2 + 16], m0
> @@ -10088,13 +10092,13 @@
> movu m1, [r0 + r1]
> movu m2, [r0 + r1 * 2]
> movu m3, [r0 + r5]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0], m0
> @@ -10106,13 +10110,13 @@
> movu m1, [r0 + r1 + 16]
> movu m2, [r0 + r1 * 2 + 16]
> movu m3, [r0 + r5 + 16]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 16], m0
> @@ -10124,13 +10128,13 @@
> movu m1, [r0 + r1 + 32]
> movu m2, [r0 + r1 * 2 + 32]
> movu m3, [r0 + r5 + 32]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 32], m0
> @@ -10142,13 +10146,13 @@
> movu m1, [r0 + r1 + 48]
> movu m2, [r0 + r1 * 2 + 48]
> movu m3, [r0 + r5 + 48]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 48], m0
> @@ -10160,13 +10164,13 @@
> movu m1, [r0 + r1 + 64]
> movu m2, [r0 + r1 * 2 + 64]
> movu m3, [r0 + r5 + 64]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 64], m0
> @@ -10178,13 +10182,13 @@
> movu m1, [r0 + r1 + 80]
> movu m2, [r0 + r1 * 2 + 80]
> movu m3, [r0 + r5 + 80]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> + psllw m3, (14 - BIT_DEPTH)
> psubw m0, m4
> - psllw m1, 4
> psubw m1, m4
> - psllw m2, 4
> psubw m2, m4
> - psllw m3, 4
> psubw m3, m4
>
> movu [r2 + r3 * 0 + 80], m0
> @@ -10220,11 +10224,11 @@
> movu m0, [r0]
> movu m1, [r0 + 32]
> movu m2, [r0 + 64]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> psubw m0, m3
> - psllw m1, 4
> psubw m1, m3
> - psllw m2, 4
> psubw m2, m3
> movu [r2 + r3 * 0], m0
> movu [r2 + r3 * 0 + 32], m1
> @@ -10233,11 +10237,11 @@
> movu m0, [r0 + r1]
> movu m1, [r0 + r1 + 32]
> movu m2, [r0 + r1 + 64]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> psubw m0, m3
> - psllw m1, 4
> psubw m1, m3
> - psllw m2, 4
> psubw m2, m3
> movu [r2 + r3 * 1], m0
> movu [r2 + r3 * 1 + 32], m1
> @@ -10246,11 +10250,11 @@
> movu m0, [r0 + r1 * 2]
> movu m1, [r0 + r1 * 2 + 32]
> movu m2, [r0 + r1 * 2 + 64]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> psubw m0, m3
> - psllw m1, 4
> psubw m1, m3
> - psllw m2, 4
> psubw m2, m3
> movu [r2 + r3 * 2], m0
> movu [r2 + r3 * 2 + 32], m1
> @@ -10259,11 +10263,11 @@
> movu m0, [r0 + r5]
> movu m1, [r0 + r5 + 32]
> movu m2, [r0 + r5 + 64]
> - psllw m0, 4
> + psllw m0, (14 - BIT_DEPTH)
> + psllw m1, (14 - BIT_DEPTH)
> + psllw m2, (14 - BIT_DEPTH)
> psubw m0, m3
> - psllw m1, 4
> psubw m1, m3
> - psllw m2, 4
> psubw m2, m3
> movu [r2 + r4], m0
> movu [r2 + r4 + 32], m1
> @@ -10797,7 +10801,7 @@
> pmaddwd m6, m0
> pmaddwd m5, m1
> paddd m6, m5
> -
> +
> phaddd m6, m6
> vpermq m6, m6, q3120
> paddd xm6, xm2
> @@ -12115,7 +12119,7 @@
> %endmacro
>
> FILTER_VER_CHROMA_AVX2_4x4 pp, 1, 6
> -FILTER_VER_CHROMA_AVX2_4x4 ps, 0, 2
> +FILTER_VER_CHROMA_AVX2_4x4 ps, 0, 2
> FILTER_VER_CHROMA_AVX2_4x4 sp, 1, 10
> FILTER_VER_CHROMA_AVX2_4x4 ss, 0, 6
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
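
For the record, every p2s hunk above makes the same change: the hard-coded
`psllw mX, 4' (correct only for 10-bit builds, where 14 - BIT_DEPTH == 4) is
generalized to `psllw mX, (14 - BIT_DEPTH)' so the Main12 build shifts by 2.
A minimal scalar sketch of the conversion those hunks vectorize, assuming the
usual x265 constants (IF_INTERNAL_PREC = 14, offset = 1 << 13) and using a
hypothetical name for the helper:

    /* hedged sketch, not copied from the tree: the constants and the helper
     * name are assumptions, but the arithmetic matches the asm above */
    #include <stdint.h>

    #define IF_INTERNAL_PREC 14
    #define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1))   /* 8192 */

    static void p2s_sketch(const uint16_t* src, intptr_t srcStride,
                           int16_t* dst, intptr_t dstStride,
                           int width, int height, int bitDepth)
    {
        int shift = IF_INTERNAL_PREC - bitDepth; /* 4 for Main10, 2 for Main12 */
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << shift) - IF_INTERNAL_OFFS);
            src += srcStride;
            dst += dstStride;
        }
    }

The `psllw mX, (14 - BIT_DEPTH)' / `psubw mX, m4' pairs in the patch perform
the same shift-and-bias on eight (or, in the ymm paths, sixteen) pixels at a
time, with the offset preloaded into m2/m3/m4.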
--
Steve Borho