[x265] [PATCH 1 of 5] asm: fix Main12 assembly up to SSSE3
Min Chen
chenm003 at 163.com
Sat Jul 11 04:35:29 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1436569064 25200
# Node ID 8f60362f8555c11a14301737c0301fd6e9303448
# Parent 7b3e1372bb28830ef0ab44cd652ecbe823573675
asm: fix Main12 assembly up to SSSE3
---
source/common/vec/dct-sse3.cpp | 9 +-
source/common/vec/dct-ssse3.cpp | 130 +++++-------
source/common/x86/const-a.asm | 1 +
source/common/x86/dct8.asm | 351 ++++++++++--------------------
source/common/x86/intrapred16.asm | 30 ++--
source/common/x86/ipfilter16.asm | 436 +++++++++++++++++++------------------
6 files changed, 413 insertions(+), 544 deletions(-)
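All of the hunks below replace hard-coded, effectively 10-bit-only shift and rounding constants with values derived from the configured bit depth, which is what the Main12 build needs. For reference, the relationships the new macros encode follow the standard HEVC transform scaling (a C++ sketch, not part of the patch; the helper names are illustrative only):

    // Bit-depth dependent transform shifts/rounds, for a transform of size
    // N = 1 << log2N and bit depth d (X265_DEPTH in C++, BIT_DEPTH in asm).
    static inline int dctShift1(int log2N, int d)  { return log2N - 1 + (d - 8); } // DCT4_SHIFT, DCT8_SHIFT1, DCT16_SHIFT1, DCT32_SHIFT1
    static inline int dctShift2(int log2N)         { return log2N + 6; }           // DCT8_SHIFT2 = 9, DCT16_SHIFT2 = 10, DCT32_SHIFT2 = 11
    static inline int idctShift2(int d)            { return 12 - (d - 8); }        // IDCT_SHIFT / SHIFT2: 12, 10, 8 at 8/10/12-bit
    static inline int roundOf(int shift)           { return 1 << (shift - 1); }    // every *_ADD / *_ROUND constant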
diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp Thu Jul 09 17:47:46 2015 -0700
+++ b/source/common/vec/dct-sse3.cpp Fri Jul 10 15:57:44 2015 -0700
@@ -38,13 +38,8 @@
#define SHIFT1 7
#define ADD1 64
-#if HIGH_BIT_DEPTH
-#define SHIFT2 10
-#define ADD2 512
-#else
-#define SHIFT2 12
-#define ADD2 2048
-#endif
+#define SHIFT2 (12 - (X265_DEPTH - 8))
+#define ADD2 (1 << ((SHIFT2) - 1))
ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
{
diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/vec/dct-ssse3.cpp
--- a/source/common/vec/dct-ssse3.cpp Thu Jul 09 17:47:46 2015 -0700
+++ b/source/common/vec/dct-ssse3.cpp Fri Jul 10 15:57:44 2015 -0700
@@ -34,6 +34,18 @@
#include <pmmintrin.h> // SSE3
#include <tmmintrin.h> // SSSE3
+#define DCT16_SHIFT1 (3 + X265_DEPTH - 8)
+#define DCT16_ADD1 (1 << ((DCT16_SHIFT1) - 1))
+
+#define DCT16_SHIFT2 10
+#define DCT16_ADD2 (1 << ((DCT16_SHIFT2) - 1))
+
+#define DCT32_SHIFT1 (DCT16_SHIFT1 + 1)
+#define DCT32_ADD1 (1 << ((DCT32_SHIFT1) - 1))
+
+#define DCT32_SHIFT2 (DCT16_SHIFT2 + 1)
+#define DCT32_ADD2 (1 << ((DCT32_SHIFT2) - 1))
+
using namespace X265_NS;
ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
@@ -100,20 +112,9 @@
static void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
-#if HIGH_BIT_DEPTH
-#define SHIFT1 5
-#define ADD1 16
-#else
-#define SHIFT1 3
-#define ADD1 4
-#endif
-
-#define SHIFT2 10
-#define ADD2 512
-
// Const
- __m128i c_4 = _mm_set1_epi32(ADD1);
- __m128i c_512 = _mm_set1_epi32(ADD2);
+ __m128i c_4 = _mm_set1_epi32(DCT16_ADD1);
+ __m128i c_512 = _mm_set1_epi32(DCT16_ADD2);
int i;
@@ -201,29 +202,29 @@
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
@@ -233,8 +234,8 @@
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
@@ -244,8 +245,8 @@
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
@@ -255,8 +256,8 @@
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);
@@ -266,8 +267,8 @@
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);
@@ -286,8 +287,8 @@
T63 = _mm_hadd_epi32(T66, T67); \
T60 = _mm_hadd_epi32(T60, T61); \
T61 = _mm_hadd_epi32(T62, T63); \
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1); \
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1); \
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1); \
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1); \
T70 = _mm_packs_epi32(T60, T61); \
_mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
@@ -351,8 +352,8 @@
T40 = _mm_hadd_epi32(T30, T31);
T41 = _mm_hsub_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
- T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), SHIFT2);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
+ T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), DCT16_SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
T41 = _mm_packs_epi32(T41, T41);
_mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
@@ -376,7 +377,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
@@ -398,7 +399,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
@@ -420,7 +421,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
@@ -442,7 +443,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
@@ -464,7 +465,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
@@ -486,7 +487,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
@@ -509,7 +510,7 @@
T31 = _mm_hadd_epi32(T32, T33); \
\
T40 = _mm_hadd_epi32(T30, T31); \
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2); \
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2); \
T40 = _mm_packs_epi32(T40, T40); \
_mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
@@ -523,10 +524,6 @@
MAKE_ODD(28, 15);
#undef MAKE_ODD
}
-#undef SHIFT1
-#undef ADD1
-#undef SHIFT2
-#undef ADD2
}
ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) =
@@ -681,20 +678,9 @@
static void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
{
-#if HIGH_BIT_DEPTH
-#define SHIFT1 6
-#define ADD1 32
-#else
-#define SHIFT1 4
-#define ADD1 8
-#endif
-
-#define SHIFT2 11
-#define ADD2 1024
-
// Const
- __m128i c_8 = _mm_set1_epi32(ADD1);
- __m128i c_1024 = _mm_set1_epi32(ADD2);
+ __m128i c_8 = _mm_set1_epi32(DCT32_ADD1);
+ __m128i c_1024 = _mm_set1_epi32(DCT32_ADD2);
int i;
@@ -839,15 +825,15 @@
T50 = _mm_hadd_epi32(T40, T41);
T51 = _mm_hadd_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[0][i] = T60;
T50 = _mm_hsub_epi32(T40, T41);
T51 = _mm_hsub_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[16][i] = T60;
@@ -867,8 +853,8 @@
T50 = _mm_hadd_epi32(T40, T41);
T51 = _mm_hadd_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[8][i] = T60;
@@ -888,8 +874,8 @@
T50 = _mm_hadd_epi32(T40, T41);
T51 = _mm_hadd_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[24][i] = T60;
@@ -910,8 +896,8 @@
\
T50 = _mm_hadd_epi32(T40, T41); \
T51 = _mm_hadd_epi32(T42, T43); \
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1); \
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1); \
T60 = _mm_packs_epi32(T50, T51); \
im[(dstPos)][i] = T60;
@@ -973,8 +959,8 @@
\
T50 = _mm_hadd_epi32(T50, T51); \
T51 = _mm_hadd_epi32(T52, T53); \
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1); \
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1); \
T60 = _mm_packs_epi32(T50, T51); \
im[(dstPos)][i] = T60;
@@ -1082,7 +1068,7 @@
\
T60 = _mm_hadd_epi32(T60, T61); \
\
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), SHIFT2); \
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), DCT32_SHIFT2); \
T60 = _mm_packs_epi32(T60, T60); \
_mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
@@ -1124,10 +1110,6 @@
MAKE_ODD(158, 159, 160, 161, 31);
#undef MAKE_ODD
}
-#undef SHIFT1
-#undef ADD1
-#undef SHIFT2
-#undef ADD2
}
namespace X265_NS {
diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Thu Jul 09 17:47:46 2015 -0700
+++ b/source/common/x86/const-a.asm Fri Jul 10 15:57:44 2015 -0700
@@ -125,6 +125,7 @@
const pd_2048, times 4 dd 2048
const pd_ffff, times 4 dd 0xffff
const pd_32767, times 4 dd 32767
+const pd_524416, times 4 dd 524416
const pd_n32768, times 8 dd 0xffff8000
const pd_n131072, times 4 dd 0xfffe0000
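The new pd_524416 entry is the 12-bit rounding offset for the sp (intermediate-to-pixel) interpolation path used later in ipfilter16.asm. Assuming the 14-bit internal precision and 6-bit filter precision of the C reference filters, the arithmetic works out as follows (illustrative derivation, not taken from the patch):

    // IF_INTERNAL_OFFS = 1 << (14 - 1) = 8192
    // INTERP_OFFSET_SP = (IF_INTERNAL_OFFS << 6) + (1 << (INTERP_SHIFT_SP - 1))
    //   12-bit: (8192 << 6) + (1 << 7) = 524288 + 128 = 524416  -> pd_524416
    //   10-bit: (8192 << 6) + (1 << 9) = 524288 + 512 = 524800  -> pd_524800 (already present)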
diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Jul 09 17:47:46 2015 -0700
+++ b/source/common/x86/dct8.asm Fri Jul 10 15:57:44 2015 -0700
@@ -332,23 +332,48 @@
cextern pd_2048
cextern pw_ppppmmmm
cextern trans8_shuf
+
+
+%if BIT_DEPTH == 12
+ %define DCT4_SHIFT 5
+ %define DCT4_ROUND 16
+ %define IDCT_SHIFT 8
+ %define IDCT_ROUND 128
+ %define DST4_SHIFT 5
+ %define DST4_ROUND 16
+ %define DCT8_SHIFT1 6
+ %define DCT8_ROUND1 32
+%elif BIT_DEPTH == 10
+ %define DCT4_SHIFT 3
+ %define DCT4_ROUND 4
+ %define IDCT_SHIFT 10
+ %define IDCT_ROUND 512
+ %define DST4_SHIFT 3
+ %define DST4_ROUND 4
+ %define DCT8_SHIFT1 4
+ %define DCT8_ROUND1 8
+%elif BIT_DEPTH == 8
+ %define DCT4_SHIFT 1
+ %define DCT4_ROUND 1
+ %define IDCT_SHIFT 12
+ %define IDCT_ROUND 2048
+ %define DST4_SHIFT 1
+ %define DST4_ROUND 1
+ %define DCT8_SHIFT1 2
+ %define DCT8_ROUND1 2
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+%define DCT8_ROUND2 256
+%define DCT8_SHIFT2 9
+
;------------------------------------------------------
;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
INIT_XMM sse2
cglobal dct4, 3, 4, 8
-%if BIT_DEPTH == 12
- %define DCT_SHIFT 5
- mova m7, [pd_16]
-%elif BIT_DEPTH == 10
- %define DCT_SHIFT 3
- mova m7, [pd_4]
-%elif BIT_DEPTH == 8
- %define DCT_SHIFT 1
- mova m7, [pd_1]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+ mova m7, [pd_ %+ DCT4_ROUND]
add r2d, r2d
lea r3, [tab_dct4]
@@ -375,19 +400,19 @@
psubw m2, m0
pmaddwd m0, m1, m4
paddd m0, m7
- psrad m0, DCT_SHIFT
+ psrad m0, DCT4_SHIFT
pmaddwd m3, m2, m5
paddd m3, m7
- psrad m3, DCT_SHIFT
+ psrad m3, DCT4_SHIFT
packssdw m0, m3
pshufd m0, m0, 0xD8
pshufhw m0, m0, 0xB1
pmaddwd m1, m6
paddd m1, m7
- psrad m1, DCT_SHIFT
+ psrad m1, DCT4_SHIFT
pmaddwd m2, [r3 + 3 * 16]
paddd m2, m7
- psrad m2, DCT_SHIFT
+ psrad m2, DCT4_SHIFT
packssdw m1, m2
pshufd m1, m1, 0xD8
pshufhw m1, m1, 0xB1
@@ -434,18 +459,7 @@
; - r2: source stride
INIT_YMM avx2
cglobal dct4, 3, 4, 8, src, dst, srcStride
-%if BIT_DEPTH == 12
- %define DCT_SHIFT 5
- vbroadcasti128 m7, [pd_16]
-%elif BIT_DEPTH == 10
- %define DCT_SHIFT 3
- vbroadcasti128 m7, [pd_4]
-%elif BIT_DEPTH == 8
- %define DCT_SHIFT 1
- vbroadcasti128 m7, [pd_1]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+ vbroadcasti128 m7, [pd_ %+ DCT4_ROUND]
add r2d, r2d
lea r3, [avx2_dct4]
@@ -467,11 +481,11 @@
pmaddwd m2, m5
paddd m2, m7
- psrad m2, DCT_SHIFT
+ psrad m2, DCT4_SHIFT
pmaddwd m0, m6
paddd m0, m7
- psrad m0, DCT_SHIFT
+ psrad m0, DCT4_SHIFT
packssdw m2, m0
pshufb m2, m4
@@ -499,33 +513,19 @@
;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM sse2
-cglobal idct4, 3, 4, 7
-%if BIT_DEPTH == 12
- %define IDCT4_OFFSET [pd_128]
- %define IDCT4_SHIFT 8
-%elif BIT_DEPTH == 10
- %define IDCT4_OFFSET [pd_512]
- %define IDCT4_SHIFT 10
-%elif BIT_DEPTH == 8
- %define IDCT4_OFFSET [pd_2048]
- %define IDCT4_SHIFT 12
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+cglobal idct4, 3, 4, 6
add r2d, r2d
lea r3, [tab_dct4]
- mova m6, [pd_64]
-
movu m0, [r0 + 0 * 16]
movu m1, [r0 + 1 * 16]
punpcklwd m2, m0, m1
pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1
- paddd m3, m6
+ paddd m3, [pd_64]
pmaddwd m2, [r3 + 2 * 16] ; m2 = E2
- paddd m2, m6
+ paddd m2, [pd_64]
punpckhwd m0, m1
pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
@@ -549,29 +549,27 @@
punpcklwd m0, m1, m4 ; m0 = m128iA
punpckhwd m1, m4 ; m1 = m128iD
- mova m6, IDCT4_OFFSET
-
punpcklwd m2, m0, m1
pmaddwd m3, m2, [r3 + 0 * 16]
- paddd m3, m6 ; m3 = E1
+ paddd m3, [pd_ %+ IDCT_ROUND] ; m3 = E1
pmaddwd m2, [r3 + 2 * 16]
- paddd m2, m6 ; m2 = E2
+ paddd m2, [pd_ %+ IDCT_ROUND] ; m2 = E2
punpckhwd m0, m1
pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1
pmaddwd m0, [r3 + 3 * 16] ; m0 = O2
paddd m4, m3, m1
- psrad m4, IDCT4_SHIFT ; m4 = m128iA
+ psrad m4, IDCT_SHIFT ; m4 = m128iA
paddd m5, m2, m0
- psrad m5, IDCT4_SHIFT
+ psrad m5, IDCT_SHIFT
packssdw m4, m5 ; m4 = m128iA
psubd m2, m0
- psrad m2, IDCT4_SHIFT
+ psrad m2, IDCT_SHIFT
psubd m3, m1
- psrad m3, IDCT4_SHIFT
+ psrad m3, IDCT_SHIFT
packssdw m2, m3 ; m2 = m128iD
punpcklwd m1, m4, m2
@@ -585,7 +583,6 @@
movlps [r1 + 2 * r2], m1
lea r1, [r1 + 2 * r2]
movhps [r1 + r2], m1
-
RET
;------------------------------------------------------
@@ -606,18 +603,7 @@
%define coef3 [r3 + 3 * 16]
%endif ; ARCH_X86_64
-%if BIT_DEPTH == 12
- %define DST_SHIFT 5
- mova m5, [pd_16]
-%elif BIT_DEPTH == 10
- %define DST_SHIFT 3
- mova m5, [pd_4]
-%elif BIT_DEPTH == 8
- %define DST_SHIFT 1
- mova m5, [pd_1]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+ mova m5, [pd_ %+ DST4_ROUND]
add r2d, r2d
lea r3, [tab_dst4]
%if ARCH_X86_64
@@ -641,7 +627,7 @@
pshufd m3, m3, q3120
punpcklqdq m2, m3
paddd m2, m5
- psrad m2, DST_SHIFT
+ psrad m2, DST4_SHIFT
pmaddwd m3, m0, coef1
pmaddwd m4, m1, coef1
pshufd m6, m4, q2301
@@ -652,7 +638,7 @@
pshufd m3, m3, q3120
punpcklqdq m3, m4
paddd m3, m5
- psrad m3, DST_SHIFT
+ psrad m3, DST4_SHIFT
packssdw m2, m3 ; m2 = T70
pmaddwd m3, m0, coef2
pmaddwd m4, m1, coef2
@@ -664,7 +650,7 @@
pshufd m3, m3, q3120
punpcklqdq m3, m4
paddd m3, m5
- psrad m3, DST_SHIFT
+ psrad m3, DST4_SHIFT
pmaddwd m0, coef3
pmaddwd m1, coef3
pshufd m6, m0, q2301
@@ -675,7 +661,7 @@
pshufd m1, m1, q3120
punpcklqdq m0, m1
paddd m0, m5
- psrad m0, DST_SHIFT
+ psrad m0, DST4_SHIFT
packssdw m3, m0 ; m3 = T71
mova m5, [pd_128]
@@ -730,7 +716,6 @@
psrad m2, 8
packssdw m0, m2
movu [r1 + 1 * 16], m0
-
RET
;------------------------------------------------------
@@ -749,13 +734,7 @@
%define coef0 m6
%define coef1 m7
-%if BIT_DEPTH == 8
- %define DST_SHIFT 1
- mova m5, [pd_1]
-%elif BIT_DEPTH == 10
- %define DST_SHIFT 3
- mova m5, [pd_4]
-%endif
+ mova m5, [pd_ %+ DST4_ROUND]
add r2d, r2d
lea r3, [tab_dst4]
mova coef0, [r3 + 0 * 16]
@@ -775,23 +754,23 @@
pmaddwd m3, m1, coef0
phaddd m2, m3
paddd m2, m5
- psrad m2, DST_SHIFT
+ psrad m2, DST4_SHIFT
pmaddwd m3, m0, coef1
pmaddwd m4, m1, coef1
phaddd m3, m4
paddd m3, m5
- psrad m3, DST_SHIFT
+ psrad m3, DST4_SHIFT
packssdw m2, m3 ; m2 = T70
pmaddwd m3, m0, coef2
pmaddwd m4, m1, coef2
phaddd m3, m4
paddd m3, m5
- psrad m3, DST_SHIFT
+ psrad m3, DST4_SHIFT
pmaddwd m0, coef3
pmaddwd m1, coef3
phaddd m0, m1
paddd m0, m5
- psrad m0, DST_SHIFT
+ psrad m0, DST4_SHIFT
packssdw m3, m0 ; m3 = T71
mova m5, [pd_128]
@@ -822,7 +801,6 @@
psrad m2, 8
packssdw m0, m2
movu [r1 + 1 * 16], m0
-
RET
;------------------------------------------------------------------
@@ -830,13 +808,7 @@
;------------------------------------------------------------------
INIT_YMM avx2
cglobal dst4, 3, 4, 6
-%if BIT_DEPTH == 8
- %define DST_SHIFT 1
- vpbroadcastd m5, [pd_1]
-%elif BIT_DEPTH == 10
- %define DST_SHIFT 3
- vpbroadcastd m5, [pd_4]
-%endif
+ vbroadcasti128 m5, [pd_ %+ DST4_ROUND]
mova m4, [trans8_shuf]
add r2d, r2d
lea r3, [pw_dst4_tab]
@@ -853,12 +825,12 @@
pmaddwd m1, m0, [r3 + 1 * 32]
phaddd m2, m1
paddd m2, m5
- psrad m2, DST_SHIFT
+ psrad m2, DST4_SHIFT
pmaddwd m3, m0, [r3 + 2 * 32]
pmaddwd m1, m0, [r3 + 3 * 32]
phaddd m3, m1
paddd m3, m5
- psrad m3, DST_SHIFT
+ psrad m3, DST4_SHIFT
packssdw m2, m3
vpermd m2, m4, m2
@@ -883,18 +855,7 @@
;-------------------------------------------------------
INIT_XMM sse2
cglobal idst4, 3, 4, 7
-%if BIT_DEPTH == 12
- mova m6, [pd_128]
- %define IDCT4_SHIFT 8
-%elif BIT_DEPTH == 10
- mova m6, [pd_512]
- %define IDCT4_SHIFT 10
-%elif BIT_DEPTH == 8
- mova m6, [pd_2048]
- %define IDCT4_SHIFT 12
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+ mova m6, [pd_ %+ IDCT_ROUND]
add r2d, r2d
lea r3, [tab_idst4]
mova m5, [pd_64]
@@ -942,23 +903,23 @@
pmaddwd m3, m2, [r3 + 1 * 16]
paddd m0, m3
paddd m0, m6
- psrad m0, IDCT4_SHIFT ; m0 = S0
+ psrad m0, IDCT_SHIFT ; m0 = S0
pmaddwd m3, m1, [r3 + 2 * 16]
pmaddwd m4, m2, [r3 + 3 * 16]
paddd m3, m4
paddd m3, m6
- psrad m3, IDCT4_SHIFT ; m3 = S8
+ psrad m3, IDCT_SHIFT ; m3 = S8
packssdw m0, m3 ; m0 = m128iA
pmaddwd m3, m1, [r3 + 4 * 16]
pmaddwd m4, m2, [r3 + 5 * 16]
paddd m3, m4
paddd m3, m6
- psrad m3, IDCT4_SHIFT ; m3 = S0
+ psrad m3, IDCT_SHIFT ; m3 = S0
pmaddwd m1, [r3 + 6 * 16]
pmaddwd m2, [r3 + 7 * 16]
paddd m1, m2
paddd m1, m6
- psrad m1, IDCT4_SHIFT ; m1 = S8
+ psrad m1, IDCT_SHIFT ; m1 = S8
packssdw m3, m1 ; m3 = m128iD
punpcklwd m1, m0, m3
punpckhwd m0, m3
@@ -978,18 +939,7 @@
;-----------------------------------------------------------------
INIT_YMM avx2
cglobal idst4, 3, 4, 6
-%if BIT_DEPTH == 12
- vpbroadcastd m4, [pd_256]
- %define IDCT4_SHIFT 8
-%elif BIT_DEPTH == 10
- vpbroadcastd m4, [pd_512]
- %define IDCT4_SHIFT 10
-%elif BIT_DEPTH == 8
- vpbroadcastd m4, [pd_2048]
- %define IDCT4_SHIFT 12
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+ vbroadcasti128 m4, [pd_ %+ IDCT_ROUND]
add r2d, r2d
lea r3, [pw_idst4_tab]
@@ -1030,12 +980,12 @@
pmaddwd m3, m2, [r3 + 1 * 32]
paddd m0, m3
paddd m0, m4
- psrad m0, IDCT4_SHIFT
+ psrad m0, IDCT_SHIFT
pmaddwd m3, m1, [r3 + 2 * 32]
pmaddwd m2, m2, [r3 + 3 * 32]
paddd m3, m2
paddd m3, m4
- psrad m3, IDCT4_SHIFT
+ psrad m3, IDCT_SHIFT
packssdw m0, m3
pshufb m1, m0, [pb_idst4_shuf]
@@ -1066,20 +1016,6 @@
; ...
; Row6[4-7] Row7[4-7]
;------------------------
-%if BIT_DEPTH == 12
- %define DCT_SHIFT1 6
- %define DCT_ADD1 [pd_32]
-%elif BIT_DEPTH == 10
- %define DCT_SHIFT1 4
- %define DCT_ADD1 [pd_8]
-%elif BIT_DEPTH == 8
- %define DCT_SHIFT1 2
- %define DCT_ADD1 [pd_2]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-%define DCT_ADD2 [pd_256]
-%define DCT_SHIFT2 9
add r2, r2
lea r3, [r2 * 3]
@@ -1125,8 +1061,8 @@
punpckhqdq m7, m5
punpcklqdq m1, m5
paddd m1, m7
- paddd m1, DCT_ADD1
- psrad m1, DCT_SHIFT1
+ paddd m1, [pd_ %+ DCT8_ROUND1]
+ psrad m1, DCT8_SHIFT1
%if x == 1
pshufd m1, m1, 0x1B
%endif
@@ -1140,8 +1076,8 @@
punpckhqdq m7, m5
punpcklqdq m1, m5
paddd m1, m7
- paddd m1, DCT_ADD1
- psrad m1, DCT_SHIFT1
+ paddd m1, [pd_ %+ DCT8_ROUND1]
+ psrad m1, DCT8_SHIFT1
%if x == 1
pshufd m1, m1, 0x1B
%endif
@@ -1155,8 +1091,8 @@
punpckhqdq m7, m5
punpcklqdq m1, m5
paddd m1, m7
- paddd m1, DCT_ADD1
- psrad m1, DCT_SHIFT1
+ paddd m1, [pd_ %+ DCT8_ROUND1]
+ psrad m1, DCT8_SHIFT1
%if x == 1
pshufd m1, m1, 0x1B
%endif
@@ -1170,8 +1106,8 @@
punpckhqdq m7, m0
punpcklqdq m4, m0
paddd m4, m7
- paddd m4, DCT_ADD1
- psrad m4, DCT_SHIFT1
+ paddd m4, [pd_ %+ DCT8_ROUND1]
+ psrad m4, DCT8_SHIFT1
%if x == 1
pshufd m4, m4, 0x1B
%endif
@@ -1189,29 +1125,29 @@
pshuflw m2, m2, 0xD8
pshufhw m2, m2, 0xD8
pmaddwd m3, m0, [r4 + 0*16]
- paddd m3, DCT_ADD1
- psrad m3, DCT_SHIFT1
+ paddd m3, [pd_ %+ DCT8_ROUND1]
+ psrad m3, DCT8_SHIFT1
%if x == 1
pshufd m3, m3, 0x1B
%endif
mova [r5 + 0*2*mmsize], m3 ; Row 0
pmaddwd m0, [r4 + 2*16]
- paddd m0, DCT_ADD1
- psrad m0, DCT_SHIFT1
+ paddd m0, [pd_ %+ DCT8_ROUND1]
+ psrad m0, DCT8_SHIFT1
%if x == 1
pshufd m0, m0, 0x1B
%endif
mova [r5 + 4*2*mmsize], m0 ; Row 4
pmaddwd m3, m2, [r4 + 1*16]
- paddd m3, DCT_ADD1
- psrad m3, DCT_SHIFT1
+ paddd m3, [pd_ %+ DCT8_ROUND1]
+ psrad m3, DCT8_SHIFT1
%if x == 1
pshufd m3, m3, 0x1B
%endif
mova [r5 + 2*2*mmsize], m3 ; Row 2
pmaddwd m2, [r4 + 3*16]
- paddd m2, DCT_ADD1
- psrad m2, DCT_SHIFT1
+ paddd m2, [pd_ %+ DCT8_ROUND1]
+ psrad m2, DCT8_SHIFT1
%if x == 1
pshufd m2, m2, 0x1B
%endif
@@ -1271,16 +1207,16 @@
punpckhqdq m7, m5
punpcklqdq m3, m5
paddd m3, m7 ; m3 = [Row2 Row0]
- paddd m3, DCT_ADD2
- psrad m3, DCT_SHIFT2
+ paddd m3, [pd_ %+ DCT8_ROUND2]
+ psrad m3, DCT8_SHIFT2
pshufd m4, m4, 0xD8
pshufd m2, m2, 0xD8
mova m7, m4
punpckhqdq m7, m2
punpcklqdq m4, m2
psubd m4, m7 ; m4 = [Row6 Row4]
- paddd m4, DCT_ADD2
- psrad m4, DCT_SHIFT2
+ paddd m4, [pd_ %+ DCT8_ROUND2]
+ psrad m4, DCT8_SHIFT2
packssdw m3, m3
movd [r1 + 0*mmsize], m3
@@ -1341,8 +1277,8 @@
punpckhqdq m7, m4
punpcklqdq m2, m4
paddd m2, m7 ; m2 = [Row3 Row1]
- paddd m2, DCT_ADD2
- psrad m2, DCT_SHIFT2
+ paddd m2, [pd_ %+ DCT8_ROUND2]
+ psrad m2, DCT8_SHIFT2
packssdw m2, m2
movd [r1 + 1*mmsize], m2
@@ -1397,8 +1333,8 @@
punpckhqdq m7, m4
punpcklqdq m2, m4
paddd m2, m7 ; m2 = [Row7 Row5]
- paddd m2, DCT_ADD2
- psrad m2, DCT_SHIFT2
+ paddd m2, [pd_ %+ DCT8_ROUND2]
+ psrad m2, DCT8_SHIFT2
packssdw m2, m2
movd [r1 + 5*mmsize], m2
@@ -1412,10 +1348,6 @@
%endrep
RET
-%undef IDCT_SHIFT1
-%undef IDCT_ADD1
-%undef IDCT_SHIFT2
-%undef IDCT_ADD2
;-------------------------------------------------------
; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
@@ -1432,18 +1364,7 @@
; ...
; Row6[4-7] Row7[4-7]
;------------------------
-%if BIT_DEPTH == 12
- %define DCT_SHIFT 6
- mova m6, [pd_16]
-%elif BIT_DEPTH == 10
- %define DCT_SHIFT 4
- mova m6, [pd_8]
-%elif BIT_DEPTH == 8
- %define DCT_SHIFT 2
- mova m6, [pd_2]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+ mova m6, [pd_ %+ DCT8_ROUND1]
add r2, r2
lea r3, [r2 * 3]
@@ -1485,7 +1406,7 @@
pmaddwd m5, m0, [r4 + 0*16]
phaddd m1, m5
paddd m1, m6
- psrad m1, DCT_SHIFT
+ psrad m1, DCT8_SHIFT1
%if x == 1
pshufd m1, m1, 0x1B
%endif
@@ -1495,7 +1416,7 @@
pmaddwd m5, m0, [r4 + 1*16]
phaddd m1, m5
paddd m1, m6
- psrad m1, DCT_SHIFT
+ psrad m1, DCT8_SHIFT1
%if x == 1
pshufd m1, m1, 0x1B
%endif
@@ -1505,7 +1426,7 @@
pmaddwd m5, m0, [r4 + 2*16]
phaddd m1, m5
paddd m1, m6
- psrad m1, DCT_SHIFT
+ psrad m1, DCT8_SHIFT1
%if x == 1
pshufd m1, m1, 0x1B
%endif
@@ -1515,7 +1436,7 @@
pmaddwd m0, [r4 + 3*16]
phaddd m4, m0
paddd m4, m6
- psrad m4, DCT_SHIFT
+ psrad m4, DCT8_SHIFT1
%if x == 1
pshufd m4, m4, 0x1B
%endif
@@ -1530,28 +1451,28 @@
pshufb m2, [pb_unpackhlw1]
pmaddwd m3, m0, [r4 + 0*16]
paddd m3, m6
- psrad m3, DCT_SHIFT
+ psrad m3, DCT8_SHIFT1
%if x == 1
pshufd m3, m3, 0x1B
%endif
mova [r5 + 0*2*mmsize], m3 ; Row 0
pmaddwd m0, [r4 + 2*16]
paddd m0, m6
- psrad m0, DCT_SHIFT
+ psrad m0, DCT8_SHIFT1
%if x == 1
pshufd m0, m0, 0x1B
%endif
mova [r5 + 4*2*mmsize], m0 ; Row 4
pmaddwd m3, m2, [r4 + 1*16]
paddd m3, m6
- psrad m3, DCT_SHIFT
+ psrad m3, DCT8_SHIFT1
%if x == 1
pshufd m3, m3, 0x1B
%endif
mova [r5 + 2*2*mmsize], m3 ; Row 2
pmaddwd m2, [r4 + 3*16]
paddd m2, m6
- psrad m2, DCT_SHIFT
+ psrad m2, DCT8_SHIFT1
%if x == 1
pshufd m2, m2, 0x1B
%endif
@@ -1649,19 +1570,6 @@
;-------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse2
-%if BIT_DEPTH == 12
- %define IDCT_SHIFT 8
- %define IDCT_ADD pd_128
-%elif BIT_DEPTH == 10
- %define IDCT_SHIFT 10
- %define IDCT_ADD pd_512
-%elif BIT_DEPTH == 8
- %define IDCT_SHIFT 12
- %define IDCT_ADD pd_2048
-%else
- %error Unsupported BIT_DEPTH!
-%endif
-
cglobal idct8, 3, 6, 16, 0-5*mmsize
mova m9, [r0 + 1 * mmsize]
mova m1, [r0 + 3 * mmsize]
@@ -1911,18 +1819,19 @@
psubd m10, m2
mova m2, m4
pmaddwd m12, [tab_dct4 + 3 * mmsize]
- paddd m0, [IDCT_ADD]
- paddd m1, [IDCT_ADD]
- paddd m8, [IDCT_ADD]
- paddd m10, [IDCT_ADD]
+ mova m15, [pd_ %+ IDCT_ROUND]
+ paddd m0, m15
+ paddd m1, m15
+ paddd m8, m15
+ paddd m10, m15
paddd m2, m13
paddd m3, m12
- paddd m2, [IDCT_ADD]
- paddd m3, [IDCT_ADD]
+ paddd m2, m15
+ paddd m3, m15
psubd m4, m13
psubd m6, m12
- paddd m4, [IDCT_ADD]
- paddd m6, [IDCT_ADD]
+ paddd m4, m15
+ paddd m6, m15
mova m15, [rsp + 4 * mmsize]
mova m12, m8
psubd m8, m7
@@ -2018,16 +1927,12 @@
movq [r1 + r3 * 2 + 8], m8
movhps [r1 + r0 + 8], m8
RET
-
-%undef IDCT_SHIFT
-%undef IDCT_ADD
%endif
;-------------------------------------------------------
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM ssse3
-
cglobal patial_butterfly_inverse_internal_pass1
movh m0, [r0]
movhps m0, [r0 + 2 * 16]
@@ -2119,15 +2024,6 @@
ret
%macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
-%if BIT_DEPTH == 12
- %define IDCT_SHIFT 8
-%elif BIT_DEPTH == 10
- %define IDCT_SHIFT 10
-%elif BIT_DEPTH == 8
- %define IDCT_SHIFT 12
-%else
- %error Unsupported BIT_DEPTH!
-%endif
pshufb m4, %1, [pb_idct8even]
pmaddwd m4, [tab_idct8_1]
phsubd m5, m4
@@ -2149,11 +2045,10 @@
pshufd m4, m4, 0x1B
packssdw %1, m4
-%undef IDCT_SHIFT
%endmacro
+INIT_XMM ssse3
cglobal patial_butterfly_inverse_internal_pass2
-
mova m0, [r5]
PARTIAL_BUTTERFLY_PROCESS_ROW m0
movu [r1], m0
@@ -2169,9 +2064,9 @@
mova m3, [r5 + 48]
PARTIAL_BUTTERFLY_PROCESS_ROW m3
movu [r1 + r3], m3
-
ret
+INIT_XMM ssse3
cglobal idct8, 3,7,8 ;,0-16*mmsize
; alignment stack to 64-bytes
mov r5, rsp
@@ -2190,15 +2085,7 @@
call patial_butterfly_inverse_internal_pass1
-%if BIT_DEPTH == 12
- mova m6, [pd_256]
-%elif BIT_DEPTH == 10
- mova m6, [pd_512]
-%elif BIT_DEPTH == 8
- mova m6, [pd_2048]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+ mova m6, [pd_ %+ IDCT_ROUND]
add r2, r2
lea r3, [r2 * 3]
lea r4, [tab_idct8_2]
diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Thu Jul 09 17:47:46 2015 -0700
+++ b/source/common/x86/intrapred16.asm Fri Jul 10 15:57:44 2015 -0700
@@ -109,11 +109,11 @@
cextern pw_16
cextern pw_31
cextern pw_32
-cextern pw_1023
cextern pd_16
cextern pd_31
cextern pd_32
cextern pw_4096
+cextern pw_pixel_max
cextern multiL
cextern multiH
cextern multiH2
@@ -1228,11 +1228,11 @@
punpcklwd m0, m0 ;[4 4 3 3 2 2 1 1]
pshufd m1, m0, 0xFA
- add r1, r1
+ add r1d, r1d
pshufd m0, m0, 0x50
movhps [r0 + r1], m0
movh [r0 + r1 * 2], m1
- lea r1, [r1 * 3]
+ lea r1d, [r1 * 3]
movhps [r0 + r1], m1
cmp r4m, byte 0
@@ -1247,7 +1247,7 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
.quit:
movh [r0], m0
RET
@@ -1583,7 +1583,7 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
movh r2, m0
mov [r0], r2w
@@ -2756,7 +2756,7 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
.quit:
movh [r0], m0
RET
@@ -2785,7 +2785,7 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
pextrw [r0], m0, 0
pextrw [r0 + r1], m0, 1
@@ -4002,7 +4002,7 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
.quit:
movu [r0], m0
RET
@@ -5874,7 +5874,7 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
pextrw [r0], m0, 0
pextrw [r0 + r1], m0, 1
pextrw [r0 + r1 * 2], m0, 2
@@ -10287,9 +10287,9 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
pmaxsw m3, m1
- pminsw m3, [pw_1023]
+ pminsw m3, [pw_pixel_max]
.quit:
movu [r0], m0
movu [r0 + 16], m3
@@ -10359,9 +10359,9 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
pmaxsw m3, m1
- pminsw m3, [pw_1023]
+ pminsw m3, [pw_pixel_max]
pextrw [r0], m0, 0
pextrw [r0 + r1], m0, 1
pextrw [r0 + r1 * 2], m0, 2
@@ -12952,7 +12952,7 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
.quit:
movu [r0], m0
RET
@@ -12999,7 +12999,7 @@
paddw m0, m1
pxor m1, m1
pmaxsw m0, m1
- pminsw m0, [pw_1023]
+ pminsw m0, [pw_pixel_max]
pextrw [r0], xm0, 0
pextrw [r0 + r1], xm0, 1
pextrw [r0 + r1 * 2], xm0, 2
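The intrapred16.asm hunks above only swap the hard-coded 10-bit clamp for the generic one: assuming the usual const-a.asm definition, pw_pixel_max = (1 << BIT_DEPTH) - 1, i.e. 1023 at 10-bit and 4095 at 12-bit, so the pmaxsw/pminsw pair clips filtered predictions to the valid pixel range at either depth.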
diff -r 7b3e1372bb28 -r 8f60362f8555 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Thu Jul 09 17:47:46 2015 -0700
+++ b/source/common/x86/ipfilter16.asm Fri Jul 10 15:57:44 2015 -0700
@@ -53,7 +53,7 @@
times 8 dw -4, 54
times 8 dw 16, -2
- times 8 dw -6, 46
+ times 8 dw -6, 46
times 8 dw 28, -4
times 8 dw -4, 36
@@ -147,15 +147,22 @@
%if BIT_DEPTH == 10
%define INTERP_OFFSET_PS pd_n32768
+ %define INTERP_SHIFT_PS 2
+ %define INTERP_OFFSET_SP pd_524800
+ %define INTERP_SHIFT_SP 10
%elif BIT_DEPTH == 12
%define INTERP_OFFSET_PS pd_n131072
-%else
-%error Unsupport bit depth!
+ %define INTERP_SHIFT_PS 4
+ %define INTERP_OFFSET_SP pd_524416
+ %define INTERP_SHIFT_SP 8
+%else
+ %error Unsupported bit depth!
%endif
SECTION .text
cextern pd_32
cextern pw_pixel_max
+cextern pd_524416
cextern pd_n32768
cextern pd_n131072
cextern pw_2000
@@ -644,8 +651,8 @@
packssdw m3, m5
CLIPW m3, m7, m6
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movd [r2 + %1], m3
@@ -682,8 +689,8 @@
pshufd m5, m5, q3120
paddd m5, m1
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
movd [r2 + %1], m3
@@ -729,8 +736,8 @@
packssdw m3, m5
CLIPW m3, m7, m6
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movh [r2 + %1], m3
@@ -753,7 +760,7 @@
punpcklqdq m3, m4
paddd m3, m1
- psrad m3, 2
+ psrad m3, INTERP_SHIFT_PS
packssdw m3, m3
movh [r2 + r3 * 2 + %1], m3
%endmacro
@@ -794,8 +801,8 @@
packssdw m3, m5
CLIPW m3, m7, m6
%else
- psrad m3, 2
- psrad m5, 2
+ psrad m3, INTERP_SHIFT_PS
+ psrad m5, INTERP_SHIFT_PS
packssdw m3, m5
%endif
movdqu [r2 + %1], m3
@@ -905,7 +912,7 @@
%endif ;z < y
%endrep
-RET
+ RET
%endmacro
;-----------------------------------------------------------------------------
@@ -1183,7 +1190,7 @@
mova m0, [tab_LumaCoeff + r4]
%endif
-%ifidn %3, pp
+%ifidn %3, pp
mova m1, [pd_32]
pxor m6, m6
mova m7, [pw_pixel_max]
@@ -1270,7 +1277,7 @@
mova m0, [tab_LumaCoeff + r4]
%endif
-%ifidn %3, pp
+%ifidn %3, pp
mova m1, [pd_32]
pxor m7, m7
%else
@@ -1316,7 +1323,7 @@
phaddd m6, m3
phaddd m5, m6
paddd m5, m1
-%ifidn %3, pp
+%ifidn %3, pp
psrad m4, 6
psrad m5, 6
packusdw m4, m5
@@ -1372,7 +1379,7 @@
%else
mova m0, [tab_LumaCoeff + r4]
%endif
-%ifidn %3, pp
+%ifidn %3, pp
mova m1, [pd_32]
%else
mova m1, [INTERP_OFFSET_PS]
@@ -1417,131 +1424,6 @@
phaddd m6, m7
phaddd m5, m6
paddd m5, m1
-%ifidn %3, pp
- psrad m4, 6
- psrad m5, 6
- packusdw m4, m5
- pxor m5, m5
- CLIPW m4, m5, [pw_pixel_max]
-%else
- psrad m4, 2
- psrad m5, 2
- packssdw m4, m5
-%endif
-
- movu [r2], m4
-
- movu m2, [r0 + 32] ; m2 = src[16-23]
-
- pmaddwd m4, m3, m0 ; m3 = src[8-15]
- palignr m5, m2, m3, 2 ; m5 = src[9-16]
- pmaddwd m5, m0
- phaddd m4, m5
-
- palignr m5, m2, m3, 4 ; m5 = src[10-17]
- pmaddwd m5, m0
- palignr m2, m3, 6 ; m2 = src[11-18]
- pmaddwd m2, m0
- phaddd m5, m2
- phaddd m4, m5
- paddd m4, m1
-%ifidn %3, pp
- psrad m4, 6
- packusdw m4, m4
- pxor m5, m5
- CLIPW m4, m5, [pw_pixel_max]
-%else
- psrad m4, 2
- packssdw m4, m4
-%endif
-
- movh [r2 + 16], m4
-
- add r0, r1
- add r2, r3
-
- dec r4d
- jnz .loopH
- RET
-%endmacro
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
-;-------------------------------------------------------------------------------------------------------------
-FILTER_HOR_LUMA_W12 12, 16, pp
-
-;----------------------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;----------------------------------------------------------------------------------------------------------------------------
-FILTER_HOR_LUMA_W12 12, 16, ps
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro FILTER_HOR_LUMA_W16 3
-INIT_XMM sse4
-cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
-
- add r1, r1
- add r3, r3
- mov r4d, r4m
- sub r0, 6
- shl r4d, 4
-
-%ifdef PIC
- lea r6, [tab_LumaCoeff]
- mova m0, [r6 + r4]
-%else
- mova m0, [tab_LumaCoeff + r4]
-%endif
-
-%ifidn %3, pp
- mova m1, [pd_32]
-%else
- mova m1, [INTERP_OFFSET_PS]
-%endif
-
- mov r4d, %2
-%ifidn %3, ps
- cmp r5m, byte 0
- je .loopH
- lea r6, [r1 + 2 * r1]
- sub r0, r6
- add r4d, 7
-%endif
-
-.loopH:
-%assign x 0
-%rep %1 / 16
- movu m2, [r0 + x] ; m2 = src[0-7]
- movu m3, [r0 + 16 + x] ; m3 = src[8-15]
-
- pmaddwd m4, m2, m0
- palignr m5, m3, m2, 2 ; m5 = src[1-8]
- pmaddwd m5, m0
- phaddd m4, m5
-
- palignr m5, m3, m2, 4 ; m5 = src[2-9]
- pmaddwd m5, m0
- palignr m6, m3, m2, 6 ; m6 = src[3-10]
- pmaddwd m6, m0
- phaddd m5, m6
- phaddd m4, m5
- paddd m4, m1
-
- palignr m5, m3, m2, 8 ; m5 = src[4-11]
- pmaddwd m5, m0
- palignr m6, m3, m2, 10 ; m6 = src[5-12]
- pmaddwd m6, m0
- phaddd m5, m6
-
- palignr m6, m3, m2, 12 ; m6 = src[6-13]
- pmaddwd m6, m0
- palignr m7, m3, m2, 14 ; m2 = src[7-14]
- pmaddwd m7, m0
- phaddd m6, m7
- phaddd m5, m6
- paddd m5, m1
%ifidn %3, pp
psrad m4, 6
psrad m5, 6
@@ -1553,6 +1435,131 @@
psrad m5, 2
packssdw m4, m5
%endif
+
+ movu [r2], m4
+
+ movu m2, [r0 + 32] ; m2 = src[16-23]
+
+ pmaddwd m4, m3, m0 ; m3 = src[8-15]
+ palignr m5, m2, m3, 2 ; m5 = src[9-16]
+ pmaddwd m5, m0
+ phaddd m4, m5
+
+ palignr m5, m2, m3, 4 ; m5 = src[10-17]
+ pmaddwd m5, m0
+ palignr m2, m3, 6 ; m2 = src[11-18]
+ pmaddwd m2, m0
+ phaddd m5, m2
+ phaddd m4, m5
+ paddd m4, m1
+%ifidn %3, pp
+ psrad m4, 6
+ packusdw m4, m4
+ pxor m5, m5
+ CLIPW m4, m5, [pw_pixel_max]
+%else
+ psrad m4, 2
+ packssdw m4, m4
+%endif
+
+ movh [r2 + 16], m4
+
+ add r0, r1
+ add r2, r3
+
+ dec r4d
+ jnz .loopH
+ RET
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
+;-------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W12 12, 16, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W12 12, 16, ps
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W16 3
+INIT_XMM sse4
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
+
+ add r1, r1
+ add r3, r3
+ mov r4d, r4m
+ sub r0, 6
+ shl r4d, 4
+
+%ifdef PIC
+ lea r6, [tab_LumaCoeff]
+ mova m0, [r6 + r4]
+%else
+ mova m0, [tab_LumaCoeff + r4]
+%endif
+
+%ifidn %3, pp
+ mova m1, [pd_32]
+%else
+ mova m1, [INTERP_OFFSET_PS]
+%endif
+
+ mov r4d, %2
+%ifidn %3, ps
+ cmp r5m, byte 0
+ je .loopH
+ lea r6, [r1 + 2 * r1]
+ sub r0, r6
+ add r4d, 7
+%endif
+
+.loopH:
+%assign x 0
+%rep %1 / 16
+ movu m2, [r0 + x] ; m2 = src[0-7]
+ movu m3, [r0 + 16 + x] ; m3 = src[8-15]
+
+ pmaddwd m4, m2, m0
+ palignr m5, m3, m2, 2 ; m5 = src[1-8]
+ pmaddwd m5, m0
+ phaddd m4, m5
+
+ palignr m5, m3, m2, 4 ; m5 = src[2-9]
+ pmaddwd m5, m0
+ palignr m6, m3, m2, 6 ; m6 = src[3-10]
+ pmaddwd m6, m0
+ phaddd m5, m6
+ phaddd m4, m5
+ paddd m4, m1
+
+ palignr m5, m3, m2, 8 ; m5 = src[4-11]
+ pmaddwd m5, m0
+ palignr m6, m3, m2, 10 ; m6 = src[5-12]
+ pmaddwd m6, m0
+ phaddd m5, m6
+
+ palignr m6, m3, m2, 12 ; m6 = src[6-13]
+ pmaddwd m6, m0
+ palignr m7, m3, m2, 14 ; m2 = src[7-14]
+ pmaddwd m7, m0
+ phaddd m6, m7
+ phaddd m5, m6
+ paddd m5, m1
+%ifidn %3, pp
+ psrad m4, 6
+ psrad m5, 6
+ packusdw m4, m5
+ pxor m5, m5
+ CLIPW m4, m5, [pw_pixel_max]
+%else
+ psrad m4, 2
+ psrad m5, 2
+ packssdw m4, m5
+%endif
movu [r2 + x], m4
movu m2, [r0 + 32 + x] ; m2 = src[16-23]
@@ -1583,7 +1590,7 @@
phaddd m6, m2
phaddd m5, m6
paddd m5, m1
-%ifidn %3, pp
+%ifidn %3, pp
psrad m4, 6
psrad m5, 6
packusdw m4, m5
@@ -1690,7 +1697,7 @@
%else
mova m0, [tab_LumaCoeff + r4]
%endif
-%ifidn %3, pp
+%ifidn %3, pp
mova m1, [pd_32]
%else
mova m1, [INTERP_OFFSET_PS]
@@ -1735,7 +1742,7 @@
phaddd m6, m7
phaddd m5, m6
paddd m5, m1
-%ifidn %3, pp
+%ifidn %3, pp
psrad m4, 6
psrad m5, 6
packusdw m4, m5
@@ -1776,7 +1783,7 @@
phaddd m6, m7
phaddd m5, m6
paddd m5, m1
-%ifidn %3, pp
+%ifidn %3, pp
psrad m4, 6
psrad m5, 6
packusdw m4, m5
@@ -1817,7 +1824,7 @@
phaddd m6, m7
phaddd m5, m6
paddd m5, m1
-%ifidn %3, pp
+%ifidn %3, pp
psrad m4, 6
psrad m5, 6
packusdw m4, m5
@@ -2652,7 +2659,7 @@
%endif
paddd m3, m1
- psrad m3, 2
+ psrad m3, INTERP_SHIFT_PS
packssdw m3, m3
%if %1 == 2
@@ -2683,7 +2690,7 @@
FILTER_W%1_2 %3
%endrep
-RET
+ RET
%endmacro
FILTER_CHROMA_H 2, 4, pp, 6, 8, 5
@@ -4084,7 +4091,7 @@
%ifidn %3, pp
mova m6, [tab_c_32]
%else
- mova m6, [tab_c_524800]
+ mova m6, [INTERP_OFFSET_SP]
%endif
%else
mova m6, [INTERP_OFFSET_PS]
@@ -4109,10 +4116,10 @@
paddd m1, m6
paddd m2, m6
paddd m3, m6
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -4127,10 +4134,10 @@
psrad m2, 6
psrad m3, 6
%else
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
%endif
packssdw m0, m1
packssdw m2, m3
@@ -4707,7 +4714,7 @@
%ifidn %3, pp
mova m7, [tab_c_32]
%elifidn %3, sp
- mova m7, [tab_c_524800]
+ mova m7, [INTERP_OFFSET_SP]
%elifidn %3, ps
mova m7, [INTERP_OFFSET_PS]
%endif
@@ -4728,10 +4735,10 @@
paddd m1, m7
paddd m2, m7
paddd m3, m7
- psrad m0, 2
- psrad m1, 2
- psrad m2, 2
- psrad m3, 2
+ psrad m0, INTERP_SHIFT_PS
+ psrad m1, INTERP_SHIFT_PS
+ psrad m2, INTERP_SHIFT_PS
+ psrad m3, INTERP_SHIFT_PS
packssdw m0, m1
packssdw m2, m3
@@ -4746,10 +4753,10 @@
psrad m2, 6
psrad m3, 6
%else
- psrad m0, 10
- psrad m1, 10
- psrad m2, 10
- psrad m3, 10
+ psrad m0, INTERP_SHIFT_SP
+ psrad m1, INTERP_SHIFT_SP
+ psrad m2, INTERP_SHIFT_SP
+ psrad m3, INTERP_SHIFT_SP
%endif
packssdw m0, m1
packssdw m2, m3
@@ -5587,7 +5594,7 @@
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_W16_24xN_avx2 3
INIT_YMM avx2
-%if ARCH_X86_64
+%if ARCH_X86_64
cglobal interp_4tap_vert_%2_24x%1, 5, 7, %3
add r1d, r1d
add r3d, r3d
@@ -8628,7 +8635,7 @@
psrad m3, 2
%endif
%endif
-
+
packssdw m0, m3
%ifidn %1,pp
CLIPW m0, m1, [pw_pixel_max]
@@ -9045,14 +9052,14 @@
%rep %1/4
movh m0, [r0]
movhps m0, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m1
movh [r2 + r3 * 0], m0
movhps [r2 + r3 * 1], m0
movh m0, [r0 + r1 * 2]
movhps m0, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m1
movh [r2 + r3 * 2], m0
movhps [r2 + r4], m0
@@ -9078,11 +9085,10 @@
movh m0, [r0]
movhps m0, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, [pw_2000]
movh [r2 + r3 * 0], m0
movhps [r2 + r3 * 1], m0
-
RET
;-----------------------------------------------------------------------------
@@ -9106,9 +9112,9 @@
.loop
movu m0, [r0]
movu m1, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movh [r2 + r3 * 0], m0
@@ -9118,9 +9124,9 @@
movu m0, [r0 + r1 * 2]
movu m1, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movh [r2 + r3 * 2], m0
@@ -9158,22 +9164,22 @@
.loop
movu m0, [r0]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m1
movu [r2 + r3 * 0], m0
movu m0, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m1
movu [r2 + r3 * 1], m0
movu m0, [r0 + r1 * 2]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m1
movu [r2 + r3 * 2], m0
movu m0, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m1
movu [r2 + r4], m0
@@ -9203,14 +9209,13 @@
movu m0, [r0]
movu m1, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, [pw_2000]
- psllw m1, 4
psubw m1, [pw_2000]
movu [r2 + r3 * 0], m0
movu [r2 + r3 * 1], m1
-
RET
;-----------------------------------------------------------------------------
@@ -9232,11 +9237,11 @@
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m3
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
psubw m1, m3
- psllw m2, 4
+ psllw m2, (14 - BIT_DEPTH)
psubw m2, m3
movu [r2 + r3 * 0], m0
@@ -9247,18 +9252,17 @@
movu m1, [r0 + r1 * 4]
movu m2, [r0 + r5 ]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m3
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
psubw m1, m3
- psllw m2, 4
+ psllw m2, (14 - BIT_DEPTH)
psubw m2, m3
movu [r2 + r6], m0
movu [r2 + r3 * 4], m1
lea r2, [r2 + r3 * 4]
movu [r2 + r3], m2
-
RET
;-----------------------------------------------------------------------------
@@ -9282,9 +9286,9 @@
.loop
movu m0, [r0]
movu m1, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
psubw m1, m2
movu [r2 + r3 * 0], m0
@@ -9292,9 +9296,9 @@
movu m0, [r0 + r1 * 2]
movu m1, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
psubw m1, m2
movu [r2 + r3 * 2], m0
@@ -9302,9 +9306,9 @@
movu m0, [r0 + 16]
movu m1, [r0 + r1 + 16]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
psubw m1, m2
movu [r2 + r3 * 0 + 16], m0
@@ -9312,9 +9316,9 @@
movu m0, [r0 + r1 * 2 + 16]
movu m1, [r0 + r5 + 16]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
psubw m1, m2
movu [r2 + r3 * 2 + 16], m0
@@ -9356,9 +9360,9 @@
.loop
movu m0, [r0]
movu m1, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
psubw m1, m2
movu [r2 + r3 * 0], m0
@@ -9366,9 +9370,9 @@
movu m0, [r0 + r1 * 2]
movu m1, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
psubw m1, m2
movu [r2 + r3 * 2], m0
@@ -9412,13 +9416,13 @@
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0], m0
@@ -9430,13 +9434,13 @@
movu m1, [r0 + r1 + 16]
movu m2, [r0 + r1 * 2 + 16]
movu m3, [r0 + r5 + 16]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 16], m0
@@ -9448,13 +9452,13 @@
movu m1, [r0 + r1 + 32]
movu m2, [r0 + r1 * 2 + 32]
movu m3, [r0 + r5 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 32], m0
@@ -9466,13 +9470,13 @@
movu m1, [r0 + r1 + 48]
movu m2, [r0 + r1 * 2 + 48]
movu m3, [r0 + r5 + 48]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 48], m0
@@ -9515,9 +9519,9 @@
.loop
movu m0, [r0]
movu m1, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
+ psllw m1, (14 - BIT_DEPTH)
psubw m1, m2
movu [r2 + r3 * 0], m0
@@ -9525,9 +9529,9 @@
movu m0, [r0 + r1 * 2]
movu m1, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 2], m0
@@ -9535,9 +9539,9 @@
movu m0, [r0 + 32]
movu m1, [r0 + r1 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 0 + 32], m0
@@ -9545,9 +9549,9 @@
movu m0, [r0 + r1 * 2 + 32]
movu m1, [r0 + r5 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 2 + 32], m0
@@ -9590,13 +9594,13 @@
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0], m0
@@ -9608,13 +9612,13 @@
movu m1, [r0 + r1 + 16]
movu m2, [r0 + r1 * 2 + 16]
movu m3, [r0 + r5 + 16]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 16], m0
@@ -9626,13 +9630,13 @@
movu m1, [r0 + r1 + 32]
movu m2, [r0 + r1 * 2 + 32]
movu m3, [r0 + r5 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 32], m0
@@ -9644,13 +9648,13 @@
movu m1, [r0 + r1 + 48]
movu m2, [r0 + r1 * 2 + 48]
movu m3, [r0 + r5 + 48]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 48], m0
@@ -9662,13 +9666,13 @@
movu m1, [r0 + r1 + 64]
movu m2, [r0 + r1 * 2 + 64]
movu m3, [r0 + r5 + 64]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 64], m0
@@ -9680,13 +9684,13 @@
movu m1, [r0 + r1 + 80]
movu m2, [r0 + r1 * 2 + 80]
movu m3, [r0 + r5 + 80]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 80], m0
@@ -9698,13 +9702,13 @@
movu m1, [r0 + r1 + 96]
movu m2, [r0 + r1 * 2 + 96]
movu m3, [r0 + r5 + 96]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 96], m0
@@ -9716,13 +9720,13 @@
movu m1, [r0 + r1 + 112]
movu m2, [r0 + r1 * 2 + 112]
movu m3, [r0 + r5 + 112]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 112], m0
@@ -9763,9 +9767,9 @@
.loop
movu m0, [r0]
movu m1, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 0], m0
@@ -9773,9 +9777,9 @@
movu m0, [r0 + r1 * 2]
movu m1, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 2], m0
@@ -9783,9 +9787,9 @@
movu m0, [r0 + 32]
movu m1, [r0 + r1 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 0 + 32], m0
@@ -9793,9 +9797,9 @@
movu m0, [r0 + r1 * 2 + 32]
movu m1, [r0 + r5 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 2 + 32], m0
@@ -9803,9 +9807,9 @@
movu m0, [r0 + 64]
movu m1, [r0 + r1 + 64]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 0 + 64], m0
@@ -9813,9 +9817,9 @@
movu m0, [r0 + r1 * 2 + 64]
movu m1, [r0 + r5 + 64]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 2 + 64], m0
@@ -9823,9 +9827,9 @@
movu m0, [r0 + 96]
movu m1, [r0 + r1 + 96]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 0 + 96], m0
@@ -9833,9 +9837,9 @@
movu m0, [r0 + r1 * 2 + 96]
movu m1, [r0 + r5 + 96]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 2 + 96], m0
@@ -9876,13 +9880,13 @@
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0], m0
@@ -9894,13 +9898,13 @@
movu m1, [r0 + r1 + 16]
movu m2, [r0 + r1 * 2 + 16]
movu m3, [r0 + r5 + 16]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 16], m0
@@ -9912,13 +9916,13 @@
movu m1, [r0 + r1 + 32]
movu m2, [r0 + r1 * 2 + 32]
movu m3, [r0 + r5 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 32], m0
@@ -9957,36 +9961,36 @@
.loop
movu m0, [r0]
movu m1, [r0 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 0], m0
movu [r2 + r3 * 0 + 32], xm1
movu m0, [r0 + r1]
movu m1, [r0 + r1 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 1], m0
movu [r2 + r3 * 1 + 32], xm1
movu m0, [r0 + r1 * 2]
movu m1, [r0 + r1 * 2 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 2], m0
movu [r2 + r3 * 2 + 32], xm1
movu m0, [r0 + r5]
movu m1, [r0 + r5 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r4], m0
movu [r2 + r4 + 32], xm1
@@ -10022,9 +10026,9 @@
.loop
movu m0, [r0]
movu m1, [r0 + r1]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 0], m0
@@ -10032,9 +10036,9 @@
movu m0, [r0 + r1 * 2]
movu m1, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
psubw m0, m2
- psllw m1, 4
psubw m1, m2
movu [r2 + r3 * 2], m0
@@ -10042,7 +10046,7 @@
movh m0, [r0 + 16]
movhps m0, [r0 + r1 + 16]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m2
movh [r2 + r3 * 0 + 16], m0
@@ -10050,7 +10054,7 @@
movh m0, [r0 + r1 * 2 + 16]
movhps m0, [r0 + r5 + 16]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
psubw m0, m2
movh [r2 + r3 * 2 + 16], m0
@@ -10088,13 +10092,13 @@
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r5]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0], m0
@@ -10106,13 +10110,13 @@
movu m1, [r0 + r1 + 16]
movu m2, [r0 + r1 * 2 + 16]
movu m3, [r0 + r5 + 16]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 16], m0
@@ -10124,13 +10128,13 @@
movu m1, [r0 + r1 + 32]
movu m2, [r0 + r1 * 2 + 32]
movu m3, [r0 + r5 + 32]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 32], m0
@@ -10142,13 +10146,13 @@
movu m1, [r0 + r1 + 48]
movu m2, [r0 + r1 * 2 + 48]
movu m3, [r0 + r5 + 48]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 48], m0
@@ -10160,13 +10164,13 @@
movu m1, [r0 + r1 + 64]
movu m2, [r0 + r1 * 2 + 64]
movu m3, [r0 + r5 + 64]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 64], m0
@@ -10178,13 +10182,13 @@
movu m1, [r0 + r1 + 80]
movu m2, [r0 + r1 * 2 + 80]
movu m3, [r0 + r5 + 80]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
+ psllw m3, (14 - BIT_DEPTH)
psubw m0, m4
- psllw m1, 4
psubw m1, m4
- psllw m2, 4
psubw m2, m4
- psllw m3, 4
psubw m3, m4
movu [r2 + r3 * 0 + 80], m0
@@ -10220,11 +10224,11 @@
movu m0, [r0]
movu m1, [r0 + 32]
movu m2, [r0 + 64]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
psubw m0, m3
- psllw m1, 4
psubw m1, m3
- psllw m2, 4
psubw m2, m3
movu [r2 + r3 * 0], m0
movu [r2 + r3 * 0 + 32], m1
@@ -10233,11 +10237,11 @@
movu m0, [r0 + r1]
movu m1, [r0 + r1 + 32]
movu m2, [r0 + r1 + 64]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
psubw m0, m3
- psllw m1, 4
psubw m1, m3
- psllw m2, 4
psubw m2, m3
movu [r2 + r3 * 1], m0
movu [r2 + r3 * 1 + 32], m1
@@ -10246,11 +10250,11 @@
movu m0, [r0 + r1 * 2]
movu m1, [r0 + r1 * 2 + 32]
movu m2, [r0 + r1 * 2 + 64]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
psubw m0, m3
- psllw m1, 4
psubw m1, m3
- psllw m2, 4
psubw m2, m3
movu [r2 + r3 * 2], m0
movu [r2 + r3 * 2 + 32], m1
@@ -10259,11 +10263,11 @@
movu m0, [r0 + r5]
movu m1, [r0 + r5 + 32]
movu m2, [r0 + r5 + 64]
- psllw m0, 4
+ psllw m0, (14 - BIT_DEPTH)
+ psllw m1, (14 - BIT_DEPTH)
+ psllw m2, (14 - BIT_DEPTH)
psubw m0, m3
- psllw m1, 4
psubw m1, m3
- psllw m2, 4
psubw m2, m3
movu [r2 + r4], m0
movu [r2 + r4 + 32], m1
@@ -10797,7 +10801,7 @@
pmaddwd m6, m0
pmaddwd m5, m1
paddd m6, m5
-
+
phaddd m6, m6
vpermq m6, m6, q3120
paddd xm6, xm2
@@ -12115,7 +12119,7 @@
%endmacro
FILTER_VER_CHROMA_AVX2_4x4 pp, 1, 6
-FILTER_VER_CHROMA_AVX2_4x4 ps, 0, 2
+FILTER_VER_CHROMA_AVX2_4x4 ps, 0, 2
FILTER_VER_CHROMA_AVX2_4x4 sp, 1, 10
FILTER_VER_CHROMA_AVX2_4x4 ss, 0, 6
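The ipfilter16.asm changes follow the same idea for the interpolation filters: input pixels are promoted to the 14-bit internal precision, so shift amounts that were hard-coded for 10-bit input are now derived from BIT_DEPTH. A summary sketch, assuming IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6 as in the C reference filters (not taken verbatim from the patch):

    // pixel-to-short copy:  dst = (src << (14 - BIT_DEPTH)) - 8192         // 8192 == pw_2000
    // ps path:              INTERP_SHIFT_PS  = 6 - (14 - BIT_DEPTH)        // 2 at 10-bit, 4 at 12-bit
    //                       INTERP_OFFSET_PS = -(8192 << INTERP_SHIFT_PS)  // pd_n32768 / pd_n131072
    // sp path:              INTERP_SHIFT_SP  = 6 + (14 - BIT_DEPTH)        // 10 at 10-bit, 8 at 12-bit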