[x265] [PATCH] filterVertical_p_p: intrinsic for vector replacement
praveen at multicorewareinc.com
Fri Aug 2 15:37:27 CEST 2013
# HG changeset patch
# User praveentiwari
# Date 1375450637 -19800
# Node ID 04484cca6c5616f855ba8b909f0bdd4cbf9454b8
# Parent f8418a4694f5eef1b4ea1f0a2a14ae0e86eeade2
filterVertical_p_p: intrinsic for vector replacement
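
This replaces the vector-class (Vec8s/Vec16uc) implementation of
filterVertical_p_p with SSE intrinsics. The code uses the SSE4.1
_mm_cvtepu8_epi16, so both the new function and its registration in
ipfilter.inc are guarded by INSTRSET > 4. For reference, the
per-coefficient broadcast the new code performs with pshufb can be
sketched as follows (illustrative only, not part of the patch; assumes
an SSSE3/SSE4.1 target and the same 8 x 16-bit coefficient layout):

    #include <smmintrin.h>  /* SSE4.1; also provides _mm_shuffle_epi8 (SSSE3) */

    /* Broadcast coefficient k (0..7) into every 16-bit lane, the same way
     * the patch builds coeff0..coeff7 from the vm0..vm7 masks. */
    static inline __m128i broadcast_coeff(const short *coeff, int k)
    {
        __m128i c = _mm_loadu_si128((const __m128i*)coeff);
        /* each 16-bit lane of the mask holds the byte pair (2k, 2k+1) */
        __m128i m = _mm_set1_epi16((short)(((2 * k + 1) << 8) | (2 * k)));
        return _mm_shuffle_epi8(c, m);  /* same result as _mm_set1_epi16(coeff[k]) */
    }
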
diff -r f8418a4694f5 -r 04484cca6c56 source/common/vec/ipfilter.inc
--- a/source/common/vec/ipfilter.inc Fri Aug 02 18:55:23 2013 +0530
+++ b/source/common/vec/ipfilter.inc Fri Aug 02 19:07:17 2013 +0530
@@ -48,10 +48,10 @@
p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_p_s<8>;
#endif
+#if INSTRSET > 4
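+// filterVertical_p_p now uses SSE4.1 intrinsics, so register it only for INSTRSET > 4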
p.ipfilter_pp[FILTER_V_P_P_4] = filterVertical_p_p<4>;
p.ipfilter_pp[FILTER_V_P_P_8] = filterVertical_p_p<8>;
-#if INSTRSET > 4
p.ipfilter_sp[FILTER_V_S_P_4] = filterVertical_s_p<4>;
p.ipfilter_sp[FILTER_V_S_P_8] = filterVertical_s_p<8>;
#endif
diff -r f8418a4694f5 -r 04484cca6c56 source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Fri Aug 02 18:55:23 2013 +0530
+++ b/source/common/vec/ipfilter8.inc Fri Aug 02 19:07:17 2013 +0530
@@ -512,203 +512,272 @@
}
}
+#if INSTRSET > 4
template<int N>
void filterVertical_p_p(pixel *src, int srcStride,
pixel *dst, int dstStride,
- int block_width, int block_height,
+ int width, int height,
const short *coeff)
{
- int row, col;
int offset;
- short maxVal;
int shift = IF_FILTER_PREC;
src -= (N / 2 - 1) * srcStride;
offset = 1 << (shift - 1);
- maxVal = (1 << X265_DEPTH) - 1;
- Vec8s im0;
- im0.load(coeff);
+ __m128i coeffTemp = _mm_loadu_si128((__m128i const*)coeff);
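+ // vm0..vm7 are pshufb byte-pair masks: shuffling coeffTemp with vm_k
+ // broadcasts the k-th 16-bit coefficient into all eight lanes.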
- Vec8s cm[8];
+ __m128i vm0 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+ __m128i vm1 = _mm_setr_epi8(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3);
+ __m128i vm2 = _mm_setr_epi8(4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5);
+ __m128i vm3 = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7);
+ __m128i vm4 = _mm_setr_epi8(8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9);
+ __m128i vm5 = _mm_setr_epi8(10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11);
+ __m128i vm6 = _mm_setr_epi8(12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13);
+ __m128i vm7 = _mm_setr_epi8(14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15);
- assert((N == 4) || (N == 8));
- cm[0] = broadcast(const_int(0), im0);
- cm[1] = broadcast(const_int(1), im0);
- cm[2] = broadcast(const_int(2), im0);
- cm[3] = broadcast(const_int(3), im0);
+ __m128i coeff0 = _mm_shuffle_epi8(coeffTemp, vm0);
+ __m128i coeff1 = _mm_shuffle_epi8(coeffTemp, vm1);
+ __m128i coeff2 = _mm_shuffle_epi8(coeffTemp, vm2);
+ __m128i coeff3 = _mm_shuffle_epi8(coeffTemp, vm3);
+ __m128i coeff4 = _mm_shuffle_epi8(coeffTemp, vm4);
+ __m128i coeff5 = _mm_shuffle_epi8(coeffTemp, vm5);
+ __m128i coeff6 = _mm_shuffle_epi8(coeffTemp, vm6);
+ __m128i coeff7 = _mm_shuffle_epi8(coeffTemp, vm7);
- if (N == 8)
+ __m128i mask7 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0);
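+ // mask7 enables the first 7 bytes of a masked store; the narrower
+ // 1..6-byte tail masks are derived from it below with _mm_srli_si128.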
+
+ int row, col;
+
+ for (row = 0; row < height; row++)
{
- cm[4] = broadcast(const_int(4), im0);
- cm[5] = broadcast(const_int(5), im0);
- cm[6] = broadcast(const_int(6), im0);
- cm[7] = broadcast(const_int(7), im0);
- }
+ for (col = 0; col < (width - 15); col += 16)
+ {
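+ // 16 output pixels per iteration: each source row is split into low/high
+ // halves, widened to 16 bits and multiplied by that row's coefficient.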
+ __m128i srcCoeff = _mm_loadu_si128((__m128i*)&src[col]);
+ __m128i srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff0);
+ srcCoeff = _mm_srli_si128(srcCoeff, 8);
+ srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T01 = _mm_mullo_epi16(srcCoeff, coeff0);
- for (row = 0; row < block_height; row++)
- {
- for (col = 0; col < block_width - 15; col += 16)
- {
- Vec16uc row0, row1, row2, row3, row4, row5, row6, row7, sum;
- Vec8s row0_first, row0_last, row1_first, row1_last;
- Vec8s c0, c1, c2, c3, c4, c5, c6, c7;
- Vec8s sum_first, sum_last;
+ srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + srcStride]));
+ __m128i srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff1);
+ srcCoeff = _mm_srli_si128(srcCoeff, 8);
+ srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T11 = _mm_mullo_epi16(srcCoeff, coeff1);
- row0.load(&src[col]);
- row1.load(&src[col + srcStride]);
+ srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 2 * srcStride]));
+ __m128i srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff2);
+ srcCoeff = _mm_srli_si128(srcCoeff, 8);
+ srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T21 = _mm_mullo_epi16(srcCoeff, coeff2);
- c0 = cm[0];
- c1 = cm[1];
+ srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 3 * srcStride]));
+ __m128i srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff3);
+ srcCoeff = _mm_srli_si128(srcCoeff, 8);
+ srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T31 = _mm_mullo_epi16(srcCoeff, coeff3);
- row0_first = extend_low(row0);
- row1_first = extend_low(row1);
- row0_last = extend_high(row0);
- row1_last = extend_high(row1);
+ __m128i sum0 = _mm_add_epi16(T00, T10);
+ __m128i sum1 = _mm_add_epi16(T20, T30);
+ __m128i sumlo = _mm_add_epi16(sum0, sum1);
- row0_first = row0_first * c0;
- row1_first = row1_first * c1;
- row0_last = row0_last * c0;
- row1_last = row1_last * c1;
-
- sum_first = row0_first + row1_first;
- sum_last = row0_last + row1_last;
-
- row2.load(&src[col + 2 * srcStride]);
- row3.load(&src[col + 3 * srcStride]);
-
- c2 = cm[2];
- c3 = cm[3];
-
- row0_first = extend_low(row2);
- row0_last = extend_high(row2);
- row0_first = row0_first * c2;
- row0_last = row0_last * c2;
- row1_first = extend_low(row3);
- row1_last = extend_high(row3);
- row1_first = row1_first * c3;
- row1_last = row1_last * c3;
- sum_first += row0_first + row1_first;
- sum_last += row0_last + row1_last;
+ __m128i sum2 = _mm_add_epi16(T01, T11);
+ __m128i sum3 = _mm_add_epi16(T21, T31);
+ __m128i sumhi = _mm_add_epi16(sum2, sum3);
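+ // sumlo/sumhi hold the 4-tap partial sums for the low/high 8 pixels;
+ // the 8-tap filter (N == 8) accumulates four more rows below.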
if (N == 8)
{
- row4.load(&src[col + 4 * srcStride]);
- row5.load(&src[col + 5 * srcStride]);
+ srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 4 * srcStride]));
+ srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
+ T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff4);
+ srcCoeff = _mm_srli_si128(srcCoeff, 8);
+ srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
+ T01 = _mm_mullo_epi16(srcCoeff, coeff4);
- c4 = cm[4];
- c5 = cm[5];
+ srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 5 * srcStride]));
+ srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
+ T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff5);
+ srcCoeff = _mm_srli_si128(srcCoeff, 8);
+ srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
+ T11 = _mm_mullo_epi16(srcCoeff, coeff5);
- row0_first = extend_low(row4);
- row0_last = extend_high(row4);
- row0_first = row0_first * c4;
- row0_last = row0_last * c4;
- row1_first = extend_low(row5);
- row1_last = extend_high(row5);
- row1_first = row1_first * c5;
- row1_last = row1_last * c5;
- sum_first += row0_first + row1_first;
- sum_last += row0_last + row1_last;
+ srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 6 * srcStride]));
+ srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
+ T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff6);
+ srcCoeff = _mm_srli_si128(srcCoeff, 8);
+ srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
+ T21 = _mm_mullo_epi16(srcCoeff, coeff6);
- row6.load(&src[col + 6 * srcStride]);
- row7.load(&src[col + 7 * srcStride]);
+ srcCoeff = _mm_loadu_si128((__m128i const*)(&src[col + 7 * srcStride]));
+ srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
+ T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff7);
+ srcCoeff = _mm_srli_si128(srcCoeff, 8);
+ srcCoeff = _mm_cvtepu8_epi16(srcCoeff);
+ T31 = _mm_mullo_epi16(srcCoeff, coeff7);
- c6 = cm[6];
- c7 = cm[7];
+ sum0 = _mm_add_epi16(T00, T10);
+ sum1 = _mm_add_epi16(T20, T30);
+ sumlo = _mm_add_epi16(sumlo, _mm_add_epi16(sum0, sum1));
- row0_first = extend_low(row6);
- row0_last = extend_high(row6);
- row0_first = row0_first * c6;
- row0_last = row0_last * c6;
- row1_first = extend_low(row7);
- row1_last = extend_high(row7);
- row1_first = row1_first * c7;
- row1_last = row1_last * c7;
-
- sum_first += row0_first + row1_first;
- sum_last += row0_last + row1_last;
+ sum2 = _mm_add_epi16(T01, T11);
+ sum3 = _mm_add_epi16(T21, T31);
+ sumhi = _mm_add_epi16(sumhi, _mm_add_epi16(sum2, sum3));
}
- sum_first = (sum_first + offset) >> shift;
- sum_last = (sum_last + offset) >> shift;
- sum_first = max(sum_first, 0);
- sum_last = max(sum_last, 0);
- Vec8s maxVal_v(maxVal);
- sum_first = min(sum_first, maxVal_v);
- sum_last = min(sum_last, maxVal_v);
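+ // Round, shift and pack with unsigned saturation; _mm_packus_epi16 clamps
+ // to [0, 255], which replaces the explicit min/max clamp against maxVal.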
+ __m128i sumOffset = _mm_set1_epi16(offset);
- sum = compress(sum_first, sum_last);
+ __m128i val1 = _mm_add_epi16(sumlo, sumOffset);
+ val1 = _mm_srai_epi16(val1, shift);
- sum.store(dst + col);
+ __m128i val2 = _mm_add_epi16(sumhi, sumOffset);
+ val2 = _mm_srai_epi16(val2, shift);
+
+ __m128i res = _mm_packus_epi16(val1, val2);
+ _mm_storeu_si128((__m128i*)&dst[col], res);
}
- //Handle the case when block_width is not multiple of 16
- for (; col < block_width; col += 8)
+ for (; col < (width - 7); col += 8)
{
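+ // Handle remaining groups of 8 columns when width is not a multiple of 16.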
- Vec16uc row0, row1, row2, row3, row4, row5, row6, row7, sum;
- Vec8s row0_first, row0_last, row1_first, row1_last;
- Vec8s c0, c1, c2, c3, c4, c5, c6, c7;
- Vec8s sum_first, sum_last;
+ __m128i srcCoeff = _mm_loadl_epi64((__m128i*)&src[col]);
+ __m128i srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff0);
- row0.load(&src[col]);
- row1.load(&src[col + srcStride]);
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + srcStride]));
+ __m128i srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff1);
- c0 = cm[0];
- c1 = cm[1];
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 2 * srcStride]));
+ __m128i srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff2);
- row0_first = extend_low(row0);
- row1_first = extend_low(row1);
- row0_first = row0_first * c0;
- row1_first = row1_first * c1;
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 3 * srcStride]));
+ __m128i srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff3);
- sum_first = row0_first + row1_first;
-
- row2.load(&src[col + 2 * srcStride]);
- row3.load(&src[col + 3 * srcStride]);
-
- c2 = cm[2];
- c3 = cm[3];
-
- row0_first = extend_low(row2);
- row0_first = row0_first * c2;
- row1_first = extend_low(row3);
- row1_first = row1_first * c3;
-
- sum_first += row0_first + row1_first;
+ __m128i sum0 = _mm_add_epi16(T00, T10);
+ __m128i sum1 = _mm_add_epi16(T20, T30);
+ __m128i sumlo = _mm_add_epi16(sum0, sum1);
if (N == 8)
{
- row4.load(&src[col + 4 * srcStride]);
- row5.load(&src[col + 5 * srcStride]);
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 4 * srcStride]));
+ srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
+ T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff4);
- c4 = cm[4];
- c5 = cm[5];
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 5 * srcStride]));
+ srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
+ T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff5);
- row0_first = extend_low(row4);
- row0_first = row0_first * c4;
- row1_first = extend_low(row5);
- row1_first = row1_first * c5;
- sum_first += row0_first + row1_first;
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 6 * srcStride]));
+ srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
+ T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff6);
- row6.load(&src[col + 6 * srcStride]);
- row7.load(&src[col + 7 * srcStride]);
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 7 * srcStride]));
+ srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
+ T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff7);
- c6 = cm[6];
- c7 = cm[7];
+ sum0 = _mm_add_epi16(T00, T10);
+ sum1 = _mm_add_epi16(T20, T30);
+ sumlo = _mm_add_epi16(sumlo, _mm_add_epi16(sum0, sum1));
+ }
+ __m128i zero = _mm_set1_epi16(0);
+ __m128i sumOffset = _mm_set1_epi16(offset);
- row0_first = extend_low(row6);
- row0_first = row0_first * c6;
- row1_first = extend_low(row7);
- row1_first = row1_first * c7;
- sum_first += row0_first + row1_first;
+ __m128i val1 = _mm_add_epi16(sumlo, sumOffset);
+ val1 = _mm_srai_epi16(val1, shift);
+
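+ // Only 8 results are produced here; pack against zero and store the low 8 bytes.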
+ __m128i res = _mm_packus_epi16(val1, zero);
+ _mm_storel_epi64((__m128i*)&dst[col], res);
+ }
+
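+ // Tail: fewer than 8 columns remain; compute a full vector and store only
+ // (width - col) bytes of it with a masked store.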
+ for (; col < width; col += 8)
+ {
+ __m128i srcCoeff = _mm_loadl_epi64((__m128i*)&src[col]);
+ __m128i srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff0);
+
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + srcStride]));
+ __m128i srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff1);
+
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 2 * srcStride]));
+ __m128i srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff2);
+
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 3 * srcStride]));
+ __m128i srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
+ __m128i T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff3);
+
+ __m128i sum0 = _mm_add_epi16(T00, T10);
+ __m128i sum1 = _mm_add_epi16(T20, T30);
+ __m128i sumlo = _mm_add_epi16(sum0, sum1);
+
+ if (N == 8)
+ {
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 4 * srcStride]));
+ srcCoeffTemp1 = _mm_cvtepu8_epi16(srcCoeff);
+ T00 = _mm_mullo_epi16(srcCoeffTemp1, coeff4);
+
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 5 * srcStride]));
+ srcCoeffTemp2 = _mm_cvtepu8_epi16(srcCoeff);
+ T10 = _mm_mullo_epi16(srcCoeffTemp2, coeff5);
+
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 6 * srcStride]));
+ srcCoeffTemp3 = _mm_cvtepu8_epi16(srcCoeff);
+ T20 = _mm_mullo_epi16(srcCoeffTemp3, coeff6);
+
+ srcCoeff = _mm_loadl_epi64((__m128i const*)(&src[col + 7 * srcStride]));
+ srcCoeffTemp4 = _mm_cvtepu8_epi16(srcCoeff);
+ T30 = _mm_mullo_epi16(srcCoeffTemp4, coeff7);
+
+ sum0 = _mm_add_epi16(T00, T10);
+ sum1 = _mm_add_epi16(T20, T30);
+ sumlo = _mm_add_epi16(sumlo, _mm_add_epi16(sum0, sum1));
}
+ __m128i zero = _mm_set1_epi16(0);
+ __m128i sumOffset = _mm_set1_epi16(offset);
- sum_first = (sum_first + offset) >> shift;
- sum_first = max(sum_first, 0);
- Vec8s maxVal_v(maxVal);
- sum_first = min(sum_first, maxVal_v);
- sum = compress(sum_first, 0);
- sum.store_partial(block_width - col, dst + col);
+ __m128i val1 = _mm_add_epi16(sumlo, sumOffset);
+ val1 = _mm_srai_epi16(val1, shift);
+
+ __m128i res = _mm_packus_epi16(val1, zero);
+
+ int n = width - col;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6;
+
+ switch (n) // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst
+ {
+ case 1: mask1 = _mm_srli_si128(mask7, 6);
+ _mm_maskmoveu_si128(res, mask1, (char*)&dst[col]);
+ break;
+
+ case 2: mask2 = _mm_srli_si128(mask7, 5);
+ _mm_maskmoveu_si128(res, mask2, (char*)&dst[col]);
+ break;
+
+ case 3: mask3 = _mm_srli_si128(mask7, 4);
+ _mm_maskmoveu_si128(res, mask3, (char*)&dst[col]);
+ break;
+
+ case 4: mask4 = _mm_srli_si128(mask7, 3);
+ _mm_maskmoveu_si128(res, mask4, (char*)&dst[col]);
+ break;
+
+ case 5: mask5 = _mm_srli_si128(mask7, 2);
+ _mm_maskmoveu_si128(res, mask5, (char*)&dst[col]);
+ break;
+
+ case 6: mask6 = _mm_srli_si128(mask7, 1);
+ _mm_maskmoveu_si128(res, mask6, (char*)&dst[col]);
+ break;
+
+ case 7: _mm_maskmoveu_si128(res, mask7, (char*)&dst[col]);
+ break;
+ }
}
src += srcStride;
@@ -716,6 +785,8 @@
}
}
+#endif /* if INSTRSET > 4 */
+
#if INSTRSET > 3
template<int N>
void filterHorizontal_p_p(pixel *src, int srcStride, pixel *dst, int dstStride, int width, int height, short const *coeff)