[x265] [PATCH] Improvement to filterHorizontalMultiplaneExtend
Min Chen
chenm003 at 163.com
Sun Aug 25 14:33:58 CEST 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1377434026 -28800
# Node ID 80dff483723b39c2d738d52306f638a00e204684
# Parent c881d82f9d8571e81da4d1b01c7e2f7ce74f96f7
Improvement to filterHorizontalMultiplaneExtend
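
This patch replaces the per-tap multiply/accumulate chain with PSHUFB shuffle tables plus PMADDUBSW/PHADDW reductions, so each group of eight half-pel outputs is produced from a single 16-byte load. For illustration only (not part of the patch): a minimal sketch of the idea for one plane, assuming SSSE3 and 8-bit pixels; the names shuf_tab, taps_b and filter8_b are invented for this example.

// Sketch: 8 consecutive 8-tap sums via byte shuffles + maddubs + hadd (SSSE3)
#include <tmmintrin.h>   // _mm_shuffle_epi8, _mm_maddubs_epi16, _mm_hadd_epi16
#include <stdint.h>

static const uint8_t shuf_tab[4][16] __attribute__((aligned(16))) = {
    { 0, 1, 2, 3, 4, 5, 6, 7,  1, 2, 3, 4, 5, 6, 7, 8 },    // windows for outputs 0,1
    { 2, 3, 4, 5, 6, 7, 8, 9,  3, 4, 5, 6, 7, 8, 9,10 },    // outputs 2,3
    { 4, 5, 6, 7, 8, 9,10,11,  5, 6, 7, 8, 9,10,11,12 },    // outputs 4,5
    { 6, 7, 8, 9,10,11,12,13,  7, 8, 9,10,11,12,13,14 },    // outputs 6,7
};

static const int8_t taps_b[16] __attribute__((aligned(16))) = {
    -1, 4, -11, 40, 40, -11, 4, -1,  -1, 4, -11, 40, 40, -11, 4, -1  // half-pel luma taps, twice
};

// src points at the first tap (output position - 3); 16 bytes must be readable.
// Returns the eight unrounded filter sums as packed int16.
static inline __m128i filter8_b(const uint8_t *src)
{
    __m128i T00 = _mm_loadu_si128((const __m128i*)src);              // 16 source pixels
    __m128i c   = _mm_load_si128((const __m128i*)taps_b);

    // Each shuffled register holds two overlapping 8-pixel windows.
    __m128i T10 = _mm_shuffle_epi8(T00, _mm_load_si128((const __m128i*)shuf_tab[0]));
    __m128i T11 = _mm_shuffle_epi8(T00, _mm_load_si128((const __m128i*)shuf_tab[1]));
    __m128i T12 = _mm_shuffle_epi8(T00, _mm_load_si128((const __m128i*)shuf_tab[2]));
    __m128i T13 = _mm_shuffle_epi8(T00, _mm_load_si128((const __m128i*)shuf_tab[3]));

    // unsigned pixel * signed tap, adjacent pairs summed to 16 bits
    __m128i T20 = _mm_maddubs_epi16(T10, c);
    __m128i T21 = _mm_maddubs_epi16(T11, c);
    __m128i T22 = _mm_maddubs_epi16(T12, c);
    __m128i T23 = _mm_maddubs_epi16(T13, c);

    // Three horizontal adds collapse the partial sums into outputs 0..7.
    T20 = _mm_hadd_epi16(T20, T21);
    T21 = _mm_hadd_epi16(T22, T23);
    return _mm_hadd_epi16(T20, T21);
}
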
diff -r c881d82f9d85 -r 80dff483723b source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Fri Aug 23 23:07:47 2013 -0500
+++ b/source/common/vec/ipfilter8.inc Sun Aug 25 20:33:46 2013 +0800
@@ -7,6 +7,7 @@
* Mahesh Pittala <mahesh at multicorewareinc.com>
* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
* Nabajit Deka <nabajit at multicorewareinc.com>
+ * Min Chen <chenm003 at 163.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -1035,6 +1036,21 @@
}
}
+ALIGN_VAR_32(const uint8_t, ipfilterH_0[][16]) =
+{
+ {0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8},
+ {2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10},
+ {4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12},
+ {6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14},
+};
+
+ALIGN_VAR_32(const int8_t, ipfilterH_1[][16]) =
+{
+ {-1, 4, -10, 58, 17, -5, 1, 0, -1, 4, -10, 58, 17, -5, 1, 0},
+ {-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, 4, -1},
+ { 0, 1, -5, 17, 58, -10, 4, -1, 0, 1, -5, 17, 58, -10, 4, -1},
+};
+
void filterHorizontalMultiplaneExtend(pixel *src, int srcStride,
short *intF, short* intA, short* intB, short* intC, int intStride,
pixel *dstA, pixel *dstB, pixel *dstC, int dstStride,
@@ -1049,106 +1065,73 @@
src -= (8 / 2 - 1);
__m128i vec_src0;
__m128i vec_offset = _mm_set1_epi16(offset);
- __m128i sumaL, sumbL, sumcL, tmp, exp1;
+ __m128i tmp;
__m128i tmp16a, tmp16b, tmp16c;
// Load Ai, ai += Ai*coefi
for (row = 0; row < block_height; row++)
{
col = 0;
+ __m128i ma, mb, mc;
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
- sumbL = (_mm_unpacklo_epi8(vec_src0, _mm_setzero_si128()));
- sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
+ const __m128i c_off = _mm_set1_epi16(IF_INTERNAL_OFFS);
+ const __m128i c_32 = _mm_set1_epi16(32);
+ __m128i T00;
+ __m128i T10, T11, T12, T13;
+ __m128i T20, T21, T22, T23;
- // a = b+=4*a1, c+=1*a1
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
- sumcL = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumbL = _mm_add_epi16(sumbL, _mm_sll_epi16(sumcL, _mm_cvtsi32_si128(2)));
- sumaL = sumbL;
+ T00 = _mm_loadu_si128((__m128i*)(src + col + 3));
+ T00 = _mm_unpacklo_epi8(T00, _mm_setzero_si128());
+ T00 = _mm_slli_epi16(T00, 6);
+ _mm_storeu_si128((__m128i*)(intF + col), _mm_sub_epi16(T00, c_off));
- // a +=-10*a2 b+=-11*a2 c+=-5*a2
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumbL = _mm_sub_epi16(sumbL, tmp);
- tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
- sumcL = _mm_add_epi16(sumcL, tmp);
- tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(1));
- sumaL = _mm_add_epi16(sumaL, tmp);
- sumbL = _mm_add_epi16(sumbL, tmp);
+ T00 = _mm_loadu_si128((__m128i*)(src + col));
- // a +=58*a3 b+=40*a3 c+=17*a3
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- _mm_storeu_si128((__m128i*)(intF + col), _mm_sub_epi16(_mm_sll_epi16(tmp, _mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
- exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp, _mm_cvtsi32_si128(4)));
- sumcL = _mm_add_epi16(sumcL, exp1);
- sumaL = _mm_add_epi16(sumaL, tmp);
- tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
- sumbL = _mm_add_epi16(sumbL, tmp);
- sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
+ T10 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[0]));
+ T11 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[1]));
+ T12 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[2]));
+ T13 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[3]));
- // a +=17*a4 b+=40*a4 c+=58*a4
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp, _mm_cvtsi32_si128(4)));
- sumaL = _mm_add_epi16(sumaL, exp1);
- sumcL = _mm_add_epi16(sumcL, tmp);
- tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
- sumbL = _mm_add_epi16(sumbL, tmp);
- sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
+ T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[0]));
+ T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[0]));
+ T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[0]));
+ T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[0]));
+ T20 = _mm_hadd_epi16(T20, T21);
+ T21 = _mm_hadd_epi16(T22, T23);
+ T20 = _mm_hadd_epi16(T20, T21);
+ _mm_storeu_si128((__m128i*)(intA + col), _mm_sub_epi16(T20, c_off));
+ T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
+ T20 = _mm_packus_epi16(T20, T20);
+ _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), T20);
- // a +=-5*a5 b+=-11*a5 c+=-10*a5
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumbL = _mm_sub_epi16(sumbL, tmp);
- tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
- sumaL = _mm_add_epi16(sumaL, tmp);
- tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(1));
- sumcL = _mm_add_epi16(sumcL, tmp);
- sumbL = _mm_add_epi16(sumbL, tmp);
+ T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[1]));
+ T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[1]));
+ T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[1]));
+ T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[1]));
+ T20 = _mm_hadd_epi16(T20, T21);
+ T21 = _mm_hadd_epi16(T22, T23);
+ T20 = _mm_hadd_epi16(T20, T21);
+ _mm_storeu_si128((__m128i*)(intB + col), _mm_sub_epi16(T20, c_off));
+ T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
+ T20 = _mm_packus_epi16(T20, T20);
+ _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), T20);
- // a +=1*a6 b+=4*a6 c+=4*a6
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumaL = _mm_add_epi16(sumaL, tmp);
- tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(2));
- sumbL = _mm_add_epi16(sumbL, tmp);
- sumcL = _mm_add_epi16(sumcL, tmp);
-
- // a +=0*a7 b+=-1*a7 c+=-1*a7
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumbL = _mm_sub_epi16(sumbL, tmp);
- sumcL = _mm_sub_epi16(sumcL, tmp);
- sumaL = _mm_add_epi16(sumaL, vec_offset);
- sumbL = _mm_add_epi16(sumbL, vec_offset);
- sumcL = _mm_add_epi16(sumcL, vec_offset);
-
- _mm_storeu_si128((__m128i*)(intA + col), sumaL);
- sumaL = _mm_add_epi16(sumaL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
- sumaL = _mm_sra_epi16(sumaL, _mm_cvtsi32_si128(6));
- tmp16a = _mm_packus_epi16(sumaL, sumaL);
- _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), tmp16a);
-
- _mm_storeu_si128((__m128i*)(intB + col), sumbL);
- sumbL = _mm_add_epi16(sumbL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
- sumbL = _mm_sra_epi16(sumbL, _mm_cvtsi32_si128(6));
- tmp16b = _mm_packus_epi16(sumbL, sumbL);
- _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), tmp16b);
-
- _mm_storeu_si128((__m128i*)(intC + col), sumcL);
- sumcL = _mm_add_epi16(sumcL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
- sumcL = _mm_sra_epi16(sumcL, _mm_cvtsi32_si128(6));
- tmp16c = _mm_packus_epi16(sumcL, sumcL);
- _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), tmp16c);
+ T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[2]));
+ T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[2]));
+ T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[2]));
+ T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[2]));
+ T20 = _mm_hadd_epi16(T20, T21);
+ T21 = _mm_hadd_epi16(T22, T23);
+ T20 = _mm_hadd_epi16(T20, T21);
+ _mm_storeu_si128((__m128i*)(intC + col), _mm_sub_epi16(T20, c_off));
+ T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
+ T20 = _mm_packus_epi16(T20, T20);
+ _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), T20);
// Extend First column
- __m128i ma, mb, mc;
- ma = _mm_shuffle_epi8(tmp16a, _mm_set1_epi8(0));
- mb = _mm_shuffle_epi8(tmp16b, _mm_set1_epi8(0));
- mc = _mm_shuffle_epi8(tmp16c, _mm_set1_epi8(0));
-
+ ma = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstA[row * dstStride]), _mm_setzero_si128());
+ mb = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstB[row * dstStride]), _mm_setzero_si128());
+ mc = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstC[row * dstStride]), _mm_setzero_si128());
for (int i = -marginX; i < -16; i += 16)
{
_mm_storeu_si128((__m128i*)(dstA + row * dstStride + i), ma);
@@ -1164,93 +1147,59 @@
for (; col + 8 /*16*/ <= (block_width); col += 8 /*16*/) // Iterations multiple of 8
{
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
- sumbL = (_mm_unpacklo_epi8(vec_src0, _mm_setzero_si128()));
- sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
+ T00 = _mm_loadu_si128((__m128i*)(src + col + 3));
+ T00 = _mm_unpacklo_epi8(T00, _mm_setzero_si128());
+ T00 = _mm_slli_epi16(T00, 6);
+ _mm_storeu_si128((__m128i*)(intF + col), _mm_sub_epi16(T00, c_off));
- // a = b+=4*a1, c+=1*a1
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
- sumcL = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumbL = _mm_add_epi16(sumbL, _mm_sll_epi16(sumcL, _mm_cvtsi32_si128(2)));
- sumaL = sumbL;
+ T00 = _mm_loadu_si128((__m128i*)(src + col));
- // a +=-10*a2 b+=-11*a2 c+=-5*a2
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumbL = _mm_sub_epi16(sumbL, tmp);
- tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
- sumcL = _mm_add_epi16(sumcL, tmp);
- tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(1));
- sumaL = _mm_add_epi16(sumaL, tmp);
- sumbL = _mm_add_epi16(sumbL, tmp);
+ T10 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[0]));
+ T11 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[1]));
+ T12 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[2]));
+ T13 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[3]));
- // a +=58*a3 b+=40*a3 c+=17*a3
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- _mm_storeu_si128((__m128i*)(intF + col), _mm_sub_epi16(_mm_sll_epi16(tmp, _mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
- exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp, _mm_cvtsi32_si128(4)));
- sumcL = _mm_add_epi16(sumcL, exp1);
- sumaL = _mm_add_epi16(sumaL, tmp);
- tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
- sumbL = _mm_add_epi16(sumbL, tmp);
- sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
+ T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[0]));
+ T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[0]));
+ T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[0]));
+ T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[0]));
+ T20 = _mm_hadd_epi16(T20, T21);
+ T21 = _mm_hadd_epi16(T22, T23);
+ T20 = _mm_hadd_epi16(T20, T21);
+ _mm_storeu_si128((__m128i*)(intA + col), _mm_sub_epi16(T20, c_off));
+ T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
+ T20 = _mm_packus_epi16(T20, T20);
+ _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), T20);
+ tmp16a = T20;
- // a +=17*a4 b+=40*a4 c+=58*a4
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp, _mm_cvtsi32_si128(4)));
- sumaL = _mm_add_epi16(sumaL, exp1);
- sumcL = _mm_add_epi16(sumcL, tmp);
- tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
- sumbL = _mm_add_epi16(sumbL, tmp);
- sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
+ T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[1]));
+ T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[1]));
+ T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[1]));
+ T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[1]));
+ T20 = _mm_hadd_epi16(T20, T21);
+ T21 = _mm_hadd_epi16(T22, T23);
+ T20 = _mm_hadd_epi16(T20, T21);
+ _mm_storeu_si128((__m128i*)(intB + col), _mm_sub_epi16(T20, c_off));
+ T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
+ T20 = _mm_packus_epi16(T20, T20);
+ _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), T20);
+ tmp16b = T20;
- // a +=-5*a5 b+=-11*a5 c+=-10*a5
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumbL = _mm_sub_epi16(sumbL, tmp);
- tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
- sumaL = _mm_add_epi16(sumaL, tmp);
- tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(1));
- sumcL = _mm_add_epi16(sumcL, tmp);
- sumbL = _mm_add_epi16(sumbL, tmp);
-
- // a +=1*a6 b+=4*a6 c+=4*a6
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumaL = _mm_add_epi16(sumaL, tmp);
- tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(2));
- sumbL = _mm_add_epi16(sumbL, tmp);
- sumcL = _mm_add_epi16(sumcL, tmp);
-
- // a +=0*a7 b+=-1*a7 c+=-1*a7
- vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
- tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
- sumbL = _mm_sub_epi16(sumbL, tmp);
- sumcL = _mm_sub_epi16(sumcL, tmp);
- sumaL = _mm_add_epi16(sumaL, vec_offset);
- sumbL = _mm_add_epi16(sumbL, vec_offset);
- sumcL = _mm_add_epi16(sumcL, vec_offset);
-
- _mm_storeu_si128((__m128i*)(intA + col), sumaL);
- sumaL = _mm_add_epi16(sumaL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
- sumaL = _mm_sra_epi16(sumaL, _mm_cvtsi32_si128(6));
- tmp16a = _mm_packus_epi16(sumaL, sumaL);
- _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), tmp16a);
-
- _mm_storeu_si128((__m128i*)(intB + col), sumbL);
- sumbL = _mm_add_epi16(sumbL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
- sumbL = _mm_sra_epi16(sumbL, _mm_cvtsi32_si128(6));
- tmp16b = _mm_packus_epi16(sumbL, sumbL);
- _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), tmp16b);
-
- _mm_storeu_si128((__m128i*)(intC + col), sumcL);
- sumcL = _mm_add_epi16(sumcL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
- sumcL = _mm_sra_epi16(sumcL, _mm_cvtsi32_si128(6));
- tmp16c = _mm_packus_epi16(sumcL, sumcL);
- _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), tmp16c);
+ T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[2]));
+ T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[2]));
+ T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[2]));
+ T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[2]));
+ T20 = _mm_hadd_epi16(T20, T21);
+ T21 = _mm_hadd_epi16(T22, T23);
+ T20 = _mm_hadd_epi16(T20, T21);
+ _mm_storeu_si128((__m128i*)(intC + col), _mm_sub_epi16(T20, c_off));
+ T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
+ T20 = _mm_packus_epi16(T20, T20);
+ _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), T20);
+ tmp16c = T20;
}
+ // TODO: I think we may change the algorithm to always use aligned widths, so this code will be removed later
if (block_width - col > 0)
{
vec_src0 = _mm_loadu_si128((__m128i const*)(src + block_width - 5));
@@ -1283,17 +1232,12 @@
dstB[row * dstStride + col] = _mm_extract_epi8(sum3, 1);
dstC[row * dstStride + col] = _mm_extract_epi8(sum3, 2);
}
+ }
- tmp16a = _mm_shuffle_epi8(sum3, _mm_set1_epi8(0));
- tmp16b = _mm_shuffle_epi8(sum3, _mm_set1_epi8(1));
- tmp16c = _mm_shuffle_epi8(sum3, _mm_set1_epi8(2));
- }
- else
- {
- tmp16a = _mm_shuffle_epi8(tmp16a, _mm_set1_epi8(15));
- tmp16b = _mm_shuffle_epi8(tmp16b, _mm_set1_epi8(15));
- tmp16c = _mm_shuffle_epi8(tmp16c, _mm_set1_epi8(15));
- }
+ tmp16a = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstA[row * dstStride + block_width - 1]), _mm_setzero_si128());
+ tmp16b = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstB[row * dstStride + block_width - 1]), _mm_setzero_si128());
+ tmp16c = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstC[row * dstStride + block_width - 1]), _mm_setzero_si128());
+
// Extend last column
for (int i = -marginX; i < -16; i += 16)
{
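
A closing note on the two stores per plane (illustration, not taken from the patch): each filtered sum is written once at short precision for the intermediate planes and once rounded, shifted and clipped for the pixel planes, which matches the old path when the caller passes offset == -IF_INTERNAL_OFFS. A scalar reference, assuming x265's 14-bit internal precision (IF_INTERNAL_OFFS == 1 << 13), 8-bit pixels, and a made-up helper name filter_one_b:

// Scalar reference for one half-pel 'B' sample (illustration only)
#include <stdint.h>

#define IF_INTERNAL_OFFS (1 << 13)   // x265 internal offset for 14-bit intermediate precision

static void filter_one_b(const uint8_t *src, int16_t *intB, uint8_t *dstB)
{
    static const int taps[8] = { -1, 4, -11, 40, 40, -11, 4, -1 };
    int sum = 0;
    for (int k = 0; k < 8; k++)
        sum += taps[k] * src[k];                  // raw 8-tap sum, 6 extra bits of precision

    *intB = (int16_t)(sum - IF_INTERNAL_OFFS);    // short-precision plane kept for the later vertical pass

    int v = (sum + 32) >> 6;                      // round and drop the 6 filter bits
    *dstB = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));   // saturate like _mm_packus_epi16
}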