[x265] [PATCH] filterHorizontal_p_p vector portion replaced with intrinsic code
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Mon Jul 29 12:18:04 CEST 2013
# HG changeset patch
# User praveentiwari
# Date 1375093073 -19800
# Node ID 5c64a7b6b636dbe9d64daad435047e1f29330406
# Parent 9fb0dd3a7460acfeb55424a658ae1a40af12d85d
filterHorizontal_p_p vector portion replaced with intrinsic code
diff -r 9fb0dd3a7460 -r 5c64a7b6b636 source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Mon Jul 29 15:41:24 2013 +0530
+++ b/source/common/vec/ipfilter8.inc Mon Jul 29 15:47:53 2013 +0530
@@ -760,22 +760,16 @@
for (; col < width; col++) // Remaining iterations
{
- Vec8s vec_sum_low, vec_zero(0);
- Vec16uc vec_src0, vec_sum;
- Vec8s vec_c;
- vec_c.load(coeff);
+ __m128i NewSrc = _mm_loadl_epi64((__m128i*)(src + col));
+ __m128i T00 = _mm_maddubs_epi16(NewSrc, T10);
+ __m128i add = _mm_hadd_epi16(T00, T00);
+ short sum = _mm_extract_epi16(add, 0);
if (N == 8)
{
- vec_src0.load(src + col);
+ add = _mm_hadd_epi16(add, add);
+ sum = _mm_extract_epi16(add, 0);
}
- else
- {
- vec_src0 = load_partial_by_i<4>(src + col);
- }
- // Assuming that there is no overflow (Everywhere in this function!)
- vec_sum_low = extend_low(vec_src0) * vec_c;
- short sum = horizontal_add(vec_sum_low);
short val = (short)(sum + offset) >> headRoom;
val = (val < 0) ? 0 : val;
val = (val > maxVal) ? maxVal : val;
More information about the x265-devel mailing list