[x265] [PATCH] filterHorizontal_p_p vector portion replaced with intrinsic code

praveen at multicorewareinc.com praveen at multicorewareinc.com
Mon Jul 29 12:18:04 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1375093073 -19800
# Node ID 5c64a7b6b636dbe9d64daad435047e1f29330406
# Parent  9fb0dd3a7460acfeb55424a658ae1a40af12d85d
filterHorizontal_p_p: replace the vector-class code in the remaining-columns loop with SSE intrinsics

diff -r 9fb0dd3a7460 -r 5c64a7b6b636 source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc	Mon Jul 29 15:41:24 2013 +0530
+++ b/source/common/vec/ipfilter8.inc	Mon Jul 29 15:47:53 2013 +0530
@@ -760,22 +760,16 @@
 
         for (; col < width; col++)                        // Remaining iterations
         {
-            Vec8s vec_sum_low, vec_zero(0);
-            Vec16uc vec_src0, vec_sum;
-            Vec8s vec_c;
-            vec_c.load(coeff);
+            __m128i NewSrc = _mm_loadl_epi64((__m128i*)(src + col));
+            __m128i T00 = _mm_maddubs_epi16(NewSrc, T10);
+            __m128i add = _mm_hadd_epi16(T00, T00);
+            short sum =  _mm_extract_epi16(add, 0);
 
             if (N == 8)
             {
-                vec_src0.load(src + col);
+                add = _mm_hadd_epi16(add, add);
+                sum =  _mm_extract_epi16(add, 0);
             }
-            else
-            {
-                vec_src0 = load_partial_by_i<4>(src + col);
-            }
-            // Assuming that there is no overflow (Everywhere in this function!)
-            vec_sum_low = extend_low(vec_src0) * vec_c;
-            short sum = horizontal_add(vec_sum_low);
             short val = (short)(sum + offset) >> headRoom;
             val = (val < 0) ? 0 : val;
             val = (val > maxVal) ? maxVal : val;


More information about the x265-devel mailing list