[x265] [PATCH 1 of 6] quant: Improve performance using SSE4 intrinsics

Min Chen chenm003 at 163.com
Fri Aug 16 12:52:31 CEST 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1376650159 -28800
# Node ID 681ab201ea0cb2dfb6e08bc51c8d75a107aa1c39
# Parent  ec4a9d8a39b1e67cd549f5416e92f7f4ef7a0e62
quant: Improve performance using SSE4 intrinsics

diff -r ec4a9d8a39b1 -r 681ab201ea0c source/common/vec/dct.inc
--- a/source/common/vec/dct.inc	Fri Aug 16 04:02:49 2013 -0500
+++ b/source/common/vec/dct.inc	Fri Aug 16 18:49:19 2013 +0800
@@ -3912,56 +3912,43 @@
     int qBits8 = qBits - 8;
     uint32_t acSum = 0;
     int dstOffset = 0;
-
-    for (int blockpos = 0; blockpos < numCoeff; blockpos = blockpos + 4)
+    __m128i acSum4 = _mm_setzero_si128();
+    __m128i addVec = _mm_set1_epi32(add);
+
+    for (int blockpos = 0; blockpos < numCoeff; blockpos += 8)
     {
-        Vec4i addVec(add);
-
-        Vec4i zero(0);
-        Vec4i one(1);
-
-        Vec4i level1;
-        level1.load(coef + blockpos);
-
-        Vec4i sign1;
-        sign1 = level1 < zero;
-        sign1 = sign1 | one;
-
-        Vec4i qCoeff1;
-        qCoeff1.load(quantCoeff + blockpos);
-        Vec4i tmplevel1 = abs(level1) * qCoeff1;
-        level1 = (tmplevel1 + addVec) >> qBits;
-        Vec4i deltaU1 = ((tmplevel1 - (level1 << qBits)) >> qBits8);
-        deltaU1.store(deltaU + blockpos);
-        acSum = acSum + horizontal_add(level1);
-        level1 = level1 * sign1;
-
-        blockpos += 4;
-
-        Vec4i level2;
-        level2.load(coef + blockpos);
-
-        Vec4i sign2;
-        sign2 = level2 < zero;
-        sign2 = sign2 | one;
-
-        Vec4i qCoeff2;
-        qCoeff2.load(quantCoeff + blockpos);
-        Vec4i tmplevel2 = abs(level2) * qCoeff2;
-        level2 = (tmplevel2 + addVec) >> qBits;
-        Vec4i deltaU2 = ((tmplevel2 - (level2 << qBits)) >> qBits8);
-        deltaU2.store(deltaU + blockpos);
-        acSum = acSum + horizontal_add(level2);
-        level2 = level2 * sign2;
-
-        Vec8s level = compress_saturated(level1, level2);
-        Vec4i qCoef_n0_n3 = extend_low(level);
-        Vec4i qCoef_n4_n7 = extend_high(level);
-        qCoef_n0_n3.store(qCoef + dstOffset);
-        dstOffset += 4;
-        qCoef_n4_n7.store(qCoef + dstOffset);
-        dstOffset += 4;
+        __m128i level1 = _mm_loadu_si128((__m128i*)(coef + blockpos));
+
+        __m128i sign1 = _mm_cmplt_epi32(level1, _mm_setzero_si128());
+
+        __m128i qCoeff1 = _mm_loadu_si128((__m128i*)(quantCoeff + blockpos));
+        __m128i tmplevel1 = _mm_mullo_epi32(_mm_abs_epi32(level1), qCoeff1);
+        level1 = _mm_srai_epi32(_mm_add_epi32(tmplevel1, addVec), qBits);
+        __m128i deltaU1 = _mm_srai_epi32(_mm_sub_epi32(tmplevel1, _mm_slli_epi32(level1, qBits)), qBits8);
+        _mm_storeu_si128((__m128i*)(deltaU + blockpos), deltaU1);
+        acSum4 = _mm_add_epi32(acSum4, level1);
+        level1 = _mm_sub_epi32(_mm_xor_si128(level1, sign1), sign1);
+        level1 = _mm_cvtepi16_epi32(_mm_packs_epi32(level1, level1));
+        _mm_storeu_si128((__m128i*)(qCoef + dstOffset), level1);
+
+        __m128i level2 = _mm_loadu_si128((__m128i*)(coef + blockpos + 4));
+        __m128i sign2 = _mm_cmplt_epi32(level2, _mm_setzero_si128());
+
+        __m128i qCoeff2 = _mm_loadu_si128((__m128i*)(quantCoeff + blockpos + 4));
+        __m128i tmplevel2 = _mm_mullo_epi32(_mm_abs_epi32(level2), qCoeff2);
+        level2 = _mm_srai_epi32(_mm_add_epi32(tmplevel2, addVec), qBits);
+        __m128i deltaU2 = _mm_srai_epi32(_mm_sub_epi32(tmplevel2, _mm_slli_epi32(level2, qBits)), qBits8);
+        _mm_storeu_si128((__m128i*)(deltaU + blockpos + 4), deltaU2);
+        acSum4 = _mm_add_epi32(acSum4, level2);
+        level2 = _mm_sub_epi32(_mm_xor_si128(level2, sign2), sign2);
+        level2 = _mm_cvtepi16_epi32(_mm_packs_epi32(level2, level2));
+        _mm_storeu_si128((__m128i*)(qCoef + dstOffset + 4), level2);
+
+        dstOffset += 8;
     }
+    acSum4 = _mm_hadd_epi32(acSum4, acSum4);
+    acSum4 = _mm_hadd_epi32(acSum4, acSum4);
+    acSum  = _mm_cvtsi128_si32(acSum4);
 
     return acSum;
 }
@@ -3975,7 +3962,9 @@
 void NAME(Setup_Vec_DCTPrimitives)(EncoderPrimitives &p)
 {
     p.dequant = dequant;
+#if INSTRSET >= X265_CPU_LEVEL_SSE41
     p.quant = quant;
+#endif
 
 #if !HIGH_BIT_DEPTH && INSTRSET > X265_CPU_LEVEL_SSSE3
     p.dct[DST_4x4] = dst4;
diff -r ec4a9d8a39b1 -r 681ab201ea0c source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Fri Aug 16 04:02:49 2013 -0500
+++ b/source/test/mbdstharness.cpp	Fri Aug 16 18:49:19 2013 +0800
@@ -256,8 +256,8 @@
         int cmp_size = sizeof(int) * height * width;
         int numCoeff = height * width;
 
+        refReturnValue = ref(mintbuf1 + j, mintbuf2 + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff);
         optReturnValue = opt(mintbuf1 + j, mintbuf2 + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff);
-        refReturnValue = ref(mintbuf1 + j, mintbuf2 + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff);
 
         if (memcmp(mintbuf3, mintbuf5, cmp_size))
             return false;



More information about the x265-devel mailing list