[x265] [PATCH] pixel: convert getResidual64 from vector class to intrinsics
yuvaraj at multicorewareinc.com
Tue Oct 8 11:20:57 CEST 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1381223945 -19800
# Tue Oct 08 14:49:05 2013 +0530
# Node ID 23f4e0a507a6be19fceb4a2525aeb2a5fae5e1ab
# Parent 1a62566488b7ece9bbfb665e37ac402a08ce156e
pixel: convert getResidual64 from vector class to intrinsics
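For reference, each Vec16uc/Vec8s sequence in the old code maps directly onto SSE2 intrinsics: load_a becomes _mm_load_si128, extend_low/extend_high become _mm_unpacklo_epi8/_mm_unpackhi_epi8 against a zero register (which zero-extends the unsigned 8-bit pixels to 16 bits), the subtraction becomes _mm_sub_epi16, and store becomes _mm_store_si128. A minimal sketch of that mapping for one aligned 16-pixel chunk follows; residual16 and its parameter names are hypothetical, not part of the patch:

#include <emmintrin.h> /* SSE2 */

/* One aligned 16-pixel chunk: widen to 16-bit, subtract, store 16 shorts.
 * residual16 is a hypothetical helper, not part of the patch. */
static inline void residual16(const pixel *src, const pixel *prd, short *dst)
{
    __m128i zero = _mm_setzero_si128();
    __m128i f = _mm_load_si128((const __m128i*)src);   /* f.load_a(src) */
    __m128i p = _mm_load_si128((const __m128i*)prd);   /* p.load_a(prd) */
    /* extend_low(f) - extend_low(p): zero-extend the low 8 bytes to words */
    __m128i lo = _mm_sub_epi16(_mm_unpacklo_epi8(f, zero),
                               _mm_unpacklo_epi8(p, zero));
    _mm_store_si128((__m128i*)dst, lo);                /* r.store(dst) */
    /* extend_high(f) - extend_high(p): same for the upper 8 pixels */
    __m128i hi = _mm_sub_epi16(_mm_unpackhi_epi8(f, zero),
                               _mm_unpackhi_epi8(p, zero));
    _mm_store_si128((__m128i*)(dst + 8), hi);          /* r.store(dst + 8) */
}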
diff -r 1a62566488b7 -r 23f4e0a507a6 source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc Tue Oct 08 14:33:26 2013 +0530
+++ b/source/common/vec/pixel8.inc Tue Oct 08 14:49:05 2013 +0530
@@ -112,46 +112,41 @@
}
}
-void getResidual64(pixel *fenc, pixel *pred, short *resi, int stride)
-{
- Vec16uc f, p;
- Vec8s r;
+void getResidual64(pixel *fenc, pixel *pred, short *resi, int stride)
+{
+ __m128i T00, T01, T02, T03, T04;
+
+#define RESIDUAL_64x4(BASE, OFFSET) \
+ T00 = _mm_load_si128((__m128i*)(fenc + OFFSET + (BASE + 0) * stride)); \
+ T01 = _mm_load_si128((__m128i*)(pred + OFFSET + (BASE + 0) * stride)); \
+ T02 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); \
+ T03 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); \
+ T04 = _mm_sub_epi16(T02, T03); \
+ _mm_store_si128((__m128i*)(resi + OFFSET + (BASE + 0) * stride), T04); \
+ T02 = _mm_unpackhi_epi8(T00, _mm_setzero_si128()); \
+ T03 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); \
+ T04 = _mm_sub_epi16(T02, T03); \
+ _mm_store_si128((__m128i*)(resi + 8 + OFFSET + (BASE + 0) * stride), T04); \
+ T00 = _mm_load_si128((__m128i*)(fenc + OFFSET + (BASE + 1) * stride)); \
+ T01 = _mm_load_si128((__m128i*)(pred + OFFSET + (BASE + 1) * stride)); \
+ T02 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); \
+ T03 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); \
+ T04 = _mm_sub_epi16(T02, T03); \
+ _mm_store_si128((__m128i*)(resi + OFFSET + (BASE + 1) * stride), T04); \
+ T02 = _mm_unpackhi_epi8(T00, _mm_setzero_si128()); \
+ T03 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); \
+ T04 = _mm_sub_epi16(T02, T03); \
+ _mm_store_si128((__m128i*)(resi + 8 + OFFSET + (BASE + 1) * stride), T04)
+
+ for (int i = 0; i < 64; i += 2)
+ {
+ RESIDUAL_64x4(i, 0);
+ RESIDUAL_64x4(i, 16);
+ RESIDUAL_64x4(i, 32);
+ RESIDUAL_64x4(i, 48);
+ }
+}
- for (int y = 0; y < 64; y++)
- {
- f.load_a(fenc);
- p.load_a(pred);
- r = extend_low(f) - extend_low(p);
- r.store(resi);
- r = extend_high(f) - extend_high(p);
- r.store(resi + 8);
-
- f.load_a(fenc + 16);
- p.load_a(pred + 16);
- r = extend_low(f) - extend_low(p);
- r.store(resi + 16);
- r = extend_high(f) - extend_high(p);
- r.store(resi + 24);
-
- f.load_a(fenc + 32);
- p.load_a(pred + 32);
- r = extend_low(f) - extend_low(p);
- r.store(resi + 32);
- r = extend_high(f) - extend_high(p);
- r.store(resi + 40);
-
- f.load_a(fenc + 48);
- p.load_a(pred + 48);
- r = extend_low(f) - extend_low(p);
- r.store(resi + 48);
- r = extend_high(f) - extend_high(p);
- r.store(resi + 56);
-
- fenc += stride;
- pred += stride;
- resi += stride;
- }
-}
void calcRecons4(pixel* pPred, short* pResi, pixel* pReco, short* pRecQt, pixel* pRecIPred, int stride, int recstride, int ipredstride)
{
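When validating the intrinsic path, a plain-C reference of what getResidual64 computes (residual = source pixel minus predicted pixel, widened to 16 bits, over a 64x64 block) is handy for comparison. A sketch, assuming the usual x265 typedef where pixel is an 8-bit sample; getResidual64_c is a hypothetical name, not part of this patch:

/* Reference: resi[y][x] = fenc[y][x] - pred[y][x] for a 64x64 block.
 * Rows of all three buffers are `stride` elements apart. */
static void getResidual64_c(pixel *fenc, pixel *pred, short *resi, int stride)
{
    for (int y = 0; y < 64; y++)
    {
        for (int x = 0; x < 64; x++)
            resi[x] = (short)(fenc[x] - pred[x]);

        fenc += stride;
        pred += stride;
        resi += stride;
    }
}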