[x265] [PATCH] pixel: replace getResidual32 from vector class to intrinsic
Steve Borho
steve at borho.org
Tue Oct 8 19:08:52 CEST 2013
On Tue, Oct 8, 2013 at 4:05 AM, <yuvaraj at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
> # Date 1381223006 -19800
> # Tue Oct 08 14:33:26 2013 +0530
> # Node ID 1a62566488b7ece9bbfb665e37ac402a08ce156e
> # Parent 96e30370f4d96c7fed69f432027ed3be8e01dcf6
> pixel: replace getResidual32 from vector class to intrinsic
>
> diff -r 96e30370f4d9 -r 1a62566488b7 source/common/vec/pixel8.inc
> --- a/source/common/vec/pixel8.inc Tue Oct 08 14:16:23 2013 +0530
> +++ b/source/common/vec/pixel8.inc Tue Oct 08 14:33:26 2013 +0530
> @@ -79,31 +79,37 @@
> RESIDUAL_16x4(12);
> }
>
> -void getResidual32(pixel *fenc, pixel *pred, short *resi, int stride)
> -{
> - Vec16uc f, p;
> - Vec8s r;
> -
> - for (int y = 0; y < 32; y++)
> - {
> - f.load_a(fenc);
> - p.load_a(pred);
> - r = extend_low(f) - extend_low(p);
> - r.store(resi);
> - r = extend_high(f) - extend_high(p);
> - r.store(resi + 8);
> -
> - f.load_a(fenc + 16);
> - p.load_a(pred + 16);
> - r = extend_low(f) - extend_low(p);
> - r.store(resi + 16);
> - r = extend_high(f) - extend_high(p);
> - r.store(resi + 24);
> -
> - fenc += stride;
> - pred += stride;
> - resi += stride;
> - }
> +void getResidual32(pixel *fenc, pixel *pred, short *resi, int stride)
> +{
> + __m128i T00, T01, T02, T03, T04;
> +
> +#define RESIDUAL_32x4(BASE, OFFSET) \
> + T00 = _mm_load_si128((__m128i*)(fenc + OFFSET + (BASE + 0) * stride)); \
> + T01 = _mm_load_si128((__m128i*)(pred + OFFSET + (BASE + 0) * stride)); \
> + T02 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); \
> + T03 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); \
> + T04 = _mm_sub_epi16(T02, T03); \
> + _mm_store_si128((__m128i*)(resi + OFFSET + (BASE + 0) * stride), T04); \
> + T02 = _mm_unpackhi_epi8(T00, _mm_setzero_si128()); \
> + T03 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); \
> + T04 = _mm_sub_epi16(T02, T03); \
> + _mm_store_si128((__m128i*)(resi + 8 + OFFSET + (BASE + 0) * stride), T04); \
> + T00 = _mm_load_si128((__m128i*)(fenc + OFFSET + (BASE + 1) * stride)); \
> + T01 = _mm_load_si128((__m128i*)(pred + OFFSET + (BASE + 1) * stride)); \
> + T02 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); \
> + T03 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); \
> + T04 = _mm_sub_epi16(T02, T03); \
> + _mm_store_si128((__m128i*)(resi + OFFSET + (BASE + 1) * stride), T04); \
> + T02 = _mm_unpackhi_epi8(T00, _mm_setzero_si128()); \
> + T03 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); \
> + T04 = _mm_sub_epi16(T02, T03); \
> + _mm_store_si128((__m128i*)(resi + 8 + OFFSET + (BASE + 1) * stride), T04)
> +
> + for (int i = 0; i < 32; i += 2)
> + {
> + RESIDUAL_32x4(i, 0);
> + RESIDUAL_32x4(i, 16);
>
I assume this macro should be named RESIDUAL_32x16; I will rename it and then queue the patch.
> + }
> }
>
> void getResidual64(pixel *fenc, pixel *pred, short *resi, int stride)
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131008/067ddef0/attachment.html>
More information about the x265-devel mailing list