[x265] [PATCH] Vector Primitives : Removed hungarian notation
gopu at multicorewareinc.com
gopu at multicorewareinc.com
Mon Jul 8 09:22:17 CEST 2013
# HG changeset patch
# User ggopu
# Date 1373268038 -19800
# Node ID c8c33fdca89b02baf3eb8edffa9bc313f186c434
# Parent dc13d07919dbaa0a1ba82da1daf76bdf71bd08d1
Vector Primitives : Removed hungarian notation
diff -r dc13d07919db -r c8c33fdca89b source/common/vec/pixel.inc
--- a/source/common/vec/pixel.inc Mon Jul 08 11:28:20 2013 +0530
+++ b/source/common/vec/pixel.inc Mon Jul 08 12:50:38 2013 +0530
@@ -103,7 +103,7 @@
namespace {
/* File for pixels type-neutral code */
-void convert16to32(short *psOrg, int *piDst, int num)
+void convert16to32(short *Org, int *Dst, int num)
{
int i;
@@ -112,18 +112,18 @@
Vec8s im16;
Vec4i im32L, im32H;
- im16.load(psOrg);
+ im16.load(Org);
im32L = extend_low(im16);
im32H = extend_high(im16);
- im32L.store(piDst);
- im32H.store(piDst + 4);
+ im32L.store(Dst);
+ im32H.store(Dst + 4);
- psOrg += 8;
- piDst += 8;
+ Org += 8;
+ Dst += 8;
}
}
-void convert16to32_shl(int *piDst, short *psOrg, intptr_t iStride, int shift, int size)
+void convert16to32_shl(int *Dst, short *Org, intptr_t Stride, int shift, int size)
{
int i, j;
@@ -134,17 +134,17 @@
__m128i im16;
__m128i im32;
- im16 = _mm_loadl_epi64((__m128i*)&psOrg[i*iStride+j]);
+ im16 = _mm_loadl_epi64((__m128i*)&Org[i*Stride+j]);
im32 = _mm_srai_epi32(_mm_unpacklo_epi16(im16, im16), 16);
im32 = _mm_slli_epi32(im32, shift);
- _mm_storeu_si128((__m128i*)piDst, im32);
+ _mm_storeu_si128((__m128i*)Dst, im32);
- piDst += 4;
+ Dst += 4;
}
}
}
-void convert16to16_shl(short *psDst, short *psOrg, int width, int height, intptr_t stride, int shift)
+void convert16to16_shl(short *Dst, short *Org, int width, int height, intptr_t stride, int shift)
{
int i, j;
@@ -154,11 +154,11 @@
{
__m128i T00, T01;
- T00 = _mm_loadl_epi64((__m128i*)&psOrg[(i )*stride]);
- T01 = _mm_loadl_epi64((__m128i*)&psOrg[(i+1)*stride]);
+ T00 = _mm_loadl_epi64((__m128i*)&Org[(i )*stride]);
+ T01 = _mm_loadl_epi64((__m128i*)&Org[(i+1)*stride]);
T00 = _mm_unpacklo_epi64(T00, T01);
T00 = _mm_slli_epi16(T00, shift);
- _mm_storeu_si128((__m128i*)&psDst[i*4], T00);
+ _mm_storeu_si128((__m128i*)&Dst[i*4], T00);
}
}
else
@@ -169,15 +169,15 @@
{
__m128i T00;
- T00 = _mm_loadu_si128((__m128i*)&psOrg[i*stride+j]);
+ T00 = _mm_loadu_si128((__m128i*)&Org[i*stride+j]);
T00 = _mm_slli_epi16(T00, shift);
- _mm_storeu_si128((__m128i*)&psDst[i*width+j], T00);
+ _mm_storeu_si128((__m128i*)&Dst[i*width+j], T00);
}
}
}
}
-void convert32to16(int *psOrg, short *piDst, int num)
+void convert32to16(int *Org, short *Dst, int num)
{
int i;
@@ -186,17 +186,17 @@
Vec4i im32L, im32H;
Vec8s im16;
- im32L.load(psOrg);
- im32H.load(psOrg + 4);
+ im32L.load(Org);
+ im32H.load(Org + 4);
im16 = compress_saturated(im32L, im32H);
- im16.store(piDst);
+ im16.store(Dst);
- psOrg += 8;
- piDst += 8;
+ Org += 8;
+ Dst += 8;
}
}
-void convert32to16_shr(short *piDst, int *psOrg, int shift, int num)
+void convert32to16_shr(short *Dst, int *Org, int shift, int num)
{
int i;
Vec4i round = _mm_set1_epi32(1 << (shift - 1));
@@ -206,24 +206,24 @@
Vec4i im32;
Vec8s im16;
- im32.load(psOrg);
+ im32.load(Org);
im32 = (im32 + round) >> shift;
im16 = compress_saturated(im32, im32);
- store_partial(const_int(8), piDst, im16);
+ store_partial(const_int(8), Dst, im16);
- psOrg += 4;
- piDst += 4;
+ Org += 4;
+ Dst += 4;
}
}
template <int blockSize>
-void transpose(pixel* pDst, pixel* pSrc, intptr_t nStride)
+void transpose(pixel* Dst, pixel* Src, intptr_t Stride)
{
for (int k = 0; k < blockSize; k++)
{
for (int l = 0; l < blockSize; l++)
{
- pDst[k * blockSize + l] = pSrc[l * nStride + k];
+ Dst[k * blockSize + l] = Src[l * Stride + k];
}
}
}
@@ -231,21 +231,21 @@
#include "utils.h"
#if !HIGH_BIT_DEPTH
-void transpose4(pixel* pDst, pixel* pSrc, intptr_t nStride)
+void transpose4(pixel* Dst, pixel* Src, intptr_t Stride)
{
__m128i T00, T01, T02, T03;
- T00 = _mm_cvtsi32_si128(*(int*)&pSrc[0*nStride]); // [03 02 01 00]
- T01 = _mm_cvtsi32_si128(*(int*)&pSrc[1*nStride]); // [13 12 11 10]
- T02 = _mm_cvtsi32_si128(*(int*)&pSrc[2*nStride]); // [23 22 21 20]
- T03 = _mm_cvtsi32_si128(*(int*)&pSrc[3*nStride]); // [33 32 31 30]
+ T00 = _mm_cvtsi32_si128(*(int*)&Src[0*Stride]); // [03 02 01 00]
+ T01 = _mm_cvtsi32_si128(*(int*)&Src[1*Stride]); // [13 12 11 10]
+ T02 = _mm_cvtsi32_si128(*(int*)&Src[2*Stride]); // [23 22 21 20]
+ T03 = _mm_cvtsi32_si128(*(int*)&Src[3*Stride]); // [33 32 31 30]
T00 = _mm_unpacklo_epi8(T00, T01);
T01 = _mm_unpacklo_epi8(T02, T03);
T00 = _mm_unpacklo_epi16(T00, T01);
- _mm_store_si128((__m128i*)pDst, T00);
+ _mm_store_si128((__m128i*)Dst, T00);
}
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
@@ -267,107 +267,107 @@
out3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
}
-void transpose8(pixel* pDst, pixel* pSrc, intptr_t nStride)
+void transpose8(pixel* Dst, pixel* Src, intptr_t Stride)
{
__m128i T00, T01, T02, T03, T04, T05, T06, T07;
- T00 = _mm_loadl_epi64((__m128i*)&pSrc[0*nStride]); // [07 06 05 04 03 02 01 00]
- T01 = _mm_loadl_epi64((__m128i*)&pSrc[1*nStride]); // [17 16 15 14 13 12 11 10]
- T02 = _mm_loadl_epi64((__m128i*)&pSrc[2*nStride]); // [27 26 25 24 23 22 21 20]
- T03 = _mm_loadl_epi64((__m128i*)&pSrc[3*nStride]); // [37 36 35 34 33 32 31 30]
- T04 = _mm_loadl_epi64((__m128i*)&pSrc[4*nStride]); // [47 46 45 44 43 42 41 40]
- T05 = _mm_loadl_epi64((__m128i*)&pSrc[5*nStride]); // [57 56 55 54 53 52 51 50]
- T06 = _mm_loadl_epi64((__m128i*)&pSrc[6*nStride]); // [67 66 65 64 63 62 61 60]
- T07 = _mm_loadl_epi64((__m128i*)&pSrc[7*nStride]); // [77 76 75 74 73 72 71 70]
+ T00 = _mm_loadl_epi64((__m128i*)&Src[0*Stride]); // [07 06 05 04 03 02 01 00]
+ T01 = _mm_loadl_epi64((__m128i*)&Src[1*Stride]); // [17 16 15 14 13 12 11 10]
+ T02 = _mm_loadl_epi64((__m128i*)&Src[2*Stride]); // [27 26 25 24 23 22 21 20]
+ T03 = _mm_loadl_epi64((__m128i*)&Src[3*Stride]); // [37 36 35 34 33 32 31 30]
+ T04 = _mm_loadl_epi64((__m128i*)&Src[4*Stride]); // [47 46 45 44 43 42 41 40]
+ T05 = _mm_loadl_epi64((__m128i*)&Src[5*Stride]); // [57 56 55 54 53 52 51 50]
+ T06 = _mm_loadl_epi64((__m128i*)&Src[6*Stride]); // [67 66 65 64 63 62 61 60]
+ T07 = _mm_loadl_epi64((__m128i*)&Src[7*Stride]); // [77 76 75 74 73 72 71 70]
TRANSPOSE_8X8(T00, T01, T02, T03, T04, T05, T06, T07, T00, T01, T02, T03);
- _mm_store_si128((__m128i*)&pDst[0*8], T00);
- _mm_store_si128((__m128i*)&pDst[2*8], T01);
- _mm_store_si128((__m128i*)&pDst[4*8], T02);
- _mm_store_si128((__m128i*)&pDst[6*8], T03);
+ _mm_store_si128((__m128i*)&Dst[0*8], T00);
+ _mm_store_si128((__m128i*)&Dst[2*8], T01);
+ _mm_store_si128((__m128i*)&Dst[4*8], T02);
+ _mm_store_si128((__m128i*)&Dst[6*8], T03);
}
-ALWAYSINLINE void transpose16_dummy(pixel* pDst, intptr_t nStrideD, pixel* pSrc, intptr_t nStrideS)
+ALWAYSINLINE void transpose16_dummy(pixel* Dst, intptr_t StrideD, pixel* Src, intptr_t StrideS)
{
__m128i T00, T01, T02, T03, T04, T05, T06, T07;
- T00 = _mm_loadl_epi64((__m128i*)&pSrc[ 0 * nStrideS]);
- T01 = _mm_loadl_epi64((__m128i*)&pSrc[ 1 * nStrideS]);
- T02 = _mm_loadl_epi64((__m128i*)&pSrc[ 2 * nStrideS]);
- T03 = _mm_loadl_epi64((__m128i*)&pSrc[ 3 * nStrideS]);
- T04 = _mm_loadl_epi64((__m128i*)&pSrc[ 4 * nStrideS]);
- T05 = _mm_loadl_epi64((__m128i*)&pSrc[ 5 * nStrideS]);
- T06 = _mm_loadl_epi64((__m128i*)&pSrc[ 6 * nStrideS]);
- T07 = _mm_loadl_epi64((__m128i*)&pSrc[ 7 * nStrideS]);
+ T00 = _mm_loadl_epi64((__m128i*)&Src[ 0 * StrideS]);
+ T01 = _mm_loadl_epi64((__m128i*)&Src[ 1 * StrideS]);
+ T02 = _mm_loadl_epi64((__m128i*)&Src[ 2 * StrideS]);
+ T03 = _mm_loadl_epi64((__m128i*)&Src[ 3 * StrideS]);
+ T04 = _mm_loadl_epi64((__m128i*)&Src[ 4 * StrideS]);
+ T05 = _mm_loadl_epi64((__m128i*)&Src[ 5 * StrideS]);
+ T06 = _mm_loadl_epi64((__m128i*)&Src[ 6 * StrideS]);
+ T07 = _mm_loadl_epi64((__m128i*)&Src[ 7 * StrideS]);
TRANSPOSE_8X8(T00, T01, T02, T03, T04, T05, T06, T07, T00, T01, T02, T03);
- _mm_storel_epi64((__m128i*)&pDst[ 0 * nStrideD], T00);
- _mm_storeh_pi( (__m64*)&pDst[ 1 * nStrideD], _mm_castsi128_ps(T00));
- _mm_storel_epi64((__m128i*)&pDst[ 2 * nStrideD], T01);
- _mm_storeh_pi( (__m64*)&pDst[ 3 * nStrideD], _mm_castsi128_ps(T01));
- _mm_storel_epi64((__m128i*)&pDst[ 4 * nStrideD], T02);
- _mm_storeh_pi( (__m64*)&pDst[ 5 * nStrideD], _mm_castsi128_ps(T02));
- _mm_storel_epi64((__m128i*)&pDst[ 6 * nStrideD], T03);
- _mm_storeh_pi( (__m64*)&pDst[ 7 * nStrideD], _mm_castsi128_ps(T03));
+ _mm_storel_epi64((__m128i*)&Dst[ 0 * StrideD], T00);
+ _mm_storeh_pi( (__m64*)&Dst[ 1 * StrideD], _mm_castsi128_ps(T00));
+ _mm_storel_epi64((__m128i*)&Dst[ 2 * StrideD], T01);
+ _mm_storeh_pi( (__m64*)&Dst[ 3 * StrideD], _mm_castsi128_ps(T01));
+ _mm_storel_epi64((__m128i*)&Dst[ 4 * StrideD], T02);
+ _mm_storeh_pi( (__m64*)&Dst[ 5 * StrideD], _mm_castsi128_ps(T02));
+ _mm_storel_epi64((__m128i*)&Dst[ 6 * StrideD], T03);
+ _mm_storeh_pi( (__m64*)&Dst[ 7 * StrideD], _mm_castsi128_ps(T03));
- T00 = _mm_loadl_epi64((__m128i*)&pSrc[ 0 * nStrideS + 8]);
- T01 = _mm_loadl_epi64((__m128i*)&pSrc[ 1 * nStrideS + 8]);
- T02 = _mm_loadl_epi64((__m128i*)&pSrc[ 2 * nStrideS + 8]);
- T03 = _mm_loadl_epi64((__m128i*)&pSrc[ 3 * nStrideS + 8]);
- T04 = _mm_loadl_epi64((__m128i*)&pSrc[ 4 * nStrideS + 8]);
- T05 = _mm_loadl_epi64((__m128i*)&pSrc[ 5 * nStrideS + 8]);
- T06 = _mm_loadl_epi64((__m128i*)&pSrc[ 6 * nStrideS + 8]);
- T07 = _mm_loadl_epi64((__m128i*)&pSrc[ 7 * nStrideS + 8]);
+ T00 = _mm_loadl_epi64((__m128i*)&Src[ 0 * StrideS + 8]);
+ T01 = _mm_loadl_epi64((__m128i*)&Src[ 1 * StrideS + 8]);
+ T02 = _mm_loadl_epi64((__m128i*)&Src[ 2 * StrideS + 8]);
+ T03 = _mm_loadl_epi64((__m128i*)&Src[ 3 * StrideS + 8]);
+ T04 = _mm_loadl_epi64((__m128i*)&Src[ 4 * StrideS + 8]);
+ T05 = _mm_loadl_epi64((__m128i*)&Src[ 5 * StrideS + 8]);
+ T06 = _mm_loadl_epi64((__m128i*)&Src[ 6 * StrideS + 8]);
+ T07 = _mm_loadl_epi64((__m128i*)&Src[ 7 * StrideS + 8]);
TRANSPOSE_8X8(T00, T01, T02, T03, T04, T05, T06, T07, T00, T01, T02, T03);
- _mm_storel_epi64((__m128i*)&pDst[ 8 * nStrideD], T00);
- _mm_storeh_pi( (__m64*)&pDst[ 9 * nStrideD], _mm_castsi128_ps(T00));
- _mm_storel_epi64((__m128i*)&pDst[10 * nStrideD], T01);
- _mm_storeh_pi( (__m64*)&pDst[11 * nStrideD], _mm_castsi128_ps(T01));
- _mm_storel_epi64((__m128i*)&pDst[12 * nStrideD], T02);
- _mm_storeh_pi( (__m64*)&pDst[13 * nStrideD], _mm_castsi128_ps(T02));
- _mm_storel_epi64((__m128i*)&pDst[14 * nStrideD], T03);
- _mm_storeh_pi( (__m64*)&pDst[15 * nStrideD], _mm_castsi128_ps(T03));
+ _mm_storel_epi64((__m128i*)&Dst[ 8 * StrideD], T00);
+ _mm_storeh_pi( (__m64*)&Dst[ 9 * StrideD], _mm_castsi128_ps(T00));
+ _mm_storel_epi64((__m128i*)&Dst[10 * StrideD], T01);
+ _mm_storeh_pi( (__m64*)&Dst[11 * StrideD], _mm_castsi128_ps(T01));
+ _mm_storel_epi64((__m128i*)&Dst[12 * StrideD], T02);
+ _mm_storeh_pi( (__m64*)&Dst[13 * StrideD], _mm_castsi128_ps(T02));
+ _mm_storel_epi64((__m128i*)&Dst[14 * StrideD], T03);
+ _mm_storeh_pi( (__m64*)&Dst[15 * StrideD], _mm_castsi128_ps(T03));
- T00 = _mm_loadl_epi64((__m128i*)&pSrc[ 8 * nStrideS]);
- T01 = _mm_loadl_epi64((__m128i*)&pSrc[ 9 * nStrideS]);
- T02 = _mm_loadl_epi64((__m128i*)&pSrc[10 * nStrideS]);
- T03 = _mm_loadl_epi64((__m128i*)&pSrc[11 * nStrideS]);
- T04 = _mm_loadl_epi64((__m128i*)&pSrc[12 * nStrideS]);
- T05 = _mm_loadl_epi64((__m128i*)&pSrc[13 * nStrideS]);
- T06 = _mm_loadl_epi64((__m128i*)&pSrc[14 * nStrideS]);
- T07 = _mm_loadl_epi64((__m128i*)&pSrc[15 * nStrideS]);
+ T00 = _mm_loadl_epi64((__m128i*)&Src[ 8 * StrideS]);
+ T01 = _mm_loadl_epi64((__m128i*)&Src[ 9 * StrideS]);
+ T02 = _mm_loadl_epi64((__m128i*)&Src[10 * StrideS]);
+ T03 = _mm_loadl_epi64((__m128i*)&Src[11 * StrideS]);
+ T04 = _mm_loadl_epi64((__m128i*)&Src[12 * StrideS]);
+ T05 = _mm_loadl_epi64((__m128i*)&Src[13 * StrideS]);
+ T06 = _mm_loadl_epi64((__m128i*)&Src[14 * StrideS]);
+ T07 = _mm_loadl_epi64((__m128i*)&Src[15 * StrideS]);
TRANSPOSE_8X8(T00, T01, T02, T03, T04, T05, T06, T07, T00, T01, T02, T03);
- _mm_storel_epi64((__m128i*)&pDst[ 0 * nStrideD + 8], T00);
- _mm_storeh_pi( (__m64*)&pDst[ 1 * nStrideD + 8], _mm_castsi128_ps(T00));
- _mm_storel_epi64((__m128i*)&pDst[ 2 * nStrideD + 8], T01);
- _mm_storeh_pi( (__m64*)&pDst[ 3 * nStrideD + 8], _mm_castsi128_ps(T01));
- _mm_storel_epi64((__m128i*)&pDst[ 4 * nStrideD + 8], T02);
- _mm_storeh_pi( (__m64*)&pDst[ 5 * nStrideD + 8], _mm_castsi128_ps(T02));
- _mm_storel_epi64((__m128i*)&pDst[ 6 * nStrideD + 8], T03);
- _mm_storeh_pi( (__m64*)&pDst[ 7 * nStrideD + 8], _mm_castsi128_ps(T03));
+ _mm_storel_epi64((__m128i*)&Dst[ 0 * StrideD + 8], T00);
+ _mm_storeh_pi( (__m64*)&Dst[ 1 * StrideD + 8], _mm_castsi128_ps(T00));
+ _mm_storel_epi64((__m128i*)&Dst[ 2 * StrideD + 8], T01);
+ _mm_storeh_pi( (__m64*)&Dst[ 3 * StrideD + 8], _mm_castsi128_ps(T01));
+ _mm_storel_epi64((__m128i*)&Dst[ 4 * StrideD + 8], T02);
+ _mm_storeh_pi( (__m64*)&Dst[ 5 * StrideD + 8], _mm_castsi128_ps(T02));
+ _mm_storel_epi64((__m128i*)&Dst[ 6 * StrideD + 8], T03);
+ _mm_storeh_pi( (__m64*)&Dst[ 7 * StrideD + 8], _mm_castsi128_ps(T03));
- T00 = _mm_loadl_epi64((__m128i*)&pSrc[ 8 * nStrideS + 8]);
- T01 = _mm_loadl_epi64((__m128i*)&pSrc[ 9 * nStrideS + 8]);
- T02 = _mm_loadl_epi64((__m128i*)&pSrc[10 * nStrideS + 8]);
- T03 = _mm_loadl_epi64((__m128i*)&pSrc[11 * nStrideS + 8]);
- T04 = _mm_loadl_epi64((__m128i*)&pSrc[12 * nStrideS + 8]);
- T05 = _mm_loadl_epi64((__m128i*)&pSrc[13 * nStrideS + 8]);
- T06 = _mm_loadl_epi64((__m128i*)&pSrc[14 * nStrideS + 8]);
- T07 = _mm_loadl_epi64((__m128i*)&pSrc[15 * nStrideS + 8]);
+ T00 = _mm_loadl_epi64((__m128i*)&Src[ 8 * StrideS + 8]);
+ T01 = _mm_loadl_epi64((__m128i*)&Src[ 9 * StrideS + 8]);
+ T02 = _mm_loadl_epi64((__m128i*)&Src[10 * StrideS + 8]);
+ T03 = _mm_loadl_epi64((__m128i*)&Src[11 * StrideS + 8]);
+ T04 = _mm_loadl_epi64((__m128i*)&Src[12 * StrideS + 8]);
+ T05 = _mm_loadl_epi64((__m128i*)&Src[13 * StrideS + 8]);
+ T06 = _mm_loadl_epi64((__m128i*)&Src[14 * StrideS + 8]);
+ T07 = _mm_loadl_epi64((__m128i*)&Src[15 * StrideS + 8]);
TRANSPOSE_8X8(T00, T01, T02, T03, T04, T05, T06, T07, T00, T01, T02, T03);
- _mm_storel_epi64((__m128i*)&pDst[ 8 * nStrideD + 8], T00);
- _mm_storeh_pi( (__m64*)&pDst[ 9 * nStrideD + 8], _mm_castsi128_ps(T00));
- _mm_storel_epi64((__m128i*)&pDst[10 * nStrideD + 8], T01);
- _mm_storeh_pi( (__m64*)&pDst[11 * nStrideD + 8], _mm_castsi128_ps(T01));
- _mm_storel_epi64((__m128i*)&pDst[12 * nStrideD + 8], T02);
- _mm_storeh_pi( (__m64*)&pDst[13 * nStrideD + 8], _mm_castsi128_ps(T02));
- _mm_storel_epi64((__m128i*)&pDst[14 * nStrideD + 8], T03);
- _mm_storeh_pi( (__m64*)&pDst[15 * nStrideD + 8], _mm_castsi128_ps(T03));
+ _mm_storel_epi64((__m128i*)&Dst[ 8 * StrideD + 8], T00);
+ _mm_storeh_pi( (__m64*)&Dst[ 9 * StrideD + 8], _mm_castsi128_ps(T00));
+ _mm_storel_epi64((__m128i*)&Dst[10 * StrideD + 8], T01);
+ _mm_storeh_pi( (__m64*)&Dst[11 * StrideD + 8], _mm_castsi128_ps(T01));
+ _mm_storel_epi64((__m128i*)&Dst[12 * StrideD + 8], T02);
+ _mm_storeh_pi( (__m64*)&Dst[13 * StrideD + 8], _mm_castsi128_ps(T02));
+ _mm_storel_epi64((__m128i*)&Dst[14 * StrideD + 8], T03);
+ _mm_storeh_pi( (__m64*)&Dst[15 * StrideD + 8], _mm_castsi128_ps(T03));
}
-void transpose16(pixel* pDst, pixel* pSrc, intptr_t nStrideS)
+void transpose16(pixel* Dst, pixel* Src, intptr_t StrideS)
{
- transpose16_dummy(pDst, 16, pSrc, nStrideS);
+ transpose16_dummy(Dst, 16, Src, StrideS);
}
void transpose32(pixel* dst, pixel* src, intptr_t strideSrc)
diff -r dc13d07919db -r c8c33fdca89b source/common/vec/pixel16.inc
--- a/source/common/vec/pixel16.inc Mon Jul 08 11:28:20 2013 +0530
+++ b/source/common/vec/pixel16.inc Mon Jul 08 12:50:38 2013 +0530
@@ -1,7 +1,7 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
- * Authors: Steve Borho <steve at borho.org>
+ * Authors: Steve Borho <steve at borho.org>
* Mandar Gurav <mandar at multicorewareinc.com>
* Mahesh Pittala <mahesh at multicorewareinc.com>
*
@@ -28,25 +28,25 @@
/* intrinsics for when pixel type is short */
template<int ly>
-int sad_4(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_4(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec8s m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad);
@@ -55,12 +55,12 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad);
@@ -69,25 +69,25 @@
}
template<int ly>
-int sad_8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_8(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec8s m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -96,12 +96,12 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -110,31 +110,31 @@
}
template<int ly>
-int sad_12(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_12(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec8s m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
+ m1.load_a(Org + 8);
m1.cutoff(4);
- n1.load(piCur + 8);
+ n1.load(Cur + 8);
n1.cutoff(4);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -143,18 +143,18 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
+ m1.load_a(Org + 8);
m1.cutoff(4);
- n1.load(piCur + 8);
+ n1.load(Cur + 8);
n1.cutoff(4);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -163,29 +163,29 @@
}
template<int ly>
-int sad_16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_16(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec8s m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur + 8);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -194,16 +194,16 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur + 8);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -212,33 +212,33 @@
}
template<int ly>
-int sad_24(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur)
+int sad_24(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur)
{
Vec8s m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur + 8);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -247,20 +247,20 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur + 8);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -269,37 +269,37 @@
}
template<int ly>
-int sad_32(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_32(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec8s m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur + 8);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 24);
- n1.load(piCur + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur + 24);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -308,24 +308,24 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur + 8);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 24);
- n1.load(piCur + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur + 24);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -334,45 +334,45 @@
}
template<int ly>
-int sad_48(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_48(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec8s m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur + 8);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 24);
- n1.load(piCur + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur + 24);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 32);
- n1.load(piCur + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur + 32);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 40);
- n1.load(piCur + 40);
+ m1.load_a(Org + 40);
+ n1.load(Cur + 40);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -381,32 +381,32 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur + 8);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 24);
- n1.load(piCur + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur + 24);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 32);
- n1.load(piCur + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur + 32);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 40);
- n1.load(piCur + 40);
+ m1.load_a(Org + 40);
+ n1.load(Cur + 40);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -415,7 +415,7 @@
}
template<int ly>
-int sad_64(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_64(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec8s m1, n1;
@@ -427,40 +427,40 @@
{
for (int i = 0; i < 4; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur + 8);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 24);
- n1.load(piCur + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur + 24);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 32);
- n1.load(piCur + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur + 32);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 40);
- n1.load(piCur + 40);
+ m1.load_a(Org + 40);
+ n1.load(Cur + 40);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 48);
- n1.load(piCur + 48);
+ m1.load_a(Org + 48);
+ n1.load(Cur + 48);
sad += abs(m1 - n1);
- m1.load_a(piOrg + 56);
- n1.load(piCur + 56);
+ m1.load_a(Org + 56);
+ n1.load(Cur + 56);
sad += abs(m1 - n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -471,32 +471,32 @@
}
template<int ly>
-void sad_x3_4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_4(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3;
Vec8us sad1(0), sad2(0), sad3(0);
Vec4i sum1(0), sum2(0), sum3(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1);
@@ -509,19 +509,19 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1);
@@ -534,32 +534,32 @@
}
template<int ly>
-void sad_x3_8(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_8(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3;
Vec8us sad1(0), sad2(0), sad3(0);
Vec4i sum1(0), sum2(0), sum3(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -572,19 +572,19 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -597,45 +597,45 @@
}
template<int ly>
-void sad_x3_12(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_12(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3;
Vec8us sad1(0), sad2(0), sad3(0);
Vec4i sum1(0), sum2(0), sum3(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 8);
+ m1.load_a(Org + 8);
m1.cutoff(4);
- n1.load(piCur1 + 8);
+ n1.load(Cur1 + 8);
n1.cutoff(4);
- n2.load(piCur2 + 8);
+ n2.load(Cur2 + 8);
n2.cutoff(4);
- n3.load(piCur3 + 8);
+ n3.load(Cur3 + 8);
n3.cutoff(4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -648,32 +648,32 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 8);
+ m1.load_a(Org + 8);
m1.cutoff(4);
- n1.load(piCur1 + 8);
+ n1.load(Cur1 + 8);
n1.cutoff(4);
- n2.load(piCur2 + 8);
+ n2.load(Cur2 + 8);
n2.cutoff(4);
- n3.load(piCur3 + 8);
+ n3.load(Cur3 + 8);
n3.cutoff(4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -686,41 +686,41 @@
}
template<int ly>
-void sad_x3_16(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_16(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3;
Vec8us sad1(0), sad2(0), sad3(0);
Vec4i sum1(0), sum2(0), sum3(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -733,28 +733,28 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -767,7 +767,7 @@
}
template<int ly>
-void sad_x3_24(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_24(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3;
@@ -779,37 +779,37 @@
{
for (int i = 0; i < 4; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -826,7 +826,7 @@
}
template<int ly>
-void sad_x3_32(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_32(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3;
@@ -838,46 +838,46 @@
{
for (int i = 0; i < 4; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 24);
- n1.load(piCur1 + 24);
- n2.load(piCur2 + 24);
- n3.load(piCur3 + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur1 + 24);
+ n2.load(Cur2 + 24);
+ n3.load(Cur3 + 24);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -894,7 +894,7 @@
}
template<int ly>
-void sad_x3_48(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_48(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3;
@@ -906,64 +906,64 @@
{
for (int i = 0; i < 2; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 24);
- n1.load(piCur1 + 24);
- n2.load(piCur2 + 24);
- n3.load(piCur3 + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur1 + 24);
+ n2.load(Cur2 + 24);
+ n3.load(Cur3 + 24);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 40);
- n1.load(piCur1 + 40);
- n2.load(piCur2 + 40);
- n3.load(piCur3 + 40);
+ m1.load_a(Org + 40);
+ n1.load(Cur1 + 40);
+ n2.load(Cur2 + 40);
+ n3.load(Cur3 + 40);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -980,7 +980,7 @@
}
template<int ly>
-void sad_x3_64(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_64(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3;
@@ -992,82 +992,82 @@
{
for (int i = 0; i < 2; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 24);
- n1.load(piCur1 + 24);
- n2.load(piCur2 + 24);
- n3.load(piCur3 + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur1 + 24);
+ n2.load(Cur2 + 24);
+ n3.load(Cur3 + 24);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 40);
- n1.load(piCur1 + 40);
- n2.load(piCur2 + 40);
- n3.load(piCur3 + 40);
+ m1.load_a(Org + 40);
+ n1.load(Cur1 + 40);
+ n2.load(Cur2 + 40);
+ n3.load(Cur3 + 40);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 48);
- n1.load(piCur1 + 48);
- n2.load(piCur2 + 48);
- n3.load(piCur3 + 48);
+ m1.load_a(Org + 48);
+ n1.load(Cur1 + 48);
+ n2.load(Cur2 + 48);
+ n3.load(Cur3 + 48);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- m1.load_a(piOrg + 56);
- n1.load(piCur1 + 56);
- n2.load(piCur2 + 56);
- n3.load(piCur3 + 56);
+ m1.load_a(Org + 56);
+ n1.load(Cur1 + 56);
+ n2.load(Cur2 + 56);
+ n3.load(Cur3 + 56);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1084,35 +1084,35 @@
}
template<int ly>
-void sad_x4_4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_4(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3, n4;
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1);
@@ -1127,22 +1127,22 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1);
@@ -1157,35 +1157,35 @@
}
template<int ly>
-void sad_x4_8(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_8(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3, n4;
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1200,22 +1200,22 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1230,39 +1230,39 @@
}
template<int ly>
-void sad_x4_12(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_12(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3, n4;
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 8);
+ m1.load_a(Org + 8);
m1.cutoff(4);
- n1.load(piCur1 + 8);
+ n1.load(Cur1 + 8);
n1.cutoff(4);
- n2.load(piCur2 + 8);
+ n2.load(Cur2 + 8);
n2.cutoff(4);
- n3.load(piCur3 + 8);
+ n3.load(Cur3 + 8);
n3.cutoff(4);
- n4.load(piCur4 + 8);
+ n4.load(Cur4 + 8);
n4.cutoff(4);
sad1 += abs(m1 - n1);
@@ -1270,11 +1270,11 @@
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1289,26 +1289,26 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 8);
+ m1.load_a(Org + 8);
m1.cutoff(4);
- n1.load(piCur1 + 8);
+ n1.load(Cur1 + 8);
n1.cutoff(4);
- n2.load(piCur2 + 8);
+ n2.load(Cur2 + 8);
n2.cutoff(4);
- n3.load(piCur3 + 8);
+ n3.load(Cur3 + 8);
n3.cutoff(4);
- n4.load(piCur4 + 8);
+ n4.load(Cur4 + 8);
n4.cutoff(4);
sad1 += abs(m1 - n1);
@@ -1316,11 +1316,11 @@
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1335,46 +1335,46 @@
}
template<int ly>
-void sad_x4_16(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_16(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3, n4;
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
- n4.load(piCur4 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
+ n4.load(Cur4 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1389,33 +1389,33 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
- n4.load(piCur4 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
+ n4.load(Cur4 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1430,7 +1430,7 @@
}
template<int ly>
-void sad_x4_24(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_24(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3, n4;
@@ -1442,44 +1442,44 @@
{
for (int i = 0; i < 4; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
- n4.load(piCur4 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
+ n4.load(Cur4 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
- n4.load(piCur4 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
+ n4.load(Cur4 + 16);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1499,7 +1499,7 @@
}
template<int ly>
-void sad_x4_32(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_32(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3, n4;
@@ -1511,55 +1511,55 @@
{
for (int i = 0; i < 4; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
- n4.load(piCur4 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
+ n4.load(Cur4 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
- n4.load(piCur4 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
+ n4.load(Cur4 + 16);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 24);
- n1.load(piCur1 + 24);
- n2.load(piCur2 + 24);
- n3.load(piCur3 + 24);
- n4.load(piCur4 + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur1 + 24);
+ n2.load(Cur2 + 24);
+ n3.load(Cur3 + 24);
+ n4.load(Cur4 + 24);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1579,7 +1579,7 @@
}
template<int ly>
-void sad_x4_48(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_48(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3, n4;
@@ -1591,77 +1591,77 @@
{
for (int i = 0; i < 2; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
- n4.load(piCur4 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
+ n4.load(Cur4 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
- n4.load(piCur4 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
+ n4.load(Cur4 + 16);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 24);
- n1.load(piCur1 + 24);
- n2.load(piCur2 + 24);
- n3.load(piCur3 + 24);
- n4.load(piCur4 + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur1 + 24);
+ n2.load(Cur2 + 24);
+ n3.load(Cur3 + 24);
+ n4.load(Cur4 + 24);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
- n4.load(piCur4 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
+ n4.load(Cur4 + 32);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 40);
- n1.load(piCur1 + 40);
- n2.load(piCur2 + 40);
- n3.load(piCur3 + 40);
- n4.load(piCur4 + 40);
+ m1.load_a(Org + 40);
+ n1.load(Cur1 + 40);
+ n2.load(Cur2 + 40);
+ n3.load(Cur3 + 40);
+ n4.load(Cur4 + 40);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1681,7 +1681,7 @@
}
template<int ly>
-void sad_x4_64(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_64(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec8s m1, n1, n2, n3, n4;
@@ -1693,99 +1693,99 @@
{
for (int i = 0; i < 2; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 8);
- n1.load(piCur1 + 8);
- n2.load(piCur2 + 8);
- n3.load(piCur3 + 8);
- n4.load(piCur4 + 8);
+ m1.load_a(Org + 8);
+ n1.load(Cur1 + 8);
+ n2.load(Cur2 + 8);
+ n3.load(Cur3 + 8);
+ n4.load(Cur4 + 8);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
- n4.load(piCur4 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
+ n4.load(Cur4 + 16);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 24);
- n1.load(piCur1 + 24);
- n2.load(piCur2 + 24);
- n3.load(piCur3 + 24);
- n4.load(piCur4 + 24);
+ m1.load_a(Org + 24);
+ n1.load(Cur1 + 24);
+ n2.load(Cur2 + 24);
+ n3.load(Cur3 + 24);
+ n4.load(Cur4 + 24);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
- n4.load(piCur4 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
+ n4.load(Cur4 + 32);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 40);
- n1.load(piCur1 + 40);
- n2.load(piCur2 + 40);
- n3.load(piCur3 + 40);
- n4.load(piCur4 + 40);
+ m1.load_a(Org + 40);
+ n1.load(Cur1 + 40);
+ n2.load(Cur2 + 40);
+ n3.load(Cur3 + 40);
+ n4.load(Cur4 + 40);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 48);
- n1.load(piCur1 + 48);
- n2.load(piCur2 + 48);
- n3.load(piCur3 + 48);
- n4.load(piCur4 + 48);
+ m1.load_a(Org + 48);
+ n1.load(Cur1 + 48);
+ n2.load(Cur2 + 48);
+ n3.load(Cur3 + 48);
+ n4.load(Cur4 + 48);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- m1.load_a(piOrg + 56);
- n1.load(piCur1 + 56);
- n2.load(piCur2 + 56);
- n3.load(piCur3 + 56);
- n4.load(piCur4 + 56);
+ m1.load_a(Org + 56);
+ n1.load(Cur1 + 56);
+ n2.load(Cur2 + 56);
+ n3.load(Cur3 + 56);
+ n4.load(Cur4 + 56);
sad1 += abs(m1 - n1);
sad2 += abs(m1 - n2);
sad3 += abs(m1 - n3);
sad4 += abs(m1 - n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1804,42 +1804,42 @@
res[3] = horizontal_add(sum4);
}
-int satd_4x4(pixel * piOrg, intptr_t iStrideOrg, pixel * piCur, intptr_t iStrideCur)
+int satd_4x4(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
int satd = 0;
Vec8s v1, v2, m1, m2;
{
- Vec8s temp1, temp2, temp3, temp4, piOrg_v, piCur_v;
- temp1.load(piOrg);
- temp2.load(piCur);
- piCur += iStrideCur;
- piOrg += iStrideOrg;
+ Vec8s temp1, temp2, temp3, temp4, Org_v, Cur_v;
+ temp1.load(Org);
+ temp2.load(Cur);
+ Cur += strideCur;
+ Org += strideOrg;
- temp3.load(piOrg);
- temp4.load(piCur);
- piCur += iStrideCur;
- piOrg += iStrideOrg;
+ temp3.load(Org);
+ temp4.load(Cur);
+ Cur += strideCur;
+ Org += strideOrg;
- piOrg_v = blend2q<0, 2>((Vec2q)temp1, (Vec2q)temp3);
- piCur_v = blend2q<0, 2>((Vec2q)temp2, (Vec2q)temp4);
+ Org_v = blend2q<0, 2>((Vec2q)temp1, (Vec2q)temp3);
+ Cur_v = blend2q<0, 2>((Vec2q)temp2, (Vec2q)temp4);
- temp1.load(piOrg);
- temp2.load(piCur);
- piCur += iStrideCur;
- piOrg += iStrideOrg;
+ temp1.load(Org);
+ temp2.load(Cur);
+ Cur += strideCur;
+ Org += strideOrg;
- temp3.load(piOrg);
- temp4.load(piCur);
- piCur += iStrideCur;
- piOrg += iStrideOrg;
+ temp3.load(Org);
+ temp4.load(Cur);
+ Cur += strideCur;
+ Org += strideOrg;
- v1 = piOrg_v - piCur_v; //diff
+ v1 = Org_v - Cur_v; //diff
- piOrg_v = blend2q<0, 2>((Vec2q)temp3, (Vec2q)temp1);
- piCur_v = blend2q<0, 2>((Vec2q)temp4, (Vec2q)temp2);
- v2 = piOrg_v - piCur_v; //diff
+ Org_v = blend2q<0, 2>((Vec2q)temp3, (Vec2q)temp1);
+ Cur_v = blend2q<0, 2>((Vec2q)temp4, (Vec2q)temp2);
+ v2 = Org_v - Cur_v; //diff
}
for (int i = 0; i < 2; i++)
@@ -1872,29 +1872,29 @@
return satd;
}
-int sa8d_8x8(pixel * piOrg, intptr_t iStrideOrg, pixel * piCur, intptr_t iStrideCur)
+int sa8d_8x8(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
ALIGN_VAR_16(short, m2[8][8]);
- Vec8s diff_v1, diff_v2, piOrg_v1, piOrg_v2, piCur_v1, piCur_v2;
+ Vec8s diff_v1, diff_v2, Org_v1, Org_v2, Cur_v1, Cur_v2;
Vec8s v1, v2, t1, t2;
int j, satd = 0;
for (j = 0; j < 8; j += 2)
{
- piOrg_v1.load_a(piOrg);
- piCur_v1.load(piCur);
- piCur += iStrideCur;
- piOrg += iStrideOrg;
+ Org_v1.load_a(Org);
+ Cur_v1.load(Cur);
+ Cur += strideCur;
+ Org += strideOrg;
- piOrg_v2.load_a(piOrg);
- piCur_v2.load(piCur);
- piCur += iStrideCur;
- piOrg += iStrideOrg;
+ Org_v2.load_a(Org);
+ Cur_v2.load(Cur);
+ Cur += strideCur;
+ Org += strideOrg;
- diff_v1 = piOrg_v1 - piCur_v1;
- diff_v2 = piOrg_v2 - piCur_v2;
+ diff_v1 = Org_v1 - Cur_v1;
+ diff_v2 = Org_v2 - Cur_v2;
v1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(diff_v1, diff_v2);
v2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(diff_v1, diff_v2);
@@ -2039,7 +2039,7 @@
}
template<int lx, int ly>
-int satd(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int satd(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
int uiSum = 0;
@@ -2047,8 +2047,8 @@
{
for (int col = 0; col < lx; col += 4)
{
- uiSum += satd_4x4(piOrg + strideOrg * row + col, strideOrg,
- piCur + strideCur * row + col, strideCur);
+ uiSum += satd_4x4(Org + strideOrg * row + col, strideOrg,
+ Cur + strideCur * row + col, strideCur);
}
}
diff -r dc13d07919db -r c8c33fdca89b source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc Mon Jul 08 11:28:20 2013 +0530
+++ b/source/common/vec/pixel8.inc Mon Jul 08 12:50:38 2013 +0530
@@ -1,7 +1,7 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
- * Authors: Steve Borho <steve at borho.org>
+ * Authors: Steve Borho <steve at borho.org>
* Mandar Gurav <mandar at multicorewareinc.com>
* Mahesh Pittala <mahesh at multicorewareinc.com>
*
@@ -33,25 +33,25 @@
#endif
template<int ly>
-int sad_4(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_4(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec16uc m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.fromUint32(*(uint32_t*)piOrg);
- n1.fromUint32(*(uint32_t*)piCur);
+ m1.fromUint32(*(uint32_t*)Org);
+ n1.fromUint32(*(uint32_t*)Cur);
sad.addSumAbsDiff(m1, n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad);
@@ -60,12 +60,12 @@
while (row++ < ly)
{
- m1.fromUint32(*(uint32_t*)piOrg);
- n1.fromUint32(*(uint32_t*)piCur);
+ m1.fromUint32(*(uint32_t*)Org);
+ n1.fromUint32(*(uint32_t*)Cur);
sad.addSumAbsDiff(m1, n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad);
@@ -73,77 +73,77 @@
}
template<int size>
-ALWAYSINLINE void unrollFunc_8(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us& sad)
+ALWAYSINLINE void unrollFunc_8(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us& sad)
{
- unrollFunc_8<1>(piOrg, strideOrg, piCur, strideCur, sad);
- unrollFunc_8<size - 1>(piOrg + strideOrg, strideOrg, piCur + strideCur, strideCur, sad);
+ unrollFunc_8<1>(Org, strideOrg, Cur, strideCur, sad);
+ unrollFunc_8<size - 1>(Org + strideOrg, strideOrg, Cur + strideCur, strideCur, sad);
}
template<>
-ALWAYSINLINE void unrollFunc_8<1>(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us& sad)
+ALWAYSINLINE void unrollFunc_8<1>(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us& sad)
{
Vec16uc m1, n1;
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad.addSumAbsDiff(m1, n1);
}
template<int ly>
-int sad_8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_8(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec4i sum(0);
Vec8us sad;
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
if (ly < 16)
{
sad = 0;
- unrollFunc_8<ly>(piOrg, strideOrg, piCur, strideCur, sad);
+ unrollFunc_8<ly>(Org, strideOrg, Cur, strideCur, sad);
sum += extend_low(sad);
return horizontal_add(sum);
}
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
sad = 0;
- unrollFunc_8<16>(piOrg, strideOrg, piCur, strideCur, sad);
+ unrollFunc_8<16>(Org, strideOrg, Cur, strideCur, sad);
sum += extend_low(sad);
- piOrg += strideOrg * 16;
- piCur += strideCur * 16;
+ Org += strideOrg * 16;
+ Cur += strideCur * 16;
}
if (ly & 8)
{
sad = 0;
- unrollFunc_8<8>(piOrg, strideOrg, piCur, strideCur, sad);
+ unrollFunc_8<8>(Org, strideOrg, Cur, strideCur, sad);
sum += extend_low(sad);
}
return horizontal_add(sum);
}
template<int ly>
-int sad_12(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_12(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec16uc m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
+ m1.load_a(Org);
m1.cutoff(12);
- n1.load(piCur);
+ n1.load(Cur);
n1.cutoff(12);
sad.addSumAbsDiff(m1, n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -152,14 +152,14 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
+ m1.load_a(Org);
m1.cutoff(12);
- n1.load(piCur);
+ n1.load(Cur);
n1.cutoff(12);
sad.addSumAbsDiff(m1, n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -167,48 +167,48 @@
}
template<int size>
-ALWAYSINLINE void unrollFunc_16(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us& sad)
+ALWAYSINLINE void unrollFunc_16(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us& sad)
{
- unrollFunc_16<1>(piOrg, strideOrg, piCur, strideCur, sad);
- unrollFunc_16<size - 1>(piOrg + strideOrg, strideOrg, piCur + strideCur, strideCur, sad);
+ unrollFunc_16<1>(Org, strideOrg, Cur, strideCur, sad);
+ unrollFunc_16<size - 1>(Org + strideOrg, strideOrg, Cur + strideCur, strideCur, sad);
}
template<>
-ALWAYSINLINE void unrollFunc_16<1>(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us& sad)
+ALWAYSINLINE void unrollFunc_16<1>(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us& sad)
{
Vec16uc m1, n1;
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad.addSumAbsDiff(m1, n1);
}
template<int ly>
-int sad_16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_16(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row = 0;
if (ly < 16)
{
- unrollFunc_16<ly>(piOrg, strideOrg, piCur, strideCur, sad);
+ unrollFunc_16<ly>(Org, strideOrg, Cur, strideCur, sad);
sum += extend_low(sad) + extend_high(sad);
return horizontal_add(sum);
}
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
- unrollFunc_16<16>(piOrg, strideOrg, piCur, strideCur, sad);
+ unrollFunc_16<16>(Org, strideOrg, Cur, strideCur, sad);
sum += extend_low(sad) + extend_high(sad);
sad = 0;
- piOrg += strideOrg * 16;
- piCur += strideCur * 16;
+ Org += strideOrg * 16;
+ Cur += strideCur * 16;
}
if (ly & 8)
{
- unrollFunc_16<8>(piOrg, strideOrg, piCur, strideCur, sad);
+ unrollFunc_16<8>(Org, strideOrg, Cur, strideCur, sad);
sum += extend_low(sad) + extend_high(sad);
return horizontal_add(sum);
}
@@ -216,31 +216,31 @@
}
template<int ly>
-int sad_24(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur)
+int sad_24(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur)
{
Vec16uc m1, n1;
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad.addSumAbsDiff(m1, n1);
- m1.load_a(piOrg + 16);
+ m1.load_a(Org + 16);
m1.cutoff(8);
- n1.load(piCur + 16);
+ n1.load(Cur + 16);
n1.cutoff(8);
sad.addSumAbsDiff(m1, n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -249,18 +249,18 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad.addSumAbsDiff(m1, n1);
- m1.load_a(piOrg + 16);
+ m1.load_a(Org + 16);
m1.cutoff(8);
- n1.load(piCur + 16);
+ n1.load(Cur + 16);
n1.cutoff(8);
sad.addSumAbsDiff(m1, n1);
- piOrg += strideOrg;
- piCur += strideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
sum += extend_low(sad) + extend_high(sad);
@@ -268,182 +268,182 @@
}
template<int size>
-ALWAYSINLINE void unrollFunc_32(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us& sad)
+ALWAYSINLINE void unrollFunc_32(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us& sad)
{
- unrollFunc_32<1>(piOrg, strideOrg, piCur, strideCur, sad);
- unrollFunc_32<size - 1>(piOrg + strideOrg, strideOrg, piCur + strideCur, strideCur, sad);
+ unrollFunc_32<1>(Org, strideOrg, Cur, strideCur, sad);
+ unrollFunc_32<size - 1>(Org + strideOrg, strideOrg, Cur + strideCur, strideCur, sad);
}
template<>
-ALWAYSINLINE void unrollFunc_32<1>(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us& sad)
+ALWAYSINLINE void unrollFunc_32<1>(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us& sad)
{
Vec16uc m1, n1;
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad.addSumAbsDiff(m1, n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad.addSumAbsDiff(m1, n1);
}
template<int ly>
-int sad_32(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_32(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec4i sum(0);
Vec8us sad;
- int main_iters = (ly >> 2) << 2;
+ int max_iterators = (ly >> 2) << 2;
int row;
if (ly == 4)
{
sad = 0;
- unrollFunc_32<4>(piOrg, strideOrg, piCur, strideCur, sad);
+ unrollFunc_32<4>(Org, strideOrg, Cur, strideCur, sad);
sum += extend_low(sad) + extend_high(sad);
return horizontal_add(sum);
}
- for (row = 0; row < main_iters; row += 4)
+ for (row = 0; row < max_iterators; row += 4)
{
sad = 0;
- unrollFunc_32<4>(piOrg, strideOrg, piCur, strideCur, sad);
+ unrollFunc_32<4>(Org, strideOrg, Cur, strideCur, sad);
sum += extend_low(sad) + extend_high(sad);
- piOrg += strideOrg * 4;
- piCur += strideCur * 4;
+ Org += strideOrg * 4;
+ Cur += strideCur * 4;
}
return horizontal_add(sum);
}
template<int size>
-ALWAYSINLINE void unrollFunc_48(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us *sad)
+ALWAYSINLINE void unrollFunc_48(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us *sad)
{
- unrollFunc_48<1>(piOrg, strideOrg, piCur, strideCur, sad);
- unrollFunc_48<size - 1>(piOrg + strideOrg, strideOrg, piCur + strideCur, strideCur, sad);
+ unrollFunc_48<1>(Org, strideOrg, Cur, strideCur, sad);
+ unrollFunc_48<size - 1>(Org + strideOrg, strideOrg, Cur + strideCur, strideCur, sad);
}
template<>
-ALWAYSINLINE void unrollFunc_48<1>(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us *sad)
+ALWAYSINLINE void unrollFunc_48<1>(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us *sad)
{
Vec16uc m1, n1;
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad[0].addSumAbsDiff(m1, n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad[0].addSumAbsDiff(m1, n1);
- m1.load_a(piOrg + 32);
- n1.load(piCur + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur + 32);
sad[0].addSumAbsDiff(m1, n1);
}
template<int ly>
-int sad_48(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_48(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec4i sum(0);
Vec8us sad(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
if (ly == 4)
{
- unrollFunc_48<4>(piOrg, strideOrg, piCur, strideCur, &sad);
+ unrollFunc_48<4>(Org, strideOrg, Cur, strideCur, &sad);
sum += extend_low(sad) + extend_high(sad);
return horizontal_add(sum);
}
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
- unrollFunc_48<8>(piOrg, strideOrg, piCur, strideCur, &sad);
+ unrollFunc_48<8>(Org, strideOrg, Cur, strideCur, &sad);
sum += extend_low(sad) + extend_high(sad);
sad = 0;
- piOrg += strideOrg * 8;
- piCur += strideCur * 8;
+ Org += strideOrg * 8;
+ Cur += strideCur * 8;
}
if (ly & 4)
{
- unrollFunc_48<4>(piOrg, strideOrg, piCur, strideCur, &sad);
+ unrollFunc_48<4>(Org, strideOrg, Cur, strideCur, &sad);
sum += extend_low(sad) + extend_high(sad);
}
return horizontal_add(sum);
}
template<int size>
-ALWAYSINLINE void unrollFunc_64(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us& sad)
+ALWAYSINLINE void unrollFunc_64(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us& sad)
{
- unrollFunc_64<1>(piOrg, strideOrg, piCur, strideCur, sad);
- unrollFunc_64<size - 1>(piOrg + strideOrg, strideOrg, piCur + strideCur, strideCur, sad);
+ unrollFunc_64<1>(Org, strideOrg, Cur, strideCur, sad);
+ unrollFunc_64<size - 1>(Org + strideOrg, strideOrg, Cur + strideCur, strideCur, sad);
}
template<>
-ALWAYSINLINE void unrollFunc_64<1>(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us& sad)
+ALWAYSINLINE void unrollFunc_64<1>(pixel *Org, intptr_t strideOrg, pixel *Cur, intptr_t strideCur, Vec8us& sad)
{
Vec16uc m1, n1;
- m1.load_a(piOrg);
- n1.load(piCur);
+ m1.load_a(Org);
+ n1.load(Cur);
sad.addSumAbsDiff(m1, n1);
- m1.load_a(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur + 16);
sad.addSumAbsDiff(m1, n1);
- m1.load_a(piOrg + 32);
- n1.load(piCur + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur + 32);
sad.addSumAbsDiff(m1, n1);
- m1.load_a(piOrg + 48);
- n1.load(piCur + 48);
+ m1.load_a(Org + 48);
+ n1.load(Cur + 48);
sad.addSumAbsDiff(m1, n1);
}
template<int ly>
-int sad_64(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int sad_64(pixel * Org, intptr_t strideOrg, pixel * Cur, intptr_t strideCur)
{
Vec4i sum(0);
Vec8us sad;
- int main_iters = (ly >> 2) << 2;
+ int max_iterators = (ly >> 2) << 2;
int row;
- for (row = 0; row < main_iters; row += 4)
+ for (row = 0; row < max_iterators; row += 4)
{
sad = 0;
- unrollFunc_64<4>(piOrg, strideOrg, piCur, strideCur, sad);
+ unrollFunc_64<4>(Org, strideOrg, Cur, strideCur, sad);
sum += extend_low(sad) + extend_high(sad);
- piOrg += strideOrg * 4;
- piCur += strideCur * 4;
+ Org += strideOrg * 4;
+ Cur += strideCur * 4;
}
return horizontal_add(sum);
}
template<int ly>
-void sad_x3_4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_4(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3;
Vec4i sum1(0), sum2(0), sum3(0);
Vec8us sad1(0), sad2(0), sad3(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.fromUint32(*(uint32_t*)piOrg);
- n1.fromUint32(*(uint32_t*)piCur1);
- n2.fromUint32(*(uint32_t*)piCur2);
- n3.fromUint32(*(uint32_t*)piCur3);
+ m1.fromUint32(*(uint32_t*)Org);
+ n1.fromUint32(*(uint32_t*)Cur1);
+ n2.fromUint32(*(uint32_t*)Cur2);
+ n3.fromUint32(*(uint32_t*)Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1);
@@ -456,19 +456,19 @@
while (row++ < ly)
{
- m1.fromUint32(*(uint32_t*)piOrg);
- n1.fromUint32(*(uint32_t*)piCur1);
- n2.fromUint32(*(uint32_t*)piCur2);
- n3.fromUint32(*(uint32_t*)piCur3);
+ m1.fromUint32(*(uint32_t*)Org);
+ n1.fromUint32(*(uint32_t*)Cur1);
+ n2.fromUint32(*(uint32_t*)Cur2);
+ n3.fromUint32(*(uint32_t*)Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1);
@@ -482,32 +482,32 @@
/* For performance - This function assumes that the *last load* can access 16 elements. */
template<int ly>
-void sad_x3_8(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_8(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3;
Vec4i sum1(0), sum2(0), sum3(0);
Vec8us sad1(0), sad2(0), sad3(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1);
@@ -520,19 +520,19 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1);
@@ -546,36 +546,36 @@
/* For performance - This function assumes that the *last load* can access 16 elements. */
template<int ly>
-void sad_x3_12(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_12(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3;
Vec4i sum1(0), sum2(0), sum3(0);
Vec8us sad1(0), sad2(0), sad3(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
+ m1.load_a(Org);
m1.cutoff(12);
- n1.load(piCur1);
+ n1.load(Cur1);
n1.cutoff(12);
- n2.load(piCur2);
+ n2.load(Cur2);
n2.cutoff(12);
- n3.load(piCur3);
+ n3.load(Cur3);
n3.cutoff(12);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -588,23 +588,23 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
+ m1.load_a(Org);
m1.cutoff(12);
- n1.load(piCur1);
+ n1.load(Cur1);
n1.cutoff(12);
- n2.load(piCur2);
+ n2.load(Cur2);
n2.cutoff(12);
- n3.load(piCur3);
+ n3.load(Cur3);
n3.cutoff(12);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -617,32 +617,32 @@
}
template<int ly>
-void sad_x3_16(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_16(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3;
Vec4i sum1(0), sum2(0), sum3(0);
Vec8us sad1(0), sad2(0), sad3(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -655,19 +655,19 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -680,45 +680,45 @@
}
template<int ly>
-void sad_x3_24(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_24(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3;
Vec4i sum1(0), sum2(0), sum3(0);
Vec8us sad1(0), sad2(0), sad3(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 16);
+ m1.load_a(Org + 16);
m1.cutoff(8);
- n1.load(piCur1 + 16);
+ n1.load(Cur1 + 16);
n1.cutoff(8);
- n2.load(piCur2 + 16);
+ n2.load(Cur2 + 16);
n2.cutoff(8);
- n3.load(piCur3 + 16);
+ n3.load(Cur3 + 16);
n3.cutoff(8);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -731,32 +731,32 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 16);
+ m1.load_a(Org + 16);
m1.cutoff(8);
- n1.load(piCur1 + 16);
+ n1.load(Cur1 + 16);
n1.cutoff(8);
- n2.load(piCur2 + 16);
+ n2.load(Cur2 + 16);
n2.cutoff(8);
- n3.load(piCur3 + 16);
+ n3.load(Cur3 + 16);
n3.cutoff(8);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -769,41 +769,41 @@
}
template<int ly>
-void sad_x3_32(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_32(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3;
Vec4i sum1(0), sum2(0), sum3(0);
Vec8us sad1(0), sad2(0), sad3(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -816,28 +816,28 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -850,50 +850,50 @@
}
template<int ly>
-void sad_x3_48(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_48(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3;
Vec4i sum1(0), sum2(0), sum3(0);
Vec8us sad1(0), sad2(0), sad3(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -906,37 +906,37 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -949,7 +949,7 @@
}
template<int ly>
-void sad_x3_64(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+void sad_x3_64(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3;
@@ -961,46 +961,46 @@
{
for (int i = 0; i < 4; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- m1.load_a(piOrg + 48);
- n1.load(piCur1 + 48);
- n2.load(piCur2 + 48);
- n3.load(piCur3 + 48);
+ m1.load_a(Org + 48);
+ n1.load(Cur1 + 48);
+ n2.load(Cur2 + 48);
+ n3.load(Cur3 + 48);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1017,35 +1017,35 @@
}
template<int ly>
-void sad_x4_4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_4(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3, n4;
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.fromUint32(*(uint32_t*)piOrg);
- n1.fromUint32(*(uint32_t*)piCur1);
- n2.fromUint32(*(uint32_t*)piCur2);
- n3.fromUint32(*(uint32_t*)piCur3);
- n4.fromUint32(*(uint32_t*)piCur4);
+ m1.fromUint32(*(uint32_t*)Org);
+ n1.fromUint32(*(uint32_t*)Cur1);
+ n2.fromUint32(*(uint32_t*)Cur2);
+ n3.fromUint32(*(uint32_t*)Cur3);
+ n4.fromUint32(*(uint32_t*)Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1);
@@ -1060,22 +1060,22 @@
while (row++ < ly)
{
- m1.fromUint32(*(uint32_t*)piOrg);
- n1.fromUint32(*(uint32_t*)piCur1);
- n2.fromUint32(*(uint32_t*)piCur2);
- n3.fromUint32(*(uint32_t*)piCur3);
- n4.fromUint32(*(uint32_t*)piCur4);
+ m1.fromUint32(*(uint32_t*)Org);
+ n1.fromUint32(*(uint32_t*)Cur1);
+ n2.fromUint32(*(uint32_t*)Cur2);
+ n3.fromUint32(*(uint32_t*)Cur3);
+ n4.fromUint32(*(uint32_t*)Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1);
@@ -1090,35 +1090,35 @@
}
template<int ly>
-void sad_x4_8(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_8(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3, n4;
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1);
@@ -1133,22 +1133,22 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1);
@@ -1164,28 +1164,28 @@
/* For performance - This function assumes that the *last load* can access 16 elements. */
template<int ly>
-void sad_x4_12(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_12(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3, n4;
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
+ m1.load_a(Org);
m1.cutoff(12);
- n1.load(piCur1);
+ n1.load(Cur1);
n1.cutoff(12);
- n2.load(piCur2);
+ n2.load(Cur2);
n2.cutoff(12);
- n3.load(piCur3);
+ n3.load(Cur3);
n3.cutoff(12);
- n4.load(piCur4);
+ n4.load(Cur4);
n4.cutoff(12);
sad1.addSumAbsDiff(m1, n1);
@@ -1193,11 +1193,11 @@
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1212,15 +1212,15 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
+ m1.load_a(Org);
m1.cutoff(12);
- n1.load(piCur1);
+ n1.load(Cur1);
n1.cutoff(12);
- n2.load(piCur2);
+ n2.load(Cur2);
n2.cutoff(12);
- n3.load(piCur3);
+ n3.load(Cur3);
n3.cutoff(12);
- n4.load(piCur4);
+ n4.load(Cur4);
n4.cutoff(12);
sad1.addSumAbsDiff(m1, n1);
@@ -1228,11 +1228,11 @@
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1247,35 +1247,35 @@
}
template<int ly>
-void sad_x4_16(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_16(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3, n4;
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1290,22 +1290,22 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1320,39 +1320,39 @@
}
template<int ly>
-void sad_x4_24(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_24(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3, n4;
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
- int main_iters = (ly >> 4) << 4;
+ int max_iterators = (ly >> 4) << 4;
int row;
- for (row = 0; row < main_iters; row += 16)
+ for (row = 0; row < max_iterators; row += 16)
{
for (int i = 0; i < 16; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 16);
+ m1.load_a(Org + 16);
m1.cutoff(8);
- n1.load(piCur1 + 16);
+ n1.load(Cur1 + 16);
n1.cutoff(8);
- n2.load(piCur2 + 16);
+ n2.load(Cur2 + 16);
n2.cutoff(8);
- n3.load(piCur3 + 16);
+ n3.load(Cur3 + 16);
n3.cutoff(8);
- n4.load(piCur4 + 16);
+ n4.load(Cur4 + 16);
n4.cutoff(8);
sad1.addSumAbsDiff(m1, n1);
@@ -1360,11 +1360,11 @@
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1379,26 +1379,26 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 16);
+ m1.load_a(Org + 16);
m1.cutoff(8);
- n1.load(piCur1 + 16);
+ n1.load(Cur1 + 16);
n1.cutoff(8);
- n2.load(piCur2 + 16);
+ n2.load(Cur2 + 16);
n2.cutoff(8);
- n3.load(piCur3 + 16);
+ n3.load(Cur3 + 16);
n3.cutoff(8);
- n4.load(piCur4 + 16);
+ n4.load(Cur4 + 16);
n4.cutoff(8);
sad1.addSumAbsDiff(m1, n1);
@@ -1406,11 +1406,11 @@
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1425,46 +1425,46 @@
}
template<int ly>
-void sad_x4_32(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_32(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3, n4;
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
- n4.load(piCur4 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
+ n4.load(Cur4 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1479,33 +1479,33 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
- n4.load(piCur4 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
+ n4.load(Cur4 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1520,57 +1520,57 @@
}
template<int ly>
-void sad_x4_48(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_48(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3, n4;
Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
- int main_iters = (ly >> 3) << 3;
+ int max_iterators = (ly >> 3) << 3;
int row;
- for (row = 0; row < main_iters; row += 8)
+ for (row = 0; row < max_iterators; row += 8)
{
for (int i = 0; i < 8; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
- n4.load(piCur4 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
+ n4.load(Cur4 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
- n4.load(piCur4 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
+ n4.load(Cur4 + 32);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1585,44 +1585,44 @@
while (row++ < ly)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
- n4.load(piCur4 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
+ n4.load(Cur4 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
- n4.load(piCur4 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
+ n4.load(Cur4 + 32);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
@@ -1637,7 +1637,7 @@
}
template<int ly>
-void sad_x4_64(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+void sad_x4_64(pixel *Org, pixel *Cur1, pixel *Cur2, pixel *Cur3, pixel *Cur4, intptr_t strideCur, int *res)
{
Vec16uc m1, n1, n2, n3, n4;
@@ -1649,55 +1649,55 @@
{
for (int i = 0; i < 4; i++)
{
- m1.load_a(piOrg);
- n1.load(piCur1);
- n2.load(piCur2);
- n3.load(piCur3);
- n4.load(piCur4);
+ m1.load_a(Org);
+ n1.load(Cur1);
+ n2.load(Cur2);
+ n3.load(Cur3);
+ n4.load(Cur4);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 16);
- n1.load(piCur1 + 16);
- n2.load(piCur2 + 16);
- n3.load(piCur3 + 16);
- n4.load(piCur4 + 16);
+ m1.load_a(Org + 16);
+ n1.load(Cur1 + 16);
+ n2.load(Cur2 + 16);
+ n3.load(Cur3 + 16);
+ n4.load(Cur4 + 16);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 32);
- n1.load(piCur1 + 32);
- n2.load(piCur2 + 32);
- n3.load(piCur3 + 32);
- n4.load(piCur4 + 32);
+ m1.load_a(Org + 32);
+ n1.load(Cur1 + 32);
+ n2.load(Cur2 + 32);
+ n3.load(Cur3 + 32);
+ n4.load(Cur4 + 32);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- m1.load_a(piOrg + 48);
- n1.load(piCur1 + 48);
- n2.load(piCur2 + 48);
- n3.load(piCur3 + 48);
- n4.load(piCur4 + 48);
+ m1.load_a(Org + 48);
+ n1.load(Cur1 + 48);
+ n2.load(Cur2 + 48);
+ n3.load(Cur3 + 48);
+ n4.load(Cur4 + 48);
sad1.addSumAbsDiff(m1, n1);
sad2.addSumAbsDiff(m1, n2);
sad3.addSumAbsDiff(m1, n3);
sad4.addSumAbsDiff(m1, n4);
- piOrg += FENC_STRIDE;
- piCur1 += strideCur;
- piCur2 += strideCur;
- piCur3 += strideCur;
- piCur4 += strideCur;
+ Org += FENC_STRIDE;
+ Cur1 += strideCur;
+ Cur2 += strideCur;
+ Cur3 += strideCur;
+ Cur4 += strideCur;
}
sum1 += extend_low(sad1) + extend_high(sad1);
diff -r dc13d07919db -r c8c33fdca89b source/common/vec/sse.inc
--- a/source/common/vec/sse.inc Mon Jul 08 11:28:20 2013 +0530
+++ b/source/common/vec/sse.inc Mon Jul 08 12:50:38 2013 +0530
@@ -26,62 +26,62 @@
/* intrinsics for when pixel type is uint8_t */
template<int ly>
-int sse_pp4(pixel* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_pp4(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec16uc m1, n1;
Vec8us diff(0);
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.fromUint32(*(uint32_t*)piOrg);
- n1.fromUint32(*(uint32_t*)piCur);
+ m1.fromUint32(*(uint32_t*)Org);
+ n1.fromUint32(*(uint32_t*)Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += extend_low(diff);
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_pp8(pixel* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_pp8(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec16uc m1, n1;
Vec8us diff(0);
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += (extend_low(diff) + (extend_high(diff)));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_pp12(pixel* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_pp12(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec16uc m1, n1;
Vec8us diff(0);
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
+ m1.load(Org);
m1.cutoff(12);
- n1.load(piCur);
+ n1.load(Cur);
n1.cutoff(12);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
@@ -91,50 +91,50 @@
diff = diff * diff;
sum += (extend_low(diff) + (extend_high(diff)));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_pp16(pixel* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_pp16(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec16uc m1, n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_pp24(pixel* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_pp24(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec16uc m1, n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
@@ -142,32 +142,32 @@
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- m1.load(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load(Org + 16);
+ n1.load(Cur + 16);
diff_low = extend_low(m1) - extend_low(n1);
diff_low = diff_low * diff_low;
sum_low += extend_low(diff_low);
sum_high += extend_high(diff_low);
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_pp32(pixel* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_pp32(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec16uc m1, n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
@@ -175,33 +175,33 @@
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- m1.load(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load(Org + 16);
+ n1.load(Cur + 16);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_pp48(pixel* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_pp48(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec16uc m1, n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
@@ -209,8 +209,8 @@
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- m1.load(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load(Org + 16);
+ n1.load(Cur + 16);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
@@ -218,33 +218,33 @@
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- m1.load(piOrg + 32);
- n1.load(piCur + 32);
+ m1.load(Org + 32);
+ n1.load(Cur + 32);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_pp64(pixel* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_pp64(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec16uc m1, n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
@@ -252,8 +252,8 @@
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- m1.load(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load(Org + 16);
+ n1.load(Cur + 16);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
@@ -261,8 +261,8 @@
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- m1.load(piOrg + 32);
- n1.load(piCur + 32);
+ m1.load(Org + 32);
+ n1.load(Cur + 32);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
@@ -270,55 +270,55 @@
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- m1.load(piOrg + 48);
- n1.load(piCur + 48);
+ m1.load(Org + 48);
+ n1.load(Cur + 48);
diff_low = extend_low(m1) - extend_low(n1);
diff_high = extend_high(m1) - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_ss4(short* piOrg, intptr_t iStrideOrg, short* piCur, intptr_t iStrideCur)
+int sse_ss4(short* Org, intptr_t strideOrg, short* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec4i diff(0);
Vec8s m1, n1;
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_ss8(short* piOrg, intptr_t iStrideOrg, short* piCur, intptr_t iStrideCur)
+int sse_ss8(short* Org, intptr_t strideOrg, short* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec4i diff(0);
Vec8s m1, n1;
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -326,25 +326,25 @@
diff = extend_high(m1) - extend_high(n1);
diff = diff * diff;
sum += diff;
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_ss12(short* piOrg, intptr_t iStrideOrg, short* piCur, intptr_t iStrideCur)
+int sse_ss12(short* Org, intptr_t strideOrg, short* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec4i diff(0);
Vec8s m1, n1;
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -352,31 +352,31 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load(Org + 8);
+ n1.load(Cur + 8);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_ss16(short* piOrg, intptr_t iStrideOrg, short* piCur, intptr_t iStrideCur)
+int sse_ss16(short* Org, intptr_t strideOrg, short* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec4i diff(0);
Vec8s m1, n1;
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -384,8 +384,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load(Org + 8);
+ n1.load(Cur + 8);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -393,25 +393,25 @@
diff = diff * diff;
sum += diff;
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_ss24(short* piOrg, intptr_t iStrideOrg, short* piCur, intptr_t iStrideCur)
+int sse_ss24(short* Org, intptr_t strideOrg, short* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec4i diff(0);
Vec8s m1, n1;
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -419,8 +419,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load(Org + 8);
+ n1.load(Cur + 8);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -428,8 +428,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load(Org + 16);
+ n1.load(Cur + 16);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -437,25 +437,25 @@
diff = diff * diff;
sum += diff;
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_ss32(short* piOrg, intptr_t iStrideOrg, short* piCur, intptr_t iStrideCur)
+int sse_ss32(short* Org, intptr_t strideOrg, short* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec4i diff(0);
Vec8s m1, n1;
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -463,8 +463,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load(Org + 8);
+ n1.load(Cur + 8);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -472,8 +472,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load(Org + 16);
+ n1.load(Cur + 16);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -481,8 +481,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 24);
- n1.load(piCur + 24);
+ m1.load(Org + 24);
+ n1.load(Cur + 24);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -490,25 +490,25 @@
diff = diff * diff;
sum += diff;
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_ss48(short* piOrg, intptr_t iStrideOrg, short* piCur, intptr_t iStrideCur)
+int sse_ss48(short* Org, intptr_t strideOrg, short* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec4i diff(0);
Vec8s m1, n1;
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -516,8 +516,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load(Org + 8);
+ n1.load(Cur + 8);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -525,8 +525,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load(Org + 16);
+ n1.load(Cur + 16);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -534,8 +534,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 24);
- n1.load(piCur + 24);
+ m1.load(Org + 24);
+ n1.load(Cur + 24);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -543,8 +543,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 32);
- n1.load(piCur + 32);
+ m1.load(Org + 32);
+ n1.load(Cur + 32);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -552,8 +552,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 40);
- n1.load(piCur + 40);
+ m1.load(Org + 40);
+ n1.load(Cur + 40);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -561,25 +561,25 @@
diff = diff * diff;
sum += diff;
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_ss64(short* piOrg, intptr_t iStrideOrg, short* piCur, intptr_t iStrideCur)
+int sse_ss64(short* Org, intptr_t strideOrg, short* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec4i diff(0);
Vec8s m1, n1;
Vec4i sum(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.load(piCur);
+ m1.load(Org);
+ n1.load(Cur);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -587,8 +587,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 8);
- n1.load(piCur + 8);
+ m1.load(Org + 8);
+ n1.load(Cur + 8);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -596,8 +596,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 16);
- n1.load(piCur + 16);
+ m1.load(Org + 16);
+ n1.load(Cur + 16);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -605,8 +605,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 24);
- n1.load(piCur + 24);
+ m1.load(Org + 24);
+ n1.load(Cur + 24);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -614,8 +614,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 32);
- n1.load(piCur + 32);
+ m1.load(Org + 32);
+ n1.load(Cur + 32);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -623,8 +623,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 40);
- n1.load(piCur + 40);
+ m1.load(Org + 40);
+ n1.load(Cur + 40);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -632,8 +632,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 48);
- n1.load(piCur + 48);
+ m1.load(Org + 48);
+ n1.load(Cur + 48);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -641,8 +641,8 @@
diff = diff * diff;
sum += diff;
- m1.load(piOrg + 56);
- n1.load(piCur + 56);
+ m1.load(Org + 56);
+ n1.load(Cur + 56);
diff = extend_low(m1) - extend_low(n1);
diff = diff * diff;
sum += diff;
@@ -650,294 +650,294 @@
diff = diff * diff;
sum += diff;
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum);
}
template<int ly>
-int sse_sp4(short* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_sp4(short* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec8s m1;
Vec16uc n1;
Vec4i diff_low(0);
Vec4i sum_low(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- m1.load(piOrg);
- n1.fromUint32(*(uint32_t*)piCur);
+ m1.load(Org);
+ n1.fromUint32(*(uint32_t*)Cur);
diff_low = extend_low(m1) - extend_low(extend_low(n1));
diff_low = diff_low * diff_low;
sum_low += diff_low;
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low);
}
template<int ly>
-int sse_sp8(short* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_sp8(short* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec8s m1;
Vec16uc n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- n1.load(piCur);
- m1.load(piOrg);
+ n1.load(Cur);
+ m1.load(Org);
diff_low = m1 - extend_low(n1);
diff_low = diff_low * diff_low;
sum_low += extend_low(diff_low);
sum_high += extend_high(diff_low);
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_sp12(short* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_sp12(short* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec8s m1;
Vec16uc n1;
Vec8us diff_low(0);
Vec4i diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- n1.load(piCur);
+ n1.load(Cur);
n1.cutoff(12);
- m1.load(piOrg);
+ m1.load(Org);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 8);
+ m1.load(Org + 8);
diff_high = extend_low(m1) - extend_low(extend_high(n1));
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += extend_low(diff_low);
sum_high += (extend_high(diff_low) + diff_high);
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_sp16(short* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_sp16(short* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec8s m1;
Vec16uc n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- n1.load(piCur);
- m1.load(piOrg);
+ n1.load(Cur);
+ m1.load(Org);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 8);
+ m1.load(Org + 8);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_sp24(short* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_sp24(short* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec8s m1;
Vec16uc n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- n1.load(piCur);
- m1.load(piOrg);
+ n1.load(Cur);
+ m1.load(Org);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 8);
+ m1.load(Org + 8);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- n1.load(piCur + 16);
- m1.load(piOrg + 16);
+ n1.load(Cur + 16);
+ m1.load(Org + 16);
diff_low = m1 - extend_low(n1);
diff_low = diff_low * diff_low;
sum_low += extend_low(diff_low);
sum_high += extend_high(diff_low);
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_sp32(short* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_sp32(short* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec8s m1;
Vec16uc n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- n1.load(piCur);
- m1.load(piOrg);
+ n1.load(Cur);
+ m1.load(Org);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 8);
+ m1.load(Org + 8);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- n1.load(piCur + 16);
- m1.load(piOrg + 16);
+ n1.load(Cur + 16);
+ m1.load(Org + 16);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 24);
+ m1.load(Org + 24);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_sp48(short* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_sp48(short* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec8s m1;
Vec16uc n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- n1.load(piCur);
- m1.load(piOrg);
+ n1.load(Cur);
+ m1.load(Org);
diff_low = m1 - extend_low(n1);
- m1.load_a(piOrg + 8);
+ m1.load_a(Org + 8);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- n1.load(piCur + 16);
- m1.load(piOrg + 16);
+ n1.load(Cur + 16);
+ m1.load(Org + 16);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 24);
+ m1.load(Org + 24);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- n1.load(piCur + 32);
- m1.load(piOrg + 32);
+ n1.load(Cur + 32);
+ m1.load(Org + 32);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 40);
+ m1.load(Org + 40);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
}
template<int ly>
-int sse_sp64(short* piOrg, intptr_t iStrideOrg, pixel* piCur, intptr_t iStrideCur)
+int sse_sp64(short* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
- int iRows = ly;
+ int rows = ly;
Vec8s m1;
Vec16uc n1;
Vec8us diff_low(0), diff_high(0);
Vec4i sum_low(0), sum_high(0);
- for (; iRows != 0; iRows--)
+ for (; rows != 0; rows--)
{
- n1.load(piCur);
- m1.load(piOrg);
+ n1.load(Cur);
+ m1.load(Org);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 8);
+ m1.load(Org + 8);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- n1.load(piCur + 16);
- m1.load(piOrg + 16);
+ n1.load(Cur + 16);
+ m1.load(Org + 16);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 24);
+ m1.load(Org + 24);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- n1.load(piCur + 32);
- m1.load(piOrg + 32);
+ n1.load(Cur + 32);
+ m1.load(Org + 32);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 40);
+ m1.load(Org + 40);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- n1.load(piCur + 48);
- m1.load(piOrg + 48);
+ n1.load(Cur + 48);
+ m1.load(Org + 48);
diff_low = m1 - extend_low(n1);
- m1.load(piOrg + 56);
+ m1.load(Org + 56);
diff_high = m1 - extend_high(n1);
diff_low = diff_low * diff_low;
diff_high = diff_high * diff_high;
sum_low += (extend_low(diff_low) + extend_low(diff_high));
sum_high += (extend_high(diff_low) + extend_high(diff_high));
- piOrg += iStrideOrg;
- piCur += iStrideCur;
+ Org += strideOrg;
+ Cur += strideCur;
}
return horizontal_add(sum_low) + horizontal_add(sum_high);
More information about the x265-devel
mailing list