[x265] [PATCH] Vectorize code for PowerPC processors using GCC Altivec API
Dan Parrot
dan.parrot at mail.com
Tue Mar 1 16:23:50 CET 2016
# HG changeset patch
# User Dan Parrot <dan.parrot at mail.com>
# Date 1456842340 21600
# Tue Mar 01 08:25:40 2016 -0600
# Node ID ffe6ea584ad92364e2e17a02bcb02124607b1e69
# Parent 291beccb67606494a9a144ca2cc4411ab3e21e50
Vectorize code for PowerPC processors using GCC Altivec API.
The CMake variable CMAKE_CXX_FLAGS must include both -maltivec and
-mabi=altivec for GCC to generate the vectorized code.
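
For example (illustrative only, not part of the patch; adjust the source
path for your checkout), configuring the build as follows enables the new
code path:

    cmake -DCMAKE_CXX_FLAGS="-maltivec -mabi=altivec" ../source
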
diff -r 291beccb6760 -r ffe6ea584ad9 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Fri Feb 26 16:23:56 2016 +0530
+++ b/source/common/CMakeLists.txt Tue Mar 01 08:25:40 2016 -0600
@@ -89,7 +89,7 @@
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
# add ARM assembly/intrinsic files here
- set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S)
+ set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S)
set(VEC_PRIMITIVES)
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
@@ -115,10 +115,20 @@
set(WINXP winxp.h winxp.cpp)
endif(WIN32)
+# detect the flags that enable the PowerPC Altivec interface
+string(REGEX MATCH "-maltivec" ENAB_ALTIVEC_FLAG0 "${CMAKE_CXX_FLAGS}")
+string(REGEX MATCH "-mabi=altivec" ENAB_ALTIVEC_FLAG1 "${CMAKE_CXX_FLAGS}")
+
+if(ENAB_ALTIVEC_FLAG0 AND ENAB_ALTIVEC_FLAG1)
+ set(SCALAR_OR_VEC_SRCS ppc_altivec/ipfilter.cpp ppc_altivec/ppcaltivecinline.h)
+else()
+ set(SCALAR_OR_VEC_SRCS ipfilter.cpp)
+endif()
+
add_library(common OBJECT
${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${WINXP}
primitives.cpp primitives.h
- pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
+ pixel.cpp dct.cpp ${SCALAR_OR_VEC_SRCS} intrapred.cpp loopfilter.cpp
constants.cpp constants.h
cpu.cpp cpu.h version.cpp
threading.cpp threading.h
diff -r 291beccb6760 -r ffe6ea584ad9 source/common/ppc_altivec/ipfilter.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/ppc_altivec/ipfilter.cpp Tue Mar 01 08:25:40 2016 -0600
@@ -0,0 +1,675 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Deepthi Devaki <deepthidevaki at multicorewareinc.com>,
+ * Rajesh Paulraj <rajesh at multicorewareinc.com>
+ * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+ * Min Chen <chenm003 at 163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include <altivec.h>
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+
+using namespace X265_NS;
+
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
+#endif
+
+#include "ppcaltivecinline.h"
+
+namespace {
+// file local namespace
+template<int width, int height>
+void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
+{
+ vector unsigned short vShift =
+ vec_sub(vec_splats((unsigned short)IF_INTERNAL_PREC), vec_splats((unsigned short)X265_DEPTH));
+
+ vector signed short vOffset = vec_splats((signed short)IF_INTERNAL_OFFS);
+
+ int row, col;
+ int srcItemCnt, dstItemCntA, dstItemCntB;
+
+ vector signed char vMask;
+ vector signed char vDst = vec_splats((signed char)0);
+ vector signed char vRead = vec_splats((signed char)0);
+
+ for (row = 0; row < height; row++)
+ {
+ for (col = 0; col < width; col += 16)
+ {
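+            // each 16-pixel chunk widens to 32 bytes of int16_t output, so the
+            // results are computed and stored as two halves of at most 8 items (A and B)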
+ srcItemCnt = (width - col) > 16 ? 16 : (width - col);
+ dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB);
+
+ read_qword(src, col, srcItemCnt, width, &vRead);
+ vMask = vec_splats((signed char)-1);
+ compute_filter_ps(vRead, &vDst, vShift, vOffset, dstItemCntA, &vMask);
+ store_value(dstItemCntA * 2, (signed char*)((unsigned long)dst + 2 * col), vDst, vMask);
+
+ vRead = vec_sld(vRead, vec_splats((signed char)0), 8);
+ if(dstItemCntB)
+ {
+ vMask = vec_splats((signed char)-1);
+ compute_filter_ps(vRead, &vDst, vShift, vOffset, dstItemCntB, &vMask);
+ store_value(dstItemCntB * 2, (signed char*)((unsigned long)dst + 2 * col + 16), vDst, vMask);
+ }
+ }
+ src += srcStride;
+ dst += dstStride;
+ }
+}
+
+static void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, int marginX)
+{
+ int itemCnt;
+ vector signed char vSrcLeft;
+ vector signed char vSrcRight;
+ vector signed char vMask = vec_splats((signed char)-1);
+ vector unsigned char vPerm;
+
+ for (int y = 0; y < height; y++)
+ {
+ for (int x = 0; x < marginX; x += 16)
+ {
+ itemCnt = (marginX - x) > 16 ? 16 : (marginX - x);
+
+ if (x == 0)
+ {
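+                // splat the row's first and last pixels across whole vectors,
+                // once per row; the margins are filled from these edge pixels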
+ vPerm = vec_lvsl(0, (signed char*)txt);
+ vSrcLeft = vec_ld(0, (signed char*)txt);
+ vSrcLeft = vec_perm(vSrcLeft, vSrcLeft, vPerm);
+
+ vPerm = vec_lvsl(0, (signed char*)&txt[width - 1]);
+ vSrcRight = vec_ld(0, (signed char*)&txt[width - 1]);
+ vSrcRight = vec_perm(vSrcRight, vSrcRight, vPerm);
+
+ vSrcLeft = vec_splat(vSrcLeft, 0);
+ vSrcRight = vec_splat(vSrcRight, 0);
+ }
+
+            //reset, then left-align the store mask for this chunk
+            vMask = vec_splats((signed char)-1);
+            for(int k = itemCnt; k < 16; k++)
+            {
+                vMask = vec_sld(vMask, vec_splats((signed char)0), 1);
+            }
+
+ store_value(itemCnt, (signed char*)&txt[-marginX + x], vSrcLeft, vMask);
+ store_value(itemCnt, (signed char*)&txt[width + x], vSrcRight, vMask);
+
+ }
+ txt += stride;
+ }
+}
+
+template<int N, int width, int height>
+void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+{
+ vector unsigned int vHeadRoom = vec_splats((unsigned int)IF_FILTER_PREC);
+ vector signed int vOffset = vec_splats((int)1);
+ vOffset = vec_sl(vOffset, vec_sub(vHeadRoom, vec_splats((unsigned int)1)));
+
+ vector unsigned short vMaxVal = vec_splats((unsigned short)1);
+ vMaxVal = vec_sl(vMaxVal, vec_splats((unsigned short)X265_DEPTH));
+ vMaxVal = vec_sub(vMaxVal, vec_splats((unsigned short)1));
+
+ src -= (N / 2 - 1);
+
+ vector signed short vCoeff;
+ v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients
+
+ int row, col;
+ int srcItemCnt;
+
+ vector signed char vMask;
+ vector signed char vDstA, vDstB;
+
+ vector signed char vReadArr[8];
+
+ for (row = 0; row < height; row++)
+ {
+ for (col = 0; col < width; col += 16)
+ {
+ srcItemCnt = (width - col) > 16 ? 16 : (width - col);
+
+ read_src_p(0, src, col, 1, srcItemCnt, vReadArr);
+ if(N == 8)
+ {
+ read_src_p(4, src, col, 1, srcItemCnt, vReadArr);
+ }
+ compute_pp(N, vReadArr, &vDstA, vOffset, vHeadRoom, vCoeff, vMaxVal, true);
+ compute_pp(N, vReadArr, &vDstB, vOffset, vHeadRoom, vCoeff, vMaxVal, false);
+
+ vDstA = vec_pack((vector signed short)vDstA, (vector signed short)vDstB);
+
+ vMask = vec_splats((signed char)-1);
+ //left-align the result
+ for(int k = srcItemCnt; k < 16; k++)
+ {
+ vMask = vec_sld(vMask, vec_splats((signed char)0), 1);
+ }
+ store_value(srcItemCnt, (signed char*)&dst[col], vDstA, vMask);
+
+ }
+ src += srcStride;
+ dst += dstStride;
+ }
+}
+
+template<int N, int width, int height>
+void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+{
+ vector unsigned int vHeadRoom = vec_sub(vec_splats((unsigned int)IF_INTERNAL_PREC), vec_splats((unsigned int)X265_DEPTH));
+ vector unsigned int vShift = vec_sub(vec_splats((unsigned int)IF_FILTER_PREC), vHeadRoom);
+ vector signed char vConstZero = vec_splats((signed char)0);
+ vector signed int vOffset = vec_splats((int)IF_INTERNAL_OFFS);
+ vOffset = vec_sub((vector signed int)vConstZero, vOffset);
+ vOffset = vec_sl(vOffset, vShift);
+
+ int blkheight = height;
+
+ src -= N / 2 - 1;
+
+ if (isRowExt)
+ {
+ src -= (N / 2 - 1) * srcStride;
+ blkheight += N - 1;
+ }
+
+ vector signed short vCoeff;
+ v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients
+
+ int row, col;
+ int srcItemCnt, dstItemCntA, dstItemCntB;
+
+ vector signed char vMask;
+ vector signed char vDst = vec_splats((signed char)0);
+
+ vector signed char vReadArr[8];
+
+ for (row = 0; row < blkheight; row++)
+ {
+ for (col = 0; col < width; col += 16)
+ {
+ srcItemCnt = (width - col) > 16 ? 16 : (width - col);
+ dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB);
+
+ read_src_p(0, src, col, 1, srcItemCnt, vReadArr);
+ if(N == 8)
+ {
+ read_src_p(4, src, col, 1, srcItemCnt, vReadArr);
+ }
+
+ vMask = vec_splats((signed char)-1);
+ compute_ps(N, dstItemCntA, vReadArr, &vDst, vOffset, vCoeff, vShift, &vMask, true);
+ store_value(dstItemCntA * 2, (signed char*)&dst[col], vDst, vMask);
+
+ if(dstItemCntB)
+ {
+ vMask = vec_splats((signed char)-1);
+ compute_ps(N, dstItemCntB, vReadArr, &vDst, vOffset, vCoeff, vShift, &vMask, false);
+ store_value(dstItemCntB * 2, (signed char*)((unsigned long)&dst[col] + 16), vDst, vMask);
+ }
+ }
+ src += srcStride;
+ dst += dstStride;
+ }
+}
+
+template<int N, int width, int height>
+void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+{
+ vector unsigned int vHeadRoom = vec_splats((unsigned int)IF_FILTER_PREC);
+ vector signed int vOffset = vec_splats((int)1);
+ vOffset = vec_sl(vOffset, vec_sub(vHeadRoom, vec_splats((unsigned int)1)));
+
+ vector unsigned short vMaxVal = vec_splats((unsigned short)1);
+ vMaxVal = vec_sl(vMaxVal, vec_splats((unsigned short)X265_DEPTH));
+ vMaxVal = vec_sub(vMaxVal, vec_splats((unsigned short)1));
+
+ src -= (N / 2 - 1) * srcStride;
+
+ vector signed short vCoeff;
+ v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients
+
+ int row, col;
+ int srcItemCnt;
+
+ vector signed char vMask;
+ vector signed char vDstA, vDstB;
+ vector signed char vReadArr[8];
+
+ for (row = 0; row < height; row++)
+ {
+ for (col = 0; col < width; col += 16)
+ {
+ srcItemCnt = (width - col) > 16 ? 16 : (width - col);
+
+ read_src_p(0, src, col, srcStride, srcItemCnt, vReadArr);
+ if(N == 8)
+ {
+ read_src_p(4, src, col, srcStride, srcItemCnt, vReadArr);
+ }
+ compute_pp(N, vReadArr, &vDstA, vOffset, vHeadRoom, vCoeff, vMaxVal, true);
+ compute_pp(N, vReadArr, &vDstB, vOffset, vHeadRoom, vCoeff, vMaxVal, false);
+
+ vDstA = vec_pack((vector signed short)vDstA, (vector signed short)vDstB);
+
+ vMask = vec_splats((signed char)-1);
+ //left-align the result
+ for(int k = srcItemCnt; k < 16; k++)
+ {
+ vMask = vec_sld(vMask, vec_splats((signed char)0), 1);
+ }
+ store_value(srcItemCnt, (signed char*)&dst[col], vDstA, vMask);
+ }
+ src += srcStride;
+ dst += dstStride;
+ }
+}
+
+template<int N, int width, int height>
+void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
+{
+ vector unsigned int vHeadRoom = vec_sub(vec_splats((unsigned int)IF_INTERNAL_PREC), vec_splats((unsigned int)X265_DEPTH));
+ vector unsigned int vShift = vec_sub(vec_splats((unsigned int)IF_FILTER_PREC), vHeadRoom);
+ vector signed char vConstZero = vec_splats((signed char)0);
+ vector signed int vOffset = vec_splats((int)IF_INTERNAL_OFFS);
+ vOffset = vec_sub((vector signed int)vConstZero, vOffset);
+ vOffset = vec_sl(vOffset, vShift);
+
+ src -= (N / 2 - 1) * srcStride;
+
+ vector signed short vCoeff;
+ v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients
+
+ int row, col;
+ int srcItemCnt, dstItemCntA, dstItemCntB;
+
+ vector signed char vMask;
+ vector signed char vDst = vec_splats((signed char)0);
+
+ vector signed char vReadArr[8];
+
+ for (row = 0; row < height; row++)
+ {
+ for (col = 0; col < width; col += 16)
+ {
+ srcItemCnt = (width - col) > 16 ? 16 : (width - col);
+ dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB);
+
+ read_src_p(0, src, col, srcStride, srcItemCnt, vReadArr);
+ if(N == 8)
+ {
+ read_src_p(4, src, col, srcStride, srcItemCnt, vReadArr);
+ }
+
+ vMask = vec_splats((signed char)-1);
+ compute_ps(N, dstItemCntA, vReadArr, &vDst, vOffset, vCoeff, vShift, &vMask, true);
+ store_value(dstItemCntA * 2, (signed char*)((unsigned long)dst + 2 * col), vDst, vMask);
+
+ if(dstItemCntB)
+ {
+ vMask = vec_splats((signed char)-1);
+ compute_ps(N, dstItemCntB, vReadArr, &vDst, vOffset, vCoeff, vShift, &vMask, false);
+ store_value(dstItemCntB * 2, (signed char*)((unsigned long)dst + 2 * col + 16), vDst, vMask);
+ }
+ }
+ src += srcStride;
+ dst += dstStride;
+ }
+}
+
+template<int N, int width, int height>
+void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+{
+ vector unsigned int vHeadRoom = vec_sub(vec_splats((unsigned int)IF_INTERNAL_PREC), vec_splats((unsigned int)X265_DEPTH));
+ vector unsigned int vShift = vec_add(vec_splats((unsigned int)IF_FILTER_PREC), vHeadRoom);
+ vector signed int vOffset = vec_splats((int)1);
+ vOffset = vec_sl(vOffset, vec_sub(vShift, vec_splats((unsigned int)1)));
+ vector signed int vTemp = vec_splats((int)IF_INTERNAL_OFFS);
+ vTemp = vec_sl(vTemp, vec_splats((unsigned int)IF_FILTER_PREC));
+ vOffset = vec_add(vOffset, vTemp);
+
+ vector unsigned short vMaxVal = vec_splats((unsigned short)1);
+ vMaxVal = vec_sl(vMaxVal, vec_splats((unsigned short)X265_DEPTH));
+ vMaxVal = vec_sub(vMaxVal, vec_splats((unsigned short)1));
+
+ src -= (N / 2 - 1) * srcStride;
+
+ vector signed short vCoeff;
+ v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients
+
+ int row, col;
+ int srcItemCnt, dstItemCntA, dstItemCntB;
+
+ vector signed char vMask;
+ vector signed char vDstA, vDstB = vec_splats((signed char)0);
+ vector signed char vReadArr[8];
+
+ for (row = 0; row < height; row++)
+ {
+ for (col = 0; col < width; col += 16)
+ {
+ srcItemCnt = (width - col) > 16 ? 16 : (width - col);
+ dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB);
+
+ read_src_s(0, src, col, srcStride, dstItemCntA, vReadArr);
+ if(N == 8)
+ {
+ read_src_s(4, src, col, srcStride, dstItemCntA, vReadArr);
+ }
+ compute_vert_sp(N, vReadArr, &vDstA, vOffset, vCoeff, vShift, vMaxVal);
+
+ if(dstItemCntB)
+ {
+ read_src_s(0, src, col + 8, srcStride, dstItemCntB, vReadArr);
+ if(N == 8)
+ {
+ read_src_s(4, src, col + 8, srcStride, dstItemCntB, vReadArr);
+ }
+ compute_vert_sp(N, vReadArr, &vDstB, vOffset, vCoeff, vShift, vMaxVal);
+ }
+
+ vDstA = vec_pack((vector signed short)vDstA, (vector signed short)vDstB);
+
+ vMask = vec_splats((signed char)-1);
+ //left-align the result
+ for(int k = srcItemCnt; k < 16; k++)
+ {
+ vMask = vec_sld(vMask, vec_splats((signed char)0), 1);
+ }
+ store_value(srcItemCnt, (signed char*)&dst[col], vDstA, vMask);
+ }
+ src += srcStride;
+ dst += dstStride;
+ }
+}
+
+template<int N, int width, int height>
+void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
+{
+ vector unsigned int vShift = vec_splats((unsigned int)IF_FILTER_PREC);
+
+ src -= (N / 2 - 1) * srcStride;
+
+ vector signed short vCoeff;
+ v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients
+
+ int row, col;
+ int srcItemCnt;
+
+ vector signed char vMask;
+ vector signed char vDst;
+
+ vector signed char vReadArr[8];
+
+ for (row = 0; row < height; row++)
+ {
+ for (col = 0; col < width; col += 8)
+ {
+ srcItemCnt = (width - col) > 8 ? 8 : (width - col);
+
+ read_src_s(0, src, col, srcStride, srcItemCnt, vReadArr);
+ if(N == 8)
+ {
+ read_src_s(4, src, col, srcStride, srcItemCnt, vReadArr);
+ }
+ compute_vert_ss(N, vReadArr, &vDst, vCoeff, vShift);
+
+ vMask = vec_splats((signed char)-1);
+ //left-align the result
+ for(int k = srcItemCnt * 2; k < 16; k++)
+ {
+ vMask = vec_sld(vMask, vec_splats((signed char)0), 1);
+ }
+ store_value(srcItemCnt * 2, (signed char*)&dst[col], vDst, vMask);
+ }
+ src += srcStride;
+ dst += dstStride;
+ }
+}
+
+template<int N>
+void filterVertical_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int coeffIdx)
+{
+ vector unsigned int vHeadRoom = vec_sub(vec_splats((unsigned int)IF_INTERNAL_PREC), vec_splats((unsigned int)X265_DEPTH));
+ vector unsigned int vShift = vec_add(vec_splats((unsigned int)IF_FILTER_PREC), vHeadRoom);
+ vector signed int vOffset = vec_splats((int)1);
+ vOffset = vec_sl(vOffset, vec_sub(vShift, vec_splats((unsigned int)1)));
+ vector signed int vTemp = vec_splats((int)IF_INTERNAL_OFFS);
+ vTemp = vec_sl(vTemp, vec_splats((unsigned int)IF_FILTER_PREC));
+ vOffset = vec_add(vOffset, vTemp);
+
+ vector unsigned short vMaxVal = vec_splats((unsigned short)1);
+ vMaxVal = vec_sl(vMaxVal, vec_splats((unsigned short)X265_DEPTH));
+ vMaxVal = vec_sub(vMaxVal, vec_splats((unsigned short)1));
+
+ src -= (N / 2 - 1) * srcStride;
+
+ vector signed short vCoeff;
+ v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients
+
+ int row, col;
+ int srcItemCnt, dstItemCntA, dstItemCntB;
+
+ vector signed char vMask;
+ vector signed char vDstA, vDstB = vec_splats((signed char)0);
+ vector signed char vReadArr[8];
+
+ for (row = 0; row < height; row++)
+ {
+ for (col = 0; col < width; col += 16)
+ {
+ srcItemCnt = (width - col) > 16 ? 16 : (width - col);
+ dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB);
+
+ read_src_s(0, src, col, srcStride, dstItemCntA, vReadArr);
+ if(N == 8)
+ {
+ read_src_s(4, src, col, srcStride, dstItemCntA, vReadArr);
+ }
+ compute_vert_sp(N, vReadArr, &vDstA, vOffset, vCoeff, vShift, vMaxVal);
+
+ if(dstItemCntB)
+ {
+ read_src_s(0, src, col + 8, srcStride, dstItemCntB, vReadArr);
+ if(N == 8)
+ {
+ read_src_s(4, src, col + 8, srcStride, dstItemCntB, vReadArr);
+ }
+ compute_vert_sp(N, vReadArr, &vDstB, vOffset, vCoeff, vShift, vMaxVal);
+ }
+
+ vDstA = vec_pack((vector signed short)vDstA, (vector signed short)vDstB);
+
+ vMask = vec_splats((signed char)-1);
+ //left-align the result
+ for(int k = srcItemCnt; k < 16; k++)
+ {
+ vMask = vec_sld(vMask, vec_splats((signed char)0), 1);
+ }
+ store_value(srcItemCnt, (signed char*)&dst[col], vDstA, vMask);
+ }
+ src += srcStride;
+ dst += dstStride;
+ }
+}
+
+template<int N, int width, int height>
+void interp_hv_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
+{
+ short immedVals[(64 + 8) * (64 + 8)];
+
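+    // horizontal pass writes 16-bit intermediates (plus N - 1 extra rows for the
+    // vertical taps); the vertical pass then filters them back down to pixels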
+ interp_horiz_ps_c<N, width, height>(src, srcStride, immedVals, width, idxX, 1);
+ filterVertical_sp_c<N>(immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
+}
+}
+
+namespace X265_NS {
+// x265 private namespace
+
+#define CHROMA_420(W, H) \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
+
+#define CHROMA_422(W, H) \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
+
+#define CHROMA_444(W, H) \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
+
+#define LUMA(W, H) \
+ p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp_horiz_pp_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_hps = interp_horiz_ps_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_vpp = interp_vert_pp_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_vps = interp_vert_ps_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_vsp = interp_vert_sp_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_vss = interp_vert_ss_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>; \
+ p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, H>;
+
+void setupFilterPrimitives_c(EncoderPrimitives& p)
+{
+ LUMA(4, 4);
+ LUMA(8, 8);
+ CHROMA_420(4, 4);
+ LUMA(4, 8);
+ CHROMA_420(2, 4);
+ LUMA(8, 4);
+ CHROMA_420(4, 2);
+ LUMA(16, 16);
+ CHROMA_420(8, 8);
+ LUMA(16, 8);
+ CHROMA_420(8, 4);
+ LUMA(8, 16);
+ CHROMA_420(4, 8);
+ LUMA(16, 12);
+ CHROMA_420(8, 6);
+ LUMA(12, 16);
+ CHROMA_420(6, 8);
+ LUMA(16, 4);
+ CHROMA_420(8, 2);
+ LUMA(4, 16);
+ CHROMA_420(2, 8);
+ LUMA(32, 32);
+ CHROMA_420(16, 16);
+ LUMA(32, 16);
+ CHROMA_420(16, 8);
+ LUMA(16, 32);
+ CHROMA_420(8, 16);
+ LUMA(32, 24);
+ CHROMA_420(16, 12);
+ LUMA(24, 32);
+ CHROMA_420(12, 16);
+ LUMA(32, 8);
+ CHROMA_420(16, 4);
+ LUMA(8, 32);
+ CHROMA_420(4, 16);
+ LUMA(64, 64);
+ CHROMA_420(32, 32);
+ LUMA(64, 32);
+ CHROMA_420(32, 16);
+ LUMA(32, 64);
+ CHROMA_420(16, 32);
+ LUMA(64, 48);
+ CHROMA_420(32, 24);
+ LUMA(48, 64);
+ CHROMA_420(24, 32);
+ LUMA(64, 16);
+ CHROMA_420(32, 8);
+ LUMA(16, 64);
+ CHROMA_420(8, 32);
+
+ CHROMA_422(4, 8);
+ CHROMA_422(4, 4);
+ CHROMA_422(2, 4);
+ CHROMA_422(2, 8);
+ CHROMA_422(8, 16);
+ CHROMA_422(8, 8);
+ CHROMA_422(4, 16);
+ CHROMA_422(8, 12);
+ CHROMA_422(6, 16);
+ CHROMA_422(8, 4);
+ CHROMA_422(2, 16);
+ CHROMA_422(16, 32);
+ CHROMA_422(16, 16);
+ CHROMA_422(8, 32);
+ CHROMA_422(16, 24);
+ CHROMA_422(12, 32);
+ CHROMA_422(16, 8);
+ CHROMA_422(4, 32);
+ CHROMA_422(32, 64);
+ CHROMA_422(32, 32);
+ CHROMA_422(16, 64);
+ CHROMA_422(32, 48);
+ CHROMA_422(24, 64);
+ CHROMA_422(32, 16);
+ CHROMA_422(8, 64);
+
+ CHROMA_444(4, 4);
+ CHROMA_444(8, 8);
+ CHROMA_444(4, 8);
+ CHROMA_444(8, 4);
+ CHROMA_444(16, 16);
+ CHROMA_444(16, 8);
+ CHROMA_444(8, 16);
+ CHROMA_444(16, 12);
+ CHROMA_444(12, 16);
+ CHROMA_444(16, 4);
+ CHROMA_444(4, 16);
+ CHROMA_444(32, 32);
+ CHROMA_444(32, 16);
+ CHROMA_444(16, 32);
+ CHROMA_444(32, 24);
+ CHROMA_444(24, 32);
+ CHROMA_444(32, 8);
+ CHROMA_444(8, 32);
+ CHROMA_444(64, 64);
+ CHROMA_444(64, 32);
+ CHROMA_444(32, 64);
+ CHROMA_444(64, 48);
+ CHROMA_444(48, 64);
+ CHROMA_444(64, 16);
+ CHROMA_444(16, 64);
+
+ p.extendRowBorder = extendCURowColBorder;
+}
+}
diff -r 291beccb6760 -r ffe6ea584ad9 source/common/ppc_altivec/ppcaltivecinline.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/ppc_altivec/ppcaltivecinline.h Tue Mar 01 08:25:40 2016 -0600
@@ -0,0 +1,555 @@
+/*****************************************************************************
+* Copyright (C) 2015 x265 project
+*
+* Authors: Dan Parrot <dan.parrot at mail.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+extern inline
+void v_load_coeff(int N, int coeffIdx, vector signed short* vCoeff)
+ __attribute__((always_inline));
+
+extern inline
+void dst_item_partition(int srcItemCnt, int* dstItemCntA, int* dstItemCntB)
+ __attribute__((always_inline));
+
+extern inline
+void read_src_p(int startIdx, const pixel* src, int col, intptr_t srcStride, int srcItemCnt,
+vector signed char vReadArr[])
+    __attribute__((always_inline));
+
+extern inline
+void read_src_s(int startIdx, const int16_t* src, int col, intptr_t srcStride, int srcItemCnt,
+vector signed char vReadArr[])
+ __attribute__((always_inline));
+
+extern inline
+void read_qword(const pixel* src, int col, int srcItemCnt, int width, vector signed char* vRead)
+ __attribute__((always_inline));
+
+extern inline
+void compute_pp(int N, vector signed char vReadArr[], vector signed char* vDst, vector signed int vOffset,
+vector unsigned int vHeadRoom, vector signed short vCoeff, vector unsigned short vMaxVal, bool initXfer)
+ __attribute__((always_inline));
+
+extern inline
+void compute_ps(int N, int dstItemCnt, vector signed char vReadArr[], vector signed char* vDst,
+vector signed int vOffset, vector signed short vCoeff, vector unsigned int vShift,
+vector signed char* vMask, bool initXfer)
+ __attribute__((always_inline));
+
+extern inline
+void compute_filter_ps(vector signed char vRead, vector signed char* vDst, vector unsigned short vShift,
+vector signed short vOffset, int itemCnt, vector signed char* vMask)
+ __attribute__((always_inline));
+
+extern inline
+void mult_add_s(int N, vector signed char vReadArr[], vector signed short vCoeff, vector signed int vOffset,
+vector unsigned int vShift, vector signed short* vVal)
+ __attribute__((always_inline));
+
+extern inline
+void mult_add_p(int N, vector signed char vReadArr[], vector signed short vCoeff, vector signed int vOffset,
+vector unsigned int vShift, vector signed short* vVal, bool initXfer)
+ __attribute__((always_inline));
+
+extern inline
+void compute_vert_sp(int N, vector signed char vReadArr[], vector signed char* vDst, vector signed int vOffset,
+vector signed short vCoeff, vector unsigned int vShift, vector unsigned short vMaxVal)
+ __attribute__((always_inline));
+
+extern inline
+void compute_vert_ss(int N, vector signed char vReadArr[], vector signed char* vDst, vector signed short vCoeff,
+vector unsigned int vShift)
+ __attribute__((always_inline));
+
+extern inline
+void store_value(int dstByteCnt, signed char* dstAddr, vector signed char vDst, vector signed char vMask)
+ __attribute__((always_inline));
+
+extern inline
+void v_load_coeff(int N, int coeffIdx, vector signed short* vCoeff)
+{
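+    // load the 4 chroma or 8 luma filter taps as one vector, handling tables
+    // that are unaligned or straddle a 16-byte boundary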
+ const int16_t* coeff;
+ vector unsigned char vPerm;
+ vector signed char vHi, vLo;
+ vector signed char vConstZero = vec_splats((signed char)0);
+ signed char* addrHi;
+ signed char* addrLo;
+
+ if(N == 4) coeff = g_chromaFilter[coeffIdx];
+ else coeff = g_lumaFilter[coeffIdx];
+
+ if(N == 8) addrLo = (signed char *)&coeff[7];
+ else addrLo = (signed char *)&coeff[3];
+
+ addrHi = (signed char*)&coeff[0];
+
+ if(((unsigned long)addrHi & 0x0Ful) == 0)
+ {
+ *vCoeff = (vector signed short)vec_ld(0, addrHi);
+ }
+ else
+ {
+ vPerm = vec_lvsl(0, addrHi);
+ vHi = vec_ld(0, addrHi);
+ if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful)))
+ {
+ vLo = vec_ld(0, addrLo);
+ *vCoeff = (vector signed short)vec_perm(vHi, vLo, vPerm);
+ }
+ else
+ {
+ *vCoeff = (vector signed short)vec_perm(vHi, vHi, vPerm);
+ }
+ }
+
+ if(N == 4)
+ {
+ *vCoeff = vec_sld((vector signed short)vConstZero, *vCoeff, 8); // this and next line left-align coefficients and
+ *vCoeff = vec_sld(*vCoeff, (vector signed short)vConstZero, 8); // zero out the lower half of coefficient register.
+ }
+}
+
+extern inline
+void dst_item_partition(int srcItemCnt, int* dstItemCntA, int* dstItemCntB)
+{
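+    // split a chunk of up to 16 source items into two halves of at most 8 each,
+    // since items widened to 16 bits fill two vector registers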
+ if(srcItemCnt <= 8)
+ {
+ *dstItemCntA = srcItemCnt;
+ *dstItemCntB = 0;
+ }
+ else
+ {
+ *dstItemCntA = 8;
+ *dstItemCntB = srcItemCnt - 8;
+ }
+}
+
+extern inline
+void read_src_p(int startIdx, const pixel* src, int col, intptr_t srcStride, int srcItemCnt,
+vector signed char vReadArr[])
+{
+ signed char* addrHi;
+ signed char *addrLo;
+ vector unsigned char vPerm;
+ vector signed char vHi, vLo;
+
+ for(int k = startIdx; k < startIdx + 4; k++)
+ {
+ addrHi = (signed char*)&src[col + k * srcStride];
+ addrLo = (signed char *)((unsigned long)addrHi + srcItemCnt - 1);
+
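+        // vec_ld ignores the low four address bits, so for unaligned sources
+        // load the straddled quadword(s) and recover the bytes starting at
+        // addrHi with vec_lvsl/vec_perm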
+ if(((unsigned long)addrHi & 0x0Ful) == 0)
+ {
+ vReadArr[k] = vec_ld(0, addrHi);
+ }
+ else
+ {
+ vPerm = vec_lvsl(0, addrHi);
+ vHi = vec_ld(0, addrHi);
+ if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful)))
+ {
+ vLo = vec_ld(0, addrLo);
+ vReadArr[k] = vec_perm(vHi, vLo, vPerm);
+ }
+ else
+ {
+ vReadArr[k] = vec_perm(vHi, vHi, vPerm);
+ }
+ }
+ }
+}
+
+extern inline
+void read_src_s(int startIdx, const int16_t* src, int col, intptr_t srcStride, int srcItemCnt,
+vector signed char vReadArr[])
+{
+ signed char* addrHi;
+ signed char *addrLo;
+ vector unsigned char vPerm;
+ vector signed char vHi, vLo;
+
+ for(int k = startIdx; k < startIdx + 4; k++)
+ {
+ addrHi = (signed char*)&src[col + k * srcStride];
+ addrLo = (signed char *)((unsigned long)addrHi + 2 * srcItemCnt - 1);
+
+ if(((unsigned long)addrHi & 0x0Ful) == 0)
+ {
+ vReadArr[k] = vec_ld(0, addrHi);
+ }
+ else
+ {
+ vPerm = vec_lvsl(0, addrHi);
+ vHi = vec_ld(0, addrHi);
+ if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful)))
+ {
+ vLo = vec_ld(0, addrLo);
+ vReadArr[k] = vec_perm(vHi, vLo, vPerm);
+ }
+ else
+ {
+ vReadArr[k] = vec_perm(vHi, vHi, vPerm);
+ }
+ }
+ }
+}
+
+extern inline
+void read_qword(const pixel* src, int col, int srcItemCnt, int width, vector signed char* vRead)
+{
+ signed char* addrHi;
+ signed char *addrLo;
+ vector unsigned char vPerm;
+ vector signed char vHi, vLo;
+
+ addrHi = (signed char*)&src[col];
+ addrLo = (srcItemCnt < 16) ? ((signed char *)&src[width - 1]) : ((signed char *)&src[col + 15]);
+
+ if(((unsigned long)addrHi & 0x0Ful) == 0)
+ {
+ *vRead = vec_ld(0, addrHi);
+ }
+ else
+ {
+ vPerm = vec_lvsl(0, addrHi);
+ vHi = vec_ld(0, addrHi);
+ if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful)))
+ {
+ vLo = vec_ld(0, addrLo);
+ *vRead = vec_perm(vHi, vLo, vPerm);
+ }
+ else
+ {
+ *vRead = vec_perm(vHi, vHi, vPerm);
+ }
+ }
+}
+
+extern inline
+void compute_pp(int N, vector signed char vReadArr[], vector signed char* vDst, vector signed int vOffset,
+vector unsigned int vHeadRoom, vector signed short vCoeff, vector unsigned short vMaxVal, bool initXfer)
+{
+ vector signed short vVal;
+ vector bool short compare;
+
+ mult_add_p(N, vReadArr, vCoeff, vOffset, vHeadRoom, &vVal, initXfer);
+
+ compare = vec_cmplt(vVal, vec_splats((signed short)0));
+ vVal = vec_sel(vVal, vec_splats((signed short)0), compare);
+
+ compare = vec_cmpgt(vVal, (vector signed short)vMaxVal);
+ vVal = vec_sel(vVal, (vector signed short)vMaxVal, compare);
+
+ *vDst = (vector signed char)vVal;
+}
+
+extern inline
+void compute_ps(int N, int dstItemCnt, vector signed char vReadArr[], vector signed char* vDst, vector signed int vOffset,
+vector signed short vCoeff, vector unsigned int vShift, vector signed char* vMask, bool initXfer)
+{
+ vector signed short vVal;
+
+ mult_add_p(N, vReadArr, vCoeff, vOffset, vShift, &vVal, initXfer);
+
+ *vDst = (vector signed char)vVal;
+
+    // mask to left-align the result when it is shorter than 16 bytes
+ for (int k = dstItemCnt * 2; k < 16; k++)
+ {
+ *vMask = vec_sld(*vMask, vec_splats((signed char)0), 1);
+ }
+}
+
+extern inline
+void compute_filter_ps(vector signed char vRead, vector signed char* vDst, vector unsigned short vShift,
+vector signed short vOffset, int itemCnt, vector signed char* vMask)
+{
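+    // p2s conversion: zero-extend pixels to 16 bits, then val = (pix << shift) - IF_INTERNAL_OFFS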
+ vector signed char vConstZero = vec_splats((signed char)0);
+ vector signed short vVal = vec_splats((signed short)0);
+
+ vVal = (vector signed short)vec_mergeh(vConstZero, vRead);
+ vVal = vec_sl(vVal, vShift);
+ vVal = vec_sub(vVal, vOffset);
+
+ *vDst = (vector signed char)vVal;
+
+    // mask needed when the result is shorter than 16 bytes
+ for (int k = itemCnt * 2; k < 16; k++)
+ {
+ *vMask = vec_sld(*vMask, (vector signed char)vConstZero, 1);
+ }
+}
+
+extern inline
+void mult_add_s(int N, vector signed char vReadArr[], vector signed short vCoeff, vector signed int vOffset,
+vector unsigned int vShift, vector signed short* vVal)
+{
+ vector signed int vRsltEven[8];
+ vector signed int vRsltOdd[8];
+
+ if(N == 4)
+ {
+ vRsltEven[0] = vec_mule(vec_splat(vCoeff, 0), (vector signed short)vReadArr[0]);
+ vRsltEven[1] = vec_mule(vec_splat(vCoeff, 1), (vector signed short)vReadArr[1]);
+ vRsltEven[2] = vec_mule(vec_splat(vCoeff, 2), (vector signed short)vReadArr[2]);
+ vRsltEven[3] = vec_mule(vec_splat(vCoeff, 3), (vector signed short)vReadArr[3]);
+
+ vRsltOdd[0] = vec_mulo(vec_splat(vCoeff, 0), (vector signed short)vReadArr[0]);
+ vRsltOdd[1] = vec_mulo(vec_splat(vCoeff, 1), (vector signed short)vReadArr[1]);
+ vRsltOdd[2] = vec_mulo(vec_splat(vCoeff, 2), (vector signed short)vReadArr[2]);
+ vRsltOdd[3] = vec_mulo(vec_splat(vCoeff, 3), (vector signed short)vReadArr[3]);
+
+        // by convention, sums accumulate into the lower-numbered register of each pair
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[1]);
+ vRsltEven[2] = vec_add(vRsltEven[2], vRsltEven[3]);
+
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[1]);
+ vRsltOdd[2] = vec_add(vRsltOdd[2], vRsltOdd[3]);
+
+        // the two vectors below now hold the final multiply-sums of the 4 taps
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[2]);
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[2]);
+ }
+ else
+ {
+ vRsltEven[0] = vec_mule(vec_splat(vCoeff, 0), (vector signed short)vReadArr[0]);
+ vRsltEven[1] = vec_mule(vec_splat(vCoeff, 1), (vector signed short)vReadArr[1]);
+ vRsltEven[2] = vec_mule(vec_splat(vCoeff, 2), (vector signed short)vReadArr[2]);
+ vRsltEven[3] = vec_mule(vec_splat(vCoeff, 3), (vector signed short)vReadArr[3]);
+ vRsltEven[4] = vec_mule(vec_splat(vCoeff, 4), (vector signed short)vReadArr[4]);
+ vRsltEven[5] = vec_mule(vec_splat(vCoeff, 5), (vector signed short)vReadArr[5]);
+ vRsltEven[6] = vec_mule(vec_splat(vCoeff, 6), (vector signed short)vReadArr[6]);
+ vRsltEven[7] = vec_mule(vec_splat(vCoeff, 7), (vector signed short)vReadArr[7]);
+
+ vRsltOdd[0] = vec_mulo(vec_splat(vCoeff, 0), (vector signed short)vReadArr[0]);
+ vRsltOdd[1] = vec_mulo(vec_splat(vCoeff, 1), (vector signed short)vReadArr[1]);
+ vRsltOdd[2] = vec_mulo(vec_splat(vCoeff, 2), (vector signed short)vReadArr[2]);
+ vRsltOdd[3] = vec_mulo(vec_splat(vCoeff, 3), (vector signed short)vReadArr[3]);
+ vRsltOdd[4] = vec_mulo(vec_splat(vCoeff, 4), (vector signed short)vReadArr[4]);
+ vRsltOdd[5] = vec_mulo(vec_splat(vCoeff, 5), (vector signed short)vReadArr[5]);
+ vRsltOdd[6] = vec_mulo(vec_splat(vCoeff, 6), (vector signed short)vReadArr[6]);
+ vRsltOdd[7] = vec_mulo(vec_splat(vCoeff, 7), (vector signed short)vReadArr[7]);
+
+        // by convention, sums accumulate into the lower-numbered register of each pair
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[1]);
+ vRsltEven[2] = vec_add(vRsltEven[2], vRsltEven[3]);
+ vRsltEven[4] = vec_add(vRsltEven[4], vRsltEven[5]);
+ vRsltEven[6] = vec_add(vRsltEven[6], vRsltEven[7]);
+
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[1]);
+ vRsltOdd[2] = vec_add(vRsltOdd[2], vRsltOdd[3]);
+ vRsltOdd[4] = vec_add(vRsltOdd[4], vRsltOdd[5]);
+ vRsltOdd[6] = vec_add(vRsltOdd[6], vRsltOdd[7]);
+
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[2]);
+ vRsltEven[4] = vec_add(vRsltEven[4], vRsltEven[6]);
+
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[2]);
+ vRsltOdd[4] = vec_add(vRsltOdd[4], vRsltOdd[6]);
+
+        // the two vectors below now hold the final multiply-sums of the 8 taps
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[4]);
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[4]);
+ }
+
+ vRsltEven[0] = vec_add(vRsltEven[0], vOffset);
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vOffset);
+
+ vRsltEven[0] = vec_sra(vRsltEven[0], (vector unsigned int)vShift);
+ vRsltOdd[0] = vec_sra(vRsltOdd[0], (vector unsigned int)vShift);
+
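+    // pack the even/odd 32-bit sums to 16 bits, then re-interleave them into natural lane order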
+ *vVal = vec_pack(vRsltEven[0], vRsltOdd[0]);
+ *vVal = vec_perm(*vVal, *vVal, ((vector unsigned char)
+ {0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e,0x0f}));
+}
+
+extern inline
+void mult_add_p(int N, vector signed char vReadArr[], vector signed short vCoeff, vector signed int vOffset,
+vector unsigned int vShift, vector signed short* vVal, bool initXfer)
+{
+ vector signed short vOperand[8];
+ vector signed int vRsltEven[8];
+ vector signed int vRsltOdd[8];
+
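+    // zero-extend the byte reads to shorts (high or low half, per initXfer), then
+    // multiply even/odd lanes by the splatted taps and tree-add the partial sums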
+ if(initXfer)
+ {
+ vOperand[0] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[0]);
+ vOperand[1] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[1]);
+ vOperand[2] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[2]);
+ vOperand[3] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[3]);
+
+ if(N != 4)
+ {
+ vOperand[4] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[4]);
+ vOperand[5] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[5]);
+ vOperand[6] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[6]);
+ vOperand[7] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[7]);
+ }
+ }
+ else
+ {
+ vOperand[0] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[0]);
+ vOperand[1] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[1]);
+ vOperand[2] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[2]);
+ vOperand[3] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[3]);
+
+ if(N != 4)
+ {
+ vOperand[4] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[4]);
+ vOperand[5] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[5]);
+ vOperand[6] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[6]);
+ vOperand[7] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[7]);
+ }
+ }
+
+ if(N == 4)
+ {
+ vRsltEven[0] = vec_mule(vec_splat(vCoeff, 0), vOperand[0]);
+ vRsltEven[1] = vec_mule(vec_splat(vCoeff, 1), vOperand[1]);
+ vRsltEven[2] = vec_mule(vec_splat(vCoeff, 2), vOperand[2]);
+ vRsltEven[3] = vec_mule(vec_splat(vCoeff, 3), vOperand[3]);
+
+ vRsltOdd[0] = vec_mulo(vec_splat(vCoeff, 0), vOperand[0]);
+ vRsltOdd[1] = vec_mulo(vec_splat(vCoeff, 1), vOperand[1]);
+ vRsltOdd[2] = vec_mulo(vec_splat(vCoeff, 2), vOperand[2]);
+ vRsltOdd[3] = vec_mulo(vec_splat(vCoeff, 3), vOperand[3]);
+
+        // by convention, sums accumulate into the lower-numbered register of each pair
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[1]);
+ vRsltEven[2] = vec_add(vRsltEven[2], vRsltEven[3]);
+
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[1]);
+ vRsltOdd[2] = vec_add(vRsltOdd[2], vRsltOdd[3]);
+
+        // the two vectors below now hold the final multiply-sums of the 4 taps
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[2]);
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[2]);
+ }
+ else
+ {
+ vRsltEven[0] = vec_mule(vec_splat(vCoeff, 0), vOperand[0]);
+ vRsltEven[1] = vec_mule(vec_splat(vCoeff, 1), vOperand[1]);
+ vRsltEven[2] = vec_mule(vec_splat(vCoeff, 2), vOperand[2]);
+ vRsltEven[3] = vec_mule(vec_splat(vCoeff, 3), vOperand[3]);
+ vRsltEven[4] = vec_mule(vec_splat(vCoeff, 4), vOperand[4]);
+ vRsltEven[5] = vec_mule(vec_splat(vCoeff, 5), vOperand[5]);
+ vRsltEven[6] = vec_mule(vec_splat(vCoeff, 6), vOperand[6]);
+ vRsltEven[7] = vec_mule(vec_splat(vCoeff, 7), vOperand[7]);
+
+ vRsltOdd[0] = vec_mulo(vec_splat(vCoeff, 0), vOperand[0]);
+ vRsltOdd[1] = vec_mulo(vec_splat(vCoeff, 1), vOperand[1]);
+ vRsltOdd[2] = vec_mulo(vec_splat(vCoeff, 2), vOperand[2]);
+ vRsltOdd[3] = vec_mulo(vec_splat(vCoeff, 3), vOperand[3]);
+ vRsltOdd[4] = vec_mulo(vec_splat(vCoeff, 4), vOperand[4]);
+ vRsltOdd[5] = vec_mulo(vec_splat(vCoeff, 5), vOperand[5]);
+ vRsltOdd[6] = vec_mulo(vec_splat(vCoeff, 6), vOperand[6]);
+ vRsltOdd[7] = vec_mulo(vec_splat(vCoeff, 7), vOperand[7]);
+
+        // by convention, sums accumulate into the lower-numbered register of each pair
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[1]);
+ vRsltEven[2] = vec_add(vRsltEven[2], vRsltEven[3]);
+ vRsltEven[4] = vec_add(vRsltEven[4], vRsltEven[5]);
+ vRsltEven[6] = vec_add(vRsltEven[6], vRsltEven[7]);
+
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[1]);
+ vRsltOdd[2] = vec_add(vRsltOdd[2], vRsltOdd[3]);
+ vRsltOdd[4] = vec_add(vRsltOdd[4], vRsltOdd[5]);
+ vRsltOdd[6] = vec_add(vRsltOdd[6], vRsltOdd[7]);
+
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[2]);
+ vRsltEven[4] = vec_add(vRsltEven[4], vRsltEven[6]);
+
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[2]);
+ vRsltOdd[4] = vec_add(vRsltOdd[4], vRsltOdd[6]);
+
+        // the two vectors below now hold the final multiply-sums of the 8 taps
+ vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[4]);
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[4]);
+ }
+
+ vRsltEven[0] = vec_add(vRsltEven[0], vOffset);
+ vRsltOdd[0] = vec_add(vRsltOdd[0], vOffset);
+
+ vRsltEven[0] = vec_sra(vRsltEven[0], (vector unsigned int)vShift);
+ vRsltOdd[0] = vec_sra(vRsltOdd[0], (vector unsigned int)vShift);
+
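+    // pack the even/odd 32-bit sums to 16 bits, then re-interleave them into natural lane order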
+ *vVal = vec_pack(vRsltEven[0], vRsltOdd[0]);
+ *vVal = vec_perm(*vVal, *vVal, ((vector unsigned char)
+ {0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e,0x0f}));
+}
+
+extern inline
+void compute_vert_sp(int N, vector signed char vReadArr[], vector signed char* vDst, vector signed int vOffset,
+vector signed short vCoeff, vector unsigned int vShift, vector unsigned short vMaxVal)
+{
+ vector signed short vVal;
+ vector bool short compare;
+
+ mult_add_s(N, vReadArr, vCoeff, vOffset, vShift, &vVal);
+
+ compare = vec_cmplt(vVal, vec_splats((signed short)0));
+ vVal = vec_sel(vVal, vec_splats((signed short)0), compare);
+
+ compare = vec_cmpgt(vVal, (vector signed short)vMaxVal);
+ vVal = vec_sel(vVal, (vector signed short)vMaxVal, compare);
+
+ *vDst = (vector signed char)vVal;
+}
+
+extern inline
+void compute_vert_ss(int N, vector signed char vReadArr[], vector signed char* vDst, vector signed short vCoeff,
+vector unsigned int vShift)
+{
+ vector signed short vVal;
+
+ mult_add_s(N, vReadArr, vCoeff, vec_splats((signed int)0), vShift, &vVal);
+
+ *vDst = (vector signed char)vVal;
+}
+
+extern inline
+void store_value(int dstByteCnt, signed char* dstAddr, vector signed char vDst, vector signed char vMask)
+{
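+    // masked read-modify-write store: merge dstByteCnt bytes of vDst into the
+    // destination quadword(s) without disturbing the surrounding bytes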
+ signed char* addrHi = dstAddr;
+ signed char* addrLo = (signed char*)((unsigned long)dstAddr + dstByteCnt - 1);
+
+ vector unsigned char vPerm = vec_lvsr(0, addrHi);
+ vector signed char vHi = vec_ld(0, addrHi);
+ vector signed char vLo = vec_splats((signed char)0);
+
+ vDst = vec_perm(vDst, vDst, vPerm);
+ vMask = vec_perm(vec_splats((signed char)0), vMask, vPerm);
+ vHi = vec_sel(vHi, vDst, (vector unsigned char)vMask);
+ vec_st(vHi, 0, addrHi);
+
+ if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful)))
+ {
+ vLo = vec_ld(0, addrLo);
+ vLo = vec_sel(vDst, vLo, (vector unsigned char)vMask);
+ vec_st(vLo, 0, addrLo);
+ }
+}