[x265-commits] [x265] vec: remove intrinsic predDCFiltering() and intra_pred_dc(), we have ASM
Steve Borho
steve at borho.org
Fri Nov 22 19:56:46 CET 2013
details: http://hg.videolan.org/x265/rev/285fe7a59cf5
branches:
changeset: 5279:285fe7a59cf5
user: Steve Borho <steve at borho.org>
date: Fri Nov 22 12:31:33 2013 -0600
description:
vec: remove intrinsic predDCFiltering() and intra_pred_dc(), we have ASM
Subject: [x265] pixel-sse41: cleanup intrinsic weight function names
details: http://hg.videolan.org/x265/rev/a6da40d0584f
branches:
changeset: 5280:a6da40d0584f
user: Steve Borho <steve at borho.org>
date: Fri Nov 22 12:40:40 2013 -0600
description:
pixel-sse41: cleanup intrinsic weight function names
Subject: [x265] pixel-ssse3: remove scale1D_128to64, we have ASM
details: http://hg.videolan.org/x265/rev/7e2694f9a226
branches:
changeset: 5281:7e2694f9a226
user: Steve Borho <steve at borho.org>
date: Fri Nov 22 12:42:14 2013 -0600
description:
pixel-ssse3: remove scale1D_128to64, we have ASM
Subject: [x265] cmake: merge pixel-sse3.cpp into pixel-ssse3.cpp
details: http://hg.videolan.org/x265/rev/e5db4a4859ee
branches:
changeset: 5282:e5db4a4859ee
user: Steve Borho <steve at borho.org>
date: Fri Nov 22 12:46:19 2013 -0600
description:
cmake: merge pixel-sse3.cpp into pixel-ssse3.cpp
Both files had only one primitive each, and both will probably be
replaced soon.
diffstat:
source/common/CMakeLists.txt | 2 +-
source/common/vec/intra-sse41.cpp | 197 +----------------------------------
source/common/vec/pixel-sse3.cpp | 61 ----------
source/common/vec/pixel-sse41.cpp | 10 +-
source/common/vec/pixel-ssse3.cpp | 49 +++-----
source/common/vec/vec-primitives.cpp | 4 -
6 files changed, 25 insertions(+), 298 deletions(-)
diffs (truncated from 433 to 300 lines):
diff -r e28d9b6b5d65 -r e5db4a4859ee source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Fri Nov 22 12:02:55 2013 -0600
+++ b/source/common/CMakeLists.txt Fri Nov 22 12:46:19 2013 -0600
@@ -72,7 +72,7 @@ if(ENABLE_PRIMITIVES_VEC)
if(HIGH_BIT_DEPTH)
include_directories(../VectorClass)
endif()
- set(SSE3 vec/pixel-sse3.cpp vec/dct-sse3.cpp vec/blockcopy-sse3.cpp)
+ set(SSE3 vec/dct-sse3.cpp vec/blockcopy-sse3.cpp)
set(SSSE3 vec/pixel-ssse3.cpp vec/dct-ssse3.cpp vec/ipfilter-ssse3.cpp vec/intra-ssse3.cpp)
set(SSE41 vec/pixel-sse41.cpp vec/dct-sse41.cpp vec/ipfilter-sse41.cpp vec/intra-sse41.cpp vec/pixel16-sse41.cpp)
diff -r e28d9b6b5d65 -r e5db4a4859ee source/common/vec/intra-sse41.cpp
--- a/source/common/vec/intra-sse41.cpp Fri Nov 22 12:02:55 2013 -0600
+++ b/source/common/vec/intra-sse41.cpp Fri Nov 22 12:46:19 2013 -0600
@@ -35,197 +35,6 @@ using namespace x265;
namespace {
#if !HIGH_BIT_DEPTH
-inline void predDCFiltering(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)
-{
- int y;
- pixel pixDC = *dst;
- int pixDCx3 = pixDC * 3 + 2;
-
- // boundary pixels processing
- dst[0] = (pixel)((above[0] + left[0] + 2 * pixDC + 2) >> 2);
-
- __m128i im1 = _mm_set1_epi16(pixDCx3);
- __m128i im2, im3;
-
- __m128i pix;
- switch (width)
- {
- case 4:
- pix = _mm_cvtsi32_si128(*(uint32_t*)&above[1]);
- im2 = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
- im2 = _mm_srai_epi16(_mm_add_epi16(im1, im2), 2);
- pix = _mm_packus_epi16(im2, im2);
- *(uint32_t*)&dst[1] = _mm_cvtsi128_si32(pix);
- break;
-
- case 8:
- pix = _mm_loadl_epi64((__m128i*)&above[1]);
- im2 = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
- im2 = _mm_srai_epi16(_mm_add_epi16(im1, im2), 2);
- pix = _mm_packus_epi16(im2, im2);
- _mm_storel_epi64((__m128i*)&dst[1], pix);
- break;
-
- case 16:
- pix = _mm_loadu_si128((__m128i*)&above[1]);
- im2 = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
- im3 = _mm_unpackhi_epi8(pix, _mm_setzero_si128());
- im2 = _mm_srai_epi16(_mm_add_epi16(im1, im2), 2);
- im3 = _mm_srai_epi16(_mm_add_epi16(im1, im3), 2);
- pix = _mm_packus_epi16(im2, im3);
- _mm_storeu_si128((__m128i*)&dst[1], pix);
- break;
-
- case 32:
- pix = _mm_loadu_si128((__m128i*)&above[1]);
- im2 = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
- im3 = _mm_unpackhi_epi8(pix, _mm_setzero_si128());
- im2 = _mm_srai_epi16(_mm_add_epi16(im1, im2), 2);
- im3 = _mm_srai_epi16(_mm_add_epi16(im1, im3), 2);
- pix = _mm_packus_epi16(im2, im3);
- _mm_storeu_si128((__m128i*)&dst[1], pix);
-
- pix = _mm_loadu_si128((__m128i*)&above[1 + 16]);
- im2 = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
- im3 = _mm_unpackhi_epi8(pix, _mm_setzero_si128());
- im2 = _mm_srai_epi16(_mm_add_epi16(im1, im2), 2);
- im3 = _mm_srai_epi16(_mm_add_epi16(im1, im3), 2);
- pix = _mm_packus_epi16(im2, im3);
- _mm_storeu_si128((__m128i*)&dst[1 + 16], pix);
- break;
- }
-
- for (y = 1; y < width; y++)
- {
- dst[dstStride] = (pixel)((left[y] + pixDCx3) >> 2);
- dst += dstStride;
- }
-}
-
-template<int width>
-void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
-{
- int sum;
- int logSize = g_convertToBit[width] + 2;
-
- __m128i pixL, pixT, temp;
-
- switch (width)
- {
- case 4:
- pixL = _mm_cvtsi32_si128(*(uint32_t*)left);
- pixT = _mm_cvtsi32_si128(*(uint32_t*)above);
- pixL = _mm_unpacklo_epi8(pixL, _mm_setzero_si128());
- pixT = _mm_unpacklo_epi8(pixT, _mm_setzero_si128());
- temp = _mm_add_epi16(pixL, pixT);
- sum = _mm_cvtsi128_si32(_mm_hadd_epi16(_mm_hadd_epi16(temp, temp), temp));
- break;
- case 8:
-#if X86_64
- pixL = _mm_cvtsi64_si128(*(uint64_t*)left);
- pixT = _mm_cvtsi64_si128(*(uint64_t*)above);
-#else
- pixL = _mm_loadu_si128((__m128i*)left);
- pixT = _mm_loadu_si128((__m128i*)above);
-#endif
- pixL = _mm_unpacklo_epi8(pixL, _mm_setzero_si128());
- pixT = _mm_unpacklo_epi8(pixT, _mm_setzero_si128());
- temp = _mm_add_epi16(pixL, pixT);
- sum = _mm_cvtsi128_si32(_mm_hadd_epi16(_mm_hadd_epi16(_mm_hadd_epi16(temp, temp), temp), temp));
- break;
- case 16:
- pixL = _mm_loadu_si128((__m128i*)left);
- pixT = _mm_loadu_si128((__m128i*)above);
- temp = _mm_sad_epu8(pixL, _mm_setzero_si128());
- temp = _mm_add_epi16(temp, _mm_sad_epu8(pixT, _mm_setzero_si128()));
- sum = _mm_cvtsi128_si32(_mm_add_epi32(_mm_shuffle_epi32(temp, 2), temp));
- break;
-
- default:
- case 32:
- pixL = _mm_loadu_si128((__m128i*)left);
- temp = _mm_sad_epu8(pixL, _mm_setzero_si128());
- pixL = _mm_loadu_si128((__m128i*)(left + 16));
- temp = _mm_add_epi16(temp, _mm_sad_epu8(pixL, _mm_setzero_si128()));
-
- pixT = _mm_loadu_si128((__m128i*)above);
- temp = _mm_add_epi16(temp, _mm_sad_epu8(pixT, _mm_setzero_si128()));
- pixT = _mm_loadu_si128((__m128i*)(above + 16));
- temp = _mm_add_epi16(temp, _mm_sad_epu8(pixT, _mm_setzero_si128()));
- sum = _mm_cvtsi128_si32(_mm_add_epi32(_mm_shuffle_epi32(temp, 2), temp));
- break;
- }
-
- logSize += 1;
- pixel dcVal = (sum + (1 << (logSize - 1))) >> logSize;
- __m128i dcValN = _mm_set1_epi8(dcVal);
-
- pixel *dst1 = dst;
- switch (width)
- {
- case 4:
- *(uint32_t*)dst1 = _mm_cvtsi128_si32(dcValN);
- dst1 += dstStride;
- *(uint32_t*)dst1 = _mm_cvtsi128_si32(dcValN);
- dst1 += dstStride;
- *(uint32_t*)dst1 = _mm_cvtsi128_si32(dcValN);
- dst1 += dstStride;
- *(uint32_t*)dst1 = _mm_cvtsi128_si32(dcValN);
- break;
-
- case 8:
- _mm_storel_epi64((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storel_epi64((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storel_epi64((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storel_epi64((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storel_epi64((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storel_epi64((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storel_epi64((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storel_epi64((__m128i*)dst1, dcValN);
- break;
-
- case 16:
- for (int k = 0; k < 16; k += 4)
- {
- _mm_storeu_si128((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storeu_si128((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storeu_si128((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- _mm_storeu_si128((__m128i*)dst1, dcValN);
- dst1 += dstStride;
- }
-
- break;
-
- case 32:
- for (int k = 0; k < 32; k += 2)
- {
- _mm_storeu_si128((__m128i*)dst1, dcValN);
- _mm_storeu_si128((__m128i*)(dst1 + 16), dcValN);
- dst1 += dstStride;
- _mm_storeu_si128((__m128i*)dst1, dcValN);
- _mm_storeu_si128((__m128i*)(dst1 + 16), dcValN);
- dst1 += dstStride;
- }
-
- break;
- }
-
- if (filter)
- {
- predDCFiltering(above, left, dst, dstStride, width);
- }
-}
-
__m128i v_multiL, v_multiH, v_multiH2, v_multiH3, v_multiH4, v_multiH5, v_multiH6, v_multiH7;
__m128i v_multi_2Row;
@@ -8696,16 +8505,12 @@ void Setup_Vec_IPredPrimitives_sse41(Enc
p.intra_pred_planar[BLOCK_32x32] = intra_pred_planar32_sse4;
p.intra_pred_planar[BLOCK_64x64] = intra_pred_planar64_sse4;
- p.intra_pred_dc[BLOCK_4x4] = intra_pred_dc<4>;
- p.intra_pred_dc[BLOCK_8x8] = intra_pred_dc<8>;
- p.intra_pred_dc[BLOCK_16x16] = intra_pred_dc<16>;
- p.intra_pred_dc[BLOCK_32x32] = intra_pred_dc<32>;
-
#if defined(__GNUC__) || defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER == 1500))
p.intra_pred_allangs[0] = predIntraAngs4;
p.intra_pred_allangs[1] = predIntraAngs8;
p.intra_pred_allangs[2] = predIntraAngs16;
p.intra_pred_allangs[3] = predIntraAngs32;
+
#elif defined(_MSC_VER) && defined(X86_64)
/* VC10 and VC11 both generate bad Win32 code for all these functions.
diff -r e28d9b6b5d65 -r e5db4a4859ee source/common/vec/pixel-sse3.cpp
--- a/source/common/vec/pixel-sse3.cpp Fri Nov 22 12:02:55 2013 -0600
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Steve Borho <steve at borho.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing at multicorewareinc.com
- *****************************************************************************/
-
-/* this file instantiates SSE3 versions of the pixel primitives */
-
-#include "primitives.h"
-#include <assert.h>
-#include <xmmintrin.h> // SSE
-#include <pmmintrin.h> // SSE3
-
-using namespace x265;
-
-namespace {
-void convert16to32_shl(int32_t *dst, int16_t *org, intptr_t stride, int shift, int size)
-{
- int i, j;
-
- for (i = 0; i < size; i++)
- {
- for (j = 0; j < size; j += 4)
- {
- __m128i im16;
- __m128i im32;
-
- im16 = _mm_loadl_epi64((__m128i*)&org[i * stride + j]);
- im32 = _mm_srai_epi32(_mm_unpacklo_epi16(im16, im16), 16);
- im32 = _mm_slli_epi32(im32, shift);
- _mm_storeu_si128((__m128i*)dst, im32);
-
- dst += 4;
- }
- }
-}
-}
-
-namespace x265 {
-void Setup_Vec_PixelPrimitives_sse3(EncoderPrimitives &p)
-{
- p.cvt16to32_shl = convert16to32_shl;
-}
-}
diff -r e28d9b6b5d65 -r e5db4a4859ee source/common/vec/pixel-sse41.cpp
--- a/source/common/vec/pixel-sse41.cpp Fri Nov 22 12:02:55 2013 -0600
+++ b/source/common/vec/pixel-sse41.cpp Fri Nov 22 12:46:19 2013 -0600
@@ -33,7 +33,7 @@ using namespace x265;
More information about the x265-commits mailing list