[x265-commits] [x265] primitives: drop ipfilter[FILTER_H_P_S_[4|8], including ipfilter-ssse3.cpp
Steve Borho
steve at borho.org
Thu Dec 12 20:54:04 CET 2013
details: http://hg.videolan.org/x265/rev/a87f12ebb55b
branches:
changeset: 5710:a87f12ebb55b
user: Steve Borho <steve at borho.org>
date: Wed Dec 11 15:37:02 2013 -0600
description:
primitives: drop ipfilter[FILTER_H_P_S_[4|8], including ipfilter-ssse3.cpp
Subject: [x265] yuv: Support 4GB+ YUV files for output
details: http://hg.videolan.org/x265/rev/06e88ad6d922
branches: stable
changeset: 5711:06e88ad6d922
user: David Bachelart <david.bachelart at bbright.com>
date: Thu Dec 12 11:32:27 2013 +0100
description:
yuv: Support 4GB+ YUV files for output
Subject: [x265] Merge with stable
details: http://hg.videolan.org/x265/rev/033a65692b6a
branches:
changeset: 5712:033a65692b6a
user: Steve Borho <steve at borho.org>
date: Thu Dec 12 13:53:33 2013 -0600
description:
Merge with stable
diffstat:
source/common/CMakeLists.txt | 2 +-
source/common/ipfilter.cpp | 46 +----------
source/common/primitives.h | 2 -
source/common/vec/ipfilter-ssse3.cpp | 143 -----------------------------------
source/common/vec/vec-primitives.cpp | 2 -
source/output/yuv.cpp | 8 +-
6 files changed, 9 insertions(+), 194 deletions(-)
diffs (297 lines):
diff -r 25f412ecaba2 -r 033a65692b6a source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Wed Dec 11 15:12:02 2013 -0600
+++ b/source/common/CMakeLists.txt Thu Dec 12 13:53:33 2013 -0600
@@ -62,7 +62,7 @@ if(MSVC)
endif(MSVC)
set(SSE3 vec/dct-sse3.cpp vec/blockcopy-sse3.cpp)
-set(SSSE3 vec/dct-ssse3.cpp vec/ipfilter-ssse3.cpp vec/intra-ssse3.cpp)
+set(SSSE3 vec/dct-ssse3.cpp vec/intra-ssse3.cpp)
set(SSE41 vec/dct-sse41.cpp vec/ipfilter-sse41.cpp vec/intra-sse41.cpp)
if (MSVC)
diff -r 25f412ecaba2 -r 033a65692b6a source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Wed Dec 11 15:12:02 2013 -0600
+++ b/source/common/ipfilter.cpp Thu Dec 12 13:53:33 2013 -0600
@@ -152,43 +152,6 @@ void filterVertical_ps_c(pixel *src, int
}
}
-template<int N>
-void filterHorizontal_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
-{
- int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
- int shift = IF_FILTER_PREC - headRoom;
- int offset = -IF_INTERNAL_OFFS << shift;
-
- src -= N / 2 - 1;
-
- int row, col;
- for (row = 0; row < height; row++)
- {
- for (col = 0; col < width; col++)
- {
- int sum;
-
- sum = src[col + 0] * coeff[0];
- sum += src[col + 1] * coeff[1];
- sum += src[col + 2] * coeff[2];
- sum += src[col + 3] * coeff[3];
- if (N == 8)
- {
- sum += src[col + 4] * coeff[4];
- sum += src[col + 5] * coeff[5];
- sum += src[col + 6] * coeff[6];
- sum += src[col + 7] * coeff[7];
- }
-
- int16_t val = (int16_t)((sum + offset) >> shift);
- dst[col] = val;
- }
-
- src += srcStride;
- dst += dstStride;
- }
-}
-
template<int dstStride>
void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
{
@@ -477,10 +440,9 @@ typedef void (*ipfilter_sp_t)(short *src
template<int N, int width, int height>
void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
{
- short m_immedVals[(64 + 8) * (64 + 8)];
-
- filterHorizontal_ps_c<N>(src - 3 * srcStride, srcStride, m_immedVals, width, width, height + 7, g_lumaFilter[idxX]);
- filterVertical_sp_c<N>(m_immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
+ short immedVals[(64 + 8) * (64 + 8)];
+ interp_horiz_ps_c<N, width, height>(src, srcStride, immedVals, width, idxX, 1);
+ filterVertical_sp_c<N>(immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
}
}
@@ -556,9 +518,7 @@ void Setup_C_IPFilterPrimitives(EncoderP
LUMA(16, 64);
CHROMA(8, 32);
- p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_ps_c<8>;
p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_ps_c<8>;
- p.ipfilter_ps[FILTER_H_P_S_4] = filterHorizontal_ps_c<4>;
p.ipfilter_ps[FILTER_V_P_S_4] = filterVertical_ps_c<4>;
p.ipfilter_ss[FILTER_V_S_S_8] = filterVertical_ss_c<8>;
p.ipfilter_ss[FILTER_V_S_S_4] = filterVertical_ss_c<4>;
diff -r 25f412ecaba2 -r 033a65692b6a source/common/primitives.h
--- a/source/common/primitives.h Wed Dec 11 15:12:02 2013 -0600
+++ b/source/common/primitives.h Thu Dec 12 13:53:33 2013 -0600
@@ -118,8 +118,6 @@ enum IDcts
enum IPFilterConf_P_S
{
- FILTER_H_P_S_8,
- FILTER_H_P_S_4,
FILTER_V_P_S_8,
FILTER_V_P_S_4,
NUM_IPFILTER_P_S
diff -r 25f412ecaba2 -r 033a65692b6a source/common/vec/ipfilter-ssse3.cpp
--- a/source/common/vec/ipfilter-ssse3.cpp Wed Dec 11 15:12:02 2013 -0600
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,143 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Deepthi Devaki <deepthidevaki at multicorewareinc.com>,
- * Rajesh Paulraj <rajesh at multicorewareinc.com>
- * Mandar Gurav <mandar at multicorewareinc.com>
- * Mahesh Pittala <mahesh at multicorewareinc.com>
- * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
- * Nabajit Deka <nabajit at multicorewareinc.com>
- * Min Chen <chenm003 at 163.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing at multicorewareinc.com.
- *****************************************************************************/
-
-#include "primitives.h"
-#include "TLibCommon/TComRom.h"
-#include <xmmintrin.h> // SSE
-#include <pmmintrin.h> // SSE3
-#include <tmmintrin.h> // SSSE3
-#include <string.h>
-
-#if !HIGH_BIT_DEPTH
-namespace {
-template<int N>
-void filterHorizontal_ps(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
-{
- src -= (N / 2 - 1);
-
- int offset;
- int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
- int shift = IF_FILTER_PREC;
- shift -= headRoom;
- offset = -IF_INTERNAL_OFFS << shift;
-
- int row, col;
-
- __m128i a = _mm_loadu_si128((__m128i*)coeff);
- __m128i T10 = _mm_packs_epi16(a, a);
-
- __m128i S1 = _mm_slli_si128(T10, 12);
- __m128i S2 = _mm_srli_si128(S1, 4);
- __m128i S3 = _mm_srli_si128(S2, 4);
- __m128i S4 = _mm_srli_si128(S3, 4);
- __m128i S = _mm_add_epi8(S1, _mm_add_epi8(S2, S3));
- S = _mm_add_epi8(S, S4);
-
- __m128i Tm1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8);
- __m128i Tm2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10);
- __m128i Tm3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12);
- __m128i Tm4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14);
- __m128i Tm5 = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
- __m128i Tm6 = _mm_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10);
-
- for (row = 0; row < height; row++)
- {
- col = 0;
- for (; col < (width - 7); col += 8)
- {
- __m128i srcCoeff = _mm_loadu_si128((__m128i*)(src + col));
- __m128i sum;
-
- if (N == 4)
- {
- __m128i T00 = _mm_shuffle_epi8(srcCoeff, Tm5);
- __m128i T20 = _mm_maddubs_epi16(T00, S);
-
- __m128i T30 = _mm_shuffle_epi8(srcCoeff, Tm6);
- __m128i T40 = _mm_maddubs_epi16(T30, S);
-
- sum = _mm_hadd_epi16(T20, T40);
- }
- else // (N == 8)
- {
- __m128i T00 = _mm_shuffle_epi8(srcCoeff, Tm1);
- __m128i T20 = _mm_maddubs_epi16(T00, T10);
-
- __m128i T30 = _mm_shuffle_epi8(srcCoeff, Tm2);
- __m128i T40 = _mm_maddubs_epi16(T30, T10);
-
- __m128i T50 = _mm_shuffle_epi8(srcCoeff, Tm3);
- __m128i T60 = _mm_maddubs_epi16(T50, T10);
-
- __m128i T70 = _mm_shuffle_epi8(srcCoeff, Tm4);
- __m128i T80 = _mm_maddubs_epi16(T70, T10);
-
- __m128i s1 = _mm_hadd_epi16(T20, T40);
- __m128i s2 = _mm_hadd_epi16(T60, T80);
- sum = _mm_hadd_epi16(s1, s2);
- }
-
- __m128i sumOffset = _mm_set1_epi16(offset);
- __m128i val = _mm_add_epi16(sum, sumOffset);
-
- val = _mm_srai_epi16(val, shift);
- _mm_storeu_si128((__m128i*)&dst[col], val);
- }
-
- for (; col < width; col++) // Remaining iterations
- {
- __m128i NewSrc = _mm_loadl_epi64((__m128i*)(src + col));
- __m128i T00 = _mm_maddubs_epi16(NewSrc, T10);
- __m128i add = _mm_hadd_epi16(T00, T00);
- int16_t sum = _mm_extract_epi16(add, 0);
- if (N == 8)
- {
- add = _mm_hadd_epi16(add, add);
- sum = _mm_extract_epi16(add, 0);
- }
- int16_t val = (int16_t)(sum + offset) >> shift;
- dst[col] = val;
- }
-
- src += srcStride;
- dst += dstStride;
- }
-}
-}
-#endif // if !HIGH_BIT_DEPTH
-
-namespace x265 {
-void Setup_Vec_IPFilterPrimitives_ssse3(EncoderPrimitives& p)
-{
-#if !HIGH_BIT_DEPTH
- p.ipfilter_ps[FILTER_H_P_S_4] = filterHorizontal_ps<4>;
- p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_ps<8>;
-#endif
-}
-}
diff -r 25f412ecaba2 -r 033a65692b6a source/common/vec/vec-primitives.cpp
--- a/source/common/vec/vec-primitives.cpp Wed Dec 11 15:12:02 2013 -0600
+++ b/source/common/vec/vec-primitives.cpp Thu Dec 12 13:53:33 2013 -0600
@@ -60,7 +60,6 @@ void Setup_Vec_DCTPrimitives_sse41(Encod
void Setup_Vec_IPredPrimitives_ssse3(EncoderPrimitives&);
void Setup_Vec_IPredPrimitives_sse41(EncoderPrimitives&);
-void Setup_Vec_IPFilterPrimitives_ssse3(EncoderPrimitives&);
void Setup_Vec_IPFilterPrimitives_sse41(EncoderPrimitives&);
/* Use primitives for the best available vector architecture */
@@ -77,7 +76,6 @@ void Setup_Instrinsic_Primitives(Encoder
if (cpuMask & X265_CPU_SSSE3)
{
Setup_Vec_IPredPrimitives_ssse3(p);
- Setup_Vec_IPFilterPrimitives_ssse3(p);
Setup_Vec_DCTPrimitives_ssse3(p);
}
#endif
diff -r 25f412ecaba2 -r 033a65692b6a source/output/yuv.cpp
--- a/source/output/yuv.cpp Wed Dec 11 15:12:02 2013 -0600
+++ b/source/output/yuv.cpp Thu Dec 12 13:53:33 2013 -0600
@@ -55,11 +55,13 @@ bool YUVOutput::writePicture(const x265_
{
PPAStartCpuEventFunc(write_yuv);
+ uint64_t fileOffset = pic.poc;
+ fileOffset *= frameSize;
#if HIGH_BIT_DEPTH
if (depth == 8)
{
int shift = pic.bitDepth - 8;
- ofs.seekp(pic.poc * frameSize);
+ ofs.seekp(fileOffset);
for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
{
uint16_t *src = (uint16_t*)pic.planes[i];
@@ -77,7 +79,7 @@ bool YUVOutput::writePicture(const x265_
}
else
{
- ofs.seekp(pic.poc * frameSize * 2);
+ ofs.seekp(fileOffset * 2);
for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
{
uint16_t *src = (uint16_t*)pic.planes[i];
@@ -89,7 +91,7 @@ bool YUVOutput::writePicture(const x265_
}
}
#else // if HIGH_BIT_DEPTH
- ofs.seekp(pic.poc * frameSize);
+ ofs.seekp(fileOffset);
for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
{
char *src = (char*)pic.planes[i];
More information about the x265-commits mailing list