[x265-commits] [x265] primitives: drop ipfilter[FILTER_H_P_S_[4|8], including ipfilter-ssse3.cpp

Steve Borho steve at borho.org
Thu Dec 12 20:54:04 CET 2013


details:   http://hg.videolan.org/x265/rev/a87f12ebb55b
branches:  
changeset: 5710:a87f12ebb55b
user:      Steve Borho <steve at borho.org>
date:      Wed Dec 11 15:37:02 2013 -0600
description:
primitives: drop ipfilter_ps[FILTER_H_P_S_[4|8]], including ipfilter-ssse3.cpp
Subject: [x265] yuv: Support 4GB+ YUV files for output

details:   http://hg.videolan.org/x265/rev/06e88ad6d922
branches:  stable
changeset: 5711:06e88ad6d922
user:      David Bachelart <david.bachelart at bbright.com>
date:      Thu Dec 12 11:32:27 2013 +0100
description:
yuv: Support 4GB+ YUV files for output
Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/033a65692b6a
branches:  
changeset: 5712:033a65692b6a
user:      Steve Borho <steve at borho.org>
date:      Thu Dec 12 13:53:33 2013 -0600
description:
Merge with stable

diffstat:

 source/common/CMakeLists.txt         |    2 +-
 source/common/ipfilter.cpp           |   46 +----------
 source/common/primitives.h           |    2 -
 source/common/vec/ipfilter-ssse3.cpp |  143 -----------------------------------
 source/common/vec/vec-primitives.cpp |    2 -
 source/output/yuv.cpp                |    8 +-
 6 files changed, 9 insertions(+), 194 deletions(-)

diffs (297 lines):

diff -r 25f412ecaba2 -r 033a65692b6a source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Wed Dec 11 15:12:02 2013 -0600
+++ b/source/common/CMakeLists.txt	Thu Dec 12 13:53:33 2013 -0600
@@ -62,7 +62,7 @@ if(MSVC)
 endif(MSVC)
 
 set(SSE3  vec/dct-sse3.cpp  vec/blockcopy-sse3.cpp)
-set(SSSE3 vec/dct-ssse3.cpp vec/ipfilter-ssse3.cpp vec/intra-ssse3.cpp)
+set(SSSE3 vec/dct-ssse3.cpp vec/intra-ssse3.cpp)
 set(SSE41 vec/dct-sse41.cpp vec/ipfilter-sse41.cpp vec/intra-sse41.cpp)
 
 if (MSVC)
diff -r 25f412ecaba2 -r 033a65692b6a source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Wed Dec 11 15:12:02 2013 -0600
+++ b/source/common/ipfilter.cpp	Thu Dec 12 13:53:33 2013 -0600
@@ -152,43 +152,6 @@ void filterVertical_ps_c(pixel *src, int
     }
 }
 
-template<int N>
-void filterHorizontal_ps_c(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
-{
-    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
-    int shift = IF_FILTER_PREC - headRoom;
-    int offset = -IF_INTERNAL_OFFS << shift;
-
-    src -= N / 2 - 1;
-
-    int row, col;
-    for (row = 0; row < height; row++)
-    {
-        for (col = 0; col < width; col++)
-        {
-            int sum;
-
-            sum  = src[col + 0] * coeff[0];
-            sum += src[col + 1] * coeff[1];
-            sum += src[col + 2] * coeff[2];
-            sum += src[col + 3] * coeff[3];
-            if (N == 8)
-            {
-                sum += src[col + 4] * coeff[4];
-                sum += src[col + 5] * coeff[5];
-                sum += src[col + 6] * coeff[6];
-                sum += src[col + 7] * coeff[7];
-            }
-
-            int16_t val = (int16_t)((sum + offset) >> shift);
-            dst[col] = val;
-        }
-
-        src += srcStride;
-        dst += dstStride;
-    }
-}
-
 template<int dstStride>
 void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
 {
@@ -477,10 +440,9 @@ typedef void (*ipfilter_sp_t)(short *src
 template<int N, int width, int height>
 void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
 {
-    short m_immedVals[(64 + 8) * (64 + 8)];
-
-    filterHorizontal_ps_c<N>(src - 3 * srcStride, srcStride, m_immedVals, width, width, height + 7, g_lumaFilter[idxX]);
-    filterVertical_sp_c<N>(m_immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
+    short immedVals[(64 + 8) * (64 + 8)];
+    interp_horiz_ps_c<N, width, height>(src, srcStride, immedVals, width, idxX, 1);
+    filterVertical_sp_c<N>(immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
 }
 }
 
@@ -556,9 +518,7 @@ void Setup_C_IPFilterPrimitives(EncoderP
     LUMA(16, 64);
     CHROMA(8, 32);
 
-    p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_ps_c<8>;
     p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_ps_c<8>;
-    p.ipfilter_ps[FILTER_H_P_S_4] = filterHorizontal_ps_c<4>;
     p.ipfilter_ps[FILTER_V_P_S_4] = filterVertical_ps_c<4>;
     p.ipfilter_ss[FILTER_V_S_S_8] = filterVertical_ss_c<8>;
     p.ipfilter_ss[FILTER_V_S_S_4] = filterVertical_ss_c<4>;
diff -r 25f412ecaba2 -r 033a65692b6a source/common/primitives.h
--- a/source/common/primitives.h	Wed Dec 11 15:12:02 2013 -0600
+++ b/source/common/primitives.h	Thu Dec 12 13:53:33 2013 -0600
@@ -118,8 +118,6 @@ enum IDcts
 
 enum IPFilterConf_P_S
 {
-    FILTER_H_P_S_8,
-    FILTER_H_P_S_4,
     FILTER_V_P_S_8,
     FILTER_V_P_S_4,
     NUM_IPFILTER_P_S
diff -r 25f412ecaba2 -r 033a65692b6a source/common/vec/ipfilter-ssse3.cpp
--- a/source/common/vec/ipfilter-ssse3.cpp	Wed Dec 11 15:12:02 2013 -0600
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,143 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Deepthi Devaki <deepthidevaki at multicorewareinc.com>,
- *          Rajesh Paulraj <rajesh at multicorewareinc.com>
- *          Mandar Gurav <mandar at multicorewareinc.com>
- *          Mahesh Pittala <mahesh at multicorewareinc.com>
- *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
- *          Nabajit Deka <nabajit at multicorewareinc.com>
- *          Min Chen <chenm003 at 163.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing at multicorewareinc.com.
- *****************************************************************************/
-
-#include "primitives.h"
-#include "TLibCommon/TComRom.h"
-#include <xmmintrin.h> // SSE
-#include <pmmintrin.h> // SSE3
-#include <tmmintrin.h> // SSSE3
-#include <string.h>
-
-#if !HIGH_BIT_DEPTH
-namespace {
-template<int N>
-void filterHorizontal_ps(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
-{
-    src -= (N / 2 - 1);
-
-    int offset;
-    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
-    int shift = IF_FILTER_PREC;
-    shift -= headRoom;
-    offset = -IF_INTERNAL_OFFS << shift;
-
-    int row, col;
-
-    __m128i a = _mm_loadu_si128((__m128i*)coeff);
-    __m128i T10 = _mm_packs_epi16(a, a);
-
-    __m128i S1 = _mm_slli_si128(T10, 12);
-    __m128i S2 = _mm_srli_si128(S1, 4);
-    __m128i S3 = _mm_srli_si128(S2, 4);
-    __m128i S4 = _mm_srli_si128(S3, 4);
-    __m128i S = _mm_add_epi8(S1, _mm_add_epi8(S2, S3));
-    S =  _mm_add_epi8(S, S4);
-
-    __m128i Tm1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8);
-    __m128i Tm2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10);
-    __m128i Tm3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12);
-    __m128i Tm4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14);
-    __m128i Tm5 = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
-    __m128i Tm6 = _mm_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10);
-
-    for (row = 0; row < height; row++)
-    {
-        col = 0;
-        for (; col < (width - 7); col += 8)
-        {
-            __m128i srcCoeff = _mm_loadu_si128((__m128i*)(src + col));
-            __m128i sum;
-
-            if (N == 4)
-            {
-                __m128i T00 = _mm_shuffle_epi8(srcCoeff, Tm5);
-                __m128i T20 = _mm_maddubs_epi16(T00, S);
-
-                __m128i T30 = _mm_shuffle_epi8(srcCoeff, Tm6);
-                __m128i T40 = _mm_maddubs_epi16(T30, S);
-
-                sum = _mm_hadd_epi16(T20, T40);
-            }
-            else  // (N == 8)
-            {
-                __m128i T00 = _mm_shuffle_epi8(srcCoeff, Tm1);
-                __m128i T20 = _mm_maddubs_epi16(T00, T10);
-
-                __m128i T30 = _mm_shuffle_epi8(srcCoeff, Tm2);
-                __m128i T40 = _mm_maddubs_epi16(T30, T10);
-
-                __m128i T50 = _mm_shuffle_epi8(srcCoeff, Tm3);
-                __m128i T60 = _mm_maddubs_epi16(T50, T10);
-
-                __m128i T70 = _mm_shuffle_epi8(srcCoeff, Tm4);
-                __m128i T80 = _mm_maddubs_epi16(T70, T10);
-
-                __m128i s1 = _mm_hadd_epi16(T20, T40);
-                __m128i s2 = _mm_hadd_epi16(T60, T80);
-                sum = _mm_hadd_epi16(s1, s2);
-            }
-
-            __m128i sumOffset = _mm_set1_epi16(offset);
-            __m128i val = _mm_add_epi16(sum, sumOffset);
-
-            val = _mm_srai_epi16(val, shift);
-            _mm_storeu_si128((__m128i*)&dst[col], val);
-        }
-
-        for (; col < width; col++)                    // Remaining iterations
-        {
-            __m128i NewSrc = _mm_loadl_epi64((__m128i*)(src + col));
-            __m128i T00 = _mm_maddubs_epi16(NewSrc, T10);
-            __m128i add = _mm_hadd_epi16(T00, T00);
-            int16_t sum = _mm_extract_epi16(add, 0);
-            if (N == 8)
-            {
-                add = _mm_hadd_epi16(add, add);
-                sum = _mm_extract_epi16(add, 0);
-            }
-            int16_t val = (int16_t)(sum + offset) >> shift;
-            dst[col] = val;
-        }
-
-        src += srcStride;
-        dst += dstStride;
-    }
-}
-}
-#endif // if !HIGH_BIT_DEPTH
-
-namespace x265 {
-void Setup_Vec_IPFilterPrimitives_ssse3(EncoderPrimitives& p)
-{
-#if !HIGH_BIT_DEPTH
-    p.ipfilter_ps[FILTER_H_P_S_4] = filterHorizontal_ps<4>;
-    p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_ps<8>;
-#endif
-}
-}
diff -r 25f412ecaba2 -r 033a65692b6a source/common/vec/vec-primitives.cpp
--- a/source/common/vec/vec-primitives.cpp	Wed Dec 11 15:12:02 2013 -0600
+++ b/source/common/vec/vec-primitives.cpp	Thu Dec 12 13:53:33 2013 -0600
@@ -60,7 +60,6 @@ void Setup_Vec_DCTPrimitives_sse41(Encod
 void Setup_Vec_IPredPrimitives_ssse3(EncoderPrimitives&);
 void Setup_Vec_IPredPrimitives_sse41(EncoderPrimitives&);
 
-void Setup_Vec_IPFilterPrimitives_ssse3(EncoderPrimitives&);
 void Setup_Vec_IPFilterPrimitives_sse41(EncoderPrimitives&);
 
 /* Use primitives for the best available vector architecture */
@@ -77,7 +76,6 @@ void Setup_Instrinsic_Primitives(Encoder
     if (cpuMask & X265_CPU_SSSE3)
     {
         Setup_Vec_IPredPrimitives_ssse3(p);
-        Setup_Vec_IPFilterPrimitives_ssse3(p);
         Setup_Vec_DCTPrimitives_ssse3(p);
     }
 #endif
diff -r 25f412ecaba2 -r 033a65692b6a source/output/yuv.cpp
--- a/source/output/yuv.cpp	Wed Dec 11 15:12:02 2013 -0600
+++ b/source/output/yuv.cpp	Thu Dec 12 13:53:33 2013 -0600
@@ -55,11 +55,13 @@ bool YUVOutput::writePicture(const x265_
 {
     PPAStartCpuEventFunc(write_yuv);
 
+    uint64_t fileOffset = pic.poc;
+    fileOffset *= frameSize;
 #if HIGH_BIT_DEPTH
     if (depth == 8)
     {
         int shift = pic.bitDepth - 8;
-        ofs.seekp(pic.poc * frameSize);
+        ofs.seekp(fileOffset);
         for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
         {
             uint16_t *src = (uint16_t*)pic.planes[i];
@@ -77,7 +79,7 @@ bool YUVOutput::writePicture(const x265_
     }
     else
     {
-        ofs.seekp(pic.poc * frameSize * 2);
+        ofs.seekp(fileOffset * 2);
         for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
         {
             uint16_t *src = (uint16_t*)pic.planes[i];
@@ -89,7 +91,7 @@ bool YUVOutput::writePicture(const x265_
         }
     }
 #else // if HIGH_BIT_DEPTH
-    ofs.seekp(pic.poc * frameSize);
+    ofs.seekp(fileOffset);
     for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
     {
         char *src = (char*)pic.planes[i];


More information about the x265-commits mailing list