[x265-commits] [x265] testbench: update for blockcopy, these function use dynamic range [0,255]
Min Chen
chenm003 at 163.com
Fri Nov 8 09:58:47 CET 2013
details: http://hg.videolan.org/x265/rev/c5e633516217
branches:
changeset: 4945:c5e633516217
user: Min Chen <chenm003 at 163.com>
date: Fri Nov 08 15:53:53 2013 +0800
description:
testbench: update for blockcopy, these function use dynamic range [0,255]
Subject: [x265] fix bug in sse_sp12
details: http://hg.videolan.org/x265/rev/f76b591b7aef
branches:
changeset: 4946:f76b591b7aef
user: Min Chen <chenm003 at 163.com>
date: Fri Nov 08 15:55:50 2013 +0800
description:
fix bug in sse_sp12
Subject: [x265] linux: re-enable sse_12x16 for clang and gcc
details: http://hg.videolan.org/x265/rev/94cba84de8dd
branches:
changeset: 4947:94cba84de8dd
user: Steve Borho <steve at borho.org>
date: Fri Nov 08 02:01:52 2013 -0600
description:
linux: re-enable sse_12x16 for clang and gcc
Subject: [x265] primitives: remove unused ipfilter_pp_t funcdef and C and intrinsic primitives
details: http://hg.videolan.org/x265/rev/fef74c2e329d
branches:
changeset: 4948:fef74c2e329d
user: Steve Borho <steve at borho.org>
date: Fri Nov 08 02:57:47 2013 -0600
description:
primitives: remove unused ipfilter_pp_t funcdef and C and intrinsic primitives
These are now completely replaced by block based assembly code
diffstat:
source/common/ipfilter.cpp | 86 ----
source/common/primitives.h | 13 +-
source/common/vec/ipfilter-sse41.cpp | 607 -----------------------------------
source/common/vec/pixel-sse41.cpp | 3 +-
source/test/ipfilterharness.cpp | 74 ----
source/test/ipfilterharness.h | 1 -
source/test/pixelharness.cpp | 26 +-
source/test/pixelharness.h | 2 +-
8 files changed, 18 insertions(+), 794 deletions(-)
diffs (truncated from 1039 to 300 lines):
diff -r fd721a5ba063 -r fef74c2e329d source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Thu Nov 07 21:27:45 2013 -0600
+++ b/source/common/ipfilter.cpp Fri Nov 08 02:57:47 2013 -0600
@@ -80,47 +80,6 @@ void filterVertical_sp_c(int16_t *src, i
}
template<int N>
-void filterHorizontal_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
-{
- int headRoom = IF_FILTER_PREC;
- int offset = (1 << (headRoom - 1));
- uint16_t maxVal = (1 << X265_DEPTH) - 1;
- const int cStride = 1;
-
- src -= (N / 2 - 1) * cStride;
-
- int row, col;
- for (row = 0; row < height; row++)
- {
- for (col = 0; col < width; col++)
- {
- int sum;
-
- sum = src[col + 0 * cStride] * coeff[0];
- sum += src[col + 1 * cStride] * coeff[1];
- sum += src[col + 2 * cStride] * coeff[2];
- sum += src[col + 3 * cStride] * coeff[3];
- if (N == 8)
- {
- sum += src[col + 4 * cStride] * coeff[4];
- sum += src[col + 5 * cStride] * coeff[5];
- sum += src[col + 6 * cStride] * coeff[6];
- sum += src[col + 7 * cStride] * coeff[7];
- }
-
- int16_t val = (int16_t)(sum + offset) >> headRoom;
-
- if (val < 0) val = 0;
- if (val > maxVal) val = maxVal;
- dst[col] = (pixel)val;
- }
-
- src += srcStride;
- dst += dstStride;
- }
-}
-
-template<int N>
void filterVertical_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int coefIdx)
{
const int16_t *const c = (N == 8 ? g_lumaFilter[coefIdx] : g_chromaFilter[coefIdx]);
@@ -289,47 +248,6 @@ void filterConvertPelToShort_c(pixel *sr
}
}
-template<int N>
-void filterVertical_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *c)
-{
- int shift = IF_FILTER_PREC;
- int offset = 1 << (shift - 1);
- uint16_t maxVal = (1 << X265_DEPTH) - 1;
-
- src -= (N / 2 - 1) * srcStride;
-
- int row, col;
-
- for (row = 0; row < height; row++)
- {
- for (col = 0; col < width; col++)
- {
- int sum;
-
- sum = src[col + 0 * srcStride] * c[0];
- sum += src[col + 1 * srcStride] * c[1];
- sum += src[col + 2 * srcStride] * c[2];
- sum += src[col + 3 * srcStride] * c[3];
- if (N == 8)
- {
- sum += src[col + 4 * srcStride] * c[4];
- sum += src[col + 5 * srcStride] * c[5];
- sum += src[col + 6 * srcStride] * c[6];
- sum += src[col + 7 * srcStride] * c[7];
- }
-
- int16_t val = (int16_t)((sum + offset) >> shift);
- val = (val < 0) ? 0 : val;
- val = (val > maxVal) ? maxVal : val;
-
- dst[col] = (pixel)val;
- }
-
- src += srcStride;
- dst += dstStride;
- }
-}
-
void extendCURowColBorder(pixel* txt, intptr_t stride, int width, int height, int marginX)
{
for (int y = 0; y < height; y++)
@@ -586,16 +504,12 @@ void Setup_C_IPFilterPrimitives(EncoderP
LUMA(16, 64);
CHROMA(8, 32);
- p.ipfilter_pp[FILTER_H_P_P_8] = filterHorizontal_pp_c<8>;
p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_ps_c<8>;
p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_ps_c<8>;
p.ipfilter_sp[FILTER_V_S_P_8] = filterVertical_sp_c<8>;
- p.ipfilter_pp[FILTER_H_P_P_4] = filterHorizontal_pp_c<4>;
p.ipfilter_ps[FILTER_H_P_S_4] = filterHorizontal_ps_c<4>;
p.ipfilter_ps[FILTER_V_P_S_4] = filterVertical_ps_c<4>;
p.ipfilter_sp[FILTER_V_S_P_4] = filterVertical_sp_c<4>;
- p.ipfilter_pp[FILTER_V_P_P_8] = filterVertical_pp_c<8>;
- p.ipfilter_pp[FILTER_V_P_P_4] = filterVertical_pp_c<4>;
p.ipfilter_ss[FILTER_V_S_S_8] = filterVertical_ss_c<8>;
p.ipfilter_ss[FILTER_V_S_S_4] = filterVertical_ss_c<4>;
diff -r fd721a5ba063 -r fef74c2e329d source/common/primitives.h
--- a/source/common/primitives.h Thu Nov 07 21:27:45 2013 -0600
+++ b/source/common/primitives.h Fri Nov 08 02:57:47 2013 -0600
@@ -114,15 +114,6 @@ enum IDcts
NUM_IDCTS
};
-enum IPFilterConf_P_P
-{
- FILTER_H_P_P_8,
- FILTER_H_P_P_4,
- FILTER_V_P_P_8,
- FILTER_V_P_P_4,
- NUM_IPFILTER_P_P
-};
-
enum IPFilterConf_P_S
{
FILTER_H_P_S_8,
@@ -163,7 +154,6 @@ typedef int (*pixelcmp_ss_t)(int16_t *f
typedef int (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res);
typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
-typedef void (*ipfilter_pp_t)(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
typedef void (*ipfilter_ss_t)(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
@@ -244,13 +234,11 @@ struct EncoderPrimitives
copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
copy_sp_t chroma_copy_sp[NUM_CHROMA_PARTITIONS];
- ipfilter_pp_t ipfilter_pp[NUM_IPFILTER_P_P];
ipfilter_ps_t ipfilter_ps[NUM_IPFILTER_P_S];
ipfilter_sp_t ipfilter_sp[NUM_IPFILTER_S_P];
ipfilter_ss_t ipfilter_ss[NUM_IPFILTER_S_S];
ipfilter_p2s_t ipfilter_p2s;
ipfilter_s2p_t ipfilter_s2p;
- extendCURowBorder_t extendRowBorder;
filter_pp_t chroma_hpp[NUM_CHROMA_PARTITIONS];
filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS];
filter_ps_t luma_hps[NUM_LUMA_PARTITIONS];
@@ -260,6 +248,7 @@ struct EncoderPrimitives
filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS];
filter_p2s_t luma_p2s;
filter_p2s_t chroma_p2s;
+ extendCURowBorder_t extendRowBorder;
intra_dc_t intra_pred_dc;
intra_planar_t intra_pred_planar;
diff -r fd721a5ba063 -r fef74c2e329d source/common/vec/ipfilter-sse41.cpp
--- a/source/common/vec/ipfilter-sse41.cpp Thu Nov 07 21:27:45 2013 -0600
+++ b/source/common/vec/ipfilter-sse41.cpp Fri Nov 08 02:57:47 2013 -0600
@@ -371,310 +371,7 @@ void filterVertical_sp(int16_t *src, int
sump = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
*(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(sump); \
}
-
-template<int N>
-void filterVertical_pp(pixel *src, intptr_t srcStride,
- pixel *dst, intptr_t dstStride,
- int width, int height,
- const int16_t *coeff)
-{
- src -= (N / 2 - 1) * srcStride;
-
- const __m128i coeffTemp = _mm_loadu_si128((__m128i const*)coeff);
-
- int row, col;
-
- if (N == 4)
- assert(height % 2 == 0);
-
- uint32_t leftCols = (8 - (width & 7)) * 8;
- uint32_t mask_shift = ((uint32_t) ~0 >> leftCols);
- uint32_t mask0 = (width & 7) <= 4 ? mask_shift : ~0;
- uint32_t mask1 = (width & 7) <= 4 ? 0 : mask_shift;
- __m128i leftmask = _mm_setr_epi32(mask0, mask1, 0, 0);
-
- if (N == 8)
- {
- __m128i vm01 = _mm_shuffle_epi32(coeffTemp, 0x00);
- __m128i vm23 = _mm_shuffle_epi32(coeffTemp, 0x55);
- __m128i vm45 = _mm_shuffle_epi32(coeffTemp, 0xAA);
- __m128i vm67 = _mm_shuffle_epi32(coeffTemp, 0xFF);
- vm01 = _mm_packs_epi16(vm01, vm01);
- vm23 = _mm_packs_epi16(vm23, vm23);
- vm45 = _mm_packs_epi16(vm45, vm45);
- vm67 = _mm_packs_epi16(vm67, vm67);
-
- __m128i T00, T01, T02, T03, T04, T05, T06, T07 /*, T08*/;
- __m128i T10, T11, T12, T13;
- for (row = 0; row < height; row += 1)
- {
- for (col = 0; col < (width & ~7); col += 8)
- {
- T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
- T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
- T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
- T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
- T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
- T05 = _mm_loadl_epi64((__m128i*)&src[(5) * srcStride + col]);
- T06 = _mm_loadl_epi64((__m128i*)&src[(6) * srcStride + col]);
- T07 = _mm_loadl_epi64((__m128i*)&src[(7) * srcStride + col]);
-
- T10 = _mm_unpacklo_epi8(T00, T01);
- T11 = _mm_unpacklo_epi8(T02, T03);
- T12 = _mm_unpacklo_epi8(T04, T05);
- T13 = _mm_unpacklo_epi8(T06, T07);
-
- T10 = _mm_maddubs_epi16(T10, vm01);
- T11 = _mm_maddubs_epi16(T11, vm23);
- T12 = _mm_maddubs_epi16(T12, vm45);
- T13 = _mm_maddubs_epi16(T13, vm67);
- T10 = _mm_add_epi16(T10, T11);
- T11 = _mm_add_epi16(T12, T13);
- T10 = _mm_add_epi16(T10, T11);
- T10 = _mm_mulhrs_epi16(T10, _mm_load_si128((__m128i*)c_512));
- T10 = _mm_packus_epi16(T10, T10);
- _mm_storel_epi64((__m128i*)&dst[0 * dstStride + col], T10);
- }
-
- assert((width - col) < 8);
- if (col != width)
- {
- T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
- T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
- T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
- T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
- T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
- T05 = _mm_loadl_epi64((__m128i*)&src[(5) * srcStride + col]);
- T06 = _mm_loadl_epi64((__m128i*)&src[(6) * srcStride + col]);
- T07 = _mm_loadl_epi64((__m128i*)&src[(7) * srcStride + col]);
-
- T10 = _mm_unpacklo_epi8(T00, T01);
- T11 = _mm_unpacklo_epi8(T02, T03);
- T12 = _mm_unpacklo_epi8(T04, T05);
- T13 = _mm_unpacklo_epi8(T06, T07);
-
- T10 = _mm_maddubs_epi16(T10, vm01);
- T11 = _mm_maddubs_epi16(T11, vm23);
- T12 = _mm_maddubs_epi16(T12, vm45);
- T13 = _mm_maddubs_epi16(T13, vm67);
- T10 = _mm_add_epi16(T10, T11);
- T11 = _mm_add_epi16(T12, T13);
- T10 = _mm_add_epi16(T10, T11);
- T10 = _mm_mulhrs_epi16(T10, _mm_load_si128((__m128i*)c_512));
- T10 = _mm_packus_epi16(T10, T10);
- _mm_maskmoveu_si128(T10, leftmask, (char*)&dst[(0) * dstStride + col]);
- }
-
- src += 1 * srcStride;
- dst += 1 * dstStride;
- } // end of row loop
- } // end of N==8
-
- if (N == 4)
- {
- __m128i vm01 = _mm_shuffle_epi32(coeffTemp, 0x00);
- __m128i vm23 = _mm_shuffle_epi32(coeffTemp, 0x55);
- vm01 = _mm_packs_epi16(vm01, vm01);
- vm23 = _mm_packs_epi16(vm23, vm23);
-
- __m128i T00, T01, T02, T03, T04;
- __m128i T10, T11;
- __m128i T20, T21;
- for (row = 0; row < height; row += 2)
- {
- for (col = 0; col < (width & ~7); col += 8)
- {
- T00 = _mm_loadl_epi64((__m128i*)&src[(0) * srcStride + col]);
- T01 = _mm_loadl_epi64((__m128i*)&src[(1) * srcStride + col]);
- T02 = _mm_loadl_epi64((__m128i*)&src[(2) * srcStride + col]);
- T03 = _mm_loadl_epi64((__m128i*)&src[(3) * srcStride + col]);
- T04 = _mm_loadl_epi64((__m128i*)&src[(4) * srcStride + col]);
-
- T10 = _mm_unpacklo_epi8(T00, T01);
- T11 = _mm_unpacklo_epi8(T02, T03);
-
- T10 = _mm_maddubs_epi16(T10, vm01);
- T11 = _mm_maddubs_epi16(T11, vm23);
- T10 = _mm_add_epi16(T10, T11);
- T10 = _mm_mulhrs_epi16(T10, _mm_load_si128((__m128i*)c_512));
- T10 = _mm_packus_epi16(T10, T10);
- _mm_storel_epi64((__m128i*)&dst[0 * dstStride + col], T10);
More information about the x265-commits mailing list