<div dir="ltr">Steve, I have added partition based call for all size s of chroma function can you tell whether it is right direction, if it so I will replace the luma functions similarly. I have few question about approach about combining functions as you said,<div>
<div>[12:29:36 PM] Steve Borho: if YFrac is 0, it would just do lumaH_pp. If XFrac is 0, it would just do lumaV_pp. else it does src -> lumaH_ps -> temp -> lumaV_sp -> dst </div><div><br></div><div>Does YFrac and XFrac are template parameters? I think we need to combine both functions C codes in a single function and pass an extra buffer temp as an argument, further we have to modify intrinsic and asm code too?</div>
</div><div><br></div><div>Regards,</div><div>Praveen Tiwari</div></div><div class="gmail_extra"><br><br><div class="gmail_quote">On Wed, Oct 9, 2013 at 7:24 PM, <span dir="ltr"><<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Praveen Tiwari<br>
# Date 1381326812 -19800<br>
# Node ID 37b42347a5baefe11822888d385e4c8422f4f3f3<br>
# Parent fc7fbdd18bc0d6d7f98180332e065d83c054fe02<br>
Chroma function, partion based call<br>
<br>
diff -r fc7fbdd18bc0 -r 37b42347a5ba source/common/ipfilter.cpp<br>
--- a/source/common/ipfilter.cpp Wed Oct 09 00:00:10 2013 -0500<br>
+++ b/source/common/ipfilter.cpp Wed Oct 09 19:23:32 2013 +0530<br>
@@ -34,6 +34,56 @@<br>
#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions<br>
#endif<br>
<br>
+#define SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W2(FUNC_PREFIX, WIDTH) \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x4] = FUNC_PREFIX<4, WIDTH, 4>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x8] = FUNC_PREFIX<4, WIDTH, 8>;<br>
+<br>
+#define SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W4(FUNC_PREFIX, WIDTH) \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x2] = FUNC_PREFIX<4, WIDTH, 2>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x4] = FUNC_PREFIX<4, WIDTH, 4>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x8] = FUNC_PREFIX<4, WIDTH, 8>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x16] = FUNC_PREFIX<4, WIDTH, 16>;<br>
+<br>
+#define SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W6(FUNC_PREFIX, WIDTH) \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x8] = FUNC_PREFIX<4, WIDTH, 8>;<br>
+<br>
+#define SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W8(FUNC_PREFIX, WIDTH) \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x2] = FUNC_PREFIX<4, WIDTH, 2>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x4] = FUNC_PREFIX<4, WIDTH, 4>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x6] = FUNC_PREFIX<4, WIDTH, 6>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x8] = FUNC_PREFIX<4, WIDTH, 8>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x16] = FUNC_PREFIX<4, WIDTH, 16>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x32] = FUNC_PREFIX<4, WIDTH, 32>;<br>
+<br>
+#define SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W12(FUNC_PREFIX, WIDTH) \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x16] = FUNC_PREFIX<4, WIDTH, 16>;<br>
+<br>
+#define SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W16(FUNC_PREFIX, WIDTH) \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x4] = FUNC_PREFIX<4, WIDTH, 4>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x8] = FUNC_PREFIX<4, WIDTH, 8>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x12] = FUNC_PREFIX<4, WIDTH, 12>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x16] = FUNC_PREFIX<4, WIDTH, 16>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x32] = FUNC_PREFIX<4, WIDTH, 32>;<br>
+<br>
+#define SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W24(FUNC_PREFIX, WIDTH) \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x32] = FUNC_PREFIX<4, WIDTH, 32>;<br>
+<br>
+#define SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W32(FUNC_PREFIX, WIDTH) \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x8] = FUNC_PREFIX<4, WIDTH, 8>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x16] = FUNC_PREFIX<4, WIDTH, 16>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x24] = FUNC_PREFIX<4, WIDTH, 24>; \<br>
+ p.FUNC_PREFIX[CHROMA_HORIZONTAL_PP_PARTITION_ ## WIDTH ## x32] = FUNC_PREFIX<4, WIDTH, 32>;<br>
+<br>
+#define SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE(FUNC_PREFIX) \<br>
+ SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W2(FUNC_PREFIX, 2) \<br>
+ SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W4(FUNC_PREFIX, 4) \<br>
+ SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W6(FUNC_PREFIX, 6) \<br>
+ SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W8(FUNC_PREFIX, 8) \<br>
+ SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W12(FUNC_PREFIX, 12) \<br>
+ SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W16(FUNC_PREFIX, 16) \<br>
+ SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W24(FUNC_PREFIX, 24) \<br>
+ SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE_SUBSET_W32(FUNC_PREFIX, 32) \<br>
+<br>
namespace {<br>
template<int N><br>
void filterVertical_s_p(short *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)<br>
@@ -88,8 +138,8 @@<br>
}<br>
}<br>
<br>
-template<int N><br>
-void filterHorizontal_p_p(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)<br>
+template<int N, int width, int height><br>
+void filterHorizontal_p_p(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, short const *coeff)<br>
{<br>
int cStride = 1;<br>
<br>
@@ -500,11 +550,13 @@<br>
<br>
void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)<br>
{<br>
+<br>
+ SET_CHROMA_HORIZONTAL_PP_FUNC_PRIMITIVE_TABLE(filterHorizontal_p_p)<br>
+<br>
p.ipfilter_pp[FILTER_H_P_P_8] = filterHorizontal_p_p<8>;<br>
p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_p_s<8>;<br>
p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_p_s<8>;<br>
p.ipfilter_sp[FILTER_V_S_P_8] = filterVertical_s_p<8>;<br>
- p.ipfilter_pp[FILTER_H_P_P_4] = filterHorizontal_p_p<4>;<br>
p.ipfilter_ps[FILTER_H_P_S_4] = filterHorizontal_p_s<4>;<br>
p.ipfilter_ps[FILTER_V_P_S_4] = filterVertical_p_s<4>;<br>
p.ipfilter_sp[FILTER_V_S_P_4] = filterVertical_s_p<4>;<br>
diff -r fc7fbdd18bc0 -r 37b42347a5ba source/common/primitives.h<br>
--- a/source/common/primitives.h Wed Oct 09 00:00:10 2013 -0500<br>
+++ b/source/common/primitives.h Wed Oct 09 19:23:32 2013 +0530<br>
@@ -89,6 +89,17 @@<br>
NUM_PARTITIONS<br>
};<br>
<br>
+enum ChromaPartions<br>
+{<br>
+ CHROMA_HORIZONTAL_PP_PARTITION_2x4, CHROMA_HORIZONTAL_PP_PARTITION_2x8, CHROMA_HORIZONTAL_PP_PARTITION_4x2, CHROMA_HORIZONTAL_PP_PARTITION_4x4,<br>
+ CHROMA_HORIZONTAL_PP_PARTITION_4x8, CHROMA_HORIZONTAL_PP_PARTITION_4x16, CHROMA_HORIZONTAL_PP_PARTITION_8x2, CHROMA_HORIZONTAL_PP_PARTITION_8x4,<br>
+ CHROMA_HORIZONTAL_PP_PARTITION_8x6, CHROMA_HORIZONTAL_PP_PARTITION_8x8, CHROMA_HORIZONTAL_PP_PARTITION_8x16, CHROMA_HORIZONTAL_PP_PARTITION_8x32,<br>
+ CHROMA_HORIZONTAL_PP_PARTITION_6x8, CHROMA_HORIZONTAL_PP_PARTITION_12x16, CHROMA_HORIZONTAL_PP_PARTITION_16x4, CHROMA_HORIZONTAL_PP_PARTITION_16x8,<br>
+ CHROMA_HORIZONTAL_PP_PARTITION_16x12, CHROMA_HORIZONTAL_PP_PARTITION_16x16, CHROMA_HORIZONTAL_PP_PARTITION_16x32, CHROMA_HORIZONTAL_PP_PARTITION_24x32,<br>
+ CHROMA_HORIZONTAL_PP_PARTITION_32x8, CHROMA_HORIZONTAL_PP_PARTITION_32x16, CHROMA_HORIZONTAL_PP_PARTITION_32x24, CHROMA_HORIZONTAL_PP_PARTITION_32x32,<br>
+ NUM_CHROMA_HORIZONTAL_PP_PARTITIONS<br>
+};<br>
+<br>
enum SquareBlocks // Routines can be indexed using log2n(width)<br>
{<br>
BLOCK_4x4,<br>
@@ -205,6 +216,8 @@<br>
typedef void (*ssim_4x4x2_core_t)(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, ssim_t sums[2][4]);<br>
typedef float (*ssim_end4_t)(ssim_t sum0[5][4], ssim_t sum1[5][4], int width);<br>
<br>
+typedef void (*chromaFilterHoriz_pp)(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, const short *coeff); // Modified argument list for chroma filter, removed width and height.<br>
+<br>
/* Define a structure containing function pointers to optimized encoder<br>
* primitives. Each pointer can reference either an assembly routine,<br>
* a vectorized primitive, or a C function. */<br>
@@ -265,6 +278,8 @@<br>
downscale_t frame_init_lowres_core;<br>
ssim_4x4x2_core_t ssim_4x4x2_core;<br>
ssim_end4_t ssim_end_4;<br>
+<br>
+ chromaFilterHoriz_pp filterHorizontal_p_p[NUM_CHROMA_HORIZONTAL_PP_PARTITIONS];<br>
};<br>
<br>
/* This copy of the table is what gets used by the encoder.<br>
diff -r fc7fbdd18bc0 -r 37b42347a5ba source/common/vec/ipfilter-sse41.cpp<br>
--- a/source/common/vec/ipfilter-sse41.cpp Wed Oct 09 00:00:10 2013 -0500<br>
+++ b/source/common/vec/ipfilter-sse41.cpp Wed Oct 09 19:23:32 2013 +0530<br>
@@ -1541,8 +1541,8 @@<br>
-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0<br>
};<br>
<br>
-template<int N><br>
-void filterHorizontal_p_p(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)<br>
+template<int N, int width, int height><br>
+void filterHorizontal_p_p(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, short const *coeff)<br>
{<br>
assert(X265_DEPTH == 8);<br>
<br>
@@ -1656,9 +1656,35 @@<br>
}<br>
<br>
namespace x265 {<br>
+#define SETUP_PARTITION(W, H) \<br>
+ p.filterHorizontal_p_p[CHROMA_HORIZONTAL_PP_PARTITION_##W##x##H] = filterHorizontal_p_p##<4, W, H>;<br>
+<br>
void Setup_Vec_IPFilterPrimitives_sse41(EncoderPrimitives& p)<br>
{<br>
- p.ipfilter_pp[FILTER_H_P_P_4] = filterHorizontal_p_p<4>;<br>
+ SETUP_PARTITION(2, 4);<br>
+ SETUP_PARTITION(2, 8);<br>
+ SETUP_PARTITION(4, 2);<br>
+ SETUP_PARTITION(4, 4);<br>
+ SETUP_PARTITION(4, 8);<br>
+ SETUP_PARTITION(4, 16);<br>
+ SETUP_PARTITION(6, 8);<br>
+ SETUP_PARTITION(8, 2);<br>
+ SETUP_PARTITION(8, 4);<br>
+ SETUP_PARTITION(8, 6);<br>
+ SETUP_PARTITION(8, 8);<br>
+ SETUP_PARTITION(8, 16);<br>
+ SETUP_PARTITION(12, 16);<br>
+ SETUP_PARTITION(16, 4);<br>
+ SETUP_PARTITION(16, 8);<br>
+ SETUP_PARTITION(16, 12);<br>
+ SETUP_PARTITION(16, 16);<br>
+ SETUP_PARTITION(16, 32);<br>
+ SETUP_PARTITION(24, 32);<br>
+ SETUP_PARTITION(32, 8);<br>
+ SETUP_PARTITION(32, 16);<br>
+ SETUP_PARTITION(32, 24);<br>
+ SETUP_PARTITION(32, 32);<br>
+<br>
p.ipfilter_pp[FILTER_H_P_P_8] = filterHorizontal_p_p<8>;<br>
<br>
p.ipfilter_pp[FILTER_V_P_P_4] = filterVertical_p_p<4>;<br>
</blockquote></div><br></div>