[x265] [PATCH] sse: fix overflow in sse_ss for 10 bit

Wed Sep 30 17:00:25 CEST 2015

Dynamic range, in bits (presumably a squared 10-bit difference summed over a 64x64 block):
10 + 10 + 6 + 6 = 32


so we don't need to upgrade sse_ret_t to 64 bits.


At 2015-09-30 18:56:51,"Divya Manivannan" <divya at multicorewareinc.com> wrote:
># HG changeset patch
># User Divya Manivannan <divya at multicorewareinc.com>
># Date 1443607299 -19800
>#      Wed Sep 30 15:31:39 2015 +0530
># Node ID aadec6615a3d5f33b4fdb00079e236b019ef1e95
># Parent  6e7761bdfe23addb862483f8407b388800de7d92
>sse: fix overflow in sse_ss for 10 bit
>
>diff -r 6e7761bdfe23 -r aadec6615a3d source/common/common.h
>--- a/source/common/common.h	Wed Sep 30 14:57:15 2015 +0530
>+++ b/source/common/common.h	Wed Sep 30 15:31:39 2015 +0530
>@@ -135,7 +135,7 @@
> typedef int32_t  ssum2_t; // Signed sum
> #endif // if HIGH_BIT_DEPTH
> 
>-#if X265_DEPTH <= 10
>+#if X265_DEPTH < 10
> typedef uint32_t sse_ret_t;
> #else
> typedef uint64_t sse_ret_t;
>diff -r 6e7761bdfe23 -r aadec6615a3d source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Wed Sep 30 14:57:15 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp	Wed Sep 30 15:31:39 2015 +0530
>@@ -1006,10 +1006,11 @@
>         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
>         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
>         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
>-#if X265_DEPTH <= 10
>-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
>-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
>-#endif
>+
>+        // sse_ss primitive need to be fixed for 10 and 12 bit
>+        //p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
>+        //ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
>+
>         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
>         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
>         p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
>@@ -1535,11 +1536,12 @@
>         p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
>         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
> 
>+        // sse_ss primitive need to be fixed for 10 and 12 bit
>+        //p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
>+        //p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
>+        //p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
>+
> #if X265_DEPTH <= 10
>-        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
>-        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
>-        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
>-
>         p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
>         p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
>         p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx2);
>diff -r 6e7761bdfe23 -r aadec6615a3d source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h	Wed Sep 30 14:57:15 2015 +0530
>+++ b/source/common/x86/pixel.h	Wed Sep 30 15:31:39 2015 +0530
>@@ -39,7 +39,7 @@
> pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
> 
> #define DECL_PIXELS(cpu) \
>-    FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
>+    FUNCDEF_PU(sse_ret_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
>     FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
>     FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
>     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
>@@ -48,7 +48,7 @@
>     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
>     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
>     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
>-    FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
>+    FUNCDEF_CHROMA_PU(sse_ret_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
>     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
>     FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
>     FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
>diff -r 6e7761bdfe23 -r aadec6615a3d source/encoder/rdcost.h
>--- a/source/encoder/rdcost.h	Wed Sep 30 14:57:15 2015 +0530
>+++ b/source/encoder/rdcost.h	Wed Sep 30 15:31:39 2015 +0530
>@@ -91,7 +91,7 @@
> 
>     inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
>     {
>-#if X265_DEPTH <= 10
>+#if X265_DEPTH < 10
>         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
>                    "calcRdCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n",
>                    distortion, bits, m_lambda2);
>@@ -130,7 +130,7 @@
> 
>     inline sse_ret_t scaleChromaDist(uint32_t plane, sse_ret_t dist) const
>     {
>-#if X265_DEPTH <= 10
>+#if X265_DEPTH < 10
>         X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
>                    "scaleChromaDist wrap detected dist: %u, lambda: %u\n",
>                    dist, m_chromaDistWeight[plane - 1]);
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150930/d5a74a5c/attachment.html>