[x265] [PATCH] weighted prediction pixel, interface simplification

praveen at multicorewareinc.com praveen at multicorewareinc.com
Mon Oct 20 10:23:22 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1413793389 -19800
# Node ID 3366be6ef59eec3d3ca69ed52942708b5d1b3bc6
# Parent  1e09d0395826bdd01a4b4e46569853a2f04b9e95
weighted prediction pixel, interface simplification

diff -r 1e09d0395826 -r 3366be6ef59e source/common/pixel.cpp
--- a/source/common/pixel.cpp	Sun Oct 19 20:53:36 2014 -0500
+++ b/source/common/pixel.cpp	Mon Oct 20 13:53:09 2014 +0530
@@ -640,11 +640,13 @@
     }
 }
 
-void weight_pp_c(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
 {
     int x, y;
 
     X265_CHECK(!(width & 15), "weightp alignment error\n");
+    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
+    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
 
     for (y = 0; y <= height - 1; y++)
     {
@@ -656,8 +658,8 @@
             x++;
         }
 
-        src += srcStride;
-        dst += dstStride;
+        src += stride;
+        dst += stride;
     }
 }
 
diff -r 1e09d0395826 -r 3366be6ef59e source/common/primitives.h
--- a/source/common/primitives.h	Sun Oct 19 20:53:36 2014 -0500
+++ b/source/common/primitives.h	Mon Oct 20 13:53:09 2014 +0530
@@ -168,7 +168,7 @@
 typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 typedef int  (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
 
-typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
 typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
 typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc,
diff -r 1e09d0395826 -r 3366be6ef59e source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Sun Oct 19 20:53:36 2014 -0500
+++ b/source/common/x86/pixel-util.h	Mon Oct 20 13:53:09 2014 +0530
@@ -57,7 +57,7 @@
 void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
 
-void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
 void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 
 void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
diff -r 1e09d0395826 -r 3366be6ef59e source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Sun Oct 19 20:53:36 2014 -0500
+++ b/source/common/x86/pixel-util8.asm	Mon Oct 20 13:53:09 2014 +0530
@@ -1298,36 +1298,29 @@
 
 
 ;-----------------------------------------------------------------------------------------------------------------------------------------------
-;void weight_pp(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
 ;-----------------------------------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal weight_pp, 6, 7, 6
 
+    shl         r5d, 6      ; r5d = w0<<6
     mov         r6d, r6m
-    shl         r6d, 6
-    movd        m0, r6d         ; m0 = [w0<<6]
-
-    movd        m1, r7m         ; m1 = [round]
-    punpcklwd   m0, m1          ; assuming both (w0<<6) and round are using maximum of 16 bits each.
-    pshufd      m0, m0, 0       ; m0 = [w0<<6 round]
-
-    movd        m1, r8m
-
-    movd        m2, r9m
+    shl         r6d, 16
+    or          r6d, r5d    ; assuming both (w0<<6) and round are using maximum of 16 bits each.
+    movd        m0, r6d
+    pshufd      m0, m0, 0   ; m0 = [w0<<6, round]
+    movd        m1, r7m
+    movd        m2, r8m
     pshufd      m2, m2, 0
-
     mova        m5, [pw_1]
-
-    sub         r2d, r4d
-    sub         r3d, r4d
+    sub         r2d, r3d
+    shr         r3d, 4
 
 .loopH:
-    mov         r6d, r4d
-    shr         r6d, 4
+    mov         r5d, r3d
+
 .loopW:
-    movh        m4, [r0]
-    pmovzxbw    m4, m4
-
+    pmovzxbw    m4, [r0]
     punpcklwd   m3, m4, m5
     pmaddwd     m3, m0
     psrad       m3, m1
@@ -1340,12 +1333,9 @@
 
     packssdw    m3, m4
     packuswb    m3, m3
-
     movh        [r1], m3
 
-    movh        m4, [r0 + 8]
-    pmovzxbw    m4, m4
-
+    pmovzxbw    m4, [r0 + 8]
     punpcklwd   m3, m4, m5
     pmaddwd     m3, m0
     psrad       m3, m1
@@ -1358,21 +1348,19 @@
 
     packssdw    m3, m4
     packuswb    m3, m3
-
     movh        [r1 + 8], m3
 
     add         r0, 16
     add         r1, 16
 
-    dec         r6d
+    dec         r5d
     jnz         .loopW
 
     lea         r0, [r0 + r2]
-    lea         r1, [r1 + r3]
-
-    dec         r5d
+    lea         r1, [r1 + r2]
+
+    dec         r4d
     jnz         .loopH
-
     RET
 
 ;-------------------------------------------------------------------------------------------------------------------------------------------------
diff -r 1e09d0395826 -r 3366be6ef59e source/encoder/reference.cpp
--- a/source/encoder/reference.cpp	Sun Oct 19 20:53:36 2014 -0500
+++ b/source/encoder/reference.cpp	Mon Oct 20 13:53:09 2014 +0530
@@ -92,7 +92,7 @@
     // Computing weighted CU rows
     int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
     int padwidth = (width + 15) & ~15;  // weightp assembly needs even 16 byte widths
-    primitives.weight_pp(src, dst, lumaStride, lumaStride, padwidth, height,
+    primitives.weight_pp(src, dst, lumaStride, padwidth, height,
                          weight, round << correction, shift + correction, offset);
 
     // Extending Left & Right
diff -r 1e09d0395826 -r 3366be6ef59e source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Sun Oct 19 20:53:36 2014 -0500
+++ b/source/encoder/slicetype.cpp	Mon Oct 20 13:53:09 2014 +0530
@@ -1383,7 +1383,7 @@
         int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
         int widthHeight = (int)stride;
 
-        primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, widthHeight, m_paddedLines,
+        primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
                              scale, round << correction, denom + correction, offset);
         src = m_weightedRef.fpelPlane;
     }
@@ -1481,7 +1481,7 @@
         int widthHeight = (int)stride;
 
         for (int i = 0; i < 4; i++)
-            primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, widthHeight, m_paddedLines,
+            primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines,
                                  scale, round << correction, denom + correction, offset);
 
         m_weightedRef.isWeighted = true;
diff -r 1e09d0395826 -r 3366be6ef59e source/encoder/weightPrediction.cpp
--- a/source/encoder/weightPrediction.cpp	Sun Oct 19 20:53:36 2014 -0500
+++ b/source/encoder/weightPrediction.cpp	Mon Oct 20 13:53:09 2014 +0530
@@ -186,7 +186,7 @@
         int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
         int pwidth = ((width + 15) >> 4) << 4;
 
-        primitives.weight_pp(ref, weightTemp, stride, stride, pwidth, height,
+        primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
                              weight, round << correction, denom + correction, offset);
         ref = weightTemp;
     }
diff -r 1e09d0395826 -r 3366be6ef59e source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Sun Oct 19 20:53:36 2014 -0500
+++ b/source/test/pixelharness.cpp	Mon Oct 20 13:53:09 2014 +0530
@@ -334,8 +334,8 @@
     for (int i = 0; i < ITERS; i++)
     {
         int index = i % TEST_CASES;
-        checked(opt, pixel_test_buff[index] + j, opt_dest, stride, stride, width, height, w0, round, shift, offset);
-        ref(pixel_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round, shift, offset);
+        checked(opt, pixel_test_buff[index] + j, opt_dest, stride, width, height, w0, round, shift, offset);
+        ref(pixel_test_buff[index] + j, ref_dest, stride, width, height, w0, round, shift, offset);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
             return false;
@@ -1775,7 +1775,7 @@
     if (opt.weight_pp)
     {
         HEADER0("weight_pp");
-        REPORT_SPEEDUP(opt.weight_pp, ref.weight_pp, pbuf1, pbuf2, 64, 64, 32, 32, 128, 1 << 9, 10, 100);
+        REPORT_SPEEDUP(opt.weight_pp, ref.weight_pp, pbuf1, pbuf2, 64, 32, 32, 128, 1 << 9, 10, 100);
     }
 
     if (opt.weight_sp)


More information about the x265-devel mailing list