[x265] [PATCH] TComYuv::copyFromPicLuma, asm integration for chroma blockcopy_pp

praveen at multicorewareinc.com praveen at multicorewareinc.com
Fri Nov 15 13:19:52 CET 2013


# HG changeset patch
# User Praveen Tiwari
# Date 1384517979 -19800
# Node ID bd9aa48dcfb7c44f0d1bb873d6965f4d3dbf7433
# Parent  2307c52f11b2e6c7b21d94c207d98f9fac8e1ab9
TComYuv::copyFromPicLuma, asm integration for chroma blockcopy_pp

diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp	Fri Nov 15 12:17:31 2013 +0530
+++ b/source/Lib/TLibCommon/TComYuv.cpp	Fri Nov 15 17:49:39 2013 +0530
@@ -78,6 +78,9 @@
 
     m_cwidth  = width  >> m_hChromaShift;
     m_cheight = height >> m_vChromaShift;
+
+    m_csp = csp;
+    m_part = partitionFromSizes(m_width, m_height);
 }
 
 void TComYuv::destroy()
@@ -155,8 +158,7 @@
     uint32_t dststride = getStride();
     uint32_t srcstride = srcPicYuv->getStride();
 
-    int part = partitionFromSizes(m_width, m_height);
-    primitives.luma_copy_pp[part](dst, dststride, src, srcstride);
+    primitives.luma_copy_pp[m_part](dst, dststride, src, srcstride);
 }
 
 void TComYuv::copyFromPicChroma(TComPicYuv* srcPicYuv, uint32_t cuAddr, uint32_t absZOrderIdx)
@@ -169,8 +171,8 @@
     uint32_t dststride = getCStride();
     uint32_t srcstride = srcPicYuv->getCStride();
 
-    primitives.blockcpy_pp(m_cwidth, m_cheight, dstU, dststride, srcU, srcstride);
-    primitives.blockcpy_pp(m_cwidth, m_cheight, dstV, dststride, srcV, srcstride);
+    primitives.chroma_copy_pp[m_csp][m_part](dstU, dststride, srcU, srcstride);
+    primitives.chroma_copy_pp[m_csp][m_part](dstV, dststride, srcV, srcstride);
 }
 
 void TComYuv::copyToPartYuv(TComYuv* dstPicYuv, uint32_t uiDstPartIdx)
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/Lib/TLibCommon/TComYuv.h
--- a/source/Lib/TLibCommon/TComYuv.h	Fri Nov 15 12:17:31 2013 +0530
+++ b/source/Lib/TLibCommon/TComYuv.h	Fri Nov 15 17:49:39 2013 +0530
@@ -80,6 +80,9 @@
     int m_hChromaShift;
     int m_vChromaShift;
 
+    int m_csp;
+    int m_part;       // Cached result of partitionFromSizes(m_width, m_height), so the copy routines need not recompute it on every call
+
     static int getAddrOffset(uint32_t partUnitIdx, uint32_t width)
     {
         int blkX = g_rasterToPelX[g_zscanToRaster[partUnitIdx]];
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Fri Nov 15 12:17:31 2013 +0530
+++ b/source/common/pixel.cpp	Fri Nov 15 17:49:39 2013 +0530
@@ -835,7 +835,7 @@
     p.satd[LUMA_16x64] = satd8<16, 64>;
 
 #define CHROMA(W, H) \
-    p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
+    p.chroma_copy_pp[CSP_I420][CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
     p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
     p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
     p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/common/primitives.h
--- a/source/common/primitives.h	Fri Nov 15 12:17:31 2013 +0530
+++ b/source/common/primitives.h	Fri Nov 15 17:49:39 2013 +0530
@@ -136,6 +136,14 @@
     NUM_IPFILTER_S_S
 };
 
+// This enum maps its members to actual csp values
+enum ColorspaceType
+{
+  CSP_NONE,
+  CSP_I420,
+  NUM_CSP
+};
+
 // Returns a LumaPartitions enum for the given size, always expected to return a valid enum
 inline int partitionFromSizes(int width, int height)
 {
@@ -232,7 +240,7 @@
     cvt32to16_shr_t cvt32to16_shr;
 
     copy_pp_t       luma_copy_pp[NUM_LUMA_PARTITIONS];
-    copy_pp_t       chroma_copy_pp[NUM_CHROMA_PARTITIONS];
+    copy_pp_t       chroma_copy_pp[NUM_CSP][NUM_CHROMA_PARTITIONS];
     copy_sp_t       luma_copy_sp[NUM_LUMA_PARTITIONS];
     copy_sp_t       chroma_copy_sp[NUM_CHROMA_PARTITIONS];
     copy_ps_t       luma_copy_ps[NUM_LUMA_PARTITIONS];
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Nov 15 12:17:31 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Nov 15 17:49:39 2013 +0530
@@ -145,9 +145,6 @@
 #define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
     p.chroma_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
 
-#define SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
-    p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ## x ## H ## cpu;
-
 #define CHROMA_FILTERS(cpu) \
     SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
     SETUP_CHROMA_FUNC_DEF(4, 2, cpu); \
@@ -197,32 +194,6 @@
     SETUP_CHROMA_SP_FUNC_DEF(32, 8, cpu); \
     SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu);
 
-#define CHROMA_BLOCKCOPY(cpu) \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 4, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 2, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(2, 4, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 8, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 4, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 8, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 6, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(6, 8, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 2, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(2, 8, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 16, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 8, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 16, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 12, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(12, 16, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 4, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 16, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 32, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 16, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 32, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 24, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(24, 32, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 8, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 32, cpu);
-
 #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
     p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
     p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
@@ -237,6 +208,36 @@
 #define SETUP_LUMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
     p.luma_copy_pp[LUMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ## x ## H ## cpu;
 
+#define SETUP_CHROMA_FROM_LUMA(W1, H1, W2, H2, cpu) \
+    p.chroma_copy_pp[X265_CSP_I420][LUMA_ ## W1 ## x ## H1] = x265_blockcopy_pp_ ## W2 ## x ## H2 ## cpu;
+
+// For X265_CSP_I420, chroma width and height are half of the luma width and height, so each luma size (W1 x H1) maps to a (W2 x H2) block copy
+#define CHROMA_BLOCKCOPY(cpu) \
+    SETUP_CHROMA_FROM_LUMA(8,   8, 4,  4,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(8,   4, 4,  2,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(4,   8, 2,  4,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(16, 16, 8,  8,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(16,  8, 8,  4,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(8,  16, 4,  8,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(16, 12, 8,  6,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(12, 16, 6,  8,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(16,  4, 8,  2,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(4,  16, 2,  8,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(32, 32, 16, 16, cpu); \
+    SETUP_CHROMA_FROM_LUMA(32, 16, 16, 8,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(16, 32, 8,  16, cpu); \
+    SETUP_CHROMA_FROM_LUMA(32, 24, 16, 12, cpu); \
+    SETUP_CHROMA_FROM_LUMA(24, 32, 12, 16, cpu); \
+    SETUP_CHROMA_FROM_LUMA(32,  8, 16, 4,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(8,  32, 4,  16, cpu); \
+    SETUP_CHROMA_FROM_LUMA(64, 64, 32, 32, cpu); \
+    SETUP_CHROMA_FROM_LUMA(64, 32, 32, 16, cpu); \
+    SETUP_CHROMA_FROM_LUMA(32, 64, 16, 32, cpu); \
+    SETUP_CHROMA_FROM_LUMA(64, 48, 32, 24, cpu); \
+    SETUP_CHROMA_FROM_LUMA(48, 64, 24, 32, cpu); \
+    SETUP_CHROMA_FROM_LUMA(64, 16, 32, 8,  cpu); \
+    SETUP_CHROMA_FROM_LUMA(16, 64, 8,  32, cpu);
+
 #define LUMA_FILTERS(cpu) \
     SETUP_LUMA_FUNC_DEF(4,   4, cpu); \
     SETUP_LUMA_FUNC_DEF(8,   8, cpu); \
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Fri Nov 15 12:17:31 2013 +0530
+++ b/source/test/pixelharness.cpp	Fri Nov 15 17:49:39 2013 +0530
@@ -724,11 +724,11 @@
         }
     }
 
-    if (opt.chroma_copy_pp[part])
+    if (opt.chroma_copy_pp[CSP_I420][part])
     {
-        if (!check_block_copy_pp(ref.chroma_copy_pp[part], opt.chroma_copy_pp[part]))
+        if (!check_block_copy_pp(ref.chroma_copy_pp[CSP_I420][part], opt.chroma_copy_pp[CSP_I420][part]))
         {
-            printf("chroma_copy_pp[%s] failed\n", chromaPartStr[part]);
+            printf("chroma_copy_pp[%s][%s] failed\n", "CSP_I420", chromaPartStr[part]);
             return false;
         }
     }
@@ -1021,10 +1021,10 @@
         REPORT_SPEEDUP(opt.luma_copy_pp[part], ref.luma_copy_pp[part], pbuf1, 64, pbuf2, 128);
     }
 
-    if (opt.chroma_copy_pp[part])
+    if (opt.chroma_copy_pp[CSP_I420][part])
     {
-        printf("ccpy_pp[%s]", chromaPartStr[part]);
-        REPORT_SPEEDUP(opt.chroma_copy_pp[part], ref.chroma_copy_pp[part], pbuf1, 64, pbuf2, 128);
+        printf("ccpy_pp[%s][%s]", "CSP_I420", chromaPartStr[part]);
+        REPORT_SPEEDUP(opt.chroma_copy_pp[CSP_I420][part], ref.chroma_copy_pp[CSP_I420][part], pbuf1, 64, pbuf2, 128);
     }
 
     if (opt.luma_copy_sp[part])


More information about the x265-devel mailing list