[x265] [PATCH] TShortYUV: asm code integration for pixelsub_ps

Murugan Vairavel murugan at multicorewareinc.com
Mon Nov 18 13:22:15 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1384777276 -19800
#      Mon Nov 18 17:51:16 2013 +0530
# Node ID be8373f115dd7f152588ba8c575ad10dc6f5afb1
# Parent  c355ba4b6711bfad87ff37d650a8f1946f878eec
TShortYUV: asm code integration for pixelsub_ps

diff -r c355ba4b6711 -r be8373f115dd source/common/TShortYUV.cpp
--- a/source/common/TShortYUV.cpp	Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/TShortYUV.cpp	Mon Nov 18 17:51:16 2013 +0530
@@ -58,6 +58,7 @@
 
     m_cwidth  = width  >> m_hChromaShift;
     m_cheight = height >> m_vChromaShift;
+    m_csp = csp;
 }
 
 void TShortYUV::destroy()
@@ -78,15 +79,14 @@
 }
 
 void TShortYUV::subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize)
-{
-    subtractLuma(srcYuv0, srcYuv1, trUnitIdx, partSize);
-    subtractChroma(srcYuv0, srcYuv1, trUnitIdx, partSize >> m_hChromaShift);
+{
+    int part = partitionFromSizes(partSize, partSize);
+    subtractLuma(srcYuv0, srcYuv1, trUnitIdx, partSize, part);
+    subtractChroma(srcYuv0, srcYuv1, trUnitIdx, partSize >> m_hChromaShift, part);
 }
 
-void TShortYUV::subtractLuma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize)
+void TShortYUV::subtractLuma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize, uint32_t part)
 {
-    int x = partSize, y = partSize;
-
     Pel* src0 = srcYuv0->getLumaAddr(trUnitIdx, partSize);
     Pel* src1 = srcYuv1->getLumaAddr(trUnitIdx, partSize);
     int16_t* dst = getLumaAddr(trUnitIdx, partSize);
@@ -95,13 +95,11 @@
     int src1Stride = srcYuv1->getStride();
     int dstStride  = m_width;
 
-    primitives.pixelsub_ps(x, y, dst, dstStride, src0, src1, src0Stride, src1Stride);
+    primitives.luma_sub_ps[part](dst, dstStride, src0, src1, src0Stride, src1Stride);
 }
 
-void TShortYUV::subtractChroma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize)
+void TShortYUV::subtractChroma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize, uint32_t part)
 {
-    int x = partSize, y = partSize;
-
     Pel* srcU0 = srcYuv0->getCbAddr(trUnitIdx, partSize);
     Pel* srcU1 = srcYuv1->getCbAddr(trUnitIdx, partSize);
     Pel* srcV0 = srcYuv0->getCrAddr(trUnitIdx, partSize);
@@ -113,8 +111,8 @@
     int src1Stride = srcYuv1->getCStride();
     int dstStride  = m_cwidth;
 
-    primitives.pixelsub_ps(x, y, dstU, dstStride, srcU0, srcU1, src0Stride, src1Stride);
-    primitives.pixelsub_ps(x, y, dstV, dstStride, srcV0, srcV1, src0Stride, src1Stride);
+    primitives.chroma_sub_ps[m_csp][part](dstU, dstStride, srcU0, srcU1, src0Stride, src1Stride);
+    primitives.chroma_sub_ps[m_csp][part](dstV, dstStride, srcV0, srcV1, src0Stride, src1Stride);
 }
 
 void TShortYUV::addClip(TShortYUV* srcYuv0, TShortYUV* srcYuv1, unsigned int trUnitIdx, unsigned int partSize)
diff -r c355ba4b6711 -r be8373f115dd source/common/TShortYUV.h
--- a/source/common/TShortYUV.h	Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/TShortYUV.h	Mon Nov 18 17:51:16 2013 +0530
@@ -53,6 +53,8 @@
         return blkX + blkY * size;
     }
 
+    int m_csp;
+
 public:
 
     int16_t* m_bufY;
@@ -95,8 +97,8 @@
 
     int16_t* getCrAddr(unsigned int partIdx, unsigned int size) { return m_bufCr + getAddrOffset(partIdx, size, m_cwidth); }
 
-    void subtractLuma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize);
-    void subtractChroma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize);
+    void subtractLuma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize, uint32_t part);
+    void subtractChroma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize, uint32_t part);
     void subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize);
 
     void addClip(TShortYUV* srcYuv0, TShortYUV* srcYuv1, unsigned int trUnitIdx, unsigned int partSize);
diff -r c355ba4b6711 -r be8373f115dd source/common/pixel.cpp
--- a/source/common/pixel.cpp	Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/pixel.cpp	Mon Nov 18 17:51:16 2013 +0530
@@ -838,7 +838,7 @@
     p.chroma_copy_pp[CSP_I420][CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
     p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
     p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
-    p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
+    p.chroma_sub_ps[CSP_I420][CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
 
 #define LUMA(W, H) \
     p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
diff -r c355ba4b6711 -r be8373f115dd source/common/primitives.h
--- a/source/common/primitives.h	Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/primitives.h	Mon Nov 18 17:51:16 2013 +0530
@@ -250,7 +250,7 @@
     copy_ps_t       chroma_copy_ps[NUM_CHROMA_PARTITIONS];
 
     pixel_sub_ps_t  luma_sub_ps[NUM_LUMA_PARTITIONS];
-    pixel_sub_ps_t  chroma_sub_ps[NUM_CHROMA_PARTITIONS];
+    pixel_sub_ps_t  chroma_sub_ps[NUM_CSP][NUM_CHROMA_PARTITIONS];
 
     ipfilter_ps_t   ipfilter_ps[NUM_IPFILTER_P_S];
     ipfilter_sp_t   ipfilter_sp[NUM_IPFILTER_S_P];
diff -r c355ba4b6711 -r be8373f115dd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Nov 18 17:51:16 2013 +0530
@@ -141,8 +141,7 @@
     p.chroma_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
     p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
     p.chroma_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
-    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu; \
-    p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;
+    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu; 
 
 #define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
     p.chroma_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
@@ -272,6 +271,35 @@
     SETUP_CHROMA_FROM_LUMA(64, 16, 32, 8,  cpu); \
     SETUP_CHROMA_FROM_LUMA(16, 64, 8,  32, cpu);
 
+#define SETUP_CHROMA_LUMA(W1, H1, W2, H2, cpu) \
+    p.chroma_sub_ps[X265_CSP_I420][LUMA_ ## W1 ## x ## H1] = x265_pixel_sub_ps_ ## W2 ## x ## H2 ## cpu;
+
+#define CHROMA_PIXELSUB_PS(cpu) \
+    SETUP_CHROMA_LUMA(8,   8, 4,  4,  cpu); \
+    SETUP_CHROMA_LUMA(8,   4, 4,  2,  cpu); \
+    SETUP_CHROMA_LUMA(4,   8, 2,  4,  cpu); \
+    SETUP_CHROMA_LUMA(16, 16, 8,  8,  cpu); \
+    SETUP_CHROMA_LUMA(16,  8, 8,  4,  cpu); \
+    SETUP_CHROMA_LUMA(8,  16, 4,  8,  cpu); \
+    SETUP_CHROMA_LUMA(16, 12, 8,  6,  cpu); \
+    SETUP_CHROMA_LUMA(12, 16, 6,  8,  cpu); \
+    SETUP_CHROMA_LUMA(16,  4, 8,  2,  cpu); \
+    SETUP_CHROMA_LUMA(4,  16, 2,  8,  cpu); \
+    SETUP_CHROMA_LUMA(32, 32, 16, 16, cpu); \
+    SETUP_CHROMA_LUMA(32, 16, 16, 8,  cpu); \
+    SETUP_CHROMA_LUMA(16, 32, 8,  16, cpu); \
+    SETUP_CHROMA_LUMA(32, 24, 16, 12, cpu); \
+    SETUP_CHROMA_LUMA(24, 32, 12, 16, cpu); \
+    SETUP_CHROMA_LUMA(32,  8, 16, 4,  cpu); \
+    SETUP_CHROMA_LUMA(8,  32, 4,  16, cpu); \
+    SETUP_CHROMA_LUMA(64, 64, 32, 32, cpu); \
+    SETUP_CHROMA_LUMA(64, 32, 32, 16, cpu); \
+    SETUP_CHROMA_LUMA(32, 64, 16, 32, cpu); \
+    SETUP_CHROMA_LUMA(64, 48, 32, 24, cpu); \
+    SETUP_CHROMA_LUMA(48, 64, 24, 32, cpu); \
+    SETUP_CHROMA_LUMA(64, 16, 32, 8,  cpu); \
+    SETUP_CHROMA_LUMA(16, 64, 8,  32, cpu);
+
 #define LUMA_FILTERS(cpu) \
     SETUP_LUMA_FUNC_DEF(4,   4, cpu); \
     SETUP_LUMA_FUNC_DEF(8,   8, cpu); \
@@ -589,6 +617,8 @@
         p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse4;
         SA8D_INTER_FROM_BLOCK(sse4);
 
+        CHROMA_PIXELSUB_PS(_sse4);
+
         CHROMA_FILTERS(_sse4);
         LUMA_FILTERS(_sse4);
         HEVC_SATD(sse4);
diff -r c355ba4b6711 -r be8373f115dd source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Nov 18 16:49:30 2013 +0530
+++ b/source/test/pixelharness.cpp	Mon Nov 18 17:51:16 2013 +0530
@@ -781,12 +781,15 @@
         }
     }
 
-    if (opt.chroma_sub_ps[part])
+    for(int i = 0; i < NUM_CSP; i++)
     {
-        if (!check_pixel_sub_ps(ref.chroma_sub_ps[part], opt.chroma_sub_ps[part]))
+        if (opt.chroma_sub_ps[i][part])
         {
-            printf("chroma_sub_ps[%s] failed\n", chromaPartStr[part]);
-            return false;
+            if (!check_pixel_sub_ps(ref.chroma_sub_ps[i][part], opt.chroma_sub_ps[i][part]))
+            {
+                 printf("chroma_sub_ps[%s][%s] failed\n", colorSpaceNames[i], chromaPartStr[part]);
+                 return false;
+            }
         }
     }
     return true;
@@ -1063,10 +1066,13 @@
         REPORT_SPEEDUP(opt.luma_sub_ps[part], ref.luma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
     }
 
-    if (opt.chroma_sub_ps[part])
+    for (int i = 0; i < NUM_CSP; i++)
     {
-        printf("chroma_sub_ps[%s]", chromaPartStr[part]);
-        REPORT_SPEEDUP(opt.chroma_sub_ps[part], ref.chroma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+        if (opt.chroma_sub_ps[i][part])
+        {
+            printf("chroma_sub_ps[%s][%s]", colorSpaceNames[i], chromaPartStr[part]);
+            REPORT_SPEEDUP(opt.chroma_sub_ps[i][part], ref.chroma_sub_ps[i][part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+        }
     }
 }
 


More information about the x265-devel mailing list