[x265] [PATCH] asm: ssse3 8bpp code for chroma_p2s i422, reuse luma code

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Mon Apr 6 16:26:21 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1428317574 -19800
#      Mon Apr 06 16:22:54 2015 +0530
# Node ID ff260b1e0e87edc1f27bbc2421c273514844b736
# Parent  e0e94b642a1f169658267ef17bea754a5af4a22d
asm: ssse3 8bpp code for chroma_p2s i422, reuse luma code

     chroma_p2s[4x32](3.78x), chroma_p2s[8x12](5.25x), chroma_p2s[8x64](6.65x),
     chroma_p2s[12x32](9.57x), chroma_p2s[16x24](12.96x),
     chroma_p2s[16x24](12.56x), chroma_p2s[24x64](13.66x),
     chroma_p2s[32x48](9.83x)

diff -r e0e94b642a1f -r ff260b1e0e87 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Apr 06 15:08:57 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Apr 06 16:22:54 2015 +0530
@@ -1353,14 +1353,20 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = x265_filterPixelToShort_32x32_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s = x265_filterPixelToShort_8x4_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s = x265_filterPixelToShort_8x8_ssse3;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s = x265_filterPixelToShort_8x12_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s = x265_filterPixelToShort_8x16_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s = x265_filterPixelToShort_8x32_ssse3;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s = x265_filterPixelToShort_8x64_ssse3;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s = x265_filterPixelToShort_12x32_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = x265_filterPixelToShort_16x8_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = x265_filterPixelToShort_16x16_ssse3;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = x265_filterPixelToShort_16x24_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = x265_filterPixelToShort_16x32_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = x265_filterPixelToShort_16x64_ssse3;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = x265_filterPixelToShort_24x64_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = x265_filterPixelToShort_32x16_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = x265_filterPixelToShort_32x32_ssse3;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = x265_filterPixelToShort_32x48_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = x265_filterPixelToShort_32x64_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
@@ -1439,6 +1445,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s = x265_filterPixelToShort_4x4_sse4;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s = x265_filterPixelToShort_4x8_sse4;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s = x265_filterPixelToShort_4x16_sse4;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s = x265_filterPixelToShort_4x32_sse4;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = x265_filterPixelToShort_6x16_sse4;
 
 #if X86_64
diff -r e0e94b642a1f -r ff260b1e0e87 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Apr 06 15:08:57 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon Apr 06 16:22:54 2015 +0530
@@ -7783,6 +7783,7 @@
 P2S_H_4xN 4
 P2S_H_4xN 8
 P2S_H_4xN 16
+P2S_H_4xN 32
 
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -7889,6 +7890,8 @@
 P2S_H_8xN 4
 P2S_H_8xN 16
 P2S_H_8xN 32
+P2S_H_8xN 12
+P2S_H_8xN 64
 
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8016,6 +8019,7 @@
 P2S_H_16xN 12
 P2S_H_16xN 32
 P2S_H_16xN 64
+P2S_H_16xN 24
 
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
@@ -8138,6 +8142,7 @@
 P2S_H_32xN 16
 P2S_H_32xN 24
 P2S_H_32xN 64
+P2S_H_32xN 48
 
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
diff -r e0e94b642a1f -r ff260b1e0e87 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Mon Apr 06 15:08:57 2015 +0530
+++ b/source/common/x86/ipfilter8.h	Mon Apr 06 16:22:54 2015 +0530
@@ -582,7 +582,17 @@
     SETUP_CHROMA_P2S_FUNC_DEF(8, 6, cpu);
 
 #define CHROMA_422_P2S_FILTERS_SSE4(cpu) \
-    SETUP_CHROMA_P2S_FUNC_DEF(6, 16, cpu);
+    SETUP_CHROMA_P2S_FUNC_DEF(6, 16, cpu); \
+    SETUP_CHROMA_P2S_FUNC_DEF(4, 32, cpu);
+
+#define CHROMA_422_P2S_FILTERS_SSSE3(cpu) \
+    SETUP_CHROMA_P2S_FUNC_DEF(8, 12, cpu); \
+    SETUP_CHROMA_P2S_FUNC_DEF(8, 64, cpu); \
+    SETUP_CHROMA_P2S_FUNC_DEF(12, 32, cpu); \
+    SETUP_CHROMA_P2S_FUNC_DEF(16, 24, cpu); \
+    SETUP_CHROMA_P2S_FUNC_DEF(16, 64, cpu); \
+    SETUP_CHROMA_P2S_FUNC_DEF(24, 64, cpu); \
+    SETUP_CHROMA_P2S_FUNC_DEF(32, 48, cpu);
 
 CHROMA_420_FILTERS(_sse4);
 CHROMA_420_FILTERS(_avx2);
@@ -604,6 +614,7 @@
 CHROMA_422_SS_FILTERS(_sse2);
 CHROMA_422_SS_FILTERS_SSE4(_sse4);
 CHROMA_422_P2S_FILTERS_SSE4(_sse4);
+CHROMA_422_P2S_FILTERS_SSSE3(_ssse3);
 
 CHROMA_444_FILTERS(_sse4);
 CHROMA_444_SP_FILTERS(_sse4);


More information about the x265-devel mailing list