[x265] [PATCH] blockcopy_ps_16x4, asm code is now sse4

praveen at multicorewareinc.com praveen at multicorewareinc.com
Mon Nov 11 12:24:39 CET 2013


# HG changeset patch
# User Praveen Tiwari
# Date 1384169067 -19800
# Node ID 6ca58aa3d85bc962510509c7f04be76c9a9cf591
# Parent  bce503510c7262f2e12583b57b2a03370cecd33a
blockcopy_ps_16x4, asm code is now sse4

diff -r bce503510c72 -r 6ca58aa3d85b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 11 16:44:45 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Nov 11 16:54:27 2013 +0530
@@ -366,12 +366,6 @@
         p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
         p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
         p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
-
-        p.chroma_copy_ps[CHROMA_16x4] = x265_blockcopy_ps_16x4_sse2;
-        p.chroma_copy_ps[CHROMA_16x8] = x265_blockcopy_ps_16x8_sse2;
-        p.chroma_copy_ps[CHROMA_16x12] = x265_blockcopy_ps_16x12_sse2;
-        p.chroma_copy_ps[CHROMA_16x16] = x265_blockcopy_ps_16x16_sse2;
-        p.chroma_copy_ps[CHROMA_16x32] = x265_blockcopy_ps_16x32_sse2;
 #if X86_64
         p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
         p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
@@ -467,6 +461,11 @@
         p.chroma_copy_ps[CHROMA_8x8] = x265_blockcopy_ps_8x8_sse4;
         p.chroma_copy_ps[CHROMA_8x16] = x265_blockcopy_ps_8x16_sse4;
         p.chroma_copy_ps[CHROMA_8x32] = x265_blockcopy_ps_8x32_sse4;
+        p.chroma_copy_ps[CHROMA_16x4] = x265_blockcopy_ps_16x4_sse4;
+        p.chroma_copy_ps[CHROMA_16x8] = x265_blockcopy_ps_16x8_sse4;
+        p.chroma_copy_ps[CHROMA_16x12] = x265_blockcopy_ps_16x12_sse4;
+        p.chroma_copy_ps[CHROMA_16x16] = x265_blockcopy_ps_16x16_sse4;
+        p.chroma_copy_ps[CHROMA_16x32] = x265_blockcopy_ps_16x32_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r bce503510c72 -r 6ca58aa3d85b source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Mon Nov 11 16:44:45 2013 +0530
+++ b/source/common/x86/blockcopy8.asm	Mon Nov 11 16:54:27 2013 +0530
@@ -1790,7 +1790,7 @@
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
-INIT_XMM sse2
+INIT_XMM sse4
 cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride
 
 add        r1,      r1
@@ -1829,7 +1829,7 @@
 ; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W16_H4 2
-INIT_XMM sse2
+INIT_XMM sse4
 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
 
 add        r1,      r1
diff -r bce503510c72 -r 6ca58aa3d85b source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Mon Nov 11 16:44:45 2013 +0530
+++ b/source/common/x86/blockcopy8.h	Mon Nov 11 16:54:27 2013 +0530
@@ -99,7 +99,12 @@
     SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 6, cpu); \
     SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 8, cpu); \
     SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 16, cpu); \
-    SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 32, cpu);
+    SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 4, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 12, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 32, cpu);
 
 CHROMA_BLOCKCOPY_DEF_SSE4(_sse4);
 
@@ -112,12 +117,6 @@
 void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val);
 void x265_blockfill_s_32x32_sse2(int16_t *dst, intptr_t dstride, int16_t val);
 
-void x265_blockcopy_ps_16x4_sse2(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
-void x265_blockcopy_ps_16x8_sse2(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
-void x265_blockcopy_ps_16x12_sse2(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
-void x265_blockcopy_ps_16x16_sse2(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
-void x265_blockcopy_ps_16x32_sse2(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
-
 #undef SETUP_CHROMA_BLOCKCOPY_FUNC
 #undef SETUP_LUMA_BLOCK_FUNC
 #undef CHROMA_BLOCKCOPY_DEF


More information about the x265-devel mailing list