[vlc-commits] copy: remove cached cpu flags
Rémi Denis-Courmont
git at videolan.org
Sun Feb 11 16:05:46 CET 2018
vlc | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Sun Feb 11 15:42:38 2018 +0200| [16b2f3e377c2b07437ccbd70b4c9d12768ae27cc] | committer: Rémi Denis-Courmont
copy: remove cached cpu flags
This causes a lot of extra ifdef'ery and parameters passing.
Calling vlc_CPU() is cheap anyway; it just returns a constant.
(Also nowadays at least SSE2 is enabled by default.)
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=16b2f3e377c2b07437ccbd70b4c9d12768ae27cc
---
modules/video_chroma/copy.c | 122 ++++++++++++--------------------------------
1 file changed, 34 insertions(+), 88 deletions(-)
diff --git a/modules/video_chroma/copy.c b/modules/video_chroma/copy.c
index 84cbd5b1da..a60eaa944c 100644
--- a/modules/video_chroma/copy.c
+++ b/modules/video_chroma/copy.c
@@ -95,21 +95,6 @@ void CopyCleanCache(copy_cache_t *cache)
store " %%xmm4, 48(%[dst])\n" \
: : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
-#ifndef __SSE4_1__
-# undef vlc_CPU_SSE4_1
-# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
-#endif
-
-#ifndef __SSSE3__
-# undef vlc_CPU_SSSE3
-# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
-#endif
-
-#ifndef __SSE2__
-# undef vlc_CPU_SSE2
-# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
-#endif
-
#ifdef COPY_TEST_NOOPTIM
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() (0)
@@ -128,12 +113,8 @@ void CopyCleanCache(copy_cache_t *cache)
VLC_SSE
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height,
- unsigned cpu)
+ unsigned width, unsigned height)
{
-#if defined (__SSE4_1__) || !defined(CAN_COMPILE_SSSE3)
- VLC_UNUSED(cpu);
-#endif
assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
asm volatile ("mfence");
@@ -206,16 +187,11 @@ static void
SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
uint8_t *srcu, size_t srcu_pitch,
uint8_t *srcv, size_t srcv_pitch,
- unsigned int width, unsigned int height, uint8_t pixel_size,
- unsigned int cpu)
+ unsigned int width, unsigned int height, uint8_t pixel_size)
{
assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f) &&
!((intptr_t)srcv & 0xf) && !(srcv_pitch & 0x0f));
-#if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
- VLC_UNUSED(cpu);
-#endif
-
static const uint8_t shuffle_8[] = { 0, 8,
1, 9,
2, 10,
@@ -326,12 +302,8 @@ VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
uint8_t *dstv, size_t dstv_pitch,
const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height, uint8_t pixel_size,
- unsigned cpu)
+ unsigned width, unsigned height, uint8_t pixel_size)
{
-#if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
- VLC_UNUSED(cpu);
-#endif
assert(pixel_size == 1 || pixel_size == 2);
assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);
@@ -440,7 +412,7 @@ static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
const uint8_t *src, size_t src_pitch,
uint8_t *cache, size_t cache_size,
- unsigned height, unsigned cpu)
+ unsigned height)
{
const unsigned w16 = (src_pitch+15) & ~15;
const unsigned hstep = cache_size / w16;
@@ -453,14 +425,10 @@ static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
const unsigned hblock = __MIN(hstep, height - y);
/* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w16,
- src, src_pitch,
- src_pitch, hblock, cpu);
+ CopyFromUswc(cache, w16, src, src_pitch, src_pitch, hblock);
/* Copy from our cache to the destination */
- Copy2d(dst, dst_pitch,
- cache, w16,
- src_pitch, hblock);
+ Copy2d(dst, dst_pitch, cache, w16, src_pitch, hblock);
/* */
src += src_pitch * hblock;
@@ -473,7 +441,7 @@ SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
const uint8_t *srcu, size_t srcu_pitch,
const uint8_t *srcv, size_t srcv_pitch,
uint8_t *cache, size_t cache_size,
- unsigned int height, uint8_t pixel_size, unsigned int cpu)
+ unsigned int height, uint8_t pixel_size)
{
assert(srcu_pitch == srcv_pitch);
unsigned int const w16 = (srcu_pitch+15) & ~15;
@@ -485,15 +453,14 @@ SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
unsigned int const hblock = __MIN(hstep, height - y);
/* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w16, srcu, srcu_pitch,
- srcu_pitch, hblock, cpu);
+ CopyFromUswc(cache, w16, srcu, srcu_pitch, srcu_pitch, hblock);
CopyFromUswc(cache+w16*hblock, w16, srcv, srcv_pitch,
- srcv_pitch, hblock, cpu);
+ srcv_pitch, hblock);
/* Copy from our cache to the destination */
SSE_InterleaveUV(dst, dst_pitch, cache, w16,
- cache+w16*hblock, w16, srcu_pitch, hblock, pixel_size,
- cpu);
+ cache + w16 * hblock, w16,
+ srcu_pitch, hblock, pixel_size);
/* */
srcu += hblock * srcu_pitch;
@@ -506,7 +473,7 @@ static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
uint8_t *dstv, size_t dstv_pitch,
const uint8_t *src, size_t src_pitch,
uint8_t *cache, size_t cache_size,
- unsigned height, uint8_t pixel_size, unsigned cpu)
+ unsigned height, uint8_t pixel_size)
{
const unsigned w16 = (src_pitch+15) & ~15;
const unsigned hstep = cache_size / w16;
@@ -516,12 +483,11 @@ static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
const unsigned hblock = __MIN(hstep, height - y);
/* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w16, src, src_pitch,
- src_pitch, hblock, cpu);
+ CopyFromUswc(cache, w16, src, src_pitch, src_pitch, hblock);
/* Copy from our cache to the destination */
SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
- cache, w16, src_pitch / 2, hblock, pixel_size, cpu);
+ cache, w16, src_pitch / 2, hblock, pixel_size);
/* */
src += src_pitch * hblock;
@@ -532,14 +498,14 @@ static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
const size_t src_pitch[static 3], unsigned height,
- const copy_cache_t *cache, unsigned cpu)
+ const copy_cache_t *cache)
{
for (unsigned n = 0; n < 3; n++) {
const unsigned d = n > 0 ? 2 : 1;
SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
src[n], src_pitch[n],
cache->buffer, cache->size,
- (height+d-1)/d, cpu);
+ (height+d-1)/d);
}
asm volatile ("emms");
}
@@ -547,48 +513,40 @@ static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
const size_t src_pitch[static 2], unsigned height,
- const copy_cache_t *cache, unsigned cpu)
+ const copy_cache_t *cache)
{
- SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
- src[0], src_pitch[0],
- cache->buffer, cache->size,
- height, cpu);
- SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
- src[1], src_pitch[1],
- cache->buffer, cache->size,
- height/2, cpu);
+ SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
+ cache->buffer, cache->size, height);
+ SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch, src[1], src_pitch[1],
+ cache->buffer, cache->size, height / 2);
asm volatile ("emms");
}
static void
SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
const size_t src_pitch[static 2], unsigned int height,
- const copy_cache_t *cache, uint8_t pixel_size,
- unsigned int cpu)
+ const copy_cache_t *cache, uint8_t pixel_size)
{
SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
- src[0], src_pitch[0], cache->buffer, cache->size,
- height, cpu);
+ src[0], src_pitch[0], cache->buffer, cache->size, height);
SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
dest->p[2].p_pixels, dest->p[2].i_pitch,
src[1], src_pitch[1], cache->buffer, cache->size,
- height / 2, pixel_size, cpu);
+ height / 2, pixel_size);
asm volatile ("emms");
}
static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
const size_t src_pitch[static 3],
unsigned height, const copy_cache_t *cache,
- uint8_t pixel_size, unsigned cpu)
+ uint8_t pixel_size)
{
- SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
- src[0], src_pitch[0],
- cache->buffer, cache->size,
- height, cpu);
+ SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
+ cache->buffer, cache->size, height);
SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
src[U_PLANE], src_pitch[U_PLANE],
src[V_PLANE], src_pitch[V_PLANE],
- cache->buffer, cache->size, height / 2, pixel_size, cpu);
+ cache->buffer, cache->size, height / 2, pixel_size);
asm volatile ("emms");
}
#undef COPY64
@@ -614,10 +572,8 @@ void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
{
ASSERT_2PLANES;
#ifdef CAN_COMPILE_SSE2
- unsigned cpu = vlc_CPU();
if (vlc_CPU_SSE2())
- return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height,
- cache, cpu);
+ return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height, cache);
#else
(void) cache;
#endif
@@ -660,10 +616,8 @@ void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
{
ASSERT_2PLANES;
#ifdef CAN_COMPILE_SSE2
- unsigned cpu = vlc_CPU();
-
if (vlc_CPU_SSE2())
- return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 1, cpu);
+ return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 1);
#else
VLC_UNUSED(cache);
#endif
@@ -681,10 +635,8 @@ void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
{
ASSERT_2PLANES;
#ifdef CAN_COMPILE_SSE3
- unsigned cpu = vlc_CPU();
-
if (vlc_CPU_SSSE3())
- return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 2, cpu);
+ return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 2);
#else
VLC_UNUSED(cache);
#endif
@@ -714,9 +666,8 @@ void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
{
ASSERT_3PLANES;
#ifdef CAN_COMPILE_SSE2
- unsigned cpu = vlc_CPU();
if (vlc_CPU_SSE2())
- return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 1, cpu);
+ return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 1);
#else
(void) cache;
#endif
@@ -743,9 +694,8 @@ void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
{
ASSERT_3PLANES;
#ifdef CAN_COMPILE_SSE2
- unsigned cpu = vlc_CPU();
if (vlc_CPU_SSSE3())
- return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 2, cpu);
+ return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 2);
#else
(void) cache;
#endif
@@ -813,9 +763,8 @@ void Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
{
ASSERT_3PLANES;
#ifdef CAN_COMPILE_SSE2
- unsigned cpu = vlc_CPU();
if (vlc_CPU_SSE2())
- return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache, cpu);
+ return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache);
#else
(void) cache;
#endif
@@ -1024,9 +973,6 @@ int main(void)
alarm(10);
#ifndef COPY_TEST_NOOPTIM
-# ifndef __SSE2__
- unsigned cpu = vlc_CPU();
-# endif
if (!vlc_CPU_SSE2())
{
fprintf(stderr, "WARNING: could not test SSE\n");
More information about the vlc-commits
mailing list