[vlc-devel] [PATCH] Add SSE2 and SSE4.1 paths to the adjust filter
gamajun at seznam.cz
Fri Jul 15 21:08:18 CEST 2011
From: Martin Briza <xbriza00 at stud.fit.vutbr.cz>
Added a new header file (modules/video_filter/adjust_asm.h) with the inline assembly helpers used by the filter.
---
modules/video_filter/adjust.c | 190 ++++++++++
modules/video_filter/adjust_asm.h | 731 +++++++++++++++++++++++++++++++++++++
2 files changed, 921 insertions(+), 0 deletions(-)
create mode 100644 modules/video_filter/adjust_asm.h
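
For reviewers: both SIMD paths vectorize the same scalar hue/saturation transform
that the existing WRITE_UV_CLIP macro applies to every U/V pair. A minimal scalar
sketch of that per-sample computation (the helper name adjust_uv_scalar is
illustrative only; clip_uint8_vlc() is the helper already used by adjust.c):

    /* Scalar reference for one U/V sample pair; i_cos, i_sin, i_x, i_y and
     * i_sat have the same meaning as in FilterPlanar() / FilterPacked(). */
    static inline void adjust_uv_scalar( uint8_t *pu, uint8_t *pv,
                                         int i_cos, int i_sin,
                                         int i_x, int i_y, int i_sat )
    {
        int i_u = *pu, i_v = *pv;
        *pu = clip_uint8_vlc( (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8)
                                  * i_sat) >> 8) + 128 );
        *pv = clip_uint8_vlc( (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8)
                                  * i_sat) >> 8) + 128 );
    }

The SSE4.1 paths keep the clipping behaviour used when i_sat > 256; the SSE2
path handles the unclipped i_sat <= 256 case.
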
diff --git a/modules/video_filter/adjust.c b/modules/video_filter/adjust.c
index db5e2be..0aa261d 100644
--- a/modules/video_filter/adjust.c
+++ b/modules/video_filter/adjust.c
@@ -39,6 +39,9 @@
#include <vlc_filter.h>
#include "filter_picture.h"
+#include <vlc_cpu.h>
+#include "adjust_asm.h"
+
#ifndef M_PI
# define M_PI 3.14159265358979323846
#endif
@@ -335,7 +338,102 @@ static picture_t *FilterPlanar( filter_t *p_filter, picture_t *p_pic )
i_x = ( cos(f_hue) + sin(f_hue) ) * 32768;
i_y = ( cos(f_hue) - sin(f_hue) ) * 32768;
+#if defined(CAN_COMPILE_SSE4_1)
+ if ( vlc_CPU() & CPU_CAPABILITY_SSE4_1 && i_sat > 256 )
+ {
+#define WRITE_UV_CLIP() \
+ i_u = *p_in++ ; i_v = *p_in_v++ ; \
+ *p_out++ = clip_uint8_vlc( (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) \
+ * i_sat) >> 8) + 128); \
+ *p_out_v++ = clip_uint8_vlc( (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) \
+ * i_sat) >> 8) + 128)
+
+ uint8_t i_u, i_v;
+
+ WRITE_UV_CLIP_PLANAR_PREPARE;
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 4 pixels at a time */
+ WRITE_UV_CLIP_PLANAR_SSE4_1();
+ }
+ p_line_end += 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ WRITE_UV_CLIP();
+ }
+
+ p_in += p_pic->p[U_PLANE].i_pitch
+ - p_pic->p[U_PLANE].i_visible_pitch;
+ p_in_v += p_pic->p[V_PLANE].i_pitch
+ - p_pic->p[V_PLANE].i_visible_pitch;
+ p_out += p_outpic->p[U_PLANE].i_pitch
+ - p_outpic->p[U_PLANE].i_visible_pitch;
+ p_out_v += p_outpic->p[V_PLANE].i_pitch
+ - p_outpic->p[V_PLANE].i_visible_pitch;
+ }
+
+ WRITE_UV_SSE_FINISH;
+#undef WRITE_UV_CLIP
+ }
+ else
+#endif
+#if defined(CAN_COMPILE_SSE2)
+ if ( vlc_CPU() & CPU_CAPABILITY_SSE2 && i_sat <= 256 )
+ {
+#define WRITE_UV() \
+ i_u = *p_in++ ; i_v = *p_in_v++ ; \
+ *p_out++ = (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) \
+ * i_sat) >> 8) + 128; \
+ *p_out_v++ = (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) \
+ * i_sat) >> 8) + 128
+
+ uint8_t i_u, i_v;
+
+ int i_sat_sub = 0x10001 * i_sat; // i_sat duplicated into both 16-bit halves
+
+ WRITE_UV_PLANAR_PREPARE;
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ WRITE_UV_PLANAR_SSE2();
+ }
+
+ p_line_end += 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ WRITE_UV();
+ }
+
+ p_in += p_pic->p[U_PLANE].i_pitch
+ - p_pic->p[U_PLANE].i_visible_pitch;
+ p_in_v += p_pic->p[V_PLANE].i_pitch
+ - p_pic->p[V_PLANE].i_visible_pitch;
+ p_out += p_outpic->p[U_PLANE].i_pitch
+ - p_outpic->p[U_PLANE].i_visible_pitch;
+ p_out_v += p_outpic->p[V_PLANE].i_pitch
+ - p_outpic->p[V_PLANE].i_visible_pitch;
+ }
+
+ WRITE_UV_SSE_FINISH;
+#undef WRITE_UV
+ }
+ else
+#endif
if ( i_sat > 256 )
{
#define WRITE_UV_CLIP() \
@@ -574,6 +672,96 @@ static picture_t *FilterPacked( filter_t *p_filter, picture_t *p_pic )
i_x = ( cos(f_hue) + sin(f_hue) ) * 32768;
i_y = ( cos(f_hue) - sin(f_hue) ) * 32768;
+#if defined(CAN_COMPILE_SSE4_1)
+ if ( vlc_CPU() & CPU_CAPABILITY_SSE4_1 && i_sat > 256 )
+ {
+#define WRITE_UV_CLIP() \
+ i_u = *p_in; p_in += 4; i_v = *p_in_v; p_in_v += 4; \
+ *p_out = clip_uint8_vlc( (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) \
+ * i_sat) >> 8) + 128); \
+ p_out += 4; \
+ *p_out_v = clip_uint8_vlc( (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) \
+ * i_sat) >> 8) + 128); \
+ p_out_v += 4
+
+ uint8_t i_u, i_v;
+
+ WRITE_UV_CLIP_PACKED_PREPARE;
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + i_visible_pitch - 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ WRITE_UV_CLIP_PACKED_SSE4_1();
+ }
+
+ p_line_end += 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ WRITE_UV_CLIP();
+ }
+
+ p_in += i_pitch - i_visible_pitch;
+ p_in_v += i_pitch - i_visible_pitch;
+ p_out += i_pitch - i_visible_pitch;
+ p_out_v += i_pitch - i_visible_pitch;
+ }
+
+ WRITE_UV_SSE_FINISH;
+#undef WRITE_UV_CLIP
+ }
+ else
+ if ( vlc_CPU() & CPU_CAPABILITY_SSE4_1 && i_sat <= 256 )
+ {
+#define WRITE_UV() \
+ i_u = *p_in; p_in += 4; i_v = *p_in_v; p_in_v += 4; \
+ *p_out = (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) \
+ * i_sat) >> 8) + 128; \
+ p_out += 4; \
+ *p_out_v = (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) \
+ * i_sat) >> 8) + 128; \
+ p_out_v += 4
+
+ uint8_t i_u, i_v;
+
+ WRITE_UV_PACKED_PREPARE;
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + i_visible_pitch - 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ WRITE_UV_PACKED_SSE4_1();
+ WRITE_UV_PACKED_SSE4_1();
+ }
+
+ p_line_end += 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ WRITE_UV();
+ }
+
+ p_in += i_pitch - i_visible_pitch;
+ p_in_v += i_pitch - i_visible_pitch;
+ p_out += i_pitch - i_visible_pitch;
+ p_out_v += i_pitch - i_visible_pitch;
+ }
+
+ WRITE_UV_SSE_FINISH;
+#undef WRITE_UV
+ }
+ else
+#endif
if ( i_sat > 256 )
{
#define WRITE_UV_CLIP() \
diff --git a/modules/video_filter/adjust_asm.h b/modules/video_filter/adjust_asm.h
new file mode 100644
index 0000000..942592d
--- /dev/null
+++ b/modules/video_filter/adjust_asm.h
@@ -0,0 +1,731 @@
+/*****************************************************************************
+ * adjust_asm.h: SSE extension to adjust filter
+ *****************************************************************************
+ * Copyright (C) 2000-2011 the VideoLAN team
+ *
+ * Authors: Martin Briza <gamajun at seznam.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+/*****************************************************************************
+ * Macros to save the XMM registers to a static buffer and restore them later,
+ * so the inline assembly does not clobber register state in the calling
+ * function
+ * - they work as a pair, wrapped by the *_PREPARE and *_FINISH macros below
+ *****************************************************************************/
+#ifdef __x86_64__
+#define STORE_XMM_REGISTERS \
+ static uint64_t xmm_temporary_storage[32] __attribute__((aligned(16))); \
+ __asm__ volatile( \
+ "movdqa %%xmm0, (%[x])\n" \
+ "movdqa %%xmm1, 16(%[x])\n" \
+ "movdqa %%xmm2, 32(%[x])\n" \
+ "movdqa %%xmm3, 48(%[x])\n" \
+ "movdqa %%xmm4, 64(%[x])\n" \
+ "movdqa %%xmm5, 80(%[x])\n" \
+ "movdqa %%xmm6, 96(%[x])\n" \
+ "movdqa %%xmm7, 112(%[x])\n" \
+ "movdqa %%xmm8, 128(%[x])\n" \
+ "movdqa %%xmm9, 144(%[x])\n" \
+ "movdqa %%xmm10, 160(%[x])\n" \
+ "movdqa %%xmm11, 176(%[x])\n" \
+ "movdqa %%xmm12, 192(%[x])\n" \
+ "movdqa %%xmm13, 208(%[x])\n" \
+ "movdqa %%xmm14, 224(%[x])\n" \
+ "movdqa %%xmm15, 240(%[x])\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#elif defined(__i386__)
+#define STORE_XMM_REGISTERS \
+ static uint64_t xmm_temporary_storage[16] __attribute__((aligned(16))); \
+ __asm__ volatile( \
+ "movdqa %%xmm0, (%[x])\n" \
+ "movdqa %%xmm1, 16(%[x])\n" \
+ "movdqa %%xmm2, 32(%[x])\n" \
+ "movdqa %%xmm3, 48(%[x])\n" \
+ "movdqa %%xmm4, 64(%[x])\n" \
+ "movdqa %%xmm5, 80(%[x])\n" \
+ "movdqa %%xmm6, 96(%[x])\n" \
+ "movdqa %%xmm7, 112(%[x])\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#endif
+
+#ifdef __x86_64__
+#define RESTORE_XMM_REGISTERS \
+ __asm__ volatile( \
+ "movdqa (%[x]), %%xmm0\n" \
+ "movdqa 16(%[x]), %%xmm1\n" \
+ "movdqa 32(%[x]), %%xmm2\n" \
+ "movdqa 48(%[x]), %%xmm3\n" \
+ "movdqa 64(%[x]), %%xmm4\n" \
+ "movdqa 80(%[x]), %%xmm5\n" \
+ "movdqa 96(%[x]), %%xmm6\n" \
+ "movdqa 112(%[x]), %%xmm7\n" \
+ "movdqa 128(%[x]), %%xmm8\n" \
+ "movdqa 144(%[x]), %%xmm9\n" \
+ "movdqa 160(%[x]), %%xmm10\n" \
+ "movdqa 176(%[x]), %%xmm11\n" \
+ "movdqa 192(%[x]), %%xmm12\n" \
+ "movdqa 208(%[x]), %%xmm13\n" \
+ "movdqa 224(%[x]), %%xmm14\n" \
+ "movdqa 240(%[x]), %%xmm15\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#elif defined(__i386__)
+#define RESTORE_XMM_REGISTERS \
+ __asm__ volatile( \
+ "movdqa (%[x]), %%xmm0\n" \
+ "movdqa 16(%[x]), %%xmm1\n" \
+ "movdqa 32(%[x]), %%xmm2\n" \
+ "movdqa 48(%[x]), %%xmm3\n" \
+ "movdqa 64(%[x]), %%xmm4\n" \
+ "movdqa 80(%[x]), %%xmm5\n" \
+ "movdqa 96(%[x]), %%xmm6\n" \
+ "movdqa 112(%[x]), %%xmm7\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#endif
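+
+/* Illustrative usage of the pair above (they are wrapped by the *_PREPARE and
+ * *_FINISH macros at the end of this file):
+ *
+ *     STORE_XMM_REGISTERS;    // save the caller's xmm0-xmm7 (xmm0-xmm15 on x86_64)
+ *     ... inner SSE loop clobbering the XMM registers ...
+ *     RESTORE_XMM_REGISTERS;  // put the saved contents back
+ */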
+
+/*****************************************************************************
+ * Load 4 U bytes into xmm0 and 4 V bytes into xmm1 (planar layout),
+ * zero-extended to 32-bit lanes; assumes xmm7 == 0
+ *****************************************************************************/
+static inline void write_uv_load_planaryuv_sse2( uint8_t *p_in, uint8_t *p_in_v )
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+ "movd (%[in]), %%xmm0\n"
+ "movd (%[in_v]), %%xmm1\n"
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklwd %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm1\n"
+ "punpcklwd %%xmm7, %%xmm1\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v)
+ : "memory" );
+#endif
+}
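+
+/* After the load above (and assuming xmm7 == 0), each register holds four
+ * samples zero-extended to 32-bit lanes, e.g. for input bytes u0..u3:
+ *     xmm0 = | 0,0,0,u3 | 0,0,0,u2 | 0,0,0,u1 | 0,0,0,u0 |
+ * and likewise v0..v3 in xmm1. */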
+
+
+static inline void write_uv_load_planaryuv_first(uint8_t *p_in, uint8_t *p_in_v)
+{
+ __asm__ volatile (
+#if defined(CAN_COMPILE_SSE2)
+ "movd (%[in]), %%xmm1\n"
+ "movd (%[in_v]), %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm1\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "punpcklwd %%xmm2, %%xmm1\n"
+ /////////////////////////////////////////
+ "movd 4(%[in]), %%xmm2\n"
+ "movd 4(%[in_v]), %%xmm5\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm2\n"
+#if defined(__x86_64__)
+ /////////////////////////////////////////
+ "movq (%[in]), %%xmm5\n"
+ "movd (%[in_v]), %%xmm3\n"
+ "movd 4(%[in_v]), %%xmm4\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "pandn %%xmm12, %%xmm5\n" // invert U (to be subtracted)
+ "punpcklbw %%xmm0, %%xmm3\n"
+ "paddw %%xmm13, %%xmm5\n" // add 1
+ "punpcklbw %%xmm0, %%xmm4\n"
+ "punpcklwd %%xmm5, %%xmm3\n"
+ "punpckhqdq %%xmm5, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm4\n"
+#endif
+#endif
+ :
+ : [in] "r" ( p_in ),
+ [in_v] "r" ( p_in_v )
+ : "eax", "memory" );
+}
+
+#if defined(__i386__)
+static inline void write_uv_load_planaryuv_second(uint8_t *p_in, uint8_t *p_in_v)
+{
+ __asm__ volatile (
+ "pcmpeqb %%xmm6, %%xmm6\n"
+ "movq (%[in]), %%xmm5\n"
+ "movd (%[in_v]), %%xmm1\n"
+ "movd 4(%[in_v]), %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "pandn %%xmm6, %%xmm5\n"
+ "punpcklbw %%xmm0, %%xmm1\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "paddw %%xmm6, %%xmm5\n"
+ "psrlw $15, %%xmm6\n"
+ "punpcklwd %%xmm5, %%xmm1\n"
+ "punpckhqdq %%xmm5, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm2\n"
+
+ "psllw $7, %%xmm6\n"
+ :
+ : [in] "r" ( p_in ),
+ [in_v] "r" ( p_in_v )
+ : "eax", "memory" );
+}
+#endif
+
+/*****************************************************************************
+ * "Unclip" results stored in XMM0 and XMM3
+ * - keep only the low 8 bits of each dword in XMM0 and XMM3
+ * - without this, the packing instructions in the store step would saturate
+ *   the results instead of wrapping them
+ *****************************************************************************/
+static inline void write_uv_unclip_planaryuv_sse2 ()
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+ "pslld $24, %%xmm0\n" // 0xXXXXXXXX ->
+ "psrld $24, %%xmm0\n" // -> 0x000000XX
+ "pslld $24, %%xmm3\n"
+ "psrld $24, %%xmm3\n"
+ :: );
+#endif
+}
+
+/*****************************************************************************
+ * Pack and write PlanarYUV 4B content of XMM0 in p_out and XMM3 in p_out_v
+ *****************************************************************************/
+static inline void write_uv_store_planaryuv_sse2( uint8_t *p_out, uint8_t *p_out_v )
+{
+#if defined(CAN_COMPILE_SSE4_1) /* packusdw below is an SSE4.1 instruction */
+ __asm__ volatile(
+ "packusdw %%xmm7, %%xmm0\n" // pack and saturate (if there
+ "packuswb %%xmm7, %%xmm0\n" // is something to saturate)
+ "movd %%xmm0, (%[out])\n" // and store in destination
+ "packusdw %%xmm7, %%xmm3\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ "movd %%xmm3, (%[out_v])\n"
+ :
+ : [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "memory" );
+#endif
+}
+
+#if defined(__i386__) && defined(CAN_COMPILE_SSE2)
+static inline void write_uv_store_planaryuv_first( uint8_t *p_out )
+{
+ __asm__ volatile(
+ "packuswb %%xmm0, %%xmm1\n"
+ "movq %%xmm1, (%[out])\n"
+ :
+ : [out] "r" (p_out)
+ : "memory" );
+}
+#elif defined(__x86_64__) && defined(CAN_COMPILE_SSE2)
+static inline void write_uv_store_planaryuv_first( uint8_t *p_out,
+ uint8_t *p_out_v )
+{
+ __asm__ volatile(
+ "packuswb %%xmm0, %%xmm1\n"
+ "packuswb %%xmm0, %%xmm3\n"
+ "movq %%xmm1, (%[out])\n"
+ "movq %%xmm3, (%[out_v])\n"
+ :
+ : [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "memory" );
+}
+#endif
+
+#if defined(__i386__) && defined(CAN_COMPILE_SSE2)
+static inline void write_uv_store_planaryuv_second( uint8_t *p_out_v )
+{
+ __asm__ volatile(
+ "packuswb %%xmm0, %%xmm1\n"
+ "movq %%xmm1, (%[out_v])\n"
+ :
+ : [out_v] "r" (p_out_v)
+ : "memory" );
+}
+#endif
+
+/*****************************************************************************
+ * Load packed YUV data: 4 U samples into xmm0 and 4 V samples into xmm1,
+ * taking every fourth byte of a 16-byte load
+ *****************************************************************************/
+static inline void write_uv_load_packedyuv_sse2( uint8_t *p_in, uint8_t *p_in_v )
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+ "movdqu (%[in]), %%xmm0\n"
+ "movdqu (%[in_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n" // 0xFFFFFFFF
+ "psrld $24, %%xmm2\n" // 0x000000FF
+ "pand %%xmm2, %%xmm0\n" // mask out unnecessary data
+ "pand %%xmm2, %%xmm1\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v)
+ : "memory");
+#endif
+}
+
+/*****************************************************************************
+ * Clip (i.e. saturate) the dword contents of XMM0 and XMM3 to the 0-255 range
+ *****************************************************************************/
+static inline void write_uv_clip_packedyuv_sse2()
+{
+#if defined(CAN_COMPILE_SSE4_1) /* packusdw below is an SSE4.1 instruction */
+ __asm__ volatile(
+ "packusdw %%xmm7, %%xmm0\n"
+ "packuswb %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklwd %%xmm7, %%xmm0\n"
+ "packusdw %%xmm7, %%xmm3\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ "punpcklbw %%xmm7, %%xmm3\n"
+ "punpcklwd %%xmm7, %%xmm3\n"
+ ::);
+#endif
+}
+
+/*****************************************************************************
+ * Clear the upper 24 bits of every dword stored in XMM0 and XMM3
+ *****************************************************************************/
+static inline void write_uv_clear_packedyuv_sse2()
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+ "pcmpeqd %%xmm2, %%xmm2\n" // 0xFFFFFFFF
+ "psrld $24, %%xmm2\n" // 0x000000FF
+ "pand %%xmm2, %%xmm0\n" // mask out unnecessary data
+ "pand %%xmm2, %%xmm3\n"
+ ::);
+#endif
+}
+
+/*****************************************************************************
+ * Store packed U from XMM0 and packed V from XMM3 registers in p_out and
+ * p_out_v respectively
+ * - write_uv_clip_packedyuv_sse2() or write_uv_clear_packedyuv_sse2() must
+ *   have been called first
+ *****************************************************************************/
+static inline void write_uv_store_packedyuv_sse2( uint8_t *p_out, uint8_t *p_out_v )
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+
+ "movdqu (%[out]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm0, %%xmm2\n"
+ "movdqu %%xmm2, (%[out])\n"
+
+ "movdqu (%[out_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm3, %%xmm2\n"
+ "movdqu %%xmm2, (%[out_v])\n"
+ :
+ : [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "memory" );
+#endif
+}
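+
+/* Note on the store above: the 0xFFFFFF00 per-dword mask keeps the three
+ * neighbouring bytes of the packed picture untouched, and only the low byte
+ * of each dword -- the recomputed U (resp. V) sample -- is replaced. */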
+
+/*****************************************************************************
+ * Basic write_uv function, used in both packed and planar formats
+ * - expects U stored in XMM0 and V in XMM1
+ * - the result is left in XMM0 (U) and XMM3 (V) afterwards
+ * - uses ALL 8 basic (in 32bit mode) XMM registers, clearing XMM7 to be used
+ * in packing and unpacking data
+ * p_out = (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) * i_sat) >> 8) + 128
+ * p_out_v = (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) * i_sat) >> 8) + 128
+ *****************************************************************************/
+static inline void write_uv_base_sse41( int i_x, int i_y, int i_cos,
+ int i_sin, int i_sat )
+{
+#if defined(CAN_COMPILE_SSE4_1)
+#if defined(__x86_64__)
+ __asm__ volatile(
+
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "pmulld %%xmm8, %%xmm0\n"
+ "pmulld %%xmm9, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm10, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm12, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+
+ "pmulld %%xmm8, %%xmm3\n"
+ "pmulld %%xmm9, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm12, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "movd %[x], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm0\n"
+ "pmulld %%xmm5, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm7, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm6, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+
+ "movd %[y], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm3\n"
+ "pmulld %%xmm5, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm7, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm6, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+ "pxor %%xmm7, %%xmm7\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+#endif
+}
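+
+/* Documentation only: per 32-bit lane, the block above is equivalent to the
+ * following intrinsics (names as in <smmintrin.h>, not used by this file):
+ *     u = _mm_srai_epi32( _mm_sub_epi32( _mm_add_epi32(
+ *             _mm_mullo_epi32( u, cos ), _mm_mullo_epi32( v, sin ) ), x ), 8 );
+ *     u = _mm_add_epi32( _mm_srai_epi32( _mm_mullo_epi32( u, sat ), 8 ),
+ *                        _mm_set1_epi32( 128 ) );
+ * and symmetrically for v with ( v*cos - u*sin - y ). */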
+
+#if defined(__i386__)
+static inline void write_uv_base_first( int i_x )
+{
+ __asm__ volatile(
+ ////////////////////////////////////////////
+ "pmaddwd %%xmm3, %%xmm1\n"
+ "pmaddwd %%xmm3, %%xmm2\n"
+ "movd %[x], %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "pslld $8, %%xmm1\n"
+ "psrld $16, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "psrld $16, %%xmm2\n"
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "pshuflw $0, %%xmm5, %%xmm5\n"
+ "pshufhw $0, %%xmm5, %%xmm5\n"
+ "paddw %%xmm5, %%xmm1\n"
+ "pmullw %%xmm4, %%xmm1\n"
+ "psraw $8, %%xmm1\n"
+ "paddw %%xmm6, %%xmm1\n"
+ :
+ : [x] "r" (i_x)
+ : "memory" );
+}
+#elif defined(__x86_64__)
+static inline void write_uv_base_first( void )
+{
+ __asm__ volatile(
+ "pmaddwd %%xmm6, %%xmm1\n"
+ "pmaddwd %%xmm6, %%xmm2\n"
+ "pmaddwd %%xmm6, %%xmm3\n"
+ "pmaddwd %%xmm6, %%xmm4\n"
+ "psubd %%xmm10, %%xmm1\n"
+ "psubd %%xmm10, %%xmm2\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psubd %%xmm11, %%xmm4\n"
+ "pslld $8, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "pslld $8, %%xmm3\n"
+ "pslld $8, %%xmm4\n"
+ "psrld $16, %%xmm1\n"
+ "psrld $16, %%xmm2\n"
+ "psrld $16, %%xmm3\n"
+ "psrld $16, %%xmm4\n"
+ "psubd %%xmm14, %%xmm1\n"
+ "psubd %%xmm14, %%xmm2\n"
+ "psubd %%xmm14, %%xmm3\n"
+ "psubd %%xmm14, %%xmm4\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "packssdw %%xmm4, %%xmm3\n"
+ "paddw %%xmm15, %%xmm1\n"
+ "paddw %%xmm15, %%xmm3\n"
+ "pmullw %%xmm7, %%xmm1\n"
+ "pmullw %%xmm7, %%xmm3\n"
+ "psraw $8, %%xmm1\n"
+ "psraw $8, %%xmm3\n"
+ "paddw %%xmm9, %%xmm1\n"
+ "paddw %%xmm9, %%xmm3\n"
+ :
+ :
+ : "memory" );
+}
+#endif
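+
+/* Explanatory note: the write_uv_base_first()/write_uv_base_second() helpers
+ * rely on the layout prepared by write_uv_prepare_planar() and the loaders
+ * above: interleaved (u, v) word pairs (and negated-U pairs for the V term)
+ * are multiplied against (cos, sin) word pairs, so a single pmaddwd yields
+ * u*cos + v*sin (resp. v*cos - u*sin) per dword; the 0x8000 bias applied
+ * around packssdw keeps the intermediate values in signed 16-bit range. */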
+
+#if defined(__i386__) && defined(CAN_COMPILE_SSE2)
+static inline void write_uv_base_second( int i_y )
+{
+ __asm__ volatile(
+ "pmaddwd %%xmm3, %%xmm1\n"
+ "pmaddwd %%xmm3, %%xmm2\n"
+ "movd %[y], %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "pslld $8, %%xmm1\n"
+ "psrld $16, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "psrld $16, %%xmm2\n"
+
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "pshuflw $0, %%xmm5, %%xmm5\n"
+ "pshufhw $0, %%xmm5, %%xmm5\n"
+ "paddw %%xmm5, %%xmm1\n"
+ "pmullw %%xmm4, %%xmm1\n"
+ "psraw $8, %%xmm1\n"
+ "paddw %%xmm6, %%xmm1\n"
+ :
+ : [y] "r" (i_y)
+ : "memory" );
+}
+#endif
+
+/*****************************************************************************
+ * Must be called before the helpers above are used in the adjust filter.
+ * Clears the XMM7 register and stores the values used in every computation.
+ * WARNING! On 32-bit systems it takes the XMM4, XMM5 and XMM6 registers for
+ * constants; on 64-bit systems XMM8-XMM12 are filled
+ *****************************************************************************/
+static inline void write_uv_prepare_sse2( int i_x, int i_y, int i_cos, int i_sin,
+ int i_sat )
+{
+#if defined(CAN_COMPILE_SSE2)
+#if defined(__x86_64__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm8\n"
+ "movd %[sin], %%xmm9\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "movd %[sat], %%xmm12\n"
+ "pshufd $0, %%xmm8, %%xmm8\n"
+ "pshufd $0, %%xmm9, %%xmm9\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm12, %%xmm12\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm4\n"
+ "movd %[sin], %%xmm5\n"
+ "movd %[sat], %%xmm6\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ :
+ : [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+#endif
+}
+
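+/*****************************************************************************
+ * Prepare constants for the word-based (SSE2) planar path. Clears xmm0 and
+ * broadcasts interleaved (cos, sin) word pairs, the duplicated saturation
+ * value and the +128 word constant into XMM registers: xmm3/xmm4/xmm6 on
+ * 32-bit systems, xmm6/xmm7/xmm9 on x86_64, where i_x, i_y, the all-ones/+1
+ * loader constants and the 0x8000 pack/unpack bias additionally go to
+ * xmm10/xmm11, xmm12/xmm13 and xmm14/xmm15.
+ *****************************************************************************/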
+static inline void write_uv_prepare_planar( int i_x, int i_y, int i_cos,
+ int i_sin, int i_sat_sub )
+{
+ __asm__ volatile(
+#if !defined(__x86_64__)
+ "pxor %%xmm0, %%xmm0\n"
+ "movd %[cos], %%xmm3\n"
+ "movd %[sin], %%xmm4\n"
+ "pslld $16, %%xmm4\n"
+ "pslld $16, %%xmm3\n"
+ "psrld $16, %%xmm3\n"
+ "pshufd $0, %%xmm3, %%xmm3\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "por %%xmm4, %%xmm3\n"
+ "movd %[sat], %%xmm4\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pcmpeqb %%xmm6, %%xmm6\n"
+ "psrlw $15, %%xmm6\n"
+ "psllw $7, %%xmm6\n"
+#else
+ "pxor %%xmm0, %%xmm0\n"
+ "movd %[cos], %%xmm6\n"
+ "movd %[sin], %%xmm7\n"
+ "pslld $16, %%xmm7\n"
+ "pslld $16, %%xmm6\n"
+ "psrld $16, %%xmm6\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+ "por %%xmm7, %%xmm6\n"
+ "movd %[sat], %%xmm7\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pcmpeqb %%xmm12, %%xmm12\n"
+ "pcmpeqb %%xmm9, %%xmm9\n"
+ "pcmpeqb %%xmm13, %%xmm13\n"
+ "psrlw $15, %%xmm9\n"
+ "psllw $7, %%xmm9\n"
+ "psrlw $15, %%xmm13\n"
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm14\n"
+ "pshufd $0, %%xmm14, %%xmm14\n"
+ "mov $0x80008000, %%eax\n"
+ "movd %%eax, %%xmm15\n"
+ "pshufd $0, %%xmm15, %%xmm15\n"
+#endif
+ :
+ :
+ [x] "r" ( i_x ),
+ [y] "r" ( i_y ),
+ [sat] "r" ( i_sat_sub ),
+ [sin] "r" ( i_sin ),
+ [cos] "r" ( i_cos )
+ : "eax", "memory" );
+}
+
+/*****************************************************************************
+ * WRITE_UV preparation and finishing macros
+ * - they set up the XMM registers with the constants the adjust filter needs
+ *****************************************************************************/
+#define WRITE_UV_CLIP_PLANAR_PREPARE \
+ STORE_XMM_REGISTERS; \
+ write_uv_prepare_sse2( i_x, i_y, i_cos, i_sin, i_sat );
+
+#define WRITE_UV_PLANAR_PREPARE \
+ STORE_XMM_REGISTERS; \
+ write_uv_prepare_planar( i_x, i_y, i_cos, i_sin, i_sat_sub );
+
+#define WRITE_UV_CLIP_PACKED_PREPARE \
+ STORE_XMM_REGISTERS; \
+ write_uv_prepare_sse2( i_x, i_y, i_cos, i_sin, i_sat );
+
+#define WRITE_UV_PACKED_PREPARE \
+ STORE_XMM_REGISTERS; \
+ write_uv_prepare_sse2( i_x, i_y, i_cos, i_sin, i_sat );
+
+#define WRITE_UV_SSE_FINISH \
+ RESTORE_XMM_REGISTERS;
+
+/*****************************************************************************
+ * WRITE_UV macros declaration
+ * - they behave like WRITE_UV() and WRITE_UV_CLIP() and rely on the same
+ *   variable names in the calling function
+ *****************************************************************************/
+#define WRITE_UV_CLIP_PLANAR_SSE4_1() \
+ write_uv_load_planaryuv_sse2( p_in, p_in_v ); \
+ write_uv_base_sse41( i_x, i_y, i_cos, i_sin, i_sat ); \
+ write_uv_store_planaryuv_sse2( p_out, p_out_v ); \
+ p_out += 4; p_out_v += 4; p_in += 4; p_in_v += 4
+
+#define WRITE_UV_CLIP_PACKED_SSE4_1() \
+ write_uv_load_packedyuv_sse2( p_in, p_in_v ); \
+ write_uv_base_sse41( i_x, i_y, i_cos, i_sin, i_sat ); \
+ write_uv_clip_packedyuv_sse2(); \
+ write_uv_store_packedyuv_sse2( p_out, p_out_v ); \
+ p_out += 4; p_out_v += 4; p_in += 4; p_in_v += 4
+
+#define WRITE_UV_PACKED_SSE4_1() \
+ write_uv_load_packedyuv_sse2( p_in, p_in_v ); \
+ write_uv_base_sse41( i_x, i_y, i_cos, i_sin, i_sat ); \
+ write_uv_clear_packedyuv_sse2(); \
+ write_uv_store_packedyuv_sse2( p_out, p_out_v ); \
+ p_out += 4; p_out_v += 4; p_in += 4; p_in_v += 4
+
+#ifdef __x86_64__
+#define WRITE_UV_PLANAR_SSE2() \
+ write_uv_load_planaryuv_first( p_in, p_in_v); \
+ write_uv_base_first(); \
+ write_uv_store_planaryuv_first( p_out, p_out_v ); \
+ p_in += 8; p_in_v += 8; p_out += 8; p_out_v += 8
+#endif
+
+#ifdef __i386__
+#define WRITE_UV_PLANAR_SSE2() \
+ write_uv_load_planaryuv_first( p_in, p_in_v); \
+ write_uv_base_first( i_x ); \
+ write_uv_store_planaryuv_first( p_out ); \
+ write_uv_load_planaryuv_second( p_in, p_in_v); \
+ write_uv_base_second( i_y ); \
+ write_uv_store_planaryuv_second( p_out_v ); \
+ p_in += 8; p_in_v += 8; p_out += 8; p_out_v += 8
+#endif
--
1.7.1