[vlc-devel] [PATCH] Phosphor deinterlacer, reworked += 4

Sun Mar 27 18:05:17 CEST 2011

Hi,

On Thu, Mar 24, 2011 at 08:56:18PM +0200, Juha Jeronen wrote:
> >From 1388f52b8734e180f7dd439b2e9d7f9581d02a73 Mon Sep 17 00:00:00 2001
> From: Juha Jeronen <juha.jeronen at jyu.fi>
> Date: Thu, 24 Mar 2011 20:33:35 +0200
> Subject: [PATCH 1/2] RenderX(): cache vlc_CPU()
 No remarks, I will apply it.

> >From e3f851bb9822379052ec2071a63b6c7896048e3c Mon Sep 17 00:00:00 2001
> From: Juha Jeronen <juha.jeronen at jyu.fi>
> Date: Thu, 24 Mar 2011 20:50:52 +0200
> Subject: [PATCH 2/2] Phosphor deinterlacer
  
> +/* Converts a full-frame plane_t to a field plane_t */
> +static void PlaneToFieldPlane( plane_t *p_dst, const plane_t *p_src,
> +                               int i_field );
 The order of the parameter and the function name does not seem
to match (FieldFromPlane or swaping dst/src maybe). Not important.

>  static const char *const ppsz_filter_options[] = {
> -    "mode", NULL
> +    "mode", "phosphor-chroma", "phosphor-dimmer", NULL
>  };
 Could you put NULL on an independant line? (it would simplify
future changes if any). Not important.

> +static void DarkenField( picture_t *p_dst, int i_field, int i_strength )
> +{
> +    assert( p_dst != NULL );
> +    assert( i_field == 0 || i_field == 1 );
> +    assert( i_strength >= 1 && i_strength <= 3 );
> +
> +    unsigned ui_cpu = vlc_CPU();
> +
> +    /* Bitwise ANDing with this clears the i_strength highest bits
> +       of each byte */
> +    uint64_t remove_high_u64;
> +#ifdef CAN_COMPILE_MMXEXT
> +    uint64_t i_strength_u64 = i_strength; /* for MMX version (needs to know
> +                                             number of bits) */
> +#endif
> +    int_fast16_t div; /* for C version of 4:2:2 chroma handler */
> +    if( i_strength == 1 )
> +    {
> +        remove_high_u64 = INT64_C(0x7F7F7F7F7F7F7F7F);
> +        div = 2;
> +    }
> +    else if( i_strength == 2 )
> +    {
> +        remove_high_u64 = INT64_C(0x3F3F3F3F3F3F3F3F);
> +        div = 4;
> +    }
> +    else /* i_strength == 3 */
> +    {
> +        remove_high_u64 = INT64_C(0x1F1F1F1F1F1F1F1F);
> +        div = 8;
> +    }
 div = 1 << i_strength;
 remove_high_u8 = 0xFF >> i_strength;
 remove_high_u64 = remove_high_u8 * INT64_C(0x0101010101010101);
Would be simpler and more generic.

> +    /* Process luma.
> +
> +       For luma, the operation is just a shift + bitwise AND, so we vectorize
> +       even in the C version.
> +
> +       There is an MMX version, too, because it performs about twice faster.
> +    */
> +    int i_plane = Y_PLANE;
> +    uint8_t *p_out, *p_out_end;
> +    int w = p_dst->p[i_plane].i_visible_pitch;
> +    p_out = p_dst->p[i_plane].p_pixels;
> +    p_out_end = p_out + p_dst->p[i_plane].i_pitch
> +                      * p_dst->p[i_plane].i_visible_lines;
> +
> +    /* skip first line for bottom field */
> +    if( i_field == 1 )
> +        p_out += p_dst->p[i_plane].i_pitch;
> +
> +    int wm8 = w % 8;   /* remainder */
> +    int w8  = w - wm8; /* part of width that is divisible by 8 */
> +    for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
> +    {
> +        uint64_t *po = (uint64_t *)p_out;
> +#ifdef CAN_COMPILE_MMXEXT
> +        if( ui_cpu & CPU_CAPABILITY_MMXEXT )
> +        {
> +            movq_m2r( i_strength_u64,  mm1 );
> +            movq_m2r( remove_high_u64, mm2 );
> +            for( int x = 0 ; x < w8; x += 8 )
> +            {
> +                movq_m2r( (*po), mm0 );
> +
> +                psrlq_r2r( mm1, mm0 );
> +                pand_r2r(  mm2, mm0 );
> +
> +                movq_r2m( mm0, (*po++) );
> +            }
> +        }
> +        else
> +        {
> +#endif
> +            for( int x = 0 ; x < w8; x += 8, ++po )
> +                (*po) = ( ((*po) >> i_strength) & remove_high_u64 );
> +#ifdef CAN_COMPILE_MMXEXT
> +        }
> +#endif
> +        /* handle the width remainder */
> +        if( wm8 )
> +        {
> +            uint8_t *po_temp = (uint8_t *)po;
> +            for( int x = 0 ; x < wm8; ++x, ++po_temp )
> +                (*po_temp) = ( ((*po_temp) >> i_strength) & remove_high_u8 );
> +        }
> +    }
> +
> +    /* Process chroma if the field chromas are independent.
> +
> +       The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
> +       The chroma processing is a bit more complicated than luma,
> +       and needs MMX for vectorization.
> +    */
> +    if( p_dst->format.i_chroma == VLC_CODEC_I422  ||
> +        p_dst->format.i_chroma == VLC_CODEC_J422 )
> +    {
> +        for( i_plane = 0 ; i_plane < p_dst->i_planes ; i_plane++ )
> +        {
> +            if( i_plane == Y_PLANE )
> +                continue; /* luma already handled */
> +
> +            int w = p_dst->p[i_plane].i_visible_pitch;
> +#ifdef CAN_COMPILE_MMXEXT
> +            int wm8 = w % 8;   /* remainder */
> +            int w8  = w - wm8; /* part of width that is divisible by 8 */
> +#endif
> +            p_out = p_dst->p[i_plane].p_pixels;
> +            p_out_end = p_out + p_dst->p[i_plane].i_pitch
> +                              * p_dst->p[i_plane].i_visible_lines;
> +
> +            /* skip first line for bottom field */
> +            if( i_field == 1 )
> +                p_out += p_dst->p[i_plane].i_pitch;
> +
> +            for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
> +            {
> +#ifdef CAN_COMPILE_MMXEXT
> +                /* See also easy-to-read C version below. */
> +                if( ui_cpu & CPU_CAPABILITY_MMXEXT )
> +                {
> +                    static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
> +                    movq_m2r( b128, mm5 );
> +                    movq_m2r( i_strength_u64,  mm6 );
> +                    movq_m2r( remove_high_u64, mm7 );
> +
> +                    uint64_t *po = (uint64_t *)p_out;
> +                    for( int x = 0 ; x < w8; x += 8 )
> +                    {
> +                        movq_m2r( (*po), mm0 );
> +
> +                        movq_r2r( mm5, mm2 ); /* 128 */
> +                        movq_r2r( mm0, mm1 ); /* copy of data */
> +                        psubusb_r2r( mm2, mm1 ); /* mm1 = max(data - 128, 0) */
> +                        psubusb_r2r( mm0, mm2 ); /* mm2 = max(128 - data, 0) */
> +                        /* mm0 no longer needed */
Once here, I think you can simply do:
 psrlq_r2r(i_strength_u64(mm6), mm1);
 psrlq_r2r(i_strength_u64(mm6), mm2);
 pand_r2r(remove_high_u64(mm7), mm1);
 pand_r2r(remove_high_u64(mm7), mm2);
 psubb(mm2, mm1);
 padd(b128, mm1);
 movq(mm1, *po);
No ?

 Another way would be to use the fact that x/2^n can be optimized
into (x  + (x < 0) * (2^n - 1)) >> n, but because psrab does not
exist, I end up using more instructions.

> +                        pxor_r2r( mm0, mm0 );
> +                        movq_r2r( mm1, mm3 );
> +                        pcmpeqb_r2r( mm0, mm3 );
> +                        /* now mm3 = bytemask: data <= 128 */
> +                        movq_r2r( mm2, mm4 );
> +                        pcmpeqb_r2r( mm0, mm4 );
> +                        /* now mm4 = bytemask: data >= 128 */
> +                        /* mm0 no longer needed */
> +
> +                        movq_r2r( mm4, mm0 );
> +                        pandn_r2r( mm3, mm0 ); /* mm0 = bytemask: mm3 && !mm0,
> +                                                        i.e. data < 128 */
> +                        /* mm0 is important, but mm3 no longer needed. */
> +
> +                        /* Process chroma distances from origin */
> +                        /* mm1 = max(data - 128, 0) >> i_strength */
> +                        psrlq_r2r( mm6, mm1 );
> +                        pand_r2r(  mm7, mm1 );
> +                        /* mm2 = max(128 - data, 0) >> i_strength */
> +                        psrlq_r2r( mm6, mm2 );
> +                        pand_r2r(  mm7, mm2 );
> +
> +                        /* Return to original scale 0..255 */
> +                        paddusb_r2r( mm5, mm1 ); /* mm1 = mm1 + 128 */
> +                        movq_r2r( mm5, mm3 );    /* 128 */
> +                        psubusb_r2r( mm2, mm3 ); /* mm3 = 128 - mm2 */
> +                        /* mm2 no longer needed */
> +
> +                        /* Combine results */
> +                        pand_r2r( mm4, mm1 ); /* filter to keep only pixels
> +                                                 which had data >= 128 */
> +                        pand_r2r( mm0, mm3 ); /* same, for data < 128 */
> +                        paddusb_r2r( mm1, mm3 );
> +
> +                        movq_r2m( mm3, (*po++) );
> +                    }
> +
> +                    /* handle the width remainder */
> +                    if( wm8 )
> +                    {
> +                        uint8_t *po8 = (uint8_t *)po;
> +                        int_fast16_t I; /* must be able to handle -128..255 */
 I am not sure why you need this temporary variable.
C ensures that all operands to an operator are first upconverted to 'int' type
if they are smaller.
> +                        for( int x = 0 ; x < wm8; ++x )
> +                        {
> +                            I = (*po8);
> +                            /* the output is closer to 128 than the input;
> +                               the result always fits in uint8 */
> +                            (*po8++) = (uint8_t)( 128 + ( (I - 128) / div ) );
> +                        }
> +                    }
> +                }
> +                else
> +                {
> +#endif
> +                    /* 4:2:2 chroma handler, C version */
> +                    uint8_t *po = p_out;
> +                    int_fast16_t I; /* must be able to handle -128..255 */
> +                    for( int x = 0 ; x < w; ++x )
> +                    {
> +                        I = (*po);
> +                        /* the output is closer to 128 than the input;
> +                            the result always fits in uint8 */
> +                        (*po++) = (uint8_t)( 128 + ( (I - 128) / div ) );
 I wonder if using directly (1 << i_strength) would not be better for the
compiler. (A sign division by a multiple of 2 is easily optimized).

> +                    }
> +#ifdef CAN_COMPILE_MMXEXT
> +                }
> +#endif
> +            } /* for p_out... */
> +        } /* for i_plane... */
> +    } /* if b_i422 */
> +
> +#ifdef CAN_COMPILE_MMXEXT
> +    if( ui_cpu & CPU_CAPABILITY_MMXEXT )
> +        emms();
> +#endif
> +}

> +static int RenderPhosphor( filter_t *p_filter,
> +                           picture_t *p_dst, picture_t *p_src,
> +                           int i_order, int i_field )
> +
[...]
> +
> +    ComposeFrame( p_filter, p_dst, p_in_top, p_in_bottom, cc );
> +
> +    /* Simulate phosphor light output decay for the old field.
> +
> +       The dimmer can also be switched off in the configuration, but that is
> +       more of a technical curiosity or an educational toy for advanced users
> +       than a useful deinterlacer mode (although it does make telecined
> +       material look slightly better than without any filtering).
> +
> +       In most use cases the dimmer is used.
> +    */
> +    if( p_sys->phosphor.i_dimmer_strength > 0 )
> +        DarkenField( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength );
 If you want to make things faster, doing the copy done in composeframe
and the field darkening in one pass instead of 2, could be done (or at least
tested), or even simply doing the copy on 1 line and then the darkening on it
just after (less cache pressure).
 But this will wait until the current way is commited.

>      /* Set output timestamps, if the algorithm didn't request CUSTOM_PTS for this frame. */
> @@ -1948,6 +2696,7 @@ static int Open( vlc_object_t *p_this )
>      p_sys->i_frame_offset = 0; /* start with default value (first-ever frame cannot have offset) */
>      for( int i = 0; i < HISTORY_SIZE; i++ )
>          p_sys->pp_history[i] = NULL;
> +    p_sys->phosphor.i_chroma_for_420 = PC_ALTLINE;
 Not sure why you do that here (as you also init the whole p_sys->phosphor
just below).

>  #if defined(CAN_COMPILE_C_ALTIVEC)
>      if( vlc_CPU() & CPU_CAPABILITY_ALTIVEC )
> @@ -2002,6 +2751,52 @@ static int Open( vlc_object_t *p_this )
>      SetFilterMethod( p_filter, psz_mode, p_filter->fmt_in.video.i_chroma );
>      free( psz_mode );
>  
> +    if( p_sys->i_mode == DEINTERLACE_PHOSPHOR )
> +    {
> +        int i_c420 = var_GetInteger( p_filter,
> +                                     FILTER_CFG_PREFIX "phosphor-chroma" );
> +        if( !i_c420 ) /* not set */
> +        {
> +            msg_Dbg( p_filter, "Phosphor 4:2:0 input chroma mode not set, "\
> +                               "using default" );
> +            i_c420 = PC_ALTLINE;
> +        }
 I don't think that the value 0 needs a special case.
> +        if( i_c420 != PC_LATEST  &&  i_c420 != PC_ALTLINE  &&
> +            i_c420 != PC_BLEND   && i_c420 != PC_UPCONVERT )
> +        {
> +            msg_Dbg( p_filter, "Phosphor 4:2:0 input chroma mode out of "\
> +                               "range (valid: 1, 2, 3 or 4), using default" );
> +            i_c420 = PC_ALTLINE;
> +        }
 I am not sure if it is really needed as you use change_integer_list() on it.
Anyone know if a variable can be set outside the values defined by
change_integer_list?

> +        msg_Dbg( p_filter, "using Phosphor 4:2:0 input chroma mode %d",
> +                           i_c420 );
> +        /* This maps directly to the phosphor_chroma_t enum. */
> +        p_sys->phosphor.i_chroma_for_420 = i_c420;
> +
> +        int i_dimmer = var_GetInteger( p_filter,
> +                                       FILTER_CFG_PREFIX "phosphor-dimmer" );
> +        if( !i_dimmer )
> +        {
> +            msg_Dbg( p_filter, "Phosphor dimmer strength not set, "\
> +                               "using default" );
> +            i_dimmer = 2; /* low */
> +        }
 Idem than for phosphor-chroma.
> +        if( i_dimmer < 1  ||  i_dimmer > 4 )
> +        {
> +            msg_Dbg( p_filter, "Phosphor dimmer strength out of range "\
> +                               "(valid: 1, 2, 3 or 4), using default" );
> +            i_dimmer = 2; /* low */
> +        }
 Idem than for phosphor-chroma.

> +        msg_Dbg( p_filter, "using Phosphor dimmer strength %d", i_dimmer );
> +        /* The internal value ranges from 0 to 3. */
> +        p_sys->phosphor.i_dimmer_strength = i_dimmer - 1;
> +    }
> +    else
> +    {
> +        p_sys->phosphor.i_chroma_for_420 = PC_ALTLINE;
> +        p_sys->phosphor.i_dimmer_strength = 1;
> +    }
> +
>      /* */
>      video_format_t fmt;
>      GetOutputFormat( p_filter, &fmt, &p_filter->fmt_in.video );

Regards,

-- 
fenrir