[vlc-devel] [PATCH] Introducing Intelligent Approximate playlist search

Abhiram Ravi abhiram.ravi.s at gmail.com
Tue Mar 26 14:27:29 CET 2013


(Sorry, it was a bad mistake.)
Fixed it! Now works for any UTF-8 string.

---
 src/playlist/search.c |  239
+++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 234 insertions(+), 5 deletions(-)

diff --git a/src/playlist/search.c b/src/playlist/search.c
index 0c711ca..f2e5ed4 100644
--- a/src/playlist/search.c
+++ b/src/playlist/search.c
@@ -5,6 +5,7 @@
  * $Id$
  *
  * Authors: Clément Stenac <zorglub at videolan.org>
+ *             Abhiram Ravi <abhiram.ravi.s at gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published by
@@ -29,6 +30,109 @@
 #include <vlc_playlist.h>
 #include <vlc_charset.h>
 #include "playlist_internal.h"
+#include <string.h>
+#include <limits.h>
+#include <wctype.h>
+
+//The delimiter string for splitting meta and query strings
+#define REG_SPLIT N_("()[ ,.-]_")
+//Maximum allowed error in Levenshtein Distance
+#define MAX_ERROR 1
+//TODO: Add a cmdline param/opt to GUI and allow user to set MAX_ERROR
+
+/***************************************************************************
+ * Levenshtein Distance between two strings
+
***************************************************************************/
+/**
+ * Converts a UTF-8 string into lowercase, The function handles multi-byte
+ * codepoints correctly, and puts them in an array of uint32_t's.
+ * @param psz_string : The original string
+ * @param pszw_output : The output array into which the codepoints are
written.
+ * @return : The length of the codepoint array
+ * XXX - pszw_output must be already malloc'ed
+ */
+static int to_lowercase ( const char* psz_string, uint32_t* pszw_output )
+{
+    ssize_t s;
+    size_t len = 0;
+    uint32_t cph;
+    const char *p = psz_string;
+    for ( ; *p; )
+    {
+        s = vlc_towc ( p, &cph );
+        if ( s <= 0 )
+        {
+            s = vlc_towc ( p, &(uint32_t) { 0 });
+            p += s;
+            continue;
+        }
+        *pszw_output = towlower ( cph );
+        pszw_output ++;
+        p += s;
+        len ++;
+    }
+    return len;
+}
+/**
+ * A dynamic programming algorithm to
+ * calculate the levenshtein distance between any
+ * two strings ( used for matching query and playlist items )
+ * @param psz_str_1: The first string
+ * @param psz_str_2: The second string
+ * @return the edit distance between the two strings
+ */
+static int levenshtein_distance ( const char* psz_str_1, const char*
psz_str_2 )
+{
+    // Convert the strings to lowercase
+    uint32_t* psz_wide_1 = xmalloc ( strlen ( psz_str_1 ) * sizeof (
uint32_t ) );
+    uint32_t* psz_wide_2 = xmalloc ( strlen ( psz_str_2 ) * sizeof (
uint32_t ) );
+    size_t l_1 = to_lowercase( psz_str_1, psz_wide_1 );
+    size_t l_2 = to_lowercase( psz_str_2, psz_wide_2 );
+
+    uint32_t char_1, char_2;
+    int i_cost;
+
+    if ( l_1 == 0 )
+        return l_2;
+    if ( l_2 == 0 )
+        return l_1;
+
+    //A flat 2D array
+    int* iarray_distance = xmalloc ( ( l_1 + 1 ) * ( l_2 + 1) * sizeof (
int ) );
+    int i_col = l_2 + 1;
+    for ( size_t i = 0; i <= l_1; i++ )
+        iarray_distance [i * i_col] = i;
+    for ( size_t i = 0; i <= l_2; i++ )
+        iarray_distance [i] = i;
+    for ( size_t i = 1; i <= l_1; i++ )
+    {
+        char_1 = psz_wide_1 [i - 1];
+        for ( size_t j = 1; j <= l_2; j++ )
+        {
+            char_2 = psz_wide_2 [j - 1];
+            if ( char_1 == char_2 )
+                i_cost = 0;
+            else
+                i_cost = 1;
+
+            iarray_distance [i * i_col + j] = INT_MAX;
+            if ( iarray_distance [(i - 1) * i_col + j] + 1 <
iarray_distance [i * i_col + j] )
+                iarray_distance [i * i_col + j] = iarray_distance [(i - 1)
* i_col + j] + 1;
+            if ( iarray_distance [i * i_col + (j - 1)] + 1 <
iarray_distance [i * i_col + j] )
+                iarray_distance [i * i_col + j] = iarray_distance [i *
i_col +(j - 1)] + 1;
+            if ( iarray_distance [(i - 1) * i_col + (j - 1)] + i_cost <
iarray_distance [i * i_col + j] )
+                iarray_distance [i * i_col + j] = iarray_distance [(i - 1)
* i_col + (j - 1)] + i_cost;
+        }
+    }
+    int answer = iarray_distance [l_1 * i_col + l_2];
+
+    // Freeing before returning the answer
+    free ( psz_wide_1 );
+    free ( psz_wide_2 );
+    free ( iarray_distance );
+
+    return answer;
+}

 /***************************************************************************
  * Item search functions
@@ -85,6 +189,83 @@ playlist_item_t* playlist_ItemGetByInput( playlist_t *
p_playlist,
  * Live search handling

***************************************************************************/

+/*
+ * Split a given sentence based on a string of characters to use for the
split
+ * \param ppp_split_destination - the array of words obtained after
splitting
+ * \param psz_sentence - the input string
+ * \param psz_reg_split - the string of characters to use for the split
+ * @return - the number of words in the sentence
+ * */
+static int split_into_words ( char*** ppp_split_destination, const char*
psz_sentence, const char* psz_reg_split )
+{
+    // Making a copy to work with strtok
+    char* psz_sentence_copy = xstrdup ( psz_sentence );
+    const char* psz_buf;
+
+    // Allocating space for the required number of words
+    char** pp_sentence_split = ( char ** ) xmalloc ( strlen (
psz_sentence_copy ) * sizeof ( char* ) );
+
+    char* p_saveptr;
+
+    psz_buf = strtok_r ( psz_sentence_copy, psz_reg_split, &p_saveptr );
+
+    int i_count = 0;
+
+    // Running strtok and copying all the words obtained to a new split
string array
+    while ( psz_buf != NULL )
+    {
+        pp_sentence_split [i_count] = xstrdup ( psz_buf );
+        psz_buf = strtok_r ( NULL, psz_reg_split, &p_saveptr );
+        i_count++;
+    }
+    *ppp_split_destination = pp_sentence_split;
+    return i_count;
+
+}
+
+/*
+ * Match a given string with a set of query words.
+ * \param psz_info - the string that needs to be matched with
+ * \param pp_query_split - the array of query words
+ * \param i_query_split - number of query words
+ * \param bitvector_matches - to keep track of matches
+ * */
+static void execute_match ( const char* psz_info, char** pp_query_split,
int i_query_split, bool* bitvector_matches )
+{
+    char** pp_info_split = NULL;
+    int i_info_split = split_into_words( &pp_info_split, psz_info,
REG_SPLIT );
+
+    for ( int i = 0; i < i_query_split; i++ )
+    {
+        for ( int j = 0; j < i_info_split; j++ )
+        {
+            bitvector_matches[i] |= ( pp_info_split[j] &&
levenshtein_distance( pp_info_split[j], pp_query_split[i] ) <= MAX_ERROR ) ;
+        }
+    }
+
+    for ( int i = 0; i < i_info_split; i++ )
+    {
+        free ( pp_info_split[i] );
+    }
+    free ( pp_info_split );
+}
+
+/*
+ * Checks if a boolean array contains all 1's
+ * \param bitvector_matches - the boolean array
+ * \param len - length of the array
+ * @return - true if all bits are 1.
+ *         - false otherwise
+ * */
+static bool super_match ( bool* bitvector_matches, size_t len )
+{
+    bool b_super_match = 1;
+    for ( size_t i = 0; i < len; i++ )
+    {
+        b_super_match &= bitvector_matches[i];
+    }
+    return b_super_match;
+}
 /**
  * Enable all items in the playlist
  * @param p_root: the current root item
@@ -103,6 +284,9 @@ static void playlist_LiveSearchClean( playlist_item_t
*p_root )

 /**
  * Enable/Disable items in the playlist according to the search argument
+ * Every string is split into words (split based on REG_SPLIT) and an item
is
+ * enabled only if all the words in the query string have found a
+ * match(approximated by MAX_ERROR) in the metadata.
  * @param p_root: the current root item
  * @param psz_string: the string to search
  * @return true if an item match
@@ -111,12 +295,20 @@ static bool playlist_LiveSearchUpdateInternal(
playlist_item_t *p_root,
                                                const char *psz_string,
bool b_recursive )
 {
     int i;
+    // Splitting the query words into strings for separate word-matching
+    char** pp_query_split = NULL;
+    int i_query_split = split_into_words( &pp_query_split, psz_string,
REG_SPLIT );
+
+    // The bit vector of size = number of words in query string - used to
ensure that ALL the
+    // words in the query string match before enabling the item in the
playlist
+    bool* bitvector_matches = xmalloc ( i_query_split * sizeof ( bool ) );
+
     bool b_match = false;
     for( i = 0 ; i < p_root->i_children ; i ++ )
     {
         bool b_enable = false;
         playlist_item_t *p_item = p_root->pp_children[i];
-        // Go recurssively if their is some children
+        // Go recursively if there are some children
         if( b_recursive && p_item->i_children >= 0 &&
             playlist_LiveSearchUpdateInternal( p_item, psz_string, true ) )
         {
@@ -133,14 +325,50 @@ static bool playlist_LiveSearchUpdateInternal(
playlist_item_t *p_root,
                 const char *psz_title = vlc_meta_Get(
p_item->p_input->p_meta, vlc_meta_Title );
                 if( !psz_title )
                     psz_title = p_item->p_input->psz_name;
+
                 const char *psz_album = vlc_meta_Get(
p_item->p_input->p_meta, vlc_meta_Album );
                 const char *psz_artist = vlc_meta_Get(
p_item->p_input->p_meta, vlc_meta_Artist );
-                b_enable = ( psz_title && vlc_strcasestr( psz_title,
psz_string ) ) ||
-                           ( psz_album && vlc_strcasestr( psz_album,
psz_string ) ) ||
-                           ( psz_artist && vlc_strcasestr( psz_artist,
psz_string ) );
+
+                // Initializing the bit vector to 0s
+                memset ( bitvector_matches, 0, i_query_split );
+
+                // First check if the words of the query are direct
substrings, and enable matches for
+                // the corresponding query words
+                for ( int i = 0; i < i_query_split; i++ )
+                {
+                    bitvector_matches[i] |=
+                            ( ( psz_title && vlc_strcasestr( psz_title,
pp_query_split[i] ) ) ||
+                            ( psz_album && vlc_strcasestr( psz_album,
pp_query_split[i] ) ) ||
+                            ( psz_artist && vlc_strcasestr( psz_artist,
pp_query_split[i] ) ) );
+                }
+
+                // Matching the query words and each word of the title,
album and artist.
+                // Uses approximate matching with the help of levenshtein
distance.
+                if ( psz_title )
+                    execute_match ( psz_title, pp_query_split,
i_query_split, bitvector_matches );
+                if ( psz_album )
+                    execute_match ( psz_album, pp_query_split,
i_query_split, bitvector_matches );
+                if ( psz_artist )
+                    execute_match ( psz_artist, pp_query_split,
i_query_split, bitvector_matches );
+
+                // Enabled if all the query words matched.
+                b_enable |= super_match ( bitvector_matches, i_query_split
);
+
             }
             else
-                b_enable = p_item->p_input->psz_name && vlc_strcasestr(
p_item->p_input->psz_name, psz_string );
+            {
+                // If there is no meta, then we just consider the name
+                const char* psz_title = p_item->p_input->psz_name;
+
+                // Initializing the bit vector to 0s
+                memset ( bitvector_matches, 0, i_query_split );
+
+                if ( psz_title )
+                    execute_match( psz_title, pp_query_split,
i_query_split, bitvector_matches );
+
+                // Enabled if all the query words matched.
+                b_enable |= super_match ( bitvector_matches, i_query_split
);
+            }
             vlc_mutex_unlock( &p_item->p_input->lock );
         }

@@ -151,6 +379,7 @@ static bool playlist_LiveSearchUpdateInternal(
playlist_item_t *p_root,

         b_match |= b_enable;
    }
+   free ( bitvector_matches );
    return b_match;
 }

-- 
1.7.9.5
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/vlc-devel/attachments/20130326/98bc8fae/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-Introducing-Intelligent-Approximate-playlist-search.patch
Type: application/octet-stream
Size: 12287 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/vlc-devel/attachments/20130326/98bc8fae/attachment.obj>


More information about the vlc-devel mailing list