[vlc-devel] [PATCH] Introducing Intelligent Approximate playlist search
Abhiram Ravi
abhiram.ravi.s at gmail.com
Tue Mar 26 14:27:29 CET 2013
(Sorry, it was a bad mistake.)
Fixed it! Now works for any UTF-8 string.
---
src/playlist/search.c | 239
+++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 234 insertions(+), 5 deletions(-)
diff --git a/src/playlist/search.c b/src/playlist/search.c
index 0c711ca..f2e5ed4 100644
--- a/src/playlist/search.c
+++ b/src/playlist/search.c
@@ -5,6 +5,7 @@
* $Id$
*
* Authors: Clément Stenac <zorglub at videolan.org>
+ * Abhiram Ravi <abhiram.ravi.s at gmail.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
@@ -29,6 +30,109 @@
#include <vlc_playlist.h>
#include <vlc_charset.h>
#include "playlist_internal.h"
+#include <string.h>
+#include <limits.h>
+#include <wctype.h>
+
+//The delimiter string for splitting meta and query strings
+#define REG_SPLIT N_("()[ ,.-]_")
+//Maximum allowed error in Levenshtein Distance
+#define MAX_ERROR 1
+//TODO: Add a cmdline param/opt to GUI and allow user to set MAX_ERROR
+
+/***************************************************************************
+ * Levenshtein Distance between two strings
+
***************************************************************************/
+/**
+ * Converts a UTF-8 string into lowercase, The function handles multi-byte
+ * codepoints correctly, and puts them in an array of uint32_t's.
+ * @param psz_string : The original string
+ * @param pszw_output : The output array into which the codepoints are
written.
+ * @return : The length of the codepoint array
+ * XXX - pszw_output must be already malloc'ed
+ */
+static int to_lowercase ( const char* psz_string, uint32_t* pszw_output )
+{
+ ssize_t s;
+ size_t len = 0;
+ uint32_t cph;
+ const char *p = psz_string;
+ for ( ; *p; )
+ {
+ s = vlc_towc ( p, &cph );
+ if ( s <= 0 )
+ {
+ s = vlc_towc ( p, &(uint32_t) { 0 });
+ p += s;
+ continue;
+ }
+ *pszw_output = towlower ( cph );
+ pszw_output ++;
+ p += s;
+ len ++;
+ }
+ return len;
+}
+/**
+ * A dynamic programming algorithm to
+ * calculate the levenshtein distance between any
+ * two strings ( used for matching query and playlist items )
+ * @param psz_str_1: The first string
+ * @param psz_str_2: The second string
+ * @return the edit distance between the two strings
+ */
+static int levenshtein_distance ( const char* psz_str_1, const char*
psz_str_2 )
+{
+ // Convert the strings to lowercase
+ uint32_t* psz_wide_1 = xmalloc ( strlen ( psz_str_1 ) * sizeof (
uint32_t ) );
+ uint32_t* psz_wide_2 = xmalloc ( strlen ( psz_str_2 ) * sizeof (
uint32_t ) );
+ size_t l_1 = to_lowercase( psz_str_1, psz_wide_1 );
+ size_t l_2 = to_lowercase( psz_str_2, psz_wide_2 );
+
+ uint32_t char_1, char_2;
+ int i_cost;
+
+ if ( l_1 == 0 )
+ return l_2;
+ if ( l_2 == 0 )
+ return l_1;
+
+ //A flat 2D array
+ int* iarray_distance = xmalloc ( ( l_1 + 1 ) * ( l_2 + 1) * sizeof (
int ) );
+ int i_col = l_2 + 1;
+ for ( size_t i = 0; i <= l_1; i++ )
+ iarray_distance [i * i_col] = i;
+ for ( size_t i = 0; i <= l_2; i++ )
+ iarray_distance [i] = i;
+ for ( size_t i = 1; i <= l_1; i++ )
+ {
+ char_1 = psz_wide_1 [i - 1];
+ for ( size_t j = 1; j <= l_2; j++ )
+ {
+ char_2 = psz_wide_2 [j - 1];
+ if ( char_1 == char_2 )
+ i_cost = 0;
+ else
+ i_cost = 1;
+
+ iarray_distance [i * i_col + j] = INT_MAX;
+ if ( iarray_distance [(i - 1) * i_col + j] + 1 <
iarray_distance [i * i_col + j] )
+ iarray_distance [i * i_col + j] = iarray_distance [(i - 1)
* i_col + j] + 1;
+ if ( iarray_distance [i * i_col + (j - 1)] + 1 <
iarray_distance [i * i_col + j] )
+ iarray_distance [i * i_col + j] = iarray_distance [i *
i_col +(j - 1)] + 1;
+ if ( iarray_distance [(i - 1) * i_col + (j - 1)] + i_cost <
iarray_distance [i * i_col + j] )
+ iarray_distance [i * i_col + j] = iarray_distance [(i - 1)
* i_col + (j - 1)] + i_cost;
+ }
+ }
+ int answer = iarray_distance [l_1 * i_col + l_2];
+
+ // Freeing before returning the answer
+ free ( psz_wide_1 );
+ free ( psz_wide_2 );
+ free ( iarray_distance );
+
+ return answer;
+}
/***************************************************************************
* Item search functions
@@ -85,6 +189,83 @@ playlist_item_t* playlist_ItemGetByInput( playlist_t *
p_playlist,
* Live search handling
***************************************************************************/
+/*
+ * Split a given sentence based on a string of characters to use for the
split
+ * \param ppp_split_destination - the array of words obtained after
splitting
+ * \param psz_sentence - the input string
+ * \param psz_reg_split - the string of characters to use for the split
+ * @return - the number of words in the sentence
+ * */
+static int split_into_words ( char*** ppp_split_destination, const char*
psz_sentence, const char* psz_reg_split )
+{
+ // Making a copy to work with strtok
+ char* psz_sentence_copy = xstrdup ( psz_sentence );
+ const char* psz_buf;
+
+ // Allocating space for the required number of words
+ char** pp_sentence_split = ( char ** ) xmalloc ( strlen (
psz_sentence_copy ) * sizeof ( char* ) );
+
+ char* p_saveptr;
+
+ psz_buf = strtok_r ( psz_sentence_copy, psz_reg_split, &p_saveptr );
+
+ int i_count = 0;
+
+ // Running strtok and copying all the words obtained to a new split
string array
+ while ( psz_buf != NULL )
+ {
+ pp_sentence_split [i_count] = xstrdup ( psz_buf );
+ psz_buf = strtok_r ( NULL, psz_reg_split, &p_saveptr );
+ i_count++;
+ }
+ *ppp_split_destination = pp_sentence_split;
+ return i_count;
+
+}
+
+/*
+ * Match a given string with a set of query words.
+ * \param psz_info - the string that needs to be matched with
+ * \param pp_query_split - the array of query words
+ * \param i_query_split - number of query words
+ * \param bitvector_matches - to keep track of matches
+ * */
+static void execute_match ( const char* psz_info, char** pp_query_split,
int i_query_split, bool* bitvector_matches )
+{
+ char** pp_info_split = NULL;
+ int i_info_split = split_into_words( &pp_info_split, psz_info,
REG_SPLIT );
+
+ for ( int i = 0; i < i_query_split; i++ )
+ {
+ for ( int j = 0; j < i_info_split; j++ )
+ {
+ bitvector_matches[i] |= ( pp_info_split[j] &&
levenshtein_distance( pp_info_split[j], pp_query_split[i] ) <= MAX_ERROR ) ;
+ }
+ }
+
+ for ( int i = 0; i < i_info_split; i++ )
+ {
+ free ( pp_info_split[i] );
+ }
+ free ( pp_info_split );
+}
+
+/*
+ * Checks if a boolean array contains all 1's
+ * \param bitvector_matches - the boolean array
+ * \param len - length of the array
+ * @return - true if all bits are 1.
+ * - false otherwise
+ * */
+static bool super_match ( bool* bitvector_matches, size_t len )
+{
+ bool b_super_match = 1;
+ for ( size_t i = 0; i < len; i++ )
+ {
+ b_super_match &= bitvector_matches[i];
+ }
+ return b_super_match;
+}
/**
* Enable all items in the playlist
* @param p_root: the current root item
@@ -103,6 +284,9 @@ static void playlist_LiveSearchClean( playlist_item_t
*p_root )
/**
* Enable/Disable items in the playlist according to the search argument
+ * Every string is split into words (split based on REG_SPLIT) and an item
is
+ * enabled only if all the words in the query string have found a
+ * match(approximated by MAX_ERROR) in the metadata.
* @param p_root: the current root item
* @param psz_string: the string to search
* @return true if an item match
@@ -111,12 +295,20 @@ static bool playlist_LiveSearchUpdateInternal(
playlist_item_t *p_root,
const char *psz_string,
bool b_recursive )
{
int i;
+ // Splitting the query words into strings for separate word-matching
+ char** pp_query_split = NULL;
+ int i_query_split = split_into_words( &pp_query_split, psz_string,
REG_SPLIT );
+
+ // The bit vector of size = number of words in query string - used to
ensure that ALL the
+ // words in the query string match before enabling the item in the
playlist
+ bool* bitvector_matches = xmalloc ( i_query_split * sizeof ( bool ) );
+
bool b_match = false;
for( i = 0 ; i < p_root->i_children ; i ++ )
{
bool b_enable = false;
playlist_item_t *p_item = p_root->pp_children[i];
- // Go recurssively if their is some children
+ // Go recursively if there are some children
if( b_recursive && p_item->i_children >= 0 &&
playlist_LiveSearchUpdateInternal( p_item, psz_string, true ) )
{
@@ -133,14 +325,50 @@ static bool playlist_LiveSearchUpdateInternal(
playlist_item_t *p_root,
const char *psz_title = vlc_meta_Get(
p_item->p_input->p_meta, vlc_meta_Title );
if( !psz_title )
psz_title = p_item->p_input->psz_name;
+
const char *psz_album = vlc_meta_Get(
p_item->p_input->p_meta, vlc_meta_Album );
const char *psz_artist = vlc_meta_Get(
p_item->p_input->p_meta, vlc_meta_Artist );
- b_enable = ( psz_title && vlc_strcasestr( psz_title,
psz_string ) ) ||
- ( psz_album && vlc_strcasestr( psz_album,
psz_string ) ) ||
- ( psz_artist && vlc_strcasestr( psz_artist,
psz_string ) );
+
+ // Initializing the bit vector to 0s
+ memset ( bitvector_matches, 0, i_query_split );
+
+ // First check if the words of the query are direct
substrings, and enable matches for
+ // the corresponding query words
+ for ( int i = 0; i < i_query_split; i++ )
+ {
+ bitvector_matches[i] |=
+ ( ( psz_title && vlc_strcasestr( psz_title,
pp_query_split[i] ) ) ||
+ ( psz_album && vlc_strcasestr( psz_album,
pp_query_split[i] ) ) ||
+ ( psz_artist && vlc_strcasestr( psz_artist,
pp_query_split[i] ) ) );
+ }
+
+ // Matching the query words and each word of the title,
album and artist.
+ // Uses approximate matching with the help of levenshtein
distance.
+ if ( psz_title )
+ execute_match ( psz_title, pp_query_split,
i_query_split, bitvector_matches );
+ if ( psz_album )
+ execute_match ( psz_album, pp_query_split,
i_query_split, bitvector_matches );
+ if ( psz_artist )
+ execute_match ( psz_artist, pp_query_split,
i_query_split, bitvector_matches );
+
+ // Enabled if all the query words matched.
+ b_enable |= super_match ( bitvector_matches, i_query_split
);
+
}
else
- b_enable = p_item->p_input->psz_name && vlc_strcasestr(
p_item->p_input->psz_name, psz_string );
+ {
+ // If there is no meta, then we just consider the name
+ const char* psz_title = p_item->p_input->psz_name;
+
+ // Initializing the bit vector to 0s
+ memset ( bitvector_matches, 0, i_query_split );
+
+ if ( psz_title )
+ execute_match( psz_title, pp_query_split,
i_query_split, bitvector_matches );
+
+ // Enabled if all the query words matched.
+ b_enable |= super_match ( bitvector_matches, i_query_split
);
+ }
vlc_mutex_unlock( &p_item->p_input->lock );
}
@@ -151,6 +379,7 @@ static bool playlist_LiveSearchUpdateInternal(
playlist_item_t *p_root,
b_match |= b_enable;
}
+ free ( bitvector_matches );
return b_match;
}
--
1.7.9.5
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/vlc-devel/attachments/20130326/98bc8fae/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-Introducing-Intelligent-Approximate-playlist-search.patch
Type: application/octet-stream
Size: 12287 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/vlc-devel/attachments/20130326/98bc8fae/attachment.obj>
More information about the vlc-devel
mailing list