[vlc-devel] commit: Perform charset detection and conversion to UTF-8 also for SDT fields. ( Marian Ďurkovič )

git version control git at videolan.org
Tue Sep 8 12:37:02 CEST 2009


vlc | branch: master | Marian Ďurkovič <md at bts.sk> | Tue Sep  8 12:36:24 2009 +0200| [fdfc6ad12bad9683b7a2af5c9410812fa6d1920a] | committer: Marian Ďurkovič 

Perform charset detection and conversion to UTF-8 also for SDT fields.

> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=fdfc6ad12bad9683b7a2af5c9410812fa6d1920a
---

 modules/demux/ts.c |  270 ++++++++++++++++++++++++++--------------------------
 1 files changed, 136 insertions(+), 134 deletions(-)

diff --git a/modules/demux/ts.c b/modules/demux/ts.c
index 81fc879..5d4f945 100644
--- a/modules/demux/ts.c
+++ b/modules/demux/ts.c
@@ -2571,6 +2571,134 @@ static void ValidateDVBMeta( demux_t *p_demux, int i_pid )
 
 
 #ifdef TS_USE_DVB_SI
+/* DVB SI string -> malloc()ed UTF-8, NULL if empty/OOM. FIXME: same as dvbsi_to_utf8 from dvb access */
+static char *EITConvertToUTF8( const unsigned char *psz_instring,
+                               size_t i_length )
+{
+    const char *psz_encoding;
+    char *psz_outstring;
+    char psz_encbuf[sizeof( "ISO_8859-123" )];
+    size_t i_in, i_out, offset = 1; /* offset: leading selector bytes to skip */
+    vlc_iconv_t iconv_handle;
+
+    if( i_length < 1 ) return NULL;
+    if( psz_instring[0] >= 0x20 )
+    {
+        psz_encoding = "ISO_8859-1";
+        /* No selector byte here. According to the specification, this should
+         * be ISO6937, but it seems Latin-1 is used instead. */
+        offset = 0;
+    }
+    else switch( psz_instring[0] )
+    {
+    case 0x01:
+        psz_encoding = "ISO_8859-5";
+        break;
+    case 0x02:
+        psz_encoding = "ISO_8859-6";
+        break;
+    case 0x03:
+        psz_encoding = "ISO_8859-7";
+        break;
+    case 0x04:
+        psz_encoding = "ISO_8859-8";
+        break;
+    case 0x05:
+        psz_encoding = "ISO_8859-9";
+        break;
+    case 0x06:
+        psz_encoding = "ISO_8859-10";
+        break;
+    case 0x07:
+        psz_encoding = "ISO_8859-11";
+        break;
+    case 0x08:
+        psz_encoding = "ISO_8859-12";
+        break;
+    case 0x09:
+        psz_encoding = "ISO_8859-13";
+        break;
+    case 0x0a:
+        psz_encoding = "ISO_8859-14";
+        break;
+    case 0x0b:
+        psz_encoding = "ISO_8859-15";
+        break;
+    case 0x10:
+#warning Is Latin-10 (psz_instring[2] == 16) really illegal?
+        if( i_length < 3 || psz_instring[1] != 0x00 || psz_instring[2] > 15
+         || psz_instring[2] == 0 )
+        {
+            psz_encoding = "UTF-8";
+            offset = 0;
+        }
+        else
+        {
+            sprintf( psz_encbuf, "ISO_8859-%u", psz_instring[2] );
+            psz_encoding = psz_encbuf;
+            offset = 3;
+        }
+        break;
+    case 0x11:
+#warning Is there a BOM or do we use a fixed endianess?
+        psz_encoding = "UTF-16";
+        break;
+    case 0x12:
+        psz_encoding = "KSC5601-1987";
+        break;
+    case 0x13:
+        psz_encoding = "GB2312"; /* GB-2312-1980 */
+        break;
+    case 0x14:
+        psz_encoding = "BIG-5";
+        break;
+    case 0x15:
+        psz_encoding = "UTF-8";
+        break;
+    default:
+        /* invalid selector byte: treat the whole string as UTF-8 */
+        psz_encoding = "UTF-8";
+        offset = 0;
+    }
+
+    i_in = i_length - offset;
+    i_out = i_in * 6 + 1; /* 6 output bytes per input byte + terminator */
+
+    psz_outstring = malloc( i_out );
+    if( !psz_outstring )
+    {
+        return NULL;
+    }
+
+    iconv_handle = vlc_iconv_open( "UTF-8", psz_encoding );
+    if( iconv_handle == (vlc_iconv_t)(-1) )
+    {
+         /* Invalid character set (e.g. ISO_8859-12): copy bytes as-is */
+         memcpy( psz_outstring, &psz_instring[offset], i_in );
+         psz_outstring[i_in] = '\0';
+         EnsureUTF8( psz_outstring );
+    }
+    else
+    {
+        const char *psz_in = (const char *)&psz_instring[offset];
+        char *psz_out = psz_outstring;
+
+        while( vlc_iconv( iconv_handle, &psz_in, &i_in,
+                          &psz_out, &i_out ) == (size_t)(-1) )
+        {
+            /* Conversion failed: skip the offending byte and retry. This may
+             * fail terribly for multibyte charsets, but what can we do? */
+            psz_in++;
+            i_in--;
+            vlc_iconv( iconv_handle, NULL, NULL, NULL, NULL ); /* reset */
+        }
+        vlc_iconv_close( iconv_handle );
+
+        *psz_out = '\0';
+    }
+    return psz_outstring;
+}
+
 static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
 {
     demux_sys_t          *p_sys = p_demux->p_sys;
@@ -2634,14 +2762,13 @@ static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
                     "DVB MHP service"
                 };
                 dvbpsi_service_dr_t *pD = dvbpsi_DecodeServiceDr( p_dr );
-                char str1[257];
-                char str2[257];
+                char *str1 = NULL;
+                char *str2 = NULL;
 
-                memcpy( str1, pD->i_service_provider_name,
-                        pD->i_service_provider_name_length );
-                str1[pD->i_service_provider_name_length] = '\0';
-                memcpy( str2, pD->i_service_name, pD->i_service_name_length );
-                str2[pD->i_service_name_length] = '\0';
+                str1 = EITConvertToUTF8(pD->i_service_provider_name,
+                                        pD->i_service_provider_name_length);
+                str2 = EITConvertToUTF8(pD->i_service_name,
+                                        pD->i_service_name_length);
 
                 msg_Dbg( p_demux, "    - type=%d provider=%s name=%s",
                          pD->i_service_type, str1, str2 );
@@ -2650,6 +2777,8 @@ static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
                 vlc_meta_SetPublisher( p_meta, str1 );
                 if( pD->i_service_type >= 0x01 && pD->i_service_type <= 0x10 )
                     psz_type = ppsz_type[pD->i_service_type];
+                free( str1 );
+                free( str2 );
             }
         }
 
@@ -2739,133 +2868,6 @@ static int EITConvertDuration( uint32_t i_duration )
 }
 #undef CVT_FROM_BCD
 
-/* FIXME same than dvbsi_to_utf8 from dvb access */
-static char *EITConvertToUTF8( const unsigned char *psz_instring,
-                               size_t i_length )
-{
-    const char *psz_encoding;
-    char *psz_outstring;
-    char psz_encbuf[sizeof( "ISO_8859-123" )];
-    size_t i_in, i_out, offset = 1;
-    vlc_iconv_t iconv_handle;
-
-    if( i_length < 1 ) return NULL;
-    if( psz_instring[0] >= 0x20 )
-    {
-        psz_encoding = "ISO_8859-1";
-        /* According to the specification, this should be ISO6937,
-         * but it seems Latin-1 is used instead. */
-        offset = 0;
-    }
-    else switch( psz_instring[0] )
-    {
-    case 0x01:
-        psz_encoding = "ISO_8859-5";
-        break;
-    case 0x02:
-        psz_encoding = "ISO_8859-6";
-        break;
-    case 0x03:
-        psz_encoding = "ISO_8859-7";
-        break;
-    case 0x04:
-        psz_encoding = "ISO_8859-8";
-        break;
-    case 0x05:
-        psz_encoding = "ISO_8859-9";
-        break;
-    case 0x06:
-        psz_encoding = "ISO_8859-10";
-        break;
-    case 0x07:
-        psz_encoding = "ISO_8859-11";
-        break;
-    case 0x08:
-        psz_encoding = "ISO_8859-12";
-        break;
-    case 0x09:
-        psz_encoding = "ISO_8859-13";
-        break;
-    case 0x0a:
-        psz_encoding = "ISO_8859-14";
-        break;
-    case 0x0b:
-        psz_encoding = "ISO_8859-15";
-        break;
-    case 0x10:
-#warning Is Latin-10 (psz_instring[2] == 16) really illegal?
-        if( i_length < 3 || psz_instring[1] != 0x00 || psz_instring[2] > 15
-         || psz_instring[2] == 0 )
-        {
-            psz_encoding = "UTF-8";
-            offset = 0;
-        }
-        else
-        {
-            sprintf( psz_encbuf, "ISO_8859-%u", psz_instring[2] );
-            psz_encoding = psz_encbuf;
-            offset = 3;
-        }
-        break;
-    case 0x11:
-#warning Is there a BOM or do we use a fixed endianess?
-        psz_encoding = "UTF-16";
-        break;
-    case 0x12:
-        psz_encoding = "KSC5601-1987";
-        break;
-    case 0x13:
-        psz_encoding = "GB2312"; /* GB-2312-1980 */
-        break;
-    case 0x14:
-        psz_encoding = "BIG-5";
-        break;
-    case 0x15:
-        psz_encoding = "UTF-8";
-        break;
-    default:
-        /* invalid */
-        psz_encoding = "UTF-8";
-        offset = 0;
-    }
-
-    i_in = i_length - offset;
-    i_out = i_in * 6 + 1;
-
-    psz_outstring = malloc( i_out );
-    if( !psz_outstring )
-    {
-        return NULL;
-    }
-
-    iconv_handle = vlc_iconv_open( "UTF-8", psz_encoding );
-    if( iconv_handle == (vlc_iconv_t)(-1) )
-    {
-         /* Invalid character set (e.g. ISO_8859-12) */
-         memcpy( psz_outstring, &psz_instring[offset], i_in );
-         psz_outstring[i_in] = '\0';
-         EnsureUTF8( psz_outstring );
-    }
-    else
-    {
-        const char *psz_in = (const char *)&psz_instring[offset];
-        char *psz_out = psz_outstring;
-
-        while( vlc_iconv( iconv_handle, &psz_in, &i_in,
-                          &psz_out, &i_out ) == (size_t)(-1) )
-        {
-            /* skip naughty byte. This may fail terribly for multibyte stuff,
-             * but what can we do anyway? */
-            psz_in++;
-            i_in--;
-            vlc_iconv( iconv_handle, NULL, NULL, NULL, NULL ); /* reset */
-        }
-        vlc_iconv_close( iconv_handle );
-
-        *psz_out = '\0';
-    }
-    return psz_outstring;
-}
 
 static void EITCallBack( demux_t *p_demux,
                          dvbpsi_eit_t *p_eit, bool b_current_following )




More information about the vlc-devel mailing list