[vlc-devel] commit: Perform charset detection and conversion to UTF-8 also for SDT fields. ( Marian Ďurkovič )
git version control
git at videolan.org
Fri Sep 11 08:41:14 CEST 2009
vlc | branch: 1.0-bugfix | Marian Ďurkovič <md at bts.sk> | Tue Sep 8 12:36:24 2009 +0200| [86f7516a658046c949a7704ad6643c078f425f59] | committer: Marian Ďurkovič
Perform charset detection and conversion to UTF-8 also for SDT fields.
(cherry picked from commit fdfc6ad12bad9683b7a2af5c9410812fa6d1920a)
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=86f7516a658046c949a7704ad6643c078f425f59
---
modules/demux/ts.c | 270 ++++++++++++++++++++++++++--------------------------
1 files changed, 136 insertions(+), 134 deletions(-)
diff --git a/modules/demux/ts.c b/modules/demux/ts.c
index 97de799..24be427 100644
--- a/modules/demux/ts.c
+++ b/modules/demux/ts.c
@@ -2574,6 +2574,134 @@ static void ValidateDVBMeta( demux_t *p_demux, int i_pid )
#ifdef TS_USE_DVB_SI
+/* FIXME same than dvbsi_to_utf8 from dvb access */
+static char *EITConvertToUTF8( const unsigned char *psz_instring,
+ size_t i_length )
+{
+ const char *psz_encoding;
+ char *psz_outstring;
+ char psz_encbuf[sizeof( "ISO_8859-123" )];
+ size_t i_in, i_out, offset = 1;
+ vlc_iconv_t iconv_handle;
+
+ if( i_length < 1 ) return NULL;
+ if( psz_instring[0] >= 0x20 )
+ {
+ psz_encoding = "ISO_8859-1";
+ /* According to the specification, this should be ISO6937,
+ * but it seems Latin-1 is used instead. */
+ offset = 0;
+ }
+ else switch( psz_instring[0] )
+ {
+ case 0x01:
+ psz_encoding = "ISO_8859-5";
+ break;
+ case 0x02:
+ psz_encoding = "ISO_8859-6";
+ break;
+ case 0x03:
+ psz_encoding = "ISO_8859-7";
+ break;
+ case 0x04:
+ psz_encoding = "ISO_8859-8";
+ break;
+ case 0x05:
+ psz_encoding = "ISO_8859-9";
+ break;
+ case 0x06:
+ psz_encoding = "ISO_8859-10";
+ break;
+ case 0x07:
+ psz_encoding = "ISO_8859-11";
+ break;
+ case 0x08:
+ psz_encoding = "ISO_8859-12";
+ break;
+ case 0x09:
+ psz_encoding = "ISO_8859-13";
+ break;
+ case 0x0a:
+ psz_encoding = "ISO_8859-14";
+ break;
+ case 0x0b:
+ psz_encoding = "ISO_8859-15";
+ break;
+ case 0x10:
+#warning Is Latin-10 (psz_instring[2] == 16) really illegal?
+ if( i_length < 3 || psz_instring[1] != 0x00 || psz_instring[2] > 15
+ || psz_instring[2] == 0 )
+ {
+ psz_encoding = "UTF-8";
+ offset = 0;
+ }
+ else
+ {
+ sprintf( psz_encbuf, "ISO_8859-%u", psz_instring[2] );
+ psz_encoding = psz_encbuf;
+ offset = 3;
+ }
+ break;
+ case 0x11:
+#warning Is there a BOM or do we use a fixed endianess?
+ psz_encoding = "UTF-16";
+ break;
+ case 0x12:
+ psz_encoding = "KSC5601-1987";
+ break;
+ case 0x13:
+ psz_encoding = "GB2312"; /* GB-2312-1980 */
+ break;
+ case 0x14:
+ psz_encoding = "BIG-5";
+ break;
+ case 0x15:
+ psz_encoding = "UTF-8";
+ break;
+ default:
+ /* invalid */
+ psz_encoding = "UTF-8";
+ offset = 0;
+ }
+
+ i_in = i_length - offset;
+ i_out = i_in * 6 + 1;
+
+ psz_outstring = malloc( i_out );
+ if( !psz_outstring )
+ {
+ return NULL;
+ }
+
+ iconv_handle = vlc_iconv_open( "UTF-8", psz_encoding );
+ if( iconv_handle == (vlc_iconv_t)(-1) )
+ {
+ /* Invalid character set (e.g. ISO_8859-12) */
+ memcpy( psz_outstring, &psz_instring[offset], i_in );
+ psz_outstring[i_in] = '\0';
+ EnsureUTF8( psz_outstring );
+ }
+ else
+ {
+ const char *psz_in = (const char *)&psz_instring[offset];
+ char *psz_out = psz_outstring;
+
+ while( vlc_iconv( iconv_handle, &psz_in, &i_in,
+ &psz_out, &i_out ) == (size_t)(-1) )
+ {
+ /* skip naughty byte. This may fail terribly for multibyte stuff,
+ * but what can we do anyway? */
+ psz_in++;
+ i_in--;
+ vlc_iconv( iconv_handle, NULL, NULL, NULL, NULL ); /* reset */
+ }
+ vlc_iconv_close( iconv_handle );
+
+ *psz_out = '\0';
+ }
+ return psz_outstring;
+}
+
static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
{
demux_sys_t *p_sys = p_demux->p_sys;
@@ -2637,14 +2765,13 @@ static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
"DVB MHP service"
};
dvbpsi_service_dr_t *pD = dvbpsi_DecodeServiceDr( p_dr );
- char str1[257];
- char str2[257];
+ char *str1 = NULL;
+ char *str2 = NULL;
- memcpy( str1, pD->i_service_provider_name,
- pD->i_service_provider_name_length );
- str1[pD->i_service_provider_name_length] = '\0';
- memcpy( str2, pD->i_service_name, pD->i_service_name_length );
- str2[pD->i_service_name_length] = '\0';
+ str1 = EITConvertToUTF8(pD->i_service_provider_name,
+ pD->i_service_provider_name_length);
+ str2 = EITConvertToUTF8(pD->i_service_name,
+ pD->i_service_name_length);
msg_Dbg( p_demux, " - type=%d provider=%s name=%s",
pD->i_service_type, str1, str2 );
@@ -2653,6 +2780,8 @@ static void SDTCallBack( demux_t *p_demux, dvbpsi_sdt_t *p_sdt )
vlc_meta_SetPublisher( p_meta, str1 );
if( pD->i_service_type >= 0x01 && pD->i_service_type <= 0x10 )
psz_type = ppsz_type[pD->i_service_type];
+ free( str1 );
+ free( str2 );
}
}
@@ -2742,133 +2871,6 @@ static int EITConvertDuration( uint32_t i_duration )
}
#undef CVT_FROM_BCD
-/* FIXME same than dvbsi_to_utf8 from dvb access */
-static char *EITConvertToUTF8( const unsigned char *psz_instring,
- size_t i_length )
-{
- const char *psz_encoding;
- char *psz_outstring;
- char psz_encbuf[sizeof( "ISO_8859-123" )];
- size_t i_in, i_out, offset = 1;
- vlc_iconv_t iconv_handle;
-
- if( i_length < 1 ) return NULL;
- if( psz_instring[0] >= 0x20 )
- {
- psz_encoding = "ISO_8859-1";
- /* According to the specification, this should be ISO6937,
- * but it seems Latin-1 is used instead. */
- offset = 0;
- }
- else switch( psz_instring[0] )
- {
- case 0x01:
- psz_encoding = "ISO_8859-5";
- break;
- case 0x02:
- psz_encoding = "ISO_8859-6";
- break;
- case 0x03:
- psz_encoding = "ISO_8859-7";
- break;
- case 0x04:
- psz_encoding = "ISO_8859-8";
- break;
- case 0x05:
- psz_encoding = "ISO_8859-9";
- break;
- case 0x06:
- psz_encoding = "ISO_8859-10";
- break;
- case 0x07:
- psz_encoding = "ISO_8859-11";
- break;
- case 0x08:
- psz_encoding = "ISO_8859-12";
- break;
- case 0x09:
- psz_encoding = "ISO_8859-13";
- break;
- case 0x0a:
- psz_encoding = "ISO_8859-14";
- break;
- case 0x0b:
- psz_encoding = "ISO_8859-15";
- break;
- case 0x10:
-#warning Is Latin-10 (psz_instring[2] == 16) really illegal?
- if( i_length < 3 || psz_instring[1] != 0x00 || psz_instring[2] > 15
- || psz_instring[2] == 0 )
- {
- psz_encoding = "UTF-8";
- offset = 0;
- }
- else
- {
- sprintf( psz_encbuf, "ISO_8859-%u", psz_instring[2] );
- psz_encoding = psz_encbuf;
- offset = 3;
- }
- break;
- case 0x11:
-#warning Is there a BOM or do we use a fixed endianess?
- psz_encoding = "UTF-16";
- break;
- case 0x12:
- psz_encoding = "KSC5601-1987";
- break;
- case 0x13:
- psz_encoding = "GB2312"; /* GB-2312-1980 */
- break;
- case 0x14:
- psz_encoding = "BIG-5";
- break;
- case 0x15:
- psz_encoding = "UTF-8";
- break;
- default:
- /* invalid */
- psz_encoding = "UTF-8";
- offset = 0;
- }
-
- i_in = i_length - offset;
- i_out = i_in * 6 + 1;
-
- psz_outstring = malloc( i_out );
- if( !psz_outstring )
- {
- return NULL;
- }
-
- iconv_handle = vlc_iconv_open( "UTF-8", psz_encoding );
- if( iconv_handle == (vlc_iconv_t)(-1) )
- {
- /* Invalid character set (e.g. ISO_8859-12) */
- memcpy( psz_outstring, &psz_instring[offset], i_in );
- psz_outstring[i_in] = '\0';
- EnsureUTF8( psz_outstring );
- }
- else
- {
- const char *psz_in = (const char *)&psz_instring[offset];
- char *psz_out = psz_outstring;
-
- while( vlc_iconv( iconv_handle, &psz_in, &i_in,
- &psz_out, &i_out ) == (size_t)(-1) )
- {
- /* skip naughty byte. This may fail terribly for multibyte stuff,
- * but what can we do anyway? */
- psz_in++;
- i_in--;
- vlc_iconv( iconv_handle, NULL, NULL, NULL, NULL ); /* reset */
- }
- vlc_iconv_close( iconv_handle );
-
- *psz_out = '\0';
- }
- return psz_outstring;
-}
static void EITCallBack( demux_t *p_demux, dvbpsi_eit_t *p_eit )
{
More information about the vlc-devel mailing list