00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <string.h>
00021 #include <libaudcore/audstrings.h>
00022
00023 #include "audconfig.h"
00024 #include "config.h"
00025 #include "i18n.h"
00026 #include "debug.h"
00027
00028 #ifdef USE_CHARDET
00029 # include <libguess.h>
00030 #endif
00031
00032 static gchar * cd_chardet_to_utf8 (const gchar * str, gssize len,
00033 gsize * arg_bytes_read, gsize * arg_bytes_write, GError ** error);
00034
00035 static gchar * str_to_utf8_fallback (const gchar * str)
00036 {
00037 gchar * out = g_strconcat (str, _(" (invalid UTF-8)"), NULL);
00038
00039 for (gchar * c = out; * c; c ++)
00040 {
00041 if (* c & 0x80)
00042 * c = '?';
00043 }
00044
00045 return out;
00046 }
00047
00048 static gchar * cd_str_to_utf8 (const gchar * str)
00049 {
00050 gchar *out_str;
00051
00052 if (str == NULL)
00053 return NULL;
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078 #ifdef USE_CHARDET
00079 if (libguess_validate_utf8(str, strlen(str)))
00080 return g_strdup(str);
00081 #else
00082 if (g_utf8_validate(str, strlen(str), NULL))
00083 return g_strdup(str);
00084 #endif
00085
00086
00087 if ((out_str = cd_chardet_to_utf8(str, strlen(str), NULL, NULL, NULL)) != NULL)
00088 return out_str;
00089
00090
00091 return str_to_utf8_fallback(str);
00092 }
00093
00094 static gchar * cd_chardet_to_utf8 (const gchar * str, gssize len,
00095 gsize * arg_bytes_read, gsize * arg_bytes_write, GError ** error)
00096 {
00097 if (error)
00098 * error = NULL;
00099
00100 #ifdef USE_CHARDET
00101 gchar *det = NULL, *encoding = NULL;
00102 #endif
00103 gchar *ret = NULL;
00104 gsize *bytes_read, *bytes_write;
00105 gsize my_bytes_read, my_bytes_write;
00106
00107 bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read;
00108 bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write;
00109
00110 g_return_val_if_fail(str != NULL, NULL);
00111
00112 #ifdef USE_CHARDET
00113 if (libguess_validate_utf8(str, len))
00114 #else
00115 if (g_utf8_validate(str, len, NULL))
00116 #endif
00117 {
00118 if (len < 0)
00119 len = strlen (str);
00120
00121 ret = g_malloc (len + 1);
00122 memcpy (ret, str, len);
00123 ret[len] = 0;
00124
00125 if (arg_bytes_read != NULL)
00126 * arg_bytes_read = len;
00127 if (arg_bytes_write != NULL)
00128 * arg_bytes_write = len;
00129
00130 return ret;
00131 }
00132 #ifdef USE_CHARDET
00133 if (cfg.chardet_detector)
00134 det = cfg.chardet_detector;
00135
00136 if (det)
00137 {
00138 AUDDBG("guess encoding (%s) %s\n", det, str);
00139 encoding = (gchar *) libguess_determine_encoding(str, len, det);
00140 AUDDBG("encoding = %s\n", encoding);
00141 if (encoding == NULL)
00142 goto fallback;
00143
00144 ret = g_convert (str, len, "UTF-8", encoding, bytes_read, bytes_write,
00145 (error && * error) ? NULL : error);
00146 }
00147
00148 fallback:
00149 #endif
00150
00151
00152 if (ret == NULL && cfg.chardet_fallback_s != NULL)
00153 {
00154 gchar **enc;
00155 for (enc = cfg.chardet_fallback_s; *enc != NULL; enc++)
00156 {
00157 ret = g_convert (str, len, "UTF-8", * enc, bytes_read, bytes_write,
00158 (error && * error) ? NULL : error);
00159 if (len == *bytes_read)
00160 break;
00161 else {
00162 g_free(ret);
00163 ret = NULL;
00164 }
00165 }
00166 }
00167
00168
00169 if (ret == NULL)
00170 ret = g_locale_to_utf8 (str, len, bytes_read, bytes_write,
00171 (error && * error) ? NULL : error);
00172
00173
00174 if (ret == NULL)
00175 ret = g_convert (str, len, "UTF-8", "ISO-8859-1", bytes_read,
00176 bytes_write, (error && * error) ? NULL : error);
00177
00178 if (ret != NULL)
00179 {
00180 if (g_utf8_validate(ret, -1, NULL))
00181 return ret;
00182 else
00183 {
00184 g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
00185 g_free(ret);
00186 return NULL;
00187 }
00188 }
00189
00190 return NULL;
00191 }
00192
00193 void chardet_init (void)
00194 {
00195 str_set_utf8_impl (cd_str_to_utf8, cd_chardet_to_utf8);
00196 }