00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #include "encodingdetector.h"
00029
00030 #undef DECODE_DEBUG
00031
00032
// Buffer size limit of 16 KiB (its consumer is outside this part of the file).
// The expansion is parenthesised so that the macro behaves as a single value
// inside larger expressions such as `x % MAX_BUFFER` or `x / MAX_BUFFER`.
#define MAX_BUFFER (16*1024)
00034
00035 #include <assert.h>
00036 #include <stdlib.h>
00037
00038 #include "encodingdetector_ja_p.h"
00039
00040 #include <tqregexp.h>
00041 #include <tqtextcodec.h>
00042
00043 #include <kglobal.h>
00044 #include <kcharsets.h>
00045 #include <kdebug.h>
00046 #include <klocale.h>
00047
00048 #include <ctype.h>
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065 typedef struct _PangoScriptForLang {
00066 const char lang[6];
00067 EncodingDetector::AutoDetectScript scripts[1];
00068 } PangoScriptForLang;
00069
00070
00071
00072
00073
00074
00075
// Aliases that let the language table keep Pango-style script names while
// resolving to EncodingDetector::AutoDetectScript values.
// First group: scripts that resolve to EncodingDetector::None.
00076 #define PANGO_SCRIPT_ARMENIAN EncodingDetector::None
00077 #define PANGO_SCRIPT_BENGALI EncodingDetector::None
00078 #define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None
00079 #define PANGO_SCRIPT_CHEROKEE EncodingDetector::None
00080 #define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None
00081 #define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None
00082 #define PANGO_SCRIPT_GUJARATI EncodingDetector::None
00083 #define PANGO_SCRIPT_GURMUKHI EncodingDetector::None
00084 #define PANGO_SCRIPT_KANNADA EncodingDetector::None
00085 #define PANGO_SCRIPT_KHMER EncodingDetector::None
00086 #define PANGO_SCRIPT_LAO EncodingDetector::None
00087 #define PANGO_SCRIPT_MALAYALAM EncodingDetector::None
00088 #define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None
00089 #define PANGO_SCRIPT_MYANMAR EncodingDetector::None
00090 #define PANGO_SCRIPT_ORIYA EncodingDetector::None
00091 #define PANGO_SCRIPT_SINHALA EncodingDetector::None
00092 #define PANGO_SCRIPT_SYRIAC EncodingDetector::None
00093 #define PANGO_SCRIPT_TAGALOG EncodingDetector::None
00094 #define PANGO_SCRIPT_TAMIL EncodingDetector::None
00095 #define PANGO_SCRIPT_TIBETAN EncodingDetector::None
00096 #define PANGO_SCRIPT_TELUGU EncodingDetector::None
00097
00098 // Second group: scripts that resolve to a concrete AutoDetectScript value.
00099 #define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic
00100 #define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic
00101 #define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope // NOTE(review): Georgian is filed under SouthEasternEurope - confirm this is intended
00102 #define PANGO_SCRIPT_GREEK EncodingDetector::Greek
00103 #define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew
00104 #define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean
00105 #define PANGO_SCRIPT_THAI EncodingDetector::Thai
00106
00107
// Language code -> detection script table.
// Each row pairs a lower-case language code (optionally with a region suffix,
// e.g. "zh-tw") with the AutoDetectScript to use for that language. Rows whose
// script macro resolves to EncodingDetector::None carry no usable script.
// The table ends with a sentinel row whose code is empty and whose script is None.
// NOTE(review): the lookup code is outside this view. If it matches by prefix,
// row order matters, and the rows are not consistently longest-code-first
// (e.g. "bh" precedes "bho", "ko" precedes "kok") - confirm against the caller.
00108 static const PangoScriptForLang pango_script_for_lang[] = {
00109 { "aa", { PANGO_SCRIPT_LATIN } },
00110 { "ab", { PANGO_SCRIPT_CYRILLIC } },
00111 { "af", { PANGO_SCRIPT_LATIN } },
00112 { "am", { PANGO_SCRIPT_ETHIOPIC } },
00113 { "ar", { PANGO_SCRIPT_ARABIC } },
00114 { "as", { PANGO_SCRIPT_BENGALI } },
00115 { "ast", { PANGO_SCRIPT_LATIN } },
00116 { "ava", { PANGO_SCRIPT_CYRILLIC } },
00117 { "ay", { PANGO_SCRIPT_LATIN } },
00118 { "az-ir", { PANGO_SCRIPT_ARABIC } },
00119 { "az", { PANGO_SCRIPT_CYRILLIC } },
00120 { "bam", { PANGO_SCRIPT_LATIN } },
00121 { "ba", { PANGO_SCRIPT_CYRILLIC } },
00122 { "be", { PANGO_SCRIPT_CYRILLIC } },
00123 { "bg", { PANGO_SCRIPT_CYRILLIC } },
00124 { "bh", { PANGO_SCRIPT_DEVANAGARI } },
00125 { "bho", { PANGO_SCRIPT_DEVANAGARI } },
00126 { "bi", { PANGO_SCRIPT_LATIN } },
00127 { "bin", { PANGO_SCRIPT_LATIN } },
00128 { "bn", { PANGO_SCRIPT_BENGALI } },
00129 { "bo", { PANGO_SCRIPT_TIBETAN } },
00130 { "br", { PANGO_SCRIPT_LATIN } },
00131 { "bs", { PANGO_SCRIPT_LATIN } },
00132 { "bua", { PANGO_SCRIPT_CYRILLIC } },
00133 { "ca", { PANGO_SCRIPT_LATIN } },
00134 { "ce", { PANGO_SCRIPT_CYRILLIC } },
00135 { "chm", { PANGO_SCRIPT_CYRILLIC } },
00136 { "chr", { PANGO_SCRIPT_CHEROKEE } },
00137 { "ch", { PANGO_SCRIPT_LATIN } },
00138 { "co", { PANGO_SCRIPT_LATIN } },
00139 { "cs", { PANGO_SCRIPT_LATIN } },
00140 { "cu", { PANGO_SCRIPT_CYRILLIC } },
00141 { "cv", { PANGO_SCRIPT_CYRILLIC } },
00142 { "cy", { PANGO_SCRIPT_LATIN } },
00143 { "da", { PANGO_SCRIPT_LATIN } },
00144 { "de", { PANGO_SCRIPT_LATIN } },
00145 { "dz", { PANGO_SCRIPT_TIBETAN } },
00146 { "el", { PANGO_SCRIPT_GREEK } },
00147 { "en", { PANGO_SCRIPT_LATIN } },
00148 { "eo", { PANGO_SCRIPT_LATIN } },
00149 { "es", { PANGO_SCRIPT_LATIN } },
00150 // Estonian uses the Baltic script directly instead of a PANGO_SCRIPT_* alias.
00151 { "et", { EncodingDetector::Baltic } },
00152 { "eu", { PANGO_SCRIPT_LATIN } },
00153 { "fa", { PANGO_SCRIPT_ARABIC } },
00154 { "fi", { PANGO_SCRIPT_LATIN } },
00155 { "fj", { PANGO_SCRIPT_LATIN } },
00156 { "fo", { PANGO_SCRIPT_LATIN } },
00157 { "fr", { PANGO_SCRIPT_LATIN } },
00158 { "ful", { PANGO_SCRIPT_LATIN } },
00159 { "fur", { PANGO_SCRIPT_LATIN } },
00160 { "fy", { PANGO_SCRIPT_LATIN } },
00161 { "ga", { PANGO_SCRIPT_LATIN } },
00162 { "gd", { PANGO_SCRIPT_LATIN } },
00163 { "gez", { PANGO_SCRIPT_ETHIOPIC } },
00164 { "gl", { PANGO_SCRIPT_LATIN } },
00165 { "gn", { PANGO_SCRIPT_LATIN } },
00166 { "gu", { PANGO_SCRIPT_GUJARATI } },
00167 { "gv", { PANGO_SCRIPT_LATIN } },
00168 { "ha", { PANGO_SCRIPT_LATIN } },
00169 { "haw", { PANGO_SCRIPT_LATIN } },
00170 { "he", { PANGO_SCRIPT_HEBREW } },
00171 { "hi", { PANGO_SCRIPT_DEVANAGARI } },
00172 { "ho", { PANGO_SCRIPT_LATIN } },
00173 { "hr", { PANGO_SCRIPT_LATIN } },
00174 { "hu", { PANGO_SCRIPT_LATIN } },
00175 { "hy", { PANGO_SCRIPT_ARMENIAN } },
00176 { "ia", { PANGO_SCRIPT_LATIN } },
00177 { "ibo", { PANGO_SCRIPT_LATIN } },
00178 { "id", { PANGO_SCRIPT_LATIN } },
00179 { "ie", { PANGO_SCRIPT_LATIN } },
00180 { "ik", { PANGO_SCRIPT_CYRILLIC } },
00181 { "io", { PANGO_SCRIPT_LATIN } },
00182 { "is", { PANGO_SCRIPT_LATIN } },
00183 { "it", { PANGO_SCRIPT_LATIN } },
00184 { "iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL } },
00185 // Japanese uses its own AutoDetectScript value.
00186 { "ja", { EncodingDetector::Japanese } },
00187 { "kaa", { PANGO_SCRIPT_CYRILLIC } },
00188 { "ka", { PANGO_SCRIPT_GEORGIAN } },
00189 { "ki", { PANGO_SCRIPT_LATIN } },
00190 { "kk", { PANGO_SCRIPT_CYRILLIC } },
00191 { "kl", { PANGO_SCRIPT_LATIN } },
00192 { "km", { PANGO_SCRIPT_KHMER } },
00193 { "kn", { PANGO_SCRIPT_KANNADA } },
00194 // Korean uses its own AutoDetectScript value.
00195 { "ko", { EncodingDetector::Korean } },
00196 { "kok", { PANGO_SCRIPT_DEVANAGARI } },
00197 { "ks", { PANGO_SCRIPT_DEVANAGARI } },
00198 { "ku-ir", { PANGO_SCRIPT_ARABIC } },
00199 { "ku", { PANGO_SCRIPT_CYRILLIC } },
00200 { "kum", { PANGO_SCRIPT_CYRILLIC } },
00201 { "kv", { PANGO_SCRIPT_CYRILLIC } },
00202 { "kw", { PANGO_SCRIPT_LATIN } },
00203 { "ky", { PANGO_SCRIPT_CYRILLIC } },
00204 { "la", { PANGO_SCRIPT_LATIN } },
00205 { "lb", { PANGO_SCRIPT_LATIN } },
00206 { "lez", { PANGO_SCRIPT_CYRILLIC } },
00207 { "ln", { PANGO_SCRIPT_LATIN } },
00208 { "lo", { PANGO_SCRIPT_LAO } },
00209 // Lithuanian and Latvian use the Baltic script.
00210 { "lt", { EncodingDetector::Baltic } },
00211
00212 { "lv", { EncodingDetector::Baltic } },
00213 { "mg", { PANGO_SCRIPT_LATIN } },
00214 { "mh", { PANGO_SCRIPT_LATIN } },
00215 { "mi", { PANGO_SCRIPT_LATIN } },
00216 { "mk", { PANGO_SCRIPT_CYRILLIC } },
00217 { "ml", { PANGO_SCRIPT_MALAYALAM } },
00218 { "mn", { PANGO_SCRIPT_MONGOLIAN } },
00219 { "mo", { PANGO_SCRIPT_CYRILLIC } },
00220 { "mr", { PANGO_SCRIPT_DEVANAGARI } },
00221 { "mt", { PANGO_SCRIPT_LATIN } },
00222 { "my", { PANGO_SCRIPT_MYANMAR } },
00223 { "nb", { PANGO_SCRIPT_LATIN } },
00224 { "nds", { PANGO_SCRIPT_LATIN } },
00225 { "ne", { PANGO_SCRIPT_DEVANAGARI } },
00226 { "nl", { PANGO_SCRIPT_LATIN } },
00227 { "nn", { PANGO_SCRIPT_LATIN } },
00228 { "no", { PANGO_SCRIPT_LATIN } },
00229 { "nr", { PANGO_SCRIPT_LATIN } },
00230 { "nso", { PANGO_SCRIPT_LATIN } },
00231 { "ny", { PANGO_SCRIPT_LATIN } },
00232 { "oc", { PANGO_SCRIPT_LATIN } },
00233 { "om", { PANGO_SCRIPT_LATIN } },
00234 { "or", { PANGO_SCRIPT_ORIYA } },
00235 { "os", { PANGO_SCRIPT_CYRILLIC } },
00236 { "pa", { PANGO_SCRIPT_GURMUKHI } },
00237 { "pl", { PANGO_SCRIPT_LATIN } },
00238 { "ps-af", { PANGO_SCRIPT_ARABIC } },
00239 { "ps-pk", { PANGO_SCRIPT_ARABIC } },
00240 { "pt", { PANGO_SCRIPT_LATIN } },
00241 { "rm", { PANGO_SCRIPT_LATIN } },
00242 { "ro", { PANGO_SCRIPT_LATIN } },
00243 { "ru", { PANGO_SCRIPT_CYRILLIC } },
00244 { "sah", { PANGO_SCRIPT_CYRILLIC } },
00245 { "sa", { PANGO_SCRIPT_DEVANAGARI } },
00246 { "sco", { PANGO_SCRIPT_LATIN } },
00247 { "sel", { PANGO_SCRIPT_CYRILLIC } },
00248 { "se", { PANGO_SCRIPT_LATIN } },
00249 { "sh", { PANGO_SCRIPT_CYRILLIC } },
00250 { "si", { PANGO_SCRIPT_SINHALA } },
00251 { "sk", { PANGO_SCRIPT_LATIN } },
00252 { "sl", { PANGO_SCRIPT_LATIN } },
00253 { "sma", { PANGO_SCRIPT_LATIN } },
00254 { "smj", { PANGO_SCRIPT_LATIN } },
00255 { "smn", { PANGO_SCRIPT_LATIN } },
00256 { "sms", { PANGO_SCRIPT_LATIN } },
00257 { "sm", { PANGO_SCRIPT_LATIN } },
00258 { "so", { PANGO_SCRIPT_LATIN } },
00259 { "sq", { PANGO_SCRIPT_LATIN } },
00260 { "sr", { PANGO_SCRIPT_CYRILLIC } },
00261 { "ss", { PANGO_SCRIPT_LATIN } },
00262 { "st", { PANGO_SCRIPT_LATIN } },
00263 { "sv", { PANGO_SCRIPT_LATIN } },
00264 { "sw", { PANGO_SCRIPT_LATIN } },
00265 { "syr", { PANGO_SCRIPT_SYRIAC } },
00266 { "ta", { PANGO_SCRIPT_TAMIL } },
00267 { "te", { PANGO_SCRIPT_TELUGU } },
00268 { "tg", { PANGO_SCRIPT_CYRILLIC } },
00269 { "th", { PANGO_SCRIPT_THAI } },
00270 { "ti-er", { PANGO_SCRIPT_ETHIOPIC } },
00271 { "ti-et", { PANGO_SCRIPT_ETHIOPIC } },
00272 { "tig", { PANGO_SCRIPT_ETHIOPIC } },
00273 { "tk", { PANGO_SCRIPT_CYRILLIC } },
00274 { "tl", { PANGO_SCRIPT_TAGALOG } },
00275 { "tn", { PANGO_SCRIPT_LATIN } },
00276 { "to", { PANGO_SCRIPT_LATIN } },
00277 // Turkish uses its own AutoDetectScript value.
00278 { "tr", { EncodingDetector::Turkish } },
00279 { "ts", { PANGO_SCRIPT_LATIN } },
00280 { "tt", { PANGO_SCRIPT_CYRILLIC } },
00281 { "tw", { PANGO_SCRIPT_LATIN } },
00282 { "tyv", { PANGO_SCRIPT_CYRILLIC } },
00283 { "ug", { PANGO_SCRIPT_ARABIC } },
00284 { "uk", { PANGO_SCRIPT_CYRILLIC } },
00285 { "ur", { PANGO_SCRIPT_ARABIC } },
00286 { "uz", { PANGO_SCRIPT_CYRILLIC } },
00287 { "ven", { PANGO_SCRIPT_LATIN } },
00288 { "vi", { PANGO_SCRIPT_LATIN } },
00289 { "vot", { PANGO_SCRIPT_LATIN } },
00290 { "vo", { PANGO_SCRIPT_LATIN } },
00291 { "wa", { PANGO_SCRIPT_LATIN } },
00292 { "wen", { PANGO_SCRIPT_LATIN } },
00293 { "wo", { PANGO_SCRIPT_LATIN } },
00294 { "xh", { PANGO_SCRIPT_LATIN } },
00295 { "yap", { PANGO_SCRIPT_LATIN } },
00296 { "yi", { PANGO_SCRIPT_HEBREW } },
00297 { "yo", { PANGO_SCRIPT_LATIN } },
00298 // Chinese is split by region suffix into Simplified (cn, sg) and Traditional (hk, mo, tw).
00299 { "zh-cn", { EncodingDetector::ChineseSimplified } },
00300
00301 { "zh-hk", { EncodingDetector::ChineseTraditional } },
00302
00303 { "zh-mo", { EncodingDetector::ChineseTraditional } },
00304
00305 { "zh-sg", { EncodingDetector::ChineseSimplified } },
00306
00307 { "zh-tw", { EncodingDetector::ChineseTraditional } },
00308 { "zu", { PANGO_SCRIPT_LATIN } },
00309 { "\x00", { EncodingDetector::None } } // sentinel row: empty language code
00310 };
00311
// Character-set identifiers compared against TQTextCodec::mibEnum() in this
// file, listed in ascending numeric order.
enum MIB
{
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
00322
00323 static bool is16Bit(TQTextCodec* codec)
00324 {
00325 switch (codec->mibEnum())
00326 {
00327 case MibUtf16:
00328 case MibUtf16BE:
00329 case MibUtf16LE:
00330 case MibUcs2:
00331 return true;
00332 default:
00333 return false;
00334 }
00335 }
00336
00337 class EncodingDetectorPrivate
00338 {
00339 public:
00340 TQTextCodec *m_codec;
00341 TQTextDecoder *m_decoder;
00342 TQTextCodec *m_defaultCodec;
00343 TQCString m_storeDecoderName;
00344
00345 EncodingDetector::EncodingChoiceSource m_source;
00346 EncodingDetector::AutoDetectScript m_autoDetectLanguage;
00347
00348 bool m_visualRTL : 1;
00349 bool m_seenBody : 1;
00350 bool m_writtingHappened : 1;
00351 bool m_analyzeCalled : 1;
00352 int m_multiByte;
00353
00354 TQCString m_bufferForDefferedEncDetection;
00355
00356 EncodingDetectorPrivate()
00357 : m_codec(TQTextCodec::codecForMib(MibLatin1))
00358 , m_decoder(m_codec->makeDecoder())
00359 , m_defaultCodec(m_codec)
00360 , m_source(EncodingDetector::DefaultEncoding)
00361 , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection)
00362 , m_visualRTL(false)
00363 , m_seenBody(false)
00364 , m_writtingHappened(false)
00365 , m_analyzeCalled(false)
00366 , m_multiByte(0)
00367 {
00368 }
00369
00370 EncodingDetectorPrivate(TQTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
00371 : m_codec(codec)
00372 , m_decoder(m_codec->makeDecoder())
00373 , m_defaultCodec(m_codec)
00374 , m_source(source)
00375 , m_autoDetectLanguage(script)
00376 , m_visualRTL(false)
00377 , m_seenBody(false)
00378 , m_writtingHappened(false)
00379 , m_analyzeCalled(false)
00380 , m_multiByte(0)
00381 {
00382 }
00383
00384 ~EncodingDetectorPrivate()
00385 {
00386 delete m_decoder;
00387 }
00388 };
00389
00390
00391 static TQCString automaticDetectionForArabic( const unsigned char* ptr, int size )
00392 {
00393 for ( int i = 0; i < size; ++i ) {
00394 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
00395 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
00396 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
00397 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
00398 return "cp1256";
00399 }
00400 }
00401
00402 return "iso-8859-6";
00403 }
00404
00405 static TQCString automaticDetectionForBaltic( const unsigned char* ptr, int size )
00406 {
00407 for ( int i = 0; i < size; ++i ) {
00408 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
00409 return "cp1257";
00410
00411 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
00412 return "iso-8859-13";
00413 }
00414
00415 return "iso-8859-13";
00416 }
00417
00418 static TQCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
00419 {
00420 TQCString charset;
00421 for ( int i = 0; i < size; ++i ) {
00422 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
00423 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
00424 return "ibm852";
00425
00426 if ( i + 1 > size )
00427 return "cp1250";
00428 else {
00429 charset = "cp1250";
00430 continue;
00431 }
00432 }
00433 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
00434 if ( i + 1 > size )
00435 return "iso-8859-2";
00436 else {
00437 if ( charset.isNull() )
00438 charset = "iso-8859-2";
00439 continue;
00440 }
00441 }
00442 }
00443
00444 if ( charset.isNull() )
00445 charset = "iso-8859-3";
00446
00447 return charset.data();
00448 }
00449
00450 static TQCString automaticDetectionForCyrillic( const unsigned char* ptr, int size)
00451 {
00452 #ifdef DECODE_DEBUG
00453 kWarning() << "EncodingDetector: Cyr heuristics";
00454 #endif
00455
00456
00457
00458 int utf8_mark=0;
00459 int koi_score=0;
00460 int cp1251_score=0;
00461
00462 int koi_st=0;
00463 int cp1251_st=0;
00464
00465
00466
00467
00468 int koi_o_capital=0;
00469 int koi_o=0;
00470 int cp1251_o_capital=0;
00471 int cp1251_o=0;
00472
00473 int koi_a_capital=0;
00474 int koi_a=0;
00475 int cp1251_a_capital=0;
00476 int cp1251_a=0;
00477
00478 int koi_s_capital=0;
00479 int koi_s=0;
00480 int cp1251_s_capital=0;
00481 int cp1251_s=0;
00482
00483 int koi_i_capital=0;
00484 int koi_i=0;
00485 int cp1251_i_capital=0;
00486 int cp1251_i=0;
00487
00488 int cp1251_small_range=0;
00489 int koi_small_range=0;
00490 int ibm866_small_range=0;
00491
00492 int i;
00493 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
00494 {
00495 if (ptr[i]>0xdf)
00496 {
00497 ++cp1251_small_range;
00498
00499 if (ptr[i]==0xee)
00500 ++cp1251_o;
00501 else if (ptr[i]==0xe0)
00502 ++cp1251_a;
00503 else if (ptr[i]==0xe8)
00504 ++cp1251_i;
00505 else if (ptr[i]==0xf1)
00506 ++cp1251_s;
00507 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)
00508 ++cp1251_st;
00509
00510 else if (ptr[i]==0xef)
00511 ++koi_o_capital;
00512 else if (ptr[i]==0xe1)
00513 ++koi_a_capital;
00514 else if (ptr[i]==0xe9)
00515 ++koi_i_capital;
00516 else if (ptr[i]==0xf3)
00517 ++koi_s_capital;
00518
00519 }
00520 else if (ptr[i]>0xbf)
00521 {
00522 ++koi_small_range;
00523
00524 if (ptr[i]==0xd0||ptr[i]==0xd1)
00525 ++utf8_mark;
00526 else if (ptr[i]==0xcf)
00527 ++koi_o;
00528 else if (ptr[i]==0xc1)
00529 ++koi_a;
00530 else if (ptr[i]==0xc9)
00531 ++koi_i;
00532 else if (ptr[i]==0xd3)
00533 ++koi_s;
00534 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)
00535 ++koi_st;
00536
00537 else if (ptr[i]==0xce)
00538 ++cp1251_o_capital;
00539 else if (ptr[i]==0xc0)
00540 ++cp1251_a_capital;
00541 else if (ptr[i]==0xc8)
00542 ++cp1251_i_capital;
00543 else if (ptr[i]==0xd1)
00544 ++cp1251_s_capital;
00545 }
00546 else if (ptr[i]>0x9f && ptr[i]<0xb0)
00547 ++ibm866_small_range;
00548
00549 }
00550
00551
00552 if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
00553 {
00554 return "";
00555 }
00556
00557 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
00558 {
00559 #ifdef DECODE_DEBUG
00560 kWarning() << "Cyr Enc Detection: UTF8";
00561 #endif
00562 return "UTF-8";
00563 }
00564
00565 if (ibm866_small_range>cp1251_small_range+koi_small_range)
00566 return "ibm866";
00567
00568
00569
00570
00571 if (cp1251_st==0 && koi_st>1)
00572 koi_score+=10;
00573 else if (koi_st==0 && cp1251_st>1)
00574 cp1251_score+=10;
00575
00576 if (cp1251_st && koi_st)
00577 {
00578 if (cp1251_st/koi_st>2)
00579 cp1251_score+=20;
00580 else if (koi_st/cp1251_st>2)
00581 koi_score+=20;
00582 }
00583
00584 if (cp1251_a>koi_a)
00585 cp1251_score+=10;
00586 else if (cp1251_a || koi_a)
00587 koi_score+=10;
00588
00589 if (cp1251_o>koi_o)
00590 cp1251_score+=10;
00591 else if (cp1251_o || koi_o)
00592 koi_score+=10;
00593
00594 if (cp1251_i>koi_i)
00595 cp1251_score+=10;
00596 else if (cp1251_i || koi_i)
00597 koi_score+=10;
00598
00599 if (cp1251_s>koi_s)
00600 cp1251_score+=10;
00601 else if (cp1251_s || koi_s)
00602 koi_score+=10;
00603
00604 if (cp1251_a_capital>koi_a_capital)
00605 cp1251_score+=9;
00606 else if (cp1251_a_capital || koi_a_capital)
00607 koi_score+=9;
00608
00609 if (cp1251_o_capital>koi_o_capital)
00610 cp1251_score+=9;
00611 else if (cp1251_o_capital || koi_o_capital)
00612 koi_score+=9;
00613
00614 if (cp1251_i_capital>koi_i_capital)
00615 cp1251_score+=9;
00616 else if (cp1251_i_capital || koi_i_capital)
00617 koi_score+=9;
00618
00619 if (cp1251_s_capital>koi_s_capital)
00620 cp1251_score+=9;
00621 else if (cp1251_s_capital || koi_s_capital)
00622 koi_score+=9;
00623 #ifdef DECODE_DEBUG
00624 kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
00625 #endif
00626 if (abs(koi_score-cp1251_score)<10)
00627 {
00628
00629 cp1251_score=cp1251_small_range;
00630 koi_score=koi_small_range;
00631 }
00632 if (cp1251_score>koi_score)
00633 return "cp1251";
00634 else
00635 return "koi8-u";
00636
00637
00638
00639
00640
00641
00642
00643
00644 }
00645
00646 static TQCString automaticDetectionForGreek( const unsigned char* ptr, int size )
00647 {
00648 for ( int i = 0; i < size; ++i ) {
00649 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
00650 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
00651 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
00652 return "cp1253";
00653 }
00654 }
00655
00656 return "iso-8859-7";
00657 }
00658
00659 static TQCString automaticDetectionForHebrew( const unsigned char* ptr, int size )
00660 {
00661 for ( int i = 0; i < size; ++i ) {
00662 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
00663 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
00664 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
00665 return "cp1255";
00666 }
00667
00668 if ( ptr[ i ] == 0xDF )
00669 return "iso-8859-8-i";
00670 }
00671
00672 return "iso-8859-8-i";
00673 }
00674
00675 static TQCString automaticDetectionForJapanese( const unsigned char* ptr, int size )
00676 {
00677 JapaneseCode kc;
00678
00679 switch ( kc.guess_jp( (const char*)ptr, size ) ) {
00680 case JapaneseCode::JIS:
00681 return "jis7";
00682 case JapaneseCode::EUC:
00683 return "eucjp";
00684 case JapaneseCode::SJIS:
00685 return "sjis";
00686 case JapaneseCode::UTF8:
00687 return "utf8";
00688 default:
00689 break;
00690 }
00691
00692 return "";
00693 }
00694
00695 static TQCString automaticDetectionForTurkish( const unsigned char* ptr, int size )
00696 {
00697 for ( int i = 0; i < size; ++i ) {
00698 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
00699 return "cp1254";
00700 }
00701 }
00702
00703 return "iso-8859-9";
00704 }
00705
00706 static TQCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
00707 {
00708 uint nonansi_count=0;
00709 for (int i=0; i<size; ++i)
00710 {
00711 if (ptr[i]>0x79)
00712 {
00713 ++nonansi_count;
00714 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
00715 {
00716 return "UTF-8";
00717 }
00718 if (ptr[i] >= 0x78 && ptr[i] <= 0x9 )
00719 {
00720 return "cp1252";
00721 }
00722 }
00723
00724 }
00725
00726 if (nonansi_count>0)
00727 return "iso-8859-15";
00728
00729 return "";
00730 }
00731
00732 bool EncodingDetector::errorsIfUtf8 (const char* data, int length)
00733 {
00734 if (d->m_codec->mibEnum()!=MibUtf8)
00735 return false;
00736
00737
00738
00739
00740
00741 static const unsigned char highest1Bits = 0x80;
00742 static const unsigned char highest2Bits = 0xC0;
00743 static const unsigned char highest3Bits = 0xE0;
00744 static const unsigned char highest4Bits = 0xF0;
00745 static const unsigned char highest5Bits = 0xF8;
00746
00747 for (int i=0; i<length; ++i)
00748 {
00749 unsigned char c = data[i];
00750
00751 if (d->m_multiByte>0)
00752 {
00753 if ((c & highest2Bits) == 0x80)
00754 {
00755 --(d->m_multiByte);
00756 continue;
00757 }
00758 #ifdef DECODE_DEBUG
00759 kWarning() << "EncDetector: Broken UTF8";
00760 #endif
00761 return true;
00762 }
00763
00764
00765 if ((c & highest1Bits) == 0x00)
00766 continue;
00767
00768
00769 if ((c & highest3Bits) == 0xC0)
00770 {
00771 d->m_multiByte = 1;
00772 continue;
00773 }
00774
00775
00776 if ((c & highest4Bits) == 0xE0)
00777 {
00778 d->m_multiByte = 2;
00779 continue;
00780 }
00781
00782
00783 if ((c & highest5Bits) == 0xF0)
00784 {
00785 d->m_multiByte = 3;
00786 continue;
00787 }
00788 #ifdef DECODE_DEBUG
00789 kWarning() << "EncDetector:_Broken UTF8";
00790 #endif
00791 return true;
00792 }
00793 return false;
00794 }
00795
// Default constructor: allocates the private state with its defaults
// (Latin-1 codec, DefaultEncoding source, SemiautomaticDetection script).
00796 EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate)
00797 {
00798 }
00799
// Constructs a detector that starts with the given codec, which also becomes
// the default codec, and with the given choice source and detection script.
// The codec must not be null: the private state creates a decoder from it at once.
00800 EncodingDetector::EncodingDetector(TQTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
00801 d(new EncodingDetectorPrivate(codec,source,script))
00802 {
00803 }
00804
// Destroys the private state, which in turn deletes the decoder it owns.
00805 EncodingDetector::~EncodingDetector()
00806 {
00807 delete d;
00808 }
00809
// Sets the script whose heuristic analyze() applies when no signature is found.
00810 void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
00811 {
00812 d->m_autoDetectLanguage=lang;
00813 }
// Returns the script currently used to select a detection heuristic.
00814 EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const
00815 {
00816 return d->m_autoDetectLanguage;
00817 }
00818
// Returns how the current codec was chosen (default, user choice, BOM, heuristic, ...).
00819 EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const
00820 {
00821 return d->m_source;
00822 }
00823
00824 const char* EncodingDetector::encoding() const
00825 {
00826 d->m_storeDecoderName = d->m_codec->name();
00827 d->m_storeDecoderName = d->m_storeDecoderName.lower().replace( "iso ", "iso-" );
00828 return d->m_storeDecoderName.data();
00829 }
00830
// Returns the visual-RTL flag, which setEncoding() raises when an ISO-8859-8
// codec was requested under a name other than one of the logical-order aliases.
00831 bool EncodingDetector::visuallyOrdered() const
00832 {
00833 return d->m_visualRTL;
00834 }
00835
00836
00837
00838
00839
00840
// Returns the decoder for the current codec. The detector keeps ownership and
// deletes/replaces the decoder whenever the codec changes, so the pointer must
// not be deleted or cached across setEncoding()/analyze() calls.
00841 TQTextDecoder* EncodingDetector::decoder()
00842 {
00843 return d->m_decoder;
00844 }
00845
00846 bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
00847 {
00848 TQTextCodec *codec;
00849 TQCString enc(_encoding);
00850 if(enc.isEmpty())
00851 {
00852 if (type==DefaultEncoding)
00853 codec=d->m_defaultCodec;
00854 else
00855 return false;
00856 }
00857 else
00858 {
00859
00860
00861 enc = enc.lower();
00862
00863 if(enc=="visual")
00864 enc="iso8859-8";
00865 bool b;
00866 codec = KGlobal::charsets()->codecForName(enc, b);
00867 if (!b)
00868 return false;
00869 }
00870
00871 if (d->m_codec->mibEnum()==codec->mibEnum())
00872 return true;
00873
00874 if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
00875 {
00876
00877
00878 return false;
00879 }
00880
00881 if (codec->mibEnum() == Mib8859_8)
00882 {
00883
00884 codec = TQTextCodec::codecForName("iso8859-8-i");
00885
00886
00887 if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
00888 d->m_visualRTL = true;
00889 }
00890
00891 d->m_codec = codec;
00892 d->m_source = type;
00893 delete d->m_decoder;
00894 d->m_decoder = d->m_codec->makeDecoder();
00895 #ifdef DECODE_DEBUG
00896 kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name();
00897 #endif
00898 return true;
00899 }
00900
// Convenience overload: analyzes the bytes held by a TQByteArray by forwarding
// its data pointer and size to analyze(const char*, int).
00901 bool EncodingDetector::analyze(const TQByteArray &data)
00902 {
00903 return analyze( data.data(), data.size() );
00904 }
00905
00906 bool EncodingDetector::analyze(const char *data, int len)
00907 {
00908
00909
00910
00911 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
00912 {
00913
00914 const uchar *udata = (const uchar *)data;
00915 uchar c1 = *udata++;
00916 uchar c2 = *udata++;
00917 uchar c3 = *udata++;
00918
00919
00920 const char *autoDetectedEncoding;
00921 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
00922 {
00923 autoDetectedEncoding = "ISO-10646-UCS-2";
00924 }
00925 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
00926 {
00927 autoDetectedEncoding = "UTF-8";
00928 }
00929 else if (c1 == 0x00 || c2 == 0x00)
00930 {
00931 uchar c4 = *udata++;
00932 uchar c5 = *udata++;
00933 uchar c6 = *udata++;
00934 uchar c7 = *udata++;
00935 uchar c8 = *udata++;
00936 uchar c9 = *udata++;
00937 uchar c10 = *udata++;
00938
00939 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
00940 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
00941 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
00942 autoDetectedEncoding = "ISO-10646-UCS-2";
00943 else
00944 autoDetectedEncoding = 0;
00945 }
00946 else
00947 {
00948 autoDetectedEncoding = 0;
00949 }
00950
00951
00952 if (autoDetectedEncoding != 0)
00953 {
00954 d->m_source = BOM;
00955 d->m_codec = TQTextCodec::codecForName(autoDetectedEncoding);
00956 assert(d->m_codec);
00957
00958 delete d->m_decoder;
00959 d->m_decoder = d->m_codec->makeDecoder();
00960 #ifdef DECODE_DEBUG
00961 kWarning() << "Detection by BOM";
00962 #endif
00963 if (is16Bit(d->m_codec) && c2==0x00)
00964 {
00965
00966 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
00967 d->m_decoder->toUnicode(reverseUtf16, 2);
00968 }
00969 return true;
00970 }
00971 }
00972
00973
00974 if (d->m_source==UserChosenEncoding)
00975 {
00976 #ifdef DECODE_DEBUG
00977 kWarning() << "EncodingDetector: UserChosenEncoding exit ";
00978 #endif
00979
00980 if (errorsIfUtf8(data, len))
00981 setEncoding("",DefaultEncoding);
00982 return true;
00983 }
00984 #if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz
00985 if (!d->m_seenBody)
00986 {
00987
00988
00989
00990 const char *ptr = data;
00991 const char *pEnd = data+len;
00992
00993 while(ptr != pEnd)
00994 {
00995 if(*ptr!='<')
00996 {
00997 ++ptr;
00998 continue;
00999 }
01000 ++ptr;
01001
01002 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
01003 {
01004 ptr += 3;
01005 skipComment(ptr, pEnd);
01006 continue;
01007 }
01008
01009
01010 if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
01011 {
01012 const char *end = ptr;
01013 while (*end != '>' && end < pEnd)
01014 end++;
01015 if (*end == '\0' || end == pEnd)
01016 break;
01017 TQCString str(ptr, end - ptr + 1);
01018 int length;
01019 int pos = findXMLEncoding(str, length);
01020
01021 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
01022 {
01023 return true;
01024 }
01025 }
01026
01027
01028 while (
01029 !((*ptr >= 'a') && (*ptr <= 'z') ||
01030 (*ptr >= 'A') && (*ptr <= 'Z'))
01031 && ptr < pEnd
01032 )
01033 ++ptr;
01034
01035 char tmp[5];
01036 int length=0;
01037 const char* max=ptr+4;
01038 if (pEnd<max)
01039 max=pEnd;
01040 while (
01041 ((*ptr >= 'a') && (*ptr <= 'z') ||
01042 (*ptr >= 'A') && (*ptr <= 'Z') ||
01043 (*ptr >= '0') && (*ptr <= '9'))
01044 && ptr < max
01045 )
01046 {
01047 tmp[length] = tolower( *ptr );
01048 ++ptr;
01049 ++length;
01050 }
01051 tmp[length] = 0;
01052 if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
01053 {
01054
01055 const char* end = ptr;
01056 while(*end != '>' && *end != '\0' && end<pEnd)
01057 end++;
01058
01059 TQCString str( ptr, (end-ptr)+1);
01060 str = str.lower();
01061 int pos=0;
01062
01063
01064 if( (pos = str.find("charset")) == -1)
01065 continue;
01066 pos+=6;
01067
01068 if( (pos = str.find('=', pos)) == -1)
01069 continue;
01070
01071
01072 while (pos < (int)str.length() && str[pos] <= ' ')
01073 ++pos;
01074 if ( pos == (int)str.length())
01075 continue;
01076
01077 int endpos = pos;
01078 while( endpos < str.length() &&
01079 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
01080 && str[endpos] != ';' && str[endpos] != '>') )
01081 ++endpos;
01082 #ifdef DECODE_DEBUG
01083 kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
01084 #endif
01085 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
01086 return true;
01087 }
01088 else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
01089 {
01090 d->m_seenBody=true;
01091 break;
01092 }
01093 }
01094 }
01095
01096 if (d->m_source==EncodingFromHTTPHeader)
01097 return true;
01098 #endif
01099
01100 if (len < 1)
01101 {
01102 setEncoding("",DefaultEncoding);
01103 return false;
01104 }
01105 #ifdef DECODE_DEBUG
01106 kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")";
01107 #endif
01108
01109 switch ( d->m_autoDetectLanguage )
01110 {
01111 case EncodingDetector::Arabic:
01112 return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01113
01114 case EncodingDetector::Baltic:
01115 return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
01116
01117 case EncodingDetector::CentralEuropean:
01118 return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
01119 break;
01120 case EncodingDetector::Cyrillic:
01121 return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
01122
01123 case EncodingDetector::Greek:
01124 return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
01125
01126 case EncodingDetector::Hebrew:
01127 return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
01128
01129 case EncodingDetector::Japanese:
01130 return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
01131
01132 case EncodingDetector::Turkish:
01133 return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
01134
01135 case EncodingDetector::WesternEuropean:
01136 if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
01137 return true;
01138 else if (d->m_defaultCodec->mibEnum()==MibLatin1)
01139 {
01140 return setEncoding("iso-8859-15",AutoDetectedEncoding);
01141 }
01142 else
01143 {
01144 return setEncoding("",DefaultEncoding);
01145 }
01146
01147 case EncodingDetector::SemiautomaticDetection:
01148 case EncodingDetector::ChineseSimplified:
01149 case EncodingDetector::ChineseTraditional:
01150 case EncodingDetector::Korean:
01151 case EncodingDetector::Thai:
01152 case EncodingDetector::Unicode:
01153 case EncodingDetector::NorthernSaami:
01154 case EncodingDetector::SouthEasternEurope:
01155 case EncodingDetector::None:
01156
01157
01158 break;
01159 }
01160
01161 setEncoding("",DefaultEncoding);
01162 return true;
01163 }
01164
01165
01166 EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const TQString& lang)
01167 {
01168 if (lang.isEmpty())
01169 return EncodingDetector::None;
01170 else if (lang==i18n("@item Text character set", "Unicode"))
01171 return EncodingDetector::Unicode;
01172 else if (lang==i18n("@item Text character set", "Cyrillic"))
01173 return EncodingDetector::Cyrillic;
01174 else if (lang==i18n("@item Text character set", "Western European"))
01175 return EncodingDetector::WesternEuropean;
01176 else if (lang==i18n("@item Text character set", "Central European"))
01177 return EncodingDetector::CentralEuropean;
01178 else if (lang==i18n("@item Text character set", "Greek"))
01179 return EncodingDetector::Greek;
01180 else if (lang==i18n("@item Text character set", "Hebrew"))
01181 return EncodingDetector::Hebrew;
01182 else if (lang==i18n("@item Text character set", "Turkish"))
01183 return EncodingDetector::Turkish;
01184 else if (lang==i18n("@item Text character set", "Japanese"))
01185 return EncodingDetector::Japanese;
01186 else if (lang==i18n("@item Text character set", "Baltic"))
01187 return EncodingDetector::Baltic;
01188 else if (lang==i18n("@item Text character set", "Arabic"))
01189 return EncodingDetector::Arabic;
01190
01191 return EncodingDetector::None;
01192 }
01193
01194 bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
01195 {
01196 switch (script)
01197 {
01198 case EncodingDetector::Arabic:
01199 return true;
01200 case EncodingDetector::Baltic:
01201 return true;
01202 case EncodingDetector::CentralEuropean:
01203 return true;
01204 case EncodingDetector::Cyrillic:
01205 return true;
01206 case EncodingDetector::Greek:
01207 return true;
01208 case EncodingDetector::Hebrew:
01209 return true;
01210 case EncodingDetector::Japanese:
01211 return true;
01212 case EncodingDetector::Turkish:
01213 return true;
01214 case EncodingDetector::WesternEuropean:
01215 return true;
01216 case EncodingDetector::ChineseTraditional:
01217 return true;
01218 case EncodingDetector::ChineseSimplified:
01219 return true;
01220 case EncodingDetector::Unicode:
01221 return true;
01222 break;
01223 default:
01224 return false;
01225 }
01226 }
01227
01228 TQString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
01229 {
01230 switch (script)
01231 {
01232 case EncodingDetector::Arabic:
01233 return i18n("@item Text character set", "Arabic");
01234 break;
01235 case EncodingDetector::Baltic:
01236 return i18n("@item Text character set", "Baltic");
01237 break;
01238 case EncodingDetector::CentralEuropean:
01239 return i18n("@item Text character set", "Central European");
01240 break;
01241 case EncodingDetector::Cyrillic:
01242 return i18n("@item Text character set", "Cyrillic");
01243 break;
01244 case EncodingDetector::Greek:
01245 return i18n("@item Text character set", "Greek");
01246 break;
01247 case EncodingDetector::Hebrew:
01248 return i18n("@item Text character set", "Hebrew");
01249 break;
01250 case EncodingDetector::Japanese:
01251 return i18n("@item Text character set", "Japanese");
01252 break;
01253 case EncodingDetector::Turkish:
01254 return i18n("@item Text character set", "Turkish");
01255 break;
01256 case EncodingDetector::WesternEuropean:
01257 return i18n("@item Text character set", "Western European");
01258 break;
01259 case EncodingDetector::ChineseTraditional:
01260 return i18n("@item Text character set", "Chinese Traditional");
01261 break;
01262 case EncodingDetector::ChineseSimplified:
01263 return i18n("@item Text character set", "Chinese Simplified");
01264 break;
01265 case EncodingDetector::Korean:
01266 return i18n("@item Text character set", "Korean");
01267 break;
01268 case EncodingDetector::Thai:
01269 return i18n("@item Text character set", "Thai");
01270 break;
01271 case EncodingDetector::Unicode:
01272 return i18n("@item Text character set", "Unicode");
01273 break;
01274
01275 default:
01276 return TQString();
01277
01278 }
01279 }
01280
01281 EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const TQString &lc)
01282 {
01283
01284
01285 const char *langStr = pango_script_for_lang[0].lang;
01286
01287 for ( int i = 0; langStr; i++ ) {
01288 langStr = pango_script_for_lang[i].lang;
01289
01290 if ( lc.startsWith( TQString::fromAscii( langStr ) ) )
01291 return pango_script_for_lang[i].scripts[0];
01292 }
01293 return None;
01294 }
01295
01296 #undef DECODE_DEBUG
01297