encodingdetector.cpp
00001 /* 00002 This file was taken from the KDE 4.x libraries and backported to TQt 3. 00003 00004 Copyright (C) 1999 Lars Knoll (knoll@kde.org) 00005 Copyright (C) 2003 Dirk Mueller (mueller@kde.org) 00006 Copyright (C) 2003 Apple Computer, Inc. 00007 Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net) 00008 00009 This library is free software; you can redistribute it and/or 00010 modify it under the terms of the GNU Library General Public 00011 License as published by the Free Software Foundation; either 00012 version 2 of the License, or (at your option) any later version. 00013 00014 This library is distributed in the hope that it will be useful, 00015 but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00017 Library General Public License for more details. 00018 00019 You should have received a copy of the GNU Library General Public License 00020 along with this library; see the file COPYING.LIB. If not, write to 00021 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00022 Boston, MA 02110-1301, USA. 00023 */ 00024 //---------------------------------------------------------------------------- 00025 // 00026 // decoder for input stream 00027 00028 #include "encodingdetector.h" 00029 00030 #undef DECODE_DEBUG 00031 //#define DECODE_DEBUG 00032 00033 #define MAX_BUFFER 16*1024 00034 00035 #include <assert.h> 00036 #include <stdlib.h> 00037 00038 #include "encodingdetector_ja_p.h" 00039 00040 #include <tqregexp.h> 00041 #include <tqtextcodec.h> 00042 00043 #include <tdeglobal.h> 00044 #include <kcharsets.h> 00045 #include <kdebug.h> 00046 #include <tdelocale.h> 00047 00048 #include <ctype.h> 00049 00050 // The following table was taken from libpango 1.19.3 and slightly modified. 00051 // Multiple scripts per language were removed and the entries were reordered so 00052 // that simple substring matching will work. For example, bam was put before ba 00053 // so that the first match will be likely the right match. Otherwise "ba" would 00054 // match "bam" but we would have to search on to find "bam" which is what we want. 00055 // The original file is called pango-script-lang-table.h 00056 00057 /* pango-script-lang-table.h: 00058 * 00059 * Generated by gen-script-for-lang-new.c 00060 * Date: 2007-10-26 00061 * Source: fontconfig-2.4.91 00062 * 00063 * Do not edit. // I did. Sue me ;) 00064 */ 00065 typedef struct _PangoScriptForLang { 00066 const char lang[6]; 00067 EncodingDetector::AutoDetectScript scripts[1]; 00068 } PangoScriptForLang; 00069 00070 //Unfortunately EncodingDetector does not know all scripts that Pango knows. 00071 //Also, using EncodingDetector::CentralEuropean for the appropriate countries 00072 //might give better results in some cases. 00073 //One especially important (many speakers/literates) omission is the lack of 00074 //Indian scripts. 00075 00076 #define PANGO_SCRIPT_ARMENIAN EncodingDetector::None 00077 #define PANGO_SCRIPT_BENGALI EncodingDetector::None 00078 #define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None 00079 #define PANGO_SCRIPT_CHEROKEE EncodingDetector::None 00080 #define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None 00081 #define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None 00082 #define PANGO_SCRIPT_GUJARATI EncodingDetector::None 00083 #define PANGO_SCRIPT_GURMUKHI EncodingDetector::None 00084 #define PANGO_SCRIPT_KANNADA EncodingDetector::None 00085 #define PANGO_SCRIPT_KHMER EncodingDetector::None 00086 #define PANGO_SCRIPT_LAO EncodingDetector::None 00087 #define PANGO_SCRIPT_MALAYALAM EncodingDetector::None 00088 #define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None 00089 #define PANGO_SCRIPT_MYANMAR EncodingDetector::None 00090 #define PANGO_SCRIPT_ORIYA EncodingDetector::None 00091 #define PANGO_SCRIPT_SINHALA EncodingDetector::None 00092 #define PANGO_SCRIPT_SYRIAC EncodingDetector::None 00093 #define PANGO_SCRIPT_TAGALOG EncodingDetector::None 00094 #define PANGO_SCRIPT_TAMIL EncodingDetector::None 00095 #define PANGO_SCRIPT_TIBETAN EncodingDetector::None 00096 #define PANGO_SCRIPT_TELUGU EncodingDetector::None 00097 00098 //Instead of changing the table even more... 00099 #define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic 00100 #define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic 00101 #define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope 00102 #define PANGO_SCRIPT_GREEK EncodingDetector::Greek 00103 #define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew 00104 #define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean 00105 #define PANGO_SCRIPT_THAI EncodingDetector::Thai 00106 00107 00108 static const PangoScriptForLang pango_script_for_lang[] = { 00109 { "aa", { PANGO_SCRIPT_LATIN/*62*/ } }, 00110 { "ab", { PANGO_SCRIPT_CYRILLIC/*90*/ } }, 00111 { "af", { PANGO_SCRIPT_LATIN/*69*/ } }, 00112 { "am", { PANGO_SCRIPT_ETHIOPIC/*218*/ } }, 00113 { "ar", { PANGO_SCRIPT_ARABIC/*125*/ } }, 00114 { "as", { PANGO_SCRIPT_BENGALI/*89*/ } }, 00115 { "ast", { PANGO_SCRIPT_LATIN/*66*/ } }, 00116 { "ava", { PANGO_SCRIPT_CYRILLIC/*67*/ } }, 00117 { "ay", { PANGO_SCRIPT_LATIN/*60*/ } }, 00118 { "az-ir", { PANGO_SCRIPT_ARABIC/*129*/ } }, 00119 { "az", { PANGO_SCRIPT_CYRILLIC/*80*/ } }, //, PANGO_SCRIPT_LATIN/*68*/ } }, 00120 { "bam", { PANGO_SCRIPT_LATIN/*60*/ } }, 00121 { "ba", { PANGO_SCRIPT_CYRILLIC/*82*/ } }, 00122 { "be", { PANGO_SCRIPT_CYRILLIC/*68*/ } }, 00123 { "bg", { PANGO_SCRIPT_CYRILLIC/*60*/ } }, 00124 { "bh", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, 00125 { "bho", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, 00126 { "bi", { PANGO_SCRIPT_LATIN/*58*/ } }, 00127 { "bin", { PANGO_SCRIPT_LATIN/*76*/ } }, 00128 { "bn", { PANGO_SCRIPT_BENGALI/*89*/ } }, 00129 { "bo", { PANGO_SCRIPT_TIBETAN/*95*/ } }, 00130 { "br", { PANGO_SCRIPT_LATIN/*64*/ } }, 00131 { "bs", { PANGO_SCRIPT_LATIN/*62*/ } }, 00132 { "bua", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, 00133 { "ca", { PANGO_SCRIPT_LATIN/*74*/ } }, 00134 { "ce", { PANGO_SCRIPT_CYRILLIC/*67*/ } }, 00135 { "chm", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, 00136 { "chr", { PANGO_SCRIPT_CHEROKEE/*85*/ } }, 00137 { "ch", { PANGO_SCRIPT_LATIN/*58*/ } }, 00138 { "co", { PANGO_SCRIPT_LATIN/*84*/ } }, 00139 { "cs", { PANGO_SCRIPT_LATIN/*82*/ } }, 00140 { "cu", { PANGO_SCRIPT_CYRILLIC/*103*/ } }, 00141 { "cv", { PANGO_SCRIPT_CYRILLIC/*72*/ } }, //, PANGO_SCRIPT_LATIN/*2*/ } }, 00142 { "cy", { PANGO_SCRIPT_LATIN/*78*/ } }, 00143 { "da", { PANGO_SCRIPT_LATIN/*70*/ } }, 00144 { "de", { PANGO_SCRIPT_LATIN/*59*/ } }, 00145 { "dz", { PANGO_SCRIPT_TIBETAN/*95*/ } }, 00146 { "el", { PANGO_SCRIPT_GREEK/*69*/ } }, 00147 { "en", { PANGO_SCRIPT_LATIN/*72*/ } }, 00148 { "eo", { PANGO_SCRIPT_LATIN/*64*/ } }, 00149 { "es", { PANGO_SCRIPT_LATIN/*66*/ } }, 00150 // { "et", { PANGO_SCRIPT_LATIN/*64*/ } }, 00151 { "et", { EncodingDetector::Baltic } }, 00152 { "eu", { PANGO_SCRIPT_LATIN/*56*/ } }, 00153 { "fa", { PANGO_SCRIPT_ARABIC/*129*/ } }, 00154 { "fi", { PANGO_SCRIPT_LATIN/*62*/ } }, 00155 { "fj", { PANGO_SCRIPT_LATIN/*52*/ } }, 00156 { "fo", { PANGO_SCRIPT_LATIN/*68*/ } }, 00157 { "fr", { PANGO_SCRIPT_LATIN/*84*/ } }, 00158 { "ful", { PANGO_SCRIPT_LATIN/*62*/ } }, 00159 { "fur", { PANGO_SCRIPT_LATIN/*66*/ } }, 00160 { "fy", { PANGO_SCRIPT_LATIN/*75*/ } }, 00161 { "ga", { PANGO_SCRIPT_LATIN/*80*/ } }, 00162 { "gd", { PANGO_SCRIPT_LATIN/*70*/ } }, 00163 { "gez", { PANGO_SCRIPT_ETHIOPIC/*218*/ } }, 00164 { "gl", { PANGO_SCRIPT_LATIN/*66*/ } }, 00165 { "gn", { PANGO_SCRIPT_LATIN/*70*/ } }, 00166 { "gu", { PANGO_SCRIPT_GUJARATI/*78*/ } }, 00167 { "gv", { PANGO_SCRIPT_LATIN/*54*/ } }, 00168 { "ha", { PANGO_SCRIPT_LATIN/*60*/ } }, 00169 { "haw", { PANGO_SCRIPT_LATIN/*62*/ } }, 00170 { "he", { PANGO_SCRIPT_HEBREW/*27*/ } }, 00171 { "hi", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, 00172 { "ho", { PANGO_SCRIPT_LATIN/*52*/ } }, 00173 { "hr", { PANGO_SCRIPT_LATIN/*62*/ } }, 00174 { "hu", { PANGO_SCRIPT_LATIN/*70*/ } }, 00175 { "hy", { PANGO_SCRIPT_ARMENIAN/*77*/ } }, 00176 { "ia", { PANGO_SCRIPT_LATIN/*52*/ } }, 00177 { "ibo", { PANGO_SCRIPT_LATIN/*58*/ } }, 00178 { "id", { PANGO_SCRIPT_LATIN/*54*/ } }, 00179 { "ie", { PANGO_SCRIPT_LATIN/*52*/ } }, 00180 { "ik", { PANGO_SCRIPT_CYRILLIC/*68*/ } }, 00181 { "io", { PANGO_SCRIPT_LATIN/*52*/ } }, 00182 { "is", { PANGO_SCRIPT_LATIN/*70*/ } }, 00183 { "it", { PANGO_SCRIPT_LATIN/*72*/ } }, 00184 { "iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL/*161*/ } }, 00185 // { "ja", { PANGO_SCRIPT_HAN/*6356*/, PANGO_SCRIPT_KATAKANA/*88*/, PANGO_SCRIPT_HIRAGANA/*85*/ } }, 00186 { "ja", { EncodingDetector::Japanese } }, 00187 { "kaa", { PANGO_SCRIPT_CYRILLIC/*78*/ } }, 00188 { "ka", { PANGO_SCRIPT_GEORGIAN/*33*/ } }, 00189 { "ki", { PANGO_SCRIPT_LATIN/*56*/ } }, 00190 { "kk", { PANGO_SCRIPT_CYRILLIC/*77*/ } }, 00191 { "kl", { PANGO_SCRIPT_LATIN/*81*/ } }, 00192 { "km", { PANGO_SCRIPT_KHMER/*70*/ } }, 00193 { "kn", { PANGO_SCRIPT_KANNADA/*80*/ } }, 00194 // { "ko", { PANGO_SCRIPT_HANGUL/*2443*/ } }, 00195 { "ko", { EncodingDetector::Korean } }, 00196 { "kok", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, 00197 { "ks", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, 00198 { "ku-ir", { PANGO_SCRIPT_ARABIC/*32*/ } }, 00199 { "ku", { PANGO_SCRIPT_CYRILLIC/*60*/ } }, //, PANGO_SCRIPT_LATIN/*4*/ } }, 00200 { "kum", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, 00201 { "kv", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, 00202 { "kw", { PANGO_SCRIPT_LATIN/*64*/ } }, 00203 { "ky", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, 00204 { "la", { PANGO_SCRIPT_LATIN/*68*/ } }, 00205 { "lb", { PANGO_SCRIPT_LATIN/*75*/ } }, 00206 { "lez", { PANGO_SCRIPT_CYRILLIC/*67*/ } }, 00207 { "ln", { PANGO_SCRIPT_LATIN/*78*/ } }, 00208 { "lo", { PANGO_SCRIPT_LAO/*65*/ } }, 00209 // { "lt", { PANGO_SCRIPT_LATIN/*70*/ } }, 00210 { "lt", { EncodingDetector::Baltic } }, 00211 // { "lv", { PANGO_SCRIPT_LATIN/*78*/ } }, 00212 { "lv", { EncodingDetector::Baltic } }, 00213 { "mg", { PANGO_SCRIPT_LATIN/*56*/ } }, 00214 { "mh", { PANGO_SCRIPT_LATIN/*62*/ } }, 00215 { "mi", { PANGO_SCRIPT_LATIN/*64*/ } }, 00216 { "mk", { PANGO_SCRIPT_CYRILLIC/*42*/ } }, 00217 { "ml", { PANGO_SCRIPT_MALAYALAM/*78*/ } }, 00218 { "mn", { PANGO_SCRIPT_MONGOLIAN/*130*/ } }, 00219 { "mo", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, //, PANGO_SCRIPT_LATIN/*62*/ } }, 00220 { "mr", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, 00221 { "mt", { PANGO_SCRIPT_LATIN/*72*/ } }, 00222 { "my", { PANGO_SCRIPT_MYANMAR/*48*/ } }, 00223 { "nb", { PANGO_SCRIPT_LATIN/*70*/ } }, 00224 { "nds", { PANGO_SCRIPT_LATIN/*59*/ } }, 00225 { "ne", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, 00226 { "nl", { PANGO_SCRIPT_LATIN/*82*/ } }, 00227 { "nn", { PANGO_SCRIPT_LATIN/*76*/ } }, 00228 { "no", { PANGO_SCRIPT_LATIN/*70*/ } }, 00229 { "nr", { PANGO_SCRIPT_LATIN/*52*/ } }, 00230 { "nso", { PANGO_SCRIPT_LATIN/*58*/ } }, 00231 { "ny", { PANGO_SCRIPT_LATIN/*54*/ } }, 00232 { "oc", { PANGO_SCRIPT_LATIN/*70*/ } }, 00233 { "om", { PANGO_SCRIPT_LATIN/*52*/ } }, 00234 { "or", { PANGO_SCRIPT_ORIYA/*79*/ } }, 00235 { "os", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, 00236 { "pa", { PANGO_SCRIPT_GURMUKHI/*63*/ } }, 00237 { "pl", { PANGO_SCRIPT_LATIN/*70*/ } }, 00238 { "ps-af", { PANGO_SCRIPT_ARABIC/*49*/ } }, 00239 { "ps-pk", { PANGO_SCRIPT_ARABIC/*49*/ } }, 00240 { "pt", { PANGO_SCRIPT_LATIN/*82*/ } }, 00241 { "rm", { PANGO_SCRIPT_LATIN/*66*/ } }, 00242 { "ro", { PANGO_SCRIPT_LATIN/*62*/ } }, 00243 { "ru", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, 00244 { "sah", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, 00245 { "sa", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, 00246 { "sco", { PANGO_SCRIPT_LATIN/*56*/ } }, 00247 { "sel", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, 00248 { "se", { PANGO_SCRIPT_LATIN/*66*/ } }, 00249 { "sh", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, 00250 { "si", { PANGO_SCRIPT_SINHALA/*77*/ } }, 00251 { "sk", { PANGO_SCRIPT_LATIN/*86*/ } }, 00252 { "sl", { PANGO_SCRIPT_LATIN/*62*/ } }, 00253 { "sma", { PANGO_SCRIPT_LATIN/*60*/ } }, 00254 { "smj", { PANGO_SCRIPT_LATIN/*60*/ } }, 00255 { "smn", { PANGO_SCRIPT_LATIN/*68*/ } }, 00256 { "sms", { PANGO_SCRIPT_LATIN/*80*/ } }, 00257 { "sm", { PANGO_SCRIPT_LATIN/*52*/ } }, 00258 { "so", { PANGO_SCRIPT_LATIN/*52*/ } }, 00259 { "sq", { PANGO_SCRIPT_LATIN/*56*/ } }, 00260 { "sr", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, 00261 { "ss", { PANGO_SCRIPT_LATIN/*52*/ } }, 00262 { "st", { PANGO_SCRIPT_LATIN/*52*/ } }, 00263 { "sv", { PANGO_SCRIPT_LATIN/*68*/ } }, 00264 { "sw", { PANGO_SCRIPT_LATIN/*52*/ } }, 00265 { "syr", { PANGO_SCRIPT_SYRIAC/*45*/ } }, 00266 { "ta", { PANGO_SCRIPT_TAMIL/*48*/ } }, 00267 { "te", { PANGO_SCRIPT_TELUGU/*80*/ } }, 00268 { "tg", { PANGO_SCRIPT_CYRILLIC/*78*/ } }, 00269 { "th", { PANGO_SCRIPT_THAI/*86*/ } }, 00270 { "ti-er", { PANGO_SCRIPT_ETHIOPIC/*255*/ } }, 00271 { "ti-et", { PANGO_SCRIPT_ETHIOPIC/*255*/ } }, 00272 { "tig", { PANGO_SCRIPT_ETHIOPIC/*221*/ } }, 00273 { "tk", { PANGO_SCRIPT_CYRILLIC/*74*/ } }, 00274 { "tl", { PANGO_SCRIPT_TAGALOG/*19*/ } }, 00275 { "tn", { PANGO_SCRIPT_LATIN/*58*/ } }, 00276 { "to", { PANGO_SCRIPT_LATIN/*52*/ } }, 00277 // { "tr", { PANGO_SCRIPT_LATIN/*70*/ } }, 00278 { "tr", { EncodingDetector::Turkish } }, 00279 { "ts", { PANGO_SCRIPT_LATIN/*52*/ } }, 00280 { "tt", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, 00281 { "tw", { PANGO_SCRIPT_LATIN/*70*/ } }, 00282 { "tyv", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, 00283 { "ug", { PANGO_SCRIPT_ARABIC/*125*/ } }, 00284 { "uk", { PANGO_SCRIPT_CYRILLIC/*72*/ } }, 00285 { "ur", { PANGO_SCRIPT_ARABIC/*145*/ } }, 00286 { "uz", { PANGO_SCRIPT_CYRILLIC/*68*/ } }, 00287 { "ven", { PANGO_SCRIPT_LATIN/*62*/ } }, 00288 { "vi", { PANGO_SCRIPT_LATIN/*186*/ } }, 00289 { "vot", { PANGO_SCRIPT_LATIN/*62*/ } }, 00290 { "vo", { PANGO_SCRIPT_LATIN/*54*/ } }, 00291 { "wa", { PANGO_SCRIPT_LATIN/*70*/ } }, 00292 { "wen", { PANGO_SCRIPT_LATIN/*76*/ } }, 00293 { "wo", { PANGO_SCRIPT_LATIN/*66*/ } }, 00294 { "xh", { PANGO_SCRIPT_LATIN/*52*/ } }, 00295 { "yap", { PANGO_SCRIPT_LATIN/*58*/ } }, 00296 { "yi", { PANGO_SCRIPT_HEBREW/*27*/ } }, 00297 { "yo", { PANGO_SCRIPT_LATIN/*114*/ } }, 00298 // { "zh-cn", { PANGO_SCRIPT_HAN/*6763*/ } }, 00299 { "zh-cn", { EncodingDetector::ChineseSimplified } }, 00300 // { "zh-hk", { PANGO_SCRIPT_HAN/*2213*/ } }, 00301 { "zh-hk", { EncodingDetector::ChineseTraditional } }, 00302 // { "zh-mo", { PANGO_SCRIPT_HAN/*2213*/ } }, 00303 { "zh-mo", { EncodingDetector::ChineseTraditional } }, 00304 // { "zh-sg", { PANGO_SCRIPT_HAN/*6763*/ } }, 00305 { "zh-sg", { EncodingDetector::ChineseSimplified } }, 00306 // { "zh-tw", { PANGO_SCRIPT_HAN/*13063*/ } }, 00307 { "zh-tw", { EncodingDetector::ChineseTraditional } }, 00308 { "zu", { PANGO_SCRIPT_LATIN/*52*/ } }, 00309 { "\x00", { EncodingDetector::None } } //end mark 00310 }; 00311 00312 enum MIB 00313 { 00314 MibLatin1 = 4, 00315 Mib8859_8 = 85, 00316 MibUtf8 = 106, 00317 MibUcs2 = 1000, 00318 MibUtf16 = 1015, 00319 MibUtf16BE = 1013, 00320 MibUtf16LE = 1014 00321 }; 00322 00323 static bool is16Bit(TQTextCodec* codec) 00324 { 00325 switch (codec->mibEnum()) 00326 { 00327 case MibUtf16: 00328 case MibUtf16BE: 00329 case MibUtf16LE: 00330 case MibUcs2: 00331 return true; 00332 default: 00333 return false; 00334 } 00335 } 00336 00337 class EncodingDetectorPrivate 00338 { 00339 public: 00340 TQTextCodec *m_codec; 00341 TQTextDecoder *m_decoder; // utf16 00342 TQTextCodec *m_defaultCodec; 00343 TQCString m_storeDecoderName; 00344 00345 EncodingDetector::EncodingChoiceSource m_source; 00346 EncodingDetector::AutoDetectScript m_autoDetectLanguage; 00347 00348 bool m_visualRTL : 1; 00349 bool m_seenBody : 1; 00350 bool m_writtingHappened : 1; 00351 bool m_analyzeCalled : 1; //for decode() 00352 int m_multiByte; 00353 00354 TQCString m_bufferForDefferedEncDetection; 00355 00356 EncodingDetectorPrivate() 00357 : m_codec(TQTextCodec::codecForMib(MibLatin1)) 00358 , m_decoder(m_codec->makeDecoder()) 00359 , m_defaultCodec(m_codec) 00360 , m_source(EncodingDetector::DefaultEncoding) 00361 , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection) 00362 , m_visualRTL(false) 00363 , m_seenBody(false) 00364 , m_writtingHappened(false) 00365 , m_analyzeCalled(false) 00366 , m_multiByte(0) 00367 { 00368 } 00369 00370 EncodingDetectorPrivate(TQTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script) 00371 : m_codec(codec) 00372 , m_decoder(m_codec->makeDecoder()) 00373 , m_defaultCodec(m_codec) 00374 , m_source(source) 00375 , m_autoDetectLanguage(script) 00376 , m_visualRTL(false) 00377 , m_seenBody(false) 00378 , m_writtingHappened(false) 00379 , m_analyzeCalled(false) 00380 , m_multiByte(0) 00381 { 00382 } 00383 00384 ~EncodingDetectorPrivate() 00385 { 00386 delete m_decoder; 00387 } 00388 }; 00389 00390 00391 static TQCString automaticDetectionForArabic( const unsigned char* ptr, int size ) 00392 { 00393 for ( int i = 0; i < size; ++i ) { 00394 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3 00395 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA ) 00396 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0 00397 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) { 00398 return "cp1256"; 00399 } 00400 } 00401 00402 return "iso-8859-6"; 00403 } 00404 00405 static TQCString automaticDetectionForBaltic( const unsigned char* ptr, int size ) 00406 { 00407 for ( int i = 0; i < size; ++i ) { 00408 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) ) 00409 return "cp1257"; 00410 00411 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 ) 00412 return "iso-8859-13"; 00413 } 00414 00415 return "iso-8859-13"; 00416 } 00417 00418 static TQCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size ) 00419 { 00420 TQCString charset; 00421 for ( int i = 0; i < size; ++i ) { 00422 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) { 00423 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 ) 00424 return "ibm852"; 00425 00426 if ( i + 1 > size ) 00427 return "cp1250"; 00428 else { // maybe ibm852 ? 00429 charset = "cp1250"; 00430 continue; 00431 } 00432 } 00433 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) { 00434 if ( i + 1 > size ) 00435 return "iso-8859-2"; 00436 else { // maybe ibm852 ? 00437 if ( charset.isNull() ) 00438 charset = "iso-8859-2"; 00439 continue; 00440 } 00441 } 00442 } 00443 00444 if ( charset.isNull() ) 00445 charset = "iso-8859-3"; 00446 00447 return charset.data(); 00448 } 00449 00450 static TQCString automaticDetectionForCyrillic( const unsigned char* ptr, int size) 00451 { 00452 #ifdef DECODE_DEBUG 00453 kWarning() << "EncodingDetector: Cyr heuristics"; 00454 #endif 00455 00456 // if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf) 00457 // return "utf8"; 00458 int utf8_mark=0; 00459 int koi_score=0; 00460 int cp1251_score=0; 00461 00462 int koi_st=0; 00463 int cp1251_st=0; 00464 00465 // int koi_na=0; 00466 // int cp1251_na=0; 00467 00468 int koi_o_capital=0; 00469 int koi_o=0; 00470 int cp1251_o_capital=0; 00471 int cp1251_o=0; 00472 00473 int koi_a_capital=0; 00474 int koi_a=0; 00475 int cp1251_a_capital=0; 00476 int cp1251_a=0; 00477 00478 int koi_s_capital=0; 00479 int koi_s=0; 00480 int cp1251_s_capital=0; 00481 int cp1251_s=0; 00482 00483 int koi_i_capital=0; 00484 int koi_i=0; 00485 int cp1251_i_capital=0; 00486 int cp1251_i=0; 00487 00488 int cp1251_small_range=0; 00489 int koi_small_range=0; 00490 int ibm866_small_range=0; 00491 00492 int i; 00493 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i) 00494 { 00495 if (ptr[i]>0xdf) 00496 { 00497 ++cp1251_small_range; 00498 00499 if (ptr[i]==0xee)//small o 00500 ++cp1251_o; 00501 else if (ptr[i]==0xe0)//small a 00502 ++cp1251_a; 00503 else if (ptr[i]==0xe8)//small i 00504 ++cp1251_i; 00505 else if (ptr[i]==0xf1)//small s 00506 ++cp1251_s; 00507 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st 00508 ++cp1251_st; 00509 00510 else if (ptr[i]==0xef) 00511 ++koi_o_capital; 00512 else if (ptr[i]==0xe1) 00513 ++koi_a_capital; 00514 else if (ptr[i]==0xe9) 00515 ++koi_i_capital; 00516 else if (ptr[i]==0xf3) 00517 ++koi_s_capital; 00518 00519 } 00520 else if (ptr[i]>0xbf) 00521 { 00522 ++koi_small_range; 00523 00524 if (ptr[i]==0xd0||ptr[i]==0xd1)//small o 00525 ++utf8_mark; 00526 else if (ptr[i]==0xcf)//small o 00527 ++koi_o; 00528 else if (ptr[i]==0xc1)//small a 00529 ++koi_a; 00530 else if (ptr[i]==0xc9)//small i 00531 ++koi_i; 00532 else if (ptr[i]==0xd3)//small s 00533 ++koi_s; 00534 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st 00535 ++koi_st; 00536 00537 else if (ptr[i]==0xce) 00538 ++cp1251_o_capital; 00539 else if (ptr[i]==0xc0) 00540 ++cp1251_a_capital; 00541 else if (ptr[i]==0xc8) 00542 ++cp1251_i_capital; 00543 else if (ptr[i]==0xd1) 00544 ++cp1251_s_capital; 00545 } 00546 else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60% 00547 ++ibm866_small_range; 00548 00549 } 00550 00551 //cannot decide? 00552 if (cp1251_small_range+koi_small_range+ibm866_small_range<8) 00553 { 00554 return ""; 00555 } 00556 00557 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range) 00558 { 00559 #ifdef DECODE_DEBUG 00560 kWarning() << "Cyr Enc Detection: UTF8"; 00561 #endif 00562 return "UTF-8"; 00563 } 00564 00565 if (ibm866_small_range>cp1251_small_range+koi_small_range) 00566 return "ibm866"; 00567 00568 // TQCString koi_string = "koi8-u"; 00569 // TQCString cp1251_string = "cp1251"; 00570 00571 if (cp1251_st==0 && koi_st>1) 00572 koi_score+=10; 00573 else if (koi_st==0 && cp1251_st>1) 00574 cp1251_score+=10; 00575 00576 if (cp1251_st && koi_st) 00577 { 00578 if (cp1251_st/koi_st>2) 00579 cp1251_score+=20; 00580 else if (koi_st/cp1251_st>2) 00581 koi_score+=20; 00582 } 00583 00584 if (cp1251_a>koi_a) 00585 cp1251_score+=10; 00586 else if (cp1251_a || koi_a) 00587 koi_score+=10; 00588 00589 if (cp1251_o>koi_o) 00590 cp1251_score+=10; 00591 else if (cp1251_o || koi_o) 00592 koi_score+=10; 00593 00594 if (cp1251_i>koi_i) 00595 cp1251_score+=10; 00596 else if (cp1251_i || koi_i) 00597 koi_score+=10; 00598 00599 if (cp1251_s>koi_s) 00600 cp1251_score+=10; 00601 else if (cp1251_s || koi_s) 00602 koi_score+=10; 00603 00604 if (cp1251_a_capital>koi_a_capital) 00605 cp1251_score+=9; 00606 else if (cp1251_a_capital || koi_a_capital) 00607 koi_score+=9; 00608 00609 if (cp1251_o_capital>koi_o_capital) 00610 cp1251_score+=9; 00611 else if (cp1251_o_capital || koi_o_capital) 00612 koi_score+=9; 00613 00614 if (cp1251_i_capital>koi_i_capital) 00615 cp1251_score+=9; 00616 else if (cp1251_i_capital || koi_i_capital) 00617 koi_score+=9; 00618 00619 if (cp1251_s_capital>koi_s_capital) 00620 cp1251_score+=9; 00621 else if (cp1251_s_capital || koi_s_capital) 00622 koi_score+=9; 00623 #ifdef DECODE_DEBUG 00624 kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score; 00625 #endif 00626 if (abs(koi_score-cp1251_score)<10) 00627 { 00628 //fallback... 00629 cp1251_score=cp1251_small_range; 00630 koi_score=koi_small_range; 00631 } 00632 if (cp1251_score>koi_score) 00633 return "cp1251"; 00634 else 00635 return "koi8-u"; 00636 00637 00638 // if (cp1251_score>koi_score) 00639 // setEncoding("cp1251",AutoDetectedEncoding); 00640 // else 00641 // setEncoding("koi8-u",AutoDetectedEncoding); 00642 // return true; 00643 00644 } 00645 00646 static TQCString automaticDetectionForGreek( const unsigned char* ptr, int size ) 00647 { 00648 for ( int i = 0; i < size; ++i ) { 00649 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B 00650 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4 00651 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) { 00652 return "cp1253"; 00653 } 00654 } 00655 00656 return "iso-8859-7"; 00657 } 00658 00659 static TQCString automaticDetectionForHebrew( const unsigned char* ptr, int size ) 00660 { 00661 for ( int i = 0; i < size; ++i ) { 00662 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B 00663 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 ) 00664 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) { 00665 return "cp1255"; 00666 } 00667 00668 if ( ptr[ i ] == 0xDF ) 00669 return "iso-8859-8-i"; 00670 } 00671 00672 return "iso-8859-8-i"; 00673 } 00674 00675 static TQCString automaticDetectionForJapanese( const unsigned char* ptr, int size ) 00676 { 00677 JapaneseCode kc; 00678 00679 switch ( kc.guess_jp( (const char*)ptr, size ) ) { 00680 case JapaneseCode::JIS: 00681 return "jis7"; 00682 case JapaneseCode::EUC: 00683 return "eucjp"; 00684 case JapaneseCode::SJIS: 00685 return "sjis"; 00686 case JapaneseCode::UTF8: 00687 return "utf8"; 00688 default: 00689 break; 00690 } 00691 00692 return ""; 00693 } 00694 00695 static TQCString automaticDetectionForTurkish( const unsigned char* ptr, int size ) 00696 { 00697 for ( int i = 0; i < size; ++i ) { 00698 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) { 00699 return "cp1254"; 00700 } 00701 } 00702 00703 return "iso-8859-9"; 00704 } 00705 00706 static TQCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size ) 00707 { 00708 uint nonansi_count=0; 00709 for (int i=0; i<size; ++i) 00710 { 00711 if (ptr[i]>0x79) 00712 { 00713 ++nonansi_count; 00714 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0) 00715 { 00716 return "UTF-8"; 00717 } 00718 if (ptr[i] >= 0x78 && ptr[i] <= 0x9 ) 00719 { 00720 return "cp1252"; 00721 } 00722 } 00723 00724 } 00725 00726 if (nonansi_count>0) 00727 return "iso-8859-15"; 00728 00729 return ""; 00730 } 00731 00732 bool EncodingDetector::errorsIfUtf8 (const char* data, int length) 00733 { 00734 if (d->m_codec->mibEnum()!=MibUtf8) 00735 return false; //means no errors 00736 // #define highest1Bits (unsigned char)0x80 00737 // #define highest2Bits (unsigned char)0xC0 00738 // #define highest3Bits (unsigned char)0xE0 00739 // #define highest4Bits (unsigned char)0xF0 00740 // #define highest5Bits (unsigned char)0xF8 00741 static const unsigned char highest1Bits = 0x80; 00742 static const unsigned char highest2Bits = 0xC0; 00743 static const unsigned char highest3Bits = 0xE0; 00744 static const unsigned char highest4Bits = 0xF0; 00745 static const unsigned char highest5Bits = 0xF8; 00746 00747 for (int i=0; i<length; ++i) 00748 { 00749 unsigned char c = data[i]; 00750 00751 if (d->m_multiByte>0) 00752 { 00753 if ((c & highest2Bits) == 0x80) 00754 { 00755 --(d->m_multiByte); 00756 continue; 00757 } 00758 #ifdef DECODE_DEBUG 00759 kWarning() << "EncDetector: Broken UTF8"; 00760 #endif 00761 return true; 00762 } 00763 00764 // most significant bit zero, single char 00765 if ((c & highest1Bits) == 0x00) 00766 continue; 00767 00768 // 110xxxxx => init 1 following bytes 00769 if ((c & highest3Bits) == 0xC0) 00770 { 00771 d->m_multiByte = 1; 00772 continue; 00773 } 00774 00775 // 1110xxxx => init 2 following bytes 00776 if ((c & highest4Bits) == 0xE0) 00777 { 00778 d->m_multiByte = 2; 00779 continue; 00780 } 00781 00782 // 11110xxx => init 3 following bytes 00783 if ((c & highest5Bits) == 0xF0) 00784 { 00785 d->m_multiByte = 3; 00786 continue; 00787 } 00788 #ifdef DECODE_DEBUG 00789 kWarning() << "EncDetector:_Broken UTF8"; 00790 #endif 00791 return true; 00792 } 00793 return false; 00794 } 00795 00796 EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate) 00797 { 00798 } 00799 00800 EncodingDetector::EncodingDetector(TQTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) : 00801 d(new EncodingDetectorPrivate(codec,source,script)) 00802 { 00803 } 00804 00805 EncodingDetector::~EncodingDetector() 00806 { 00807 delete d; 00808 } 00809 00810 void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang) 00811 { 00812 d->m_autoDetectLanguage=lang; 00813 } 00814 EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const 00815 { 00816 return d->m_autoDetectLanguage; 00817 } 00818 00819 EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const 00820 { 00821 return d->m_source; 00822 } 00823 00824 const char* EncodingDetector::encoding() const 00825 { 00826 d->m_storeDecoderName = d->m_codec->name(); 00827 d->m_storeDecoderName = d->m_storeDecoderName.lower().replace( "iso ", "iso-" ); 00828 return d->m_storeDecoderName.data(); 00829 } 00830 00831 bool EncodingDetector::visuallyOrdered() const 00832 { 00833 return d->m_visualRTL; 00834 } 00835 00836 // const TQTextCodec* EncodingDetector::codec() const 00837 // { 00838 // return d->m_codec; 00839 // } 00840 00841 TQTextDecoder* EncodingDetector::decoder() 00842 { 00843 return d->m_decoder; 00844 } 00845 00846 bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type) 00847 { 00848 TQTextCodec *codec; 00849 TQCString enc(_encoding); 00850 if(/*enc.isNull() || */enc.isEmpty()) 00851 { 00852 if (type==DefaultEncoding) 00853 codec=d->m_defaultCodec; 00854 else 00855 return false; 00856 } 00857 else 00858 { 00859 //TQString->TQTextCodec 00860 00861 enc = enc.lower(); 00862 // hebrew visually ordered 00863 if(enc=="visual") 00864 enc="iso8859-8"; 00865 bool b; 00866 codec = TDEGlobal::charsets()->codecForName(enc, b); 00867 if (!b) 00868 return false; 00869 } 00870 00871 if (d->m_codec->mibEnum()==codec->mibEnum()) 00872 return true; 00873 00874 if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec)) 00875 { 00876 //Sometimes the codec specified is absurd, i.e. UTF-16 despite 00877 //us decoding a meta tag as ASCII. In that case, ignore it. 00878 return false; 00879 } 00880 00881 if (codec->mibEnum() == Mib8859_8) 00882 { 00883 //We do NOT want to use TQt's TQHebrewCodec, since it tries to reorder itself. 00884 codec = TQTextCodec::codecForName("iso8859-8-i"); 00885 00886 // visually ordered unless one of the following 00887 if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical")) 00888 d->m_visualRTL = true; 00889 } 00890 00891 d->m_codec = codec; 00892 d->m_source = type; 00893 delete d->m_decoder; 00894 d->m_decoder = d->m_codec->makeDecoder(); 00895 #ifdef DECODE_DEBUG 00896 kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name(); 00897 #endif 00898 return true; 00899 } 00900 00901 bool EncodingDetector::analyze(const TQByteArray &data) 00902 { 00903 return analyze( data.data(), data.size() ); 00904 } 00905 00906 bool EncodingDetector::analyze(const char *data, int len) 00907 { 00908 // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. 00909 // maximumBOMLength = 10 00910 // Even if the user has chosen utf16 we still need to auto-detect the endianness 00911 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) 00912 { 00913 // Extract the first three bytes. 00914 const uchar *udata = (const uchar *)data; 00915 uchar c1 = *udata++; 00916 uchar c2 = *udata++; 00917 uchar c3 = *udata++; 00918 00919 // Check for the BOM 00920 const char *autoDetectedEncoding; 00921 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) 00922 { 00923 autoDetectedEncoding = "ISO-10646-UCS-2"; 00924 } 00925 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) 00926 { 00927 autoDetectedEncoding = "UTF-8"; 00928 } 00929 else if (c1 == 0x00 || c2 == 0x00) 00930 { 00931 uchar c4 = *udata++; 00932 uchar c5 = *udata++; 00933 uchar c6 = *udata++; 00934 uchar c7 = *udata++; 00935 uchar c8 = *udata++; 00936 uchar c9 = *udata++; 00937 uchar c10 = *udata++; 00938 00939 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); 00940 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); 00941 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0)) 00942 autoDetectedEncoding = "ISO-10646-UCS-2"; 00943 else 00944 autoDetectedEncoding = 0; 00945 } 00946 else 00947 { 00948 autoDetectedEncoding = 0; 00949 } 00950 00951 // If we found a BOM, use the encoding it implies. 00952 if (autoDetectedEncoding != 0) 00953 { 00954 d->m_source = BOM; 00955 d->m_codec = TQTextCodec::codecForName(autoDetectedEncoding); 00956 assert(d->m_codec); 00957 //enc = d->m_codec->name(); 00958 delete d->m_decoder; 00959 d->m_decoder = d->m_codec->makeDecoder(); 00960 #ifdef DECODE_DEBUG 00961 kWarning() << "Detection by BOM"; 00962 #endif 00963 if (is16Bit(d->m_codec) && c2==0x00) 00964 { 00965 // utf16LE, we need to put the decoder in LE mode 00966 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00}; 00967 d->m_decoder->toUnicode(reverseUtf16, 2); 00968 } 00969 return true; 00970 } 00971 } 00972 00973 //exit from routine in case it was called to only detect byte order for utf-16 00974 if (d->m_source==UserChosenEncoding) 00975 { 00976 #ifdef DECODE_DEBUG 00977 kWarning() << "EncodingDetector: UserChosenEncoding exit "; 00978 #endif 00979 00980 if (errorsIfUtf8(data, len)) 00981 setEncoding("",DefaultEncoding); 00982 return true; 00983 } 00984 #if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz 00985 if (!d->m_seenBody) 00986 { 00987 // we still don't have an encoding, and are in the head 00988 // the following tags are allowed in <head>: 00989 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE 00990 const char *ptr = data; 00991 const char *pEnd = data+len; 00992 00993 while(ptr != pEnd) 00994 { 00995 if(*ptr!='<') 00996 { 00997 ++ptr; 00998 continue; 00999 } 01000 ++ptr; 01001 // Handle comments. 01002 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') 01003 { 01004 ptr += 3; 01005 skipComment(ptr, pEnd); 01006 continue; 01007 } 01008 01009 // Handle XML header, which can have encoding in it. 01010 if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l') 01011 { 01012 const char *end = ptr; 01013 while (*end != '>' && end < pEnd) 01014 end++; 01015 if (*end == '\0' || end == pEnd) 01016 break; 01017 TQCString str(ptr, end - ptr + 1); 01018 int length; 01019 int pos = findXMLEncoding(str, length); 01020 // also handles the case when specified encoding aint correct 01021 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader)) 01022 { 01023 return true; 01024 } 01025 } 01026 01027 //look for <meta>, stop if we reach <body> 01028 while ( 01029 !((*ptr >= 'a') && (*ptr <= 'z') || 01030 (*ptr >= 'A') && (*ptr <= 'Z')) 01031 && ptr < pEnd 01032 ) 01033 ++ptr; 01034 01035 char tmp[5]; 01036 int length=0; 01037 const char* max=ptr+4; 01038 if (pEnd<max) 01039 max=pEnd; 01040 while ( 01041 ((*ptr >= 'a') && (*ptr <= 'z') || 01042 (*ptr >= 'A') && (*ptr <= 'Z') || 01043 (*ptr >= '0') && (*ptr <= '9')) 01044 && ptr < max 01045 ) 01046 { 01047 tmp[length] = tolower( *ptr ); 01048 ++ptr; 01049 ++length; 01050 } 01051 tmp[length] = 0; 01052 if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a') 01053 { 01054 // found a meta tag... 01055 const char* end = ptr; 01056 while(*end != '>' && *end != '\0' && end<pEnd) 01057 end++; 01058 //if ( *end == '\0' ) break; 01059 TQCString str( ptr, (end-ptr)+1); 01060 str = str.lower(); 01061 int pos=0; 01062 //if( (pos = str.find("http-equiv", pos)) == -1) break; 01063 //if( (pos = str.find("content-type", pos)) == -1) break; 01064 if( (pos = str.find("charset")) == -1) 01065 continue; 01066 pos+=6; 01067 // skip to '=' 01068 if( (pos = str.find('=', pos)) == -1) 01069 continue; 01070 01071 // skip whitespace before encoding itself 01072 while (pos < (int)str.length() && str[pos] <= ' ') 01073 ++pos; 01074 if ( pos == (int)str.length()) 01075 continue; 01076 01077 int endpos = pos; 01078 while( endpos < str.length() && 01079 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' 01080 && str[endpos] != ';' && str[endpos] != '>') ) 01081 ++endpos; 01082 #ifdef DECODE_DEBUG 01083 kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data(); 01084 #endif 01085 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag)) 01086 return true; 01087 } 01088 else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y') 01089 { 01090 d->m_seenBody=true; 01091 break; 01092 } 01093 } 01094 } 01095 01096 if (d->m_source==EncodingFromHTTPHeader) 01097 return true; 01098 #endif 01099 //if (len<20) //make a guess even if the file is short -- ahartmetz 01100 if (len < 1) 01101 { 01102 setEncoding("",DefaultEncoding); 01103 return false; 01104 } 01105 #ifdef DECODE_DEBUG 01106 kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")"; 01107 #endif 01108 01109 switch ( d->m_autoDetectLanguage ) 01110 { 01111 case EncodingDetector::Arabic: 01112 return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding); 01113 // break; 01114 case EncodingDetector::Baltic: 01115 return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding); 01116 // break; 01117 case EncodingDetector::CentralEuropean: 01118 return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding); 01119 break; 01120 case EncodingDetector::Cyrillic: 01121 return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding); 01122 // break; 01123 case EncodingDetector::Greek: 01124 return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding); 01125 // break; 01126 case EncodingDetector::Hebrew: 01127 return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding); 01128 // break; 01129 case EncodingDetector::Japanese: 01130 return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding); 01131 // break; 01132 case EncodingDetector::Turkish: 01133 return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding); 01134 // break; 01135 case EncodingDetector::WesternEuropean: 01136 if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding)) 01137 return true; 01138 else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for tdehtml 01139 { 01140 return setEncoding("iso-8859-15",AutoDetectedEncoding); 01141 } 01142 else //use default provided by eg katepart 01143 { 01144 return setEncoding("",DefaultEncoding); 01145 } 01146 // break; 01147 case EncodingDetector::SemiautomaticDetection: 01148 case EncodingDetector::ChineseSimplified: 01149 case EncodingDetector::ChineseTraditional: 01150 case EncodingDetector::Korean: 01151 case EncodingDetector::Thai: 01152 case EncodingDetector::Unicode: 01153 case EncodingDetector::NorthernSaami: 01154 case EncodingDetector::SouthEasternEurope: 01155 case EncodingDetector::None: 01156 // huh. somethings broken in this code ### FIXME 01157 //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. 01158 break; 01159 } 01160 01161 setEncoding("",DefaultEncoding); 01162 return true; 01163 } 01164 01165 01166 EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const TQString& lang) 01167 { 01168 if (lang.isEmpty()) 01169 return EncodingDetector::None; 01170 else if (lang==i18n("@item Text character set", "Unicode")) 01171 return EncodingDetector::Unicode; 01172 else if (lang==i18n("@item Text character set", "Cyrillic")) 01173 return EncodingDetector::Cyrillic; 01174 else if (lang==i18n("@item Text character set", "Western European")) 01175 return EncodingDetector::WesternEuropean; 01176 else if (lang==i18n("@item Text character set", "Central European")) 01177 return EncodingDetector::CentralEuropean; 01178 else if (lang==i18n("@item Text character set", "Greek")) 01179 return EncodingDetector::Greek; 01180 else if (lang==i18n("@item Text character set", "Hebrew")) 01181 return EncodingDetector::Hebrew; 01182 else if (lang==i18n("@item Text character set", "Turkish")) 01183 return EncodingDetector::Turkish; 01184 else if (lang==i18n("@item Text character set", "Japanese")) 01185 return EncodingDetector::Japanese; 01186 else if (lang==i18n("@item Text character set", "Baltic")) 01187 return EncodingDetector::Baltic; 01188 else if (lang==i18n("@item Text character set", "Arabic")) 01189 return EncodingDetector::Arabic; 01190 01191 return EncodingDetector::None; 01192 } 01193 01194 bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script) 01195 { 01196 switch (script) 01197 { 01198 case EncodingDetector::Arabic: 01199 return true; 01200 case EncodingDetector::Baltic: 01201 return true; 01202 case EncodingDetector::CentralEuropean: 01203 return true; 01204 case EncodingDetector::Cyrillic: 01205 return true; 01206 case EncodingDetector::Greek: 01207 return true; 01208 case EncodingDetector::Hebrew: 01209 return true; 01210 case EncodingDetector::Japanese: 01211 return true; 01212 case EncodingDetector::Turkish: 01213 return true; 01214 case EncodingDetector::WesternEuropean: 01215 return true; 01216 case EncodingDetector::ChineseTraditional: 01217 return true; 01218 case EncodingDetector::ChineseSimplified: 01219 return true; 01220 case EncodingDetector::Unicode: 01221 return true; 01222 break; 01223 default: 01224 return false; 01225 } 01226 } 01227 01228 TQString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script) 01229 { 01230 switch (script) 01231 { 01232 case EncodingDetector::Arabic: 01233 return i18n("@item Text character set", "Arabic"); 01234 break; 01235 case EncodingDetector::Baltic: 01236 return i18n("@item Text character set", "Baltic"); 01237 break; 01238 case EncodingDetector::CentralEuropean: 01239 return i18n("@item Text character set", "Central European"); 01240 break; 01241 case EncodingDetector::Cyrillic: 01242 return i18n("@item Text character set", "Cyrillic"); 01243 break; 01244 case EncodingDetector::Greek: 01245 return i18n("@item Text character set", "Greek"); 01246 break; 01247 case EncodingDetector::Hebrew: 01248 return i18n("@item Text character set", "Hebrew"); 01249 break; 01250 case EncodingDetector::Japanese: 01251 return i18n("@item Text character set", "Japanese"); 01252 break; 01253 case EncodingDetector::Turkish: 01254 return i18n("@item Text character set", "Turkish"); 01255 break; 01256 case EncodingDetector::WesternEuropean: 01257 return i18n("@item Text character set", "Western European"); 01258 break; 01259 case EncodingDetector::ChineseTraditional: 01260 return i18n("@item Text character set", "Chinese Traditional"); 01261 break; 01262 case EncodingDetector::ChineseSimplified: 01263 return i18n("@item Text character set", "Chinese Simplified"); 01264 break; 01265 case EncodingDetector::Korean: 01266 return i18n("@item Text character set", "Korean"); 01267 break; 01268 case EncodingDetector::Thai: 01269 return i18n("@item Text character set", "Thai"); 01270 break; 01271 case EncodingDetector::Unicode: 01272 return i18n("@item Text character set", "Unicode"); 01273 break; 01274 //case EncodingDetector::SemiautomaticDetection: 01275 default: 01276 return TQString(); 01277 01278 } 01279 } 01280 01281 EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const TQString &lc) 01282 { 01283 // It might make sense to do something special if the locale ends with 01284 // ".UTF-8" or "@utf8" 01285 const char *langStr = pango_script_for_lang[0].lang; 01286 // There is obvious optimization potential... 01287 for ( int i = 0; langStr; i++ ) { 01288 langStr = pango_script_for_lang[i].lang; 01289 // startsWith() works for empty strings: every string "starts with" an empty string. 01290 if ( lc.startsWith( TQString::fromAscii( langStr ) ) ) 01291 return pango_script_for_lang[i].scripts[0]; 01292 } 01293 return None; 01294 } 01295 01296 #undef DECODE_DEBUG 01297