qutf7codec.cpp
00001 /* 00002 qutf7codec.cpp 00003 00004 A TQTextCodec for UTF-7 (rfc2152). 00005 Copyright (c) 2001 Marc Mutz <mutz@kde.org> 00006 See file COPYING for details 00007 00008 This program is free software; you can redistribute it and/or modify 00009 it under the terms of the GNU General Public License, version 2.0, 00010 as published by the Free Software Foundation. 00011 00012 You should have received a copy of the GNU General Public License 00013 along with this program; if not, write to the Free Software 00014 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 00015 02110-1301, US 00016 00017 As a special exception, permission is granted to use this plugin 00018 with any version of TQt by TrollTech AS, Norway. In this case, the 00019 use of this plugin doesn't cause the resulting executable to be 00020 covered by the GNU General Public License. 00021 This exception does not however invalidate any other reasons why the 00022 executable file might be covered by the GNU General Public License. 00023 */ 00024 00025 00026 #include "qutf7codec.h" 00027 00028 #ifndef TQT_NO_TEXTCODEC 00029 00030 int TQUtf7Codec::mibEnum() const { 00031 return 1012; 00032 } 00033 00034 int TQStrictUtf7Codec::mibEnum() const { 00035 return -1012; 00036 } 00037 00038 const char* TQUtf7Codec::name() const { 00039 return "UTF-7"; 00040 } 00041 00042 const char* TQStrictUtf7Codec::name() const { 00043 return "X-QT-UTF-7-STRICT"; 00044 } 00045 00046 const char* TQUtf7Codec::mimeName() const { 00047 return "UTF-7"; 00048 } 00049 00050 bool TQUtf7Codec::canEncode( TQChar ) const { 00051 return TRUE; 00052 } 00053 00054 bool TQUtf7Codec::canEncode( const TQString & ) const { 00055 return TRUE; 00056 } 00057 00058 static uchar base64Set[] = { 00059 0x00, 0x00, 0x00, 0x00, // '\0' ... 00060 0x00, 0x11, 0xFF, 0xC0, // ' ' ... '?' 00061 0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_' 00062 0x7F, 0xFF, 0xFF, 0xE0 // '`' ... DEL 00063 }; 00064 00065 static uchar base64SetWithLastTwoBitsZero[] = { 00066 0x00, 0x00, 0x00, 0x00, // '\0' ... 00067 0x00, 0x00, 0x88, 0x80, // ' ' ... '?' 00068 0x44, 0x44, 0x44, 0x40, // '@' ... '_' 00069 0x11, 0x11, 0x11, 0x00 // '`' ... DEL 00070 }; 00071 00072 static uchar directSet[] = { 00073 0x00, 0x00, 0x00, 0x00, // '\0' ... 00074 0x01, 0xCF, 0xFF, 0xE1, // ' ' ... '?' 00075 0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_' 00076 0x7F, 0xFF, 0xFF, 0xE0 // '`' ... DEL 00077 }; 00078 00079 static uchar optDirectSet[] = { 00080 0x00, 0x00, 0x00, 0x00, // '\0' ... 00081 0x7E, 0x20, 0x00, 0x1E, // ' ' ... '?' 00082 0x80, 0x00, 0x00, 0x17, // '@' ... '_' 00083 0x80, 0x00, 0x00, 0x1C // '`' ... DEL 00084 }; 00085 00086 static inline bool isOfSet(uchar ch, uchar* set) { 00087 return set[ ch/8 ] & (0x80 >> ( ch%8 )); 00088 } 00089 00090 int TQUtf7Codec::heuristicContentMatch(const char* chars, int len) const 00091 { 00092 int stepNo = 0; 00093 int i; 00094 bool shifted = FALSE; 00095 bool rightAfterEscape = FALSE; 00096 bool onlyNullBitsSinceLastBoundary = TRUE; 00097 for ( i = 0; i < len ; i++ ) { 00098 if ((unsigned char)chars[i] >= 128) // 8bit chars not allowed. 00099 break; 00100 if (shifted) { 00101 if ( isOfSet(chars[i],base64Set) ) { 00102 switch (stepNo) { 00103 case 0: 00104 onlyNullBitsSinceLastBoundary = TRUE; 00105 break; 00106 case 3: 00107 onlyNullBitsSinceLastBoundary 00108 = isOfSet(chars[i],base64SetWithLastTwoBitsZero); 00109 break; 00110 case 6: 00111 onlyNullBitsSinceLastBoundary 00112 = ( chars[i] == 'A' || chars[i] == 'Q' || 00113 chars[i] == 'g' || chars[i] == 'w' ); 00114 break; 00115 default: 00116 onlyNullBitsSinceLastBoundary 00117 = onlyNullBitsSinceLastBoundary && (chars[i] == 'A'); 00118 } 00119 stepNo = (stepNo + 1) % 8; 00120 rightAfterEscape = FALSE; 00121 } else { 00122 if (rightAfterEscape && chars[i] != '-') 00123 break; // a '+' must be followed by '-' or a base64 char 00124 if (!onlyNullBitsSinceLastBoundary) 00125 break; // non-zero bits in the tail of the base64 encoding 00126 shifted = FALSE; 00127 stepNo = 0; 00128 } 00129 } else { 00130 if (chars[i] == '+') { 00131 shifted = TRUE; 00132 rightAfterEscape = TRUE; 00133 } 00134 } 00135 } 00136 return i; 00137 } 00138 00139 class TQUtf7Decoder : public TQTextDecoder { 00140 // the storage for our unicode char until it's finished 00141 ushort uc; 00142 // the state of the base64 decoding 00143 // can be 0 (just finished three unicode chars) 00144 // 1 (have the upper 6 bits of uc already) 00145 // 2 (have the upper 12 bits of uc already) 00146 // 3 (have the upper 2 bits of uc already) 00147 // .......... 00148 // 7 (have the upper 10 bits of uc already) 00149 // => n (have the upper (n * 6) % 16 bits of uc already) 00150 // "stepNo" cycles through all it's values every three 00151 // unicode chars. 00152 char stepNo; 00153 // remembers if we are in shifted-sequence mode 00154 bool shifted; 00155 // remembers if we're just after the initial '+' 00156 // of a shifted-sequence. 00157 bool rightAfterEscape; 00158 public: 00159 TQUtf7Decoder() : uc(0), stepNo(0), shifted(FALSE), rightAfterEscape(FALSE) 00160 { 00161 } 00162 00163 private: 00164 inline void resetParser() 00165 { 00166 uc = 0; 00167 stepNo = 0; 00168 shifted = FALSE; 00169 rightAfterEscape = FALSE; 00170 } 00171 00172 public: 00173 TQString toUnicode(const char* chars, int len) 00174 { 00175 TQString result = ""; 00176 for (int i=0; i<len; i++) { 00177 uchar ch = chars[i]; 00178 00179 // 00180 // check for 8bit char's: 00181 // 00182 if ( ch > 127 ) { 00183 tqWarning("TQUtf7Decoder: 8bit char found in input. " 00184 "Parser has been re-initialized!"); 00185 resetParser(); 00186 result += TQChar::replacement; 00187 continue; 00188 } 00189 00190 if (shifted) { // in shifted mode 00191 00192 // 00193 // first, we check specialities that only occur 00194 // right after the escaping '+': 00195 // 00196 if ( rightAfterEscape && ch == '-' ) { 00197 // a "+-" sequence is a short-circuit encoding 00198 // for just '+': 00199 resetParser(); 00200 result += TQChar('+'); 00201 // we're already done for this "ch", so 00202 continue; 00203 } 00204 00205 // 00206 // Here we're going to extract the bits represented by "ch": 00207 // 00208 ushort bits; 00209 if ( ch >= 'A' && ch <= 'Z' ) { 00210 bits = ch - 'A'; 00211 } else if ( ch >= 'a' && ch <= 'z' ) { 00212 bits = ch - 'a' + 26; 00213 } else if ( ch >= '0' && ch <= '9' ) { 00214 bits = ch - '0' + 52; 00215 } else if ( ch == '+' ) { 00216 bits = 62; 00217 } else if ( ch == '/' ) { 00218 bits = 63; 00219 } else { 00220 bits = 0; // keep compiler happy 00221 00222 // 00223 // ch is not of the base64 alphabet. 00224 // Here we are going to check the sequence's validity: 00225 // 00226 if ( rightAfterEscape ) { 00227 // any non-base64 char following an escaping '+' 00228 // makes for an ill-formed sequence. 00229 // Note that we catch (the valid) "+-" pair 00230 // right at the beginning. 00231 tqWarning("TQUtf7Decoder: ill-formed input: " 00232 "non-base64 char after escaping \"+\"!"); 00233 } 00234 // pending bits from base64 encoding must be all 0: 00235 if (stepNo >= 1 && uc) { 00236 tqWarning("TQUtf7Decoder: ill-formed sequence: " 00237 "non-zero bits in shifted-sequence tail!"); 00238 } 00239 resetParser(); 00240 00241 // a '-' signifies the end of the shifted-sequence, 00242 // so we just swallow it. 00243 if ( ch == '-' ) 00244 continue; 00245 // end of validity checking. Process ch now... 00246 } 00247 00248 if ( /*still*/ shifted ) { 00249 // 00250 // now we're going to stuff the "bits" bit bucket into 00251 // the right position inside "uc", emitting a resulting 00252 // TQChar if possible. 00253 // 00254 switch (stepNo) { 00255 // "bits" are the 6 msb's of uc 00256 case 0: uc = bits << 10; break; 00257 00258 case 1: uc |= bits << 4; break; 00259 00260 // 4 bits of "bits" complete the first ushort 00261 case 2: uc |= bits >> 2; result += TQChar(uc); 00262 // 2 bits of "bits" make the msb's of the next ushort 00263 uc = bits << 14; break; 00264 case 3: uc |= bits << 8; break; 00265 case 4: uc |= bits << 2; break; 00266 00267 // 2 bits of "bits" complete the second ushort 00268 case 5: uc |= bits >> 4; result += TQChar(uc); 00269 // 4 bits of "bits" make the msb's of the next ushort 00270 uc = bits << 12; break; 00271 case 6: uc |= bits << 6; break; 00272 00273 // these 6 bits complete the third ushort 00274 // and also one round of 8 chars -> 3 ushort decoding 00275 case 7: uc |= bits; result += TQChar(uc); 00276 uc = 0; break; 00277 default: ; 00278 } // switch (stepNo) 00279 // increase the step counter 00280 stepNo++; 00281 stepNo %= 8; 00282 rightAfterEscape = FALSE; 00283 // and look at the next char. 00284 continue; 00285 } // fi (still) shifted 00286 } // fi shifted 00287 00288 // 00289 // if control reaches here, we either weren't in a 00290 // shifted sequence or we just left one by seeing 00291 // a non-base64-char. 00292 // Either way, we have to process "ch" outside 00293 // a shifted-sequence now: 00294 // 00295 if ( ch == '+' ) { 00296 // '+' is the escape char for entering a 00297 // shifted sequence: 00298 shifted = TRUE; 00299 stepNo = 0; 00300 // also, we're right at the beginning where 00301 // special rules apply: 00302 rightAfterEscape = TRUE; 00303 } else { 00304 // US-ASCII values are directly used 00305 result += TQChar(ch); 00306 } 00307 } 00308 00309 return result; 00310 00311 } // toUnicode() 00312 00313 }; // class TQUtf7Decoder 00314 00315 TQTextDecoder* TQUtf7Codec::makeDecoder() const 00316 { 00317 return new TQUtf7Decoder; 00318 } 00319 00320 00321 class TQUtf7Encoder : public TQTextEncoder { 00322 uchar dontNeedEncodingSet[16]; 00323 ushort outbits; 00324 uint stepNo : 2; 00325 bool shifted : 1; 00326 bool mayContinueShiftedSequence : 1; 00327 public: 00328 TQUtf7Encoder(bool encOpt, bool encLwsp) 00329 : outbits(0), stepNo(0), 00330 shifted(FALSE), mayContinueShiftedSequence(FALSE) 00331 { 00332 for ( int i = 0; i < 16 ; i++) { 00333 dontNeedEncodingSet[i] = directSet[i]; 00334 if (!encOpt) 00335 dontNeedEncodingSet[i] |= optDirectSet[i]; 00336 } 00337 if(!encLwsp) { 00338 dontNeedEncodingSet[' '/8] |= 0x80 >> (' '%8); 00339 dontNeedEncodingSet['\n'/8] |= 0x80 >> ('\n'%8); 00340 dontNeedEncodingSet['\r'/8] |= 0x80 >> ('\r'%8); 00341 dontNeedEncodingSet['\t'/8] |= 0x80 >> ('\t'%8); 00342 } 00343 } 00344 00345 private: 00346 00347 char toBase64( ushort u ) { 00348 if ( u < 26 ) 00349 return (char)u + 'A'; 00350 else if ( u < 52 ) 00351 return (char)u - 26 + 'a'; 00352 else if ( u < 62 ) 00353 return (char)u - 52 + '0'; 00354 else if ( u == 62 ) 00355 return '+'; 00356 else 00357 return '/'; 00358 } 00359 00360 void addToShiftedSequence(TQCString::Iterator & t, ushort u) { 00361 switch (stepNo) { 00362 // no outbits; use uppermost 6 bits of u 00363 case 0: 00364 *t++ = toBase64( u >> 10 ); 00365 *t++ = toBase64( (u & 0x03FF /* umask top 6 bits */ ) >> 4 ); 00366 // save 4 lowest-order bits in outbits[5..2] 00367 outbits = (u & 0x000F) << 2; 00368 break; 00369 00370 // outbits available; use top two bits of u to complete 00371 // the previous char 00372 case 1: 00373 if (!mayContinueShiftedSequence) { 00374 // if mayContinue, this char has already been written 00375 *t++ = toBase64( outbits | ( u >> 14 ) ); 00376 } 00377 *t++ = toBase64( (u & 0x3F00 /* mask top 2 bits */ ) >> 8 ); 00378 *t++ = toBase64( (u & 0x00FC /* mask msbyte */ ) >> 2 ); 00379 // save 2 lowest-significant bits in outbits[5..4] 00380 outbits = (u & 0x0003) << 4; 00381 break; 00382 00383 // outbits available; use top four bits of u to complete 00384 // the previous char 00385 case 2: 00386 if (!mayContinueShiftedSequence) { 00387 // if mayContinue, this char has already been written 00388 *t++ = toBase64( outbits | ( u >> 12 ) ); 00389 } 00390 *t++ = toBase64( (u & 0x0FFF) >> 6 ); 00391 *t++ = toBase64( u & 0x003F ); 00392 break; 00393 00394 default: ; 00395 } 00396 stepNo = (stepNo + 1) % 3; 00397 } 00398 00399 void endShiftedSequence(TQCString::Iterator & t) { 00400 switch (stepNo) { 00401 case 1: // four outbits still to be written 00402 case 2: // two outbits still to be written 00403 *t++ = toBase64( outbits ); 00404 break; 00405 case 0: // nothing to do 00406 default: ; 00407 } 00408 outbits = 0; 00409 } 00410 00411 // depending on the stepNo, checks whether we can continue 00412 // an already ended shifted-sequence with char "u". 00413 // This is only possible if the topmost bits fit the 00414 // already written ones (which are all 0 between calls) 00415 bool continueOK( ushort u ) { 00416 return stepNo == 0 || 00417 ( stepNo == 1 && (u & 0xF000) == 0 ) || 00418 ( stepNo == 2 && (u & 0xC000) == 0 ); 00419 } 00420 00421 void processDoesntNeedEncoding(TQCString::Iterator & t, ushort ch) { 00422 // doesn't need encoding 00423 if (shifted) { 00424 endShiftedSequence(t); 00425 // add "lead-out" to dis-ambiguate following chars: 00426 if (isOfSet((char)ch,base64Set) || ch == '-' ) { 00427 *t++ = '-'; 00428 } 00429 } else if (mayContinueShiftedSequence) { 00430 // if mayContinue is set, this means the 00431 // shifted-sequence needs a lead-out. 00432 mayContinueShiftedSequence = FALSE; 00433 if (isOfSet(ch,base64Set) || ch == '-' ) { 00434 *t++ = '-'; 00435 } 00436 } 00437 *t++ = (uchar)ch; 00438 shifted = FALSE; 00439 stepNo = 0; 00440 } 00441 00442 public: 00443 TQCString fromUnicode(const TQString & uc, int & len_in_out) 00444 { 00445 // allocate place for worst case: 00446 // len/2 * (5+1) for an alternating sequence of e.g. "A\", 00447 // + 4 for a worst-case of another +ABC encoded char 00448 // + 1 for the trailing \0 00449 // 00450 int maxreslen = 3 * len_in_out + 5; 00451 TQCString result( maxreslen ); 00452 00453 #if 0 00454 // if (len_in_out == 1) { 00455 cout << "\nlen_in_out: " << len_in_out 00456 <<"; shifted: " << (shifted ? "true" : "false") 00457 << ";\n" << "mayContinue: " 00458 << (mayContinueShiftedSequence ? "true" : "false") 00459 << "; stepNo: " << stepNo << ";\n" 00460 << "outbits: " << outbits << endl; 00461 // } 00462 #endif 00463 00464 // source and destination cursor 00465 const TQChar * s = uc.unicode(); 00466 TQCString::Iterator t = result.data(); 00467 00468 if ( uc.isNull() ) { 00469 // return to ascii requested: 00470 if ( mayContinueShiftedSequence ) 00471 *t++ = '-'; 00472 } else { 00473 // normal operation: 00474 for (int i = 0 ; i < len_in_out ; 00475 i++/*, checkOutBuf(result,maxreslen,t,i,len_in_out,5)*/ ) { 00476 ushort ch = s[i].unicode(); 00477 00478 // 00479 // first, we check whether we might get around encoding: 00480 // 00481 if ( ch < 128 ) { 00482 // 00483 // ch is usAscii, so we have a chance that we don't 00484 // need to encode it. 00485 // 00486 if ( isOfSet((uchar)ch,dontNeedEncodingSet) ) { 00487 processDoesntNeedEncoding(t,ch); 00488 continue; 00489 } else if ( ch == '+' ) { 00490 // '+' is the shift escape character 00491 if (shifted || mayContinueShiftedSequence) { 00492 // if we are already in shifted mode, we just 00493 // encode the '+', too. Compare 00494 // 24bits ("-+-") + some from ending the shifted-sequence 00495 // with 21,33 bits 00496 addToShiftedSequence(t,ch); 00497 mayContinueShiftedSequence = FALSE; 00498 shifted = TRUE; 00499 } else { 00500 // shortcut encoding of '+': 00501 *t++ = '+'; 00502 *t++ = '-'; 00503 } 00504 continue; // done 00505 } // else fall through to encoding 00506 } 00507 // 00508 // need encoding 00509 // 00510 if (!shifted && (!mayContinueShiftedSequence || !continueOK(ch) ) ) { 00511 *t++ = '+'; 00512 stepNo = 0; 00513 } 00514 addToShiftedSequence(t,ch); 00515 shifted = TRUE; 00516 mayContinueShiftedSequence = FALSE; 00517 } 00518 00519 if ( shifted ) { 00520 endShiftedSequence(t); 00521 mayContinueShiftedSequence = TRUE; 00522 }; 00523 shifted = FALSE; 00524 } 00525 00526 *t = '\0'; 00527 len_in_out = t - result.data(); 00528 00529 #if 0 00530 cout << "len_in_out: " << len_in_out << "; " 00531 << "mayContinue: " << (mayContinueShiftedSequence ? "true" : "false") 00532 << "; stepNo: " << stepNo << endl; 00533 #endif 00534 00535 Q_ASSERT(len_in_out <= maxreslen-1); 00536 00537 return result; 00538 } // fromUnicode() 00539 00540 }; // class TQUtf7Encoder 00541 00542 TQTextEncoder* TQUtf7Codec::makeEncoder() const { 00543 return new TQUtf7Encoder( false, false ); 00544 } 00545 00546 TQTextEncoder* TQStrictUtf7Codec::makeEncoder() const { 00547 return new TQUtf7Encoder( true, false ); 00548 } 00549 00550 #endif // TQT_NO_TEXTCODEC