kmime_charfreq.cpp
00001 /* 00002 kmime_charfreq.cpp 00003 00004 KMime, the KDE internet mail/usenet news message library. 00005 Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org> 00006 00007 This program is free software; you can redistribute it and/or modify 00008 it under the terms of the GNU General Public License as published by 00009 the Free Software Foundation; version 2 of the License. 00010 You should have received a copy of the GNU General Public License 00011 along with this program; if not, write to the Free Software Foundation, 00012 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, US 00013 */ 00014 00015 #include "kmime_charfreq.h" 00016 00017 namespace KMime { 00018 00019 CharFreq::CharFreq( const TQByteArray & buf ) 00020 : NUL(0), 00021 CTL(0), 00022 CR(0), LF(0), 00023 CRLF(0), 00024 printable(0), 00025 eightBit(0), 00026 total(0), 00027 lineMin(0xffffffff), 00028 lineMax(0), 00029 mTrailingWS(false), 00030 mLeadingFrom(false) 00031 { 00032 if ( !buf.isEmpty() ) 00033 count( buf.data(), buf.size() ); 00034 } 00035 00036 CharFreq::CharFreq( const char * buf, size_t len ) 00037 : NUL(0), 00038 CTL(0), 00039 CR(0), LF(0), 00040 CRLF(0), 00041 printable(0), 00042 eightBit(0), 00043 total(0), 00044 lineMin(0xffffffff), 00045 lineMax(0), 00046 mTrailingWS(false), 00047 mLeadingFrom(false) 00048 { 00049 if ( buf && len > 0 ) 00050 count( buf, len ); 00051 } 00052 00053 static inline bool isWS( char ch ) { return ( ch == '\t' || ch == ' ' ); } 00054 00055 void CharFreq::count( const char * it, size_t len ) { 00056 00057 const char * end = it + len; 00058 uint currentLineLength = 0; 00059 // initialize the prevChar with LF so that From_ detection works w/o 00060 // special-casing: 00061 char prevChar = '\n'; 00062 char prevPrevChar = 0; 00063 00064 for ( ; it != end ; ++it ) { 00065 ++currentLineLength; 00066 switch ( *it ) { 00067 case '\0': ++NUL; break; 00068 case '\r': ++CR; break; 00069 case '\n': ++LF; 00070 if ( prevChar == '\r' ) { --currentLineLength; ++CRLF; } 00071 if ( currentLineLength >= lineMax ) lineMax = currentLineLength-1; 00072 if ( currentLineLength <= lineMin ) lineMin = currentLineLength-1; 00073 if ( !mTrailingWS ) 00074 if ( isWS( prevChar ) || ( prevChar == '\r' && isWS( prevPrevChar ) ) ) 00075 mTrailingWS = true; 00076 currentLineLength = 0; 00077 break; 00078 case 'F': // check for lines starting with From_ if not found already: 00079 if ( !mLeadingFrom ) 00080 if ( prevChar == '\n' && end - it >= 5 && !tqstrncmp( "From ", it, 5 ) ) 00081 mLeadingFrom = true; 00082 ++printable; 00083 break; 00084 default: 00085 { 00086 uchar c = *it; 00087 if ( (c == '\t') || ((c >= ' ') && (c <= '~')) ) 00088 ++printable; 00089 else if ( (c == 127) || (c < ' ') ) 00090 ++CTL; 00091 else 00092 ++eightBit; 00093 } 00094 } 00095 prevPrevChar = prevChar; 00096 prevChar = *it; 00097 } 00098 00099 // consider the length of the last line 00100 if ( currentLineLength >= lineMax ) lineMax = currentLineLength; 00101 if ( currentLineLength <= lineMin ) lineMin = currentLineLength; 00102 00103 // check whether the last character is tab or space 00104 if ( isWS( prevChar ) ) 00105 mTrailingWS = true; 00106 00107 total = len; 00108 } 00109 00110 bool CharFreq::isEightBitData() const { 00111 return type() == EightBitData; 00112 } 00113 00114 bool CharFreq::isEightBitText() const { 00115 return type() == EightBitText; 00116 } 00117 00118 bool CharFreq::isSevenBitData() const { 00119 return type() == SevenBitData; 00120 } 00121 00122 bool CharFreq::isSevenBitText() const { 00123 return type() == SevenBitText; 00124 } 00125 00126 bool CharFreq::hasTrailingWhitespace() const { 00127 return mTrailingWS; 00128 } 00129 00130 bool CharFreq::hasLeadingFrom() const { 00131 return mLeadingFrom; 00132 } 00133 00134 CharFreq::Type CharFreq::type() const { 00135 #if 0 00136 tqDebug( "Total: %d; NUL: %d; CTL: %d;\n" 00137 "CR: %d; LF: %d; CRLF: %d;\n" 00138 "lineMin: %d; lineMax: %d;\n" 00139 "printable: %d; eightBit: %d;\n" 00140 "trailing whitespace: %s;\n" 00141 "leading 'From ': %s;\n", 00142 total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax, 00143 printable, eightBit, 00144 mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no" ); 00145 #endif 00146 if ( NUL ) // must be binary 00147 return Binary; 00148 00149 // doesn't contain NUL's: 00150 if ( eightBit ) { 00151 if ( lineMax > 988 ) return EightBitData; // not allowed in 8bit 00152 if ( CR != CRLF || controlCodesRatio() > 0.2 ) return EightBitData; 00153 return EightBitText; 00154 } 00155 00156 // doesn't contain NUL's, nor 8bit chars: 00157 if ( lineMax > 988 ) return SevenBitData; 00158 if ( CR != CRLF || controlCodesRatio() > 0.2 ) return SevenBitData; 00159 00160 // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars: 00161 return SevenBitText; 00162 } 00163 00164 float CharFreq::printableRatio() const { 00165 if ( total ) return float(printable) / float(total); 00166 else return 0; 00167 } 00168 00169 float CharFreq::controlCodesRatio() const { 00170 if ( total ) return float(CTL) / float(total); 00171 else return 0; 00172 } 00173 00174 } // namespace KMime 00175 00176