tools_p.cpp
00001 /* 00002 * tools_p.cpp 00003 * 00004 * Copyright (c) 2001, 2002, 2003 Frerich Raabe <raabe@kde.org> 00005 * 00006 * This program is distributed in the hope that it will be useful, but WITHOUT 00007 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 00008 * FOR A PARTICULAR PURPOSE. For licensing and distribution details, check the 00009 * accompanying file 'COPYING'. 00010 */ 00011 #include "tools_p.h" 00012 00013 #include <krfcdate.h> 00014 #include <tqdom.h> 00015 #include <kcharsets.h> 00016 #include <tqregexp.h> 00017 00018 namespace RSS { 00019 00020 time_t parseISO8601Date(const TQString &s) 00021 { 00022 // do some sanity check: 26-12-2004T00:00+00:00 is parsed to epoch+1 in the KRFCDate, which is wrong. So let's check if the date begins with YYYY -fo 00023 if (s.stripWhiteSpace().left(4).toInt() < 1000) 00024 return 0; // error 00025 00026 // FIXME: imho this is done in KRFCDate::parseDateISO8601() automatically, so we could omit it? -fo 00027 if (s.find('T') != -1) 00028 return KRFCDate::parseDateISO8601(s); 00029 else 00030 return KRFCDate::parseDateISO8601(s + "T12:00:00"); 00031 } 00032 00033 TQString childNodesAsXML(const TQDomNode& parent) 00034 { 00035 TQDomNodeList list = parent.childNodes(); 00036 TQString str; 00037 TQTextStream ts( &str, IO_WriteOnly ); 00038 for (uint i = 0; i < list.count(); ++i) 00039 ts << list.item(i); 00040 return str.stripWhiteSpace(); 00041 } 00042 00043 static TQString plainTextToHtml(const TQString& plainText) 00044 { 00045 TQString str(plainText); 00046 str.replace("&", "&"); 00047 str.replace("\"", """); 00048 str.replace("<", "<"); 00049 //str.replace(">", ">"); 00050 str.replace("\n", "<br/>"); 00051 return str; 00052 } 00053 00054 enum ContentFormat { Text, HTML, XML, Binary }; 00055 00056 static ContentFormat mapTypeToFormat(const TQString& modep, const TQString& typep, const TQString& src) 00057 { 00058 TQString mode = modep.isNull() ? "escaped" : modep; 00059 TQString type = typep; 00060 00061 //"If neither the type attribute nor the src attribute is provided, 00062 //Atom Processors MUST behave as though the type attribute were 00063 //present with a value of "text"" 00064 if (type.isNull() && src.isEmpty()) 00065 type = TQString::fromUtf8("text"); 00066 00067 if (type == TQString::fromUtf8("html") 00068 || type == TQString::fromUtf8("text/html")) 00069 return HTML; 00070 00071 if (type == TQString::fromUtf8("text") 00072 || (type.startsWith(TQString::fromUtf8("text/"), false) 00073 && !type.startsWith(TQString::fromUtf8("text/xml"), false)) 00074 ) 00075 return Text; 00076 00077 TQStringList xmltypes; 00078 xmltypes.append(TQString::fromUtf8("xhtml")); 00079 // XML media types as defined in RFC3023: 00080 xmltypes.append(TQString::fromUtf8("text/xml")); 00081 xmltypes.append(TQString::fromUtf8("application/xml")); 00082 xmltypes.append(TQString::fromUtf8("text/xml-external-parsed-entity")); 00083 xmltypes.append(TQString::fromUtf8("application/xml-external-parsed-entity")); 00084 xmltypes.append(TQString::fromUtf8("application/xml-dtd")); 00085 00086 00087 if (xmltypes.contains(type) 00088 || type.endsWith(TQString::fromUtf8("+xml"), false) 00089 || type.endsWith(TQString::fromUtf8("/xml"), false)) 00090 return XML; 00091 00092 return Binary; 00093 } 00094 00095 static TQString extractAtomContent(const TQDomElement& e) 00096 { 00097 ContentFormat format = mapTypeToFormat(e.attribute("mode"), 00098 e.attribute("type"), 00099 e.attribute("src")); 00100 00101 switch (format) 00102 { 00103 case HTML: 00104 { 00105 const bool hasPre = e.text().contains( "<pre>", false ) || e.text().contains( "<pre ", false ); 00106 return KCharsets::resolveEntities( hasPre ? e.text() : e.text().simplifyWhiteSpace() ); 00107 } 00108 case Text: 00109 return plainTextToHtml(e.text().stripWhiteSpace()); 00110 case XML: 00111 return childNodesAsXML(e).simplifyWhiteSpace(); 00112 case Binary: 00113 default: 00114 return TQString(); 00115 } 00116 00117 return TQString(); 00118 } 00119 00120 TQString extractNode(const TQDomNode &parent, const TQString &elemName, bool isInlined) 00121 { 00122 TQDomNode node = parent.namedItem(elemName); 00123 if (node.isNull()) 00124 return TQString(); 00125 00126 TQDomElement e = node.toElement(); 00127 TQString result = e.text().stripWhiteSpace(); // let's assume plain text 00128 00129 if (elemName == "content") // we have Atom here 00130 { 00131 result = extractAtomContent(e); 00132 } 00133 else // check for HTML; not necessary for Atom:content 00134 { 00135 bool hasPre = result.contains("<pre>", false) || result.contains("<pre ", false); 00136 bool hasHtml = hasPre || result.contains("<"); // FIXME: test if we have html, should be more clever -> regexp 00137 if(!isInlined && !hasHtml) // perform nl2br if not a inline elt and it has no html elts 00138 result = result = result.replace(TQChar('\n'), "<br />"); 00139 if(!hasPre) // strip white spaces if no <pre> 00140 result = result.simplifyWhiteSpace(); 00141 } 00142 00143 return result.isEmpty() ? TQString() : result; 00144 } 00145 00146 TQString extractTitle(const TQDomNode & parent) 00147 { 00148 TQDomNode node = parent.namedItem(TQString::fromLatin1("title")); 00149 if (node.isNull()) 00150 return TQString(); 00151 00152 TQString result = node.toElement().text(); 00153 00154 result = KCharsets::resolveEntities(KCharsets::resolveEntities(result).replace(TQRegExp("<[^>]*>"), "").remove("\\")); 00155 result = result.simplifyWhiteSpace(); 00156 00157 if (result.isEmpty()) 00158 return TQString(); 00159 00160 return result; 00161 } 00162 00163 static void authorFromString(const TQString& strp, TQString& name, TQString& email) 00164 { 00165 TQString str = strp.stripWhiteSpace(); 00166 if (str.isEmpty()) 00167 return; 00168 00169 // look for something looking like a mail address ( "foo@bar.com", 00170 // "<foo@bar.com>") and extract it 00171 00172 TQRegExp remail("<?([^@\\s<]+@[^>\\s]+)>?"); // FIXME: user "proper" regexp, 00173 // search kmail source for it 00174 00175 int pos = remail.search(str); 00176 if (pos != -1) 00177 { 00178 TQString all = remail.cap(0); 00179 email = remail.cap(1); 00180 str.replace(all, ""); // remove mail address 00181 } 00182 00183 // simplify the rest and use it as name 00184 00185 name = str.simplifyWhiteSpace(); 00186 00187 // after removing the email, str might have 00188 // the format "(Foo M. Bar)". We cut off 00189 // parentheses if there are any. However, if 00190 // str is of the format "Foo M. Bar (President)", 00191 // we should not cut anything. 00192 00193 TQRegExp rename("^\\(([^\\)]*)\\)"); 00194 00195 pos = rename.search(name); 00196 00197 if (pos != -1) 00198 { 00199 name = rename.cap(1); 00200 } 00201 00202 name = name.isEmpty() ? TQString() : name; 00203 email = email.isEmpty() ? TQString() : email; 00204 } 00205 00206 TQString parseItemAuthor(const TQDomElement& element, Format format, Version version) 00207 { 00208 TQString name; 00209 TQString email; 00210 00211 TQDomElement dcCreator = element.namedItem("dc:creator").toElement(); 00212 00213 if (!dcCreator.isNull()) 00214 authorFromString(dcCreator.text(), name, email); 00215 else if (format == AtomFeed) 00216 { 00217 TQDomElement atomAuthor = element.namedItem("author").toElement(); 00218 if (atomAuthor.isNull()) 00219 atomAuthor = element.namedItem("atom:author").toElement(); 00220 if (!atomAuthor.isNull()) 00221 { 00222 TQDomElement atomName = atomAuthor.namedItem("name").toElement(); 00223 if (atomName.isNull()) 00224 atomName = atomAuthor.namedItem("atom:name").toElement(); 00225 name = atomName.text().stripWhiteSpace(); 00226 00227 TQDomElement atomEmail = atomAuthor.namedItem("email").toElement(); 00228 if (atomEmail.isNull()) 00229 atomEmail = atomAuthor.namedItem("atom:email").toElement(); 00230 email = atomEmail.text().stripWhiteSpace(); 00231 } 00232 } 00233 else if (format == RSSFeed) 00234 { 00235 authorFromString(element.namedItem("author").toElement().text(), name, email); 00236 } 00237 00238 if (name.isNull()) 00239 name = email; 00240 00241 if (!email.isNull()) 00242 return TQString("<a href=\"mailto:%1\">%2</a>").arg(email).arg(name); 00243 else 00244 return name; 00245 } 00246 00247 } // namespace RSS 00248 00249 // vim:noet:ts=4