akregator/src/librss

tools_p.cpp
1/*
2 * tools_p.cpp
3 *
4 * Copyright (c) 2001, 2002, 2003 Frerich Raabe <raabe@kde.org>
5 *
6 * This program is distributed in the hope that it will be useful, but WITHOUT
7 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
8 * FOR A PARTICULAR PURPOSE. For licensing and distribution details, check the
9 * accompanying file 'COPYING'.
10 */
11#include "tools_p.h"
12
13#include <krfcdate.h>
14#include <tqdom.h>
15#include <kcharsets.h>
16#include <tqregexp.h>
17
18namespace RSS {
19
20time_t parseISO8601Date(const TQString &s)
21{
22 // do some sanity check: 26-12-2004T00:00+00:00 is parsed to epoch+1 in the KRFCDate, which is wrong. So let's check if the date begins with YYYY -fo
23 if (s.stripWhiteSpace().left(4).toInt() < 1000)
24 return 0; // error
25
26 // FIXME: imho this is done in KRFCDate::parseDateISO8601() automatically, so we could omit it? -fo
27 if (s.find('T') != -1)
28 return KRFCDate::parseDateISO8601(s);
29 else
30 return KRFCDate::parseDateISO8601(s + "T12:00:00");
31}
32
33TQString childNodesAsXML(const TQDomNode& parent)
34{
35 TQDomNodeList list = parent.childNodes();
36 TQString str;
37 TQTextStream ts( &str, IO_WriteOnly );
38 for (uint i = 0; i < list.count(); ++i)
39 ts << list.item(i);
40 return str.stripWhiteSpace();
41}
42
43static TQString plainTextToHtml(const TQString& plainText)
44{
45 TQString str(plainText);
46 str.replace("&", "&amp;");
47 str.replace("\"", "&quot;");
48 str.replace("<", "&lt;");
49 //str.replace(">", "&gt;");
50 str.replace("\n", "<br/>");
51 return str;
52}
53
// Content categories that mapTypeToFormat() assigns to an Atom content
// element and that extractAtomContent() dispatches on.
enum ContentFormat
{
    Text,
    HTML,
    XML,
    Binary
};
55
56static ContentFormat mapTypeToFormat(const TQString& modep, const TQString& typep, const TQString& src)
57{
58 TQString mode = modep.isNull() ? "escaped" : modep;
59 TQString type = typep;
60
61 //"If neither the type attribute nor the src attribute is provided,
62 //Atom Processors MUST behave as though the type attribute were
63 //present with a value of "text""
64 if (type.isNull() && src.isEmpty())
65 type = TQString::fromUtf8("text");
66
67 if (type == TQString::fromUtf8("html")
68 || type == TQString::fromUtf8("text/html"))
69 return HTML;
70
71 if (type == TQString::fromUtf8("text")
72 || (type.startsWith(TQString::fromUtf8("text/"), false)
73 && !type.startsWith(TQString::fromUtf8("text/xml"), false))
74 )
75 return Text;
76
77 TQStringList xmltypes;
78 xmltypes.append(TQString::fromUtf8("xhtml"));
79 // XML media types as defined in RFC3023:
80 xmltypes.append(TQString::fromUtf8("text/xml"));
81 xmltypes.append(TQString::fromUtf8("application/xml"));
82 xmltypes.append(TQString::fromUtf8("text/xml-external-parsed-entity"));
83 xmltypes.append(TQString::fromUtf8("application/xml-external-parsed-entity"));
84 xmltypes.append(TQString::fromUtf8("application/xml-dtd"));
85
86
87 if (xmltypes.contains(type)
88 || type.endsWith(TQString::fromUtf8("+xml"), false)
89 || type.endsWith(TQString::fromUtf8("/xml"), false))
90 return XML;
91
92 return Binary;
93}
94
95static TQString extractAtomContent(const TQDomElement& e)
96{
97 ContentFormat format = mapTypeToFormat(e.attribute("mode"),
98 e.attribute("type"),
99 e.attribute("src"));
100
101 switch (format)
102 {
103 case HTML:
104 {
105 const bool hasPre = e.text().contains( "<pre>", false ) || e.text().contains( "<pre ", false );
106 return KCharsets::resolveEntities( hasPre ? e.text() : e.text().simplifyWhiteSpace() );
107 }
108 case Text:
109 return plainTextToHtml(e.text().stripWhiteSpace());
110 case XML:
111 return childNodesAsXML(e).simplifyWhiteSpace();
112 case Binary:
113 default:
114 return TQString();
115 }
116
117 return TQString();
118}
119
120TQDomElement extractElementNS(const TQDomNode &parent, const TQString &nameSpace, const TQString &localName)
121{
122 TQDomElement element;
123
124 if (parent.isNull())
125 {
126 return element;
127 }
128
129 TQDomNodeList children = parent.childNodes();
130 for (size_t i = 0; i < children.count(); ++i)
131 {
132 TQDomNode node = children.item(i);
133 if (node.isElement() && node.namespaceURI() == nameSpace && node.localName() == localName)
134 {
135 element = node.toElement();
136 break;
137 }
138 }
139
140 return element;
141}
142
143TQString extractElementTextNS(const TQDomNode &parent, const TQString &namespaceURI, const TQString &localName, bool isInlined)
144{
145 TQDomElement element = extractElementNS(parent, namespaceURI, localName);
146
147 if (element.isNull())
148 {
149 return TQString::null;
150 }
151
152 TQString result = element.text().stripWhiteSpace();
153 if (localName == "content")
154 {
155 // Atom content
156 result = extractAtomContent(element);
157 }
158 else
159 {
160 // Check for HTML; not necessary for atom:content
161 // Taken from extractNode below
162 bool hasPre = result.contains("<pre>", false) || result.contains("<pre ", false);
163 bool hasHtml = hasPre || result.contains("<");
164 if (!isInlined && !hasHtml)
165 result = result = result.replace(TQChar('\n'), "<br />");
166 if (!hasPre)
167 result = result.simplifyWhiteSpace();
168 }
169
170 return result.isEmpty() ? TQString::null : result;
171}
172
173TQString extractNode(const TQDomNode &parent, const TQString &elemName, bool isInlined)
174{
175 TQDomNode node = parent.namedItem(elemName);
176 if (node.isNull())
177 return TQString();
178
179 TQDomElement e = node.toElement();
180 TQString result = e.text().stripWhiteSpace(); // let's assume plain text
181
182 if (elemName == "content") // we have Atom here
183 {
184 result = extractAtomContent(e);
185 }
186 else // check for HTML; not necessary for Atom:content
187 {
188 bool hasPre = result.contains("<pre>", false) || result.contains("<pre ", false);
189 bool hasHtml = hasPre || result.contains("<"); // FIXME: test if we have html, should be more clever -> regexp
190 if(!isInlined && !hasHtml) // perform nl2br if not a inline elt and it has no html elts
191 result = result = result.replace(TQChar('\n'), "<br />");
192 if(!hasPre) // strip white spaces if no <pre>
193 result = result.simplifyWhiteSpace();
194 }
195
196 return result.isEmpty() ? TQString() : result;
197}
198
199TQString extractTitle(const TQDomNode & parent)
200{
201 TQDomNode node = parent.namedItem(TQString::fromLatin1("title"));
202 if (node.isNull())
203 return TQString();
204
205 TQString result = node.toElement().text();
206
207 result = KCharsets::resolveEntities(KCharsets::resolveEntities(result).replace(TQRegExp("<[^>]*>"), "").remove("\\"));
208 result = result.simplifyWhiteSpace();
209
210 if (result.isEmpty())
211 return TQString();
212
213 return result;
214}
215
216static void authorFromString(const TQString& strp, TQString& name, TQString& email)
217{
218 TQString str = strp.stripWhiteSpace();
219 if (str.isEmpty())
220 return;
221
222 // look for something looking like a mail address ( "foo@bar.com",
223 // "<foo@bar.com>") and extract it
224
225 TQRegExp remail("<?([^@\\s<]+@[^>\\s]+)>?"); // FIXME: user "proper" regexp,
226 // search kmail source for it
227
228 int pos = remail.search(str);
229 if (pos != -1)
230 {
231 TQString all = remail.cap(0);
232 email = remail.cap(1);
233 str.replace(all, ""); // remove mail address
234 }
235
236 // simplify the rest and use it as name
237
238 name = str.simplifyWhiteSpace();
239
240 // after removing the email, str might have
241 // the format "(Foo M. Bar)". We cut off
242 // parentheses if there are any. However, if
243 // str is of the format "Foo M. Bar (President)",
244 // we should not cut anything.
245
246 TQRegExp rename("^\\(([^\\)]*)\\)");
247
248 pos = rename.search(name);
249
250 if (pos != -1)
251 {
252 name = rename.cap(1);
253 }
254
255 name = name.isEmpty() ? TQString() : name;
256 email = email.isEmpty() ? TQString() : email;
257}
258
259TQString parseItemAuthor(const TQDomElement& element, Format format, Version version)
260{
261 TQString name;
262 TQString email;
263
264 TQDomElement dcCreator = extractElementNS(element, DublinCoreNamespace, "creator");
265
266 if (!dcCreator.isNull())
267 authorFromString(dcCreator.text(), name, email);
268 else if (format == AtomFeed)
269 {
270 TQDomElement atomAuthor = element.namedItem("author").toElement();
271 if (atomAuthor.isNull())
272 atomAuthor = extractElementNS(element, AtomNamespace, "author");
273 if (!atomAuthor.isNull())
274 {
275 TQDomElement atomName = atomAuthor.namedItem("name").toElement();
276 if (atomName.isNull())
277 atomName = extractElementNS(atomAuthor, AtomNamespace, "name");
278 name = atomName.text().stripWhiteSpace();
279
280 TQDomElement atomEmail = atomAuthor.namedItem("email").toElement();
281 if (atomEmail.isNull())
282 atomEmail = extractElementNS(atomAuthor, AtomNamespace, "email");
283 email = atomEmail.text().stripWhiteSpace();
284 }
285 }
286 else if (format == RSSFeed)
287 {
288 authorFromString(element.namedItem("author").toElement().text(), name, email);
289 }
290
291 if (name.isNull())
292 name = email;
293
294 if (!email.isNull())
295 return TQString("<a href=\"mailto:%1\">%2</a>").arg(email).arg(name);
296 else
297 return name;
298}
299
300} // namespace RSS