akregator/src/librss

tools_p.cpp
1/*
2 * tools_p.cpp
3 *
4 * Copyright (c) 2001, 2002, 2003 Frerich Raabe <raabe@kde.org>
5 *
6 * This program is distributed in the hope that it will be useful, but WITHOUT
7 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
8 * FOR A PARTICULAR PURPOSE. For licensing and distribution details, check the
9 * accompanying file 'COPYING'.
10 */
11#include "tools_p.h"
12
13#include <krfcdate.h>
14#include <tqdom.h>
15#include <kcharsets.h>
16#include <tqregexp.h>
17
18namespace RSS {
19
20time_t parseISO8601Date(const TQString &s)
21{
22 // do some sanity check: 26-12-2004T00:00+00:00 is parsed to epoch+1 in the KRFCDate, which is wrong. So let's check if the date begins with YYYY -fo
23 if (s.stripWhiteSpace().left(4).toInt() < 1000)
24 return 0; // error
25
26 // FIXME: imho this is done in KRFCDate::parseDateISO8601() automatically, so we could omit it? -fo
27 if (s.find('T') != -1)
28 return KRFCDate::parseDateISO8601(s);
29 else
30 return KRFCDate::parseDateISO8601(s + "T12:00:00");
31}
32
33TQString childNodesAsXML(const TQDomNode& parent)
34{
35 TQDomNodeList list = parent.childNodes();
36 TQString str;
37 TQTextStream ts( &str, IO_WriteOnly );
38 for (uint i = 0; i < list.count(); ++i)
39 ts << list.item(i);
40 return str.stripWhiteSpace();
41}
42
43static TQString plainTextToHtml(const TQString& plainText)
44{
45 TQString str(plainText);
46 str.replace("&", "&amp;");
47 str.replace("\"", "&quot;");
48 str.replace("<", "&lt;");
49 //str.replace(">", "&gt;");
50 str.replace("\n", "<br/>");
51 return str;
52}
53
/**
 * Classification of an Atom content element as produced by
 * mapTypeToFormat() and consumed by extractAtomContent().
 */
enum ContentFormat
{
    Text,   // plain text, escaped into HTML before use
    HTML,   // HTML markup whose entities get resolved
    XML,    // XML content, serialised from the child nodes
    Binary  // anything else; yields no content
};
55
56static ContentFormat mapTypeToFormat(const TQString& modep, const TQString& typep, const TQString& src)
57{
58 TQString mode = modep.isNull() ? "escaped" : modep;
59 TQString type = typep;
60
61 //"If neither the type attribute nor the src attribute is provided,
62 //Atom Processors MUST behave as though the type attribute were
63 //present with a value of "text""
64 if (type.isNull() && src.isEmpty())
65 type = TQString::fromUtf8("text");
66
67 if (type == TQString::fromUtf8("html")
68 || type == TQString::fromUtf8("text/html"))
69 return HTML;
70
71 if (type == TQString::fromUtf8("text")
72 || (type.startsWith(TQString::fromUtf8("text/"), false)
73 && !type.startsWith(TQString::fromUtf8("text/xml"), false))
74 )
75 return Text;
76
77 TQStringList xmltypes;
78 xmltypes.append(TQString::fromUtf8("xhtml"));
79 // XML media types as defined in RFC3023:
80 xmltypes.append(TQString::fromUtf8("text/xml"));
81 xmltypes.append(TQString::fromUtf8("application/xml"));
82 xmltypes.append(TQString::fromUtf8("text/xml-external-parsed-entity"));
83 xmltypes.append(TQString::fromUtf8("application/xml-external-parsed-entity"));
84 xmltypes.append(TQString::fromUtf8("application/xml-dtd"));
85
86
87 if (xmltypes.contains(type)
88 || type.endsWith(TQString::fromUtf8("+xml"), false)
89 || type.endsWith(TQString::fromUtf8("/xml"), false))
90 return XML;
91
92 return Binary;
93}
94
95static TQString extractAtomContent(const TQDomElement& e)
96{
97 ContentFormat format = mapTypeToFormat(e.attribute("mode"),
98 e.attribute("type"),
99 e.attribute("src"));
100
101 switch (format)
102 {
103 case HTML:
104 {
105 const bool hasPre = e.text().contains( "<pre>", false ) || e.text().contains( "<pre ", false );
106 return KCharsets::resolveEntities( hasPre ? e.text() : e.text().simplifyWhiteSpace() );
107 }
108 case Text:
109 return plainTextToHtml(e.text().stripWhiteSpace());
110 case XML:
111 return childNodesAsXML(e).simplifyWhiteSpace();
112 case Binary:
113 default:
114 return TQString();
115 }
116
117 return TQString();
118}
119
120TQString extractNode(const TQDomNode &parent, const TQString &elemName, bool isInlined)
121{
122 TQDomNode node = parent.namedItem(elemName);
123 if (node.isNull())
124 return TQString();
125
126 TQDomElement e = node.toElement();
127 TQString result = e.text().stripWhiteSpace(); // let's assume plain text
128
129 if (elemName == "content") // we have Atom here
130 {
131 result = extractAtomContent(e);
132 }
133 else // check for HTML; not necessary for Atom:content
134 {
135 bool hasPre = result.contains("<pre>", false) || result.contains("<pre ", false);
136 bool hasHtml = hasPre || result.contains("<"); // FIXME: test if we have html, should be more clever -> regexp
137 if(!isInlined && !hasHtml) // perform nl2br if not a inline elt and it has no html elts
138 result = result = result.replace(TQChar('\n'), "<br />");
139 if(!hasPre) // strip white spaces if no <pre>
140 result = result.simplifyWhiteSpace();
141 }
142
143 return result.isEmpty() ? TQString() : result;
144}
145
146TQString extractTitle(const TQDomNode & parent)
147{
148 TQDomNode node = parent.namedItem(TQString::fromLatin1("title"));
149 if (node.isNull())
150 return TQString();
151
152 TQString result = node.toElement().text();
153
154 result = KCharsets::resolveEntities(KCharsets::resolveEntities(result).replace(TQRegExp("<[^>]*>"), "").remove("\\"));
155 result = result.simplifyWhiteSpace();
156
157 if (result.isEmpty())
158 return TQString();
159
160 return result;
161}
162
163static void authorFromString(const TQString& strp, TQString& name, TQString& email)
164{
165 TQString str = strp.stripWhiteSpace();
166 if (str.isEmpty())
167 return;
168
169 // look for something looking like a mail address ( "foo@bar.com",
170 // "<foo@bar.com>") and extract it
171
172 TQRegExp remail("<?([^@\\s<]+@[^>\\s]+)>?"); // FIXME: user "proper" regexp,
173 // search kmail source for it
174
175 int pos = remail.search(str);
176 if (pos != -1)
177 {
178 TQString all = remail.cap(0);
179 email = remail.cap(1);
180 str.replace(all, ""); // remove mail address
181 }
182
183 // simplify the rest and use it as name
184
185 name = str.simplifyWhiteSpace();
186
187 // after removing the email, str might have
188 // the format "(Foo M. Bar)". We cut off
189 // parentheses if there are any. However, if
190 // str is of the format "Foo M. Bar (President)",
191 // we should not cut anything.
192
193 TQRegExp rename("^\\(([^\\)]*)\\)");
194
195 pos = rename.search(name);
196
197 if (pos != -1)
198 {
199 name = rename.cap(1);
200 }
201
202 name = name.isEmpty() ? TQString() : name;
203 email = email.isEmpty() ? TQString() : email;
204}
205
206TQString parseItemAuthor(const TQDomElement& element, Format format, Version version)
207{
208 TQString name;
209 TQString email;
210
211 TQDomElement dcCreator = element.namedItem("dc:creator").toElement();
212
213 if (!dcCreator.isNull())
214 authorFromString(dcCreator.text(), name, email);
215 else if (format == AtomFeed)
216 {
217 TQDomElement atomAuthor = element.namedItem("author").toElement();
218 if (atomAuthor.isNull())
219 atomAuthor = element.namedItem("atom:author").toElement();
220 if (!atomAuthor.isNull())
221 {
222 TQDomElement atomName = atomAuthor.namedItem("name").toElement();
223 if (atomName.isNull())
224 atomName = atomAuthor.namedItem("atom:name").toElement();
225 name = atomName.text().stripWhiteSpace();
226
227 TQDomElement atomEmail = atomAuthor.namedItem("email").toElement();
228 if (atomEmail.isNull())
229 atomEmail = atomAuthor.namedItem("atom:email").toElement();
230 email = atomEmail.text().stripWhiteSpace();
231 }
232 }
233 else if (format == RSSFeed)
234 {
235 authorFromString(element.namedItem("author").toElement().text(), name, email);
236 }
237
238 if (name.isNull())
239 name = email;
240
241 if (!email.isNull())
242 return TQString("<a href=\"mailto:%1\">%2</a>").arg(email).arg(name);
243 else
244 return name;
245}
246
247} // namespace RSS