akregator/src/librss

tools_p.cpp
1 /*
2  * tools_p.cpp
3  *
4  * Copyright (c) 2001, 2002, 2003 Frerich Raabe <raabe@kde.org>
5  *
6  * This program is distributed in the hope that it will be useful, but WITHOUT
7  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
8  * FOR A PARTICULAR PURPOSE. For licensing and distribution details, check the
9  * accompanying file 'COPYING'.
10  */
11 #include "tools_p.h"
12 
13 #include <krfcdate.h>
14 #include <tqdom.h>
15 #include <kcharsets.h>
16 #include <tqregexp.h>
17 
18 namespace RSS {
19 
20 time_t parseISO8601Date(const TQString &s)
21 {
22  // do some sanity check: 26-12-2004T00:00+00:00 is parsed to epoch+1 in the KRFCDate, which is wrong. So let's check if the date begins with YYYY -fo
23  if (s.stripWhiteSpace().left(4).toInt() < 1000)
24  return 0; // error
25 
26  // FIXME: imho this is done in KRFCDate::parseDateISO8601() automatically, so we could omit it? -fo
27  if (s.find('T') != -1)
28  return KRFCDate::parseDateISO8601(s);
29  else
30  return KRFCDate::parseDateISO8601(s + "T12:00:00");
31 }
32 
33 TQString childNodesAsXML(const TQDomNode& parent)
34 {
35  TQDomNodeList list = parent.childNodes();
36  TQString str;
37  TQTextStream ts( &str, IO_WriteOnly );
38  for (uint i = 0; i < list.count(); ++i)
39  ts << list.item(i);
40  return str.stripWhiteSpace();
41 }
42 
43 static TQString plainTextToHtml(const TQString& plainText)
44 {
45  TQString str(plainText);
46  str.replace("&", "&amp;");
47  str.replace("\"", "&quot;");
48  str.replace("<", "&lt;");
49  //str.replace(">", "&gt;");
50  str.replace("\n", "<br/>");
51  return str;
52 }
53 
/** How the payload of an Atom content element is to be interpreted. */
enum ContentFormat
{
    Text,   // plain text
    HTML,   // escaped HTML markup
    XML,    // inline XML child nodes
    Binary  // anything else
};
55 
56 static ContentFormat mapTypeToFormat(const TQString& modep, const TQString& typep, const TQString& src)
57 {
58  TQString mode = modep.isNull() ? "escaped" : modep;
59  TQString type = typep;
60 
61  //"If neither the type attribute nor the src attribute is provided,
62  //Atom Processors MUST behave as though the type attribute were
63  //present with a value of "text""
64  if (type.isNull() && src.isEmpty())
65  type = TQString::fromUtf8("text");
66 
67  if (type == TQString::fromUtf8("html")
68  || type == TQString::fromUtf8("text/html"))
69  return HTML;
70 
71  if (type == TQString::fromUtf8("text")
72  || (type.startsWith(TQString::fromUtf8("text/"), false)
73  && !type.startsWith(TQString::fromUtf8("text/xml"), false))
74  )
75  return Text;
76 
77  TQStringList xmltypes;
78  xmltypes.append(TQString::fromUtf8("xhtml"));
79  // XML media types as defined in RFC3023:
80  xmltypes.append(TQString::fromUtf8("text/xml"));
81  xmltypes.append(TQString::fromUtf8("application/xml"));
82  xmltypes.append(TQString::fromUtf8("text/xml-external-parsed-entity"));
83  xmltypes.append(TQString::fromUtf8("application/xml-external-parsed-entity"));
84  xmltypes.append(TQString::fromUtf8("application/xml-dtd"));
85 
86 
87  if (xmltypes.contains(type)
88  || type.endsWith(TQString::fromUtf8("+xml"), false)
89  || type.endsWith(TQString::fromUtf8("/xml"), false))
90  return XML;
91 
92  return Binary;
93 }
94 
95 static TQString extractAtomContent(const TQDomElement& e)
96 {
97  ContentFormat format = mapTypeToFormat(e.attribute("mode"),
98  e.attribute("type"),
99  e.attribute("src"));
100 
101  switch (format)
102  {
103  case HTML:
104  {
105  const bool hasPre = e.text().contains( "<pre>", false ) || e.text().contains( "<pre ", false );
106  return KCharsets::resolveEntities( hasPre ? e.text() : e.text().simplifyWhiteSpace() );
107  }
108  case Text:
109  return plainTextToHtml(e.text().stripWhiteSpace());
110  case XML:
111  return childNodesAsXML(e).simplifyWhiteSpace();
112  case Binary:
113  default:
114  return TQString();
115  }
116 
117  return TQString();
118 }
119 
120 TQString extractNode(const TQDomNode &parent, const TQString &elemName, bool isInlined)
121 {
122  TQDomNode node = parent.namedItem(elemName);
123  if (node.isNull())
124  return TQString();
125 
126  TQDomElement e = node.toElement();
127  TQString result = e.text().stripWhiteSpace(); // let's assume plain text
128 
129  if (elemName == "content") // we have Atom here
130  {
131  result = extractAtomContent(e);
132  }
133  else // check for HTML; not necessary for Atom:content
134  {
135  bool hasPre = result.contains("<pre>", false) || result.contains("<pre ", false);
136  bool hasHtml = hasPre || result.contains("<"); // FIXME: test if we have html, should be more clever -> regexp
137  if(!isInlined && !hasHtml) // perform nl2br if not a inline elt and it has no html elts
138  result = result = result.replace(TQChar('\n'), "<br />");
139  if(!hasPre) // strip white spaces if no <pre>
140  result = result.simplifyWhiteSpace();
141  }
142 
143  return result.isEmpty() ? TQString() : result;
144 }
145 
146 TQString extractTitle(const TQDomNode & parent)
147 {
148  TQDomNode node = parent.namedItem(TQString::fromLatin1("title"));
149  if (node.isNull())
150  return TQString();
151 
152  TQString result = node.toElement().text();
153 
154  result = KCharsets::resolveEntities(KCharsets::resolveEntities(result).replace(TQRegExp("<[^>]*>"), "").remove("\\"));
155  result = result.simplifyWhiteSpace();
156 
157  if (result.isEmpty())
158  return TQString();
159 
160  return result;
161 }
162 
163 static void authorFromString(const TQString& strp, TQString& name, TQString& email)
164 {
165  TQString str = strp.stripWhiteSpace();
166  if (str.isEmpty())
167  return;
168 
169  // look for something looking like a mail address ( "foo@bar.com",
170  // "<foo@bar.com>") and extract it
171 
172  TQRegExp remail("<?([^@\\s<]+@[^>\\s]+)>?"); // FIXME: user "proper" regexp,
173  // search kmail source for it
174 
175  int pos = remail.search(str);
176  if (pos != -1)
177  {
178  TQString all = remail.cap(0);
179  email = remail.cap(1);
180  str.replace(all, ""); // remove mail address
181  }
182 
183  // simplify the rest and use it as name
184 
185  name = str.simplifyWhiteSpace();
186 
187  // after removing the email, str might have
188  // the format "(Foo M. Bar)". We cut off
189  // parentheses if there are any. However, if
190  // str is of the format "Foo M. Bar (President)",
191  // we should not cut anything.
192 
193  TQRegExp rename("^\\(([^\\)]*)\\)");
194 
195  pos = rename.search(name);
196 
197  if (pos != -1)
198  {
199  name = rename.cap(1);
200  }
201 
202  name = name.isEmpty() ? TQString() : name;
203  email = email.isEmpty() ? TQString() : email;
204 }
205 
206 TQString parseItemAuthor(const TQDomElement& element, Format format, Version version)
207 {
208  TQString name;
209  TQString email;
210 
211  TQDomElement dcCreator = element.namedItem("dc:creator").toElement();
212 
213  if (!dcCreator.isNull())
214  authorFromString(dcCreator.text(), name, email);
215  else if (format == AtomFeed)
216  {
217  TQDomElement atomAuthor = element.namedItem("author").toElement();
218  if (atomAuthor.isNull())
219  atomAuthor = element.namedItem("atom:author").toElement();
220  if (!atomAuthor.isNull())
221  {
222  TQDomElement atomName = atomAuthor.namedItem("name").toElement();
223  if (atomName.isNull())
224  atomName = atomAuthor.namedItem("atom:name").toElement();
225  name = atomName.text().stripWhiteSpace();
226 
227  TQDomElement atomEmail = atomAuthor.namedItem("email").toElement();
228  if (atomEmail.isNull())
229  atomEmail = atomAuthor.namedItem("atom:email").toElement();
230  email = atomEmail.text().stripWhiteSpace();
231  }
232  }
233  else if (format == RSSFeed)
234  {
235  authorFromString(element.namedItem("author").toElement().text(), name, email);
236  }
237 
238  if (name.isNull())
239  name = email;
240 
241  if (!email.isNull())
242  return TQString("<a href=\"mailto:%1\">%2</a>").arg(email).arg(name);
243  else
244  return name;
245 }
246 
247 } // namespace RSS