akregator/src/librss

feeddetector.cpp
1 /*
2  This file is part of Akregator.
3 
4  Copyright (C) 2004 Teemu Rytilahti <tpr@d5k.net>
5 
6  This program is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation; either version 2 of the License, or
9  (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this program; if not, write to the Free Software
18  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 
20  As a special exception, permission is given to link this program
21  with any edition of TQt, and distribute the resulting executable,
22  without including the source code for TQt in the source distribution.
23 */
24 
25 #include <tqregexp.h>
26 #include <tqstring.h>
27 #include <tqstringlist.h>
28 #include <tqvaluelist.h>
29 #include <kcharsets.h>
30 #include <kurl.h>
31 
32 #include "feeddetector.h"
33 
34 
35 using namespace RSS;
36 
37 FeedDetectorEntryList FeedDetector::extractFromLinkTags(const TQString& s)
38 {
39  //reduce all sequences of spaces, newlines etc. to one space:
40  TQString str = s.simplifyWhiteSpace();
41 
42  // extracts <link> tags
43  TQRegExp reLinkTag("<[\\s]?LINK[^>]*REL[\\s]?=[\\s]?\\\"[\\s]?(ALTERNATE|SERVICE\\.FEED)[\\s]?\\\"[^>]*>", false);
44 
45  // extracts the URL (href="url")
46  TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
47  // extracts type attribute
48  TQRegExp reType("TYPE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
49  // extracts the title (title="title")
50  TQRegExp reTitle("TITLE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
51 
52  int pos = 0;
53  int matchpos = 0;
54 
55  // get all <link> tags
56  TQStringList linkTags;
57  //int strlength = str.length();
58  while ( matchpos != -1 )
59  {
60  matchpos = reLinkTag.search(str, pos);
61  if (matchpos != -1)
62  {
63  linkTags.append( str.mid(matchpos, reLinkTag.matchedLength()) );
64  pos = matchpos + reLinkTag.matchedLength();
65  }
66  }
67 
68  FeedDetectorEntryList list;
69 
70  for ( TQStringList::Iterator it = linkTags.begin(); it != linkTags.end(); ++it )
71  {
72  TQString type;
73  int pos = reType.search(*it, 0);
74  if (pos != -1)
75  type = TQString(reType.cap(1)).lower();
76 
77  // we accept only type attributes indicating a feed
78  if ( type != "application/rss+xml" && type != "application/rdf+xml"
79  && type != "application/atom+xml" && type != "text/xml" )
80  continue;
81 
82  TQString title;
83  pos = reTitle.search(*it, 0);
84  if (pos != -1)
85  title = reTitle.cap(1);
86 
87  title = KCharsets::resolveEntities(title);
88 
89  TQString url;
90  pos = reHref.search(*it, 0);
91  if (pos != -1)
92  url = reHref.cap(1);
93 
94  url = KCharsets::resolveEntities(url);
95 
96  // if feed has no title, use the url as preliminary title (until feed is parsed)
97  if ( title.isEmpty() )
98  title = url;
99 
100  if ( !url.isEmpty() )
101  list.append(FeedDetectorEntry(url, title) );
102  }
103 
104 
105  return list;
106 }
107 
108 TQStringList FeedDetector::extractBruteForce(const TQString& s)
109 {
110  TQString str = s.simplifyWhiteSpace();
111 
112  TQRegExp reAhrefTag("<[\\s]?A[^>]?HREF=[\\s]?\\\"[^\\\"]*\\\"[^>]*>", false);
113 
114  // extracts the URL (href="url")
115  TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
116 
117  TQRegExp rssrdfxml(".*(RSS|RDF|XML)", false);
118 
119  int pos = 0;
120  int matchpos = 0;
121 
122  // get all <a href> tags and capture url
123  TQStringList list;
124  //int strlength = str.length();
125  while ( matchpos != -1 )
126  {
127  matchpos = reAhrefTag.search(str, pos);
128  if ( matchpos != -1 )
129  {
130  TQString ahref = str.mid(matchpos, reAhrefTag.matchedLength());
131  int hrefpos = reHref.search(ahref, 0);
132  if ( hrefpos != -1 )
133  {
134  TQString url = reHref.cap(1);
135 
136  url = KCharsets::resolveEntities(url);
137 
138  if ( rssrdfxml.exactMatch(url) )
139  list.append(url);
140  }
141 
142  pos = matchpos + reAhrefTag.matchedLength();
143  }
144  }
145 
146  return list;
147 }
148 
149 TQString FeedDetector::fixRelativeURL(const TQString &s, const KURL &baseurl)
150 {
151  TQString s2=s;
152  KURL u;
153  if (KURL::isRelativeURL(s2))
154  {
155  if (s2.startsWith("//"))
156  {
157  s2=s2.prepend(baseurl.protocol()+":");
158  u=s2;
159  }
160  else if (s2.startsWith("/"))
161  {
162  KURL b2(baseurl);
163  b2.setPath(TQString()); // delete path and query, so that only protocol://host remains
164  b2.setQuery(TQString());
165  u = KURL(b2, s2.remove(0,1)); // remove leading "/"
166  }
167  else
168  {
169  u = KURL(baseurl, s2);
170  }
171  }
172  else
173  u=s2;
174 
175  u.cleanPath();
176  //kdDebug() << "AKREGATOR_PLUGIN_FIXURL: " << "url=" << s << " baseurl=" << baseurl.url() << " fixed=" << u.url() <<
177  //endl;
178  return u.url();
179 }
static FeedDetectorEntryList extractFromLinkTags(const TQString &s)
Searches an HTML page for feeds listed in <link> tags. Only <link> tags with rel attribute values "alternate" or "service.feed" are considered.
static TQStringList extractBruteForce(const TQString &s)
searches an HTML page for slightly feed-like looking links and catches everything not running away qu...