akregator/src/librss

feeddetector.cpp
1/*
2 This file is part of Akregator.
3
4 Copyright (C) 2004 Teemu Rytilahti <tpr@d5k.net>
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19
20 As a special exception, permission is given to link this program
21 with any edition of TQt, and distribute the resulting executable,
22 without including the source code for TQt in the source distribution.
23*/
24
25#include <tqregexp.h>
26#include <tqstring.h>
27#include <tqstringlist.h>
28#include <tqvaluelist.h>
29#include <kcharsets.h>
30#include <kurl.h>
31
32#include "feeddetector.h"
33
34
35using namespace RSS;
36
37FeedDetectorEntryList FeedDetector::extractFromLinkTags(const TQString& s)
38{
39 //reduce all sequences of spaces, newlines etc. to one space:
40 TQString str = s.simplifyWhiteSpace();
41
42 // extracts <link> tags
43 TQRegExp reLinkTag("<[\\s]?LINK[^>]*REL[\\s]?=[\\s]?\\\"[\\s]?(ALTERNATE|SERVICE\\.FEED)[\\s]?\\\"[^>]*>", false);
44
45 // extracts the URL (href="url")
46 TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
47 // extracts type attribute
48 TQRegExp reType("TYPE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
49 // extracts the title (title="title")
50 TQRegExp reTitle("TITLE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
51
52 int pos = 0;
53 int matchpos = 0;
54
55 // get all <link> tags
56 TQStringList linkTags;
57 //int strlength = str.length();
58 while ( matchpos != -1 )
59 {
60 matchpos = reLinkTag.search(str, pos);
61 if (matchpos != -1)
62 {
63 linkTags.append( str.mid(matchpos, reLinkTag.matchedLength()) );
64 pos = matchpos + reLinkTag.matchedLength();
65 }
66 }
67
68 FeedDetectorEntryList list;
69
70 for ( TQStringList::Iterator it = linkTags.begin(); it != linkTags.end(); ++it )
71 {
72 TQString type;
73 int pos = reType.search(*it, 0);
74 if (pos != -1)
75 type = TQString(reType.cap(1)).lower();
76
77 // we accept only type attributes indicating a feed
78 if ( type != "application/rss+xml" && type != "application/rdf+xml"
79 && type != "application/atom+xml" && type != "text/xml" )
80 continue;
81
82 TQString title;
83 pos = reTitle.search(*it, 0);
84 if (pos != -1)
85 title = reTitle.cap(1);
86
87 title = KCharsets::resolveEntities(title);
88
89 TQString url;
90 pos = reHref.search(*it, 0);
91 if (pos != -1)
92 url = reHref.cap(1);
93
94 url = KCharsets::resolveEntities(url);
95
96 // if feed has no title, use the url as preliminary title (until feed is parsed)
97 if ( title.isEmpty() )
98 title = url;
99
100 if ( !url.isEmpty() )
101 list.append(FeedDetectorEntry(url, title) );
102 }
103
104
105 return list;
106}
107
108TQStringList FeedDetector::extractBruteForce(const TQString& s)
109{
110 TQString str = s.simplifyWhiteSpace();
111
112 TQRegExp reAhrefTag("<[\\s]?A[^>]?HREF=[\\s]?\\\"[^\\\"]*\\\"[^>]*>", false);
113
114 // extracts the URL (href="url")
115 TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
116
117 TQRegExp rssrdfxml(".*(RSS|RDF|XML)", false);
118
119 int pos = 0;
120 int matchpos = 0;
121
122 // get all <a href> tags and capture url
123 TQStringList list;
124 //int strlength = str.length();
125 while ( matchpos != -1 )
126 {
127 matchpos = reAhrefTag.search(str, pos);
128 if ( matchpos != -1 )
129 {
130 TQString ahref = str.mid(matchpos, reAhrefTag.matchedLength());
131 int hrefpos = reHref.search(ahref, 0);
132 if ( hrefpos != -1 )
133 {
134 TQString url = reHref.cap(1);
135
136 url = KCharsets::resolveEntities(url);
137
138 if ( rssrdfxml.exactMatch(url) )
139 list.append(url);
140 }
141
142 pos = matchpos + reAhrefTag.matchedLength();
143 }
144 }
145
146 return list;
147}
148
149TQString FeedDetector::fixRelativeURL(const TQString &s, const KURL &baseurl)
150{
151 TQString s2=s;
152 KURL u;
153 if (KURL::isRelativeURL(s2))
154 {
155 if (s2.startsWith("//"))
156 {
157 s2=s2.prepend(baseurl.protocol()+":");
158 u=s2;
159 }
160 else if (s2.startsWith("/"))
161 {
162 KURL b2(baseurl);
163 b2.setPath(TQString()); // delete path and query, so that only protocol://host remains
164 b2.setQuery(TQString());
165 u = KURL(b2, s2.remove(0,1)); // remove leading "/"
166 }
167 else
168 {
169 u = KURL(baseurl, s2);
170 }
171 }
172 else
173 u=s2;
174
175 u.cleanPath();
176 //kdDebug() << "AKREGATOR_PLUGIN_FIXURL: " << "url=" << s << " baseurl=" << baseurl.url() << " fixed=" << u.url() <<
177 //endl;
178 return u.url();
179}
static TQStringList extractBruteForce(const TQString &s)
searches an HTML page for slightly feed-like looking links and catches everything not running away qu...