kmail

encodingdetector.cpp
1/*
2 This file was taken from the KDE 4.x libraries and backported to TQt 3.
3
4 Copyright (C) 1999 Lars Knoll (knoll@kde.org)
5 Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
6 Copyright (C) 2003 Apple Computer, Inc.
7 Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
8
9 This library is free software; you can redistribute it and/or
10 modify it under the terms of the GNU Library General Public
11 License as published by the Free Software Foundation; either
12 version 2 of the License, or (at your option) any later version.
13
14 This library is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 Library General Public License for more details.
18
19 You should have received a copy of the GNU Library General Public License
20 along with this library; see the file COPYING.LIB. If not, write to
21 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22 Boston, MA 02110-1301, USA.
23*/
24//----------------------------------------------------------------------------
25//
26// decoder for input stream
27
28#include "encodingdetector.h"
29
30#undef DECODE_DEBUG
31//#define DECODE_DEBUG
32
// 16 KiB size constant. The expansion is parenthesized so that the macro keeps
// its value inside larger expressions (e.g. `n / MAX_BUFFER`, `MAX_BUFFER % k`).
#define MAX_BUFFER (16*1024)
34
35#include <assert.h>
36#include <stdlib.h>
37
38#include "encodingdetector_ja_p.h"
39
40#include <tqregexp.h>
41#include <tqtextcodec.h>
42
43#include <tdeglobal.h>
44#include <kcharsets.h>
45#include <kdebug.h>
46#include <tdelocale.h>
47
48#include <ctype.h>
49
50// The following table was taken from libpango 1.19.3 and slightly modified.
51// Multiple scripts per language were removed and the entries were reordered so
52// that simple substring matching will work. For example, bam was put before ba
53// so that the first match will be likely the right match. Otherwise "ba" would
54// match "bam" but we would have to search on to find "bam" which is what we want.
55// The original file is called pango-script-lang-table.h
56
57/* pango-script-lang-table.h:
58 *
59 * Generated by gen-script-for-lang-new.c
60 * Date: 2007-10-26
61 * Source: fontconfig-2.4.91
62 *
63 * Do not edit. // I did. Sue me ;)
64 */
65typedef struct _PangoScriptForLang {
66 const char lang[6];
67 EncodingDetector::AutoDetectScript scripts[1];
68} PangoScriptForLang;
69
70//Unfortunately EncodingDetector does not know all scripts that Pango knows.
71//Also, using EncodingDetector::CentralEuropean for the appropriate countries
72//might give better results in some cases.
73//One especially important (many speakers/literates) omission is the lack of
74//Indian scripts.
75
// Scripts that Pango knows but EncodingDetector does not: all of them are
// aliased to EncodingDetector::None.
#define PANGO_SCRIPT_ARMENIAN            EncodingDetector::None
#define PANGO_SCRIPT_BENGALI             EncodingDetector::None
#define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None
#define PANGO_SCRIPT_CHEROKEE            EncodingDetector::None
#define PANGO_SCRIPT_DEVANAGARI          EncodingDetector::None
#define PANGO_SCRIPT_ETHIOPIC            EncodingDetector::None
#define PANGO_SCRIPT_GUJARATI            EncodingDetector::None
#define PANGO_SCRIPT_GURMUKHI            EncodingDetector::None
#define PANGO_SCRIPT_KANNADA             EncodingDetector::None
#define PANGO_SCRIPT_KHMER               EncodingDetector::None
#define PANGO_SCRIPT_LAO                 EncodingDetector::None
#define PANGO_SCRIPT_MALAYALAM           EncodingDetector::None
#define PANGO_SCRIPT_MONGOLIAN           EncodingDetector::None
#define PANGO_SCRIPT_MYANMAR             EncodingDetector::None
#define PANGO_SCRIPT_ORIYA               EncodingDetector::None
#define PANGO_SCRIPT_SINHALA             EncodingDetector::None
#define PANGO_SCRIPT_SYRIAC              EncodingDetector::None
#define PANGO_SCRIPT_TAGALOG             EncodingDetector::None
#define PANGO_SCRIPT_TAMIL               EncodingDetector::None
#define PANGO_SCRIPT_TIBETAN             EncodingDetector::None
#define PANGO_SCRIPT_TELUGU              EncodingDetector::None

// Scripts that do have an EncodingDetector counterpart. Aliasing the Pango
// names here avoids editing the imported table even further.
#define PANGO_SCRIPT_ARABIC              EncodingDetector::Arabic
#define PANGO_SCRIPT_CYRILLIC            EncodingDetector::Cyrillic
#define PANGO_SCRIPT_GEORGIAN            EncodingDetector::SouthEasternEurope
#define PANGO_SCRIPT_GREEK               EncodingDetector::Greek
#define PANGO_SCRIPT_HEBREW              EncodingDetector::Hebrew
#define PANGO_SCRIPT_LATIN               EncodingDetector::WesternEuropean
#define PANGO_SCRIPT_THAI                EncodingDetector::Thai
106
107
108static const PangoScriptForLang pango_script_for_lang[] = {
109 { "aa", { PANGO_SCRIPT_LATIN/*62*/ } },
110 { "ab", { PANGO_SCRIPT_CYRILLIC/*90*/ } },
111 { "af", { PANGO_SCRIPT_LATIN/*69*/ } },
112 { "am", { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
113 { "ar", { PANGO_SCRIPT_ARABIC/*125*/ } },
114 { "as", { PANGO_SCRIPT_BENGALI/*89*/ } },
115 { "ast", { PANGO_SCRIPT_LATIN/*66*/ } },
116 { "ava", { PANGO_SCRIPT_CYRILLIC/*67*/ } },
117 { "ay", { PANGO_SCRIPT_LATIN/*60*/ } },
118 { "az-ir", { PANGO_SCRIPT_ARABIC/*129*/ } },
119 { "az", { PANGO_SCRIPT_CYRILLIC/*80*/ } }, //, PANGO_SCRIPT_LATIN/*68*/ } },
120 { "bam", { PANGO_SCRIPT_LATIN/*60*/ } },
121 { "ba", { PANGO_SCRIPT_CYRILLIC/*82*/ } },
122 { "be", { PANGO_SCRIPT_CYRILLIC/*68*/ } },
123 { "bg", { PANGO_SCRIPT_CYRILLIC/*60*/ } },
124 { "bh", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
125 { "bho", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
126 { "bi", { PANGO_SCRIPT_LATIN/*58*/ } },
127 { "bin", { PANGO_SCRIPT_LATIN/*76*/ } },
128 { "bn", { PANGO_SCRIPT_BENGALI/*89*/ } },
129 { "bo", { PANGO_SCRIPT_TIBETAN/*95*/ } },
130 { "br", { PANGO_SCRIPT_LATIN/*64*/ } },
131 { "bs", { PANGO_SCRIPT_LATIN/*62*/ } },
132 { "bua", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
133 { "ca", { PANGO_SCRIPT_LATIN/*74*/ } },
134 { "ce", { PANGO_SCRIPT_CYRILLIC/*67*/ } },
135 { "chm", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
136 { "chr", { PANGO_SCRIPT_CHEROKEE/*85*/ } },
137 { "ch", { PANGO_SCRIPT_LATIN/*58*/ } },
138 { "co", { PANGO_SCRIPT_LATIN/*84*/ } },
139 { "cs", { PANGO_SCRIPT_LATIN/*82*/ } },
140 { "cu", { PANGO_SCRIPT_CYRILLIC/*103*/ } },
141 { "cv", { PANGO_SCRIPT_CYRILLIC/*72*/ } }, //, PANGO_SCRIPT_LATIN/*2*/ } },
142 { "cy", { PANGO_SCRIPT_LATIN/*78*/ } },
143 { "da", { PANGO_SCRIPT_LATIN/*70*/ } },
144 { "de", { PANGO_SCRIPT_LATIN/*59*/ } },
145 { "dz", { PANGO_SCRIPT_TIBETAN/*95*/ } },
146 { "el", { PANGO_SCRIPT_GREEK/*69*/ } },
147 { "en", { PANGO_SCRIPT_LATIN/*72*/ } },
148 { "eo", { PANGO_SCRIPT_LATIN/*64*/ } },
149 { "es", { PANGO_SCRIPT_LATIN/*66*/ } },
150// { "et", { PANGO_SCRIPT_LATIN/*64*/ } },
151 { "et", { EncodingDetector::Baltic } },
152 { "eu", { PANGO_SCRIPT_LATIN/*56*/ } },
153 { "fa", { PANGO_SCRIPT_ARABIC/*129*/ } },
154 { "fi", { PANGO_SCRIPT_LATIN/*62*/ } },
155 { "fj", { PANGO_SCRIPT_LATIN/*52*/ } },
156 { "fo", { PANGO_SCRIPT_LATIN/*68*/ } },
157 { "fr", { PANGO_SCRIPT_LATIN/*84*/ } },
158 { "ful", { PANGO_SCRIPT_LATIN/*62*/ } },
159 { "fur", { PANGO_SCRIPT_LATIN/*66*/ } },
160 { "fy", { PANGO_SCRIPT_LATIN/*75*/ } },
161 { "ga", { PANGO_SCRIPT_LATIN/*80*/ } },
162 { "gd", { PANGO_SCRIPT_LATIN/*70*/ } },
163 { "gez", { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
164 { "gl", { PANGO_SCRIPT_LATIN/*66*/ } },
165 { "gn", { PANGO_SCRIPT_LATIN/*70*/ } },
166 { "gu", { PANGO_SCRIPT_GUJARATI/*78*/ } },
167 { "gv", { PANGO_SCRIPT_LATIN/*54*/ } },
168 { "ha", { PANGO_SCRIPT_LATIN/*60*/ } },
169 { "haw", { PANGO_SCRIPT_LATIN/*62*/ } },
170 { "he", { PANGO_SCRIPT_HEBREW/*27*/ } },
171 { "hi", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
172 { "ho", { PANGO_SCRIPT_LATIN/*52*/ } },
173 { "hr", { PANGO_SCRIPT_LATIN/*62*/ } },
174 { "hu", { PANGO_SCRIPT_LATIN/*70*/ } },
175 { "hy", { PANGO_SCRIPT_ARMENIAN/*77*/ } },
176 { "ia", { PANGO_SCRIPT_LATIN/*52*/ } },
177 { "ibo", { PANGO_SCRIPT_LATIN/*58*/ } },
178 { "id", { PANGO_SCRIPT_LATIN/*54*/ } },
179 { "ie", { PANGO_SCRIPT_LATIN/*52*/ } },
180 { "ik", { PANGO_SCRIPT_CYRILLIC/*68*/ } },
181 { "io", { PANGO_SCRIPT_LATIN/*52*/ } },
182 { "is", { PANGO_SCRIPT_LATIN/*70*/ } },
183 { "it", { PANGO_SCRIPT_LATIN/*72*/ } },
184 { "iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL/*161*/ } },
185// { "ja", { PANGO_SCRIPT_HAN/*6356*/, PANGO_SCRIPT_KATAKANA/*88*/, PANGO_SCRIPT_HIRAGANA/*85*/ } },
186 { "ja", { EncodingDetector::Japanese } },
187 { "kaa", { PANGO_SCRIPT_CYRILLIC/*78*/ } },
188 { "ka", { PANGO_SCRIPT_GEORGIAN/*33*/ } },
189 { "ki", { PANGO_SCRIPT_LATIN/*56*/ } },
190 { "kk", { PANGO_SCRIPT_CYRILLIC/*77*/ } },
191 { "kl", { PANGO_SCRIPT_LATIN/*81*/ } },
192 { "km", { PANGO_SCRIPT_KHMER/*70*/ } },
193 { "kn", { PANGO_SCRIPT_KANNADA/*80*/ } },
194// { "ko", { PANGO_SCRIPT_HANGUL/*2443*/ } },
195 { "ko", { EncodingDetector::Korean } },
196 { "kok", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
197 { "ks", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
198 { "ku-ir", { PANGO_SCRIPT_ARABIC/*32*/ } },
199 { "ku", { PANGO_SCRIPT_CYRILLIC/*60*/ } }, //, PANGO_SCRIPT_LATIN/*4*/ } },
200 { "kum", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
201 { "kv", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
202 { "kw", { PANGO_SCRIPT_LATIN/*64*/ } },
203 { "ky", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
204 { "la", { PANGO_SCRIPT_LATIN/*68*/ } },
205 { "lb", { PANGO_SCRIPT_LATIN/*75*/ } },
206 { "lez", { PANGO_SCRIPT_CYRILLIC/*67*/ } },
207 { "ln", { PANGO_SCRIPT_LATIN/*78*/ } },
208 { "lo", { PANGO_SCRIPT_LAO/*65*/ } },
209// { "lt", { PANGO_SCRIPT_LATIN/*70*/ } },
210 { "lt", { EncodingDetector::Baltic } },
211// { "lv", { PANGO_SCRIPT_LATIN/*78*/ } },
212 { "lv", { EncodingDetector::Baltic } },
213 { "mg", { PANGO_SCRIPT_LATIN/*56*/ } },
214 { "mh", { PANGO_SCRIPT_LATIN/*62*/ } },
215 { "mi", { PANGO_SCRIPT_LATIN/*64*/ } },
216 { "mk", { PANGO_SCRIPT_CYRILLIC/*42*/ } },
217 { "ml", { PANGO_SCRIPT_MALAYALAM/*78*/ } },
218 { "mn", { PANGO_SCRIPT_MONGOLIAN/*130*/ } },
219 { "mo", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, //, PANGO_SCRIPT_LATIN/*62*/ } },
220 { "mr", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
221 { "mt", { PANGO_SCRIPT_LATIN/*72*/ } },
222 { "my", { PANGO_SCRIPT_MYANMAR/*48*/ } },
223 { "nb", { PANGO_SCRIPT_LATIN/*70*/ } },
224 { "nds", { PANGO_SCRIPT_LATIN/*59*/ } },
225 { "ne", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
226 { "nl", { PANGO_SCRIPT_LATIN/*82*/ } },
227 { "nn", { PANGO_SCRIPT_LATIN/*76*/ } },
228 { "no", { PANGO_SCRIPT_LATIN/*70*/ } },
229 { "nr", { PANGO_SCRIPT_LATIN/*52*/ } },
230 { "nso", { PANGO_SCRIPT_LATIN/*58*/ } },
231 { "ny", { PANGO_SCRIPT_LATIN/*54*/ } },
232 { "oc", { PANGO_SCRIPT_LATIN/*70*/ } },
233 { "om", { PANGO_SCRIPT_LATIN/*52*/ } },
234 { "or", { PANGO_SCRIPT_ORIYA/*79*/ } },
235 { "os", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
236 { "pa", { PANGO_SCRIPT_GURMUKHI/*63*/ } },
237 { "pl", { PANGO_SCRIPT_LATIN/*70*/ } },
238 { "ps-af", { PANGO_SCRIPT_ARABIC/*49*/ } },
239 { "ps-pk", { PANGO_SCRIPT_ARABIC/*49*/ } },
240 { "pt", { PANGO_SCRIPT_LATIN/*82*/ } },
241 { "rm", { PANGO_SCRIPT_LATIN/*66*/ } },
242 { "ro", { PANGO_SCRIPT_LATIN/*62*/ } },
243 { "ru", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
244 { "sah", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
245 { "sa", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
246 { "sco", { PANGO_SCRIPT_LATIN/*56*/ } },
247 { "sel", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
248 { "se", { PANGO_SCRIPT_LATIN/*66*/ } },
249 { "sh", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
250 { "si", { PANGO_SCRIPT_SINHALA/*77*/ } },
251 { "sk", { PANGO_SCRIPT_LATIN/*86*/ } },
252 { "sl", { PANGO_SCRIPT_LATIN/*62*/ } },
253 { "sma", { PANGO_SCRIPT_LATIN/*60*/ } },
254 { "smj", { PANGO_SCRIPT_LATIN/*60*/ } },
255 { "smn", { PANGO_SCRIPT_LATIN/*68*/ } },
256 { "sms", { PANGO_SCRIPT_LATIN/*80*/ } },
257 { "sm", { PANGO_SCRIPT_LATIN/*52*/ } },
258 { "so", { PANGO_SCRIPT_LATIN/*52*/ } },
259 { "sq", { PANGO_SCRIPT_LATIN/*56*/ } },
260 { "sr", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
261 { "ss", { PANGO_SCRIPT_LATIN/*52*/ } },
262 { "st", { PANGO_SCRIPT_LATIN/*52*/ } },
263 { "sv", { PANGO_SCRIPT_LATIN/*68*/ } },
264 { "sw", { PANGO_SCRIPT_LATIN/*52*/ } },
265 { "syr", { PANGO_SCRIPT_SYRIAC/*45*/ } },
266 { "ta", { PANGO_SCRIPT_TAMIL/*48*/ } },
267 { "te", { PANGO_SCRIPT_TELUGU/*80*/ } },
268 { "tg", { PANGO_SCRIPT_CYRILLIC/*78*/ } },
269 { "th", { PANGO_SCRIPT_THAI/*86*/ } },
270 { "ti-er", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
271 { "ti-et", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
272 { "tig", { PANGO_SCRIPT_ETHIOPIC/*221*/ } },
273 { "tk", { PANGO_SCRIPT_CYRILLIC/*74*/ } },
274 { "tl", { PANGO_SCRIPT_TAGALOG/*19*/ } },
275 { "tn", { PANGO_SCRIPT_LATIN/*58*/ } },
276 { "to", { PANGO_SCRIPT_LATIN/*52*/ } },
277// { "tr", { PANGO_SCRIPT_LATIN/*70*/ } },
278 { "tr", { EncodingDetector::Turkish } },
279 { "ts", { PANGO_SCRIPT_LATIN/*52*/ } },
280 { "tt", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
281 { "tw", { PANGO_SCRIPT_LATIN/*70*/ } },
282 { "tyv", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
283 { "ug", { PANGO_SCRIPT_ARABIC/*125*/ } },
284 { "uk", { PANGO_SCRIPT_CYRILLIC/*72*/ } },
285 { "ur", { PANGO_SCRIPT_ARABIC/*145*/ } },
286 { "uz", { PANGO_SCRIPT_CYRILLIC/*68*/ } },
287 { "ven", { PANGO_SCRIPT_LATIN/*62*/ } },
288 { "vi", { PANGO_SCRIPT_LATIN/*186*/ } },
289 { "vot", { PANGO_SCRIPT_LATIN/*62*/ } },
290 { "vo", { PANGO_SCRIPT_LATIN/*54*/ } },
291 { "wa", { PANGO_SCRIPT_LATIN/*70*/ } },
292 { "wen", { PANGO_SCRIPT_LATIN/*76*/ } },
293 { "wo", { PANGO_SCRIPT_LATIN/*66*/ } },
294 { "xh", { PANGO_SCRIPT_LATIN/*52*/ } },
295 { "yap", { PANGO_SCRIPT_LATIN/*58*/ } },
296 { "yi", { PANGO_SCRIPT_HEBREW/*27*/ } },
297 { "yo", { PANGO_SCRIPT_LATIN/*114*/ } },
298// { "zh-cn", { PANGO_SCRIPT_HAN/*6763*/ } },
299 { "zh-cn", { EncodingDetector::ChineseSimplified } },
300// { "zh-hk", { PANGO_SCRIPT_HAN/*2213*/ } },
301 { "zh-hk", { EncodingDetector::ChineseTraditional } },
302// { "zh-mo", { PANGO_SCRIPT_HAN/*2213*/ } },
303 { "zh-mo", { EncodingDetector::ChineseTraditional } },
304// { "zh-sg", { PANGO_SCRIPT_HAN/*6763*/ } },
305 { "zh-sg", { EncodingDetector::ChineseSimplified } },
306// { "zh-tw", { PANGO_SCRIPT_HAN/*13063*/ } },
307 { "zh-tw", { EncodingDetector::ChineseTraditional } },
308 { "zu", { PANGO_SCRIPT_LATIN/*52*/ } },
309 { "\x00", { EncodingDetector::None } } //end mark
310};
311
// Character-set MIB numbers that this file compares against
// TQTextCodec::mibEnum(). Listed in ascending numeric order.
enum MIB
{
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
322
323static bool is16Bit(TQTextCodec* codec)
324{
325 switch (codec->mibEnum())
326 {
327 case MibUtf16:
328 case MibUtf16BE:
329 case MibUtf16LE:
330 case MibUcs2:
331 return true;
332 default:
333 return false;
334 }
335}
336
337class EncodingDetectorPrivate
338{
339public:
340 TQTextCodec *m_codec;
341 TQTextDecoder *m_decoder; // utf16
342 TQTextCodec *m_defaultCodec;
343 TQCString m_storeDecoderName;
344
345 EncodingDetector::EncodingChoiceSource m_source;
346 EncodingDetector::AutoDetectScript m_autoDetectLanguage;
347
348 bool m_visualRTL : 1;
349 bool m_seenBody : 1;
350 bool m_writtingHappened : 1;
351 bool m_analyzeCalled : 1; //for decode()
352 int m_multiByte;
353
354 TQCString m_bufferForDefferedEncDetection;
355
356 EncodingDetectorPrivate()
357 : m_codec(TQTextCodec::codecForMib(MibLatin1))
358 , m_decoder(m_codec->makeDecoder())
359 , m_defaultCodec(m_codec)
360 , m_source(EncodingDetector::DefaultEncoding)
361 , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection)
362 , m_visualRTL(false)
363 , m_seenBody(false)
364 , m_writtingHappened(false)
365 , m_analyzeCalled(false)
366 , m_multiByte(0)
367 {
368 }
369
370 EncodingDetectorPrivate(TQTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
371 : m_codec(codec)
372 , m_decoder(m_codec->makeDecoder())
373 , m_defaultCodec(m_codec)
374 , m_source(source)
375 , m_autoDetectLanguage(script)
376 , m_visualRTL(false)
377 , m_seenBody(false)
378 , m_writtingHappened(false)
379 , m_analyzeCalled(false)
380 , m_multiByte(0)
381 {
382 }
383
384 ~EncodingDetectorPrivate()
385 {
386 delete m_decoder;
387 }
388};
389
390
391static TQCString automaticDetectionForArabic( const unsigned char* ptr, int size )
392{
393 for ( int i = 0; i < size; ++i ) {
394 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
395 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
396 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
397 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
398 return "cp1256";
399 }
400 }
401
402 return "iso-8859-6";
403}
404
405static TQCString automaticDetectionForBaltic( const unsigned char* ptr, int size )
406{
407 for ( int i = 0; i < size; ++i ) {
408 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
409 return "cp1257";
410
411 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
412 return "iso-8859-13";
413 }
414
415 return "iso-8859-13";
416}
417
418static TQCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
419{
420 TQCString charset;
421 for ( int i = 0; i < size; ++i ) {
422 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
423 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
424 return "ibm852";
425
426 if ( i + 1 > size )
427 return "cp1250";
428 else { // maybe ibm852 ?
429 charset = "cp1250";
430 continue;
431 }
432 }
433 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
434 if ( i + 1 > size )
435 return "iso-8859-2";
436 else { // maybe ibm852 ?
437 if ( charset.isNull() )
438 charset = "iso-8859-2";
439 continue;
440 }
441 }
442 }
443
444 if ( charset.isNull() )
445 charset = "iso-8859-3";
446
447 return charset.data();
448}
449
450static TQCString automaticDetectionForCyrillic( const unsigned char* ptr, int size)
451{
452#ifdef DECODE_DEBUG
453 kWarning() << "EncodingDetector: Cyr heuristics";
454#endif
455
456// if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
457// return "utf8";
458 int utf8_mark=0;
459 int koi_score=0;
460 int cp1251_score=0;
461
462 int koi_st=0;
463 int cp1251_st=0;
464
465// int koi_na=0;
466// int cp1251_na=0;
467
468 int koi_o_capital=0;
469 int koi_o=0;
470 int cp1251_o_capital=0;
471 int cp1251_o=0;
472
473 int koi_a_capital=0;
474 int koi_a=0;
475 int cp1251_a_capital=0;
476 int cp1251_a=0;
477
478 int koi_s_capital=0;
479 int koi_s=0;
480 int cp1251_s_capital=0;
481 int cp1251_s=0;
482
483 int koi_i_capital=0;
484 int koi_i=0;
485 int cp1251_i_capital=0;
486 int cp1251_i=0;
487
488 int cp1251_small_range=0;
489 int koi_small_range=0;
490 int ibm866_small_range=0;
491
492 int i;
493 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
494 {
495 if (ptr[i]>0xdf)
496 {
497 ++cp1251_small_range;
498
499 if (ptr[i]==0xee)//small o
500 ++cp1251_o;
501 else if (ptr[i]==0xe0)//small a
502 ++cp1251_a;
503 else if (ptr[i]==0xe8)//small i
504 ++cp1251_i;
505 else if (ptr[i]==0xf1)//small s
506 ++cp1251_s;
507 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st
508 ++cp1251_st;
509
510 else if (ptr[i]==0xef)
511 ++koi_o_capital;
512 else if (ptr[i]==0xe1)
513 ++koi_a_capital;
514 else if (ptr[i]==0xe9)
515 ++koi_i_capital;
516 else if (ptr[i]==0xf3)
517 ++koi_s_capital;
518
519 }
520 else if (ptr[i]>0xbf)
521 {
522 ++koi_small_range;
523
524 if (ptr[i]==0xd0||ptr[i]==0xd1)//small o
525 ++utf8_mark;
526 else if (ptr[i]==0xcf)//small o
527 ++koi_o;
528 else if (ptr[i]==0xc1)//small a
529 ++koi_a;
530 else if (ptr[i]==0xc9)//small i
531 ++koi_i;
532 else if (ptr[i]==0xd3)//small s
533 ++koi_s;
534 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st
535 ++koi_st;
536
537 else if (ptr[i]==0xce)
538 ++cp1251_o_capital;
539 else if (ptr[i]==0xc0)
540 ++cp1251_a_capital;
541 else if (ptr[i]==0xc8)
542 ++cp1251_i_capital;
543 else if (ptr[i]==0xd1)
544 ++cp1251_s_capital;
545 }
546 else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60%
547 ++ibm866_small_range;
548
549 }
550
551 //cannot decide?
552 if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
553 {
554 return "";
555 }
556
557 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
558 {
559#ifdef DECODE_DEBUG
560 kWarning() << "Cyr Enc Detection: UTF8";
561#endif
562 return "UTF-8";
563 }
564
565 if (ibm866_small_range>cp1251_small_range+koi_small_range)
566 return "ibm866";
567
568// TQCString koi_string = "koi8-u";
569// TQCString cp1251_string = "cp1251";
570
571 if (cp1251_st==0 && koi_st>1)
572 koi_score+=10;
573 else if (koi_st==0 && cp1251_st>1)
574 cp1251_score+=10;
575
576 if (cp1251_st && koi_st)
577 {
578 if (cp1251_st/koi_st>2)
579 cp1251_score+=20;
580 else if (koi_st/cp1251_st>2)
581 koi_score+=20;
582 }
583
584 if (cp1251_a>koi_a)
585 cp1251_score+=10;
586 else if (cp1251_a || koi_a)
587 koi_score+=10;
588
589 if (cp1251_o>koi_o)
590 cp1251_score+=10;
591 else if (cp1251_o || koi_o)
592 koi_score+=10;
593
594 if (cp1251_i>koi_i)
595 cp1251_score+=10;
596 else if (cp1251_i || koi_i)
597 koi_score+=10;
598
599 if (cp1251_s>koi_s)
600 cp1251_score+=10;
601 else if (cp1251_s || koi_s)
602 koi_score+=10;
603
604 if (cp1251_a_capital>koi_a_capital)
605 cp1251_score+=9;
606 else if (cp1251_a_capital || koi_a_capital)
607 koi_score+=9;
608
609 if (cp1251_o_capital>koi_o_capital)
610 cp1251_score+=9;
611 else if (cp1251_o_capital || koi_o_capital)
612 koi_score+=9;
613
614 if (cp1251_i_capital>koi_i_capital)
615 cp1251_score+=9;
616 else if (cp1251_i_capital || koi_i_capital)
617 koi_score+=9;
618
619 if (cp1251_s_capital>koi_s_capital)
620 cp1251_score+=9;
621 else if (cp1251_s_capital || koi_s_capital)
622 koi_score+=9;
623#ifdef DECODE_DEBUG
624 kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
625#endif
626 if (abs(koi_score-cp1251_score)<10)
627 {
628 //fallback...
629 cp1251_score=cp1251_small_range;
630 koi_score=koi_small_range;
631 }
632 if (cp1251_score>koi_score)
633 return "cp1251";
634 else
635 return "koi8-u";
636
637
638// if (cp1251_score>koi_score)
639// setEncoding("cp1251",AutoDetectedEncoding);
640// else
641// setEncoding("koi8-u",AutoDetectedEncoding);
642// return true;
643
644}
645
646static TQCString automaticDetectionForGreek( const unsigned char* ptr, int size )
647{
648 for ( int i = 0; i < size; ++i ) {
649 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
650 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
651 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
652 return "cp1253";
653 }
654 }
655
656 return "iso-8859-7";
657}
658
659static TQCString automaticDetectionForHebrew( const unsigned char* ptr, int size )
660{
661 for ( int i = 0; i < size; ++i ) {
662 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
663 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
664 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
665 return "cp1255";
666 }
667
668 if ( ptr[ i ] == 0xDF )
669 return "iso-8859-8-i";
670 }
671
672 return "iso-8859-8-i";
673}
674
675static TQCString automaticDetectionForJapanese( const unsigned char* ptr, int size )
676{
677 JapaneseCode kc;
678
679 switch ( kc.guess_jp( (const char*)ptr, size ) ) {
680 case JapaneseCode::JIS:
681 return "jis7";
682 case JapaneseCode::EUC:
683 return "eucjp";
684 case JapaneseCode::SJIS:
685 return "sjis";
686 case JapaneseCode::UTF8:
687 return "utf8";
688 default:
689 break;
690 }
691
692 return "";
693}
694
695static TQCString automaticDetectionForTurkish( const unsigned char* ptr, int size )
696{
697 for ( int i = 0; i < size; ++i ) {
698 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
699 return "cp1254";
700 }
701 }
702
703 return "iso-8859-9";
704}
705
706static TQCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
707{
708 uint nonansi_count=0;
709 for (int i=0; i<size; ++i)
710 {
711 if (ptr[i]>0x79)
712 {
713 ++nonansi_count;
714 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
715 {
716 return "UTF-8";
717 }
718 if (ptr[i] >= 0x78 && ptr[i] <= 0x9 )
719 {
720 return "cp1252";
721 }
722 }
723
724 }
725
726 if (nonansi_count>0)
727 return "iso-8859-15";
728
729 return "";
730}
731
732bool EncodingDetector::errorsIfUtf8 (const char* data, int length)
733{
734 if (d->m_codec->mibEnum()!=MibUtf8)
735 return false; //means no errors
736// #define highest1Bits (unsigned char)0x80
737// #define highest2Bits (unsigned char)0xC0
738// #define highest3Bits (unsigned char)0xE0
739// #define highest4Bits (unsigned char)0xF0
740// #define highest5Bits (unsigned char)0xF8
741static const unsigned char highest1Bits = 0x80;
742static const unsigned char highest2Bits = 0xC0;
743static const unsigned char highest3Bits = 0xE0;
744static const unsigned char highest4Bits = 0xF0;
745static const unsigned char highest5Bits = 0xF8;
746
747 for (int i=0; i<length; ++i)
748 {
749 unsigned char c = data[i];
750
751 if (d->m_multiByte>0)
752 {
753 if ((c & highest2Bits) == 0x80)
754 {
755 --(d->m_multiByte);
756 continue;
757 }
758#ifdef DECODE_DEBUG
759 kWarning() << "EncDetector: Broken UTF8";
760#endif
761 return true;
762 }
763
764 // most significant bit zero, single char
765 if ((c & highest1Bits) == 0x00)
766 continue;
767
768 // 110xxxxx => init 1 following bytes
769 if ((c & highest3Bits) == 0xC0)
770 {
771 d->m_multiByte = 1;
772 continue;
773 }
774
775 // 1110xxxx => init 2 following bytes
776 if ((c & highest4Bits) == 0xE0)
777 {
778 d->m_multiByte = 2;
779 continue;
780 }
781
782 // 11110xxx => init 3 following bytes
783 if ((c & highest5Bits) == 0xF0)
784 {
785 d->m_multiByte = 3;
786 continue;
787 }
788#ifdef DECODE_DEBUG
789 kWarning() << "EncDetector:_Broken UTF8";
790#endif
791 return true;
792 }
793 return false;
794}
795
796EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate)
797{
798}
799
800EncodingDetector::EncodingDetector(TQTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
801 d(new EncodingDetectorPrivate(codec,source,script))
802{
803}
804
805EncodingDetector::~EncodingDetector()
806{
807 delete d;
808}
809
810void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
811{
812 d->m_autoDetectLanguage=lang;
813}
814EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const
815{
816 return d->m_autoDetectLanguage;
817}
818
819EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const
820{
821 return d->m_source;
822}
823
824const char* EncodingDetector::encoding() const
825{
826 d->m_storeDecoderName = d->m_codec->name();
827 d->m_storeDecoderName = d->m_storeDecoderName.lower().replace( "iso ", "iso-" );
828 return d->m_storeDecoderName.data();
829}
830
831bool EncodingDetector::visuallyOrdered() const
832{
833 return d->m_visualRTL;
834}
835
836// const TQTextCodec* EncodingDetector::codec() const
837// {
838// return d->m_codec;
839// }
840
842{
843 return d->m_decoder;
844}
845
846bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
847{
848 TQTextCodec *codec;
849 TQCString enc(_encoding);
850 if(/*enc.isNull() || */enc.isEmpty())
851 {
852 if (type==DefaultEncoding)
853 codec=d->m_defaultCodec;
854 else
855 return false;
856 }
857 else
858 {
859 //TQString->TQTextCodec
860
861 enc = enc.lower();
862 // hebrew visually ordered
863 if(enc=="visual")
864 enc="iso8859-8";
865 bool b;
866 codec = TDEGlobal::charsets()->codecForName(enc, b);
867 if (!b)
868 return false;
869 }
870
871 if (d->m_codec->mibEnum()==codec->mibEnum())
872 return true;
873
874 if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
875 {
876 //Sometimes the codec specified is absurd, i.e. UTF-16 despite
877 //us decoding a meta tag as ASCII. In that case, ignore it.
878 return false;
879 }
880
881 if (codec->mibEnum() == Mib8859_8)
882 {
883 //We do NOT want to use TQt's TQHebrewCodec, since it tries to reorder itself.
884 codec = TQTextCodec::codecForName("iso8859-8-i");
885
886 // visually ordered unless one of the following
887 if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
888 d->m_visualRTL = true;
889 }
890
891 d->m_codec = codec;
892 d->m_source = type;
893 delete d->m_decoder;
894 d->m_decoder = d->m_codec->makeDecoder();
895#ifdef DECODE_DEBUG
896 kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name();
897#endif
898 return true;
899}
900
901bool EncodingDetector::analyze(const TQByteArray &data)
902{
903 return analyze( data.data(), data.size() );
904}
905
906bool EncodingDetector::analyze(const char *data, int len)
907{
908 // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
909 // maximumBOMLength = 10
910 // Even if the user has chosen utf16 we still need to auto-detect the endianness
911 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
912 {
913 // Extract the first three bytes.
914 const uchar *udata = (const uchar *)data;
915 uchar c1 = *udata++;
916 uchar c2 = *udata++;
917 uchar c3 = *udata++;
918
919 // Check for the BOM
920 const char *autoDetectedEncoding;
921 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
922 {
923 autoDetectedEncoding = "ISO-10646-UCS-2";
924 }
925 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
926 {
927 autoDetectedEncoding = "UTF-8";
928 }
929 else if (c1 == 0x00 || c2 == 0x00)
930 {
931 uchar c4 = *udata++;
932 uchar c5 = *udata++;
933 uchar c6 = *udata++;
934 uchar c7 = *udata++;
935 uchar c8 = *udata++;
936 uchar c9 = *udata++;
937 uchar c10 = *udata++;
938
939 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
940 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
941 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
942 autoDetectedEncoding = "ISO-10646-UCS-2";
943 else
944 autoDetectedEncoding = 0;
945 }
946 else
947 {
948 autoDetectedEncoding = 0;
949 }
950
951 // If we found a BOM, use the encoding it implies.
952 if (autoDetectedEncoding != 0)
953 {
954 d->m_source = BOM;
955 d->m_codec = TQTextCodec::codecForName(autoDetectedEncoding);
956 assert(d->m_codec);
957 //enc = d->m_codec->name();
958 delete d->m_decoder;
959 d->m_decoder = d->m_codec->makeDecoder();
960#ifdef DECODE_DEBUG
961 kWarning() << "Detection by BOM";
962#endif
963 if (is16Bit(d->m_codec) && c2==0x00)
964 {
965 // utf16LE, we need to put the decoder in LE mode
966 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
967 d->m_decoder->toUnicode(reverseUtf16, 2);
968 }
969 return true;
970 }
971 }
972
973 //exit from routine in case it was called to only detect byte order for utf-16
974 if (d->m_source==UserChosenEncoding)
975 {
976#ifdef DECODE_DEBUG
977 kWarning() << "EncodingDetector: UserChosenEncoding exit ";
978#endif
979
980 if (errorsIfUtf8(data, len))
981 setEncoding("",DefaultEncoding);
982 return true;
983 }
984#if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz
985 if (!d->m_seenBody)
986 {
987 // we still don't have an encoding, and are in the head
988 // the following tags are allowed in <head>:
989 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
990 const char *ptr = data;
991 const char *pEnd = data+len;
992
993 while(ptr != pEnd)
994 {
995 if(*ptr!='<')
996 {
997 ++ptr;
998 continue;
999 }
1000 ++ptr;
1001 // Handle comments.
1002 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
1003 {
1004 ptr += 3;
1005 skipComment(ptr, pEnd);
1006 continue;
1007 }
1008
1009 // Handle XML header, which can have encoding in it.
1010 if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
1011 {
1012 const char *end = ptr;
1013 while (*end != '>' && end < pEnd)
1014 end++;
1015 if (*end == '\0' || end == pEnd)
1016 break;
1017 TQCString str(ptr, end - ptr + 1);
1018 int length;
1019 int pos = findXMLEncoding(str, length);
1020 // also handles the case when specified encoding aint correct
1021 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
1022 {
1023 return true;
1024 }
1025 }
1026
1027 //look for <meta>, stop if we reach <body>
1028 while (
1029 !((*ptr >= 'a') && (*ptr <= 'z') ||
1030 (*ptr >= 'A') && (*ptr <= 'Z'))
1031 && ptr < pEnd
1032 )
1033 ++ptr;
1034
1035 char tmp[5];
1036 int length=0;
1037 const char* max=ptr+4;
1038 if (pEnd<max)
1039 max=pEnd;
1040 while (
1041 ((*ptr >= 'a') && (*ptr <= 'z') ||
1042 (*ptr >= 'A') && (*ptr <= 'Z') ||
1043 (*ptr >= '0') && (*ptr <= '9'))
1044 && ptr < max
1045 )
1046 {
1047 tmp[length] = tolower( *ptr );
1048 ++ptr;
1049 ++length;
1050 }
1051 tmp[length] = 0;
1052 if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
1053 {
1054 // found a meta tag...
1055 const char* end = ptr;
1056 while(*end != '>' && *end != '\0' && end<pEnd)
1057 end++;
1058 //if ( *end == '\0' ) break;
1059 TQCString str( ptr, (end-ptr)+1);
1060 str = str.lower();
1061 int pos=0;
1062 //if( (pos = str.find("http-equiv", pos)) == -1) break;
1063 //if( (pos = str.find("content-type", pos)) == -1) break;
1064 if( (pos = str.find("charset")) == -1)
1065 continue;
1066 pos+=6;
1067 // skip to '='
1068 if( (pos = str.find('=', pos)) == -1)
1069 continue;
1070
1071 // skip whitespace before encoding itself
1072 while (pos < (int)str.length() && str[pos] <= ' ')
1073 ++pos;
1074 if ( pos == (int)str.length())
1075 continue;
1076
1077 int endpos = pos;
1078 while( endpos < str.length() &&
1079 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
1080 && str[endpos] != ';' && str[endpos] != '>') )
1081 ++endpos;
1082 #ifdef DECODE_DEBUG
1083 kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
1084 #endif
1085 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
1086 return true;
1087 }
1088 else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
1089 {
1090 d->m_seenBody=true;
1091 break;
1092 }
1093 }
1094 }
1095
1096 if (d->m_source==EncodingFromHTTPHeader)
1097 return true;
1098#endif
1099 //if (len<20) //make a guess even if the file is short -- ahartmetz
1100 if (len < 1)
1101 {
1102 setEncoding("",DefaultEncoding);
1103 return false;
1104 }
1105#ifdef DECODE_DEBUG
1106 kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")";
1107#endif
1108
1109 switch ( d->m_autoDetectLanguage )
1110 {
1111 case EncodingDetector::Arabic:
1112 return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1113// break;
1114 case EncodingDetector::Baltic:
1115 return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1116// break;
1117 case EncodingDetector::CentralEuropean:
1118 return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
1119 break;
1120 case EncodingDetector::Cyrillic:
1121 return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
1122// break;
1123 case EncodingDetector::Greek:
1124 return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
1125// break;
1126 case EncodingDetector::Hebrew:
1127 return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
1128// break;
1129 case EncodingDetector::Japanese:
1130 return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
1131// break;
1132 case EncodingDetector::Turkish:
1133 return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
1134// break;
1135 case EncodingDetector::WesternEuropean:
1136 if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
1137 return true;
1138 else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for tdehtml
1139 {
1140 return setEncoding("iso-8859-15",AutoDetectedEncoding);
1141 }
1142 else //use default provided by eg katepart
1143 {
1144 return setEncoding("",DefaultEncoding);
1145 }
1146// break;
1147 case EncodingDetector::SemiautomaticDetection:
1148 case EncodingDetector::ChineseSimplified:
1149 case EncodingDetector::ChineseTraditional:
1150 case EncodingDetector::Korean:
1151 case EncodingDetector::Thai:
1152 case EncodingDetector::Unicode:
1153 case EncodingDetector::NorthernSaami:
1154 case EncodingDetector::SouthEasternEurope:
1155 case EncodingDetector::None:
1156 // huh. somethings broken in this code ### FIXME
1157 //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
1158 break;
1159 }
1160
1161 setEncoding("",DefaultEncoding);
1162 return true;
1163}
1164
1165
1166EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const TQString& lang)
1167{
1168 if (lang.isEmpty())
1169 return EncodingDetector::None;
1170 else if (lang==i18n("@item Text character set", "Unicode"))
1171 return EncodingDetector::Unicode;
1172 else if (lang==i18n("@item Text character set", "Cyrillic"))
1173 return EncodingDetector::Cyrillic;
1174 else if (lang==i18n("@item Text character set", "Western European"))
1175 return EncodingDetector::WesternEuropean;
1176 else if (lang==i18n("@item Text character set", "Central European"))
1177 return EncodingDetector::CentralEuropean;
1178 else if (lang==i18n("@item Text character set", "Greek"))
1179 return EncodingDetector::Greek;
1180 else if (lang==i18n("@item Text character set", "Hebrew"))
1181 return EncodingDetector::Hebrew;
1182 else if (lang==i18n("@item Text character set", "Turkish"))
1183 return EncodingDetector::Turkish;
1184 else if (lang==i18n("@item Text character set", "Japanese"))
1185 return EncodingDetector::Japanese;
1186 else if (lang==i18n("@item Text character set", "Baltic"))
1187 return EncodingDetector::Baltic;
1188 else if (lang==i18n("@item Text character set", "Arabic"))
1189 return EncodingDetector::Arabic;
1190
1191 return EncodingDetector::None;
1192}
1193
1194bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
1195{
1196 switch (script)
1197 {
1198 case EncodingDetector::Arabic:
1199 return true;
1200 case EncodingDetector::Baltic:
1201 return true;
1202 case EncodingDetector::CentralEuropean:
1203 return true;
1204 case EncodingDetector::Cyrillic:
1205 return true;
1206 case EncodingDetector::Greek:
1207 return true;
1208 case EncodingDetector::Hebrew:
1209 return true;
1210 case EncodingDetector::Japanese:
1211 return true;
1212 case EncodingDetector::Turkish:
1213 return true;
1214 case EncodingDetector::WesternEuropean:
1215 return true;
1216 case EncodingDetector::ChineseTraditional:
1217 return true;
1218 case EncodingDetector::ChineseSimplified:
1219 return true;
1220 case EncodingDetector::Unicode:
1221 return true;
1222 break;
1223 default:
1224 return false;
1225 }
1226}
1227
1228TQString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
1229{
1230 switch (script)
1231 {
1232 case EncodingDetector::Arabic:
1233 return i18n("@item Text character set", "Arabic");
1234 break;
1235 case EncodingDetector::Baltic:
1236 return i18n("@item Text character set", "Baltic");
1237 break;
1238 case EncodingDetector::CentralEuropean:
1239 return i18n("@item Text character set", "Central European");
1240 break;
1241 case EncodingDetector::Cyrillic:
1242 return i18n("@item Text character set", "Cyrillic");
1243 break;
1244 case EncodingDetector::Greek:
1245 return i18n("@item Text character set", "Greek");
1246 break;
1247 case EncodingDetector::Hebrew:
1248 return i18n("@item Text character set", "Hebrew");
1249 break;
1250 case EncodingDetector::Japanese:
1251 return i18n("@item Text character set", "Japanese");
1252 break;
1253 case EncodingDetector::Turkish:
1254 return i18n("@item Text character set", "Turkish");
1255 break;
1256 case EncodingDetector::WesternEuropean:
1257 return i18n("@item Text character set", "Western European");
1258 break;
1259 case EncodingDetector::ChineseTraditional:
1260 return i18n("@item Text character set", "Chinese Traditional");
1261 break;
1262 case EncodingDetector::ChineseSimplified:
1263 return i18n("@item Text character set", "Chinese Simplified");
1264 break;
1265 case EncodingDetector::Korean:
1266 return i18n("@item Text character set", "Korean");
1267 break;
1268 case EncodingDetector::Thai:
1269 return i18n("@item Text character set", "Thai");
1270 break;
1271 case EncodingDetector::Unicode:
1272 return i18n("@item Text character set", "Unicode");
1273 break;
1274 //case EncodingDetector::SemiautomaticDetection:
1275 default:
1276 return TQString();
1277
1278 }
1279}
1280
1281EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const TQString &lc)
1282{
1283 // It might make sense to do something special if the locale ends with
1284 // ".UTF-8" or "@utf8"
1285 const char *langStr = pango_script_for_lang[0].lang;
1286 // There is obvious optimization potential...
1287 for ( int i = 0; langStr; i++ ) {
1288 langStr = pango_script_for_lang[i].lang;
1289 // startsWith() works for empty strings: every string "starts with" an empty string.
1290 if ( lc.startsWith( TQString::fromAscii( langStr ) ) )
1291 return pango_script_for_lang[i].scripts[0];
1292 }
1293 return None;
1294}
1295
1296#undef DECODE_DEBUG
1297
Provides encoding detection capabilities.
bool errorsIfUtf8(const char *data, int length)
Check if we are really utf8.
static AutoDetectScript scriptForName(const TQString &lang)
Takes the language name after it has been i18n()'ed.
EncodingDetector()
Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiau...
const char * encoding() const
Convenience method.
bool setEncoding(const char *encoding, EncodingChoiceSource type)
TQTextDecoder * decoder()
bool analyze(const char *data, int len)
Analyze text data.