kmail

encodingdetector.cpp
1 /*
2  This file was taken from the KDE 4.x libraries and backported to TQt 3.
3 
4  Copyright (C) 1999 Lars Knoll (knoll@kde.org)
5  Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
6  Copyright (C) 2003 Apple Computer, Inc.
7  Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
8 
9  This library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU Library General Public
11  License as published by the Free Software Foundation; either
12  version 2 of the License, or (at your option) any later version.
13 
14  This library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  Library General Public License for more details.
18 
19  You should have received a copy of the GNU Library General Public License
20  along with this library; see the file COPYING.LIB. If not, write to
21  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22  Boston, MA 02110-1301, USA.
23 */
24 //----------------------------------------------------------------------------
25 //
26 // decoder for input stream
27 
28 #include "encodingdetector.h"
29 
30 #undef DECODE_DEBUG
31 //#define DECODE_DEBUG
32 
// Buffer size limit of 16 KiB. The value is parenthesized so the macro behaves
// as a single operand inside larger expressions (e.g. `x % MAX_BUFFER`).
// NOTE(review): the code using this limit is not visible in this part of the file.
#define MAX_BUFFER (16*1024)
34 
35 #include <assert.h>
36 #include <stdlib.h>
37 
38 #include "encodingdetector_ja_p.h"
39 
40 #include <tqregexp.h>
41 #include <tqtextcodec.h>
42 
43 #include <tdeglobal.h>
44 #include <kcharsets.h>
45 #include <kdebug.h>
46 #include <tdelocale.h>
47 
48 #include <ctype.h>
49 
50 // The following table was taken from libpango 1.19.3 and slightly modified.
51 // Multiple scripts per language were removed and the entries were reordered so
52 // that simple substring matching will work. For example, bam was put before ba
53 // so that the first match will be likely the right match. Otherwise "ba" would
54 // match "bam" but we would have to search on to find "bam" which is what we want.
55 // The original file is called pango-script-lang-table.h
56 
57 /* pango-script-lang-table.h:
58  *
59  * Generated by gen-script-for-lang-new.c
60  * Date: 2007-10-26
61  * Source: fontconfig-2.4.91
62  *
63  * Do not edit. // I did. Sue me ;)
64  */
65 typedef struct _PangoScriptForLang {
66  const char lang[6];
67  EncodingDetector::AutoDetectScript scripts[1];
68 } PangoScriptForLang;
69 
70 //Unfortunately EncodingDetector does not know all scripts that Pango knows.
71 //Also, using EncodingDetector::CentralEuropean for the appropriate countries
72 //might give better results in some cases.
73 //One especially important (many speakers/literates) omission is the lack of
74 //Indian scripts.
75 
// Pango script names that have no counterpart among the
// EncodingDetector::AutoDetectScript values used in this file. Table rows
// written with these names therefore resolve to EncodingDetector::None.
#define PANGO_SCRIPT_ARMENIAN EncodingDetector::None
#define PANGO_SCRIPT_BENGALI EncodingDetector::None
#define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None
#define PANGO_SCRIPT_CHEROKEE EncodingDetector::None
#define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None
#define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None
#define PANGO_SCRIPT_GUJARATI EncodingDetector::None
#define PANGO_SCRIPT_GURMUKHI EncodingDetector::None
#define PANGO_SCRIPT_KANNADA EncodingDetector::None
#define PANGO_SCRIPT_KHMER EncodingDetector::None
#define PANGO_SCRIPT_LAO EncodingDetector::None
#define PANGO_SCRIPT_MALAYALAM EncodingDetector::None
#define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None
#define PANGO_SCRIPT_MYANMAR EncodingDetector::None
#define PANGO_SCRIPT_ORIYA EncodingDetector::None
#define PANGO_SCRIPT_SINHALA EncodingDetector::None
#define PANGO_SCRIPT_SYRIAC EncodingDetector::None
#define PANGO_SCRIPT_TAGALOG EncodingDetector::None
#define PANGO_SCRIPT_TAMIL EncodingDetector::None
#define PANGO_SCRIPT_TIBETAN EncodingDetector::None
#define PANGO_SCRIPT_TELUGU EncodingDetector::None

// Instead of changing the table even more, the remaining Pango script names
// are aliased to the EncodingDetector script that stands in for them.
// Note that Georgian is folded into SouthEasternEurope and Latin into
// WesternEuropean.
#define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic
#define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic
#define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope
#define PANGO_SCRIPT_GREEK EncodingDetector::Greek
#define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew
#define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean
#define PANGO_SCRIPT_THAI EncodingDetector::Thai
106 
107 
108 static const PangoScriptForLang pango_script_for_lang[] = {
109  { "aa", { PANGO_SCRIPT_LATIN/*62*/ } },
110  { "ab", { PANGO_SCRIPT_CYRILLIC/*90*/ } },
111  { "af", { PANGO_SCRIPT_LATIN/*69*/ } },
112  { "am", { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
113  { "ar", { PANGO_SCRIPT_ARABIC/*125*/ } },
114  { "as", { PANGO_SCRIPT_BENGALI/*89*/ } },
115  { "ast", { PANGO_SCRIPT_LATIN/*66*/ } },
116  { "ava", { PANGO_SCRIPT_CYRILLIC/*67*/ } },
117  { "ay", { PANGO_SCRIPT_LATIN/*60*/ } },
118  { "az-ir", { PANGO_SCRIPT_ARABIC/*129*/ } },
119  { "az", { PANGO_SCRIPT_CYRILLIC/*80*/ } }, //, PANGO_SCRIPT_LATIN/*68*/ } },
120  { "bam", { PANGO_SCRIPT_LATIN/*60*/ } },
121  { "ba", { PANGO_SCRIPT_CYRILLIC/*82*/ } },
122  { "be", { PANGO_SCRIPT_CYRILLIC/*68*/ } },
123  { "bg", { PANGO_SCRIPT_CYRILLIC/*60*/ } },
124  { "bh", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
125  { "bho", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
126  { "bi", { PANGO_SCRIPT_LATIN/*58*/ } },
127  { "bin", { PANGO_SCRIPT_LATIN/*76*/ } },
128  { "bn", { PANGO_SCRIPT_BENGALI/*89*/ } },
129  { "bo", { PANGO_SCRIPT_TIBETAN/*95*/ } },
130  { "br", { PANGO_SCRIPT_LATIN/*64*/ } },
131  { "bs", { PANGO_SCRIPT_LATIN/*62*/ } },
132  { "bua", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
133  { "ca", { PANGO_SCRIPT_LATIN/*74*/ } },
134  { "ce", { PANGO_SCRIPT_CYRILLIC/*67*/ } },
135  { "chm", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
136  { "chr", { PANGO_SCRIPT_CHEROKEE/*85*/ } },
137  { "ch", { PANGO_SCRIPT_LATIN/*58*/ } },
138  { "co", { PANGO_SCRIPT_LATIN/*84*/ } },
139  { "cs", { PANGO_SCRIPT_LATIN/*82*/ } },
140  { "cu", { PANGO_SCRIPT_CYRILLIC/*103*/ } },
141  { "cv", { PANGO_SCRIPT_CYRILLIC/*72*/ } }, //, PANGO_SCRIPT_LATIN/*2*/ } },
142  { "cy", { PANGO_SCRIPT_LATIN/*78*/ } },
143  { "da", { PANGO_SCRIPT_LATIN/*70*/ } },
144  { "de", { PANGO_SCRIPT_LATIN/*59*/ } },
145  { "dz", { PANGO_SCRIPT_TIBETAN/*95*/ } },
146  { "el", { PANGO_SCRIPT_GREEK/*69*/ } },
147  { "en", { PANGO_SCRIPT_LATIN/*72*/ } },
148  { "eo", { PANGO_SCRIPT_LATIN/*64*/ } },
149  { "es", { PANGO_SCRIPT_LATIN/*66*/ } },
150 // { "et", { PANGO_SCRIPT_LATIN/*64*/ } },
151  { "et", { EncodingDetector::Baltic } },
152  { "eu", { PANGO_SCRIPT_LATIN/*56*/ } },
153  { "fa", { PANGO_SCRIPT_ARABIC/*129*/ } },
154  { "fi", { PANGO_SCRIPT_LATIN/*62*/ } },
155  { "fj", { PANGO_SCRIPT_LATIN/*52*/ } },
156  { "fo", { PANGO_SCRIPT_LATIN/*68*/ } },
157  { "fr", { PANGO_SCRIPT_LATIN/*84*/ } },
158  { "ful", { PANGO_SCRIPT_LATIN/*62*/ } },
159  { "fur", { PANGO_SCRIPT_LATIN/*66*/ } },
160  { "fy", { PANGO_SCRIPT_LATIN/*75*/ } },
161  { "ga", { PANGO_SCRIPT_LATIN/*80*/ } },
162  { "gd", { PANGO_SCRIPT_LATIN/*70*/ } },
163  { "gez", { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
164  { "gl", { PANGO_SCRIPT_LATIN/*66*/ } },
165  { "gn", { PANGO_SCRIPT_LATIN/*70*/ } },
166  { "gu", { PANGO_SCRIPT_GUJARATI/*78*/ } },
167  { "gv", { PANGO_SCRIPT_LATIN/*54*/ } },
168  { "ha", { PANGO_SCRIPT_LATIN/*60*/ } },
169  { "haw", { PANGO_SCRIPT_LATIN/*62*/ } },
170  { "he", { PANGO_SCRIPT_HEBREW/*27*/ } },
171  { "hi", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
172  { "ho", { PANGO_SCRIPT_LATIN/*52*/ } },
173  { "hr", { PANGO_SCRIPT_LATIN/*62*/ } },
174  { "hu", { PANGO_SCRIPT_LATIN/*70*/ } },
175  { "hy", { PANGO_SCRIPT_ARMENIAN/*77*/ } },
176  { "ia", { PANGO_SCRIPT_LATIN/*52*/ } },
177  { "ibo", { PANGO_SCRIPT_LATIN/*58*/ } },
178  { "id", { PANGO_SCRIPT_LATIN/*54*/ } },
179  { "ie", { PANGO_SCRIPT_LATIN/*52*/ } },
180  { "ik", { PANGO_SCRIPT_CYRILLIC/*68*/ } },
181  { "io", { PANGO_SCRIPT_LATIN/*52*/ } },
182  { "is", { PANGO_SCRIPT_LATIN/*70*/ } },
183  { "it", { PANGO_SCRIPT_LATIN/*72*/ } },
184  { "iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL/*161*/ } },
185 // { "ja", { PANGO_SCRIPT_HAN/*6356*/, PANGO_SCRIPT_KATAKANA/*88*/, PANGO_SCRIPT_HIRAGANA/*85*/ } },
186  { "ja", { EncodingDetector::Japanese } },
187  { "kaa", { PANGO_SCRIPT_CYRILLIC/*78*/ } },
188  { "ka", { PANGO_SCRIPT_GEORGIAN/*33*/ } },
189  { "ki", { PANGO_SCRIPT_LATIN/*56*/ } },
190  { "kk", { PANGO_SCRIPT_CYRILLIC/*77*/ } },
191  { "kl", { PANGO_SCRIPT_LATIN/*81*/ } },
192  { "km", { PANGO_SCRIPT_KHMER/*70*/ } },
193  { "kn", { PANGO_SCRIPT_KANNADA/*80*/ } },
194 // { "ko", { PANGO_SCRIPT_HANGUL/*2443*/ } },
195  { "ko", { EncodingDetector::Korean } },
196  { "kok", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
197  { "ks", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
198  { "ku-ir", { PANGO_SCRIPT_ARABIC/*32*/ } },
199  { "ku", { PANGO_SCRIPT_CYRILLIC/*60*/ } }, //, PANGO_SCRIPT_LATIN/*4*/ } },
200  { "kum", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
201  { "kv", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
202  { "kw", { PANGO_SCRIPT_LATIN/*64*/ } },
203  { "ky", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
204  { "la", { PANGO_SCRIPT_LATIN/*68*/ } },
205  { "lb", { PANGO_SCRIPT_LATIN/*75*/ } },
206  { "lez", { PANGO_SCRIPT_CYRILLIC/*67*/ } },
207  { "ln", { PANGO_SCRIPT_LATIN/*78*/ } },
208  { "lo", { PANGO_SCRIPT_LAO/*65*/ } },
209 // { "lt", { PANGO_SCRIPT_LATIN/*70*/ } },
210  { "lt", { EncodingDetector::Baltic } },
211 // { "lv", { PANGO_SCRIPT_LATIN/*78*/ } },
212  { "lv", { EncodingDetector::Baltic } },
213  { "mg", { PANGO_SCRIPT_LATIN/*56*/ } },
214  { "mh", { PANGO_SCRIPT_LATIN/*62*/ } },
215  { "mi", { PANGO_SCRIPT_LATIN/*64*/ } },
216  { "mk", { PANGO_SCRIPT_CYRILLIC/*42*/ } },
217  { "ml", { PANGO_SCRIPT_MALAYALAM/*78*/ } },
218  { "mn", { PANGO_SCRIPT_MONGOLIAN/*130*/ } },
219  { "mo", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, //, PANGO_SCRIPT_LATIN/*62*/ } },
220  { "mr", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
221  { "mt", { PANGO_SCRIPT_LATIN/*72*/ } },
222  { "my", { PANGO_SCRIPT_MYANMAR/*48*/ } },
223  { "nb", { PANGO_SCRIPT_LATIN/*70*/ } },
224  { "nds", { PANGO_SCRIPT_LATIN/*59*/ } },
225  { "ne", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
226  { "nl", { PANGO_SCRIPT_LATIN/*82*/ } },
227  { "nn", { PANGO_SCRIPT_LATIN/*76*/ } },
228  { "no", { PANGO_SCRIPT_LATIN/*70*/ } },
229  { "nr", { PANGO_SCRIPT_LATIN/*52*/ } },
230  { "nso", { PANGO_SCRIPT_LATIN/*58*/ } },
231  { "ny", { PANGO_SCRIPT_LATIN/*54*/ } },
232  { "oc", { PANGO_SCRIPT_LATIN/*70*/ } },
233  { "om", { PANGO_SCRIPT_LATIN/*52*/ } },
234  { "or", { PANGO_SCRIPT_ORIYA/*79*/ } },
235  { "os", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
236  { "pa", { PANGO_SCRIPT_GURMUKHI/*63*/ } },
237  { "pl", { PANGO_SCRIPT_LATIN/*70*/ } },
238  { "ps-af", { PANGO_SCRIPT_ARABIC/*49*/ } },
239  { "ps-pk", { PANGO_SCRIPT_ARABIC/*49*/ } },
240  { "pt", { PANGO_SCRIPT_LATIN/*82*/ } },
241  { "rm", { PANGO_SCRIPT_LATIN/*66*/ } },
242  { "ro", { PANGO_SCRIPT_LATIN/*62*/ } },
243  { "ru", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
244  { "sah", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
245  { "sa", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
246  { "sco", { PANGO_SCRIPT_LATIN/*56*/ } },
247  { "sel", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
248  { "se", { PANGO_SCRIPT_LATIN/*66*/ } },
249  { "sh", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
250  { "si", { PANGO_SCRIPT_SINHALA/*77*/ } },
251  { "sk", { PANGO_SCRIPT_LATIN/*86*/ } },
252  { "sl", { PANGO_SCRIPT_LATIN/*62*/ } },
253  { "sma", { PANGO_SCRIPT_LATIN/*60*/ } },
254  { "smj", { PANGO_SCRIPT_LATIN/*60*/ } },
255  { "smn", { PANGO_SCRIPT_LATIN/*68*/ } },
256  { "sms", { PANGO_SCRIPT_LATIN/*80*/ } },
257  { "sm", { PANGO_SCRIPT_LATIN/*52*/ } },
258  { "so", { PANGO_SCRIPT_LATIN/*52*/ } },
259  { "sq", { PANGO_SCRIPT_LATIN/*56*/ } },
260  { "sr", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
261  { "ss", { PANGO_SCRIPT_LATIN/*52*/ } },
262  { "st", { PANGO_SCRIPT_LATIN/*52*/ } },
263  { "sv", { PANGO_SCRIPT_LATIN/*68*/ } },
264  { "sw", { PANGO_SCRIPT_LATIN/*52*/ } },
265  { "syr", { PANGO_SCRIPT_SYRIAC/*45*/ } },
266  { "ta", { PANGO_SCRIPT_TAMIL/*48*/ } },
267  { "te", { PANGO_SCRIPT_TELUGU/*80*/ } },
268  { "tg", { PANGO_SCRIPT_CYRILLIC/*78*/ } },
269  { "th", { PANGO_SCRIPT_THAI/*86*/ } },
270  { "ti-er", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
271  { "ti-et", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
272  { "tig", { PANGO_SCRIPT_ETHIOPIC/*221*/ } },
273  { "tk", { PANGO_SCRIPT_CYRILLIC/*74*/ } },
274  { "tl", { PANGO_SCRIPT_TAGALOG/*19*/ } },
275  { "tn", { PANGO_SCRIPT_LATIN/*58*/ } },
276  { "to", { PANGO_SCRIPT_LATIN/*52*/ } },
277 // { "tr", { PANGO_SCRIPT_LATIN/*70*/ } },
278  { "tr", { EncodingDetector::Turkish } },
279  { "ts", { PANGO_SCRIPT_LATIN/*52*/ } },
280  { "tt", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
281  { "tw", { PANGO_SCRIPT_LATIN/*70*/ } },
282  { "tyv", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
283  { "ug", { PANGO_SCRIPT_ARABIC/*125*/ } },
284  { "uk", { PANGO_SCRIPT_CYRILLIC/*72*/ } },
285  { "ur", { PANGO_SCRIPT_ARABIC/*145*/ } },
286  { "uz", { PANGO_SCRIPT_CYRILLIC/*68*/ } },
287  { "ven", { PANGO_SCRIPT_LATIN/*62*/ } },
288  { "vi", { PANGO_SCRIPT_LATIN/*186*/ } },
289  { "vot", { PANGO_SCRIPT_LATIN/*62*/ } },
290  { "vo", { PANGO_SCRIPT_LATIN/*54*/ } },
291  { "wa", { PANGO_SCRIPT_LATIN/*70*/ } },
292  { "wen", { PANGO_SCRIPT_LATIN/*76*/ } },
293  { "wo", { PANGO_SCRIPT_LATIN/*66*/ } },
294  { "xh", { PANGO_SCRIPT_LATIN/*52*/ } },
295  { "yap", { PANGO_SCRIPT_LATIN/*58*/ } },
296  { "yi", { PANGO_SCRIPT_HEBREW/*27*/ } },
297  { "yo", { PANGO_SCRIPT_LATIN/*114*/ } },
298 // { "zh-cn", { PANGO_SCRIPT_HAN/*6763*/ } },
299  { "zh-cn", { EncodingDetector::ChineseSimplified } },
300 // { "zh-hk", { PANGO_SCRIPT_HAN/*2213*/ } },
301  { "zh-hk", { EncodingDetector::ChineseTraditional } },
302 // { "zh-mo", { PANGO_SCRIPT_HAN/*2213*/ } },
303  { "zh-mo", { EncodingDetector::ChineseTraditional } },
304 // { "zh-sg", { PANGO_SCRIPT_HAN/*6763*/ } },
305  { "zh-sg", { EncodingDetector::ChineseSimplified } },
306 // { "zh-tw", { PANGO_SCRIPT_HAN/*13063*/ } },
307  { "zh-tw", { EncodingDetector::ChineseTraditional } },
308  { "zu", { PANGO_SCRIPT_LATIN/*52*/ } },
309  { "\x00", { EncodingDetector::None } } //end mark
310 };
311 
// MIBenum numbers of the codecs this file treats specially; they are compared
// against the value returned by TQTextCodec::mibEnum(). Listed by value.
enum MIB
{
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
322 
323 static bool is16Bit(TQTextCodec* codec)
324 {
325  switch (codec->mibEnum())
326  {
327  case MibUtf16:
328  case MibUtf16BE:
329  case MibUtf16LE:
330  case MibUcs2:
331  return true;
332  default:
333  return false;
334  }
335 }
336 
337 class EncodingDetectorPrivate
338 {
339 public:
340  TQTextCodec *m_codec;
341  TQTextDecoder *m_decoder; // utf16
342  TQTextCodec *m_defaultCodec;
343  TQCString m_storeDecoderName;
344 
345  EncodingDetector::EncodingChoiceSource m_source;
346  EncodingDetector::AutoDetectScript m_autoDetectLanguage;
347 
348  bool m_visualRTL : 1;
349  bool m_seenBody : 1;
350  bool m_writtingHappened : 1;
351  bool m_analyzeCalled : 1; //for decode()
352  int m_multiByte;
353 
354  TQCString m_bufferForDefferedEncDetection;
355 
356  EncodingDetectorPrivate()
357  : m_codec(TQTextCodec::codecForMib(MibLatin1))
358  , m_decoder(m_codec->makeDecoder())
359  , m_defaultCodec(m_codec)
360  , m_source(EncodingDetector::DefaultEncoding)
361  , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection)
362  , m_visualRTL(false)
363  , m_seenBody(false)
364  , m_writtingHappened(false)
365  , m_analyzeCalled(false)
366  , m_multiByte(0)
367  {
368  }
369 
370  EncodingDetectorPrivate(TQTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
371  : m_codec(codec)
372  , m_decoder(m_codec->makeDecoder())
373  , m_defaultCodec(m_codec)
374  , m_source(source)
375  , m_autoDetectLanguage(script)
376  , m_visualRTL(false)
377  , m_seenBody(false)
378  , m_writtingHappened(false)
379  , m_analyzeCalled(false)
380  , m_multiByte(0)
381  {
382  }
383 
384  ~EncodingDetectorPrivate()
385  {
386  delete m_decoder;
387  }
388 };
389 
390 
391 static TQCString automaticDetectionForArabic( const unsigned char* ptr, int size )
392 {
393  for ( int i = 0; i < size; ++i ) {
394  if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
395  || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
396  || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
397  || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
398  return "cp1256";
399  }
400  }
401 
402  return "iso-8859-6";
403 }
404 
405 static TQCString automaticDetectionForBaltic( const unsigned char* ptr, int size )
406 {
407  for ( int i = 0; i < size; ++i ) {
408  if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
409  return "cp1257";
410 
411  if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
412  return "iso-8859-13";
413  }
414 
415  return "iso-8859-13";
416 }
417 
418 static TQCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
419 {
420  TQCString charset;
421  for ( int i = 0; i < size; ++i ) {
422  if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
423  if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
424  return "ibm852";
425 
426  if ( i + 1 > size )
427  return "cp1250";
428  else { // maybe ibm852 ?
429  charset = "cp1250";
430  continue;
431  }
432  }
433  if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
434  if ( i + 1 > size )
435  return "iso-8859-2";
436  else { // maybe ibm852 ?
437  if ( charset.isNull() )
438  charset = "iso-8859-2";
439  continue;
440  }
441  }
442  }
443 
444  if ( charset.isNull() )
445  charset = "iso-8859-3";
446 
447  return charset.data();
448 }
449 
450 static TQCString automaticDetectionForCyrillic( const unsigned char* ptr, int size)
451 {
452 #ifdef DECODE_DEBUG
453  kWarning() << "EncodingDetector: Cyr heuristics";
454 #endif
455 
456 // if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
457 // return "utf8";
458  int utf8_mark=0;
459  int koi_score=0;
460  int cp1251_score=0;
461 
462  int koi_st=0;
463  int cp1251_st=0;
464 
465 // int koi_na=0;
466 // int cp1251_na=0;
467 
468  int koi_o_capital=0;
469  int koi_o=0;
470  int cp1251_o_capital=0;
471  int cp1251_o=0;
472 
473  int koi_a_capital=0;
474  int koi_a=0;
475  int cp1251_a_capital=0;
476  int cp1251_a=0;
477 
478  int koi_s_capital=0;
479  int koi_s=0;
480  int cp1251_s_capital=0;
481  int cp1251_s=0;
482 
483  int koi_i_capital=0;
484  int koi_i=0;
485  int cp1251_i_capital=0;
486  int cp1251_i=0;
487 
488  int cp1251_small_range=0;
489  int koi_small_range=0;
490  int ibm866_small_range=0;
491 
492  int i;
493  for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
494  {
495  if (ptr[i]>0xdf)
496  {
497  ++cp1251_small_range;
498 
499  if (ptr[i]==0xee)//small o
500  ++cp1251_o;
501  else if (ptr[i]==0xe0)//small a
502  ++cp1251_a;
503  else if (ptr[i]==0xe8)//small i
504  ++cp1251_i;
505  else if (ptr[i]==0xf1)//small s
506  ++cp1251_s;
507  else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st
508  ++cp1251_st;
509 
510  else if (ptr[i]==0xef)
511  ++koi_o_capital;
512  else if (ptr[i]==0xe1)
513  ++koi_a_capital;
514  else if (ptr[i]==0xe9)
515  ++koi_i_capital;
516  else if (ptr[i]==0xf3)
517  ++koi_s_capital;
518 
519  }
520  else if (ptr[i]>0xbf)
521  {
522  ++koi_small_range;
523 
524  if (ptr[i]==0xd0||ptr[i]==0xd1)//small o
525  ++utf8_mark;
526  else if (ptr[i]==0xcf)//small o
527  ++koi_o;
528  else if (ptr[i]==0xc1)//small a
529  ++koi_a;
530  else if (ptr[i]==0xc9)//small i
531  ++koi_i;
532  else if (ptr[i]==0xd3)//small s
533  ++koi_s;
534  else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st
535  ++koi_st;
536 
537  else if (ptr[i]==0xce)
538  ++cp1251_o_capital;
539  else if (ptr[i]==0xc0)
540  ++cp1251_a_capital;
541  else if (ptr[i]==0xc8)
542  ++cp1251_i_capital;
543  else if (ptr[i]==0xd1)
544  ++cp1251_s_capital;
545  }
546  else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60%
547  ++ibm866_small_range;
548 
549  }
550 
551  //cannot decide?
552  if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
553  {
554  return "";
555  }
556 
557  if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
558  {
559 #ifdef DECODE_DEBUG
560  kWarning() << "Cyr Enc Detection: UTF8";
561 #endif
562  return "UTF-8";
563  }
564 
565  if (ibm866_small_range>cp1251_small_range+koi_small_range)
566  return "ibm866";
567 
568 // TQCString koi_string = "koi8-u";
569 // TQCString cp1251_string = "cp1251";
570 
571  if (cp1251_st==0 && koi_st>1)
572  koi_score+=10;
573  else if (koi_st==0 && cp1251_st>1)
574  cp1251_score+=10;
575 
576  if (cp1251_st && koi_st)
577  {
578  if (cp1251_st/koi_st>2)
579  cp1251_score+=20;
580  else if (koi_st/cp1251_st>2)
581  koi_score+=20;
582  }
583 
584  if (cp1251_a>koi_a)
585  cp1251_score+=10;
586  else if (cp1251_a || koi_a)
587  koi_score+=10;
588 
589  if (cp1251_o>koi_o)
590  cp1251_score+=10;
591  else if (cp1251_o || koi_o)
592  koi_score+=10;
593 
594  if (cp1251_i>koi_i)
595  cp1251_score+=10;
596  else if (cp1251_i || koi_i)
597  koi_score+=10;
598 
599  if (cp1251_s>koi_s)
600  cp1251_score+=10;
601  else if (cp1251_s || koi_s)
602  koi_score+=10;
603 
604  if (cp1251_a_capital>koi_a_capital)
605  cp1251_score+=9;
606  else if (cp1251_a_capital || koi_a_capital)
607  koi_score+=9;
608 
609  if (cp1251_o_capital>koi_o_capital)
610  cp1251_score+=9;
611  else if (cp1251_o_capital || koi_o_capital)
612  koi_score+=9;
613 
614  if (cp1251_i_capital>koi_i_capital)
615  cp1251_score+=9;
616  else if (cp1251_i_capital || koi_i_capital)
617  koi_score+=9;
618 
619  if (cp1251_s_capital>koi_s_capital)
620  cp1251_score+=9;
621  else if (cp1251_s_capital || koi_s_capital)
622  koi_score+=9;
623 #ifdef DECODE_DEBUG
624  kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
625 #endif
626  if (abs(koi_score-cp1251_score)<10)
627  {
628  //fallback...
629  cp1251_score=cp1251_small_range;
630  koi_score=koi_small_range;
631  }
632  if (cp1251_score>koi_score)
633  return "cp1251";
634  else
635  return "koi8-u";
636 
637 
638 // if (cp1251_score>koi_score)
639 // setEncoding("cp1251",AutoDetectedEncoding);
640 // else
641 // setEncoding("koi8-u",AutoDetectedEncoding);
642 // return true;
643 
644 }
645 
646 static TQCString automaticDetectionForGreek( const unsigned char* ptr, int size )
647 {
648  for ( int i = 0; i < size; ++i ) {
649  if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
650  || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
651  || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
652  return "cp1253";
653  }
654  }
655 
656  return "iso-8859-7";
657 }
658 
659 static TQCString automaticDetectionForHebrew( const unsigned char* ptr, int size )
660 {
661  for ( int i = 0; i < size; ++i ) {
662  if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
663  || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
664  || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
665  return "cp1255";
666  }
667 
668  if ( ptr[ i ] == 0xDF )
669  return "iso-8859-8-i";
670  }
671 
672  return "iso-8859-8-i";
673 }
674 
675 static TQCString automaticDetectionForJapanese( const unsigned char* ptr, int size )
676 {
677  JapaneseCode kc;
678 
679  switch ( kc.guess_jp( (const char*)ptr, size ) ) {
680  case JapaneseCode::JIS:
681  return "jis7";
682  case JapaneseCode::EUC:
683  return "eucjp";
684  case JapaneseCode::SJIS:
685  return "sjis";
686  case JapaneseCode::UTF8:
687  return "utf8";
688  default:
689  break;
690  }
691 
692  return "";
693 }
694 
695 static TQCString automaticDetectionForTurkish( const unsigned char* ptr, int size )
696 {
697  for ( int i = 0; i < size; ++i ) {
698  if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
699  return "cp1254";
700  }
701  }
702 
703  return "iso-8859-9";
704 }
705 
706 static TQCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
707 {
708  uint nonansi_count=0;
709  for (int i=0; i<size; ++i)
710  {
711  if (ptr[i]>0x79)
712  {
713  ++nonansi_count;
714  if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
715  {
716  return "UTF-8";
717  }
718  if (ptr[i] >= 0x78 && ptr[i] <= 0x9 )
719  {
720  return "cp1252";
721  }
722  }
723 
724  }
725 
726  if (nonansi_count>0)
727  return "iso-8859-15";
728 
729  return "";
730 }
731 
732 bool EncodingDetector::errorsIfUtf8 (const char* data, int length)
733 {
734  if (d->m_codec->mibEnum()!=MibUtf8)
735  return false; //means no errors
736 // #define highest1Bits (unsigned char)0x80
737 // #define highest2Bits (unsigned char)0xC0
738 // #define highest3Bits (unsigned char)0xE0
739 // #define highest4Bits (unsigned char)0xF0
740 // #define highest5Bits (unsigned char)0xF8
741 static const unsigned char highest1Bits = 0x80;
742 static const unsigned char highest2Bits = 0xC0;
743 static const unsigned char highest3Bits = 0xE0;
744 static const unsigned char highest4Bits = 0xF0;
745 static const unsigned char highest5Bits = 0xF8;
746 
747  for (int i=0; i<length; ++i)
748  {
749  unsigned char c = data[i];
750 
751  if (d->m_multiByte>0)
752  {
753  if ((c & highest2Bits) == 0x80)
754  {
755  --(d->m_multiByte);
756  continue;
757  }
758 #ifdef DECODE_DEBUG
759  kWarning() << "EncDetector: Broken UTF8";
760 #endif
761  return true;
762  }
763 
764  // most significant bit zero, single char
765  if ((c & highest1Bits) == 0x00)
766  continue;
767 
768  // 110xxxxx => init 1 following bytes
769  if ((c & highest3Bits) == 0xC0)
770  {
771  d->m_multiByte = 1;
772  continue;
773  }
774 
775  // 1110xxxx => init 2 following bytes
776  if ((c & highest4Bits) == 0xE0)
777  {
778  d->m_multiByte = 2;
779  continue;
780  }
781 
782  // 11110xxx => init 3 following bytes
783  if ((c & highest5Bits) == 0xF0)
784  {
785  d->m_multiByte = 3;
786  continue;
787  }
788 #ifdef DECODE_DEBUG
789  kWarning() << "EncDetector:_Broken UTF8";
790 #endif
791  return true;
792  }
793  return false;
794 }
795 
796 EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate)
797 {
798 }
799 
800 EncodingDetector::EncodingDetector(TQTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
801  d(new EncodingDetectorPrivate(codec,source,script))
802 {
803 }
804 
805 EncodingDetector::~EncodingDetector()
806 {
807  delete d;
808 }
809 
810 void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
811 {
812  d->m_autoDetectLanguage=lang;
813 }
814 EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const
815 {
816  return d->m_autoDetectLanguage;
817 }
818 
819 EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const
820 {
821  return d->m_source;
822 }
823 
824 const char* EncodingDetector::encoding() const
825 {
826  d->m_storeDecoderName = d->m_codec->name();
827  d->m_storeDecoderName = d->m_storeDecoderName.lower().replace( "iso ", "iso-" );
828  return d->m_storeDecoderName.data();
829 }
830 
831 bool EncodingDetector::visuallyOrdered() const
832 {
833  return d->m_visualRTL;
834 }
835 
836 // const TQTextCodec* EncodingDetector::codec() const
837 // {
838 // return d->m_codec;
839 // }
840 
842 {
843  return d->m_decoder;
844 }
845 
846 bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
847 {
848  TQTextCodec *codec;
849  TQCString enc(_encoding);
850  if(/*enc.isNull() || */enc.isEmpty())
851  {
852  if (type==DefaultEncoding)
853  codec=d->m_defaultCodec;
854  else
855  return false;
856  }
857  else
858  {
859  //TQString->TQTextCodec
860 
861  enc = enc.lower();
862  // hebrew visually ordered
863  if(enc=="visual")
864  enc="iso8859-8";
865  bool b;
866  codec = TDEGlobal::charsets()->codecForName(enc, b);
867  if (!b)
868  return false;
869  }
870 
871  if (d->m_codec->mibEnum()==codec->mibEnum())
872  return true;
873 
874  if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
875  {
876  //Sometimes the codec specified is absurd, i.e. UTF-16 despite
877  //us decoding a meta tag as ASCII. In that case, ignore it.
878  return false;
879  }
880 
881  if (codec->mibEnum() == Mib8859_8)
882  {
883  //We do NOT want to use TQt's TQHebrewCodec, since it tries to reorder itself.
884  codec = TQTextCodec::codecForName("iso8859-8-i");
885 
886  // visually ordered unless one of the following
887  if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
888  d->m_visualRTL = true;
889  }
890 
891  d->m_codec = codec;
892  d->m_source = type;
893  delete d->m_decoder;
894  d->m_decoder = d->m_codec->makeDecoder();
895 #ifdef DECODE_DEBUG
896  kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name();
897 #endif
898  return true;
899 }
900 
901 bool EncodingDetector::analyze(const TQByteArray &data)
902 {
903  return analyze( data.data(), data.size() );
904 }
905 
906 bool EncodingDetector::analyze(const char *data, int len)
907 {
908  // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
909  // maximumBOMLength = 10
910  // Even if the user has chosen utf16 we still need to auto-detect the endianness
911  if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
912  {
913  // Extract the first three bytes.
914  const uchar *udata = (const uchar *)data;
915  uchar c1 = *udata++;
916  uchar c2 = *udata++;
917  uchar c3 = *udata++;
918 
919  // Check for the BOM
920  const char *autoDetectedEncoding;
921  if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
922  {
923  autoDetectedEncoding = "ISO-10646-UCS-2";
924  }
925  else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
926  {
927  autoDetectedEncoding = "UTF-8";
928  }
929  else if (c1 == 0x00 || c2 == 0x00)
930  {
931  uchar c4 = *udata++;
932  uchar c5 = *udata++;
933  uchar c6 = *udata++;
934  uchar c7 = *udata++;
935  uchar c8 = *udata++;
936  uchar c9 = *udata++;
937  uchar c10 = *udata++;
938 
939  int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
940  int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
941  if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
942  autoDetectedEncoding = "ISO-10646-UCS-2";
943  else
944  autoDetectedEncoding = 0;
945  }
946  else
947  {
948  autoDetectedEncoding = 0;
949  }
950 
951  // If we found a BOM, use the encoding it implies.
952  if (autoDetectedEncoding != 0)
953  {
954  d->m_source = BOM;
955  d->m_codec = TQTextCodec::codecForName(autoDetectedEncoding);
956  assert(d->m_codec);
957  //enc = d->m_codec->name();
958  delete d->m_decoder;
959  d->m_decoder = d->m_codec->makeDecoder();
960 #ifdef DECODE_DEBUG
961  kWarning() << "Detection by BOM";
962 #endif
963  if (is16Bit(d->m_codec) && c2==0x00)
964  {
965  // utf16LE, we need to put the decoder in LE mode
966  char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
967  d->m_decoder->toUnicode(reverseUtf16, 2);
968  }
969  return true;
970  }
971  }
972 
973  //exit from routine in case it was called to only detect byte order for utf-16
974  if (d->m_source==UserChosenEncoding)
975  {
976 #ifdef DECODE_DEBUG
977  kWarning() << "EncodingDetector: UserChosenEncoding exit ";
978 #endif
979 
980  if (errorsIfUtf8(data, len))
981  setEncoding("",DefaultEncoding);
982  return true;
983  }
984 #if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz
985  if (!d->m_seenBody)
986  {
987  // we still don't have an encoding, and are in the head
988  // the following tags are allowed in <head>:
989  // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
990  const char *ptr = data;
991  const char *pEnd = data+len;
992 
993  while(ptr != pEnd)
994  {
995  if(*ptr!='<')
996  {
997  ++ptr;
998  continue;
999  }
1000  ++ptr;
1001  // Handle comments.
1002  if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
1003  {
1004  ptr += 3;
1005  skipComment(ptr, pEnd);
1006  continue;
1007  }
1008 
1009  // Handle XML header, which can have encoding in it.
1010  if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
1011  {
1012  const char *end = ptr;
1013  while (*end != '>' && end < pEnd)
1014  end++;
1015  if (*end == '\0' || end == pEnd)
1016  break;
1017  TQCString str(ptr, end - ptr + 1);
1018  int length;
1019  int pos = findXMLEncoding(str, length);
1020  // also handles the case when specified encoding aint correct
1021  if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
1022  {
1023  return true;
1024  }
1025  }
1026 
1027  //look for <meta>, stop if we reach <body>
1028  while (
1029  !((*ptr >= 'a') && (*ptr <= 'z') ||
1030  (*ptr >= 'A') && (*ptr <= 'Z'))
1031  && ptr < pEnd
1032  )
1033  ++ptr;
1034 
1035  char tmp[5];
1036  int length=0;
1037  const char* max=ptr+4;
1038  if (pEnd<max)
1039  max=pEnd;
1040  while (
1041  ((*ptr >= 'a') && (*ptr <= 'z') ||
1042  (*ptr >= 'A') && (*ptr <= 'Z') ||
1043  (*ptr >= '0') && (*ptr <= '9'))
1044  && ptr < max
1045  )
1046  {
1047  tmp[length] = tolower( *ptr );
1048  ++ptr;
1049  ++length;
1050  }
1051  tmp[length] = 0;
1052  if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
1053  {
1054  // found a meta tag...
1055  const char* end = ptr;
1056  while(*end != '>' && *end != '\0' && end<pEnd)
1057  end++;
1058  //if ( *end == '\0' ) break;
1059  TQCString str( ptr, (end-ptr)+1);
1060  str = str.lower();
1061  int pos=0;
1062  //if( (pos = str.find("http-equiv", pos)) == -1) break;
1063  //if( (pos = str.find("content-type", pos)) == -1) break;
1064  if( (pos = str.find("charset")) == -1)
1065  continue;
1066  pos+=6;
1067  // skip to '='
1068  if( (pos = str.find('=', pos)) == -1)
1069  continue;
1070 
1071  // skip whitespace before encoding itself
1072  while (pos < (int)str.length() && str[pos] <= ' ')
1073  ++pos;
1074  if ( pos == (int)str.length())
1075  continue;
1076 
1077  int endpos = pos;
1078  while( endpos < str.length() &&
1079  (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
1080  && str[endpos] != ';' && str[endpos] != '>') )
1081  ++endpos;
1082  #ifdef DECODE_DEBUG
1083  kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
1084  #endif
1085  if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
1086  return true;
1087  }
1088  else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
1089  {
1090  d->m_seenBody=true;
1091  break;
1092  }
1093  }
1094  }
1095 
1096  if (d->m_source==EncodingFromHTTPHeader)
1097  return true;
1098 #endif
1099  //if (len<20) //make a guess even if the file is short -- ahartmetz
1100  if (len < 1)
1101  {
1102  setEncoding("",DefaultEncoding);
1103  return false;
1104  }
1105 #ifdef DECODE_DEBUG
1106  kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")";
1107 #endif
1108 
1109  switch ( d->m_autoDetectLanguage )
1110  {
1111  case EncodingDetector::Arabic:
1112  return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1113 // break;
1114  case EncodingDetector::Baltic:
1115  return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1116 // break;
1117  case EncodingDetector::CentralEuropean:
1118  return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
1119  break;
1120  case EncodingDetector::Cyrillic:
1121  return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
1122 // break;
1123  case EncodingDetector::Greek:
1124  return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
1125 // break;
1126  case EncodingDetector::Hebrew:
1127  return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
1128 // break;
1129  case EncodingDetector::Japanese:
1130  return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
1131 // break;
1132  case EncodingDetector::Turkish:
1133  return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
1134 // break;
1135  case EncodingDetector::WesternEuropean:
1136  if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
1137  return true;
1138  else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for tdehtml
1139  {
1140  return setEncoding("iso-8859-15",AutoDetectedEncoding);
1141  }
1142  else //use default provided by eg katepart
1143  {
1144  return setEncoding("",DefaultEncoding);
1145  }
1146 // break;
1147  case EncodingDetector::SemiautomaticDetection:
1148  case EncodingDetector::ChineseSimplified:
1149  case EncodingDetector::ChineseTraditional:
1150  case EncodingDetector::Korean:
1151  case EncodingDetector::Thai:
1152  case EncodingDetector::Unicode:
1153  case EncodingDetector::NorthernSaami:
1154  case EncodingDetector::SouthEasternEurope:
1155  case EncodingDetector::None:
1156  // huh. somethings broken in this code ### FIXME
1157  //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
1158  break;
1159  }
1160 
1161  setEncoding("",DefaultEncoding);
1162  return true;
1163 }
1164 
1165 
1166 EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const TQString& lang)
1167 {
1168  if (lang.isEmpty())
1169  return EncodingDetector::None;
1170  else if (lang==i18n("@item Text character set", "Unicode"))
1171  return EncodingDetector::Unicode;
1172  else if (lang==i18n("@item Text character set", "Cyrillic"))
1173  return EncodingDetector::Cyrillic;
1174  else if (lang==i18n("@item Text character set", "Western European"))
1175  return EncodingDetector::WesternEuropean;
1176  else if (lang==i18n("@item Text character set", "Central European"))
1177  return EncodingDetector::CentralEuropean;
1178  else if (lang==i18n("@item Text character set", "Greek"))
1179  return EncodingDetector::Greek;
1180  else if (lang==i18n("@item Text character set", "Hebrew"))
1181  return EncodingDetector::Hebrew;
1182  else if (lang==i18n("@item Text character set", "Turkish"))
1183  return EncodingDetector::Turkish;
1184  else if (lang==i18n("@item Text character set", "Japanese"))
1185  return EncodingDetector::Japanese;
1186  else if (lang==i18n("@item Text character set", "Baltic"))
1187  return EncodingDetector::Baltic;
1188  else if (lang==i18n("@item Text character set", "Arabic"))
1189  return EncodingDetector::Arabic;
1190 
1191  return EncodingDetector::None;
1192 }
1193 
1194 bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
1195 {
1196  switch (script)
1197  {
1198  case EncodingDetector::Arabic:
1199  return true;
1200  case EncodingDetector::Baltic:
1201  return true;
1202  case EncodingDetector::CentralEuropean:
1203  return true;
1204  case EncodingDetector::Cyrillic:
1205  return true;
1206  case EncodingDetector::Greek:
1207  return true;
1208  case EncodingDetector::Hebrew:
1209  return true;
1210  case EncodingDetector::Japanese:
1211  return true;
1212  case EncodingDetector::Turkish:
1213  return true;
1214  case EncodingDetector::WesternEuropean:
1215  return true;
1216  case EncodingDetector::ChineseTraditional:
1217  return true;
1218  case EncodingDetector::ChineseSimplified:
1219  return true;
1220  case EncodingDetector::Unicode:
1221  return true;
1222  break;
1223  default:
1224  return false;
1225  }
1226 }
1227 
1228 TQString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
1229 {
1230  switch (script)
1231  {
1232  case EncodingDetector::Arabic:
1233  return i18n("@item Text character set", "Arabic");
1234  break;
1235  case EncodingDetector::Baltic:
1236  return i18n("@item Text character set", "Baltic");
1237  break;
1238  case EncodingDetector::CentralEuropean:
1239  return i18n("@item Text character set", "Central European");
1240  break;
1241  case EncodingDetector::Cyrillic:
1242  return i18n("@item Text character set", "Cyrillic");
1243  break;
1244  case EncodingDetector::Greek:
1245  return i18n("@item Text character set", "Greek");
1246  break;
1247  case EncodingDetector::Hebrew:
1248  return i18n("@item Text character set", "Hebrew");
1249  break;
1250  case EncodingDetector::Japanese:
1251  return i18n("@item Text character set", "Japanese");
1252  break;
1253  case EncodingDetector::Turkish:
1254  return i18n("@item Text character set", "Turkish");
1255  break;
1256  case EncodingDetector::WesternEuropean:
1257  return i18n("@item Text character set", "Western European");
1258  break;
1259  case EncodingDetector::ChineseTraditional:
1260  return i18n("@item Text character set", "Chinese Traditional");
1261  break;
1262  case EncodingDetector::ChineseSimplified:
1263  return i18n("@item Text character set", "Chinese Simplified");
1264  break;
1265  case EncodingDetector::Korean:
1266  return i18n("@item Text character set", "Korean");
1267  break;
1268  case EncodingDetector::Thai:
1269  return i18n("@item Text character set", "Thai");
1270  break;
1271  case EncodingDetector::Unicode:
1272  return i18n("@item Text character set", "Unicode");
1273  break;
1274  //case EncodingDetector::SemiautomaticDetection:
1275  default:
1276  return TQString();
1277 
1278  }
1279 }
1280 
1281 EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const TQString &lc)
1282 {
1283  // It might make sense to do something special if the locale ends with
1284  // ".UTF-8" or "@utf8"
1285  const char *langStr = pango_script_for_lang[0].lang;
1286  // There is obvious optimization potential...
1287  for ( int i = 0; langStr; i++ ) {
1288  langStr = pango_script_for_lang[i].lang;
1289  // startsWith() works for empty strings: every string "starts with" an empty string.
1290  if ( lc.startsWith( TQString::fromAscii( langStr ) ) )
1291  return pango_script_for_lang[i].scripts[0];
1292  }
1293  return None;
1294 }
1295 
1296 #undef DECODE_DEBUG
1297 
Provides encoding detection capabilities.
bool errorsIfUtf8(const char *data, int length)
Check if we are really utf8.
static AutoDetectScript scriptForName(const TQString &lang)
Takes the language name after it has been i18n()'ed.
EncodingDetector()
Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiau...
const char * encoding() const
Convenience method.
bool setEncoding(const char *encoding, EncodingChoiceSource type)
TQTextDecoder * decoder()
bool analyze(const char *data, int len)
Analyze text data.