28 #include "encodingdetector.h"
// 16 KiB constant. The body is parenthesized so the macro expands as a single
// value; unparenthesized, `x % MAX_BUFFER` would parse as `(x % 16) * 1024`.
33 #define MAX_BUFFER (16*1024)
38 #include "encodingdetector_ja_p.h"
41 #include <tqtextcodec.h>
43 #include <tdeglobal.h>
44 #include <kcharsets.h>
46 #include <tdelocale.h>
65 typedef struct _PangoScriptForLang {
67 EncodingDetector::AutoDetectScript scripts[1];
// Pango script names that all expand to EncodingDetector::None.
76 #define PANGO_SCRIPT_ARMENIAN EncodingDetector::None
77 #define PANGO_SCRIPT_BENGALI EncodingDetector::None
78 #define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None
79 #define PANGO_SCRIPT_CHEROKEE EncodingDetector::None
80 #define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None
81 #define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None
82 #define PANGO_SCRIPT_GUJARATI EncodingDetector::None
83 #define PANGO_SCRIPT_GURMUKHI EncodingDetector::None
84 #define PANGO_SCRIPT_KANNADA EncodingDetector::None
85 #define PANGO_SCRIPT_KHMER EncodingDetector::None
86 #define PANGO_SCRIPT_LAO EncodingDetector::None
87 #define PANGO_SCRIPT_MALAYALAM EncodingDetector::None
88 #define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None
89 #define PANGO_SCRIPT_MYANMAR EncodingDetector::None
90 #define PANGO_SCRIPT_ORIYA EncodingDetector::None
91 #define PANGO_SCRIPT_SINHALA EncodingDetector::None
92 #define PANGO_SCRIPT_SYRIAC EncodingDetector::None
93 #define PANGO_SCRIPT_TAGALOG EncodingDetector::None
94 #define PANGO_SCRIPT_TAMIL EncodingDetector::None
95 #define PANGO_SCRIPT_TIBETAN EncodingDetector::None
96 #define PANGO_SCRIPT_TELUGU EncodingDetector::None
// Pango script names that expand to a specific EncodingDetector script value.
// Note that GEORGIAN expands to SouthEasternEurope and LATIN to WesternEuropean.
99 #define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic
100 #define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic
101 #define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope
102 #define PANGO_SCRIPT_GREEK EncodingDetector::Greek
103 #define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew
104 #define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean
105 #define PANGO_SCRIPT_THAI EncodingDetector::Thai
108 static const PangoScriptForLang pango_script_for_lang[] = {
109 {
"aa", { PANGO_SCRIPT_LATIN } },
110 {
"ab", { PANGO_SCRIPT_CYRILLIC } },
111 {
"af", { PANGO_SCRIPT_LATIN } },
112 {
"am", { PANGO_SCRIPT_ETHIOPIC } },
113 {
"ar", { PANGO_SCRIPT_ARABIC } },
114 {
"as", { PANGO_SCRIPT_BENGALI } },
115 {
"ast", { PANGO_SCRIPT_LATIN } },
116 {
"ava", { PANGO_SCRIPT_CYRILLIC } },
117 {
"ay", { PANGO_SCRIPT_LATIN } },
118 {
"az-ir", { PANGO_SCRIPT_ARABIC } },
119 {
"az", { PANGO_SCRIPT_CYRILLIC } },
120 {
"bam", { PANGO_SCRIPT_LATIN } },
121 {
"ba", { PANGO_SCRIPT_CYRILLIC } },
122 {
"be", { PANGO_SCRIPT_CYRILLIC } },
123 {
"bg", { PANGO_SCRIPT_CYRILLIC } },
124 {
"bh", { PANGO_SCRIPT_DEVANAGARI } },
125 {
"bho", { PANGO_SCRIPT_DEVANAGARI } },
126 {
"bi", { PANGO_SCRIPT_LATIN } },
127 {
"bin", { PANGO_SCRIPT_LATIN } },
128 {
"bn", { PANGO_SCRIPT_BENGALI } },
129 {
"bo", { PANGO_SCRIPT_TIBETAN } },
130 {
"br", { PANGO_SCRIPT_LATIN } },
131 {
"bs", { PANGO_SCRIPT_LATIN } },
132 {
"bua", { PANGO_SCRIPT_CYRILLIC } },
133 {
"ca", { PANGO_SCRIPT_LATIN } },
134 {
"ce", { PANGO_SCRIPT_CYRILLIC } },
135 {
"chm", { PANGO_SCRIPT_CYRILLIC } },
136 {
"chr", { PANGO_SCRIPT_CHEROKEE } },
137 {
"ch", { PANGO_SCRIPT_LATIN } },
138 {
"co", { PANGO_SCRIPT_LATIN } },
139 {
"cs", { PANGO_SCRIPT_LATIN } },
140 {
"cu", { PANGO_SCRIPT_CYRILLIC } },
141 {
"cv", { PANGO_SCRIPT_CYRILLIC } },
142 {
"cy", { PANGO_SCRIPT_LATIN } },
143 {
"da", { PANGO_SCRIPT_LATIN } },
144 {
"de", { PANGO_SCRIPT_LATIN } },
145 {
"dz", { PANGO_SCRIPT_TIBETAN } },
146 {
"el", { PANGO_SCRIPT_GREEK } },
147 {
"en", { PANGO_SCRIPT_LATIN } },
148 {
"eo", { PANGO_SCRIPT_LATIN } },
149 {
"es", { PANGO_SCRIPT_LATIN } },
151 {
"et", { EncodingDetector::Baltic } },
152 {
"eu", { PANGO_SCRIPT_LATIN } },
153 {
"fa", { PANGO_SCRIPT_ARABIC } },
154 {
"fi", { PANGO_SCRIPT_LATIN } },
155 {
"fj", { PANGO_SCRIPT_LATIN } },
156 {
"fo", { PANGO_SCRIPT_LATIN } },
157 {
"fr", { PANGO_SCRIPT_LATIN } },
158 {
"ful", { PANGO_SCRIPT_LATIN } },
159 {
"fur", { PANGO_SCRIPT_LATIN } },
160 {
"fy", { PANGO_SCRIPT_LATIN } },
161 {
"ga", { PANGO_SCRIPT_LATIN } },
162 {
"gd", { PANGO_SCRIPT_LATIN } },
163 {
"gez", { PANGO_SCRIPT_ETHIOPIC } },
164 {
"gl", { PANGO_SCRIPT_LATIN } },
165 {
"gn", { PANGO_SCRIPT_LATIN } },
166 {
"gu", { PANGO_SCRIPT_GUJARATI } },
167 {
"gv", { PANGO_SCRIPT_LATIN } },
168 {
"ha", { PANGO_SCRIPT_LATIN } },
169 {
"haw", { PANGO_SCRIPT_LATIN } },
170 {
"he", { PANGO_SCRIPT_HEBREW } },
171 {
"hi", { PANGO_SCRIPT_DEVANAGARI } },
172 {
"ho", { PANGO_SCRIPT_LATIN } },
173 {
"hr", { PANGO_SCRIPT_LATIN } },
174 {
"hu", { PANGO_SCRIPT_LATIN } },
175 {
"hy", { PANGO_SCRIPT_ARMENIAN } },
176 {
"ia", { PANGO_SCRIPT_LATIN } },
177 {
"ibo", { PANGO_SCRIPT_LATIN } },
178 {
"id", { PANGO_SCRIPT_LATIN } },
179 {
"ie", { PANGO_SCRIPT_LATIN } },
180 {
"ik", { PANGO_SCRIPT_CYRILLIC } },
181 {
"io", { PANGO_SCRIPT_LATIN } },
182 {
"is", { PANGO_SCRIPT_LATIN } },
183 {
"it", { PANGO_SCRIPT_LATIN } },
184 {
"iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL } },
186 {
"ja", { EncodingDetector::Japanese } },
187 {
"kaa", { PANGO_SCRIPT_CYRILLIC } },
188 {
"ka", { PANGO_SCRIPT_GEORGIAN } },
189 {
"ki", { PANGO_SCRIPT_LATIN } },
190 {
"kk", { PANGO_SCRIPT_CYRILLIC } },
191 {
"kl", { PANGO_SCRIPT_LATIN } },
192 {
"km", { PANGO_SCRIPT_KHMER } },
193 {
"kn", { PANGO_SCRIPT_KANNADA } },
// "kok" has to precede "ko": scriptForLanguageCode() returns the first entry
// whose code is a prefix of the requested language code, so the shorter "ko"
// would otherwise shadow "kok" and report Konkani as Korean.
195 {
"kok", { PANGO_SCRIPT_DEVANAGARI } },
196 {
"ko", { EncodingDetector::Korean } },
197 {
"ks", { PANGO_SCRIPT_DEVANAGARI } },
198 {
"ku-ir", { PANGO_SCRIPT_ARABIC } },
199 {
"ku", { PANGO_SCRIPT_CYRILLIC } },
200 {
"kum", { PANGO_SCRIPT_CYRILLIC } },
201 {
"kv", { PANGO_SCRIPT_CYRILLIC } },
202 {
"kw", { PANGO_SCRIPT_LATIN } },
203 {
"ky", { PANGO_SCRIPT_CYRILLIC } },
204 {
"la", { PANGO_SCRIPT_LATIN } },
205 {
"lb", { PANGO_SCRIPT_LATIN } },
206 {
"lez", { PANGO_SCRIPT_CYRILLIC } },
207 {
"ln", { PANGO_SCRIPT_LATIN } },
208 {
"lo", { PANGO_SCRIPT_LAO } },
210 {
"lt", { EncodingDetector::Baltic } },
212 {
"lv", { EncodingDetector::Baltic } },
213 {
"mg", { PANGO_SCRIPT_LATIN } },
214 {
"mh", { PANGO_SCRIPT_LATIN } },
215 {
"mi", { PANGO_SCRIPT_LATIN } },
216 {
"mk", { PANGO_SCRIPT_CYRILLIC } },
217 {
"ml", { PANGO_SCRIPT_MALAYALAM } },
218 {
"mn", { PANGO_SCRIPT_MONGOLIAN } },
219 {
"mo", { PANGO_SCRIPT_CYRILLIC } },
220 {
"mr", { PANGO_SCRIPT_DEVANAGARI } },
221 {
"mt", { PANGO_SCRIPT_LATIN } },
222 {
"my", { PANGO_SCRIPT_MYANMAR } },
223 {
"nb", { PANGO_SCRIPT_LATIN } },
224 {
"nds", { PANGO_SCRIPT_LATIN } },
225 {
"ne", { PANGO_SCRIPT_DEVANAGARI } },
226 {
"nl", { PANGO_SCRIPT_LATIN } },
227 {
"nn", { PANGO_SCRIPT_LATIN } },
228 {
"no", { PANGO_SCRIPT_LATIN } },
229 {
"nr", { PANGO_SCRIPT_LATIN } },
230 {
"nso", { PANGO_SCRIPT_LATIN } },
231 {
"ny", { PANGO_SCRIPT_LATIN } },
232 {
"oc", { PANGO_SCRIPT_LATIN } },
233 {
"om", { PANGO_SCRIPT_LATIN } },
234 {
"or", { PANGO_SCRIPT_ORIYA } },
235 {
"os", { PANGO_SCRIPT_CYRILLIC } },
236 {
"pa", { PANGO_SCRIPT_GURMUKHI } },
237 {
"pl", { PANGO_SCRIPT_LATIN } },
238 {
"ps-af", { PANGO_SCRIPT_ARABIC } },
239 {
"ps-pk", { PANGO_SCRIPT_ARABIC } },
240 {
"pt", { PANGO_SCRIPT_LATIN } },
241 {
"rm", { PANGO_SCRIPT_LATIN } },
242 {
"ro", { PANGO_SCRIPT_LATIN } },
243 {
"ru", { PANGO_SCRIPT_CYRILLIC } },
244 {
"sah", { PANGO_SCRIPT_CYRILLIC } },
245 {
"sa", { PANGO_SCRIPT_DEVANAGARI } },
246 {
"sco", { PANGO_SCRIPT_LATIN } },
247 {
"sel", { PANGO_SCRIPT_CYRILLIC } },
248 {
"se", { PANGO_SCRIPT_LATIN } },
249 {
"sh", { PANGO_SCRIPT_CYRILLIC } },
250 {
"si", { PANGO_SCRIPT_SINHALA } },
251 {
"sk", { PANGO_SCRIPT_LATIN } },
252 {
"sl", { PANGO_SCRIPT_LATIN } },
253 {
"sma", { PANGO_SCRIPT_LATIN } },
254 {
"smj", { PANGO_SCRIPT_LATIN } },
255 {
"smn", { PANGO_SCRIPT_LATIN } },
256 {
"sms", { PANGO_SCRIPT_LATIN } },
257 {
"sm", { PANGO_SCRIPT_LATIN } },
258 {
"so", { PANGO_SCRIPT_LATIN } },
259 {
"sq", { PANGO_SCRIPT_LATIN } },
260 {
"sr", { PANGO_SCRIPT_CYRILLIC } },
261 {
"ss", { PANGO_SCRIPT_LATIN } },
262 {
"st", { PANGO_SCRIPT_LATIN } },
263 {
"sv", { PANGO_SCRIPT_LATIN } },
264 {
"sw", { PANGO_SCRIPT_LATIN } },
265 {
"syr", { PANGO_SCRIPT_SYRIAC } },
266 {
"ta", { PANGO_SCRIPT_TAMIL } },
267 {
"te", { PANGO_SCRIPT_TELUGU } },
268 {
"tg", { PANGO_SCRIPT_CYRILLIC } },
269 {
"th", { PANGO_SCRIPT_THAI } },
270 {
"ti-er", { PANGO_SCRIPT_ETHIOPIC } },
271 {
"ti-et", { PANGO_SCRIPT_ETHIOPIC } },
272 {
"tig", { PANGO_SCRIPT_ETHIOPIC } },
273 {
"tk", { PANGO_SCRIPT_CYRILLIC } },
274 {
"tl", { PANGO_SCRIPT_TAGALOG } },
275 {
"tn", { PANGO_SCRIPT_LATIN } },
276 {
"to", { PANGO_SCRIPT_LATIN } },
278 {
"tr", { EncodingDetector::Turkish } },
279 {
"ts", { PANGO_SCRIPT_LATIN } },
280 {
"tt", { PANGO_SCRIPT_CYRILLIC } },
281 {
"tw", { PANGO_SCRIPT_LATIN } },
282 {
"tyv", { PANGO_SCRIPT_CYRILLIC } },
283 {
"ug", { PANGO_SCRIPT_ARABIC } },
284 {
"uk", { PANGO_SCRIPT_CYRILLIC } },
285 {
"ur", { PANGO_SCRIPT_ARABIC } },
286 {
"uz", { PANGO_SCRIPT_CYRILLIC } },
287 {
"ven", { PANGO_SCRIPT_LATIN } },
288 {
"vi", { PANGO_SCRIPT_LATIN } },
289 {
"vot", { PANGO_SCRIPT_LATIN } },
290 {
"vo", { PANGO_SCRIPT_LATIN } },
291 {
"wa", { PANGO_SCRIPT_LATIN } },
292 {
"wen", { PANGO_SCRIPT_LATIN } },
293 {
"wo", { PANGO_SCRIPT_LATIN } },
294 {
"xh", { PANGO_SCRIPT_LATIN } },
295 {
"yap", { PANGO_SCRIPT_LATIN } },
296 {
"yi", { PANGO_SCRIPT_HEBREW } },
297 {
"yo", { PANGO_SCRIPT_LATIN } },
299 {
"zh-cn", { EncodingDetector::ChineseSimplified } },
301 {
"zh-hk", { EncodingDetector::ChineseTraditional } },
303 {
"zh-mo", { EncodingDetector::ChineseTraditional } },
305 {
"zh-sg", { EncodingDetector::ChineseSimplified } },
307 {
"zh-tw", { EncodingDetector::ChineseTraditional } },
308 {
"zu", { PANGO_SCRIPT_LATIN } },
309 {
"\x00", { EncodingDetector::None } }
// Classifies a codec by switching on its MIB enum value; the case labels are
// not visible in this excerpt.
// NOTE(review): codec is dereferenced without a null check — confirm that every
// caller passes a valid codec.
323 static bool is16Bit(TQTextCodec* codec)
325 switch (codec->mibEnum())
// Private state of EncodingDetector, reached through its d pointer: the codec
// and decoder in use, the default codec, the source of the current encoding
// choice, the script used for heuristics, and a few status flags.
337 class EncodingDetectorPrivate
// Codec currently in effect.
340 TQTextCodec *m_codec;
// Decoder created from m_codec via makeDecoder().
341 TQTextDecoder *m_decoder;
// Codec selected when an encoding of type DefaultEncoding is requested.
342 TQTextCodec *m_defaultCodec;
// Storage for the normalized (lower-cased, "iso " -> "iso-") codec name.
343 TQCString m_storeDecoderName;
// Where the current encoding choice came from (user, HTTP header, ...).
345 EncodingDetector::EncodingChoiceSource m_source;
// Script whose heuristic is dispatched on when detection falls back to guessing.
346 EncodingDetector::AutoDetectScript m_autoDetectLanguage;
// Set when an iso-8859-8 codec was requested under a name that is not one of
// the logical-order aliases; reported by visuallyOrdered().
348 bool m_visualRTL : 1;
350 bool m_writtingHappened : 1;
351 bool m_analyzeCalled : 1;
354 TQCString m_bufferForDefferedEncDetection;
// Default state: Latin-1 codec, which also becomes the default codec.
// The initializers of m_decoder and m_defaultCodec read m_codec, so they rely
// on m_codec being declared (and therefore initialized) first.
356 EncodingDetectorPrivate()
357 : m_codec(TQTextCodec::codecForMib(MibLatin1))
358 , m_decoder(m_codec->makeDecoder())
359 , m_defaultCodec(m_codec)
364 , m_writtingHappened(false)
365 , m_analyzeCalled(false)
// Variant taking an explicit codec, choice source and script; the m_codec
// initializer is not visible in this excerpt.
370 EncodingDetectorPrivate(TQTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
372 , m_decoder(m_codec->makeDecoder())
373 , m_defaultCodec(m_codec)
375 , m_autoDetectLanguage(script)
378 , m_writtingHappened(false)
379 , m_analyzeCalled(false)
// Destructor; its body is not visible in this excerpt.
384 ~EncodingDetectorPrivate()
391 static TQCString automaticDetectionForArabic(
const unsigned char* ptr,
int size )
393 for (
int i = 0; i < size; ++i ) {
394 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
395 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
396 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
397 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
// Heuristic charset guess for Baltic text, based on the raw bytes ptr[0..size).
// Returns a charset name; only the "iso-8859-13" results are visible here.
405 static TQCString automaticDetectionForBaltic(
const unsigned char* ptr,
int size )
407 for (
int i = 0; i < size; ++i ) {
// Bytes in 0x80..0x9E: the result chosen for them is not visible in this excerpt.
408 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
// Bytes 0xA1 or 0xA5 select iso-8859-13.
411 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
412 return "iso-8859-13";
// Presumably the fallback once the loop found no deciding byte — confirm placement.
415 return "iso-8859-13";
418 static TQCString automaticDetectionForCentralEuropean(
const unsigned char* ptr,
int size )
421 for (
int i = 0; i < size; ++i ) {
422 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
423 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
433 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
437 if ( charset.isNull() )
438 charset =
"iso-8859-2";
444 if ( charset.isNull() )
445 charset =
"iso-8859-3";
447 return charset.data();
450 static TQCString automaticDetectionForCyrillic(
const unsigned char* ptr,
int size)
453 kWarning() <<
"EncodingDetector: Cyr heuristics";
470 int cp1251_o_capital=0;
475 int cp1251_a_capital=0;
480 int cp1251_s_capital=0;
485 int cp1251_i_capital=0;
488 int cp1251_small_range=0;
489 int koi_small_range=0;
490 int ibm866_small_range=0;
493 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
497 ++cp1251_small_range;
501 else if (ptr[i]==0xe0)
503 else if (ptr[i]==0xe8)
505 else if (ptr[i]==0xf1)
507 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)
510 else if (ptr[i]==0xef)
512 else if (ptr[i]==0xe1)
514 else if (ptr[i]==0xe9)
516 else if (ptr[i]==0xf3)
520 else if (ptr[i]>0xbf)
524 if (ptr[i]==0xd0||ptr[i]==0xd1)
526 else if (ptr[i]==0xcf)
528 else if (ptr[i]==0xc1)
530 else if (ptr[i]==0xc9)
532 else if (ptr[i]==0xd3)
534 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)
537 else if (ptr[i]==0xce)
539 else if (ptr[i]==0xc0)
541 else if (ptr[i]==0xc8)
543 else if (ptr[i]==0xd1)
546 else if (ptr[i]>0x9f && ptr[i]<0xb0)
547 ++ibm866_small_range;
552 if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
557 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
560 kWarning() <<
"Cyr Enc Detection: UTF8";
565 if (ibm866_small_range>cp1251_small_range+koi_small_range)
571 if (cp1251_st==0 && koi_st>1)
573 else if (koi_st==0 && cp1251_st>1)
576 if (cp1251_st && koi_st)
578 if (cp1251_st/koi_st>2)
580 else if (koi_st/cp1251_st>2)
586 else if (cp1251_a || koi_a)
591 else if (cp1251_o || koi_o)
596 else if (cp1251_i || koi_i)
601 else if (cp1251_s || koi_s)
604 if (cp1251_a_capital>koi_a_capital)
606 else if (cp1251_a_capital || koi_a_capital)
609 if (cp1251_o_capital>koi_o_capital)
611 else if (cp1251_o_capital || koi_o_capital)
614 if (cp1251_i_capital>koi_i_capital)
616 else if (cp1251_i_capital || koi_i_capital)
619 if (cp1251_s_capital>koi_s_capital)
621 else if (cp1251_s_capital || koi_s_capital)
624 kWarning()<<
"koi_score " << koi_score <<
" cp1251_score " << cp1251_score;
626 if (abs(koi_score-cp1251_score)<10)
629 cp1251_score=cp1251_small_range;
630 koi_score=koi_small_range;
632 if (cp1251_score>koi_score)
646 static TQCString automaticDetectionForGreek(
const unsigned char* ptr,
int size )
648 for (
int i = 0; i < size; ++i ) {
649 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
650 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
651 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
// Heuristic charset guess for Hebrew text, based on the raw bytes ptr[0..size).
// Returns a charset name; only the "iso-8859-8-i" results are visible here.
659 static TQCString automaticDetectionForHebrew(
const unsigned char* ptr,
int size )
661 for (
int i = 0; i < size; ++i ) {
// A byte inside any of these ranges decides the result; the value returned
// for this branch is not visible in this excerpt.
662 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
663 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
664 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
// Byte 0xDF selects iso-8859-8-i.
668 if ( ptr[ i ] == 0xDF )
669 return "iso-8859-8-i";
// Presumably the fallback once the loop found no deciding byte — confirm placement.
672 return "iso-8859-8-i";
675 static TQCString automaticDetectionForJapanese(
const unsigned char* ptr,
int size )
679 switch ( kc.guess_jp( (
const char*)ptr, size ) ) {
680 case JapaneseCode::JIS:
682 case JapaneseCode::EUC:
684 case JapaneseCode::SJIS:
686 case JapaneseCode::UTF8:
695 static TQCString automaticDetectionForTurkish(
const unsigned char* ptr,
int size )
697 for (
int i = 0; i < size; ++i ) {
698 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
// Heuristic charset guess for Western European text, based on the raw bytes
// ptr[0..size). Returns a charset name; only the "iso-8859-15" result is
// visible in this excerpt.
706 static TQCString automaticDetectionForWesternEuropean(
const unsigned char* ptr,
int size )
708 uint nonansi_count=0;
709 for (
int i=0; i<size; ++i)
// A lead byte 0xC2..0xEF followed by a continuation byte 0x80..0xBF has the
// shape of a multi-byte UTF-8 sequence; i+1<size guards the look-ahead.
714 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
// The upper bound is 0x9F. It previously read 0x9, which made the range empty
// so this test could never succeed. 0x80..0x9F is the span where Windows code
// pages hold printable characters and ISO-8859 holds control codes.
718 if (ptr[i] >= 0x78 && ptr[i] <= 0x9F )
727 return "iso-8859-15";
734 if (d->m_codec->mibEnum()!=MibUtf8)
741 static const unsigned char highest1Bits = 0x80;
742 static const unsigned char highest2Bits = 0xC0;
743 static const unsigned char highest3Bits = 0xE0;
744 static const unsigned char highest4Bits = 0xF0;
745 static const unsigned char highest5Bits = 0xF8;
747 for (
int i=0; i<length; ++i)
749 unsigned char c = data[i];
751 if (d->m_multiByte>0)
753 if ((c & highest2Bits) == 0x80)
759 kWarning() <<
"EncDetector: Broken UTF8";
765 if ((c & highest1Bits) == 0x00)
769 if ((c & highest3Bits) == 0xC0)
776 if ((c & highest4Bits) == 0xE0)
783 if ((c & highest5Bits) == 0xF0)
789 kWarning() <<
"EncDetector:_Broken UTF8";
801 d(new EncodingDetectorPrivate(codec,source,script))
805 EncodingDetector::~EncodingDetector()
// Stores lang in d->m_autoDetectLanguage, the value the heuristic-detection
// switch later dispatches on.
810 void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
812 d->m_autoDetectLanguage=lang;
// Returns the script currently stored in d->m_autoDetectLanguage.
814 EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage()
const
816 return d->m_autoDetectLanguage;
819 EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource()
const
826 d->m_storeDecoderName = d->m_codec->name();
827 d->m_storeDecoderName = d->m_storeDecoderName.lower().replace(
"iso ",
"iso-" );
828 return d->m_storeDecoderName.data();
// Returns d->m_visualRTL, the flag that is set when an iso-8859-8 codec is
// selected under a name other than the logical-order aliases.
831 bool EncodingDetector::visuallyOrdered()
const
833 return d->m_visualRTL;
849 TQCString enc(_encoding);
852 if (type==DefaultEncoding)
853 codec=d->m_defaultCodec;
866 codec = TDEGlobal::charsets()->codecForName(enc, b);
871 if (d->m_codec->mibEnum()==codec->mibEnum())
874 if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
881 if (codec->mibEnum() == Mib8859_8)
884 codec = TQTextCodec::codecForName(
"iso8859-8-i");
887 if(!(enc==
"iso-8859-8-i"||enc==
"iso_8859-8-i"||enc==
"csiso88598i"||enc==
"logical"))
888 d->m_visualRTL =
true;
894 d->m_decoder = d->m_codec->makeDecoder();
896 kDebug(6005) <<
"EncodingDetector::encoding used is" << d->m_codec->name();
903 return analyze( data.data(), data.size() );
911 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
914 const uchar *udata = (
const uchar *)data;
920 const char *autoDetectedEncoding;
921 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
923 autoDetectedEncoding =
"ISO-10646-UCS-2";
925 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
927 autoDetectedEncoding =
"UTF-8";
929 else if (c1 == 0x00 || c2 == 0x00)
937 uchar c10 = *udata++;
939 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
940 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
941 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
942 autoDetectedEncoding =
"ISO-10646-UCS-2";
944 autoDetectedEncoding = 0;
948 autoDetectedEncoding = 0;
952 if (autoDetectedEncoding != 0)
955 d->m_codec = TQTextCodec::codecForName(autoDetectedEncoding);
959 d->m_decoder = d->m_codec->makeDecoder();
961 kWarning() <<
"Detection by BOM";
963 if (is16Bit(d->m_codec) && c2==0x00)
966 char reverseUtf16[3] = {(char)0xFF, (
char)0xFE, 0x00};
967 d->m_decoder->toUnicode(reverseUtf16, 2);
974 if (d->m_source==UserChosenEncoding)
977 kWarning() <<
"EncodingDetector: UserChosenEncoding exit ";
990 const char *ptr = data;
991 const char *pEnd = data+len;
1002 if (ptr[0] ==
'!' && ptr[1] ==
'-' && ptr[2] ==
'-')
1005 skipComment(ptr, pEnd);
1010 if (ptr[0]==
'?' && ptr[1]==
'x' && ptr[2]==
'm' && ptr[3]==
'l')
1012 const char *end = ptr;
// Scan forward to the closing '>' of the XML declaration. The bound is tested
// before the dereference: pEnd is data+len, one past the last valid byte, so
// *end must never be read once end reaches it.
1013 while (end < pEnd && *end != '>')
// Give up when the buffer ran out or a NUL byte was hit before any '>'.
1015 if (end == pEnd || *end == '\0')
1017 TQCString str(ptr, end - ptr + 1);
1019 int pos = findXMLEncoding(str, length);
1021 if (pos!=-1 &&
setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
1029 !((*ptr >=
'a') && (*ptr <=
'z') ||
1030 (*ptr >=
'A') && (*ptr <=
'Z'))
1037 const char* max=ptr+4;
1041 ((*ptr >=
'a') && (*ptr <=
'z') ||
1042 (*ptr >=
'A') && (*ptr <=
'Z') ||
1043 (*ptr >=
'0') && (*ptr <=
'9'))
1047 tmp[length] = tolower( *ptr );
1052 if (tmp[0]==
'm'&&tmp[1]==
'e'&&tmp[2]==
't'&&tmp[3]==
'a')
1055 const char* end = ptr;
// Scan forward to the end of the tag. The bound is checked first so that *end
// is never read at or beyond pEnd (data+len).
1056 while(end<pEnd && *end != '>' && *end != '\0')
1059 TQCString str( ptr, (end-ptr)+1);
1064 if( (pos = str.find(
"charset")) == -1)
1068 if( (pos = str.find(
'=', pos)) == -1)
1072 while (pos < (
int)str.length() && str[pos] <=
' ')
1074 if ( pos == (
int)str.length())
1078 while( endpos < str.length() &&
1079 (str[endpos] !=
' ' && str[endpos] !=
'"' && str[endpos] !=
'\''
1080 && str[endpos] !=
';' && str[endpos] !=
'>') )
1083 kDebug( 6005 ) <<
"EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
1085 if (
setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
1088 else if (tmp[0]==
'b'&&tmp[1]==
'o'&&tmp[2]==
'd'&&tmp[3]==
'y')
1096 if (d->m_source==EncodingFromHTTPHeader)
// Log the supplied byte count. The buffer comes with an explicit len and may
// contain NUL bytes (see the UCS-2 check) or lack a terminator, so strlen()
// would under-report or read past the end.
1106 kDebug( 6005 ) <<
"EncodingDetector: using heuristics (" << len <<
")";
1109 switch ( d->m_autoDetectLanguage )
1111 case EncodingDetector::Arabic:
1112 return setEncoding(automaticDetectionForArabic( (
const unsigned char*) data, len ), AutoDetectedEncoding);
1114 case EncodingDetector::Baltic:
1115 return setEncoding(automaticDetectionForBaltic( (
const unsigned char*) data, len ), AutoDetectedEncoding);
1117 case EncodingDetector::CentralEuropean:
1118 return setEncoding(automaticDetectionForCentralEuropean( (
const unsigned char*) data, len ), AutoDetectedEncoding);
1120 case EncodingDetector::Cyrillic:
1121 return setEncoding(automaticDetectionForCyrillic( (
const unsigned char*) data, len), AutoDetectedEncoding);
1123 case EncodingDetector::Greek:
1124 return setEncoding(automaticDetectionForGreek( (
const unsigned char*) data, len ), AutoDetectedEncoding);
1126 case EncodingDetector::Hebrew:
1127 return setEncoding(automaticDetectionForHebrew( (
const unsigned char*) data, len ), AutoDetectedEncoding);
1129 case EncodingDetector::Japanese:
1130 return setEncoding(automaticDetectionForJapanese( (
const unsigned char*) data, len ), AutoDetectedEncoding);
1132 case EncodingDetector::Turkish:
1133 return setEncoding(automaticDetectionForTurkish( (
const unsigned char*) data, len ), AutoDetectedEncoding);
1135 case EncodingDetector::WesternEuropean:
1136 if (
setEncoding(automaticDetectionForWesternEuropean( (
const unsigned char*) data, len ), AutoDetectedEncoding))
1138 else if (d->m_defaultCodec->mibEnum()==MibLatin1)
1140 return setEncoding(
"iso-8859-15",AutoDetectedEncoding);
1147 case EncodingDetector::SemiautomaticDetection:
1148 case EncodingDetector::ChineseSimplified:
1149 case EncodingDetector::ChineseTraditional:
1150 case EncodingDetector::Korean:
1151 case EncodingDetector::Thai:
1152 case EncodingDetector::Unicode:
1153 case EncodingDetector::NorthernSaami:
1154 case EncodingDetector::SouthEasternEurope:
1155 case EncodingDetector::None:
1169 return EncodingDetector::None;
1170 else if (lang==i18n(
"@item Text character set",
"Unicode"))
1171 return EncodingDetector::Unicode;
1172 else if (lang==i18n(
"@item Text character set",
"Cyrillic"))
1173 return EncodingDetector::Cyrillic;
1174 else if (lang==i18n(
"@item Text character set",
"Western European"))
1175 return EncodingDetector::WesternEuropean;
1176 else if (lang==i18n(
"@item Text character set",
"Central European"))
1177 return EncodingDetector::CentralEuropean;
1178 else if (lang==i18n(
"@item Text character set",
"Greek"))
1179 return EncodingDetector::Greek;
1180 else if (lang==i18n(
"@item Text character set",
"Hebrew"))
1181 return EncodingDetector::Hebrew;
1182 else if (lang==i18n(
"@item Text character set",
"Turkish"))
1183 return EncodingDetector::Turkish;
1184 else if (lang==i18n(
"@item Text character set",
"Japanese"))
1185 return EncodingDetector::Japanese;
1186 else if (lang==i18n(
"@item Text character set",
"Baltic"))
1187 return EncodingDetector::Baltic;
1188 else if (lang==i18n(
"@item Text character set",
"Arabic"))
1189 return EncodingDetector::Arabic;
1191 return EncodingDetector::None;
1194 bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
1198 case EncodingDetector::Arabic:
1200 case EncodingDetector::Baltic:
1202 case EncodingDetector::CentralEuropean:
1204 case EncodingDetector::Cyrillic:
1206 case EncodingDetector::Greek:
1208 case EncodingDetector::Hebrew:
1210 case EncodingDetector::Japanese:
1212 case EncodingDetector::Turkish:
1214 case EncodingDetector::WesternEuropean:
1216 case EncodingDetector::ChineseTraditional:
1218 case EncodingDetector::ChineseSimplified:
1220 case EncodingDetector::Unicode:
// Returns the translated display name of a script. Every visible case calls
// i18n() with the context string "@item Text character set"; what is returned
// for scripts without a case here is not visible in this excerpt.
// NOTE(review): the reverse lookup earlier in this file (the lang==i18n(...)
// chain) has no branch for "Chinese Traditional", "Chinese Simplified",
// "Korean" or "Thai", so those names do not appear to round-trip — confirm.
1228 TQString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
1232 case EncodingDetector::Arabic:
1233 return i18n(
"@item Text character set",
"Arabic");
1235 case EncodingDetector::Baltic:
1236 return i18n(
"@item Text character set",
"Baltic");
1238 case EncodingDetector::CentralEuropean:
1239 return i18n(
"@item Text character set",
"Central European");
1241 case EncodingDetector::Cyrillic:
1242 return i18n(
"@item Text character set",
"Cyrillic");
1244 case EncodingDetector::Greek:
1245 return i18n(
"@item Text character set",
"Greek");
1247 case EncodingDetector::Hebrew:
1248 return i18n(
"@item Text character set",
"Hebrew");
1250 case EncodingDetector::Japanese:
1251 return i18n(
"@item Text character set",
"Japanese");
1253 case EncodingDetector::Turkish:
1254 return i18n(
"@item Text character set",
"Turkish");
1256 case EncodingDetector::WesternEuropean:
1257 return i18n(
"@item Text character set",
"Western European");
1259 case EncodingDetector::ChineseTraditional:
1260 return i18n(
"@item Text character set",
"Chinese Traditional");
1262 case EncodingDetector::ChineseSimplified:
1263 return i18n(
"@item Text character set",
"Chinese Simplified");
1265 case EncodingDetector::Korean:
1266 return i18n(
"@item Text character set",
"Korean");
1268 case EncodingDetector::Thai:
1269 return i18n(
"@item Text character set",
"Thai");
1271 case EncodingDetector::Unicode:
1272 return i18n(
"@item Text character set",
"Unicode");
// Maps a language code to a script by walking pango_script_for_lang in table
// order and returning the script of the first entry whose code is a prefix
// of lc.
1281 EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(
const TQString &lc)
1285 const char *langStr = pango_script_for_lang[0].lang;
// NOTE(review): the loop condition tests the pointer, not the string it points
// to. Presumably the loop ends at the final table entry ("\x00", an empty
// code), which prefix-matches every lc — confirm against the struct definition.
1287 for (
int i = 0; langStr; i++ ) {
1288 langStr = pango_script_for_lang[i].lang;
// Prefix match: when one table code is a prefix of another, the entry that
// comes first in the table wins.
1290 if ( lc.startsWith( TQString::fromAscii( langStr ) ) )
1291 return pango_script_for_lang[i].scripts[0];
Provides encoding detection capabilities.
bool errorsIfUtf8(const char *data, int length)
Check whether the data is really UTF-8.
static AutoDetectScript scriptForName(const TQString &lang)
Takes the language name after it has been i18n()'ed.
EncodingDetector()
Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiau...
const char * encoding() const
Convenience method.
bool setEncoding(const char *encoding, EncodingChoiceSource type)
TQTextDecoder * decoder()
bool analyze(const char *data, int len)
Analyze text data.