28#include "encodingdetector.h" 
   33#define MAX_BUFFER 16*1024 
   38#include "encodingdetector_ja_p.h" 
   41#include <tqtextcodec.h> 
   65typedef struct _PangoScriptForLang {
 
   67  EncodingDetector::AutoDetectScript scripts[1];
 
   76#define PANGO_SCRIPT_ARMENIAN EncodingDetector::None 
   77#define PANGO_SCRIPT_BENGALI EncodingDetector::None 
   78#define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None 
   79#define PANGO_SCRIPT_CHEROKEE EncodingDetector::None 
   80#define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None 
   81#define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None 
   82#define PANGO_SCRIPT_GUJARATI EncodingDetector::None 
   83#define PANGO_SCRIPT_GURMUKHI EncodingDetector::None 
   84#define PANGO_SCRIPT_KANNADA EncodingDetector::None 
   85#define PANGO_SCRIPT_KHMER EncodingDetector::None 
   86#define PANGO_SCRIPT_LAO EncodingDetector::None 
   87#define PANGO_SCRIPT_MALAYALAM EncodingDetector::None 
   88#define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None 
   89#define PANGO_SCRIPT_MYANMAR EncodingDetector::None 
   90#define PANGO_SCRIPT_ORIYA EncodingDetector::None 
   91#define PANGO_SCRIPT_SINHALA EncodingDetector::None 
   92#define PANGO_SCRIPT_SYRIAC EncodingDetector::None 
   93#define PANGO_SCRIPT_TAGALOG EncodingDetector::None 
   94#define PANGO_SCRIPT_TAMIL EncodingDetector::None 
   95#define PANGO_SCRIPT_TIBETAN EncodingDetector::None 
   96#define PANGO_SCRIPT_TELUGU EncodingDetector::None 
   99#define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic 
  100#define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic 
  101#define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope 
  102#define PANGO_SCRIPT_GREEK EncodingDetector::Greek 
  103#define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew 
  104#define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean 
  105#define PANGO_SCRIPT_THAI EncodingDetector::Thai 
  108static const PangoScriptForLang pango_script_for_lang[] = {
 
  109  { 
"aa",    { PANGO_SCRIPT_LATIN } },
 
  110  { 
"ab",    { PANGO_SCRIPT_CYRILLIC } },
 
  111  { 
"af",    { PANGO_SCRIPT_LATIN } },
 
  112  { 
"am",    { PANGO_SCRIPT_ETHIOPIC } },
 
  113  { 
"ar",    { PANGO_SCRIPT_ARABIC } },
 
  114  { 
"as",    { PANGO_SCRIPT_BENGALI } },
 
  115  { 
"ast",   { PANGO_SCRIPT_LATIN } },
 
  116  { 
"ava",   { PANGO_SCRIPT_CYRILLIC } },
 
  117  { 
"ay",    { PANGO_SCRIPT_LATIN } },
 
  118  { 
"az-ir", { PANGO_SCRIPT_ARABIC } },
 
  119  { 
"az",    { PANGO_SCRIPT_CYRILLIC } }, 
 
  120  { 
"bam",   { PANGO_SCRIPT_LATIN } },
 
  121  { 
"ba",    { PANGO_SCRIPT_CYRILLIC } },
 
  122  { 
"be",    { PANGO_SCRIPT_CYRILLIC } },
 
  123  { 
"bg",    { PANGO_SCRIPT_CYRILLIC } },
 
  124  { 
"bh",    { PANGO_SCRIPT_DEVANAGARI } },
 
  125  { 
"bho",   { PANGO_SCRIPT_DEVANAGARI } },
 
  126  { 
"bi",    { PANGO_SCRIPT_LATIN } },
 
  127  { 
"bin",   { PANGO_SCRIPT_LATIN } },
 
  128  { 
"bn",    { PANGO_SCRIPT_BENGALI } },
 
  129  { 
"bo",    { PANGO_SCRIPT_TIBETAN } },
 
  130  { 
"br",    { PANGO_SCRIPT_LATIN } },
 
  131  { 
"bs",    { PANGO_SCRIPT_LATIN } },
 
  132  { 
"bua",   { PANGO_SCRIPT_CYRILLIC } },
 
  133  { 
"ca",    { PANGO_SCRIPT_LATIN } },
 
  134  { 
"ce",    { PANGO_SCRIPT_CYRILLIC } },
 
  135  { 
"chm",   { PANGO_SCRIPT_CYRILLIC } },
 
  136  { 
"chr",   { PANGO_SCRIPT_CHEROKEE } },
 
  137  { 
"ch",    { PANGO_SCRIPT_LATIN } },
 
  138  { 
"co",    { PANGO_SCRIPT_LATIN } },
 
  139  { 
"cs",    { PANGO_SCRIPT_LATIN } },
 
  140  { 
"cu",    { PANGO_SCRIPT_CYRILLIC } },
 
  141  { 
"cv",    { PANGO_SCRIPT_CYRILLIC } }, 
 
  142  { 
"cy",    { PANGO_SCRIPT_LATIN } },
 
  143  { 
"da",    { PANGO_SCRIPT_LATIN } },
 
  144  { 
"de",    { PANGO_SCRIPT_LATIN } },
 
  145  { 
"dz",    { PANGO_SCRIPT_TIBETAN } },
 
  146  { 
"el",    { PANGO_SCRIPT_GREEK } },
 
  147  { 
"en",    { PANGO_SCRIPT_LATIN } },
 
  148  { 
"eo",    { PANGO_SCRIPT_LATIN } },
 
  149  { 
"es",    { PANGO_SCRIPT_LATIN } },
 
  151  { 
"et",    { EncodingDetector::Baltic } },
 
  152  { 
"eu",    { PANGO_SCRIPT_LATIN } },
 
  153  { 
"fa",    { PANGO_SCRIPT_ARABIC } },
 
  154  { 
"fi",    { PANGO_SCRIPT_LATIN } },
 
  155  { 
"fj",    { PANGO_SCRIPT_LATIN } },
 
  156  { 
"fo",    { PANGO_SCRIPT_LATIN } },
 
  157  { 
"fr",    { PANGO_SCRIPT_LATIN } },
 
  158  { 
"ful",   { PANGO_SCRIPT_LATIN } },
 
  159  { 
"fur",   { PANGO_SCRIPT_LATIN } },
 
  160  { 
"fy",    { PANGO_SCRIPT_LATIN } },
 
  161  { 
"ga",    { PANGO_SCRIPT_LATIN } },
 
  162  { 
"gd",    { PANGO_SCRIPT_LATIN } },
 
  163  { 
"gez",   { PANGO_SCRIPT_ETHIOPIC } },
 
  164  { 
"gl",    { PANGO_SCRIPT_LATIN } },
 
  165  { 
"gn",    { PANGO_SCRIPT_LATIN } },
 
  166  { 
"gu",    { PANGO_SCRIPT_GUJARATI } },
 
  167  { 
"gv",    { PANGO_SCRIPT_LATIN } },
 
  168  { 
"ha",    { PANGO_SCRIPT_LATIN } },
 
  169  { 
"haw",   { PANGO_SCRIPT_LATIN } },
 
  170  { 
"he",    { PANGO_SCRIPT_HEBREW } },
 
  171  { 
"hi",    { PANGO_SCRIPT_DEVANAGARI } },
 
  172  { 
"ho",    { PANGO_SCRIPT_LATIN } },
 
  173  { 
"hr",    { PANGO_SCRIPT_LATIN } },
 
  174  { 
"hu",    { PANGO_SCRIPT_LATIN } },
 
  175  { 
"hy",    { PANGO_SCRIPT_ARMENIAN } },
 
  176  { 
"ia",    { PANGO_SCRIPT_LATIN } },
 
  177  { 
"ibo",   { PANGO_SCRIPT_LATIN } },
 
  178  { 
"id",    { PANGO_SCRIPT_LATIN } },
 
  179  { 
"ie",    { PANGO_SCRIPT_LATIN } },
 
  180  { 
"ik",    { PANGO_SCRIPT_CYRILLIC } },
 
  181  { 
"io",    { PANGO_SCRIPT_LATIN } },
 
  182  { 
"is",    { PANGO_SCRIPT_LATIN } },
 
  183  { 
"it",    { PANGO_SCRIPT_LATIN } },
 
  184  { 
"iu",    { PANGO_SCRIPT_CANADIAN_ABORIGINAL } },
 
  186  { 
"ja",    { EncodingDetector::Japanese } },
 
  187  { 
"kaa",   { PANGO_SCRIPT_CYRILLIC } },
 
  188  { 
"ka",    { PANGO_SCRIPT_GEORGIAN } },
 
  189  { 
"ki",    { PANGO_SCRIPT_LATIN } },
 
  190  { 
"kk",    { PANGO_SCRIPT_CYRILLIC } },
 
  191  { 
"kl",    { PANGO_SCRIPT_LATIN } },
 
  192  { 
"km",    { PANGO_SCRIPT_KHMER } },
 
  193  { 
"kn",    { PANGO_SCRIPT_KANNADA } },
 
  195  { 
"ko",    { EncodingDetector::Korean } },
 
  196  { 
"kok",   { PANGO_SCRIPT_DEVANAGARI } },
 
  197  { 
"ks",    { PANGO_SCRIPT_DEVANAGARI } },
 
  198  { 
"ku-ir", { PANGO_SCRIPT_ARABIC } },
 
  199  { 
"ku",    { PANGO_SCRIPT_CYRILLIC } }, 
 
  200  { 
"kum",   { PANGO_SCRIPT_CYRILLIC } },
 
  201  { 
"kv",    { PANGO_SCRIPT_CYRILLIC } },
 
  202  { 
"kw",    { PANGO_SCRIPT_LATIN } },
 
  203  { 
"ky",    { PANGO_SCRIPT_CYRILLIC } },
 
  204  { 
"la",    { PANGO_SCRIPT_LATIN } },
 
  205  { 
"lb",    { PANGO_SCRIPT_LATIN } },
 
  206  { 
"lez",   { PANGO_SCRIPT_CYRILLIC } },
 
  207  { 
"ln",    { PANGO_SCRIPT_LATIN } },
 
  208  { 
"lo",    { PANGO_SCRIPT_LAO } },
 
  210  { 
"lt",    { EncodingDetector::Baltic } },
 
  212  { 
"lv",    { EncodingDetector::Baltic } },
 
  213  { 
"mg",    { PANGO_SCRIPT_LATIN } },
 
  214  { 
"mh",    { PANGO_SCRIPT_LATIN } },
 
  215  { 
"mi",    { PANGO_SCRIPT_LATIN } },
 
  216  { 
"mk",    { PANGO_SCRIPT_CYRILLIC } },
 
  217  { 
"ml",    { PANGO_SCRIPT_MALAYALAM } },
 
  218  { 
"mn",    { PANGO_SCRIPT_MONGOLIAN } },
 
  219  { 
"mo",    { PANGO_SCRIPT_CYRILLIC } }, 
 
  220  { 
"mr",    { PANGO_SCRIPT_DEVANAGARI } },
 
  221  { 
"mt",    { PANGO_SCRIPT_LATIN } },
 
  222  { 
"my",    { PANGO_SCRIPT_MYANMAR } },
 
  223  { 
"nb",    { PANGO_SCRIPT_LATIN } },
 
  224  { 
"nds",   { PANGO_SCRIPT_LATIN } },
 
  225  { 
"ne",    { PANGO_SCRIPT_DEVANAGARI } },
 
  226  { 
"nl",    { PANGO_SCRIPT_LATIN } },
 
  227  { 
"nn",    { PANGO_SCRIPT_LATIN } },
 
  228  { 
"no",    { PANGO_SCRIPT_LATIN } },
 
  229  { 
"nr",    { PANGO_SCRIPT_LATIN } },
 
  230  { 
"nso",   { PANGO_SCRIPT_LATIN } },
 
  231  { 
"ny",    { PANGO_SCRIPT_LATIN } },
 
  232  { 
"oc",    { PANGO_SCRIPT_LATIN } },
 
  233  { 
"om",    { PANGO_SCRIPT_LATIN } },
 
  234  { 
"or",    { PANGO_SCRIPT_ORIYA } },
 
  235  { 
"os",    { PANGO_SCRIPT_CYRILLIC } },
 
  236  { 
"pa",    { PANGO_SCRIPT_GURMUKHI } },
 
  237  { 
"pl",    { PANGO_SCRIPT_LATIN } },
 
  238  { 
"ps-af", { PANGO_SCRIPT_ARABIC } },
 
  239  { 
"ps-pk", { PANGO_SCRIPT_ARABIC } },
 
  240  { 
"pt",    { PANGO_SCRIPT_LATIN } },
 
  241  { 
"rm",    { PANGO_SCRIPT_LATIN } },
 
  242  { 
"ro",    { PANGO_SCRIPT_LATIN } },
 
  243  { 
"ru",    { PANGO_SCRIPT_CYRILLIC } },
 
  244  { 
"sah",   { PANGO_SCRIPT_CYRILLIC } },
 
  245  { 
"sa",    { PANGO_SCRIPT_DEVANAGARI } },
 
  246  { 
"sco",   { PANGO_SCRIPT_LATIN } },
 
  247  { 
"sel",   { PANGO_SCRIPT_CYRILLIC } },
 
  248  { 
"se",    { PANGO_SCRIPT_LATIN } },
 
  249  { 
"sh",    { PANGO_SCRIPT_CYRILLIC } },
 
  250  { 
"si",    { PANGO_SCRIPT_SINHALA } },
 
  251  { 
"sk",    { PANGO_SCRIPT_LATIN } },
 
  252  { 
"sl",    { PANGO_SCRIPT_LATIN } },
 
  253  { 
"sma",   { PANGO_SCRIPT_LATIN } },
 
  254  { 
"smj",   { PANGO_SCRIPT_LATIN } },
 
  255  { 
"smn",   { PANGO_SCRIPT_LATIN } },
 
  256  { 
"sms",   { PANGO_SCRIPT_LATIN } },
 
  257  { 
"sm",    { PANGO_SCRIPT_LATIN } },
 
  258  { 
"so",    { PANGO_SCRIPT_LATIN } },
 
  259  { 
"sq",    { PANGO_SCRIPT_LATIN } },
 
  260  { 
"sr",    { PANGO_SCRIPT_CYRILLIC } },
 
  261  { 
"ss",    { PANGO_SCRIPT_LATIN } },
 
  262  { 
"st",    { PANGO_SCRIPT_LATIN } },
 
  263  { 
"sv",    { PANGO_SCRIPT_LATIN } },
 
  264  { 
"sw",    { PANGO_SCRIPT_LATIN } },
 
  265  { 
"syr",   { PANGO_SCRIPT_SYRIAC } },
 
  266  { 
"ta",    { PANGO_SCRIPT_TAMIL } },
 
  267  { 
"te",    { PANGO_SCRIPT_TELUGU } },
 
  268  { 
"tg",    { PANGO_SCRIPT_CYRILLIC } },
 
  269  { 
"th",    { PANGO_SCRIPT_THAI } },
 
  270  { 
"ti-er", { PANGO_SCRIPT_ETHIOPIC } },
 
  271  { 
"ti-et", { PANGO_SCRIPT_ETHIOPIC } },
 
  272  { 
"tig",   { PANGO_SCRIPT_ETHIOPIC } },
 
  273  { 
"tk",    { PANGO_SCRIPT_CYRILLIC } },
 
  274  { 
"tl",    { PANGO_SCRIPT_TAGALOG } },
 
  275  { 
"tn",    { PANGO_SCRIPT_LATIN } },
 
  276  { 
"to",    { PANGO_SCRIPT_LATIN } },
 
  278  { 
"tr",    { EncodingDetector::Turkish } },
 
  279  { 
"ts",    { PANGO_SCRIPT_LATIN } },
 
  280  { 
"tt",    { PANGO_SCRIPT_CYRILLIC } },
 
  281  { 
"tw",    { PANGO_SCRIPT_LATIN } },
 
  282  { 
"tyv",   { PANGO_SCRIPT_CYRILLIC } },
 
  283  { 
"ug",    { PANGO_SCRIPT_ARABIC } },
 
  284  { 
"uk",    { PANGO_SCRIPT_CYRILLIC } },
 
  285  { 
"ur",    { PANGO_SCRIPT_ARABIC } },
 
  286  { 
"uz",    { PANGO_SCRIPT_CYRILLIC } },
 
  287  { 
"ven",   { PANGO_SCRIPT_LATIN } },
 
  288  { 
"vi",    { PANGO_SCRIPT_LATIN } },
 
  289  { 
"vot",   { PANGO_SCRIPT_LATIN } },
 
  290  { 
"vo",    { PANGO_SCRIPT_LATIN } },
 
  291  { 
"wa",    { PANGO_SCRIPT_LATIN } },
 
  292  { 
"wen",   { PANGO_SCRIPT_LATIN } },
 
  293  { 
"wo",    { PANGO_SCRIPT_LATIN } },
 
  294  { 
"xh",    { PANGO_SCRIPT_LATIN } },
 
  295  { 
"yap",   { PANGO_SCRIPT_LATIN } },
 
  296  { 
"yi",    { PANGO_SCRIPT_HEBREW } },
 
  297  { 
"yo",    { PANGO_SCRIPT_LATIN } },
 
  299  { 
"zh-cn", { EncodingDetector::ChineseSimplified } },
 
  301  { 
"zh-hk", { EncodingDetector::ChineseTraditional } },
 
  303  { 
"zh-mo", { EncodingDetector::ChineseTraditional } },
 
  305  { 
"zh-sg", { EncodingDetector::ChineseSimplified } },
 
  307  { 
"zh-tw", { EncodingDetector::ChineseTraditional } },
 
  308  { 
"zu",    { PANGO_SCRIPT_LATIN } },
 
  309  { 
"\x00",    { EncodingDetector::None } }      
 
  323static bool is16Bit(TQTextCodec* codec)
 
  325    switch (codec->mibEnum())
 
  337class EncodingDetectorPrivate
 
  340    TQTextCodec *m_codec;
 
  341    TQTextDecoder *m_decoder; 
 
  342    TQTextCodec *m_defaultCodec;
 
  343    TQCString  m_storeDecoderName;
 
  345    EncodingDetector::EncodingChoiceSource m_source;
 
  346    EncodingDetector::AutoDetectScript m_autoDetectLanguage;
 
  348    bool m_visualRTL : 1;
 
  350    bool m_writtingHappened : 1;
 
  351    bool m_analyzeCalled : 1; 
 
  354    TQCString m_bufferForDefferedEncDetection;
 
  356    EncodingDetectorPrivate()
 
  357            : m_codec(TQTextCodec::codecForMib(MibLatin1))
 
  358            , m_decoder(m_codec->makeDecoder())
 
  359            , m_defaultCodec(m_codec)
 
  364            , m_writtingHappened(false)
 
  365            , m_analyzeCalled(false)
 
  370    EncodingDetectorPrivate(TQTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
 
  372            , m_decoder(m_codec->makeDecoder())
 
  373            , m_defaultCodec(m_codec)
 
  375            , m_autoDetectLanguage(script)
 
  378            , m_writtingHappened(false)
 
  379            , m_analyzeCalled(false)
 
  384    ~EncodingDetectorPrivate()
 
  391static TQCString automaticDetectionForArabic( 
const unsigned char* ptr, 
int size )
 
  393    for ( 
int i = 0; i < size; ++i ) {
 
  394        if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
 
  395             || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
 
  396             || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
 
  397             || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
 
  405static TQCString automaticDetectionForBaltic( 
const unsigned char* ptr, 
int size )
 
  407    for ( 
int i = 0; i < size; ++i ) {
 
  408        if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
 
  411        if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
 
  412            return "iso-8859-13";
 
  415    return "iso-8859-13";
 
  418static TQCString automaticDetectionForCentralEuropean(
const unsigned char* ptr, 
int size )
 
  421    for ( 
int i = 0; i < size; ++i ) {
 
  422        if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
 
  423            if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
 
  433        if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
 
  437                if ( charset.isNull() )
 
  438                    charset = 
"iso-8859-2";
 
  444    if ( charset.isNull() )
 
  445        charset = 
"iso-8859-3";
 
  447    return charset.data();
 
  450static TQCString automaticDetectionForCyrillic( 
const unsigned char* ptr, 
int size)
 
  453        kWarning() << 
"EncodingDetector: Cyr heuristics";
 
  470    int cp1251_o_capital=0;
 
  475    int cp1251_a_capital=0;
 
  480    int cp1251_s_capital=0;
 
  485    int cp1251_i_capital=0;
 
  488    int cp1251_small_range=0;
 
  489    int koi_small_range=0;
 
  490    int ibm866_small_range=0;
 
  493    for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
 
  497            ++cp1251_small_range;
 
  501            else if (ptr[i]==0xe0)
 
  503            else if (ptr[i]==0xe8)
 
  505            else if (ptr[i]==0xf1)
 
  507            else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)
 
  510            else if (ptr[i]==0xef)
 
  512            else if (ptr[i]==0xe1)
 
  514            else if (ptr[i]==0xe9)
 
  516            else if (ptr[i]==0xf3)
 
  520        else if (ptr[i]>0xbf)
 
  524            if (ptr[i]==0xd0||ptr[i]==0xd1)
 
  526            else if (ptr[i]==0xcf)
 
  528            else if (ptr[i]==0xc1)
 
  530            else if (ptr[i]==0xc9)
 
  532            else if (ptr[i]==0xd3)
 
  534            else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)
 
  537            else if (ptr[i]==0xce)
 
  539            else if (ptr[i]==0xc0)
 
  541            else if (ptr[i]==0xc8)
 
  543            else if (ptr[i]==0xd1)
 
  546        else if (ptr[i]>0x9f && ptr[i]<0xb0) 
 
  547            ++ibm866_small_range;
 
  552    if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
 
  557    if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
 
  560        kWarning() << 
"Cyr Enc Detection: UTF8";
 
  565    if (ibm866_small_range>cp1251_small_range+koi_small_range)
 
  571    if (cp1251_st==0 && koi_st>1)
 
  573    else if (koi_st==0 && cp1251_st>1)
 
  576    if (cp1251_st && koi_st)
 
  578        if (cp1251_st/koi_st>2)
 
  580        else if (koi_st/cp1251_st>2)
 
  586    else if (cp1251_a || koi_a)
 
  591    else if (cp1251_o || koi_o)
 
  596    else if (cp1251_i || koi_i)
 
  601    else if (cp1251_s || koi_s)
 
  604    if (cp1251_a_capital>koi_a_capital)
 
  606    else if (cp1251_a_capital || koi_a_capital)
 
  609    if (cp1251_o_capital>koi_o_capital)
 
  611    else if (cp1251_o_capital || koi_o_capital)
 
  614    if (cp1251_i_capital>koi_i_capital)
 
  616    else if (cp1251_i_capital || koi_i_capital)
 
  619    if (cp1251_s_capital>koi_s_capital)
 
  621    else if (cp1251_s_capital || koi_s_capital)
 
  624    kWarning()<<
"koi_score " << koi_score << 
" cp1251_score " << cp1251_score;
 
  626    if (abs(koi_score-cp1251_score)<10)
 
  629        cp1251_score=cp1251_small_range;
 
  630        koi_score=koi_small_range;
 
  632    if (cp1251_score>koi_score)
 
  646static TQCString automaticDetectionForGreek( 
const unsigned char* ptr, 
int size )
 
  648    for ( 
int i = 0; i < size; ++i ) {
 
  649        if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
 
  650             || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
 
  651             || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
 
  659static TQCString automaticDetectionForHebrew( 
const unsigned char* ptr, 
int size )
 
  661    for ( 
int i = 0; i < size; ++i ) {
 
  662        if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
 
  663             || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
 
  664             || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
 
  668        if ( ptr[ i ] == 0xDF )
 
  669            return "iso-8859-8-i";
 
  672    return "iso-8859-8-i";
 
  675static TQCString automaticDetectionForJapanese( 
const unsigned char* ptr, 
int size )
 
  679    switch ( kc.guess_jp( (
const char*)ptr, size ) ) {
 
  680    case JapaneseCode::JIS:
 
  682    case JapaneseCode::EUC:
 
  684    case JapaneseCode::SJIS:
 
  686     case JapaneseCode::UTF8:
 
  695static TQCString automaticDetectionForTurkish( 
const unsigned char* ptr, 
int size )
 
  697    for ( 
int i = 0; i < size; ++i ) {
 
  698        if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
 
  706static TQCString automaticDetectionForWesternEuropean( 
const unsigned char* ptr, 
int size )
 
  708    uint nonansi_count=0;
 
  709    for (
int i=0; i<size; ++i)
 
  714            if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
 
  718            if (ptr[i] >= 0x78 && ptr[i] <= 0x9 )
 
  727        return "iso-8859-15";
 
  734    if (d->m_codec->mibEnum()!=MibUtf8)
 
  741static const unsigned char highest1Bits = 0x80;
 
  742static const unsigned char highest2Bits = 0xC0;
 
  743static const unsigned char highest3Bits = 0xE0;
 
  744static const unsigned char highest4Bits = 0xF0;
 
  745static const unsigned char highest5Bits = 0xF8;
 
  747    for (
int i=0; i<length; ++i)
 
  749        unsigned char c = data[i];
 
  751        if (d->m_multiByte>0)
 
  753            if ((c & highest2Bits) == 0x80)
 
  759            kWarning() << 
"EncDetector: Broken UTF8";
 
  765        if ((c & highest1Bits) == 0x00)
 
  769        if ((c & highest3Bits) == 0xC0)
 
  776        if ((c & highest4Bits) == 0xE0)
 
  783        if ((c & highest5Bits) == 0xF0)
 
  789        kWarning() << 
"EncDetector:_Broken UTF8";
 
  801    d(new EncodingDetectorPrivate(codec,source,script))
 
  805EncodingDetector::~EncodingDetector()
 
  810void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
 
  812    d->m_autoDetectLanguage=lang;
 
  814EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage()
 const 
  816    return d->m_autoDetectLanguage;
 
  819EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource()
 const 
  826    d->m_storeDecoderName = d->m_codec->name();
 
  827    d->m_storeDecoderName = d->m_storeDecoderName.lower().replace( 
"iso ", 
"iso-" );
 
  828    return d->m_storeDecoderName.data();
 
  831bool EncodingDetector::visuallyOrdered()
 const 
  833    return d->m_visualRTL;
 
  849    TQCString enc(_encoding);
 
  852        if (type==DefaultEncoding)
 
  853            codec=d->m_defaultCodec;
 
  866        codec = TDEGlobal::charsets()->codecForName(enc, b);
 
  871    if (d->m_codec->mibEnum()==codec->mibEnum())
 
  874    if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
 
  881    if (codec->mibEnum() == Mib8859_8)
 
  884        codec = TQTextCodec::codecForName(
"iso8859-8-i");
 
  887        if(!(enc==
"iso-8859-8-i"||enc==
"iso_8859-8-i"||enc==
"csiso88598i"||enc==
"logical"))
 
  888            d->m_visualRTL = 
true;
 
  894    d->m_decoder = d->m_codec->makeDecoder();
 
  896    kDebug(6005) << 
"EncodingDetector::encoding used is" << d->m_codec->name();
 
  903    return analyze( data.data(), data.size() );
 
  911    if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
 
  914        const uchar *udata = (
const uchar *)data;
 
  920        const char *autoDetectedEncoding;
 
  921        if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
 
  923            autoDetectedEncoding = 
"ISO-10646-UCS-2";
 
  925        else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
 
  927            autoDetectedEncoding = 
"UTF-8";
 
  929        else if (c1 == 0x00 || c2 == 0x00)
 
  937            uchar c10 = *udata++;
 
  939            int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
 
  940            int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
 
  941            if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
 
  942                autoDetectedEncoding = 
"ISO-10646-UCS-2";
 
  944                autoDetectedEncoding = 0;
 
  948            autoDetectedEncoding = 0;
 
  952        if (autoDetectedEncoding != 0)
 
  955            d->m_codec = TQTextCodec::codecForName(autoDetectedEncoding);
 
  959            d->m_decoder = d->m_codec->makeDecoder();
 
  961            kWarning() << 
"Detection by BOM";
 
  963            if (is16Bit(d->m_codec) && c2==0x00)
 
  966                char reverseUtf16[3] = {(char)0xFF, (
char)0xFE, 0x00};
 
  967                d->m_decoder->toUnicode(reverseUtf16, 2);
 
  974    if (d->m_source==UserChosenEncoding)
 
  977        kWarning() << 
"EncodingDetector: UserChosenEncoding exit ";
 
  990        const char *ptr = data;
 
  991        const char *pEnd = data+len;
 
 1002            if (ptr[0] == 
'!' && ptr[1] == 
'-' && ptr[2] == 
'-')
 
 1005                skipComment(ptr, pEnd);
 
 1010            if (ptr[0]==
'?' && ptr[1]==
'x' && ptr[2]==
'm' && ptr[3]==
'l')
 
 1012                const char *end = ptr;
 
 1013                while (*end != 
'>' && end < pEnd)
 
 1015                if (*end == 
'\0' || end == pEnd)
 
 1017                TQCString str(ptr, end - ptr + 1);
 
 1019                int pos = findXMLEncoding(str, length);
 
 1021                if (pos!=-1 && 
setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
 
 1029                        !((*ptr >= 
'a') && (*ptr <= 
'z') ||
 
 1030                        (*ptr >= 
'A') && (*ptr <= 
'Z'))
 
 1037            const char* max=ptr+4;
 
 1041                        ((*ptr >= 
'a') && (*ptr <= 
'z') ||
 
 1042                        (*ptr >= 
'A') && (*ptr <= 
'Z') ||
 
 1043                        (*ptr >= 
'0') && (*ptr <= 
'9'))
 
 1047                tmp[length] = tolower( *ptr );
 
 1052            if (tmp[0]==
'm'&&tmp[1]==
'e'&&tmp[2]==
't'&&tmp[3]==
'a')
 
 1055                const char* end = ptr;
 
 1056                while(*end != 
'>' && *end != 
'\0' && end<pEnd)
 
 1059                TQCString str( ptr, (end-ptr)+1);
 
 1064                if( (pos = str.find(
"charset")) == -1)
 
 1068                if( (pos = str.find(
'=', pos)) == -1)
 
 1072                while (pos < (
int)str.length() && str[pos] <= 
' ')
 
 1074                if ( pos == (
int)str.length())
 
 1078                while( endpos < str.length() &&
 
 1079                        (str[endpos] != 
' ' && str[endpos] != 
'"' && str[endpos] != 
'\'' 
 1080                                    && str[endpos] != 
';' && str[endpos] != 
'>') )
 
 1083                kDebug( 6005 ) << 
"EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
 
 1085                if (
setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
 
 1088            else if (tmp[0]==
'b'&&tmp[1]==
'o'&&tmp[2]==
'd'&&tmp[3]==
'y')
 
 1096    if (d->m_source==EncodingFromHTTPHeader)
 
 1106    kDebug( 6005 ) << 
"EncodingDetector: using heuristics (" << strlen(data) << 
")";
 
 1109    switch ( d->m_autoDetectLanguage )
 
 1111        case EncodingDetector::Arabic:
 
 1112            return setEncoding(automaticDetectionForArabic( (
const unsigned char*) data, len ), AutoDetectedEncoding);
 
 1114        case EncodingDetector::Baltic:
 
 1115            return setEncoding(automaticDetectionForBaltic( (
const unsigned char*) data, len ), AutoDetectedEncoding);
 
 1117        case EncodingDetector::CentralEuropean:
 
 1118            return setEncoding(automaticDetectionForCentralEuropean( (
const unsigned char*) data, len ), AutoDetectedEncoding);
 
 1120        case EncodingDetector::Cyrillic:
 
 1121            return setEncoding(automaticDetectionForCyrillic( (
const unsigned char*) data, len), AutoDetectedEncoding);
 
 1123        case EncodingDetector::Greek:
 
 1124            return setEncoding(automaticDetectionForGreek( (
const unsigned char*) data, len ), AutoDetectedEncoding);
 
 1126        case EncodingDetector::Hebrew:
 
 1127            return setEncoding(automaticDetectionForHebrew( (
const unsigned char*) data, len ), AutoDetectedEncoding);
 
 1129        case EncodingDetector::Japanese:
 
 1130            return setEncoding(automaticDetectionForJapanese( (
const unsigned char*) data, len ), AutoDetectedEncoding);
 
 1132        case EncodingDetector::Turkish:
 
 1133            return setEncoding(automaticDetectionForTurkish( (
const unsigned char*) data, len ), AutoDetectedEncoding);
 
 1135        case EncodingDetector::WesternEuropean:
 
 1136            if (
setEncoding(automaticDetectionForWesternEuropean( (
const unsigned char*) data, len ), AutoDetectedEncoding))
 
 1138            else if (d->m_defaultCodec->mibEnum()==MibLatin1) 
 
 1140                return setEncoding(
"iso-8859-15",AutoDetectedEncoding);
 
 1147        case EncodingDetector::SemiautomaticDetection:
 
 1148        case EncodingDetector::ChineseSimplified:
 
 1149        case EncodingDetector::ChineseTraditional:
 
 1150        case EncodingDetector::Korean:
 
 1151        case EncodingDetector::Thai:
 
 1152        case EncodingDetector::Unicode:
 
 1153        case EncodingDetector::NorthernSaami:
 
 1154        case EncodingDetector::SouthEasternEurope:
 
 1155        case EncodingDetector::None:
 
 1169        return EncodingDetector::None;
 
 1170    else if (lang==i18n(
"@item Text character set", 
"Unicode"))
 
 1171        return EncodingDetector::Unicode;
 
 1172    else if (lang==i18n(
"@item Text character set", 
"Cyrillic"))
 
 1173        return EncodingDetector::Cyrillic;
 
 1174    else if (lang==i18n(
"@item Text character set", 
"Western European"))
 
 1175        return EncodingDetector::WesternEuropean;
 
 1176    else if (lang==i18n(
"@item Text character set", 
"Central European"))
 
 1177        return EncodingDetector::CentralEuropean;
 
 1178    else if (lang==i18n(
"@item Text character set", 
"Greek"))
 
 1179        return EncodingDetector::Greek;
 
 1180    else if (lang==i18n(
"@item Text character set", 
"Hebrew"))
 
 1181        return EncodingDetector::Hebrew;
 
 1182    else if (lang==i18n(
"@item Text character set", 
"Turkish"))
 
 1183        return EncodingDetector::Turkish;
 
 1184    else if (lang==i18n(
"@item Text character set", 
"Japanese"))
 
 1185        return EncodingDetector::Japanese;
 
 1186    else if (lang==i18n(
"@item Text character set", 
"Baltic"))
 
 1187        return EncodingDetector::Baltic;
 
 1188    else if (lang==i18n(
"@item Text character set", 
"Arabic"))
 
 1189        return EncodingDetector::Arabic;
 
 1191    return EncodingDetector::None;
 
 1194bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
 
 1198        case EncodingDetector::Arabic:
 
 1200        case EncodingDetector::Baltic:
 
 1202        case EncodingDetector::CentralEuropean:
 
 1204        case EncodingDetector::Cyrillic:
 
 1206        case EncodingDetector::Greek:
 
 1208        case EncodingDetector::Hebrew:
 
 1210        case EncodingDetector::Japanese:
 
 1212        case EncodingDetector::Turkish:
 
 1214        case EncodingDetector::WesternEuropean:
 
 1216        case EncodingDetector::ChineseTraditional:
 
 1218        case EncodingDetector::ChineseSimplified:
 
 1220        case EncodingDetector::Unicode:
 
 1228TQString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
 
 1232        case EncodingDetector::Arabic:
 
 1233            return i18n(
"@item Text character set", 
"Arabic");
 
 1235        case EncodingDetector::Baltic:
 
 1236            return i18n(
"@item Text character set", 
"Baltic");
 
 1238        case EncodingDetector::CentralEuropean:
 
 1239            return i18n(
"@item Text character set", 
"Central European");
 
 1241        case EncodingDetector::Cyrillic:
 
 1242            return i18n(
"@item Text character set", 
"Cyrillic");
 
 1244        case EncodingDetector::Greek:
 
 1245            return i18n(
"@item Text character set", 
"Greek");
 
 1247        case EncodingDetector::Hebrew:
 
 1248            return i18n(
"@item Text character set", 
"Hebrew");
 
 1250        case EncodingDetector::Japanese:
 
 1251            return i18n(
"@item Text character set", 
"Japanese");
 
 1253        case EncodingDetector::Turkish:
 
 1254            return i18n(
"@item Text character set", 
"Turkish");
 
 1256        case EncodingDetector::WesternEuropean:
 
 1257            return i18n(
"@item Text character set", 
"Western European");
 
 1259        case EncodingDetector::ChineseTraditional:
 
 1260            return i18n(
"@item Text character set", 
"Chinese Traditional");
 
 1262        case EncodingDetector::ChineseSimplified:
 
 1263            return i18n(
"@item Text character set", 
"Chinese Simplified");
 
 1265        case EncodingDetector::Korean:
 
 1266            return i18n(
"@item Text character set", 
"Korean");
 
 1268        case EncodingDetector::Thai:
 
 1269            return i18n(
"@item Text character set", 
"Thai");
 
 1271        case EncodingDetector::Unicode:
 
 1272            return i18n(
"@item Text character set", 
"Unicode");
 
 1281EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(
const TQString &lc)
 
 1285  const char *langStr = pango_script_for_lang[0].lang;
 
 1287  for ( 
int i = 0; langStr; i++ ) {
 
 1288     langStr = pango_script_for_lang[i].lang;
 
 1290     if ( lc.startsWith( TQString::fromAscii( langStr ) ) )
 
 1291       return pango_script_for_lang[i].scripts[0];
 
Provides encoding detection capabilities.
 
bool errorsIfUtf8(const char *data, int length)
Checks whether the data is really UTF-8.
 
static AutoDetectScript scriptForName(const TQString &lang)
Takes the lang name after it has been i18n()'ed.
 
EncodingDetector()
Default codec is latin1 (as the HTML spec says), EncodingChoiceSource is default, AutoDetectScript=SemiautomaticDetection.
 
const char * encoding() const
Convenience method.
 
bool setEncoding(const char *encoding, EncodingChoiceSource type)
 
TQTextDecoder * decoder()
 
bool analyze(const char *data, int len)
Analyze text data.