qutf7codec.cpp Source File (libtdepim)

libtdepim

libtdepim
 /*
   qutf7codec.cpp
  
   A TQTextCodec for UTF-7 (rfc2152).
   Copyright (c) 2001 Marc Mutz <mutz@kde.org>
   See file COPYING for details
  
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.
  
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, US
  
   As a special exception, permission is granted to use this plugin
   with any version of TQt by TrollTech AS, Norway. In this case, the
   use of this plugin doesn't cause the resulting executable to be
   covered by the GNU General Public License.
   This exception does not however invalidate any other reasons why the
   executable file might be covered by the GNU General Public License.
 */
  
  
 #include "qutf7codec.h"
  
 #ifndef TQT_NO_TEXTCODEC
  
 int TQUtf7Codec::mibEnum() const {
   return 1012;
 }
  
 int TQStrictUtf7Codec::mibEnum() const {
   return -1012;
 }
  
 const char* TQUtf7Codec::name() const {
   return "UTF-7";
 }
  
 const char* TQStrictUtf7Codec::name() const {
   return "X-QT-UTF-7-STRICT";
 }
  
 const char* TQUtf7Codec::mimeName() const {
   return "UTF-7";
 }
  
 bool TQUtf7Codec::canEncode( TQChar ) const {
   return TRUE;
 }
  
 bool TQUtf7Codec::canEncode( const TQString & ) const {
   return TRUE;
 }
  
 static uchar base64Set[] = {
   0x00, 0x00, 0x00, 0x00, // '\0' ...
   0x00, 0x11, 0xFF, 0xC0, // ' ' ... '?'
   0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
   0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL
 };
  
 static uchar base64SetWithLastTwoBitsZero[] = {
   0x00, 0x00, 0x00, 0x00, // '\0' ...
   0x00, 0x00, 0x88, 0x80, // ' ' ... '?'
   0x44, 0x44, 0x44, 0x40, // '@' ... '_'
   0x11, 0x11, 0x11, 0x00  // '`' ... DEL
 };
  
 static uchar directSet[] = {
   0x00, 0x00, 0x00, 0x00, // '\0' ...
   0x01, 0xCF, 0xFF, 0xE1, // ' ' ... '?'
   0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
   0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL
 };
  
 static uchar optDirectSet[] = {
   0x00, 0x00, 0x00, 0x00, // '\0' ...
   0x7E, 0x20, 0x00, 0x1E, // ' ' ... '?'
   0x80, 0x00, 0x00, 0x17, // '@' ... '_'
   0x80, 0x00, 0x00, 0x1C  // '`' ... DEL
 };
  
 static inline bool isOfSet(uchar ch, uchar* set) {
   return set[ ch/8 ] & (0x80 >> ( ch%8 ));
 }
  
 int TQUtf7Codec::heuristicContentMatch(const char* chars, int len) const
 {
   int stepNo = 0;
   int i;
   bool shifted = FALSE;
   bool rightAfterEscape = FALSE;
   bool onlyNullBitsSinceLastBoundary = TRUE;
   for ( i = 0; i < len ; i++ ) {
     if ((unsigned char)chars[i] >= 128) // 8bit chars not allowed.
       break;
     if (shifted) {
       if ( isOfSet(chars[i],base64Set) ) {
     switch (stepNo) {
     case 0:
       onlyNullBitsSinceLastBoundary = TRUE;
       break;
     case 3:
       onlyNullBitsSinceLastBoundary
         = isOfSet(chars[i],base64SetWithLastTwoBitsZero);
       break;
     case 6:
       onlyNullBitsSinceLastBoundary
         = ( chars[i] == 'A' || chars[i] == 'Q' ||
         chars[i] == 'g' || chars[i] == 'w' );
       break;
     default:
        onlyNullBitsSinceLastBoundary
          = onlyNullBitsSinceLastBoundary && (chars[i] == 'A');
     }
     stepNo = (stepNo + 1) % 8;
     rightAfterEscape = FALSE;
       } else {
     if (rightAfterEscape && chars[i] != '-')
       break; // a '+' must be followed by '-' or a base64 char
     if (!onlyNullBitsSinceLastBoundary)
       break; // non-zero bits in the tail of the base64 encoding
     shifted = FALSE;
     stepNo = 0;
       }
     } else {
       if (chars[i] == '+') {
     shifted = TRUE;
     rightAfterEscape = TRUE;
       }
     }
   }
   return i;
 }
  
 class TQUtf7Decoder : public TQTextDecoder {
   // the storage for our unicode char until it's finished
   ushort uc;
   // the state of the base64 decoding
   // can be 0 (just finished three unicode chars)
   //        1 (have the upper  6 bits of uc already)
   //        2 (have the upper 12 bits of uc already)
   //        3 (have the upper  2 bits of uc already)
   // ..........
   //        7 (have the upper 10 bits of uc already)
   //   =>   n (have the upper (n * 6) % 16 bits of uc already)
   // "stepNo" cycles through all it's values every three
   // unicode chars.
   char stepNo;
   // remembers if we are in shifted-sequence mode
   bool shifted;
   // remembers if we're just after the initial '+'
   // of a shifted-sequence.
   bool rightAfterEscape;
 public:
   TQUtf7Decoder() : uc(0), stepNo(0), shifted(FALSE), rightAfterEscape(FALSE)
   {
   }
  
 private:
   inline void resetParser()
   {
     uc = 0;
     stepNo = 0;
     shifted = FALSE;
     rightAfterEscape = FALSE;
   }
  
 public:
   TQString toUnicode(const char* chars, int len)
   {
     TQString result = "";
     for (int i=0; i<len; i++) {
       uchar ch = chars[i];
  
       //
       // check for 8bit char's:
       // 
       if ( ch > 127 ) {
     tqWarning("TQUtf7Decoder: 8bit char found in input. "
          "Parser has been re-initialized!");
     resetParser();
     result += TQChar::replacement;
     continue;
       }
  
       if (shifted) { // in shifted mode
  
     //
     // first, we check specialities that only occur
     // right after the escaping '+':
     //
     if ( rightAfterEscape && ch == '-' ) {
       // a "+-" sequence is a short-circuit encoding
       // for just '+':
       resetParser();
       result += TQChar('+');
       // we're already done for this "ch", so
       continue;
     }
  
     //
     // Here we're going to extract the bits represented by "ch":
     //
     ushort bits;
     if ( ch >= 'A' && ch <= 'Z' ) {
       bits = ch - 'A';
     } else if ( ch >= 'a' && ch <= 'z' ) {
       bits = ch - 'a' + 26;
     } else if ( ch >= '0' && ch <= '9' ) {
       bits = ch - '0' + 52;
     } else if ( ch == '+' ) {
       bits = 62;
     } else if ( ch == '/' ) {
       bits = 63;
     } else {
       bits = 0; // keep compiler happy
  
       //
       // ch is not of the base64 alphabet.
       // Here we are going to check the sequence's validity:
       //
       if ( rightAfterEscape ) {
         // any non-base64 char following an escaping '+'
         // makes for an ill-formed sequence.
         // Note that we catch (the valid) "+-" pair
         // right at the beginning.
         tqWarning("TQUtf7Decoder: ill-formed input: "
              "non-base64 char after escaping \"+\"!");
       }
       // pending bits from base64 encoding must be all 0:
       if (stepNo >= 1 && uc) {
         tqWarning("TQUtf7Decoder: ill-formed sequence: "
              "non-zero bits in shifted-sequence tail!");
       }
       resetParser();
  
       // a '-' signifies the end of the shifted-sequence,
       // so we just swallow it.
       if ( ch == '-' )
         continue;
       // end of validity checking. Process ch now...
     }
  
     if ( /*still*/ shifted ) {
       //
       // now we're going to stuff the "bits" bit bucket into
       // the right position inside "uc", emitting a resulting
       // TQChar if possible.
       //
       switch (stepNo) {
         // "bits" are the 6 msb's of uc
       case 0: uc = bits << 10; break;
  
       case 1: uc |= bits << 4; break;
  
         // 4 bits of "bits" complete the first ushort
       case 2: uc |= bits >> 2; result += TQChar(uc);
         // 2 bits of "bits" make the msb's of the next ushort
               uc = bits << 14; break;
       case 3: uc |= bits << 8; break;
       case 4: uc |= bits << 2; break;
  
         // 2 bits of "bits" complete the second ushort
       case 5: uc |= bits >> 4; result += TQChar(uc);
         // 4 bits of "bits" make the msb's of the next ushort
               uc = bits << 12; break;
       case 6: uc |= bits << 6; break;
  
         // these 6 bits complete the third ushort
         // and also one round of 8 chars -> 3 ushort decoding
       case 7: uc |= bits;      result += TQChar(uc);
               uc = 0;          break;
       default: ;
       } // switch (stepNo)
       // increase the step counter
       stepNo++;
       stepNo %= 8;
       rightAfterEscape = FALSE;
       // and look at the next char.
       continue;
     } // fi (still) shifted
       } // fi shifted
  
       //
       // if control reaches here, we either weren't in a
       // shifted sequence or we just left one by seeing
       // a non-base64-char.
       // Either way, we have to process "ch" outside
       // a shifted-sequence now:
       //
       if ( ch == '+' ) {
     // '+' is the escape char for entering a
     // shifted sequence:
     shifted = TRUE;
     stepNo = 0;
     // also, we're right at the beginning where
     // special rules apply:
     rightAfterEscape = TRUE;
       } else {
     // US-ASCII values are directly used
     result += TQChar(ch);
       }
     }
  
     return result;
  
   } // toUnicode()
  
 }; // class TQUtf7Decoder
  
 TQTextDecoder* TQUtf7Codec::makeDecoder() const
 {
   return new TQUtf7Decoder;
 }
  
  
 class TQUtf7Encoder : public TQTextEncoder {
   uchar dontNeedEncodingSet[16];
   ushort outbits;
   uint stepNo : 2;
   bool shifted : 1;
   bool mayContinueShiftedSequence : 1;
 public:
   TQUtf7Encoder(bool encOpt, bool encLwsp)
     : outbits(0), stepNo(0),
       shifted(FALSE), mayContinueShiftedSequence(FALSE)
   {
     for ( int i = 0; i < 16 ; i++) {
       dontNeedEncodingSet[i] = directSet[i];
       if (!encOpt)
     dontNeedEncodingSet[i] |= optDirectSet[i];
     }
     if(!encLwsp) {
       dontNeedEncodingSet[' '/8] |= 0x80 >> (' '%8);
       dontNeedEncodingSet['\n'/8] |= 0x80 >> ('\n'%8);
       dontNeedEncodingSet['\r'/8] |= 0x80 >> ('\r'%8);
       dontNeedEncodingSet['\t'/8] |= 0x80 >> ('\t'%8);
     }
   }
  
 private:
  
   char toBase64( ushort u ) {
     if ( u < 26 )
       return (char)u + 'A';
     else if ( u < 52 )
       return (char)u - 26 + 'a';
     else if ( u < 62 )
       return (char)u - 52 + '0';
     else if ( u == 62 )
       return '+';
     else
       return '/';
   }
  
   void addToShiftedSequence(TQCString::Iterator & t, ushort u) {
     switch (stepNo) {
       // no outbits; use uppermost 6 bits of u
     case 0:
       *t++ = toBase64( u >> 10 );
       *t++ = toBase64( (u & 0x03FF /* umask top 6 bits */ ) >> 4 );
       // save 4 lowest-order bits in outbits[5..2]
       outbits = (u & 0x000F) << 2;
       break;
  
       // outbits available; use top two bits of u to complete
       // the previous char
     case 1:
       if (!mayContinueShiftedSequence) {
     // if mayContinue, this char has already been written
     *t++ = toBase64( outbits | ( u >> 14 ) );
       }
       *t++ = toBase64( (u & 0x3F00 /* mask top 2 bits */ ) >> 8 );
       *t++ = toBase64( (u & 0x00FC /* mask msbyte */ ) >> 2 );
       // save 2 lowest-significant bits in outbits[5..4]
       outbits = (u & 0x0003) << 4;
       break;
  
       // outbits available; use top four bits of u to complete
       // the previous char
     case 2:
       if (!mayContinueShiftedSequence) {
     // if mayContinue, this char has already been written
     *t++ = toBase64( outbits | ( u >> 12 ) );
       }
       *t++ = toBase64( (u & 0x0FFF) >> 6 );
       *t++ = toBase64( u & 0x003F );
       break;
  
     default: ;
     }
     stepNo = (stepNo + 1) % 3;
   }
  
   void endShiftedSequence(TQCString::Iterator & t) {
     switch (stepNo) {
     case 1: // four outbits still to be written
     case 2: // two outbits still to be written
       *t++ = toBase64( outbits );
       break;
     case 0:      // nothing to do
     default: ;
     }
     outbits = 0;
   }
  
   // depending on the stepNo, checks whether we can continue
   // an already ended shifted-sequence with char "u".
   // This is only possible if the topmost bits fit the
   // already written ones (which are all 0 between calls)
   bool continueOK( ushort u ) {
     return stepNo == 0 ||
       ( stepNo == 1 && (u & 0xF000) == 0 ) ||
       ( stepNo == 2 && (u & 0xC000) == 0 );
   }
  
   void processDoesntNeedEncoding(TQCString::Iterator & t, ushort ch) {
     // doesn't need encoding
     if (shifted) {
       endShiftedSequence(t);
       // add "lead-out" to dis-ambiguate following chars:
       if (isOfSet((char)ch,base64Set) || ch == '-' ) {
     *t++ = '-';
       }
     } else if (mayContinueShiftedSequence) {
       // if mayContinue is set, this means the
       // shifted-sequence needs a lead-out.
       mayContinueShiftedSequence = FALSE;
       if (isOfSet(ch,base64Set) || ch == '-' ) {
     *t++ = '-';
       }
     }
     *t++ = (uchar)ch;
     shifted = FALSE;
     stepNo = 0;
   }
  
 public:
   TQCString fromUnicode(const TQString & uc, int & len_in_out)
   {
     // allocate place for worst case:
     //   len/2 * (5+1) for an alternating sequence of e.g. "A\",
     // + 4             for a worst-case of another +ABC encoded char
     // + 1             for the trailing \0
     // 
     int maxreslen = 3 * len_in_out + 5;
     TQCString result( maxreslen );
  
 #if 0
     //    if (len_in_out == 1) {
     cout << "\nlen_in_out: " << len_in_out
      <<"; shifted: " << (shifted ? "true" : "false")
      << ";\n" << "mayContinue: "
      << (mayContinueShiftedSequence ? "true" : "false")
      << "; stepNo: " << stepNo << ";\n"
      << "outbits: " << outbits << endl;
       //    }
 #endif
  
     // source and destination cursor
     const TQChar * s = uc.unicode();
     TQCString::Iterator t = result.data();
  
     if ( uc.isNull() ) {
       // return to ascii requested:
       if ( mayContinueShiftedSequence )
     *t++ = '-';
     } else {
       // normal operation:
       for (int i = 0 ; i < len_in_out ;
        i++/*, checkOutBuf(result,maxreslen,t,i,len_in_out,5)*/ ) {
     ushort ch = s[i].unicode();
     
     //
     // first, we check whether we might get around encoding:
     //
     if ( ch < 128 ) {
       //
       // ch is usAscii, so we have a chance that we don't
       // need to encode it.
       //
       if ( isOfSet((uchar)ch,dontNeedEncodingSet) ) {
         processDoesntNeedEncoding(t,ch);
         continue;
       } else if ( ch == '+' ) {
         // '+' is the shift escape character
         if (shifted || mayContinueShiftedSequence) {
           // if we are already in shifted mode, we just
           // encode the '+', too. Compare
           // 24bits ("-+-") + some from ending the shifted-sequence
           // with 21,33 bits
           addToShiftedSequence(t,ch);
           mayContinueShiftedSequence = FALSE;
           shifted = TRUE;
         } else {
           // shortcut encoding of '+':
           *t++ = '+';
           *t++ = '-';
         }
         continue; // done
       } // else fall through to encoding
     }
     //
     // need encoding
     //
     if (!shifted && (!mayContinueShiftedSequence || !continueOK(ch) ) ) {
       *t++ = '+';
       stepNo = 0;
     }
     addToShiftedSequence(t,ch);
     shifted = TRUE;
     mayContinueShiftedSequence = FALSE;
       }
  
       if ( shifted ) {
     endShiftedSequence(t);
     mayContinueShiftedSequence = TRUE;
       };
       shifted = FALSE;
     }
  
     *t = '\0';
     len_in_out = t - result.data();
  
 #if 0
     cout << "len_in_out: " << len_in_out << "; "
      << "mayContinue: " << (mayContinueShiftedSequence ? "true" : "false")
      << "; stepNo: " << stepNo << endl;
 #endif
  
     Q_ASSERT(len_in_out <= maxreslen-1);
  
     return result;
   } // fromUnicode()
  
 }; // class TQUtf7Encoder
  
 TQTextEncoder* TQUtf7Codec::makeEncoder() const {
   return new TQUtf7Encoder( false, false );
 }
  
 TQTextEncoder* TQStrictUtf7Codec::makeEncoder() const {
   return new TQUtf7Encoder( true, false );
 }
  
 #endif // TQT_NO_TEXTCODEC