libtdepim

qutf7codec.cpp
1 /*
2  qutf7codec.cpp
3 
4  A TQTextCodec for UTF-7 (rfc2152).
5  Copyright (c) 2001 Marc Mutz <mutz@kde.org>
6  See file COPYING for details
7 
8  This program is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License, version 2.0,
10  as published by the Free Software Foundation.
11 
12  You should have received a copy of the GNU General Public License
13  along with this program; if not, write to the Free Software
14  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
15  02110-1301, US
16 
17  As a special exception, permission is granted to use this plugin
18  with any version of TQt by TrollTech AS, Norway. In this case, the
19  use of this plugin doesn't cause the resulting executable to be
20  covered by the GNU General Public License.
21  This exception does not however invalidate any other reasons why the
22  executable file might be covered by the GNU General Public License.
23 */
24 
25 
26 #include "qutf7codec.h"
27 
28 #ifndef TQT_NO_TEXTCODEC
29 
30 int TQUtf7Codec::mibEnum() const {
31  return 1012;
32 }
33 
34 int TQStrictUtf7Codec::mibEnum() const {
35  return -1012;
36 }
37 
38 const char* TQUtf7Codec::name() const {
39  return "UTF-7";
40 }
41 
42 const char* TQStrictUtf7Codec::name() const {
43  return "X-QT-UTF-7-STRICT";
44 }
45 
46 const char* TQUtf7Codec::mimeName() const {
47  return "UTF-7";
48 }
49 
50 bool TQUtf7Codec::canEncode( TQChar ) const {
51  return TRUE;
52 }
53 
54 bool TQUtf7Codec::canEncode( const TQString & ) const {
55  return TRUE;
56 }
57 
58 static uchar base64Set[] = {
59  0x00, 0x00, 0x00, 0x00, // '\0' ...
60  0x00, 0x11, 0xFF, 0xC0, // ' ' ... '?'
61  0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
62  0x7F, 0xFF, 0xFF, 0xE0 // '`' ... DEL
63 };
64 
65 static uchar base64SetWithLastTwoBitsZero[] = {
66  0x00, 0x00, 0x00, 0x00, // '\0' ...
67  0x00, 0x00, 0x88, 0x80, // ' ' ... '?'
68  0x44, 0x44, 0x44, 0x40, // '@' ... '_'
69  0x11, 0x11, 0x11, 0x00 // '`' ... DEL
70 };
71 
72 static uchar directSet[] = {
73  0x00, 0x00, 0x00, 0x00, // '\0' ...
74  0x01, 0xCF, 0xFF, 0xE1, // ' ' ... '?'
75  0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
76  0x7F, 0xFF, 0xFF, 0xE0 // '`' ... DEL
77 };
78 
79 static uchar optDirectSet[] = {
80  0x00, 0x00, 0x00, 0x00, // '\0' ...
81  0x7E, 0x20, 0x00, 0x1E, // ' ' ... '?'
82  0x80, 0x00, 0x00, 0x17, // '@' ... '_'
83  0x80, 0x00, 0x00, 0x1C // '`' ... DEL
84 };
85 
86 static inline bool isOfSet(uchar ch, uchar* set) {
87  return set[ ch/8 ] & (0x80 >> ( ch%8 ));
88 }
89 
90 int TQUtf7Codec::heuristicContentMatch(const char* chars, int len) const
91 {
92  int stepNo = 0;
93  int i;
94  bool shifted = FALSE;
95  bool rightAfterEscape = FALSE;
96  bool onlyNullBitsSinceLastBoundary = TRUE;
97  for ( i = 0; i < len ; i++ ) {
98  if ((unsigned char)chars[i] >= 128) // 8bit chars not allowed.
99  break;
100  if (shifted) {
101  if ( isOfSet(chars[i],base64Set) ) {
102  switch (stepNo) {
103  case 0:
104  onlyNullBitsSinceLastBoundary = TRUE;
105  break;
106  case 3:
107  onlyNullBitsSinceLastBoundary
108  = isOfSet(chars[i],base64SetWithLastTwoBitsZero);
109  break;
110  case 6:
111  onlyNullBitsSinceLastBoundary
112  = ( chars[i] == 'A' || chars[i] == 'Q' ||
113  chars[i] == 'g' || chars[i] == 'w' );
114  break;
115  default:
116  onlyNullBitsSinceLastBoundary
117  = onlyNullBitsSinceLastBoundary && (chars[i] == 'A');
118  }
119  stepNo = (stepNo + 1) % 8;
120  rightAfterEscape = FALSE;
121  } else {
122  if (rightAfterEscape && chars[i] != '-')
123  break; // a '+' must be followed by '-' or a base64 char
124  if (!onlyNullBitsSinceLastBoundary)
125  break; // non-zero bits in the tail of the base64 encoding
126  shifted = FALSE;
127  stepNo = 0;
128  }
129  } else {
130  if (chars[i] == '+') {
131  shifted = TRUE;
132  rightAfterEscape = TRUE;
133  }
134  }
135  }
136  return i;
137 }
138 
139 class TQUtf7Decoder : public TQTextDecoder {
140  // the storage for our unicode char until it's finished
141  ushort uc;
142  // the state of the base64 decoding
143  // can be 0 (just finished three unicode chars)
144  // 1 (have the upper 6 bits of uc already)
145  // 2 (have the upper 12 bits of uc already)
146  // 3 (have the upper 2 bits of uc already)
147  // ..........
148  // 7 (have the upper 10 bits of uc already)
149  // => n (have the upper (n * 6) % 16 bits of uc already)
150  // "stepNo" cycles through all it's values every three
151  // unicode chars.
152  char stepNo;
153  // remembers if we are in shifted-sequence mode
154  bool shifted;
155  // remembers if we're just after the initial '+'
156  // of a shifted-sequence.
157  bool rightAfterEscape;
158 public:
159  TQUtf7Decoder() : uc(0), stepNo(0), shifted(FALSE), rightAfterEscape(FALSE)
160  {
161  }
162 
163 private:
164  inline void resetParser()
165  {
166  uc = 0;
167  stepNo = 0;
168  shifted = FALSE;
169  rightAfterEscape = FALSE;
170  }
171 
172 public:
173  TQString toUnicode(const char* chars, int len)
174  {
175  TQString result = "";
176  for (int i=0; i<len; i++) {
177  uchar ch = chars[i];
178 
179  //
180  // check for 8bit char's:
181  //
182  if ( ch > 127 ) {
183  tqWarning("TQUtf7Decoder: 8bit char found in input. "
184  "Parser has been re-initialized!");
185  resetParser();
186  result += TQChar::replacement;
187  continue;
188  }
189 
190  if (shifted) { // in shifted mode
191 
192  //
193  // first, we check specialities that only occur
194  // right after the escaping '+':
195  //
196  if ( rightAfterEscape && ch == '-' ) {
197  // a "+-" sequence is a short-circuit encoding
198  // for just '+':
199  resetParser();
200  result += TQChar('+');
201  // we're already done for this "ch", so
202  continue;
203  }
204 
205  //
206  // Here we're going to extract the bits represented by "ch":
207  //
208  ushort bits;
209  if ( ch >= 'A' && ch <= 'Z' ) {
210  bits = ch - 'A';
211  } else if ( ch >= 'a' && ch <= 'z' ) {
212  bits = ch - 'a' + 26;
213  } else if ( ch >= '0' && ch <= '9' ) {
214  bits = ch - '0' + 52;
215  } else if ( ch == '+' ) {
216  bits = 62;
217  } else if ( ch == '/' ) {
218  bits = 63;
219  } else {
220  bits = 0; // keep compiler happy
221 
222  //
223  // ch is not of the base64 alphabet.
224  // Here we are going to check the sequence's validity:
225  //
226  if ( rightAfterEscape ) {
227  // any non-base64 char following an escaping '+'
228  // makes for an ill-formed sequence.
229  // Note that we catch (the valid) "+-" pair
230  // right at the beginning.
231  tqWarning("TQUtf7Decoder: ill-formed input: "
232  "non-base64 char after escaping \"+\"!");
233  }
234  // pending bits from base64 encoding must be all 0:
235  if (stepNo >= 1 && uc) {
236  tqWarning("TQUtf7Decoder: ill-formed sequence: "
237  "non-zero bits in shifted-sequence tail!");
238  }
239  resetParser();
240 
241  // a '-' signifies the end of the shifted-sequence,
242  // so we just swallow it.
243  if ( ch == '-' )
244  continue;
245  // end of validity checking. Process ch now...
246  }
247 
248  if ( /*still*/ shifted ) {
249  //
250  // now we're going to stuff the "bits" bit bucket into
251  // the right position inside "uc", emitting a resulting
252  // TQChar if possible.
253  //
254  switch (stepNo) {
255  // "bits" are the 6 msb's of uc
256  case 0: uc = bits << 10; break;
257 
258  case 1: uc |= bits << 4; break;
259 
260  // 4 bits of "bits" complete the first ushort
261  case 2: uc |= bits >> 2; result += TQChar(uc);
262  // 2 bits of "bits" make the msb's of the next ushort
263  uc = bits << 14; break;
264  case 3: uc |= bits << 8; break;
265  case 4: uc |= bits << 2; break;
266 
267  // 2 bits of "bits" complete the second ushort
268  case 5: uc |= bits >> 4; result += TQChar(uc);
269  // 4 bits of "bits" make the msb's of the next ushort
270  uc = bits << 12; break;
271  case 6: uc |= bits << 6; break;
272 
273  // these 6 bits complete the third ushort
274  // and also one round of 8 chars -> 3 ushort decoding
275  case 7: uc |= bits; result += TQChar(uc);
276  uc = 0; break;
277  default: ;
278  } // switch (stepNo)
279  // increase the step counter
280  stepNo++;
281  stepNo %= 8;
282  rightAfterEscape = FALSE;
283  // and look at the next char.
284  continue;
285  } // fi (still) shifted
286  } // fi shifted
287 
288  //
289  // if control reaches here, we either weren't in a
290  // shifted sequence or we just left one by seeing
291  // a non-base64-char.
292  // Either way, we have to process "ch" outside
293  // a shifted-sequence now:
294  //
295  if ( ch == '+' ) {
296  // '+' is the escape char for entering a
297  // shifted sequence:
298  shifted = TRUE;
299  stepNo = 0;
300  // also, we're right at the beginning where
301  // special rules apply:
302  rightAfterEscape = TRUE;
303  } else {
304  // US-ASCII values are directly used
305  result += TQChar(ch);
306  }
307  }
308 
309  return result;
310 
311  } // toUnicode()
312 
313 }; // class TQUtf7Decoder
314 
315 TQTextDecoder* TQUtf7Codec::makeDecoder() const
316 {
317  return new TQUtf7Decoder;
318 }
319 
320 
321 class TQUtf7Encoder : public TQTextEncoder {
322  uchar dontNeedEncodingSet[16];
323  ushort outbits;
324  uint stepNo : 2;
325  bool shifted : 1;
326  bool mayContinueShiftedSequence : 1;
327 public:
328  TQUtf7Encoder(bool encOpt, bool encLwsp)
329  : outbits(0), stepNo(0),
330  shifted(FALSE), mayContinueShiftedSequence(FALSE)
331  {
332  for ( int i = 0; i < 16 ; i++) {
333  dontNeedEncodingSet[i] = directSet[i];
334  if (!encOpt)
335  dontNeedEncodingSet[i] |= optDirectSet[i];
336  }
337  if(!encLwsp) {
338  dontNeedEncodingSet[' '/8] |= 0x80 >> (' '%8);
339  dontNeedEncodingSet['\n'/8] |= 0x80 >> ('\n'%8);
340  dontNeedEncodingSet['\r'/8] |= 0x80 >> ('\r'%8);
341  dontNeedEncodingSet['\t'/8] |= 0x80 >> ('\t'%8);
342  }
343  }
344 
345 private:
346 
347  char toBase64( ushort u ) {
348  if ( u < 26 )
349  return (char)u + 'A';
350  else if ( u < 52 )
351  return (char)u - 26 + 'a';
352  else if ( u < 62 )
353  return (char)u - 52 + '0';
354  else if ( u == 62 )
355  return '+';
356  else
357  return '/';
358  }
359 
360  void addToShiftedSequence(TQCString::Iterator & t, ushort u) {
361  switch (stepNo) {
362  // no outbits; use uppermost 6 bits of u
363  case 0:
364  *t++ = toBase64( u >> 10 );
365  *t++ = toBase64( (u & 0x03FF /* umask top 6 bits */ ) >> 4 );
366  // save 4 lowest-order bits in outbits[5..2]
367  outbits = (u & 0x000F) << 2;
368  break;
369 
370  // outbits available; use top two bits of u to complete
371  // the previous char
372  case 1:
373  if (!mayContinueShiftedSequence) {
374  // if mayContinue, this char has already been written
375  *t++ = toBase64( outbits | ( u >> 14 ) );
376  }
377  *t++ = toBase64( (u & 0x3F00 /* mask top 2 bits */ ) >> 8 );
378  *t++ = toBase64( (u & 0x00FC /* mask msbyte */ ) >> 2 );
379  // save 2 lowest-significant bits in outbits[5..4]
380  outbits = (u & 0x0003) << 4;
381  break;
382 
383  // outbits available; use top four bits of u to complete
384  // the previous char
385  case 2:
386  if (!mayContinueShiftedSequence) {
387  // if mayContinue, this char has already been written
388  *t++ = toBase64( outbits | ( u >> 12 ) );
389  }
390  *t++ = toBase64( (u & 0x0FFF) >> 6 );
391  *t++ = toBase64( u & 0x003F );
392  break;
393 
394  default: ;
395  }
396  stepNo = (stepNo + 1) % 3;
397  }
398 
399  void endShiftedSequence(TQCString::Iterator & t) {
400  switch (stepNo) {
401  case 1: // four outbits still to be written
402  case 2: // two outbits still to be written
403  *t++ = toBase64( outbits );
404  break;
405  case 0: // nothing to do
406  default: ;
407  }
408  outbits = 0;
409  }
410 
411  // depending on the stepNo, checks whether we can continue
412  // an already ended shifted-sequence with char "u".
413  // This is only possible if the topmost bits fit the
414  // already written ones (which are all 0 between calls)
415  bool continueOK( ushort u ) {
416  return stepNo == 0 ||
417  ( stepNo == 1 && (u & 0xF000) == 0 ) ||
418  ( stepNo == 2 && (u & 0xC000) == 0 );
419  }
420 
421  void processDoesntNeedEncoding(TQCString::Iterator & t, ushort ch) {
422  // doesn't need encoding
423  if (shifted) {
424  endShiftedSequence(t);
425  // add "lead-out" to dis-ambiguate following chars:
426  if (isOfSet((char)ch,base64Set) || ch == '-' ) {
427  *t++ = '-';
428  }
429  } else if (mayContinueShiftedSequence) {
430  // if mayContinue is set, this means the
431  // shifted-sequence needs a lead-out.
432  mayContinueShiftedSequence = FALSE;
433  if (isOfSet(ch,base64Set) || ch == '-' ) {
434  *t++ = '-';
435  }
436  }
437  *t++ = (uchar)ch;
438  shifted = FALSE;
439  stepNo = 0;
440  }
441 
442 public:
443  TQCString fromUnicode(const TQString & uc, int & len_in_out)
444  {
445  // allocate place for worst case:
446  // len/2 * (5+1) for an alternating sequence of e.g. "A\",
447  // + 4 for a worst-case of another +ABC encoded char
448  // + 1 for the trailing \0
449  //
450  int maxreslen = 3 * len_in_out + 5;
451  TQCString result( maxreslen );
452 
453 #if 0
454  // if (len_in_out == 1) {
455  cout << "\nlen_in_out: " << len_in_out
456  <<"; shifted: " << (shifted ? "true" : "false")
457  << ";\n" << "mayContinue: "
458  << (mayContinueShiftedSequence ? "true" : "false")
459  << "; stepNo: " << stepNo << ";\n"
460  << "outbits: " << outbits << endl;
461  // }
462 #endif
463 
464  // source and destination cursor
465  const TQChar * s = uc.unicode();
466  TQCString::Iterator t = result.data();
467 
468  if ( uc.isNull() ) {
469  // return to ascii requested:
470  if ( mayContinueShiftedSequence )
471  *t++ = '-';
472  } else {
473  // normal operation:
474  for (int i = 0 ; i < len_in_out ;
475  i++/*, checkOutBuf(result,maxreslen,t,i,len_in_out,5)*/ ) {
476  ushort ch = s[i].unicode();
477 
478  //
479  // first, we check whether we might get around encoding:
480  //
481  if ( ch < 128 ) {
482  //
483  // ch is usAscii, so we have a chance that we don't
484  // need to encode it.
485  //
486  if ( isOfSet((uchar)ch,dontNeedEncodingSet) ) {
487  processDoesntNeedEncoding(t,ch);
488  continue;
489  } else if ( ch == '+' ) {
490  // '+' is the shift escape character
491  if (shifted || mayContinueShiftedSequence) {
492  // if we are already in shifted mode, we just
493  // encode the '+', too. Compare
494  // 24bits ("-+-") + some from ending the shifted-sequence
495  // with 21,33 bits
496  addToShiftedSequence(t,ch);
497  mayContinueShiftedSequence = FALSE;
498  shifted = TRUE;
499  } else {
500  // shortcut encoding of '+':
501  *t++ = '+';
502  *t++ = '-';
503  }
504  continue; // done
505  } // else fall through to encoding
506  }
507  //
508  // need encoding
509  //
510  if (!shifted && (!mayContinueShiftedSequence || !continueOK(ch) ) ) {
511  *t++ = '+';
512  stepNo = 0;
513  }
514  addToShiftedSequence(t,ch);
515  shifted = TRUE;
516  mayContinueShiftedSequence = FALSE;
517  }
518 
519  if ( shifted ) {
520  endShiftedSequence(t);
521  mayContinueShiftedSequence = TRUE;
522  };
523  shifted = FALSE;
524  }
525 
526  *t = '\0';
527  len_in_out = t - result.data();
528 
529 #if 0
530  cout << "len_in_out: " << len_in_out << "; "
531  << "mayContinue: " << (mayContinueShiftedSequence ? "true" : "false")
532  << "; stepNo: " << stepNo << endl;
533 #endif
534 
535  Q_ASSERT(len_in_out <= maxreslen-1);
536 
537  return result;
538  } // fromUnicode()
539 
540 }; // class TQUtf7Encoder
541 
542 TQTextEncoder* TQUtf7Codec::makeEncoder() const {
543  return new TQUtf7Encoder( false, false );
544 }
545 
546 TQTextEncoder* TQStrictUtf7Codec::makeEncoder() const {
547  return new TQUtf7Encoder( true, false );
548 }
549 
550 #endif // TQT_NO_TEXTCODEC