libtdepim

qutf7codec.cpp
1/*
2 qutf7codec.cpp
3
4 A TQTextCodec for UTF-7 (rfc2152).
5 Copyright (c) 2001 Marc Mutz <mutz@kde.org>
6 See file COPYING for details
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License, version 2.0,
10 as published by the Free Software Foundation.
11
12 You should have received a copy of the GNU General Public License
13 along with this program; if not, write to the Free Software
14 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
15 02110-1301, US
16
17 As a special exception, permission is granted to use this plugin
18 with any version of TQt by TrollTech AS, Norway. In this case, the
19 use of this plugin doesn't cause the resulting executable to be
20 covered by the GNU General Public License.
21 This exception does not however invalidate any other reasons why the
22 executable file might be covered by the GNU General Public License.
23*/
24
25
26#include "qutf7codec.h"
27
28#ifndef TQT_NO_TEXTCODEC
29
30int TQUtf7Codec::mibEnum() const {
31 return 1012;
32}
33
34int TQStrictUtf7Codec::mibEnum() const {
35 return -1012;
36}
37
38const char* TQUtf7Codec::name() const {
39 return "UTF-7";
40}
41
42const char* TQStrictUtf7Codec::name() const {
43 return "X-QT-UTF-7-STRICT";
44}
45
46const char* TQUtf7Codec::mimeName() const {
47 return "UTF-7";
48}
49
50bool TQUtf7Codec::canEncode( TQChar ) const {
51 return TRUE;
52}
53
54bool TQUtf7Codec::canEncode( const TQString & ) const {
55 return TRUE;
56}
57
//
// Character-class bitmaps over the 128 US-ASCII code points.
//
// Each table has 16 bytes; the bit for character c lives in byte
// c/8 and is selected by the mask (0x80 >> (c%8)), i.e. the most
// significant bit of a byte belongs to the lowest character of
// its group of eight.  The tables are only ever read, hence const.
//

// the base64 alphabet: 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'
static const unsigned char base64Set[16] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x00, 0x11, 0xFF, 0xC0, // ' ' ... '?'
  0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
  0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL
};

// those base64 chars whose 6-bit value has its two lowest bits
// cleared ('A', 'E', 'I', ... , '0', '4', '8')
static const unsigned char base64SetWithLastTwoBitsZero[16] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x00, 0x00, 0x88, 0x80, // ' ' ... '?'
  0x44, 0x44, 0x44, 0x40, // '@' ... '_'
  0x11, 0x11, 0x11, 0x00  // '`' ... DEL
};

// chars that may always be written directly:
// letters, digits and  ' ( ) , - . / : ?
static const unsigned char directSet[16] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x01, 0xCF, 0xFF, 0xE1, // ' ' ... '?'
  0x7F, 0xFF, 0xFF, 0xE0, // '@' ... '_'
  0x7F, 0xFF, 0xFF, 0xE0  // '`' ... DEL
};

// chars that may optionally be written directly:
// ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
static const unsigned char optDirectSet[16] = {
  0x00, 0x00, 0x00, 0x00, // '\0' ...
  0x7E, 0x20, 0x00, 0x1E, // ' ' ... '?'
  0x80, 0x00, 0x00, 0x17, // '@' ... '_'
  0x80, 0x00, 0x00, 0x1C  // '`' ... DEL
};

// Tests whether "ch" is a member of the 16-byte bitmap "set".
// Chars >= 128 are outside every bitmap and thus never members;
// answering "no" for them (instead of indexing the table) keeps
// the lookup inside the 16 bytes for any input.
static inline bool isOfSet(unsigned char ch, const unsigned char* set) {
  if ( ch >= 128 )
    return false;
  return ( set[ ch / 8 ] & ( 0x80 >> ( ch % 8 ) ) ) != 0;
}
89
90int TQUtf7Codec::heuristicContentMatch(const char* chars, int len) const
91{
92 int stepNo = 0;
93 int i;
94 bool shifted = FALSE;
95 bool rightAfterEscape = FALSE;
96 bool onlyNullBitsSinceLastBoundary = TRUE;
97 for ( i = 0; i < len ; i++ ) {
98 if ((unsigned char)chars[i] >= 128) // 8bit chars not allowed.
99 break;
100 if (shifted) {
101 if ( isOfSet(chars[i],base64Set) ) {
102 switch (stepNo) {
103 case 0:
104 onlyNullBitsSinceLastBoundary = TRUE;
105 break;
106 case 3:
107 onlyNullBitsSinceLastBoundary
108 = isOfSet(chars[i],base64SetWithLastTwoBitsZero);
109 break;
110 case 6:
111 onlyNullBitsSinceLastBoundary
112 = ( chars[i] == 'A' || chars[i] == 'Q' ||
113 chars[i] == 'g' || chars[i] == 'w' );
114 break;
115 default:
116 onlyNullBitsSinceLastBoundary
117 = onlyNullBitsSinceLastBoundary && (chars[i] == 'A');
118 }
119 stepNo = (stepNo + 1) % 8;
120 rightAfterEscape = FALSE;
121 } else {
122 if (rightAfterEscape && chars[i] != '-')
123 break; // a '+' must be followed by '-' or a base64 char
124 if (!onlyNullBitsSinceLastBoundary)
125 break; // non-zero bits in the tail of the base64 encoding
126 shifted = FALSE;
127 stepNo = 0;
128 }
129 } else {
130 if (chars[i] == '+') {
131 shifted = TRUE;
132 rightAfterEscape = TRUE;
133 }
134 }
135 }
136 return i;
137}
138
139class TQUtf7Decoder : public TQTextDecoder {
140 // the storage for our unicode char until it's finished
141 ushort uc;
142 // the state of the base64 decoding
143 // can be 0 (just finished three unicode chars)
144 // 1 (have the upper 6 bits of uc already)
145 // 2 (have the upper 12 bits of uc already)
146 // 3 (have the upper 2 bits of uc already)
147 // ..........
148 // 7 (have the upper 10 bits of uc already)
149 // => n (have the upper (n * 6) % 16 bits of uc already)
150 // "stepNo" cycles through all it's values every three
151 // unicode chars.
152 char stepNo;
153 // remembers if we are in shifted-sequence mode
154 bool shifted;
155 // remembers if we're just after the initial '+'
156 // of a shifted-sequence.
157 bool rightAfterEscape;
158public:
159 TQUtf7Decoder() : uc(0), stepNo(0), shifted(FALSE), rightAfterEscape(FALSE)
160 {
161 }
162
163private:
164 inline void resetParser()
165 {
166 uc = 0;
167 stepNo = 0;
168 shifted = FALSE;
169 rightAfterEscape = FALSE;
170 }
171
172public:
173 TQString toUnicode(const char* chars, int len)
174 {
175 TQString result = "";
176 for (int i=0; i<len; i++) {
177 uchar ch = chars[i];
178
179 //
180 // check for 8bit char's:
181 //
182 if ( ch > 127 ) {
183 tqWarning("TQUtf7Decoder: 8bit char found in input. "
184 "Parser has been re-initialized!");
185 resetParser();
186 result += TQChar::replacement;
187 continue;
188 }
189
190 if (shifted) { // in shifted mode
191
192 //
193 // first, we check specialities that only occur
194 // right after the escaping '+':
195 //
196 if ( rightAfterEscape && ch == '-' ) {
197 // a "+-" sequence is a short-circuit encoding
198 // for just '+':
199 resetParser();
200 result += TQChar('+');
201 // we're already done for this "ch", so
202 continue;
203 }
204
205 //
206 // Here we're going to extract the bits represented by "ch":
207 //
208 ushort bits;
209 if ( ch >= 'A' && ch <= 'Z' ) {
210 bits = ch - 'A';
211 } else if ( ch >= 'a' && ch <= 'z' ) {
212 bits = ch - 'a' + 26;
213 } else if ( ch >= '0' && ch <= '9' ) {
214 bits = ch - '0' + 52;
215 } else if ( ch == '+' ) {
216 bits = 62;
217 } else if ( ch == '/' ) {
218 bits = 63;
219 } else {
220 bits = 0; // keep compiler happy
221
222 //
223 // ch is not of the base64 alphabet.
224 // Here we are going to check the sequence's validity:
225 //
226 if ( rightAfterEscape ) {
227 // any non-base64 char following an escaping '+'
228 // makes for an ill-formed sequence.
229 // Note that we catch (the valid) "+-" pair
230 // right at the beginning.
231 tqWarning("TQUtf7Decoder: ill-formed input: "
232 "non-base64 char after escaping \"+\"!");
233 }
234 // pending bits from base64 encoding must be all 0:
235 if (stepNo >= 1 && uc) {
236 tqWarning("TQUtf7Decoder: ill-formed sequence: "
237 "non-zero bits in shifted-sequence tail!");
238 }
239 resetParser();
240
241 // a '-' signifies the end of the shifted-sequence,
242 // so we just swallow it.
243 if ( ch == '-' )
244 continue;
245 // end of validity checking. Process ch now...
246 }
247
248 if ( /*still*/ shifted ) {
249 //
250 // now we're going to stuff the "bits" bit bucket into
251 // the right position inside "uc", emitting a resulting
252 // TQChar if possible.
253 //
254 switch (stepNo) {
255 // "bits" are the 6 msb's of uc
256 case 0: uc = bits << 10; break;
257
258 case 1: uc |= bits << 4; break;
259
260 // 4 bits of "bits" complete the first ushort
261 case 2: uc |= bits >> 2; result += TQChar(uc);
262 // 2 bits of "bits" make the msb's of the next ushort
263 uc = bits << 14; break;
264 case 3: uc |= bits << 8; break;
265 case 4: uc |= bits << 2; break;
266
267 // 2 bits of "bits" complete the second ushort
268 case 5: uc |= bits >> 4; result += TQChar(uc);
269 // 4 bits of "bits" make the msb's of the next ushort
270 uc = bits << 12; break;
271 case 6: uc |= bits << 6; break;
272
273 // these 6 bits complete the third ushort
274 // and also one round of 8 chars -> 3 ushort decoding
275 case 7: uc |= bits; result += TQChar(uc);
276 uc = 0; break;
277 default: ;
278 } // switch (stepNo)
279 // increase the step counter
280 stepNo++;
281 stepNo %= 8;
282 rightAfterEscape = FALSE;
283 // and look at the next char.
284 continue;
285 } // fi (still) shifted
286 } // fi shifted
287
288 //
289 // if control reaches here, we either weren't in a
290 // shifted sequence or we just left one by seeing
291 // a non-base64-char.
292 // Either way, we have to process "ch" outside
293 // a shifted-sequence now:
294 //
295 if ( ch == '+' ) {
296 // '+' is the escape char for entering a
297 // shifted sequence:
298 shifted = TRUE;
299 stepNo = 0;
300 // also, we're right at the beginning where
301 // special rules apply:
302 rightAfterEscape = TRUE;
303 } else {
304 // US-ASCII values are directly used
305 result += TQChar(ch);
306 }
307 }
308
309 return result;
310
311 } // toUnicode()
312
313}; // class TQUtf7Decoder
314
315TQTextDecoder* TQUtf7Codec::makeDecoder() const
316{
317 return new TQUtf7Decoder;
318}
319
320
321class TQUtf7Encoder : public TQTextEncoder {
322 uchar dontNeedEncodingSet[16];
323 ushort outbits;
324 uint stepNo : 2;
325 bool shifted : 1;
326 bool mayContinueShiftedSequence : 1;
327public:
328 TQUtf7Encoder(bool encOpt, bool encLwsp)
329 : outbits(0), stepNo(0),
330 shifted(FALSE), mayContinueShiftedSequence(FALSE)
331 {
332 for ( int i = 0; i < 16 ; i++) {
333 dontNeedEncodingSet[i] = directSet[i];
334 if (!encOpt)
335 dontNeedEncodingSet[i] |= optDirectSet[i];
336 }
337 if(!encLwsp) {
338 dontNeedEncodingSet[' '/8] |= 0x80 >> (' '%8);
339 dontNeedEncodingSet['\n'/8] |= 0x80 >> ('\n'%8);
340 dontNeedEncodingSet['\r'/8] |= 0x80 >> ('\r'%8);
341 dontNeedEncodingSet['\t'/8] |= 0x80 >> ('\t'%8);
342 }
343 }
344
345private:
346
347 char toBase64( ushort u ) {
348 if ( u < 26 )
349 return (char)u + 'A';
350 else if ( u < 52 )
351 return (char)u - 26 + 'a';
352 else if ( u < 62 )
353 return (char)u - 52 + '0';
354 else if ( u == 62 )
355 return '+';
356 else
357 return '/';
358 }
359
360 void addToShiftedSequence(TQCString::Iterator & t, ushort u) {
361 switch (stepNo) {
362 // no outbits; use uppermost 6 bits of u
363 case 0:
364 *t++ = toBase64( u >> 10 );
365 *t++ = toBase64( (u & 0x03FF /* umask top 6 bits */ ) >> 4 );
366 // save 4 lowest-order bits in outbits[5..2]
367 outbits = (u & 0x000F) << 2;
368 break;
369
370 // outbits available; use top two bits of u to complete
371 // the previous char
372 case 1:
373 if (!mayContinueShiftedSequence) {
374 // if mayContinue, this char has already been written
375 *t++ = toBase64( outbits | ( u >> 14 ) );
376 }
377 *t++ = toBase64( (u & 0x3F00 /* mask top 2 bits */ ) >> 8 );
378 *t++ = toBase64( (u & 0x00FC /* mask msbyte */ ) >> 2 );
379 // save 2 lowest-significant bits in outbits[5..4]
380 outbits = (u & 0x0003) << 4;
381 break;
382
383 // outbits available; use top four bits of u to complete
384 // the previous char
385 case 2:
386 if (!mayContinueShiftedSequence) {
387 // if mayContinue, this char has already been written
388 *t++ = toBase64( outbits | ( u >> 12 ) );
389 }
390 *t++ = toBase64( (u & 0x0FFF) >> 6 );
391 *t++ = toBase64( u & 0x003F );
392 break;
393
394 default: ;
395 }
396 stepNo = (stepNo + 1) % 3;
397 }
398
399 void endShiftedSequence(TQCString::Iterator & t) {
400 switch (stepNo) {
401 case 1: // four outbits still to be written
402 case 2: // two outbits still to be written
403 *t++ = toBase64( outbits );
404 break;
405 case 0: // nothing to do
406 default: ;
407 }
408 outbits = 0;
409 }
410
411 // depending on the stepNo, checks whether we can continue
412 // an already ended shifted-sequence with char "u".
413 // This is only possible if the topmost bits fit the
414 // already written ones (which are all 0 between calls)
415 bool continueOK( ushort u ) {
416 return stepNo == 0 ||
417 ( stepNo == 1 && (u & 0xF000) == 0 ) ||
418 ( stepNo == 2 && (u & 0xC000) == 0 );
419 }
420
421 void processDoesntNeedEncoding(TQCString::Iterator & t, ushort ch) {
422 // doesn't need encoding
423 if (shifted) {
424 endShiftedSequence(t);
425 // add "lead-out" to dis-ambiguate following chars:
426 if (isOfSet((char)ch,base64Set) || ch == '-' ) {
427 *t++ = '-';
428 }
429 } else if (mayContinueShiftedSequence) {
430 // if mayContinue is set, this means the
431 // shifted-sequence needs a lead-out.
432 mayContinueShiftedSequence = FALSE;
433 if (isOfSet(ch,base64Set) || ch == '-' ) {
434 *t++ = '-';
435 }
436 }
437 *t++ = (uchar)ch;
438 shifted = FALSE;
439 stepNo = 0;
440 }
441
442public:
443 TQCString fromUnicode(const TQString & uc, int & len_in_out)
444 {
445 // allocate place for worst case:
446 // len/2 * (5+1) for an alternating sequence of e.g. "A\",
447 // + 4 for a worst-case of another +ABC encoded char
448 // + 1 for the trailing \0
449 //
450 int maxreslen = 3 * len_in_out + 5;
451 TQCString result( maxreslen );
452
453#if 0
454 // if (len_in_out == 1) {
455 cout << "\nlen_in_out: " << len_in_out
456 <<"; shifted: " << (shifted ? "true" : "false")
457 << ";\n" << "mayContinue: "
458 << (mayContinueShiftedSequence ? "true" : "false")
459 << "; stepNo: " << stepNo << ";\n"
460 << "outbits: " << outbits << endl;
461 // }
462#endif
463
464 // source and destination cursor
465 const TQChar * s = uc.unicode();
466 TQCString::Iterator t = result.data();
467
468 if ( uc.isNull() ) {
469 // return to ascii requested:
470 if ( mayContinueShiftedSequence )
471 *t++ = '-';
472 } else {
473 // normal operation:
474 for (int i = 0 ; i < len_in_out ;
475 i++/*, checkOutBuf(result,maxreslen,t,i,len_in_out,5)*/ ) {
476 ushort ch = s[i].unicode();
477
478 //
479 // first, we check whether we might get around encoding:
480 //
481 if ( ch < 128 ) {
482 //
483 // ch is usAscii, so we have a chance that we don't
484 // need to encode it.
485 //
486 if ( isOfSet((uchar)ch,dontNeedEncodingSet) ) {
487 processDoesntNeedEncoding(t,ch);
488 continue;
489 } else if ( ch == '+' ) {
490 // '+' is the shift escape character
491 if (shifted || mayContinueShiftedSequence) {
492 // if we are already in shifted mode, we just
493 // encode the '+', too. Compare
494 // 24bits ("-+-") + some from ending the shifted-sequence
495 // with 21,33 bits
496 addToShiftedSequence(t,ch);
497 mayContinueShiftedSequence = FALSE;
498 shifted = TRUE;
499 } else {
500 // shortcut encoding of '+':
501 *t++ = '+';
502 *t++ = '-';
503 }
504 continue; // done
505 } // else fall through to encoding
506 }
507 //
508 // need encoding
509 //
510 if (!shifted && (!mayContinueShiftedSequence || !continueOK(ch) ) ) {
511 *t++ = '+';
512 stepNo = 0;
513 }
514 addToShiftedSequence(t,ch);
515 shifted = TRUE;
516 mayContinueShiftedSequence = FALSE;
517 }
518
519 if ( shifted ) {
520 endShiftedSequence(t);
521 mayContinueShiftedSequence = TRUE;
522 };
523 shifted = FALSE;
524 }
525
526 *t = '\0';
527 len_in_out = t - result.data();
528
529#if 0
530 cout << "len_in_out: " << len_in_out << "; "
531 << "mayContinue: " << (mayContinueShiftedSequence ? "true" : "false")
532 << "; stepNo: " << stepNo << endl;
533#endif
534
535 Q_ASSERT(len_in_out <= maxreslen-1);
536
537 return result;
538 } // fromUnicode()
539
540}; // class TQUtf7Encoder
541
542TQTextEncoder* TQUtf7Codec::makeEncoder() const {
543 return new TQUtf7Encoder( false, false );
544}
545
546TQTextEncoder* TQStrictUtf7Codec::makeEncoder() const {
547 return new TQUtf7Encoder( true, false );
548}
549
550#endif // TQT_NO_TEXTCODEC