tdeioslave/imap4

rfcdecoder.cpp
1/**********************************************************************
2 *
3 * rfcdecoder.cpp - handler for various rfc/mime encodings
4 * Copyright (C) 2000 s.carstens@gmx.de
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 *
20 * Send comments and bug fixes to s.carstens@gmx.de
21 *
22 *********************************************************************/
23#include "rfcdecoder.h"
24
25#include <ctype.h>
26#include <sys/types.h>
27
28#include <stdio.h>
29#include <stdlib.h>
30
31#include <tqtextcodec.h>
32#include <tqbuffer.h>
33#include <tqregexp.h>
34#include <kmdcodec.h>
35
36// This part taken from rfc 2192 IMAP URL Scheme. C. Newman. September 1997.
37// adapted to QT-Toolkit by Sven Carstens <s.carstens@gmx.de> 2000
38
/* Alphabet of the modified base64 used for IMAP mailbox names: identical to
 * standard base64 except that ',' takes the place of '/'.
 * The table is read-only.  Note that sizeof (base64chars) is 65 because it
 * counts the string literal's terminating NUL.
 */
static const unsigned char base64chars[] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/* marker stored in a decoding table for bytes that are not in the alphabet */
#define UNDEFINED 64
#define MAXLINE 76

/* UTF16 definitions */
#define UTF16MASK 0x03FFUL        /* low 10 payload bits of a surrogate */
#define UTF16SHIFT 10             /* payload bits carried by one surrogate */
#define UTF16BASE 0x10000UL       /* first code point that needs a surrogate pair */
#define UTF16HIGHSTART 0xD800UL   /* high (leading) surrogate range */
#define UTF16HIGHEND 0xDBFFUL
#define UTF16LOSTART 0xDC00UL     /* low (trailing) surrogate range */
#define UTF16LOEND 0xDFFFUL
52
53/* Convert an IMAP mailbox to a Unicode path
54 */
55TQString rfcDecoder::fromIMAP (const TQString & inSrc)
56{
57 unsigned char c, i, bitcount;
58 unsigned long ucs4, utf16, bitbuf;
59 unsigned char base64[256], utf8[6];
60 unsigned long srcPtr = 0;
61 TQCString dst;
62 TQCString src = inSrc.ascii ();
63 uint srcLen = inSrc.length();
64
65 /* initialize modified base64 decoding table */
66 memset (base64, UNDEFINED, sizeof (base64));
67 for (i = 0; i < sizeof (base64chars); ++i)
68 {
69 base64[(int)base64chars[i]] = i;
70 }
71
72 /* loop until end of string */
73 while (srcPtr < srcLen)
74 {
75 c = src[srcPtr++];
76 /* deal with literal characters and &- */
77 if (c != '&' || src[srcPtr] == '-')
78 {
79 /* encode literally */
80 dst += c;
81 /* skip over the '-' if this is an &- sequence */
82 if (c == '&')
83 srcPtr++;
84 }
85 else
86 {
87 /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
88 bitbuf = 0;
89 bitcount = 0;
90 ucs4 = 0;
91 while ((c = base64[(unsigned char) src[srcPtr]]) != UNDEFINED)
92 {
93 ++srcPtr;
94 bitbuf = (bitbuf << 6) | c;
95 bitcount += 6;
96 /* enough bits for a UTF-16 character? */
97 if (bitcount >= 16)
98 {
99 bitcount -= 16;
100 utf16 = (bitcount ? bitbuf >> bitcount : bitbuf) & 0xffff;
101 /* convert UTF16 to UCS4 */
102 if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND)
103 {
104 ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT;
105 continue;
106 }
107 else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND)
108 {
109 ucs4 += utf16 - UTF16LOSTART + UTF16BASE;
110 }
111 else
112 {
113 ucs4 = utf16;
114 }
115 /* convert UTF-16 range of UCS4 to UTF-8 */
116 if (ucs4 <= 0x7fUL)
117 {
118 utf8[0] = ucs4;
119 i = 1;
120 }
121 else if (ucs4 <= 0x7ffUL)
122 {
123 utf8[0] = 0xc0 | (ucs4 >> 6);
124 utf8[1] = 0x80 | (ucs4 & 0x3f);
125 i = 2;
126 }
127 else if (ucs4 <= 0xffffUL)
128 {
129 utf8[0] = 0xe0 | (ucs4 >> 12);
130 utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
131 utf8[2] = 0x80 | (ucs4 & 0x3f);
132 i = 3;
133 }
134 else
135 {
136 utf8[0] = 0xf0 | (ucs4 >> 18);
137 utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
138 utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
139 utf8[3] = 0x80 | (ucs4 & 0x3f);
140 i = 4;
141 }
142 /* copy it */
143 for (c = 0; c < i; ++c)
144 {
145 dst += utf8[c];
146 }
147 }
148 }
149 /* skip over trailing '-' in modified UTF-7 encoding */
150 if (src[srcPtr] == '-')
151 ++srcPtr;
152 }
153 }
154 return TQString::fromUtf8 (dst.data ());
155}
156
157/* replace " with \" and \ with \\ " and \ characters */
158TQString rfcDecoder::quoteIMAP(const TQString &src)
159{
160 uint len = src.length();
161 TQString result;
162 result.reserve(2 * len);
163 for (unsigned int i = 0; i < len; i++)
164 {
165 if (src[i] == '"' || src[i] == '\\')
166 result += '\\';
167 result += src[i];
168 }
169 //result.squeeze(); - unnecessary and slow
170 return result;
171}
172
173/* Convert Unicode path to modified UTF-7 IMAP mailbox
174 */
175TQString rfcDecoder::toIMAP (const TQString & inSrc)
176{
177 unsigned int utf8pos, utf8total, c, utf7mode, bitstogo, utf16flag;
178 unsigned long ucs4, bitbuf;
179 TQCString src = inSrc.utf8 ();
180 TQString dst;
181
182 ulong srcPtr = 0;
183 utf7mode = 0;
184 utf8total = 0;
185 bitstogo = 0;
186 utf8pos = 0;
187 bitbuf = 0;
188 ucs4 = 0;
189 while (srcPtr < src.length ())
190 {
191 c = (unsigned char) src[srcPtr++];
192 /* normal character? */
193 if (c >= ' ' && c <= '~')
194 {
195 /* switch out of UTF-7 mode */
196 if (utf7mode)
197 {
198 if (bitstogo)
199 {
200 dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
201 bitstogo = 0;
202 }
203 dst += '-';
204 utf7mode = 0;
205 }
206 dst += c;
207 /* encode '&' as '&-' */
208 if (c == '&')
209 {
210 dst += '-';
211 }
212 continue;
213 }
214 /* switch to UTF-7 mode */
215 if (!utf7mode)
216 {
217 dst += '&';
218 utf7mode = 1;
219 }
220 /* Encode US-ASCII characters as themselves */
221 if (c < 0x80)
222 {
223 ucs4 = c;
224 utf8total = 1;
225 }
226 else if (utf8total)
227 {
228 /* save UTF8 bits into UCS4 */
229 ucs4 = (ucs4 << 6) | (c & 0x3FUL);
230 if (++utf8pos < utf8total)
231 {
232 continue;
233 }
234 }
235 else
236 {
237 utf8pos = 1;
238 if (c < 0xE0)
239 {
240 utf8total = 2;
241 ucs4 = c & 0x1F;
242 }
243 else if (c < 0xF0)
244 {
245 utf8total = 3;
246 ucs4 = c & 0x0F;
247 }
248 else
249 {
250 /* NOTE: can't convert UTF8 sequences longer than 4 */
251 utf8total = 4;
252 ucs4 = c & 0x03;
253 }
254 continue;
255 }
256 /* loop to split ucs4 into two utf16 chars if necessary */
257 utf8total = 0;
258 do
259 {
260 if (ucs4 >= UTF16BASE)
261 {
262 ucs4 -= UTF16BASE;
263 bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT) + UTF16HIGHSTART);
264 ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
265 utf16flag = 1;
266 }
267 else
268 {
269 bitbuf = (bitbuf << 16) | ucs4;
270 utf16flag = 0;
271 }
272 bitstogo += 16;
273 /* spew out base64 */
274 while (bitstogo >= 6)
275 {
276 bitstogo -= 6;
277 dst += base64chars[(bitstogo ? (bitbuf >> bitstogo) : bitbuf) & 0x3F];
278 }
279 }
280 while (utf16flag);
281 }
282 /* if in UTF-7 mode, finish in ASCII */
283 if (utf7mode)
284 {
285 if (bitstogo)
286 {
287 dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
288 }
289 dst += '-';
290 }
291 return quoteIMAP(dst);
292}
293
294//-----------------------------------------------------------------------------
295TQString rfcDecoder::decodeQuoting(const TQString &aStr)
296{
297 TQString result;
298 unsigned int strLength(aStr.length());
299 for (unsigned int i = 0; i < strLength ; i++)
300 {
301 if (aStr[i] == "\\") i++;
302 result += aStr[i];
303 }
304 return result;
305}
306
307//-----------------------------------------------------------------------------
308TQTextCodec *
309rfcDecoder::codecForName (const TQString & _str)
310{
311 if (_str.isEmpty ())
312 return NULL;
313 return TQTextCodec::codecForName (_str.lower ().
314 replace ("windows", "cp").latin1 ());
315}
316
317//-----------------------------------------------------------------------------
318const TQString
319rfcDecoder::decodeRFC2047String (const TQString & _str)
320{
321 TQString throw_away;
322
323 return decodeRFC2047String (_str, throw_away);
324}
325
326//-----------------------------------------------------------------------------
327const TQString
328rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset)
329{
330 TQString throw_away;
331
332 return decodeRFC2047String (_str, charset, throw_away);
333}
334
335//-----------------------------------------------------------------------------
336const TQString
337rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset,
338 TQString & language)
339{
340 //do we have a rfc string
341 if (_str.find("=?") < 0)
342 return _str;
343
344 TQCString aStr = _str.ascii (); // TQString.length() means Unicode chars
345 TQCString result;
346 char *pos, *beg, *end, *mid = NULL;
347 TQCString str;
348 char encoding = 0, ch;
349 bool valid;
350 const int maxLen = 200;
351 int i;
352
353// result.truncate(aStr.length());
354 for (pos = aStr.data (); *pos; pos++)
355 {
356 if (pos[0] != '=' || pos[1] != '?')
357 {
358 result += *pos;
359 continue;
360 }
361 beg = pos + 2;
362 end = beg;
363 valid = TRUE;
364 // parse charset name
365 for (i = 2, pos += 2;
366 i < maxLen && (*pos != '?' && (ispunct (*pos) || isalnum (*pos)));
367 i++)
368 pos++;
369 if (*pos != '?' || i < 4 || i >= maxLen)
370 valid = FALSE;
371 else
372 {
373 charset = TQCString (beg, i - 1); // -2 + 1 for the zero
374 int pt = charset.findRev('*');
375 if (pt != -1)
376 {
377 // save language for later usage
378 language = charset.right (charset.length () - pt - 1);
379
380 // tie off language as defined in rfc2047
381 charset.truncate(pt);
382 }
383 // get encoding and check delimiting question marks
384 encoding = toupper (pos[1]);
385 if (pos[2] != '?'
386 || (encoding != 'Q' && encoding != 'B' && encoding != 'q'
387 && encoding != 'b'))
388 valid = FALSE;
389 pos += 3;
390 i += 3;
391// kdDebug(7116) << "rfcDecoder::decodeRFC2047String - charset " << charset << " - language " << language << " - '" << pos << "'" << endl;
392 }
393 if (valid)
394 {
395 mid = pos;
396 // search for end of encoded part
397 while (i < maxLen && *pos && !(*pos == '?' && *(pos + 1) == '='))
398 {
399 i++;
400 pos++;
401 }
402 end = pos + 2; //end now points to the first char after the encoded string
403 if (i >= maxLen || !*pos)
404 valid = FALSE;
405 }
406 if (valid)
407 {
408 ch = *pos;
409 *pos = '\0';
410 str = TQCString (mid).left ((int) (mid - pos - 1));
411 if (encoding == 'Q')
412 {
413 // decode quoted printable text
414 for (i = str.length () - 1; i >= 0; i--)
415 if (str[i] == '_')
416 str[i] = ' ';
417// kdDebug(7116) << "rfcDecoder::decodeRFC2047String - before QP '" << str << "'" << endl;
418
419 str = KCodecs::quotedPrintableDecode(str);
420// kdDebug(7116) << "rfcDecoder::decodeRFC2047String - after QP '" << str << "'" << endl;
421 }
422 else
423 {
424 // decode base64 text
425 str = KCodecs::base64Decode(str);
426 }
427 *pos = ch;
428 int len = str.length();
429 for (i = 0; i < len; i++)
430 result += (char) (TQChar) str[i];
431
432 pos = end - 1;
433 }
434 else
435 {
436// kdDebug(7116) << "rfcDecoder::decodeRFC2047String - invalid" << endl;
437 //result += "=?";
438 //pos = beg -1; // because pos gets increased shortly afterwards
439 pos = beg - 2;
440 result += *pos++;
441 result += *pos;
442 }
443 }
444 if (!charset.isEmpty ())
445 {
446 TQTextCodec *aCodec = codecForName (charset.ascii ());
447 if (aCodec)
448 {
449// kdDebug(7116) << "Codec is " << aCodec->name() << endl;
450 return aCodec->toUnicode (result);
451 }
452 }
453 return result;
454}
455
456
457//-----------------------------------------------------------------------------
458const char especials[17] = "()<>@,;:\"/[]?.= ";
459
460const TQString
461rfcDecoder::encodeRFC2047String (const TQString & _str)
462{
463 if (_str.isEmpty ())
464 return _str;
465 const signed char *latin = reinterpret_cast<const signed char *>(_str.latin1()), *l, *start, *stop;
466 char hexcode;
467 int numQuotes, i;
468 int rptr = 0;
469 // My stats show this number results in 12 resize() out of 73,000
470 int resultLen = 3 * _str.length() / 2;
471 TQCString result(resultLen);
472
473 while (*latin)
474 {
475 l = latin;
476 start = latin;
477 while (*l)
478 {
479 if (*l == 32)
480 start = l + 1;
481 if (*l < 0)
482 break;
483 l++;
484 }
485 if (*l)
486 {
487 numQuotes = 1;
488 while (*l)
489 {
490 /* The encoded word must be limited to 75 character */
491 for (i = 0; i < 16; i++)
492 if (*l == especials[i])
493 numQuotes++;
494 if (*l < 0)
495 numQuotes++;
496 /* Stop after 58 = 75 - 17 characters or at "<user@host..." */
497 if (l - start + 2 * numQuotes >= 58 || *l == 60)
498 break;
499 l++;
500 }
501 if (*l)
502 {
503 stop = l - 1;
504 while (stop >= start && *stop != 32)
505 stop--;
506 if (stop <= start)
507 stop = l;
508 }
509 else
510 stop = l;
511 if (resultLen - rptr - 1 <= start - latin + 1 + 16 /* =?iso-88... */) {
512 resultLen += (start - latin + 1) * 2 + 20; // more space
513 result.resize(resultLen);
514 }
515 while (latin < start)
516 {
517 result[rptr++] = *latin;
518 latin++;
519 }
520 strcpy(&result[rptr], "=?iso-8859-1?q?"); rptr += 15;
521 if (resultLen - rptr - 1 <= 3*(stop - latin + 1)) {
522 resultLen += (stop - latin + 1) * 4 + 20; // more space
523 result.resize(resultLen);
524 }
525 while (latin < stop) // can add up to 3 chars/iteration
526 {
527 numQuotes = 0;
528 for (i = 0; i < 16; i++)
529 if (*latin == especials[i])
530 numQuotes = 1;
531 if (*latin < 0)
532 numQuotes = 1;
533 if (numQuotes)
534 {
535 result[rptr++] = '=';
536 hexcode = ((*latin & 0xF0) >> 4) + 48;
537 if (hexcode >= 58)
538 hexcode += 7;
539 result[rptr++] = hexcode;
540 hexcode = (*latin & 0x0F) + 48;
541 if (hexcode >= 58)
542 hexcode += 7;
543 result[rptr++] = hexcode;
544 }
545 else
546 {
547 result[rptr++] = *latin;
548 }
549 latin++;
550 }
551 result[rptr++] = '?';
552 result[rptr++] = '=';
553 }
554 else
555 {
556 while (*latin)
557 {
558 if (rptr == resultLen - 1) {
559 resultLen += 30;
560 result.resize(resultLen);
561 }
562 result[rptr++] = *latin;
563 latin++;
564 }
565 }
566 }
567 result[rptr] = 0;
568 //free (latinStart);
569 return result;
570}
571
572
573//-----------------------------------------------------------------------------
574const TQString
575rfcDecoder::encodeRFC2231String (const TQString & _str)
576{
577 if (_str.isEmpty ())
578 return _str;
579 signed char *latin = (signed char *) calloc (1, _str.length () + 1);
580 char *latin_us = (char *) latin;
581 strcpy (latin_us, _str.latin1 ());
582 signed char *l = latin;
583 char hexcode;
584 int i;
585 bool quote;
586 while (*l)
587 {
588 if (*l < 0)
589 break;
590 l++;
591 }
592 if (!*l) {
593 free(latin);
594 return _str;
595 }
596 TQCString result;
597 l = latin;
598 while (*l)
599 {
600 quote = *l < 0;
601 for (i = 0; i < 16; i++)
602 if (*l == especials[i])
603 quote = true;
604 if (quote)
605 {
606 result += "%";
607 hexcode = ((*l & 0xF0) >> 4) + 48;
608 if (hexcode >= 58)
609 hexcode += 7;
610 result += hexcode;
611 hexcode = (*l & 0x0F) + 48;
612 if (hexcode >= 58)
613 hexcode += 7;
614 result += hexcode;
615 }
616 else
617 {
618 result += *l;
619 }
620 l++;
621 }
622 free (latin);
623 return result;
624}
625
626
627//-----------------------------------------------------------------------------
628const TQString
629rfcDecoder::decodeRFC2231String (const TQString & _str)
630{
631 int p = _str.find ('\'');
632
633 //see if it is an rfc string
634 if (p < 0)
635 return _str;
636
637 int l = _str.findRev ('\'');
638
639 //second is language
640 if (p >= l)
641 return _str;
642
643 //first is charset or empty
644 TQString charset = _str.left (p);
645 TQString st = _str.mid (l + 1);
646 TQString language = _str.mid (p + 1, l - p - 1);
647
648 //kdDebug(7116) << "Charset: " << charset << " Language: " << language << endl;
649
650 char ch, ch2;
651 p = 0;
652 while (p < (int) st.length ())
653 {
654 if (st.at (p) == 37)
655 {
656 ch = st.at (p + 1).latin1 () - 48;
657 if (ch > 16)
658 ch -= 7;
659 ch2 = st.at (p + 2).latin1 () - 48;
660 if (ch2 > 16)
661 ch2 -= 7;
662 st.at (p) = ch * 16 + ch2;
663 st.remove (p + 1, 2);
664 }
665 p++;
666 }
667 return st;
668}
static const TQString decodeRFC2047String(const TQString &_str, TQString &charset, TQString &language)
Decode an RFC 2047 encoded string.
Definition: rfcdecoder.cpp:337
static const TQString encodeRFC2231String(const TQString &_str)
Encode a string as an RFC 2231 parameter value.
Definition: rfcdecoder.cpp:575
static TQString decodeQuoting(const TQString &aStr)
Remove the quoting backslashes (\) from a string.
Definition: rfcdecoder.cpp:295
static TQString toIMAP(const TQString &inSrc)
Convert Unicode path to modified UTF-7 IMAP mailbox.
Definition: rfcdecoder.cpp:175
static TQTextCodec * codecForName(const TQString &)
fetch a codec by name
Definition: rfcdecoder.cpp:309
static TQString quoteIMAP(const TQString &src)
Escape the characters " and \ by prefixing each with a backslash (" becomes \" and \ becomes \\).
Definition: rfcdecoder.cpp:158
static const TQString decodeRFC2231String(const TQString &_str)
Decode an RFC 2231 encoded string.
Definition: rfcdecoder.cpp:629
static TQString fromIMAP(const TQString &src)
Convert an IMAP mailbox to a Unicode path.
Definition: rfcdecoder.cpp:55
static const TQString encodeRFC2047String(const TQString &_str, TQString &charset, TQString &language)
Encode a string as RFC 2047 encoded-words.