tdeioslave/imap4

rfcdecoder.cpp
1 /**********************************************************************
2  *
3  * rfcdecoder.cpp - handler for various rfc/mime encodings
4  * Copyright (C) 2000 s.carstens@gmx.de
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19  *
20  * Send comments and bug fixes to s.carstens@gmx.de
21  *
22  *********************************************************************/
23 #include "rfcdecoder.h"
24 
25 #include <ctype.h>
26 #include <sys/types.h>
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 
31 #include <tqtextcodec.h>
32 #include <tqbuffer.h>
33 #include <tqregexp.h>
34 #include <kmdcodec.h>
35 
36 // This part taken from rfc 2192 IMAP URL Scheme. C. Newman. September 1997.
37 // adapted to QT-Toolkit by Sven Carstens <s.carstens@gmx.de> 2000
38 
/* Alphabet of the modified base64 encoding used for IMAP mailbox names: the
 * usual base64 symbols, except that the last one is ',' instead of '/'.
 * The array also contains the terminating NUL of the string literal, so
 * sizeof (base64chars) is 65 while only the first 64 entries are symbols.
 * The table is read-only lookup data, hence const. */
static const unsigned char base64chars[] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/* value stored in a decoding table for bytes that are not base64 symbols */
#define UNDEFINED 64
#define MAXLINE 76

/* UTF16 definitions */
#define UTF16MASK 0x03FFUL          /* payload bits of one surrogate */
#define UTF16SHIFT 10               /* payload bits carried per surrogate */
#define UTF16BASE 0x10000UL         /* first code point needing a surrogate pair */
#define UTF16HIGHSTART 0xD800UL     /* high (leading) surrogate range */
#define UTF16HIGHEND 0xDBFFUL
#define UTF16LOSTART 0xDC00UL       /* low (trailing) surrogate range */
#define UTF16LOEND 0xDFFFUL
52 
53 /* Convert an IMAP mailbox to a Unicode path
54  */
55 TQString rfcDecoder::fromIMAP (const TQString & inSrc)
56 {
57  unsigned char c, i, bitcount;
58  unsigned long ucs4, utf16, bitbuf;
59  unsigned char base64[256], utf8[6];
60  unsigned long srcPtr = 0;
61  TQCString dst;
62  TQCString src = inSrc.ascii ();
63  uint srcLen = inSrc.length();
64 
65  /* initialize modified base64 decoding table */
66  memset (base64, UNDEFINED, sizeof (base64));
67  for (i = 0; i < sizeof (base64chars); ++i)
68  {
69  base64[(int)base64chars[i]] = i;
70  }
71 
72  /* loop until end of string */
73  while (srcPtr < srcLen)
74  {
75  c = src[srcPtr++];
76  /* deal with literal characters and &- */
77  if (c != '&' || src[srcPtr] == '-')
78  {
79  /* encode literally */
80  dst += c;
81  /* skip over the '-' if this is an &- sequence */
82  if (c == '&')
83  srcPtr++;
84  }
85  else
86  {
87  /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
88  bitbuf = 0;
89  bitcount = 0;
90  ucs4 = 0;
91  while ((c = base64[(unsigned char) src[srcPtr]]) != UNDEFINED)
92  {
93  ++srcPtr;
94  bitbuf = (bitbuf << 6) | c;
95  bitcount += 6;
96  /* enough bits for a UTF-16 character? */
97  if (bitcount >= 16)
98  {
99  bitcount -= 16;
100  utf16 = (bitcount ? bitbuf >> bitcount : bitbuf) & 0xffff;
101  /* convert UTF16 to UCS4 */
102  if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND)
103  {
104  ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT;
105  continue;
106  }
107  else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND)
108  {
109  ucs4 += utf16 - UTF16LOSTART + UTF16BASE;
110  }
111  else
112  {
113  ucs4 = utf16;
114  }
115  /* convert UTF-16 range of UCS4 to UTF-8 */
116  if (ucs4 <= 0x7fUL)
117  {
118  utf8[0] = ucs4;
119  i = 1;
120  }
121  else if (ucs4 <= 0x7ffUL)
122  {
123  utf8[0] = 0xc0 | (ucs4 >> 6);
124  utf8[1] = 0x80 | (ucs4 & 0x3f);
125  i = 2;
126  }
127  else if (ucs4 <= 0xffffUL)
128  {
129  utf8[0] = 0xe0 | (ucs4 >> 12);
130  utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
131  utf8[2] = 0x80 | (ucs4 & 0x3f);
132  i = 3;
133  }
134  else
135  {
136  utf8[0] = 0xf0 | (ucs4 >> 18);
137  utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
138  utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
139  utf8[3] = 0x80 | (ucs4 & 0x3f);
140  i = 4;
141  }
142  /* copy it */
143  for (c = 0; c < i; ++c)
144  {
145  dst += utf8[c];
146  }
147  }
148  }
149  /* skip over trailing '-' in modified UTF-7 encoding */
150  if (src[srcPtr] == '-')
151  ++srcPtr;
152  }
153  }
154  return TQString::fromUtf8 (dst.data ());
155 }
156 
157 /* replace " with \" and \ with \\ " and \ characters */
158 TQString rfcDecoder::quoteIMAP(const TQString &src)
159 {
160  uint len = src.length();
161  TQString result;
162  result.reserve(2 * len);
163  for (unsigned int i = 0; i < len; i++)
164  {
165  if (src[i] == '"' || src[i] == '\\')
166  result += '\\';
167  result += src[i];
168  }
169  //result.squeeze(); - unnecessary and slow
170  return result;
171 }
172 
173 /* Convert Unicode path to modified UTF-7 IMAP mailbox
174  */
175 TQString rfcDecoder::toIMAP (const TQString & inSrc)
176 {
177  unsigned int utf8pos, utf8total, c, utf7mode, bitstogo, utf16flag;
178  unsigned long ucs4, bitbuf;
179  TQCString src = inSrc.utf8 ();
180  TQString dst;
181 
182  ulong srcPtr = 0;
183  utf7mode = 0;
184  utf8total = 0;
185  bitstogo = 0;
186  utf8pos = 0;
187  bitbuf = 0;
188  ucs4 = 0;
189  while (srcPtr < src.length ())
190  {
191  c = (unsigned char) src[srcPtr++];
192  /* normal character? */
193  if (c >= ' ' && c <= '~')
194  {
195  /* switch out of UTF-7 mode */
196  if (utf7mode)
197  {
198  if (bitstogo)
199  {
200  dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
201  bitstogo = 0;
202  }
203  dst += '-';
204  utf7mode = 0;
205  }
206  dst += c;
207  /* encode '&' as '&-' */
208  if (c == '&')
209  {
210  dst += '-';
211  }
212  continue;
213  }
214  /* switch to UTF-7 mode */
215  if (!utf7mode)
216  {
217  dst += '&';
218  utf7mode = 1;
219  }
220  /* Encode US-ASCII characters as themselves */
221  if (c < 0x80)
222  {
223  ucs4 = c;
224  utf8total = 1;
225  }
226  else if (utf8total)
227  {
228  /* save UTF8 bits into UCS4 */
229  ucs4 = (ucs4 << 6) | (c & 0x3FUL);
230  if (++utf8pos < utf8total)
231  {
232  continue;
233  }
234  }
235  else
236  {
237  utf8pos = 1;
238  if (c < 0xE0)
239  {
240  utf8total = 2;
241  ucs4 = c & 0x1F;
242  }
243  else if (c < 0xF0)
244  {
245  utf8total = 3;
246  ucs4 = c & 0x0F;
247  }
248  else
249  {
250  /* NOTE: can't convert UTF8 sequences longer than 4 */
251  utf8total = 4;
252  ucs4 = c & 0x03;
253  }
254  continue;
255  }
256  /* loop to split ucs4 into two utf16 chars if necessary */
257  utf8total = 0;
258  do
259  {
260  if (ucs4 >= UTF16BASE)
261  {
262  ucs4 -= UTF16BASE;
263  bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT) + UTF16HIGHSTART);
264  ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
265  utf16flag = 1;
266  }
267  else
268  {
269  bitbuf = (bitbuf << 16) | ucs4;
270  utf16flag = 0;
271  }
272  bitstogo += 16;
273  /* spew out base64 */
274  while (bitstogo >= 6)
275  {
276  bitstogo -= 6;
277  dst += base64chars[(bitstogo ? (bitbuf >> bitstogo) : bitbuf) & 0x3F];
278  }
279  }
280  while (utf16flag);
281  }
282  /* if in UTF-7 mode, finish in ASCII */
283  if (utf7mode)
284  {
285  if (bitstogo)
286  {
287  dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
288  }
289  dst += '-';
290  }
291  return quoteIMAP(dst);
292 }
293 
294 //-----------------------------------------------------------------------------
295 TQString rfcDecoder::decodeQuoting(const TQString &aStr)
296 {
297  TQString result;
298  unsigned int strLength(aStr.length());
299  for (unsigned int i = 0; i < strLength ; i++)
300  {
301  if (aStr[i] == "\\") i++;
302  result += aStr[i];
303  }
304  return result;
305 }
306 
307 //-----------------------------------------------------------------------------
308 TQTextCodec *
309 rfcDecoder::codecForName (const TQString & _str)
310 {
311  if (_str.isEmpty ())
312  return NULL;
313  return TQTextCodec::codecForName (_str.lower ().
314  replace ("windows", "cp").latin1 ());
315 }
316 
317 //-----------------------------------------------------------------------------
318 const TQString
319 rfcDecoder::decodeRFC2047String (const TQString & _str)
320 {
321  TQString throw_away;
322 
323  return decodeRFC2047String (_str, throw_away);
324 }
325 
326 //-----------------------------------------------------------------------------
327 const TQString
328 rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset)
329 {
330  TQString throw_away;
331 
332  return decodeRFC2047String (_str, charset, throw_away);
333 }
334 
335 //-----------------------------------------------------------------------------
336 const TQString
337 rfcDecoder::decodeRFC2047String (const TQString & _str, TQString & charset,
338  TQString & language)
339 {
340  //do we have a rfc string
341  if (_str.find("=?") < 0)
342  return _str;
343 
344  TQCString aStr = _str.ascii (); // TQString.length() means Unicode chars
345  TQCString result;
346  char *pos, *beg, *end, *mid = NULL;
347  TQCString str;
348  char encoding = 0, ch;
349  bool valid;
350  const int maxLen = 200;
351  int i;
352 
353 // result.truncate(aStr.length());
354  for (pos = aStr.data (); *pos; pos++)
355  {
356  if (pos[0] != '=' || pos[1] != '?')
357  {
358  result += *pos;
359  continue;
360  }
361  beg = pos + 2;
362  end = beg;
363  valid = TRUE;
364  // parse charset name
365  for (i = 2, pos += 2;
366  i < maxLen && (*pos != '?' && (ispunct (*pos) || isalnum (*pos)));
367  i++)
368  pos++;
369  if (*pos != '?' || i < 4 || i >= maxLen)
370  valid = FALSE;
371  else
372  {
373  charset = TQCString (beg, i - 1); // -2 + 1 for the zero
374  int pt = charset.findRev('*');
375  if (pt != -1)
376  {
377  // save language for later usage
378  language = charset.right (charset.length () - pt - 1);
379 
380  // tie off language as defined in rfc2047
381  charset.truncate(pt);
382  }
383  // get encoding and check delimiting question marks
384  encoding = toupper (pos[1]);
385  if (pos[2] != '?'
386  || (encoding != 'Q' && encoding != 'B' && encoding != 'q'
387  && encoding != 'b'))
388  valid = FALSE;
389  pos += 3;
390  i += 3;
391 // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - charset " << charset << " - language " << language << " - '" << pos << "'" << endl;
392  }
393  if (valid)
394  {
395  mid = pos;
396  // search for end of encoded part
397  while (i < maxLen && *pos && !(*pos == '?' && *(pos + 1) == '='))
398  {
399  i++;
400  pos++;
401  }
402  end = pos + 2; //end now points to the first char after the encoded string
403  if (i >= maxLen || !*pos)
404  valid = FALSE;
405  }
406  if (valid)
407  {
408  ch = *pos;
409  *pos = '\0';
410  str = TQCString (mid).left ((int) (mid - pos - 1));
411  if (encoding == 'Q')
412  {
413  // decode quoted printable text
414  for (i = str.length () - 1; i >= 0; i--)
415  if (str[i] == '_')
416  str[i] = ' ';
417 // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - before QP '" << str << "'" << endl;
418 
419  str = KCodecs::quotedPrintableDecode(str);
420 // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - after QP '" << str << "'" << endl;
421  }
422  else
423  {
424  // decode base64 text
425  str = KCodecs::base64Decode(str);
426  }
427  *pos = ch;
428  int len = str.length();
429  for (i = 0; i < len; i++)
430  result += (char) (TQChar) str[i];
431 
432  pos = end - 1;
433  }
434  else
435  {
436 // kdDebug(7116) << "rfcDecoder::decodeRFC2047String - invalid" << endl;
437  //result += "=?";
438  //pos = beg -1; // because pos gets increased shortly afterwards
439  pos = beg - 2;
440  result += *pos++;
441  result += *pos;
442  }
443  }
444  if (!charset.isEmpty ())
445  {
446  TQTextCodec *aCodec = codecForName (charset.ascii ());
447  if (aCodec)
448  {
449 // kdDebug(7116) << "Codec is " << aCodec->name() << endl;
450  return aCodec->toUnicode (result);
451  }
452  }
453  return result;
454 }
455 
456 
457 //-----------------------------------------------------------------------------
/* The 16 characters that the encoders below always write in escaped form;
 * together with the terminating NUL the array has 17 elements. */
const char especials[] = "()<>@,;:\"/[]?.= ";
459 
460 const TQString
461 rfcDecoder::encodeRFC2047String (const TQString & _str)
462 {
463  if (_str.isEmpty ())
464  return _str;
465  const signed char *latin = reinterpret_cast<const signed char *>(_str.latin1()), *l, *start, *stop;
466  char hexcode;
467  int numQuotes, i;
468  int rptr = 0;
469  // My stats show this number results in 12 resize() out of 73,000
470  int resultLen = 3 * _str.length() / 2;
471  TQCString result(resultLen);
472 
473  while (*latin)
474  {
475  l = latin;
476  start = latin;
477  while (*l)
478  {
479  if (*l == 32)
480  start = l + 1;
481  if (*l < 0)
482  break;
483  l++;
484  }
485  if (*l)
486  {
487  numQuotes = 1;
488  while (*l)
489  {
490  /* The encoded word must be limited to 75 character */
491  for (i = 0; i < 16; i++)
492  if (*l == especials[i])
493  numQuotes++;
494  if (*l < 0)
495  numQuotes++;
496  /* Stop after 58 = 75 - 17 characters or at "<user@host..." */
497  if (l - start + 2 * numQuotes >= 58 || *l == 60)
498  break;
499  l++;
500  }
501  if (*l)
502  {
503  stop = l - 1;
504  while (stop >= start && *stop != 32)
505  stop--;
506  if (stop <= start)
507  stop = l;
508  }
509  else
510  stop = l;
511  if (resultLen - rptr - 1 <= start - latin + 1 + 16 /* =?iso-88... */) {
512  resultLen += (start - latin + 1) * 2 + 20; // more space
513  result.resize(resultLen);
514  }
515  while (latin < start)
516  {
517  result[rptr++] = *latin;
518  latin++;
519  }
520  strcpy(&result[rptr], "=?iso-8859-1?q?"); rptr += 15;
521  if (resultLen - rptr - 1 <= 3*(stop - latin + 1)) {
522  resultLen += (stop - latin + 1) * 4 + 20; // more space
523  result.resize(resultLen);
524  }
525  while (latin < stop) // can add up to 3 chars/iteration
526  {
527  numQuotes = 0;
528  for (i = 0; i < 16; i++)
529  if (*latin == especials[i])
530  numQuotes = 1;
531  if (*latin < 0)
532  numQuotes = 1;
533  if (numQuotes)
534  {
535  result[rptr++] = '=';
536  hexcode = ((*latin & 0xF0) >> 4) + 48;
537  if (hexcode >= 58)
538  hexcode += 7;
539  result[rptr++] = hexcode;
540  hexcode = (*latin & 0x0F) + 48;
541  if (hexcode >= 58)
542  hexcode += 7;
543  result[rptr++] = hexcode;
544  }
545  else
546  {
547  result[rptr++] = *latin;
548  }
549  latin++;
550  }
551  result[rptr++] = '?';
552  result[rptr++] = '=';
553  }
554  else
555  {
556  while (*latin)
557  {
558  if (rptr == resultLen - 1) {
559  resultLen += 30;
560  result.resize(resultLen);
561  }
562  result[rptr++] = *latin;
563  latin++;
564  }
565  }
566  }
567  result[rptr] = 0;
568  //free (latinStart);
569  return result;
570 }
571 
572 
573 //-----------------------------------------------------------------------------
574 const TQString
575 rfcDecoder::encodeRFC2231String (const TQString & _str)
576 {
577  if (_str.isEmpty ())
578  return _str;
579  signed char *latin = (signed char *) calloc (1, _str.length () + 1);
580  char *latin_us = (char *) latin;
581  strcpy (latin_us, _str.latin1 ());
582  signed char *l = latin;
583  char hexcode;
584  int i;
585  bool quote;
586  while (*l)
587  {
588  if (*l < 0)
589  break;
590  l++;
591  }
592  if (!*l) {
593  free(latin);
594  return _str;
595  }
596  TQCString result;
597  l = latin;
598  while (*l)
599  {
600  quote = *l < 0;
601  for (i = 0; i < 16; i++)
602  if (*l == especials[i])
603  quote = true;
604  if (quote)
605  {
606  result += "%";
607  hexcode = ((*l & 0xF0) >> 4) + 48;
608  if (hexcode >= 58)
609  hexcode += 7;
610  result += hexcode;
611  hexcode = (*l & 0x0F) + 48;
612  if (hexcode >= 58)
613  hexcode += 7;
614  result += hexcode;
615  }
616  else
617  {
618  result += *l;
619  }
620  l++;
621  }
622  free (latin);
623  return result;
624 }
625 
626 
627 //-----------------------------------------------------------------------------
628 const TQString
629 rfcDecoder::decodeRFC2231String (const TQString & _str)
630 {
631  int p = _str.find ('\'');
632 
633  //see if it is an rfc string
634  if (p < 0)
635  return _str;
636 
637  int l = _str.findRev ('\'');
638 
639  //second is language
640  if (p >= l)
641  return _str;
642 
643  //first is charset or empty
644  TQString charset = _str.left (p);
645  TQString st = _str.mid (l + 1);
646  TQString language = _str.mid (p + 1, l - p - 1);
647 
648  //kdDebug(7116) << "Charset: " << charset << " Language: " << language << endl;
649 
650  char ch, ch2;
651  p = 0;
652  while (p < (int) st.length ())
653  {
654  if (st.at (p) == 37)
655  {
656  ch = st.at (p + 1).latin1 () - 48;
657  if (ch > 16)
658  ch -= 7;
659  ch2 = st.at (p + 2).latin1 () - 48;
660  if (ch2 > 16)
661  ch2 -= 7;
662  st.at (p) = ch * 16 + ch2;
663  st.remove (p + 1, 2);
664  }
665  p++;
666  }
667  return st;
668 }
static const TQString decodeRFC2047String(const TQString &_str, TQString &charset, TQString &language)
decode a RFC2047 String
Definition: rfcdecoder.cpp:337
static const TQString encodeRFC2231String(const TQString &_str)
encode a RFC2231 String
Definition: rfcdecoder.cpp:575
static TQString decodeQuoting(const TQString &aStr)
remove \ from a string
Definition: rfcdecoder.cpp:295
static TQString toIMAP(const TQString &inSrc)
Convert Unicode path to modified UTF-7 IMAP mailbox.
Definition: rfcdecoder.cpp:175
static TQTextCodec * codecForName(const TQString &)
fetch a codec by name
Definition: rfcdecoder.cpp:309
static TQString quoteIMAP(const TQString &src)
Escape every " and \ character with a preceding backslash.
Definition: rfcdecoder.cpp:158
static const TQString decodeRFC2231String(const TQString &_str)
decode a RFC2231 String
Definition: rfcdecoder.cpp:629
static TQString fromIMAP(const TQString &src)
Convert an IMAP mailbox to a Unicode path.
Definition: rfcdecoder.cpp:55
static const TQString encodeRFC2047String(const TQString &_str, TQString &charset, TQString &language)
encode a RFC2047 String