libkmime

kmime_codec_qp.cpp
1 /*
2  kmime_codec_qp.cpp
3 
4  This file is part of KMime, the KDE internet mail/usenet news message library.
5  Copyright (c) 2002 Marc Mutz <mutz@kde.org>
6 
7  KMime is free software; you can redistribute it and/or modify it
8  under the terms of the GNU General Public License, version 2, as
9  published by the Free Software Foundation.
10 
11  KMime is distributed in the hope that it will be useful, but
12  WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this library; if not, write to the Free Software
18  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 
20  In addition, as a special exception, the copyright holders give
21  permission to link the code of this library with any edition of
22  the TQt library by Trolltech AS, Norway (or with modified versions
23  of TQt that use the same license as TQt), and distribute linked
24  combinations including the two. You must obey the GNU General
25  Public License in all respects for all of the code used other than
26  TQt. If you modify this file, you may extend this exception to
27  your version of the file, but you are not obligated to do so. If
28  you do not wish to do so, delete this exception statement from
29  your version.
30 */
31 
32 #include "kmime_codec_qp.h"
33 
34 #include "kmime_util.h"
35 
36 #include <kdebug.h>
37 
38 #include <cassert>
39 
40 using namespace KMime;
41 
42 namespace KMime {
43 
44 // some helpful functions:
45 
// Converts a nibble value into its upper-case hexadecimal digit:
// 0..9 map to '0'..'9', 10 and above map to 'A' onwards.
// No range check is performed on the argument.
static inline char binToHex( uchar value ) {
  return ( value <= 9 ) ? char( '0' + value )
                        : char( 'A' + ( value - 10 ) );
}
52 
// Returns the upper four bits of ch, moved down into the range 0..15.
static inline uchar highNibble( uchar ch ) {
  return uchar( ch / 16 );
}
56 
// Returns the lower four bits of ch (range 0..15).
static inline uchar lowNibble( uchar ch ) {
  return uchar( ch % 16 );
}
60 
// Tells whether ch may be kept as-is: '?' and all control characters
// below SPACE are rejected, with the single exception of HT.
static inline bool keep( uchar ch ) {
  if ( ch == '?' )
    return false;
  if ( ch == '\t' )
    return true;
  return ch >= ' ';
}
65 
66 //
67 // QuotedPrintableCodec
68 //
69 
70 class QuotedPrintableEncoder : public Encoder {
71  char mInputBuffer[16];
72  uchar mCurrentLineLength; // 0..76
73  uchar mAccu;
74  uint mInputBufferReadCursor : 4; // 0..15
75  uint mInputBufferWriteCursor : 4; // 0..15
76  enum {
77  Never, AtBOL, Definitely
78  } mAccuNeedsEncoding : 2;
79  bool mSawLineEnd : 1;
80  bool mSawCR : 1;
81  bool mFinishing : 1;
82  bool mFinished : 1;
83 protected:
84  friend class QuotedPrintableCodec;
85  QuotedPrintableEncoder( bool withCRLF=false )
86  : Encoder( withCRLF ), mCurrentLineLength(0), mAccu(0),
87  mInputBufferReadCursor(0), mInputBufferWriteCursor(0),
88  mAccuNeedsEncoding(Never),
89  mSawLineEnd(false), mSawCR(false), mFinishing(false),
90  mFinished(false) {}
91 
92  bool needsEncoding( uchar ch ) {
93  return ( (ch > '~') || ((ch < ' ') && (ch != '\t')) || (ch == '=') );
94  }
95  bool needsEncodingAtEOL( uchar ch ) {
96  return ( ch == ' ' || ch == '\t' );
97  }
98  bool needsEncodingAtBOL( uchar ch ) {
99  return ( ch == 'F' || ch == '.' || ch == '-' );
100  }
101  bool fillInputBuffer( const char* & scursor, const char * const send );
102  bool processNextChar();
103  void createOutputBuffer( char* & dcursor, const char * const dend );
104 public:
105  virtual ~QuotedPrintableEncoder() {}
106 
107  bool encode( const char* & scursor, const char * const send,
108  char* & dcursor, const char * const dend );
109 
110  bool finish( char* & dcursor, const char * const dend );
111 };
112 
113 
114 class QuotedPrintableDecoder : public Decoder {
115  const char mEscapeChar;
116  char mBadChar;
118  uchar mAccu;
128  const bool mTQEncoding;
129  bool mInsideHexChar;
130  bool mFlushing;
131  bool mExpectLF;
132  bool mHaveAccu;
133 protected:
134  friend class QuotedPrintableCodec;
135  friend class Rfc2047TQEncodingCodec;
136  friend class Rfc2231EncodingCodec;
137  QuotedPrintableDecoder( bool withCRLF=false,
138  bool aTQEncoding=false, char aEscapeChar='=' )
139  : Decoder( withCRLF ),
140  mEscapeChar(aEscapeChar),
141  mBadChar(0),
142  mAccu(0),
143  mTQEncoding(aTQEncoding),
144  mInsideHexChar(false),
145  mFlushing(false),
146  mExpectLF(false),
147  mHaveAccu(false) {}
148 public:
149  virtual ~QuotedPrintableDecoder() {}
150 
151  bool decode( const char* & scursor, const char * const send,
152  char* & dcursor, const char * const dend );
153  // ### really no finishing needed???
154  bool finish( char* &, const char * const ) { return true; }
155 };
156 
157 
158 class Rfc2047TQEncodingEncoder : public Encoder {
159  uchar mAccu;
160  uchar mStepNo;
161  const char mEscapeChar;
162  bool mInsideFinishing : 1;
163 protected:
164  friend class Rfc2047TQEncodingCodec;
165  friend class Rfc2231EncodingCodec;
166  Rfc2047TQEncodingEncoder( bool withCRLF=false, char aEscapeChar='=' )
167  : Encoder( withCRLF ),
168  mAccu(0), mStepNo(0), mEscapeChar( aEscapeChar ),
169  mInsideFinishing( false )
170  {
171  // else an optimization in ::encode might break.
172  assert( aEscapeChar == '=' || aEscapeChar == '%' );
173  }
174 
175  // this code assumes that isEText( mEscapeChar ) == false!
176  bool needsEncoding( uchar ch ) {
177  if ( ch > 'z' ) return true; // {|}~ DEL and 8bit chars need
178  if ( !isEText( ch ) ) return true; // all but a-zA-Z0-9!/*+- need, too
179  if ( mEscapeChar == '%' && ( ch == '*' || ch == '/' ) )
180  return true; // not allowed in rfc2231 encoding
181  return false;
182  }
183 
184 public:
185  virtual ~Rfc2047TQEncodingEncoder() {}
186 
187  bool encode( const char* & scursor, const char * const send,
188  char* & dcursor, const char * const dend );
189  bool finish( char* & dcursor, const char * const dend );
190 };
191 
// Upper bound for the size of the decoded form of insize input bytes.
// Kept as a free static function since it needs no member variables; the
// codecs' maxDecodedSizeFor() methods forward to it.
static int QuotedPrintableDecoder_maxDecodedSizeFor( int insize, bool withCRLF ) {
  // Worst case every input char is passed through unencoded. With CRLF
  // output each of them might be a \n that grows into \r\n.
  const int payload = withCRLF ? insize + insize : insize;

  // plus room for an accu and an escape char
  return payload + 2;
}
206 
207 Encoder * QuotedPrintableCodec::makeEncoder( bool withCRLF ) const {
208  return new QuotedPrintableEncoder( withCRLF );
209 }
210 
211 Decoder * QuotedPrintableCodec::makeDecoder( bool withCRLF ) const {
212  return new QuotedPrintableDecoder( withCRLF );
213 }
214 
215 int QuotedPrintableCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const {
216  return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
217 }
218 
219 Encoder * Rfc2047TQEncodingCodec::makeEncoder( bool withCRLF ) const {
220  return new Rfc2047TQEncodingEncoder( withCRLF );
221 }
222 
223 Decoder * Rfc2047TQEncodingCodec::makeDecoder( bool withCRLF ) const {
224  return new QuotedPrintableDecoder( withCRLF, true );
225 }
226 
227 int Rfc2047TQEncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const {
228  return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
229 }
230 
231 Encoder * Rfc2231EncodingCodec::makeEncoder( bool withCRLF ) const {
232  return new Rfc2047TQEncodingEncoder( withCRLF, '%' );
233 }
234 
235 Decoder * Rfc2231EncodingCodec::makeDecoder( bool withCRLF ) const {
236  return new QuotedPrintableDecoder( withCRLF, true, '%' );
237 }
238 
239 int Rfc2231EncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const {
240  return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
241 }
242 
243  /********************************************************/
244  /********************************************************/
245  /********************************************************/
246 
247 bool QuotedPrintableDecoder::decode( const char* & scursor, const char * const send,
248  char* & dcursor, const char * const dend ) {
249  if ( mWithCRLF )
250  kdWarning() << "CRLF output for decoders isn't yet supported!" << endl;
251 
252  while ( scursor != send && dcursor != dend ) {
253  if ( mFlushing ) {
254  // we have to flush chars in the aftermath of an decoding
255  // error. The way to request a flush is to
256  // - store the offending character in mBadChar and
257  // - set mFlushing to true.
258  // The supported cases are (H: hexchar, X: bad char):
259  // =X, =HX, CR
260  // mBadChar is only written out if it is not by itself illegal in
261  // quoted-printable (e.g. CTLs, 8Bits).
262  // A fast way to suppress mBadChar output is to set it to NUL.
263  if ( mInsideHexChar ) {
264  // output '='
265  *dcursor++ = mEscapeChar;
266  mInsideHexChar = false;
267  } else if ( mHaveAccu ) {
268  // output the high nibble of the accumulator:
269  *dcursor++ = binToHex( highNibble( mAccu ) );
270  mHaveAccu = false;
271  mAccu = 0;
272  } else {
273  // output mBadChar
274  assert( mAccu == 0 );
275  if ( mBadChar ) {
276  if ( ((mBadChar >= '>') && (mBadChar <= '~')) ||
277  ((mBadChar >= '!') && (mBadChar <= '<')) )
278  *dcursor++ = mBadChar;
279  mBadChar = 0;
280  }
281  mFlushing = false;
282  }
283  continue;
284  }
285  assert( mBadChar == 0 );
286 
287  uchar ch = *scursor++;
288  uchar value = 255;
289 
290  if ( mExpectLF && ch != '\n' ) {
291  kdWarning() << "QuotedPrintableDecoder: "
292  "illegally formed soft linebreak or lonely CR!" << endl;
293  mInsideHexChar = false;
294  mExpectLF = false;
295  assert( mAccu == 0 );
296  }
297 
298  if ( mInsideHexChar ) {
299  // next char(s) represent nibble instead of itself:
300  if ( ch <= '9' ) {
301  if ( ch >= '0' ) {
302  value = ch - '0';
303  } else {
304  switch ( ch ) {
305  case '\r':
306  mExpectLF = true;
307  break;
308  case '\n':
309  // soft line break, but only if mAccu is NUL.
310  if ( !mHaveAccu ) {
311  mExpectLF = false;
312  mInsideHexChar = false;
313  break;
314  }
315  // else fall through
316  default:
317  kdWarning() << "QuotedPrintableDecoder: "
318  "illegally formed hex char! Outputting verbatim." << endl;
319  mBadChar = ch;
320  mFlushing = true;
321  }
322  continue;
323  }
324  } else { // ch > '9'
325  if ( ch <= 'F' ) {
326  if ( ch >= 'A' ) {
327  value = 10 + ch - 'A';
328  } else { // [:-@]
329  mBadChar = ch;
330  mFlushing = true;
331  continue;
332  }
333  } else { // ch > 'F'
334  if ( ch <= 'f' && ch >= 'a' ) {
335  value = 10 + ch - 'a';
336  } else {
337  mBadChar = ch;
338  mFlushing = true;
339  continue;
340  }
341  }
342  }
343 
344  assert( value < 16 );
345  assert( mBadChar == 0 );
346  assert( !mExpectLF );
347 
348  if ( mHaveAccu ) {
349  *dcursor++ = char( mAccu | value );
350  mAccu = 0;
351  mHaveAccu = false;
352  mInsideHexChar = false;
353  } else {
354  mHaveAccu = true;
355  mAccu = value << 4;
356  }
357  } else { // not mInsideHexChar
358  if ( ((ch <= '~') && (ch >= ' ')) || (ch == '\t') ) {
359  if ( ch == mEscapeChar ) {
360  mInsideHexChar = true;
361  } else if ( mTQEncoding && ch == '_' ) {
362  *dcursor++ = char(0x20);
363  } else {
364  *dcursor++ = char(ch);
365  }
366  } else if ( ch == '\n' ) {
367  *dcursor++ = '\n';
368  mExpectLF = false;
369  } else if ( ch == '\r' ) {
370  mExpectLF = true;
371  } else {
372  kdWarning() << "QuotedPrintableDecoder: " << ch <<
373  " illegal character in input stream! Ignoring." << endl;
374  }
375  }
376  }
377 
378  return (scursor == send);
379 }
380 
381 bool QuotedPrintableEncoder::fillInputBuffer( const char* & scursor,
382  const char * const send ) {
383  // Don't read more if there's still a tail of a line in the buffer:
384  if ( mSawLineEnd )
385  return true;
386 
387  // Read until the buffer is full or we have found CRLF or LF (which
388  // don't end up in the input buffer):
389  for ( ; ( mInputBufferWriteCursor + 1 ) % 16 != mInputBufferReadCursor
390  && scursor != send ; mInputBufferWriteCursor++ ) {
391  char ch = *scursor++;
392  if ( ch == '\r' ) {
393  mSawCR = true;
394  } else if ( ch == '\n' ) {
395  // remove the CR from the input buffer (if any) and return that
396  // we found a line ending:
397  if ( mSawCR ) {
398  mSawCR = false;
399  assert( mInputBufferWriteCursor != mInputBufferReadCursor );
400  mInputBufferWriteCursor--;
401  }
402  mSawLineEnd = true;
403  return true; // saw CRLF or LF
404  } else {
405  mSawCR = false;
406  }
407  mInputBuffer[ mInputBufferWriteCursor ] = ch;
408  }
409  mSawLineEnd = false;
410  return false; // didn't see a line ending...
411 }
412 
413 bool QuotedPrintableEncoder::processNextChar() {
414 
415  // If we process a buffer which doesn't end in a line break, we
416  // can't process all of it, since the next chars that will be read
417  // could be a line break. So we empty the buffer only until a fixed
418  // number of chars is left (except when mFinishing, which means that
419  // the data doesn't end in newline):
420  const int minBufferFillWithoutLineEnd = 4;
421 
422  assert( mOutputBufferCursor == 0 );
423 
424  int bufferFill = int(mInputBufferWriteCursor) - int(mInputBufferReadCursor) ;
425  if ( bufferFill < 0 )
426  bufferFill += 16;
427 
428  assert( bufferFill >=0 && bufferFill <= 15 );
429 
430  if ( !mFinishing && !mSawLineEnd &&
431  bufferFill < minBufferFillWithoutLineEnd )
432  return false;
433 
434  // buffer is empty, return false:
435  if ( mInputBufferReadCursor == mInputBufferWriteCursor )
436  return false;
437 
438  // Real processing goes here:
439  mAccu = mInputBuffer[ mInputBufferReadCursor++ ];
440  if ( needsEncoding( mAccu ) ) // always needs encoding or
441  mAccuNeedsEncoding = Definitely;
442  else if ( ( mSawLineEnd || mFinishing ) // needs encoding at end of line
443  && bufferFill == 1 // or end of buffer
444  && needsEncodingAtEOL( mAccu ) )
445  mAccuNeedsEncoding = Definitely;
446  else if ( needsEncodingAtBOL( mAccu ) )
447  mAccuNeedsEncoding = AtBOL;
448  else
449  // never needs encoding
450  mAccuNeedsEncoding = Never;
451 
452  return true;
453 }
454 
455 // Outputs processed (verbatim or hex-encoded) chars and inserts soft
456 // line breaks as necessary. Depends on processNextChar's directions
457 // on whether or not to encode the current char, and whether or not
458 // the current char is the last one in it's input line:
459 void QuotedPrintableEncoder::createOutputBuffer( char* & dcursor,
460  const char * const dend )
461 {
462  const int maxLineLength = 76; // rfc 2045
463 
464  assert( mOutputBufferCursor == 0 );
465 
466  bool lastOneOnThisLine = mSawLineEnd
467  && mInputBufferReadCursor == mInputBufferWriteCursor;
468 
469  int neededSpace = 1;
470  if ( mAccuNeedsEncoding == Definitely)
471  neededSpace = 3;
472 
473  // reserve space for the soft hyphen (=)
474  if ( !lastOneOnThisLine )
475  neededSpace++;
476 
477  if ( mCurrentLineLength > maxLineLength - neededSpace ) {
478  // current line too short, insert soft line break:
479  write( '=', dcursor, dend );
480  writeCRLF( dcursor, dend );
481  mCurrentLineLength = 0;
482  }
483 
484  if ( (Never == mAccuNeedsEncoding) ||
485  ((AtBOL == mAccuNeedsEncoding) && (mCurrentLineLength != 0)) ) {
486  write( mAccu, dcursor, dend );
487  mCurrentLineLength++;
488  } else {
489  write( '=', dcursor, dend );
490  write( binToHex( highNibble( mAccu ) ), dcursor, dend );
491  write( binToHex( lowNibble( mAccu ) ), dcursor, dend );
492  mCurrentLineLength += 3;
493  }
494 }
495 
496 
497 bool QuotedPrintableEncoder::encode( const char* & scursor, const char * const send,
498  char* & dcursor, const char * const dend )
499 {
500  // support probing by the caller:
501  if ( mFinishing ) return true;
502 
503  while ( scursor != send && dcursor != dend ) {
504  if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) )
505  return (scursor == send);
506 
507  assert( mOutputBufferCursor == 0 );
508 
509  // fill input buffer until eol has been reached or until the
510  // buffer is full, whatever comes first:
511  fillInputBuffer( scursor, send );
512 
513  if ( processNextChar() )
514  // there was one...
515  createOutputBuffer( dcursor, dend );
516  else if ( mSawLineEnd &&
517  mInputBufferWriteCursor == mInputBufferReadCursor ) {
518  // load a hard line break into output buffer:
519  writeCRLF( dcursor, dend );
520  // signal fillInputBuffer() we are ready for the next line:
521  mSawLineEnd = false;
522  mCurrentLineLength = 0;
523  } else
524  // we are supposedly finished with this input block:
525  break;
526  }
527 
528  // make sure we write as much as possible and don't stop _writing_
529  // just because we have no more _input_:
530  if ( mOutputBufferCursor ) flushOutputBuffer( dcursor, dend );
531 
532  return (scursor == send);
533 
534 } // encode
535 
536 bool QuotedPrintableEncoder::finish( char* & dcursor,
537  const char * const dend ) {
538  mFinishing = true;
539 
540  if ( mFinished )
541  return flushOutputBuffer( dcursor, dend );
542 
543  while ( dcursor != dend ) {
544  if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) )
545  return false;
546 
547  assert( mOutputBufferCursor == 0 );
548 
549  if ( processNextChar() )
550  // there was one...
551  createOutputBuffer( dcursor, dend );
552  else if ( mSawLineEnd &&
553  mInputBufferWriteCursor == mInputBufferReadCursor ) {
554  // load a hard line break into output buffer:
555  writeCRLF( dcursor, dend );
556  mSawLineEnd = false;
557  mCurrentLineLength = 0;
558  } else {
559  mFinished = true;
560  return flushOutputBuffer( dcursor, dend );
561  }
562  }
563 
564  return mFinished && !mOutputBufferCursor;
565 
566 } // finish
567 
568 
569 bool Rfc2047TQEncodingEncoder::encode( const char* & scursor, const char * const send,
570  char* & dcursor, const char * const dend )
571 {
572  if ( mInsideFinishing ) return true;
573 
574  while ( scursor != send && dcursor != dend ) {
575  uchar value;
576  switch ( mStepNo ) {
577  case 0:
578  // read the next char and decide if and how do encode:
579  mAccu = *scursor++;
580  if ( !needsEncoding( mAccu ) ) {
581  *dcursor++ = char(mAccu);
582  } else if ( mEscapeChar == '=' && mAccu == 0x20 ) {
583  // shortcut encoding for 0x20 (latin-1/us-ascii SPACE)
584  // (not for rfc2231 encoding)
585  *dcursor++ = '_';
586  } else {
587  // needs =XY encoding - write escape char:
588  *dcursor++ = mEscapeChar;
589  mStepNo = 1;
590  }
591  continue;
592  case 1:
593  // extract hi-nibble:
594  value = highNibble(mAccu);
595  mStepNo = 2;
596  break;
597  case 2:
598  // extract lo-nibble:
599  value = lowNibble(mAccu);
600  mStepNo = 0;
601  break;
602  default: assert( 0 );
603  }
604 
605  // and write:
606  *dcursor++ = binToHex( value );
607  }
608 
609  return (scursor == send);
610 } // encode
611 
612 #include <tqstring.h>
613 
614 bool Rfc2047TQEncodingEncoder::finish( char* & dcursor, const char * const dend ) {
615  mInsideFinishing = true;
616 
617  // write the last bits of mAccu, if any:
618  while ( mStepNo != 0 && dcursor != dend ) {
619  uchar value;
620  switch ( mStepNo ) {
621  case 1:
622  // extract hi-nibble:
623  value = highNibble(mAccu);
624  mStepNo = 2;
625  break;
626  case 2:
627  // extract lo-nibble:
628  value = lowNibble(mAccu);
629  mStepNo = 0;
630  break;
631  default: assert( 0 );
632  }
633 
634  // and write:
635  *dcursor++ = binToHex( value );
636  }
637 
638  return mStepNo == 0;
639 }
640 
641 
642 
643 
644 } // namespace KMime
Stateful decoder class, modelled after TQTextDecoder.
Definition: kmime_codecs.h:268
Stateful encoder class, modelled after TQTextEncoder.
Definition: kmime_codecs.h:300