libkmime

kmime_codec_qp.cpp
1/*
2 kmime_codec_qp.cpp
3
4 This file is part of KMime, the KDE internet mail/usenet news message library.
5 Copyright (c) 2002 Marc Mutz <mutz@kde.org>
6
7 KMime is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License, version 2, as
9 published by the Free Software Foundation.
10
11 KMime is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this library; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
20 In addition, as a special exception, the copyright holders give
21 permission to link the code of this library with any edition of
22 the TQt library by Trolltech AS, Norway (or with modified versions
23 of TQt that use the same license as TQt), and distribute linked
24 combinations including the two. You must obey the GNU General
25 Public License in all respects for all of the code used other than
26 TQt. If you modify this file, you may extend this exception to
27 your version of the file, but you are not obligated to do so. If
28 you do not wish to do so, delete this exception statement from
29 your version.
30*/
31
32#include "kmime_codec_qp.h"
33
34#include "kmime_util.h"
35
36#include <kdebug.h>
37
38#include <cassert>
39
40using namespace KMime;
41
42namespace KMime {
43
44// some helpful functions:
45
// Maps a value to its hexadecimal digit character: values up to 9 yield
// '0'..'9', larger values yield upper-case letters starting at 'A' for 10.
// Callers in this file only pass nibbles (0..15).
static inline char binToHex( unsigned char value ) {
  return ( value > 9 ) ? char( value + 'A' - 10 )
                       : char( value + '0' );
}
52
// Returns the upper four bits of ch, shifted down into the range 0..15.
static inline unsigned char highNibble( unsigned char ch ) {
  const int upperBits = ch >> 4;
  return static_cast<unsigned char>( upperBits );
}
56
// Returns the lower four bits of ch (range 0..15).
static inline unsigned char lowNibble( unsigned char ch ) {
  const int lowerBits = ch & 0x0F;
  return static_cast<unsigned char>( lowerBits );
}
60
// Returns true for every character except '?' and the control characters
// below SPACE; horizontal tab is the one control character that is accepted.
static inline bool keep( unsigned char ch ) {
  if ( ch == '?' )
    return false;
  return ( ch >= ' ' ) || ( ch == '\t' );
}
65
66//
67// QuotedPrintableCodec
68//
69
70class QuotedPrintableEncoder : public Encoder {
71 char mInputBuffer[16];
72 uchar mCurrentLineLength; // 0..76
73 uchar mAccu;
74 uint mInputBufferReadCursor : 4; // 0..15
75 uint mInputBufferWriteCursor : 4; // 0..15
76 enum {
77 Never, AtBOL, Definitely
78 } mAccuNeedsEncoding : 2;
79 bool mSawLineEnd : 1;
80 bool mSawCR : 1;
81 bool mFinishing : 1;
82 bool mFinished : 1;
83protected:
84 friend class QuotedPrintableCodec;
85 QuotedPrintableEncoder( bool withCRLF=false )
86 : Encoder( withCRLF ), mCurrentLineLength(0), mAccu(0),
87 mInputBufferReadCursor(0), mInputBufferWriteCursor(0),
88 mAccuNeedsEncoding(Never),
89 mSawLineEnd(false), mSawCR(false), mFinishing(false),
90 mFinished(false) {}
91
92 bool needsEncoding( uchar ch ) {
93 return ( (ch > '~') || ((ch < ' ') && (ch != '\t')) || (ch == '=') );
94 }
95 bool needsEncodingAtEOL( uchar ch ) {
96 return ( ch == ' ' || ch == '\t' );
97 }
98 bool needsEncodingAtBOL( uchar ch ) {
99 return ( ch == 'F' || ch == '.' || ch == '-' );
100 }
101 bool fillInputBuffer( const char* & scursor, const char * const send );
102 bool processNextChar();
103 void createOutputBuffer( char* & dcursor, const char * const dend );
104public:
105 virtual ~QuotedPrintableEncoder() {}
106
107 bool encode( const char* & scursor, const char * const send,
108 char* & dcursor, const char * const dend );
109
110 bool finish( char* & dcursor, const char * const dend );
111};
112
113
114class QuotedPrintableDecoder : public Decoder {
115 const char mEscapeChar;
116 char mBadChar;
118 uchar mAccu;
128 const bool mTQEncoding;
129 bool mInsideHexChar;
130 bool mFlushing;
131 bool mExpectLF;
132 bool mHaveAccu;
133protected:
134 friend class QuotedPrintableCodec;
135 friend class Rfc2047TQEncodingCodec;
136 friend class Rfc2231EncodingCodec;
137 QuotedPrintableDecoder( bool withCRLF=false,
138 bool aTQEncoding=false, char aEscapeChar='=' )
139 : Decoder( withCRLF ),
140 mEscapeChar(aEscapeChar),
141 mBadChar(0),
142 mAccu(0),
143 mTQEncoding(aTQEncoding),
144 mInsideHexChar(false),
145 mFlushing(false),
146 mExpectLF(false),
147 mHaveAccu(false) {}
148public:
149 virtual ~QuotedPrintableDecoder() {}
150
151 bool decode( const char* & scursor, const char * const send,
152 char* & dcursor, const char * const dend );
153 // ### really no finishing needed???
154 bool finish( char* &, const char * const ) { return true; }
155};
156
157
158class Rfc2047TQEncodingEncoder : public Encoder {
159 uchar mAccu;
160 uchar mStepNo;
161 const char mEscapeChar;
162 bool mInsideFinishing : 1;
163protected:
164 friend class Rfc2047TQEncodingCodec;
165 friend class Rfc2231EncodingCodec;
166 Rfc2047TQEncodingEncoder( bool withCRLF=false, char aEscapeChar='=' )
167 : Encoder( withCRLF ),
168 mAccu(0), mStepNo(0), mEscapeChar( aEscapeChar ),
169 mInsideFinishing( false )
170 {
171 // else an optimization in ::encode might break.
172 assert( aEscapeChar == '=' || aEscapeChar == '%' );
173 }
174
175 // this code assumes that isEText( mEscapeChar ) == false!
176 bool needsEncoding( uchar ch ) {
177 if ( ch > 'z' ) return true; // {|}~ DEL and 8bit chars need
178 if ( !isEText( ch ) ) return true; // all but a-zA-Z0-9!/*+- need, too
179 if ( mEscapeChar == '%' && ( ch == '*' || ch == '/' ) )
180 return true; // not allowed in rfc2231 encoding
181 return false;
182 }
183
184public:
185 virtual ~Rfc2047TQEncodingEncoder() {}
186
187 bool encode( const char* & scursor, const char * const send,
188 char* & dcursor, const char * const dend );
189 bool finish( char* & dcursor, const char * const dend );
190};
191
// Upper bound for the number of bytes produced when decoding insize bytes.
// This is a free static function because it reads no member variables; the
// codecs' virtual maxDecodedSizeFor() methods forward to it.
static int QuotedPrintableDecoder_maxDecodedSizeFor( int insize, bool withCRLF ) {
  // Worst case every input char is passed through unencoded; with CRLF
  // output each of them could be a '\n' that becomes "\r\n".
  const int payload = withCRLF ? insize + insize : insize;

  // Plus room for a pending escape char and accumulator.
  return payload + 2;
}
206
207Encoder * QuotedPrintableCodec::makeEncoder( bool withCRLF ) const {
208 return new QuotedPrintableEncoder( withCRLF );
209}
210
211Decoder * QuotedPrintableCodec::makeDecoder( bool withCRLF ) const {
212 return new QuotedPrintableDecoder( withCRLF );
213}
214
215int QuotedPrintableCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const {
216 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
217}
218
219Encoder * Rfc2047TQEncodingCodec::makeEncoder( bool withCRLF ) const {
220 return new Rfc2047TQEncodingEncoder( withCRLF );
221}
222
223Decoder * Rfc2047TQEncodingCodec::makeDecoder( bool withCRLF ) const {
224 return new QuotedPrintableDecoder( withCRLF, true );
225}
226
227int Rfc2047TQEncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const {
228 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
229}
230
231Encoder * Rfc2231EncodingCodec::makeEncoder( bool withCRLF ) const {
232 return new Rfc2047TQEncodingEncoder( withCRLF, '%' );
233}
234
235Decoder * Rfc2231EncodingCodec::makeDecoder( bool withCRLF ) const {
236 return new QuotedPrintableDecoder( withCRLF, true, '%' );
237}
238
239int Rfc2231EncodingCodec::maxDecodedSizeFor( int insize, bool withCRLF ) const {
240 return QuotedPrintableDecoder_maxDecodedSizeFor(insize, withCRLF);
241}
242
243 /********************************************************/
244 /********************************************************/
245 /********************************************************/
246
247bool QuotedPrintableDecoder::decode( const char* & scursor, const char * const send,
248 char* & dcursor, const char * const dend ) {
249 if ( mWithCRLF )
250 kdWarning() << "CRLF output for decoders isn't yet supported!" << endl;
251
252 while ( scursor != send && dcursor != dend ) {
253 if ( mFlushing ) {
254 // we have to flush chars in the aftermath of an decoding
255 // error. The way to request a flush is to
256 // - store the offending character in mBadChar and
257 // - set mFlushing to true.
258 // The supported cases are (H: hexchar, X: bad char):
259 // =X, =HX, CR
260 // mBadChar is only written out if it is not by itself illegal in
261 // quoted-printable (e.g. CTLs, 8Bits).
262 // A fast way to suppress mBadChar output is to set it to NUL.
263 if ( mInsideHexChar ) {
264 // output '='
265 *dcursor++ = mEscapeChar;
266 mInsideHexChar = false;
267 } else if ( mHaveAccu ) {
268 // output the high nibble of the accumulator:
269 *dcursor++ = binToHex( highNibble( mAccu ) );
270 mHaveAccu = false;
271 mAccu = 0;
272 } else {
273 // output mBadChar
274 assert( mAccu == 0 );
275 if ( mBadChar ) {
276 if ( ((mBadChar >= '>') && (mBadChar <= '~')) ||
277 ((mBadChar >= '!') && (mBadChar <= '<')) )
278 *dcursor++ = mBadChar;
279 mBadChar = 0;
280 }
281 mFlushing = false;
282 }
283 continue;
284 }
285 assert( mBadChar == 0 );
286
287 uchar ch = *scursor++;
288 uchar value = 255;
289
290 if ( mExpectLF && ch != '\n' ) {
291 kdWarning() << "QuotedPrintableDecoder: "
292 "illegally formed soft linebreak or lonely CR!" << endl;
293 mInsideHexChar = false;
294 mExpectLF = false;
295 assert( mAccu == 0 );
296 }
297
298 if ( mInsideHexChar ) {
299 // next char(s) represent nibble instead of itself:
300 if ( ch <= '9' ) {
301 if ( ch >= '0' ) {
302 value = ch - '0';
303 } else {
304 switch ( ch ) {
305 case '\r':
306 mExpectLF = true;
307 break;
308 case '\n':
309 // soft line break, but only if mAccu is NUL.
310 if ( !mHaveAccu ) {
311 mExpectLF = false;
312 mInsideHexChar = false;
313 break;
314 }
315 // else fall through
316 default:
317 kdWarning() << "QuotedPrintableDecoder: "
318 "illegally formed hex char! Outputting verbatim." << endl;
319 mBadChar = ch;
320 mFlushing = true;
321 }
322 continue;
323 }
324 } else { // ch > '9'
325 if ( ch <= 'F' ) {
326 if ( ch >= 'A' ) {
327 value = 10 + ch - 'A';
328 } else { // [:-@]
329 mBadChar = ch;
330 mFlushing = true;
331 continue;
332 }
333 } else { // ch > 'F'
334 if ( ch <= 'f' && ch >= 'a' ) {
335 value = 10 + ch - 'a';
336 } else {
337 mBadChar = ch;
338 mFlushing = true;
339 continue;
340 }
341 }
342 }
343
344 assert( value < 16 );
345 assert( mBadChar == 0 );
346 assert( !mExpectLF );
347
348 if ( mHaveAccu ) {
349 *dcursor++ = char( mAccu | value );
350 mAccu = 0;
351 mHaveAccu = false;
352 mInsideHexChar = false;
353 } else {
354 mHaveAccu = true;
355 mAccu = value << 4;
356 }
357 } else { // not mInsideHexChar
358 if ( ((ch <= '~') && (ch >= ' ')) || (ch == '\t') ) {
359 if ( ch == mEscapeChar ) {
360 mInsideHexChar = true;
361 } else if ( mTQEncoding && ch == '_' ) {
362 *dcursor++ = char(0x20);
363 } else {
364 *dcursor++ = char(ch);
365 }
366 } else if ( ch == '\n' ) {
367 *dcursor++ = '\n';
368 mExpectLF = false;
369 } else if ( ch == '\r' ) {
370 mExpectLF = true;
371 } else {
372 kdWarning() << "QuotedPrintableDecoder: " << ch <<
373 " illegal character in input stream! Ignoring." << endl;
374 }
375 }
376 }
377
378 return (scursor == send);
379}
380
381bool QuotedPrintableEncoder::fillInputBuffer( const char* & scursor,
382 const char * const send ) {
383 // Don't read more if there's still a tail of a line in the buffer:
384 if ( mSawLineEnd )
385 return true;
386
387 // Read until the buffer is full or we have found CRLF or LF (which
388 // don't end up in the input buffer):
389 for ( ; ( mInputBufferWriteCursor + 1 ) % 16 != mInputBufferReadCursor
390 && scursor != send ; mInputBufferWriteCursor++ ) {
391 char ch = *scursor++;
392 if ( ch == '\r' ) {
393 mSawCR = true;
394 } else if ( ch == '\n' ) {
395 // remove the CR from the input buffer (if any) and return that
396 // we found a line ending:
397 if ( mSawCR ) {
398 mSawCR = false;
399 assert( mInputBufferWriteCursor != mInputBufferReadCursor );
400 mInputBufferWriteCursor--;
401 }
402 mSawLineEnd = true;
403 return true; // saw CRLF or LF
404 } else {
405 mSawCR = false;
406 }
407 mInputBuffer[ mInputBufferWriteCursor ] = ch;
408 }
409 mSawLineEnd = false;
410 return false; // didn't see a line ending...
411}
412
413bool QuotedPrintableEncoder::processNextChar() {
414
415 // If we process a buffer which doesn't end in a line break, we
416 // can't process all of it, since the next chars that will be read
417 // could be a line break. So we empty the buffer only until a fixed
418 // number of chars is left (except when mFinishing, which means that
419 // the data doesn't end in newline):
420 const int minBufferFillWithoutLineEnd = 4;
421
422 assert( mOutputBufferCursor == 0 );
423
424 int bufferFill = int(mInputBufferWriteCursor) - int(mInputBufferReadCursor) ;
425 if ( bufferFill < 0 )
426 bufferFill += 16;
427
428 assert( bufferFill >=0 && bufferFill <= 15 );
429
430 if ( !mFinishing && !mSawLineEnd &&
431 bufferFill < minBufferFillWithoutLineEnd )
432 return false;
433
434 // buffer is empty, return false:
435 if ( mInputBufferReadCursor == mInputBufferWriteCursor )
436 return false;
437
438 // Real processing goes here:
439 mAccu = mInputBuffer[ mInputBufferReadCursor++ ];
440 if ( needsEncoding( mAccu ) ) // always needs encoding or
441 mAccuNeedsEncoding = Definitely;
442 else if ( ( mSawLineEnd || mFinishing ) // needs encoding at end of line
443 && bufferFill == 1 // or end of buffer
444 && needsEncodingAtEOL( mAccu ) )
445 mAccuNeedsEncoding = Definitely;
446 else if ( needsEncodingAtBOL( mAccu ) )
447 mAccuNeedsEncoding = AtBOL;
448 else
449 // never needs encoding
450 mAccuNeedsEncoding = Never;
451
452 return true;
453}
454
455// Outputs processed (verbatim or hex-encoded) chars and inserts soft
456// line breaks as necessary. Depends on processNextChar's directions
457// on whether or not to encode the current char, and whether or not
458// the current char is the last one in it's input line:
459void QuotedPrintableEncoder::createOutputBuffer( char* & dcursor,
460 const char * const dend )
461{
462 const int maxLineLength = 76; // rfc 2045
463
464 assert( mOutputBufferCursor == 0 );
465
466 bool lastOneOnThisLine = mSawLineEnd
467 && mInputBufferReadCursor == mInputBufferWriteCursor;
468
469 int neededSpace = 1;
470 if ( mAccuNeedsEncoding == Definitely)
471 neededSpace = 3;
472
473 // reserve space for the soft hyphen (=)
474 if ( !lastOneOnThisLine )
475 neededSpace++;
476
477 if ( mCurrentLineLength > maxLineLength - neededSpace ) {
478 // current line too short, insert soft line break:
479 write( '=', dcursor, dend );
480 writeCRLF( dcursor, dend );
481 mCurrentLineLength = 0;
482 }
483
484 if ( (Never == mAccuNeedsEncoding) ||
485 ((AtBOL == mAccuNeedsEncoding) && (mCurrentLineLength != 0)) ) {
486 write( mAccu, dcursor, dend );
487 mCurrentLineLength++;
488 } else {
489 write( '=', dcursor, dend );
490 write( binToHex( highNibble( mAccu ) ), dcursor, dend );
491 write( binToHex( lowNibble( mAccu ) ), dcursor, dend );
492 mCurrentLineLength += 3;
493 }
494}
495
496
497bool QuotedPrintableEncoder::encode( const char* & scursor, const char * const send,
498 char* & dcursor, const char * const dend )
499{
500 // support probing by the caller:
501 if ( mFinishing ) return true;
502
503 while ( scursor != send && dcursor != dend ) {
504 if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) )
505 return (scursor == send);
506
507 assert( mOutputBufferCursor == 0 );
508
509 // fill input buffer until eol has been reached or until the
510 // buffer is full, whatever comes first:
511 fillInputBuffer( scursor, send );
512
513 if ( processNextChar() )
514 // there was one...
515 createOutputBuffer( dcursor, dend );
516 else if ( mSawLineEnd &&
517 mInputBufferWriteCursor == mInputBufferReadCursor ) {
518 // load a hard line break into output buffer:
519 writeCRLF( dcursor, dend );
520 // signal fillInputBuffer() we are ready for the next line:
521 mSawLineEnd = false;
522 mCurrentLineLength = 0;
523 } else
524 // we are supposedly finished with this input block:
525 break;
526 }
527
528 // make sure we write as much as possible and don't stop _writing_
529 // just because we have no more _input_:
530 if ( mOutputBufferCursor ) flushOutputBuffer( dcursor, dend );
531
532 return (scursor == send);
533
534} // encode
535
536bool QuotedPrintableEncoder::finish( char* & dcursor,
537 const char * const dend ) {
538 mFinishing = true;
539
540 if ( mFinished )
541 return flushOutputBuffer( dcursor, dend );
542
543 while ( dcursor != dend ) {
544 if ( mOutputBufferCursor && !flushOutputBuffer( dcursor, dend ) )
545 return false;
546
547 assert( mOutputBufferCursor == 0 );
548
549 if ( processNextChar() )
550 // there was one...
551 createOutputBuffer( dcursor, dend );
552 else if ( mSawLineEnd &&
553 mInputBufferWriteCursor == mInputBufferReadCursor ) {
554 // load a hard line break into output buffer:
555 writeCRLF( dcursor, dend );
556 mSawLineEnd = false;
557 mCurrentLineLength = 0;
558 } else {
559 mFinished = true;
560 return flushOutputBuffer( dcursor, dend );
561 }
562 }
563
564 return mFinished && !mOutputBufferCursor;
565
566} // finish
567
568
569bool Rfc2047TQEncodingEncoder::encode( const char* & scursor, const char * const send,
570 char* & dcursor, const char * const dend )
571{
572 if ( mInsideFinishing ) return true;
573
574 while ( scursor != send && dcursor != dend ) {
575 uchar value;
576 switch ( mStepNo ) {
577 case 0:
578 // read the next char and decide if and how do encode:
579 mAccu = *scursor++;
580 if ( !needsEncoding( mAccu ) ) {
581 *dcursor++ = char(mAccu);
582 } else if ( mEscapeChar == '=' && mAccu == 0x20 ) {
583 // shortcut encoding for 0x20 (latin-1/us-ascii SPACE)
584 // (not for rfc2231 encoding)
585 *dcursor++ = '_';
586 } else {
587 // needs =XY encoding - write escape char:
588 *dcursor++ = mEscapeChar;
589 mStepNo = 1;
590 }
591 continue;
592 case 1:
593 // extract hi-nibble:
594 value = highNibble(mAccu);
595 mStepNo = 2;
596 break;
597 case 2:
598 // extract lo-nibble:
599 value = lowNibble(mAccu);
600 mStepNo = 0;
601 break;
602 default: assert( 0 );
603 }
604
605 // and write:
606 *dcursor++ = binToHex( value );
607 }
608
609 return (scursor == send);
610} // encode
611
612#include <tqstring.h>
613
614bool Rfc2047TQEncodingEncoder::finish( char* & dcursor, const char * const dend ) {
615 mInsideFinishing = true;
616
617 // write the last bits of mAccu, if any:
618 while ( mStepNo != 0 && dcursor != dend ) {
619 uchar value;
620 switch ( mStepNo ) {
621 case 1:
622 // extract hi-nibble:
623 value = highNibble(mAccu);
624 mStepNo = 2;
625 break;
626 case 2:
627 // extract lo-nibble:
628 value = lowNibble(mAccu);
629 mStepNo = 0;
630 break;
631 default: assert( 0 );
632 }
633
634 // and write:
635 *dcursor++ = binToHex( value );
636 }
637
638 return mStepNo == 0;
639}
640
641
642
643
644} // namespace KMime
Stateful decoder class, modelled after TQTextDecoder.
Definition: kmime_codecs.h:268
Stateful encoder class, modelled after TQTextEncoder.
Definition: kmime_codecs.h:300