libkmime

kmime_charfreq.cpp
1/*
2 kmime_charfreq.cpp
3
4 KMime, the KDE internet mail/usenet news message library.
5 Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; version 2 of the License.
10 You should have received a copy of the GNU General Public License
11 along with this program; if not, write to the Free Software Foundation,
 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
13*/
14
15#include "kmime_charfreq.h"
16
17namespace KMime {
18
19CharFreq::CharFreq( const TQByteArray & buf )
20 : NUL(0),
21 CTL(0),
22 CR(0), LF(0),
23 CRLF(0),
24 printable(0),
25 eightBit(0),
26 total(0),
27 lineMin(0xffffffff),
28 lineMax(0),
29 mTrailingWS(false),
30 mLeadingFrom(false)
31{
32 if ( !buf.isEmpty() )
33 count( buf.data(), buf.size() );
34}
35
36CharFreq::CharFreq( const char * buf, size_t len )
37 : NUL(0),
38 CTL(0),
39 CR(0), LF(0),
40 CRLF(0),
41 printable(0),
42 eightBit(0),
43 total(0),
44 lineMin(0xffffffff),
45 lineMax(0),
46 mTrailingWS(false),
47 mLeadingFrom(false)
48{
49 if ( buf && len > 0 )
50 count( buf, len );
51}
52
// Returns true if ch is a space or a horizontal tab.
static inline bool isWS( char ch )
{
  return ch == ' ' || ch == '\t';
}
54
55void CharFreq::count( const char * it, size_t len ) {
56
57 const char * end = it + len;
58 uint currentLineLength = 0;
59 // initialize the prevChar with LF so that From_ detection works w/o
60 // special-casing:
61 char prevChar = '\n';
62 char prevPrevChar = 0;
63
64 for ( ; it != end ; ++it ) {
65 ++currentLineLength;
66 switch ( *it ) {
67 case '\0': ++NUL; break;
68 case '\r': ++CR; break;
69 case '\n': ++LF;
70 if ( prevChar == '\r' ) { --currentLineLength; ++CRLF; }
71 if ( currentLineLength >= lineMax ) lineMax = currentLineLength-1;
72 if ( currentLineLength <= lineMin ) lineMin = currentLineLength-1;
73 if ( !mTrailingWS )
74 if ( isWS( prevChar ) || ( prevChar == '\r' && isWS( prevPrevChar ) ) )
75 mTrailingWS = true;
76 currentLineLength = 0;
77 break;
78 case 'F': // check for lines starting with From_ if not found already:
79 if ( !mLeadingFrom )
80 if ( prevChar == '\n' && end - it >= 5 && !tqstrncmp( "From ", it, 5 ) )
81 mLeadingFrom = true;
82 ++printable;
83 break;
84 default:
85 {
86 uchar c = *it;
87 if ( (c == '\t') || ((c >= ' ') && (c <= '~')) )
88 ++printable;
89 else if ( (c == 127) || (c < ' ') )
90 ++CTL;
91 else
92 ++eightBit;
93 }
94 }
95 prevPrevChar = prevChar;
96 prevChar = *it;
97 }
98
99 // consider the length of the last line
100 if ( currentLineLength >= lineMax ) lineMax = currentLineLength;
101 if ( currentLineLength <= lineMin ) lineMin = currentLineLength;
102
103 // check whether the last character is tab or space
104 if ( isWS( prevChar ) )
105 mTrailingWS = true;
106
107 total = len;
108}
109
110bool CharFreq::isEightBitData() const {
111 return type() == EightBitData;
112}
113
114bool CharFreq::isEightBitText() const {
115 return type() == EightBitText;
116}
117
118bool CharFreq::isSevenBitData() const {
119 return type() == SevenBitData;
120}
121
122bool CharFreq::isSevenBitText() const {
123 return type() == SevenBitText;
124}
125
126bool CharFreq::hasTrailingWhitespace() const {
127 return mTrailingWS;
128}
129
130bool CharFreq::hasLeadingFrom() const {
131 return mLeadingFrom;
132}
133
134CharFreq::Type CharFreq::type() const {
135#if 0
136 tqDebug( "Total: %d; NUL: %d; CTL: %d;\n"
137 "CR: %d; LF: %d; CRLF: %d;\n"
138 "lineMin: %d; lineMax: %d;\n"
139 "printable: %d; eightBit: %d;\n"
140 "trailing whitespace: %s;\n"
141 "leading 'From ': %s;\n",
142 total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax,
143 printable, eightBit,
144 mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no" );
145#endif
146 if ( NUL ) // must be binary
147 return Binary;
148
149 // doesn't contain NUL's:
150 if ( eightBit ) {
151 if ( lineMax > 988 ) return EightBitData; // not allowed in 8bit
152 if ( CR != CRLF || controlCodesRatio() > 0.2 ) return EightBitData;
153 return EightBitText;
154 }
155
156 // doesn't contain NUL's, nor 8bit chars:
157 if ( lineMax > 988 ) return SevenBitData;
158 if ( CR != CRLF || controlCodesRatio() > 0.2 ) return SevenBitData;
159
160 // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars:
161 return SevenBitText;
162}
163
164float CharFreq::printableRatio() const {
165 if ( total ) return float(printable) / float(total);
166 else return 0;
167}
168
169float CharFreq::controlCodesRatio() const {
170 if ( total ) return float(CTL) / float(total);
171 else return 0;
172}
173
174} // namespace KMime
175
176