• Skip to content
  • Skip to link menu
Trinity API Reference
  • Trinity API Reference
  • kjs
 

kjs

  • kjs
regexp.cpp
1/*
2 * This file is part of the KDE libraries
3 * Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
4 * Copyright (C) 2003,2004 Apple Computer, Inc.
5 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 */
22
23#include "regexp.h"
24
25#include "lexer.h"
26#include <assert.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30
31using namespace KJS;
32
// Cached result of the PCRE2 Unicode capability probe, shared by all RegExp
// objects. It starts out as Unknown; the RegExp constructor replaces it with
// Supported or Unsupported the first time it runs.
33#ifdef HAVE_PCRE2POSIX
34RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
35#endif
36
37RegExp::RegExp(const UString &p, int f)
38 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
39{
40#ifdef HAVE_PCRE2POSIX
41 // Determine whether libpcre has unicode support if need be..
42 if (utf8Support == Unknown) {
43 uint32_t supported;
44 pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (void*)&supported);
45 utf8Support = (supported & 0x0001) ? Supported : Unsupported;
46 }
47#endif
48
49 nrSubPatterns = 0; // determined in match() with POSIX regex.
50
51 // JS regexps can contain Unicode escape sequences (\uxxxx) which
52 // are rather uncommon elsewhere. As our regexp libs don't understand
53 // them we do the unescaping ourselves internally.
54 // Also make sure to expand out any nulls as pcre_compile
55 // expects null termination..
56 UString intern;
57 const char* const nil = "\\x00";
58 if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
59 bool escape = false;
60 for (int i = 0; i < p.size(); ++i) {
61 UChar c = p[i];
62 if (escape) {
63 escape = false;
64 // we only care about \u
65 if (c == 'u') {
66 // standard unicode escape sequence looks like \uxxxx but
67 // other browsers also accept less then 4 hex digits
68 unsigned short u = 0;
69 int j = 0;
70 for (j = 0; j < 4; ++j) {
71 if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
72 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
73 ++i;
74 } else {
75 // sequence incomplete. restore index.
76 // TODO: cleaner way to propagate warning
77 fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
78 i -= j;
79 break;
80 }
81 }
82 if (j < 4) {
83 // sequence was incomplete. treat \u as u which IE always
84 // and FF sometimes does.
85 intern.append(UString('u'));
86 } else {
87 c = UChar(u);
88 switch (u) {
89 case 0:
90 // Make sure to encode 0, to avoid terminating the string
91 intern += UString(nil);
92 break;
93 case '^':
94 case '$':
95 case '\\':
96 case '.':
97 case '*':
98 case '+':
99 case '?':
100 case '(': case ')':
101 case '{': case '}':
102 case '[': case ']':
103 case '|':
104 // escape pattern characters have to remain escaped
105 intern.append(UString('\\'));
106 // intentional fallthrough
107 default:
108 intern += UString(&c, 1);
109 break;
110 }
111 }
112 continue;
113 }
114 intern += UString('\\');
115 intern += UString(&c, 1);
116 } else {
117 if (c == '\\')
118 escape = true;
119 else if (c == '\0')
120 intern += UString(nil);
121 else
122 intern += UString(&c, 1);
123 }
124 }
125 } else {
126 intern = p;
127 }
128
129#ifdef HAVE_PCRE2POSIX
130 uint32_t pcre2flags = 0;
131 int errorCode;
132 PCRE2_SIZE errorOffset;
133
134 if (flgs & IgnoreCase)
135 pcre2flags |= PCRE2_CASELESS;
136
137 if (flgs & Multiline)
138 pcre2flags |= PCRE2_MULTILINE;
139
140 if (utf8Support == Supported)
141 pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
142
143 // Fill our buffer with an encoded version, whether utf-8, or,
144 // if PCRE is incapable, truncated.
145 prepareMatch(intern);
146
147 pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags,
148 &errorCode, &errorOffset, NULL);
149 doneMatch(); // Cleanup buffers
150 if (!pcregex) {
151#ifndef NDEBUG
152 PCRE2_UCHAR errorMsg[256];
153 pcre2_get_error_message(errorCode, errorMsg, sizeof(errorMsg));
154 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMsg);
155#endif
156 match_data = nullptr;
157 valid = false;
158 return;
159 }
160
161 // Get number of subpatterns that will be returned
162 int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns);
163 if (rc != 0)
164 {
165 nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
166 }
167
168 match_data = pcre2_match_data_create_from_pattern(pcregex, NULL);
169#else
170
171 int regflags = 0;
172#ifdef REG_EXTENDED
173 regflags |= REG_EXTENDED;
174#endif
175#ifdef REG_ICASE
176 if ( f & IgnoreCase )
177 regflags |= REG_ICASE;
178#endif
179
180 //NOTE: Multiline is not feasible with POSIX regex.
181 //if ( f & Multiline )
182 // ;
183 // Note: the Global flag is already handled by RegExpProtoFunc::execute
184
185 int errorCode = regcomp(&preg, intern.ascii(), regflags);
186 if (errorCode != 0) {
187#ifndef NDEBUG
188 char errorMessage[80];
189 regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
190 fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
191#endif
192 valid = false;
193 }
194#endif
195}
196
197RegExp::~RegExp()
198{
199 doneMatch(); // Be 100% sure buffers are freed
200#ifdef HAVE_PCRE2POSIX
201 if (match_data)
202 {
203 pcre2_match_data_free(match_data);
204 }
205 if (pcregex)
206 {
207 pcre2_code_free(pcregex);
208 }
209#else
210 /* TODO: is this really okay after an error ? */
211 regfree(&preg);
212#endif
213}
214
215void RegExp::prepareUtf8(const UString& s)
216{
217 // Allocate a buffer big enough to hold all the characters plus \0
218 const int length = s.size();
219 buffer = new buftype_t[length * 3 + 1];
220
221 // Also create buffer for positions. We need one extra character in there,
222 // even past the \0 since the non-empty handling may jump one past the end
223 originalPos = new int[length * 3 + 2];
224
225 // Convert to runs of 8-bit characters, and generate indeces
226 // Note that we do NOT combine surrogate pairs here, as
227 // regexps operate on them as separate characters
228 buftype_t *p = buffer;
229 int *posOut = originalPos;
230 const UChar *d = s.data();
231 for (int i = 0; i != length; ++i) {
232 unsigned short c = d[i].unicode();
233
234 int sequenceLen;
235 if (c < 0x80) {
236 *p++ = (buftype_t)c;
237 sequenceLen = 1;
238 } else if (c < 0x800) {
239 *p++ = (buftype_t)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
240 *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
241 sequenceLen = 2;
242 } else {
243 *p++ = (buftype_t)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
244 *p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
245 *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
246 sequenceLen = 3;
247 }
248
249 while (sequenceLen > 0) {
250 *posOut = i;
251 ++posOut;
252 --sequenceLen;
253 }
254 }
255
256 bufferSize = p - buffer;
257
258 *p++ = '\0';
259
260 // Record positions for \0, and the fictional character after that.
261 *posOut = length;
262 *(posOut+1) = length+1;
263}
264
265void RegExp::prepareASCII (const UString& s)
266{
267 originalPos = 0;
268
269 // Best-effort attempt to get something done
270 // when we don't have utf 8 available -- use
271 // truncated version, and pray for the best
272 CString truncated = s.cstring();
273 buffer = new buftype_t[truncated.size() + 1];
274 memcpy(buffer, truncated.c_str(), truncated.size());
275 buffer[truncated.size()] = '\0'; // For _compile use
276 bufferSize = truncated.size();
277}
278
279void RegExp::prepareMatch(const UString &s)
280{
281 delete[] originalPos; // Just to be sure..
282 delete[] buffer;
283#ifdef HAVE_PCRE2POSIX
284 if (utf8Support == Supported)
285 prepareUtf8(s);
286 else
287#endif
288 prepareASCII(s);
289
290#ifndef NDEBUG
291 originalS = s;
292#endif
293}
294
295void RegExp::doneMatch()
296{
297 delete[] originalPos; originalPos = 0;
298 delete[] buffer; buffer = 0;
299}
300
301UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
302{
303#ifndef NDEBUG
304 assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
305#endif
306 assert(valid);
307
308 if (i < 0)
309 i = 0;
310 if (ovector)
311 *ovector = 0L;
312 int dummyPos;
313 if (!pos)
314 pos = &dummyPos;
315 *pos = -1;
316 if (i > s.size() || s.isNull())
317 return UString::null;
318
319#ifdef HAVE_PCRE2POSIX
320 if (!pcregex || !match_data)
321 return UString::null;
322 if (!ovector)
323 return UString::null;
324
325 int startPos;
326 int nextPos;
327 if (utf8Support == Supported)
328 {
329 startPos = i;
330 while (originalPos[startPos] < i)
331 ++startPos;
332
333 nextPos = startPos;
334 if (i < s.size()) {
335 while (originalPos[nextPos] < (i + 1))
336 ++nextPos;
337 }
338 }
339 else
340 {
341 startPos = i;
342 nextPos = i + (i < s.size() ? 1 : 0);
343 }
344
345 uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
346 if (m_notEmpty)
347 {
348 baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED;
349 }
350 int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL);
351 if (numMatches <= 0)
352 {
353 // Failed to match.
354 if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos)
355 {
356 // We set m_notEmpty ourselves, to look for a non-empty match
357 // So we don't stop here, we want to try again at i+1.
358#ifdef KJS_VERBOSE
359 fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
360#endif
361 m_notEmpty = 0;
362 baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
363 numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL);
364 if (numMatches <= 0)
365 return UString::null;
366 }
367 else
368 return UString::null;
369 }
370
371 PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data);
372 if (!pcre2_ovector)
373 return UString::null;
374
375 uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data);
376 *ovector = new int[pcre2_ovecCount * 2];
377 if (originalPos)
378 {
379 for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
380 {
381 (*ovector)[c] = (pcre2_ovector[c] != -1) ? originalPos[pcre2_ovector[c]] : -1;
382 }
383 }
384 else
385 {
386 for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
387 {
388 (*ovector)[c] = pcre2_ovector[c];
389 }
390 }
391#else
392 const uint maxMatch = 10;
393 regmatch_t rmatch[maxMatch];
394
395 char *str = strdup(s.ascii()); // TODO: why ???
396 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
397 free(str);
398 return UString::null;
399 }
400 free(str);
401
402 if (!ovector) {
403 *pos = rmatch[0].rm_so + i;
404 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
405 }
406
407 // map rmatch array to ovector used in PCRE case
408 nrSubPatterns = 0;
409 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
410 nrSubPatterns++;
411 // if the nonEmpty flag is set, return a failed match if any of the
412 // subMatches happens to be an empty string.
413 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
414 return UString::null;
415 }
416 // Allow an ovector slot to return the (failed) match result.
417 if (nrSubPatterns == 0) nrSubPatterns = 1;
418
419 int ovecsize = (nrSubPatterns)*3; // see above
420 *ovector = new int[ovecsize];
421 for (uint j = 0; j < nrSubPatterns; j++) {
422 (*ovector)[2*j] = rmatch[j].rm_so + i;
423 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
424 }
425#endif
426
427 *pos = (*ovector)[0];
428 if ( *pos == (*ovector)[1] && (flgs & Global) )
429 {
430 // empty match, next try will be with m_notEmpty=true
431 m_notEmpty=true;
432 }
433 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
434}
KJS::CString
8 bit char based string class
Definition: ustring.h:165
KJS::UString
Unicode string class.
Definition: ustring.h:189
KJS::UString::find
int find(const UString &f, int pos=0) const
Definition: ustring.cpp:798
KJS::UString::ascii
char * ascii() const
Convert the Unicode string to plain ASCII chars chopping of any higher bytes.
Definition: ustring.cpp:485
KJS::UString::isNull
bool isNull() const
Definition: ustring.h:343
KJS::UString::size
int size() const
Definition: ustring.h:359
KJS::UString::substr
UString substr(int pos=0, int len=-1) const
Definition: ustring.cpp:868
KJS::UString::cstring
CString cstring() const
Definition: ustring.cpp:480
KJS::UString::data
const UChar * data() const
Definition: ustring.h:339
KJS::UString::append
UString & append(const UString &)
Append another string.
Definition: ustring.cpp:457
KJS::UChar
Unicode character.
Definition: ustring.h:51
KJS::UChar::unicode
unsigned short unicode() const
Definition: ustring.h:81

kjs

Skip menu "kjs"
  • Main Page
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Class Members
  • Related Pages

kjs

Skip menu "kjs"
  • arts
  • dcop
  • dnssd
  • interfaces
  •   kspeech
  •     interface
  •     library
  •   tdetexteditor
  • kate
  • kded
  • kdoctools
  • kimgio
  • kjs
  • libtdemid
  • libtdescreensaver
  • tdeabc
  • tdecmshell
  • tdecore
  • tdefx
  • tdehtml
  • tdeinit
  • tdeio
  •   bookmarks
  •   httpfilter
  •   kpasswdserver
  •   kssl
  •   tdefile
  •   tdeio
  •   tdeioexec
  • tdeioslave
  •   http
  • tdemdi
  •   tdemdi
  • tdenewstuff
  • tdeparts
  • tdeprint
  • tderandr
  • tderesources
  • tdespell2
  • tdesu
  • tdeui
  • tdeunittest
  • tdeutils
  • tdewallet
Generated for kjs by doxygen 1.9.4
This website is maintained by Timothy Pearson.