• Skip to content
  • Skip to link menu
Trinity API Reference
  • Trinity API Reference
  • kjs
 

kjs

  • kjs
regexp.cpp
1/*
2 * This file is part of the KDE libraries
3 * Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
4 * Copyright (C) 2003,2004 Apple Computer, Inc.
5 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 */
22
23#include "regexp.h"
24
25#include "lexer.h"
26#include <assert.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30
31using namespace KJS;
32
33RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
34
35RegExp::RegExp(const UString &p, int f)
36 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
37{
38 // Determine whether libpcre has unicode support if need be..
39 if (utf8Support == Unknown) {
40 uint32_t supported;
41 pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (void*)&supported);
42 utf8Support = (supported & 0x0001) ? Supported : Unsupported;
43 }
44
45 nrSubPatterns = 0; // determined in match() with POSIX regex.
46
47 // JS regexps can contain Unicode escape sequences (\uxxxx) which
48 // are rather uncommon elsewhere. As our regexp libs don't understand
49 // them we do the unescaping ourselves internally.
50 // Also make sure to expand out any nulls as pcre_compile
51 // expects null termination..
52 UString intern;
53 const char* const nil = "\\x00";
54 if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
55 bool escape = false;
56 for (int i = 0; i < p.size(); ++i) {
57 UChar c = p[i];
58 if (escape) {
59 escape = false;
60 // we only care about \u
61 if (c == 'u') {
62 // standard unicode escape sequence looks like \uxxxx but
63 // other browsers also accept less then 4 hex digits
64 unsigned short u = 0;
65 int j = 0;
66 for (j = 0; j < 4; ++j) {
67 if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
68 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
69 ++i;
70 } else {
71 // sequence incomplete. restore index.
72 // TODO: cleaner way to propagate warning
73 fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
74 i -= j;
75 break;
76 }
77 }
78 if (j < 4) {
79 // sequence was incomplete. treat \u as u which IE always
80 // and FF sometimes does.
81 intern.append(UString('u'));
82 } else {
83 c = UChar(u);
84 switch (u) {
85 case 0:
86 // Make sure to encode 0, to avoid terminating the string
87 intern += UString(nil);
88 break;
89 case '^':
90 case '$':
91 case '\\':
92 case '.':
93 case '*':
94 case '+':
95 case '?':
96 case '(': case ')':
97 case '{': case '}':
98 case '[': case ']':
99 case '|':
100 // escape pattern characters have to remain escaped
101 intern.append(UString('\\'));
102 // intentional fallthrough
103 default:
104 intern += UString(&c, 1);
105 break;
106 }
107 }
108 continue;
109 }
110 intern += UString('\\');
111 intern += UString(&c, 1);
112 } else {
113 if (c == '\\')
114 escape = true;
115 else if (c == '\0')
116 intern += UString(nil);
117 else
118 intern += UString(&c, 1);
119 }
120 }
121 } else {
122 intern = p;
123 }
124
125#ifdef HAVE_PCRE2POSIX
126 uint32_t pcre2flags = 0;
127 int errorCode;
128 PCRE2_SIZE errorOffset;
129
130 if (flgs & IgnoreCase)
131 pcre2flags |= PCRE2_CASELESS;
132
133 if (flgs & Multiline)
134 pcre2flags |= PCRE2_MULTILINE;
135
136 if (utf8Support == Supported)
137 pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
138
139 // Fill our buffer with an encoded version, whether utf-8, or,
140 // if PCRE is incapable, truncated.
141 prepareMatch(intern);
142
143 pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags,
144 &errorCode, &errorOffset, NULL);
145 doneMatch(); // Cleanup buffers
146 if (!pcregex) {
147#ifndef NDEBUG
148 PCRE2_UCHAR errorMsg[256];
149 pcre2_get_error_message(errorCode, errorMsg, sizeof(errorMsg));
150 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMsg);
151#endif
152 match_data = nullptr;
153 valid = false;
154 return;
155 }
156
157 // Get number of subpatterns that will be returned
158 int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns);
159 if (rc != 0)
160 {
161 nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
162 }
163
164 match_data = pcre2_match_data_create_from_pattern(pcregex, NULL);
165#else
166
167 int regflags = 0;
168#ifdef REG_EXTENDED
169 regflags |= REG_EXTENDED;
170#endif
171#ifdef REG_ICASE
172 if ( f & IgnoreCase )
173 regflags |= REG_ICASE;
174#endif
175
176 //NOTE: Multiline is not feasible with POSIX regex.
177 //if ( f & Multiline )
178 // ;
179 // Note: the Global flag is already handled by RegExpProtoFunc::execute
180
181 int errorCode = regcomp(&preg, intern.ascii(), regflags);
182 if (errorCode != 0) {
183#ifndef NDEBUG
184 char errorMessage[80];
185 regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
186 fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
187#endif
188 valid = false;
189 }
190#endif
191}
192
193RegExp::~RegExp()
194{
195 doneMatch(); // Be 100% sure buffers are freed
196#ifdef HAVE_PCRE2POSIX
197 if (match_data)
198 {
199 pcre2_match_data_free(match_data);
200 }
201 if (pcregex)
202 {
203 pcre2_code_free(pcregex);
204 }
205#else
206 /* TODO: is this really okay after an error ? */
207 regfree(&preg);
208#endif
209}
210
211void RegExp::prepareUtf8(const UString& s)
212{
213 // Allocate a buffer big enough to hold all the characters plus \0
214 const int length = s.size();
215 buffer = new buftype_t[length * 3 + 1];
216
217 // Also create buffer for positions. We need one extra character in there,
218 // even past the \0 since the non-empty handling may jump one past the end
219 originalPos = new int[length * 3 + 2];
220
221 // Convert to runs of 8-bit characters, and generate indeces
222 // Note that we do NOT combine surrogate pairs here, as
223 // regexps operate on them as separate characters
224 buftype_t *p = buffer;
225 int *posOut = originalPos;
226 const UChar *d = s.data();
227 for (int i = 0; i != length; ++i) {
228 unsigned short c = d[i].unicode();
229
230 int sequenceLen;
231 if (c < 0x80) {
232 *p++ = (buftype_t)c;
233 sequenceLen = 1;
234 } else if (c < 0x800) {
235 *p++ = (buftype_t)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
236 *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
237 sequenceLen = 2;
238 } else {
239 *p++ = (buftype_t)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
240 *p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
241 *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
242 sequenceLen = 3;
243 }
244
245 while (sequenceLen > 0) {
246 *posOut = i;
247 ++posOut;
248 --sequenceLen;
249 }
250 }
251
252 bufferSize = p - buffer;
253
254 *p++ = '\0';
255
256 // Record positions for \0, and the fictional character after that.
257 *posOut = length;
258 *(posOut+1) = length+1;
259}
260
261void RegExp::prepareASCII (const UString& s)
262{
263 originalPos = 0;
264
265 // Best-effort attempt to get something done
266 // when we don't have utf 8 available -- use
267 // truncated version, and pray for the best
268 CString truncated = s.cstring();
269 buffer = new buftype_t[truncated.size() + 1];
270 memcpy(buffer, truncated.c_str(), truncated.size());
271 buffer[truncated.size()] = '\0'; // For _compile use
272 bufferSize = truncated.size();
273}
274
275void RegExp::prepareMatch(const UString &s)
276{
277 delete[] originalPos; // Just to be sure..
278 delete[] buffer;
279 if (utf8Support == Supported)
280 prepareUtf8(s);
281 else
282 prepareASCII(s);
283
284#ifndef NDEBUG
285 originalS = s;
286#endif
287}
288
289void RegExp::doneMatch()
290{
291 delete[] originalPos; originalPos = 0;
292 delete[] buffer; buffer = 0;
293}
294
295UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
296{
297#ifndef NDEBUG
298 assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
299#endif
300 assert(valid);
301
302 if (i < 0)
303 i = 0;
304 if (ovector)
305 *ovector = 0L;
306 int dummyPos;
307 if (!pos)
308 pos = &dummyPos;
309 *pos = -1;
310 if (i > s.size() || s.isNull())
311 return UString::null;
312
313#ifdef HAVE_PCRE2POSIX
314 if (!pcregex || !match_data)
315 return UString::null;
316 if (!ovector)
317 return UString::null;
318
319 int startPos;
320 int nextPos;
321 if (utf8Support == Supported)
322 {
323 startPos = i;
324 while (originalPos[startPos] < i)
325 ++startPos;
326
327 nextPos = startPos;
328 if (i < s.size()) {
329 while (originalPos[nextPos] < (i + 1))
330 ++nextPos;
331 }
332 }
333 else
334 {
335 startPos = i;
336 nextPos = i + (i < s.size() ? 1 : 0);
337 }
338
339 uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
340 if (m_notEmpty)
341 {
342 baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED;
343 }
344 int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL);
345 if (numMatches <= 0)
346 {
347 // Failed to match.
348 if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos)
349 {
350 // We set m_notEmpty ourselves, to look for a non-empty match
351 // So we don't stop here, we want to try again at i+1.
352#ifdef KJS_VERBOSE
353 fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
354#endif
355 m_notEmpty = 0;
356 baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
357 numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL);
358 if (numMatches <= 0)
359 return UString::null;
360 }
361 else
362 return UString::null;
363 }
364
365 PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data);
366 if (!pcre2_ovector)
367 return UString::null;
368
369 uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data);
370 *ovector = new int[pcre2_ovecCount * 2];
371 if (originalPos)
372 {
373 for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
374 {
375 (*ovector)[c] = (pcre2_ovector[c] != -1) ? originalPos[pcre2_ovector[c]] : -1;
376 }
377 }
378 else
379 {
380 for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
381 {
382 (*ovector)[c] = pcre2_ovector[c];
383 }
384 }
385#else
386 const uint maxMatch = 10;
387 regmatch_t rmatch[maxMatch];
388
389 char *str = strdup(s.ascii()); // TODO: why ???
390 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
391 free(str);
392 return UString::null;
393 }
394 free(str);
395
396 if (!ovector) {
397 *pos = rmatch[0].rm_so + i;
398 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
399 }
400
401 // map rmatch array to ovector used in PCRE case
402 nrSubPatterns = 0;
403 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
404 nrSubPatterns++;
405 // if the nonEmpty flag is set, return a failed match if any of the
406 // subMatches happens to be an empty string.
407 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
408 return UString::null;
409 }
410 // Allow an ovector slot to return the (failed) match result.
411 if (nrSubPatterns == 0) nrSubPatterns = 1;
412
413 int ovecsize = (nrSubPatterns)*3; // see above
414 *ovector = new int[ovecsize];
415 for (uint j = 0; j < nrSubPatterns; j++) {
416 (*ovector)[2*j] = rmatch[j].rm_so + i;
417 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
418 }
419#endif
420
421 *pos = (*ovector)[0];
422 if ( *pos == (*ovector)[1] && (flgs & Global) )
423 {
424 // empty match, next try will be with m_notEmpty=true
425 m_notEmpty=true;
426 }
427 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
428}
KJS::CString
8 bit char based string class
Definition: ustring.h:165
KJS::UString
Unicode string class.
Definition: ustring.h:189
KJS::UString::find
int find(const UString &f, int pos=0) const
Definition: ustring.cpp:798
KJS::UString::ascii
char * ascii() const
Convert the Unicode string to plain ASCII chars chopping of any higher bytes.
Definition: ustring.cpp:485
KJS::UString::isNull
bool isNull() const
Definition: ustring.h:343
KJS::UString::size
int size() const
Definition: ustring.h:359
KJS::UString::substr
UString substr(int pos=0, int len=-1) const
Definition: ustring.cpp:868
KJS::UString::cstring
CString cstring() const
Definition: ustring.cpp:480
KJS::UString::data
const UChar * data() const
Definition: ustring.h:339
KJS::UString::append
UString & append(const UString &)
Append another string.
Definition: ustring.cpp:457
KJS::UChar
Unicode character.
Definition: ustring.h:51
KJS::UChar::unicode
unsigned short unicode() const
Definition: ustring.h:81

kjs

Skip menu "kjs"
  • Main Page
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Class Members
  • Related Pages

kjs

Skip menu "kjs"
  • arts
  • dcop
  • dnssd
  • interfaces
  •   kspeech
  •     interface
  •     library
  •   tdetexteditor
  • kate
  • kded
  • kdoctools
  • kimgio
  • kjs
  • libtdemid
  • libtdescreensaver
  • tdeabc
  • tdecmshell
  • tdecore
  • tdefx
  • tdehtml
  • tdeinit
  • tdeio
  •   bookmarks
  •   httpfilter
  •   kpasswdserver
  •   kssl
  •   tdefile
  •   tdeio
  •   tdeioexec
  • tdeioslave
  •   http
  • tdemdi
  •   tdemdi
  • tdenewstuff
  • tdeparts
  • tdeprint
  • tderandr
  • tderesources
  • tdespell2
  • tdesu
  • tdeui
  • tdeunittest
  • tdeutils
  • tdewallet
Generated for kjs by doxygen 1.9.4
This website is maintained by Timothy Pearson.