// Class-wide cache of whether the regex back end can be driven in UTF-8 mode.
// It starts as Unknown; the constructor replaces it with Supported or
// Unsupported the first time it finds the value still Unknown.
33 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
// Constructs a regular expression from pattern text `p` and flag bits `f`.
// The initializer list stores both, clears m_notEmpty, marks the object valid
// and leaves the scratch pointers (buffer, originalPos) null.
// NOTE: this listing omits many lines of the body (including braces and some
// conditions), so the comments below describe only the visible statements.
35 RegExp::RegExp(
const UString &p,
int f)
36 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
// One-time probe: utf8Support is a static, so PCRE2 is only queried while the
// cached state is still Unknown.
39 if (utf8Support == Unknown) {
// PCRE2_CONFIG_COMPILED_WIDTHS reports the code-unit widths the library was
// built with; bit 0x0001 is the 8-bit library.
// NOTE(review): this tests for the 8-bit build, not for Unicode support
// (PCRE2_CONFIG_UNICODE) - confirm that this is the intended check.
41 pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (
void*)&supported);
42 utf8Support = (supported & 0x0001) ? Supported : Unsupported;
// The four characters backslash, 'x', '0', '0' as pattern text; presumably
// substituted for embedded NUL characters - the use is not visible here.
53 const char*
const nil =
"\\x00";
// Walk the pattern one UTF-16 code unit at a time.
56 for (
int i = 0; i < p.
size(); ++i) {
// Read up to four hex digits that follow position i, shifting each one into
// u as the next nibble (the diagnostic below shows this is the handling of a
// backslash-u escape).
66 for (j = 0; j < 4; ++j) {
67 if (i + 1 < p.
size() && Lexer::isHexDigit(p[i + 1].unicode())) {
68 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
// Diagnostic reporting how many hex digits (j) were actually consumed.
73 fprintf(stderr,
"KJS: saw %d digit \\u sequence.\n", j);
// ---- PCRE2 back end ----
125 #ifdef HAVE_PCRE2POSIX
126 uint32_t pcre2flags = 0;
128 PCRE2_SIZE errorOffset;
// Translate the script-level flags into PCRE2 compile options.
130 if (flgs & IgnoreCase)
131 pcre2flags |= PCRE2_CASELESS;
133 if (flgs & Multiline)
134 pcre2flags |= PCRE2_MULTILINE;
// In UTF mode, also ask PCRE2 to skip its own validity check of the pattern
// bytes (PCRE2_NO_UTF_CHECK).
136 if (utf8Support == Supported)
137 pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
// Presumably fills `buffer` with the encoded form of `intern`; `buffer` is
// what gets compiled on the next statement.
141 prepareMatch(intern);
// The pattern is passed as a zero-terminated string.
143 pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags,
144 &errorCode, &errorOffset, NULL);
// Error reporting; the enclosing failure test is not visible in this excerpt.
148 PCRE2_UCHAR errorMsg[256];
149 pcre2_get_error_message(errorCode, errorMsg,
sizeof(errorMsg));
150 fprintf(stderr,
"KJS: pcre_compile() failed with '%s'\n", errorMsg);
// Leave match_data null; match() returns UString::null when it is null.
152 match_data =
nullptr;
// Ask the compiled pattern how many capture groups it has.
158 int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns);
// Allocate a match-data block sized for this pattern.
164 match_data = pcre2_match_data_create_from_pattern(pcregex, NULL);
// ---- POSIX regex fallback (the preprocessor directive that opens this branch
// is not visible in this excerpt) ----
169 regflags |= REG_EXTENDED;
172 if ( f & IgnoreCase )
173 regflags |= REG_ICASE;
// ascii() drops the high byte of every character, so non-ASCII pattern text is
// not preserved on this path.
181 int errorCode = regcomp(&preg, intern.
ascii(), regflags);
182 if (errorCode != 0) {
// regerror() formats the failure into the 80-byte buffer for the log line.
184 char errorMessage[80];
185 regerror(errorCode, &preg, errorMessage,
sizeof errorMessage);
186 fprintf(stderr,
"KJS: regcomp failed with '%s'\n", errorMessage);
// Releases the PCRE2 resources created by the constructor: first the match-data
// block, then the compiled pattern.
// NOTE(review): the enclosing function header is not visible in this excerpt;
// presumably this is the body of the destructor - confirm.
196 #ifdef HAVE_PCRE2POSIX
199 pcre2_match_data_free(match_data);
203 pcre2_code_free(pcregex);
// Encodes the UTF-16 string `s` into `buffer` as a byte sequence, filling
// `originalPos` as it goes. match() later uses originalPos to translate between
// byte offsets in `buffer` and indices into the original string.
// NOTE: several lines of this function are missing from the excerpt.
211 void RegExp::prepareUtf8(
const UString& s)
214 const int length = s.
size();
// The visible encodings emit at most three bytes per 16-bit code unit, hence
// length * 3; the extra slot is presumably for a terminator - TODO confirm.
215 buffer =
new buftype_t[length * 3 + 1];
// The position table gets one slot more than the byte buffer; the two entries
// past the data are written near the end of the function.
219 originalPos =
new int[length * 3 + 2];
// Write cursors into the two output arrays.
224 buftype_t *p = buffer;
225 int *posOut = originalPos;
227 for (
int i = 0; i != length; ++i) {
228 unsigned short c = d[i].
unicode();
// Two-byte form for code units below 0x800 (the first branch of this chain is
// not visible here): lead byte 110xxxxx, then one continuation byte.
234 }
else if (c < 0x800) {
235 *p++ = (buftype_t)((c >> 6) | 0xC0);
// Setting bit 7 and clearing bit 6 yields a continuation byte (10xxxxxx) that
// carries the low six bits of c.
236 *p++ = (buftype_t)((c | 0x80) & 0xBF);
// Three-byte form (branch header not visible): lead byte 1110xxxx followed by
// two continuation bytes.
// NOTE(review): no four-byte form is visible, so surrogate halves appear to be
// encoded individually - confirm against the missing lines.
239 *p++ = (buftype_t)((c >> 12) | 0xE0);
240 *p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF);
241 *p++ = (buftype_t)((c | 0x80) & 0xBF);
// Presumably records one originalPos entry per byte just emitted; the loop
// body is not visible.
245 while (sequenceLen > 0) {
// Number of bytes written to buffer.
252 bufferSize = p - buffer;
// Trailing table entry holding length + 1, a value larger than any index into
// s; presumably a sentinel that stops the forward scans in match().
258 *(posOut+1) = length+1;
// Fills `buffer` with a NUL-terminated byte copy of `truncated` and records its
// length in bufferSize.
// NOTE: the declaration of `truncated` is not visible in this excerpt;
// presumably it is a narrowed (8-bit) form of `s` - confirm.
261 void RegExp::prepareASCII (
const UString& s)
// One extra byte for the terminator written below.
269 buffer =
new buftype_t[truncated.size() + 1];
270 memcpy(buffer, truncated.c_str(), truncated.size());
271 buffer[truncated.size()] =
'\0';
// The size excludes the terminator.
272 bufferSize = truncated.size();
// Prepares the scratch buffers for matching against `s`.
// Only part of the body is visible: it frees any previous position table and
// then branches on whether UTF-8 mode is available (presumably choosing between
// prepareUtf8() and prepareASCII() - the calls themselves are not shown).
275 void RegExp::prepareMatch(
const UString &s)
// Drop the table left over from an earlier call before building a new one.
277 delete[] originalPos;
279 if (utf8Support == Supported)
// Releases the scratch arrays and resets both pointers to null.
289 void RegExp::doneMatch()
// Nulling after delete[] keeps a later delete[] on the same member harmless.
291 delete[] originalPos; originalPos = 0;
292 delete[] buffer; buffer = 0;
// Runs the compiled expression against `s`, starting at index `i`.
//   s       - subject string; asserted to share its data with originalS
//   i       - start index, in units of s
//   pos     - receives the start index of the match
//   ovector - when used, receives a new[]-allocated array of start/end index
//             pairs; this function never frees it in the visible lines, so
//             presumably the caller owns it - confirm
// Returns the matched substring, or UString::null on the failure paths below.
// NOTE: many lines of this function are missing from the excerpt, including the
// conditions guarding several of the early returns.
295 UString RegExp::match(
const UString &s,
int i,
int *pos,
int **ovector)
// The scratch buffers must have been built from this same string.
298 assert(s.
data() == originalS.data());
311 return UString::null;
// ---- PCRE2 back end ----
313 #ifdef HAVE_PCRE2POSIX
// Nothing to match with if compilation or match-data allocation failed.
314 if (!pcregex || !match_data)
315 return UString::null;
317 return UString::null;
// In UTF-8 mode the offsets handed to PCRE2 are byte offsets into `buffer`, so
// index i (and i + 1) are located by scanning originalPos forward.
321 if (utf8Support == Supported)
324 while (originalPos[startPos] < i)
329 while (originalPos[nextPos] < (i + 1))
// Otherwise offsets and indices coincide: step one unit forward unless already
// at the end of s.
336 nextPos = i + (i < s.
size() ? 1 : 0);
// The subject was encoded by this class, so PCRE2's UTF validity check is
// skipped in UTF mode.
339 uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
// Require a non-empty match anchored at startPos. The guarding condition is
// not visible; presumably it is m_notEmpty - confirm.
342 baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED;
344 int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL);
// Global search that found nothing under the m_notEmpty restriction: retry
// from the next position with the restriction lifted.
348 if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos)
353 fprintf(stderr,
"No match after m_notEmpty. +1 and keep going.\n");
// Reset the flags to drop PCRE2_NOTEMPTY | PCRE2_ANCHORED for the retry.
356 baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
357 numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL);
359 return UString::null;
362 return UString::null;
// Fetch the offset pairs PCRE2 recorded for the match.
365 PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data);
367 return UString::null;
// Two ints (start, end) per ovector pair.
369 uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data);
370 *ovector =
new int[pcre2_ovecCount * 2];
// UTF-8 mode: map each byte offset back to an index in s through originalPos,
// passing the -1 "unset" marker through unchanged.
// NOTE(review): pcre2_ovector holds unsigned PCRE2_SIZE values, so the test
// against -1 relies on integer conversion; PCRE2_UNSET is the documented
// constant - confirm.
373 for (
size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
375 (*ovector)[c] = (pcre2_ovector[c] != -1) ? originalPos[pcre2_ovector[c]] : -1;
// Non-UTF mode: offsets are already indices into s, so copy them as they are.
380 for (
size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
382 (*ovector)[c] = pcre2_ovector[c];
// ---- POSIX regex fallback (the preprocessor directive that opens this branch
// is not visible in this excerpt) ----
// At most ten match slots are collected.
386 const uint maxMatch = 10;
387 regmatch_t rmatch[maxMatch];
// NOTE(review): str is heap-allocated by strdup(); no matching free() is
// visible in this excerpt - confirm that every return path releases it.
389 char *str = strdup(s.
ascii());
// Matching starts at offset i, so the reported offsets are relative to i.
// A non-zero result from regexec() means no match.
390 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
392 return UString::null;
// Short path returning only the overall match (guard not visible; presumably
// taken when the caller passed no ovector). Offsets are rebased by + i.
397 *pos = rmatch[0].rm_so + i;
398 return s.
substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
// Walk the filled match slots; a negative rm_so ends the walk.
403 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
// With m_notEmpty set, an empty slot makes the whole call fail.
407 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
408 return UString::null;
// Always report at least the overall match.
411 if (nrSubPatterns == 0) nrSubPatterns = 1;
// Three ints are allocated per subpattern, although the loop below fills only
// two of them.
413 int ovecsize = (nrSubPatterns)*3;
414 *ovector =
new int[ovecsize];
// Store start/end pairs rebased to indices in s.
415 for (uint j = 0; j < nrSubPatterns; j++) {
416 (*ovector)[2*j] = rmatch[j].rm_so + i;
417 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
// Common tail: report where the overall match begins.
421 *pos = (*ovector)[0];
// Empty match during a global search; the action taken is not visible here.
422 if ( *pos == (*ovector)[1] && (flgs & Global) )
// The result is the text between the first offset pair.
427 return s.
substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
8-bit char-based string class
int find(const UString &f, int pos=0) const
char * ascii() const
Convert the Unicode string to plain ASCII chars, chopping off any higher bytes.
const UChar * data() const
UString substr(int pos=0, int len=-1) const
UString & append(const UString &)
Append another string.
unsigned short unicode() const