// Class-wide cache of whether the regex library can be used in UTF mode.
// Starts as Unknown; the constructor replaces it with Supported or Unsupported.
34RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
// Builds a RegExp from pattern text p and flag bits f. The initializer list
// records both (pat, flgs) and starts the object as valid, with m_notEmpty
// cleared and neither buffer nor originalPos allocated.
37RegExp::RegExp(
const UString &p,
int f)
38 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
// utf8Support is shared by all instances, so the library is probed only while
// the state is still Unknown.
// NOTE(review): PCRE2_CONFIG_COMPILED_WIDTHS reports which code-unit widths the
// library was built for (bit 0x0001 = 8-bit); PCRE2_CONFIG_UNICODE is the
// option that reports UTF support - confirm which one is intended here.
42 if (utf8Support == Unknown) {
44 pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (
void*)&supported);
45 utf8Support = (supported & 0x0001) ? Supported : Unsupported;
// Text of the four-character escape sequence: backslash, 'x', '0', '0'.
// Presumably substituted for embedded NUL characters in the pattern; the use
// site is not visible in this excerpt.
57 const char*
const nil =
"\\x00";
// Scan the pattern by index.
60 for (
int i = 0; i < p.
size(); ++i) {
// Examine p[i + 1] up to four times, folding each hex digit into u (4 bits
// per digit). How i advances between iterations is not visible here.
70 for (j = 0; j < 4; ++j) {
71 if (i + 1 < p.
size() && Lexer::isHexDigit(p[i + 1].unicode())) {
72 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
// Log the number of digits seen (j) to stderr.
77 fprintf(stderr,
"KJS: saw %d digit \\u sequence.\n", j);
// PCRE2 build: translate the flag bits into PCRE2 compile options.
129#ifdef HAVE_PCRE2POSIX
130 uint32_t pcre2flags = 0;
132 PCRE2_SIZE errorOffset;
134 if (flgs & IgnoreCase)
135 pcre2flags |= PCRE2_CASELESS;
137 if (flgs & Multiline)
138 pcre2flags |= PCRE2_MULTILINE;
// With UTF available, compile in UTF mode and ask PCRE2 not to validate the
// encoding of the pattern.
140 if (utf8Support == Supported)
141 pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
// The pattern handed to pcre2_compile() is `buffer`, read as a
// zero-terminated string; prepareMatch(intern) is expected to have filled it.
145 prepareMatch(intern);
147 pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags,
148 &errorCode, &errorOffset, NULL);
// Turn the PCRE2 error code into text and log it.
152 PCRE2_UCHAR errorMsg[256];
153 pcre2_get_error_message(errorCode, errorMsg,
sizeof(errorMsg));
154 fprintf(stderr,
"KJS: pcre_compile() failed with '%s'\n", errorMsg);
// match() returns early when match_data is null.
156 match_data =
nullptr;
// Ask PCRE2 for the capture-group count; the count lands in nrSubPatterns and
// the call's status in rc.
162 int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns);
// Allocate match data sized from the compiled pattern.
168 match_data = pcre2_match_data_create_from_pattern(pcregex, NULL);
// POSIX <regex.h> path (the preprocessor line that selects it is not visible
// in this excerpt).
173 regflags |= REG_EXTENDED;
// The case-insensitive flag maps to REG_ICASE.
176 if ( f & IgnoreCase )
177 regflags |= REG_ICASE;
// regcomp() receives the ASCII rendering of the pattern; a non-zero result is
// an error code.
185 int errorCode = regcomp(&preg, intern.
ascii(), regflags);
186 if (errorCode != 0) {
// Fetch the human-readable message for the failure and log it.
188 char errorMessage[80];
189 regerror(errorCode, &preg, errorMessage,
sizeof errorMessage);
190 fprintf(stderr,
"KJS: regcomp failed with '%s'\n", errorMessage);
200#ifdef HAVE_PCRE2POSIX
203 pcre2_match_data_free(match_data);
207 pcre2_code_free(pcregex);
// Encodes s into a freshly allocated `buffer` as a byte sequence and allocates
// originalPos alongside it. match() later reads originalPos[offset] to turn a
// buffer offset back into a position in the original string.
215void RegExp::prepareUtf8(
const UString& s)
// Worst case is three output bytes per input unit (see the branches below),
// plus one terminator slot.
218 const int length = s.
size();
219 buffer =
new buftype_t[length * 3 + 1];
// One int per possible output byte, plus two trailing slots.
223 originalPos =
new int[length * 3 + 2];
// Write cursors into the two arrays.
228 buftype_t *p = buffer;
229 int *posOut = originalPos;
// Visit every unit of the input. (`d` is declared on a line not visible here;
// presumably s.data().)
231 for (
int i = 0; i != length; ++i) {
232 unsigned short c = d[i].
unicode();
// Below 0x800: two-byte form, 110xxxxx 10xxxxxx.
238 }
else if (c < 0x800) {
239 *p++ = (buftype_t)((c >> 6) | 0xC0);
240 *p++ = (buftype_t)((c | 0x80) & 0xBF);
// Three-byte form, 1110xxxx 10xxxxxx 10xxxxxx.
243 *p++ = (buftype_t)((c >> 12) | 0xE0);
244 *p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF);
245 *p++ = (buftype_t)((c | 0x80) & 0xBF);
// Loop driven by sequenceLen; its body is not visible in this excerpt
// (presumably it fills posOut for each byte just written - confirm).
249 while (sequenceLen > 0) {
// Number of bytes written to buffer.
256 bufferSize = p - buffer;
// The slot after posOut receives length + 1, a value past every valid index
// of s.
262 *(posOut+1) = length+1;
// Fills `buffer` with the bytes of `truncated`, adds a terminating NUL and
// records the byte count in bufferSize. `truncated` is built from s on lines
// not visible in this excerpt.
265void RegExp::prepareASCII (
const UString& s)
// One extra slot for the terminator.
273 buffer =
new buftype_t[truncated.size() + 1];
274 memcpy(buffer, truncated.c_str(), truncated.size());
275 buffer[truncated.size()] =
'\0';
// The size excludes the terminator.
276 bufferSize = truncated.size();
// Gets the object ready to match against s: releases the previous position
// map, then (PCRE2 build) branches on utf8Support. The branch bodies are not
// visible here; presumably they call prepareUtf8() / prepareASCII().
279void RegExp::prepareMatch(
const UString &s)
// Drop the map left over from an earlier call; delete[] on null is harmless.
281 delete[] originalPos;
283#ifdef HAVE_PCRE2POSIX
284 if (utf8Support == Supported)
// Releases the per-subject arrays and nulls both pointers so that a later
// delete[] on them is a no-op.
295void RegExp::doneMatch()
297 delete[] originalPos; originalPos = 0;
298 delete[] buffer; buffer = 0;
// Runs the compiled expression against s starting at index i.
// On success returns the matched substring of s, stores the match start in
// *pos and stores a newly allocated int array of start/end offsets in
// *ovector. Several early exits return UString::null instead.
301UString RegExp::match(
const UString &s,
int i,
int *pos,
int **ovector)
// Debug check: s must share its data pointer with originalS.
304 assert(s.
data() == originalS.data());
317 return UString::null;
319#ifdef HAVE_PCRE2POSIX
// Nothing to match with if compilation failed or match data was never created.
320 if (!pcregex || !match_data)
321 return UString::null;
323 return UString::null;
// In UTF mode i indexes s, not buffer: the loops below look for the first
// entries of originalPos that reach i (startPos) and i + 1 (nextPos).
327 if (utf8Support == Supported)
330 while (originalPos[startPos] < i)
335 while (originalPos[nextPos] < (i + 1))
// Without the position map the next position is i + 1, or i itself when i is
// already at the end of s.
342 nextPos = i + (i < s.
size() ? 1 : 0);
// Skip PCRE2's UTF validity check on the subject when running in UTF mode.
345 uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
// Extra constraints: no empty match, anchored at the start offset (the
// condition guarding this line is not visible here).
348 baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED;
// The subject is buffer, read as a zero-terminated string, starting at startPos.
350 int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL);
// A global expression that failed under m_notEmpty is retried from nextPos
// with the extra constraints dropped.
354 if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos)
359 fprintf(stderr,
"No match after m_notEmpty. +1 and keep going.\n");
362 baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
363 numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL);
365 return UString::null;
368 return UString::null;
// PCRE2 keeps the offsets inside match_data; they are copied into an int
// array handed back through *ovector.
371 PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data);
373 return UString::null;
// The count is in pairs, so twice as many ints are allocated.
375 uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data);
376 *ovector =
new int[pcre2_ovecCount * 2];
// Copy that maps buffer offsets back to positions in s through originalPos;
// unset groups become -1 (presumably the UTF-mode branch).
// NOTE(review): pcre2_ovector[c] is PCRE2_SIZE (unsigned), so the comparison
// with -1 relies on integer conversion; PCRE2_UNSET is the documented sentinel.
379 for (
size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
381 (*ovector)[c] = (pcre2_ovector[c] != -1) ? originalPos[pcre2_ovector[c]] : -1;
// Straight copy, no position mapping.
386 for (
size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
388 (*ovector)[c] = pcre2_ovector[c];
// POSIX path: regexec() reports at most maxMatch (10) groups.
392 const uint maxMatch = 10;
393 regmatch_t rmatch[maxMatch];
// regexec() runs on a strdup()'d ASCII copy of s, offset by i.
// NOTE(review): every exit must free(str); the release is not visible in this
// excerpt - confirm.
395 char *str = strdup(s.
ascii());
396 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
398 return UString::null;
// rm_so / rm_eo are relative to str + i, hence the + i.
403 *pos = rmatch[0].rm_so + i;
404 return s.
substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
// Walk the reported groups until the first unused slot (rm_so < 0).
409 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
// An empty group while m_notEmpty is set is treated as no match.
413 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
414 return UString::null;
// Always report at least the whole-match entry.
417 if (nrSubPatterns == 0) nrSubPatterns = 1;
// Three ints per group are allocated; the loop below writes two per group.
419 int ovecsize = (nrSubPatterns)*3;
420 *ovector =
new int[ovecsize];
421 for (uint j = 0; j < nrSubPatterns; j++) {
422 (*ovector)[2*j] = rmatch[j].rm_so + i;
423 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
// The match starts at the first offset.
427 *pos = (*ovector)[0];
// Empty match on a global expression (the statement this guards is not
// visible in this excerpt).
428 if ( *pos == (*ovector)[1] && (flgs & Global) )
// Return the matched slice of s.
433 return s.
substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
8 bit char based string class
int find(const UString &f, int pos=0) const
char * ascii() const
Convert the Unicode string to plain ASCII chars, chopping off any higher bytes.
UString substr(int pos=0, int len=-1) const
const UChar * data() const
UString & append(const UString &)
Append another string.
unsigned short unicode() const