33 #ifdef PCRE_CONFIG_UTF8
34 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
37 RegExp::RegExp(
const UString &p,
int f)
38 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
41 #ifdef PCRE_CONFIG_UTF8
42 if (utf8Support == Unknown) {
44 pcre_config(PCRE_CONFIG_UTF8, (
void*)&supported);
45 utf8Support = supported ? Supported : Unsupported;
57 const char*
const nil =
"\\x00";
60 for (
int i = 0; i < p.
size(); ++i) {
70 for (j = 0; j < 4; ++j) {
71 if (i + 1 < p.
size() && Lexer::isHexDigit(p[i + 1].unicode())) {
72 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
77 fprintf(stderr,
"KJS: saw %d digit \\u sequence.\n", j);
129 #ifdef HAVE_PCREPOSIX
131 const char *perrormsg;
134 if (flgs & IgnoreCase)
135 pcreflags |= PCRE_CASELESS;
137 if (flgs & Multiline)
138 pcreflags |= PCRE_MULTILINE;
140 #ifdef PCRE_CONFIG_UTF8
141 if (utf8Support == Supported)
142 pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
147 prepareMatch(intern);
149 pcregex = pcre_compile(buffer, pcreflags,
150 &perrormsg, &errorOffset, NULL);
154 fprintf(stderr,
"KJS: pcre_compile() failed with '%s'\n", perrormsg);
160 #ifdef PCRE_INFO_CAPTURECOUNT
162 int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
171 regflags |= REG_EXTENDED;
174 if ( f & IgnoreCase )
175 regflags |= REG_ICASE;
183 int errorCode = regcomp(&preg, intern.
ascii(), regflags);
184 if (errorCode != 0) {
186 char errorMessage[80];
187 regerror(errorCode, &preg, errorMessage,
sizeof errorMessage);
188 fprintf(stderr,
"KJS: regcomp failed with '%s'\n", errorMessage);
198 #ifdef HAVE_PCREPOSIX
207 void RegExp::prepareUtf8(
const UString& s)
210 const int length = s.
size();
211 buffer =
new char[length * 3 + 1];
215 originalPos =
new int[length * 3 + 2];
221 int *posOut = originalPos;
223 for (
int i = 0; i != length; ++i) {
224 unsigned short c = d[i].
unicode();
230 }
else if (c < 0x800) {
231 *p++ = (char)((c >> 6) | 0xC0);
232 *p++ = (char)((c | 0x80) & 0xBF);
235 *p++ = (char)((c >> 12) | 0xE0);
236 *p++ = (char)(((c >> 6) | 0x80) & 0xBF);
237 *p++ = (char)((c | 0x80) & 0xBF);
241 while (sequenceLen > 0) {
248 bufferSize = p - buffer;
254 *(posOut+1) = length+1;
257 void RegExp::prepareASCII (
const UString& s)
265 buffer =
new char[truncated.size() + 1];
266 memcpy(buffer, truncated.c_str(), truncated.size());
267 buffer[truncated.size()] =
'\0';
268 bufferSize = truncated.size();
271 void RegExp::prepareMatch(
const UString &s)
273 delete[] originalPos;
275 #ifdef PCRE_CONFIG_UTF8
276 if (utf8Support == Supported)
287 void RegExp::doneMatch()
289 delete[] originalPos; originalPos = 0;
290 delete[] buffer; buffer = 0;
293 UString RegExp::match(
const UString &s,
int i,
int *pos,
int **ovector)
296 assert(s.
data() == originalS.data());
309 return UString::null;
311 #ifdef HAVE_PCREPOSIX
312 int ovecsize = (nrSubPatterns+1)*3;
313 if (ovector) *ovector =
new int[ovecsize];
315 return UString::null;
320 #ifdef PCRE_CONFIG_UTF8
321 if (utf8Support == Supported) {
323 while (originalPos[startPos] < i)
328 while (originalPos[nextPos] < (i + 1))
335 nextPos = i + (i < s.
size() ? 1 : 0);
339 #ifdef PCRE_CONFIG_UTF8
340 utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
343 int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
344 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags,
345 ovector ? *ovector : 0L, ovecsize);
349 if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos)
355 fprintf(stderr,
"No match after m_notEmpty. +1 and keep going.\n");
358 numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
359 ovector ? *ovector : 0L, ovecsize);
361 return UString::null;
364 return UString::null;
369 if (ovector && originalPos) {
370 for (
unsigned c = 0; c < 2 * TQMIN((
unsigned)numMatches, nrSubPatterns+1); ++c) {
371 if ((*ovector)[c] != -1)
372 (*ovector)[c] = originalPos[(*ovector)[c]];
377 return UString::null;
379 const uint maxMatch = 10;
380 regmatch_t rmatch[maxMatch];
382 char *str = strdup(s.
ascii());
383 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
385 return UString::null;
390 *pos = rmatch[0].rm_so + i;
391 return s.
substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
396 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
400 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
401 return UString::null;
404 if (nrSubPatterns == 0) nrSubPatterns = 1;
406 int ovecsize = (nrSubPatterns)*3;
407 *ovector =
new int[ovecsize];
408 for (uint j = 0; j < nrSubPatterns; j++) {
409 (*ovector)[2*j] = rmatch[j].rm_so + i;
410 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
414 *pos = (*ovector)[0];
415 if ( *pos == (*ovector)[1] && (flgs & Global) )
420 return s.
substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
424 bool RegExp::test(
const UString &s,
int)
426 #ifdef HAVE_PCREPOSIX
431 pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
432 0, ovector, 300) == PCRE_ERROR_NOMATCH)
439 char *str = strdup(s.
ascii());
440 int r = regexec(&preg, str, 0, 0, 0);
8-bit char-based string class
int find(const UString &f, int pos=0) const
char * ascii() const
Convert the Unicode string to plain ASCII chars, chopping off any higher bytes.
const UChar * data() const
UString substr(int pos=0, int len=-1) const
UString & append(const UString &)
Append another string.
unsigned short unicode() const