d0ddf150d9
After implementing unit testing for nrex, I caught and fixed some errors, so it should now behave more like Python's RegEx. In addition, I've added version numbering so that it is possible to tell whether the library needs updating. Here is a list of changes: - Fixed zero-count quantifiers failing. - Fixed infinite recursion when quantifying a zero-length token. - Fixed `$` (as a pattern on its own) not matching. - Fixed look-behind rewinding beyond the start of the string. - Added support for the alternative back reference format `\g{1}`, similar to Python. This allows digits to be used immediately after a back reference. - The number of capture groups is still limited to 9 by default but can now be set manually, with an option for no limit at all. (Python has no limit.) - Curly bracket quantifiers such as `{0}` are no longer interpreted as a literal string if the previous token is not quantifiable. (Python behaviour.)
1435 lines
37 KiB
C++
1435 lines
37 KiB
C++
// NREX: Node RegEx
|
|
// Version 0.1
|
|
//
|
|
// Copyright (c) 2015, Zher Huei Lee
|
|
// All rights reserved.
|
|
//
|
|
// This software is provided 'as-is', without any express or implied
|
|
// warranty. In no event will the authors be held liable for any damages
|
|
// arising from the use of this software.
|
|
//
|
|
// Permission is granted to anyone to use this software for any purpose,
|
|
// including commercial applications, and to alter it and redistribute it
|
|
// freely, subject to the following restrictions:
|
|
//
|
|
// 1. The origin of this software must not be misrepresented; you must not
|
|
// claim that you wrote the original software. If you use this software
|
|
// in a product, an acknowledgment in the product documentation would
|
|
// be appreciated but is not required.
|
|
//
|
|
// 2. Altered source versions must be plainly marked as such, and must not
|
|
// be misrepresented as being the original software.
|
|
//
|
|
// 3. This notice may not be removed or altered from any source
|
|
// distribution.
|
|
//
|
|
|
|
#include "nrex.hpp"
|
|
|
|
// Character classification and string length helpers.
// With NREX_UNICODE defined the wide-character routines from <wctype.h> and
// <wchar.h> are used; otherwise the narrow ones from <ctype.h> and <string.h>.
#ifdef NREX_UNICODE
#include <wctype.h>
#include <wchar.h>
#define NREX_ISALPHANUM(C) iswalnum(C)
#define NREX_ISSPACE(C) iswspace(C)
#define NREX_STRLEN wcslen
#else
#include <ctype.h>
#include <string.h>
// The <ctype.h> functions have undefined behaviour for negative arguments
// other than EOF, so a plain (possibly signed) char is converted to
// unsigned char before it is classified.
#define NREX_ISALPHANUM(C) isalnum((unsigned char)(C))
#define NREX_ISSPACE(C) isspace((unsigned char)(C))
#define NREX_STRLEN strlen
#endif
|
|
|
|
// NREX_COMPILE_ERROR(M) aborts pattern compilation from inside
// nrex::compile(). The partially built tree is always discarded through
// reset() first, so the object is left invalid rather than half-compiled.
// With NREX_THROW_ERROR defined an nrex_compile_error carrying M is thrown;
// otherwise the enclosing function returns false.
// The do/while(0) wrapper makes the macro behave as a single statement, so it
// stays correct even inside an unbraced if/else.
#ifdef NREX_THROW_ERROR
#define NREX_COMPILE_ERROR(M) do { reset(); throw nrex_compile_error(M); } while (0)
#else
#define NREX_COMPILE_ERROR(M) do { reset(); return false; } while (0)
#endif

// Allocation hooks. Define NREX_NEW (together with the three companion
// macros) before this point to route all allocations through a custom
// allocator; otherwise the plain new/delete operators are used.
#ifndef NREX_NEW
#define NREX_NEW(X) new X
#define NREX_NEW_ARRAY(X, N) new X[N]
#define NREX_DELETE(X) delete X
#define NREX_DELETE_ARRAY(X) delete[] X
#endif
|
|
|
|
template<typename T>
|
|
class nrex_array
|
|
{
|
|
private:
|
|
T* _data;
|
|
unsigned int _reserved;
|
|
unsigned int _size;
|
|
public:
|
|
nrex_array()
|
|
: _data(NREX_NEW_ARRAY(T, 2))
|
|
, _reserved(2)
|
|
, _size(0)
|
|
{
|
|
}
|
|
|
|
~nrex_array()
|
|
{
|
|
NREX_DELETE_ARRAY(_data);
|
|
}
|
|
|
|
unsigned int size() const
|
|
{
|
|
return _size;
|
|
}
|
|
|
|
void reserve(unsigned int size)
|
|
{
|
|
T* old = _data;
|
|
_data = NREX_NEW_ARRAY(T, size);
|
|
_reserved = size;
|
|
for (unsigned int i = 0; i < _size; ++i)
|
|
{
|
|
_data[i] = old[i];
|
|
}
|
|
NREX_DELETE_ARRAY(old);
|
|
}
|
|
|
|
void push(T item)
|
|
{
|
|
if (_size == _reserved)
|
|
{
|
|
reserve(_reserved * 2);
|
|
}
|
|
_data[_size] = item;
|
|
_size++;
|
|
}
|
|
|
|
T& top()
|
|
{
|
|
return _data[_size - 1];
|
|
}
|
|
|
|
const T& operator[] (unsigned int i) const
|
|
{
|
|
return _data[i];
|
|
}
|
|
|
|
void pop()
|
|
{
|
|
if (_size > 0)
|
|
{
|
|
--_size;
|
|
}
|
|
}
|
|
};
|
|
|
|
// Converts one hexadecimal digit (either case) to its numeric value.
// Returns -1 when `c` is not a hexadecimal digit.
static int nrex_parse_hex(nrex_char c)
{
    if (c >= '0' && c <= '9')
    {
        return int(c - '0');
    }
    if (c >= 'a' && c <= 'f')
    {
        return int(c - 'a') + 10;
    }
    if (c >= 'A' && c <= 'F')
    {
        return int(c - 'A') + 10;
    }
    return -1;
}
|
|
|
|
// Decodes the escape sequence that starts at the backslash `c` points to.
//
// On success the decoded character is returned and `c` is advanced to the
// last character of the sequence (the caller's own increment then steps past
// it). On failure '\0' is returned and `c` is left untouched, which is how
// callers detect an invalid escape.
//
// Failures are: a backslash at the very end of the pattern, and \x / \u
// sequences that are not followed by 2 / 4 hexadecimal digits.
static nrex_char nrex_unescape(const nrex_char*& c)
{
    switch (c[1])
    {
        case '\0':
            // Dangling backslash. Advancing here would park the cursor on
            // the terminator and let the caller's increment run past the end
            // of the pattern, so report a failure instead.
            return '\0';
        case '0': ++c; return '\0';
        case 'a': ++c; return '\a';
        // ESC, spelled numerically because '\e' is a compiler extension.
        case 'e': ++c; return nrex_char(0x1B);
        case 'f': ++c; return '\f';
        case 'n': ++c; return '\n';
        case 'r': ++c; return '\r';
        case 't': ++c; return '\t';
        case 'v': ++c; return '\v';
        case 'b': ++c; return '\b';
        case 'x':
        {
            // \xHH: exactly two hexadecimal digits.
            int point = 0;
            for (int i = 2; i <= 3; ++i)
            {
                int res = nrex_parse_hex(c[i]);
                if (res == -1)
                {
                    return '\0';
                }
                point = (point << 4) + res;
            }
            c = &c[3];
            return nrex_char(point);
        }
        case 'u':
        {
            // \uHHHH: exactly four hexadecimal digits.
            int point = 0;
            for (int i = 2; i <= 5; ++i)
            {
                int res = nrex_parse_hex(c[i]);
                if (res == -1)
                {
                    return '\0';
                }
                point = (point << 4) + res;
            }
            c = &c[5];
            return nrex_char(point);
        }
    }
    // Any other escaped character stands for itself.
    return (++c)[0];
}
|
|
|
|
struct nrex_search
|
|
{
|
|
const nrex_char* str;
|
|
nrex_result* captures;
|
|
int end;
|
|
bool complete;
|
|
|
|
nrex_char at(int pos)
|
|
{
|
|
return str[pos];
|
|
}
|
|
|
|
nrex_search(const nrex_char* str, nrex_result* captures)
|
|
: str(str)
|
|
, captures(captures)
|
|
, end(0)
|
|
{
|
|
}
|
|
};
|
|
|
|
struct nrex_node
|
|
{
|
|
nrex_node* next;
|
|
nrex_node* previous;
|
|
nrex_node* parent;
|
|
bool quantifiable;
|
|
int length;
|
|
|
|
nrex_node(bool quantify = false)
|
|
: next(NULL)
|
|
, previous(NULL)
|
|
, parent(NULL)
|
|
, quantifiable(quantify)
|
|
, length(-1)
|
|
{
|
|
}
|
|
|
|
virtual ~nrex_node()
|
|
{
|
|
if (next)
|
|
{
|
|
NREX_DELETE(next);
|
|
}
|
|
}
|
|
|
|
virtual int test(nrex_search* s, int pos) const
|
|
{
|
|
return next ? next->test(s, pos) : -1;
|
|
}
|
|
|
|
virtual int test_parent(nrex_search* s, int pos) const
|
|
{
|
|
if (next)
|
|
{
|
|
pos = next->test(s, pos);
|
|
}
|
|
if (parent && pos >= 0)
|
|
{
|
|
pos = parent->test_parent(s, pos);
|
|
}
|
|
if (pos >= 0)
|
|
{
|
|
s->complete = true;
|
|
}
|
|
return pos;
|
|
}
|
|
|
|
void increment_length(int amount, bool subtract = false)
|
|
{
|
|
if (amount >= 0 && length >= 0)
|
|
{
|
|
if (!subtract)
|
|
{
|
|
length += amount;
|
|
}
|
|
else
|
|
{
|
|
length -= amount;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
length = -1;
|
|
}
|
|
if (parent)
|
|
{
|
|
parent->increment_length(amount, subtract);
|
|
}
|
|
}
|
|
};
|
|
|
|
struct nrex_node_group : public nrex_node
|
|
{
|
|
static const int NonCapture = -1;
|
|
static const int Bracket = -2;
|
|
static const int LookAhead = -3;
|
|
static const int LookBehind = -4;
|
|
|
|
int mode;
|
|
bool negate;
|
|
nrex_array<nrex_node*> childset;
|
|
nrex_node* back;
|
|
|
|
nrex_node_group(int mode)
|
|
: nrex_node(true)
|
|
, mode(mode)
|
|
, negate(false)
|
|
, back(NULL)
|
|
{
|
|
if (mode != Bracket)
|
|
{
|
|
length = 0;
|
|
}
|
|
else
|
|
{
|
|
length = 1;
|
|
}
|
|
if (mode == LookAhead || mode == LookBehind)
|
|
{
|
|
quantifiable = false;
|
|
}
|
|
}
|
|
|
|
virtual ~nrex_node_group()
|
|
{
|
|
for (unsigned int i = 0; i < childset.size(); ++i)
|
|
{
|
|
NREX_DELETE(childset[i]);
|
|
}
|
|
|
|
}
|
|
|
|
int test(nrex_search* s, int pos) const
|
|
{
|
|
if (mode >= 0)
|
|
{
|
|
s->captures[mode].start = pos;
|
|
}
|
|
for (unsigned int i = 0; i < childset.size(); ++i)
|
|
{
|
|
s->complete = false;
|
|
int offset = 0;
|
|
if (mode == LookBehind)
|
|
{
|
|
if (pos < length)
|
|
{
|
|
return -1;
|
|
}
|
|
offset = length;
|
|
}
|
|
int res = childset[i]->test(s, pos - offset);
|
|
if (s->complete)
|
|
{
|
|
return res;
|
|
}
|
|
if (negate)
|
|
{
|
|
if (res < 0)
|
|
{
|
|
res = pos + 1;
|
|
}
|
|
else
|
|
{
|
|
return -1;
|
|
}
|
|
if (i + 1 < childset.size())
|
|
{
|
|
continue;
|
|
}
|
|
}
|
|
if (res >= 0)
|
|
{
|
|
if (mode >= 0)
|
|
{
|
|
s->captures[mode].length = res - pos;
|
|
}
|
|
else if (mode == LookAhead || mode == LookBehind)
|
|
{
|
|
res = pos;
|
|
}
|
|
return next ? next->test(s, res) : res;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
virtual int test_parent(nrex_search* s, int pos) const
|
|
{
|
|
if (mode >= 0)
|
|
{
|
|
s->captures[mode].length = pos - s->captures[mode].start;
|
|
}
|
|
return nrex_node::test_parent(s, pos);
|
|
}
|
|
|
|
void add_childset()
|
|
{
|
|
if (childset.size() > 0 && mode != Bracket)
|
|
{
|
|
length = -1;
|
|
}
|
|
back = NULL;
|
|
}
|
|
|
|
void add_child(nrex_node* node)
|
|
{
|
|
node->parent = this;
|
|
node->previous = back;
|
|
if (back && mode != Bracket)
|
|
{
|
|
back->next = node;
|
|
}
|
|
else
|
|
{
|
|
childset.push(node);
|
|
}
|
|
if (mode != Bracket)
|
|
{
|
|
increment_length(node->length);
|
|
}
|
|
back = node;
|
|
}
|
|
|
|
nrex_node* swap_back(nrex_node* node)
|
|
{
|
|
if (!back)
|
|
{
|
|
add_child(node);
|
|
return NULL;
|
|
}
|
|
nrex_node* old = back;
|
|
if (!old->previous)
|
|
{
|
|
childset.pop();
|
|
}
|
|
if (mode != Bracket)
|
|
{
|
|
increment_length(old->length, true);
|
|
}
|
|
back = old->previous;
|
|
add_child(node);
|
|
return old;
|
|
}
|
|
|
|
void pop_back()
|
|
{
|
|
if (back)
|
|
{
|
|
nrex_node* old = back;
|
|
if (!old->previous)
|
|
{
|
|
childset.pop();
|
|
}
|
|
if (mode != Bracket)
|
|
{
|
|
increment_length(old->length, true);
|
|
}
|
|
back = old->previous;
|
|
NREX_DELETE(old);
|
|
}
|
|
}
|
|
};
|
|
|
|
struct nrex_node_char : public nrex_node
|
|
{
|
|
nrex_char ch;
|
|
|
|
nrex_node_char(nrex_char c)
|
|
: nrex_node(true)
|
|
, ch(c)
|
|
{
|
|
length = 1;
|
|
}
|
|
|
|
int test(nrex_search* s, int pos) const
|
|
{
|
|
if (s->end <= pos || 0 > pos || s->at(pos) != ch)
|
|
{
|
|
return -1;
|
|
}
|
|
return next ? next->test(s, pos + 1) : pos + 1;
|
|
}
|
|
};
|
|
|
|
struct nrex_node_range : public nrex_node
|
|
{
|
|
nrex_char start;
|
|
nrex_char end;
|
|
|
|
nrex_node_range(nrex_char s, nrex_char e)
|
|
: nrex_node(true)
|
|
, start(s)
|
|
, end(e)
|
|
{
|
|
length = 1;
|
|
}
|
|
|
|
int test(nrex_search* s, int pos) const
|
|
{
|
|
if (s->end <= pos || 0 > pos)
|
|
{
|
|
return -1;
|
|
}
|
|
nrex_char c = s->at(pos);
|
|
if (c < start || end < c)
|
|
{
|
|
return -1;
|
|
}
|
|
return next ? next->test(s, pos + 1) : pos + 1;
|
|
}
|
|
};
|
|
|
|
// POSIX-style character classes usable inside a bracket expression as
// [:name:]. nrex_class_none marks an unrecognised class name.
enum nrex_class_type
{
    nrex_class_none   = 0,
    nrex_class_alnum  = 1,
    nrex_class_alpha  = 2,
    nrex_class_blank  = 3,
    nrex_class_cntrl  = 4,
    nrex_class_digit  = 5,
    nrex_class_graph  = 6,
    nrex_class_lower  = 7,
    nrex_class_print  = 8,
    nrex_class_punct  = 9,
    nrex_class_space  = 10,
    nrex_class_upper  = 11,
    nrex_class_xdigit = 12,
    nrex_class_word   = 13
};
|
|
|
|
static bool nrex_compare_class(const nrex_char** pos, const char* text)
|
|
{
|
|
unsigned int i = 0;
|
|
for (i = 0; text[i] != '\0'; ++i)
|
|
{
|
|
if ((*pos)[i] != text[i])
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
if ((*pos)[i++] != ':' || (*pos)[i] != ']')
|
|
{
|
|
return false;
|
|
}
|
|
*pos = &(*pos)[i];
|
|
return true;
|
|
}
|
|
|
|
#define NREX_COMPARE_CLASS(POS, NAME) if (nrex_compare_class(POS, #NAME)) return nrex_class_ ## NAME
|
|
|
|
static nrex_class_type nrex_parse_class(const nrex_char** pos)
|
|
{
|
|
NREX_COMPARE_CLASS(pos, alnum);
|
|
NREX_COMPARE_CLASS(pos, alpha);
|
|
NREX_COMPARE_CLASS(pos, blank);
|
|
NREX_COMPARE_CLASS(pos, cntrl);
|
|
NREX_COMPARE_CLASS(pos, digit);
|
|
NREX_COMPARE_CLASS(pos, graph);
|
|
NREX_COMPARE_CLASS(pos, lower);
|
|
NREX_COMPARE_CLASS(pos, print);
|
|
NREX_COMPARE_CLASS(pos, punct);
|
|
NREX_COMPARE_CLASS(pos, space);
|
|
NREX_COMPARE_CLASS(pos, upper);
|
|
NREX_COMPARE_CLASS(pos, xdigit);
|
|
NREX_COMPARE_CLASS(pos, word);
|
|
return nrex_class_none;
|
|
}
|
|
|
|
struct nrex_node_class : public nrex_node
|
|
{
|
|
nrex_class_type type;
|
|
|
|
nrex_node_class(nrex_class_type t)
|
|
: nrex_node(true)
|
|
, type(t)
|
|
{
|
|
length = 1;
|
|
}
|
|
|
|
int test(nrex_search* s, int pos) const
|
|
{
|
|
if (s->end <= pos || 0 > pos)
|
|
{
|
|
return -1;
|
|
}
|
|
if (!test_class(s->at(pos)))
|
|
{
|
|
return -1;
|
|
}
|
|
return next ? next->test(s, pos + 1) : pos + 1;
|
|
}
|
|
|
|
bool test_class(nrex_char c) const
|
|
{
|
|
if ((0 <= c && c <= 0x1F) || c == 0x7F)
|
|
{
|
|
if (type == nrex_class_cntrl)
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
else if (c < 0x7F)
|
|
{
|
|
if (type == nrex_class_print)
|
|
{
|
|
return true;
|
|
}
|
|
else if (type == nrex_class_graph && c != ' ')
|
|
{
|
|
return true;
|
|
}
|
|
else if ('0' <= c && c <= '9')
|
|
{
|
|
switch (type)
|
|
{
|
|
case nrex_class_alnum:
|
|
case nrex_class_digit:
|
|
case nrex_class_xdigit:
|
|
case nrex_class_word:
|
|
return true;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else if ('A' <= c && c <= 'Z')
|
|
{
|
|
switch (type)
|
|
{
|
|
case nrex_class_alnum:
|
|
case nrex_class_alpha:
|
|
case nrex_class_upper:
|
|
case nrex_class_word:
|
|
return true;
|
|
case nrex_class_xdigit:
|
|
if (c <= 'F')
|
|
{
|
|
return true;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else if ('a' <= c && c <= 'z')
|
|
{
|
|
switch (type)
|
|
{
|
|
case nrex_class_alnum:
|
|
case nrex_class_alpha:
|
|
case nrex_class_lower:
|
|
case nrex_class_word:
|
|
return true;
|
|
case nrex_class_xdigit:
|
|
if (c <= 'f')
|
|
{
|
|
return true;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
switch (c)
|
|
{
|
|
case ' ':
|
|
case '\t':
|
|
if (type == nrex_class_blank)
|
|
{
|
|
return true;
|
|
}
|
|
case '\r':
|
|
case '\n':
|
|
case '\f':
|
|
if (type == nrex_class_space)
|
|
{
|
|
return true;
|
|
}
|
|
break;
|
|
case '_':
|
|
if (type == nrex_class_word)
|
|
{
|
|
return true;
|
|
}
|
|
case ']':
|
|
case '[':
|
|
case '!':
|
|
case '"':
|
|
case '#':
|
|
case '$':
|
|
case '%':
|
|
case '&':
|
|
case '\'':
|
|
case '(':
|
|
case ')':
|
|
case '*':
|
|
case '+':
|
|
case ',':
|
|
case '.':
|
|
case '/':
|
|
case ':':
|
|
case ';':
|
|
case '<':
|
|
case '=':
|
|
case '>':
|
|
case '?':
|
|
case '@':
|
|
case '\\':
|
|
case '^':
|
|
case '`':
|
|
case '{':
|
|
case '|':
|
|
case '}':
|
|
case '~':
|
|
case '-':
|
|
if (type == nrex_class_punct)
|
|
{
|
|
return true;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
return false;
|
|
}
|
|
};
|
|
|
|
// Tells whether `repr` is the letter of a shorthand class escape
// (\w \W \d \D \s \S).
static bool nrex_is_shorthand(nrex_char repr)
{
    return repr == 'W' || repr == 'w'
        || repr == 'D' || repr == 'd'
        || repr == 'S' || repr == 's';
}
|
|
|
|
struct nrex_node_shorthand : public nrex_node
|
|
{
|
|
nrex_char repr;
|
|
|
|
nrex_node_shorthand(nrex_char c)
|
|
: nrex_node(true)
|
|
, repr(c)
|
|
{
|
|
length = 1;
|
|
}
|
|
|
|
int test(nrex_search* s, int pos) const
|
|
{
|
|
if (s->end <= pos || 0 > pos)
|
|
{
|
|
return -1;
|
|
}
|
|
bool found = false;
|
|
bool invert = false;
|
|
nrex_char c = s->at(pos);
|
|
switch (repr)
|
|
{
|
|
case '.':
|
|
found = true;
|
|
break;
|
|
case 'W':
|
|
invert = true;
|
|
case 'w':
|
|
if (c == '_' || NREX_ISALPHANUM(c))
|
|
{
|
|
found = true;
|
|
}
|
|
break;
|
|
case 'D':
|
|
invert = true;
|
|
case 'd':
|
|
if ('0' <= c && c <= '9')
|
|
{
|
|
found = true;
|
|
}
|
|
break;
|
|
case 'S':
|
|
invert = true;
|
|
case 's':
|
|
if (NREX_ISSPACE(c))
|
|
{
|
|
found = true;
|
|
}
|
|
break;
|
|
}
|
|
if (found == invert)
|
|
{
|
|
return -1;
|
|
}
|
|
return next ? next->test(s, pos + 1) : pos + 1;
|
|
}
|
|
};
|
|
|
|
// Tells whether `repr` opens a quantifier: '?', '*', '+' or '{'.
static bool nrex_is_quantifier(nrex_char repr)
{
    return repr == '?' || repr == '*' || repr == '+' || repr == '{';
}
|
|
|
|
struct nrex_node_quantifier : public nrex_node
|
|
{
|
|
int min;
|
|
int max;
|
|
bool greedy;
|
|
nrex_node* child;
|
|
|
|
nrex_node_quantifier(int min, int max)
|
|
: nrex_node()
|
|
, min(min)
|
|
, max(max)
|
|
, greedy(true)
|
|
, child(NULL)
|
|
{
|
|
}
|
|
|
|
virtual ~nrex_node_quantifier()
|
|
{
|
|
if (child)
|
|
{
|
|
NREX_DELETE(child);
|
|
}
|
|
}
|
|
|
|
int test(nrex_search* s, int pos) const
|
|
{
|
|
return test_step(s, pos, 0, pos);
|
|
}
|
|
|
|
int test_step(nrex_search* s, int pos, int level, int start) const
|
|
{
|
|
if (pos > s->end)
|
|
{
|
|
return -1;
|
|
}
|
|
if (!greedy && level > min)
|
|
{
|
|
int res = pos;
|
|
if (next)
|
|
{
|
|
res = next->test(s, res);
|
|
}
|
|
if (s->complete)
|
|
{
|
|
return res;
|
|
}
|
|
if (res >= 0 && parent->test_parent(s, res) >= 0)
|
|
{
|
|
return res;
|
|
}
|
|
}
|
|
if (max >= 0 && level > max)
|
|
{
|
|
return -1;
|
|
}
|
|
if (level > 1 && level > min + 1 && pos == start)
|
|
{
|
|
return -1;
|
|
}
|
|
int res = pos;
|
|
if (level >= 1)
|
|
{
|
|
res = child->test(s, pos);
|
|
if (s->complete)
|
|
{
|
|
return res;
|
|
}
|
|
}
|
|
if (res >= 0)
|
|
{
|
|
int res_step = test_step(s, res, level + 1, start);
|
|
if (res_step >= 0)
|
|
{
|
|
return res_step;
|
|
}
|
|
else if (greedy && level >= min)
|
|
{
|
|
if (next)
|
|
{
|
|
res = next->test(s, res);
|
|
}
|
|
if (s->complete)
|
|
{
|
|
return res;
|
|
}
|
|
if (res >= 0 && parent->test_parent(s, res) >= 0)
|
|
{
|
|
return res;
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
};
|
|
|
|
struct nrex_node_anchor : public nrex_node
|
|
{
|
|
bool end;
|
|
|
|
nrex_node_anchor(bool end)
|
|
: nrex_node()
|
|
, end(end)
|
|
{
|
|
length = 0;
|
|
}
|
|
|
|
int test(nrex_search* s, int pos) const
|
|
{
|
|
if (!end && pos != 0)
|
|
{
|
|
return -1;
|
|
}
|
|
else if (end && pos != s->end)
|
|
{
|
|
return -1;
|
|
}
|
|
return next ? next->test(s, pos) : pos;
|
|
}
|
|
};
|
|
|
|
struct nrex_node_word_boundary : public nrex_node
|
|
{
|
|
bool inverse;
|
|
|
|
nrex_node_word_boundary(bool inverse)
|
|
: nrex_node()
|
|
, inverse(inverse)
|
|
{
|
|
length = 0;
|
|
}
|
|
|
|
int test(nrex_search* s, int pos) const
|
|
{
|
|
bool left = false;
|
|
bool right = false;
|
|
if (pos != 0)
|
|
{
|
|
nrex_char c = s->at(pos - 1);
|
|
if (c == '_' || NREX_ISALPHANUM(c))
|
|
{
|
|
left = true;
|
|
}
|
|
}
|
|
if (pos != s->end)
|
|
{
|
|
nrex_char c = s->at(pos);
|
|
if (c == '_' || NREX_ISALPHANUM(c))
|
|
{
|
|
right = true;
|
|
}
|
|
}
|
|
if ((left != right) == inverse)
|
|
{
|
|
return -1;
|
|
}
|
|
return next ? next->test(s, pos) : pos;
|
|
}
|
|
};
|
|
|
|
struct nrex_node_backreference : public nrex_node
|
|
{
|
|
int ref;
|
|
|
|
nrex_node_backreference(int ref)
|
|
: nrex_node(true)
|
|
, ref(ref)
|
|
{
|
|
length = -1;
|
|
}
|
|
|
|
int test(nrex_search* s, int pos) const
|
|
{
|
|
nrex_result& r = s->captures[ref];
|
|
for (int i = 0; i < r.length; ++i)
|
|
{
|
|
if (pos + i >= s->end)
|
|
{
|
|
return -1;
|
|
}
|
|
if (s->at(r.start + i) != s->at(pos + i))
|
|
{
|
|
return -1;
|
|
}
|
|
}
|
|
return next ? next->test(s, pos + r.length) : pos + r.length;
|
|
}
|
|
};
|
|
|
|
// Tells whether any group on the compile stack is a look-behind assertion,
// i.e. whether the parser is currently somewhere inside (?<=...) or (?<!...).
bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack)
{
    const unsigned int depth = stack.size();
    unsigned int index = 0;
    while (index < depth)
    {
        if (stack[index]->mode == nrex_node_group::LookBehind)
        {
            return true;
        }
        ++index;
    }
    return false;
}
|
|
|
|
// Creates an empty, invalid expression; compile() has to be called before
// it can be used for matching.
nrex::nrex() : _capturing(0), _root(NULL)
{
}
|
|
|
|
// Creates an expression and immediately compiles `pattern`, passing
// `captures` on to compile(). Use valid() to find out whether that worked.
nrex::nrex(const nrex_char* pattern, int captures) : _capturing(0), _root(NULL)
{
    compile(pattern, captures);
}
|
|
|
|
// Releases the compiled node tree, if there is one.
nrex::~nrex()
{
    if (_root != NULL)
    {
        NREX_DELETE(_root);
    }
}
|
|
|
|
bool nrex::valid() const
|
|
{
|
|
return (_root != NULL);
|
|
}
|
|
|
|
void nrex::reset()
|
|
{
|
|
_capturing = 0;
|
|
if (_root)
|
|
{
|
|
NREX_DELETE(_root);
|
|
}
|
|
_root = NULL;
|
|
}
|
|
|
|
int nrex::capture_size() const
|
|
{
|
|
if (_root)
|
|
{
|
|
return _capturing + 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
bool nrex::compile(const nrex_char* pattern, int captures)
|
|
{
|
|
reset();
|
|
nrex_node_group* root = NREX_NEW(nrex_node_group(_capturing));
|
|
nrex_array<nrex_node_group*> stack;
|
|
stack.push(root);
|
|
_root = root;
|
|
|
|
for (const nrex_char* c = pattern; c[0] != '\0'; ++c)
|
|
{
|
|
if (c[0] == '(')
|
|
{
|
|
if (c[1] == '?')
|
|
{
|
|
if (c[2] == ':')
|
|
{
|
|
c = &c[2];
|
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture));
|
|
stack.top()->add_child(group);
|
|
stack.push(group);
|
|
}
|
|
else if (c[2] == '!' || c[2] == '=')
|
|
{
|
|
c = &c[2];
|
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookAhead));
|
|
group->negate = (c[0] == '!');
|
|
stack.top()->add_child(group);
|
|
stack.push(group);
|
|
}
|
|
else if (c[2] == '<' && (c[3] == '!' || c[3] == '='))
|
|
{
|
|
c = &c[3];
|
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookBehind));
|
|
group->negate = (c[0] == '!');
|
|
stack.top()->add_child(group);
|
|
stack.push(group);
|
|
}
|
|
else
|
|
{
|
|
NREX_COMPILE_ERROR("unrecognised qualifier for group");
|
|
}
|
|
}
|
|
else if (captures >= 0 && _capturing < captures)
|
|
{
|
|
nrex_node_group* group = NREX_NEW(nrex_node_group(++_capturing));
|
|
stack.top()->add_child(group);
|
|
stack.push(group);
|
|
}
|
|
else
|
|
{
|
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture));
|
|
stack.top()->add_child(group);
|
|
stack.push(group);
|
|
}
|
|
}
|
|
else if (c[0] == ')')
|
|
{
|
|
if (stack.size() > 1)
|
|
{
|
|
stack.pop();
|
|
}
|
|
else
|
|
{
|
|
NREX_COMPILE_ERROR("unexpected ')'");
|
|
}
|
|
}
|
|
else if (c[0] == '[')
|
|
{
|
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::Bracket));
|
|
stack.top()->add_child(group);
|
|
if (c[1] == '^')
|
|
{
|
|
group->negate = true;
|
|
++c;
|
|
}
|
|
bool first_child = true;
|
|
nrex_char previous_child;
|
|
bool previous_child_single = false;
|
|
while (true)
|
|
{
|
|
group->add_childset();
|
|
++c;
|
|
if (c[0] == '\0')
|
|
{
|
|
NREX_COMPILE_ERROR("unclosed bracket expression '['");
|
|
}
|
|
if (c[0] == '[' && c[1] == ':')
|
|
{
|
|
const nrex_char* d = &c[2];
|
|
nrex_class_type cls = nrex_parse_class(&d);
|
|
if (cls != nrex_class_none)
|
|
{
|
|
c = d;
|
|
group->add_child(NREX_NEW(nrex_node_class(cls)));
|
|
previous_child_single = false;
|
|
}
|
|
else
|
|
{
|
|
group->add_child(NREX_NEW(nrex_node_char('[')));
|
|
previous_child = '[';
|
|
previous_child_single = true;
|
|
}
|
|
}
|
|
else if (c[0] == ']' && !first_child)
|
|
{
|
|
break;
|
|
}
|
|
else if (c[0] == '\\')
|
|
{
|
|
if (nrex_is_shorthand(c[1]))
|
|
{
|
|
group->add_child(NREX_NEW(nrex_node_shorthand(c[1])));
|
|
++c;
|
|
previous_child_single = false;
|
|
}
|
|
else
|
|
{
|
|
const nrex_char* d = c;
|
|
nrex_char unescaped = nrex_unescape(d);
|
|
if (c == d)
|
|
{
|
|
NREX_COMPILE_ERROR("invalid escape token");
|
|
}
|
|
group->add_child(NREX_NEW(nrex_node_char(unescaped)));
|
|
c = d;
|
|
previous_child = unescaped;
|
|
previous_child_single = true;
|
|
}
|
|
}
|
|
else if (previous_child_single && c[0] == '-')
|
|
{
|
|
bool is_range = false;
|
|
nrex_char next;
|
|
if (c[1] != '\0' && c[1] != ']')
|
|
{
|
|
if (c[1] == '\\')
|
|
{
|
|
const nrex_char* d = ++c;
|
|
next = nrex_unescape(d);
|
|
if (c == d)
|
|
{
|
|
NREX_COMPILE_ERROR("invalid escape token in range");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
next = c[1];
|
|
++c;
|
|
}
|
|
is_range = true;
|
|
}
|
|
if (is_range)
|
|
{
|
|
if (next < previous_child)
|
|
{
|
|
NREX_COMPILE_ERROR("text range out of order");
|
|
}
|
|
group->pop_back();
|
|
group->add_child(NREX_NEW(nrex_node_range(previous_child, next)));
|
|
previous_child_single = false;
|
|
}
|
|
else
|
|
{
|
|
group->add_child(NREX_NEW(nrex_node_char(c[0])));
|
|
previous_child = c[0];
|
|
previous_child_single = true;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
group->add_child(NREX_NEW(nrex_node_char(c[0])));
|
|
previous_child = c[0];
|
|
previous_child_single = true;
|
|
}
|
|
first_child = false;
|
|
}
|
|
}
|
|
else if (nrex_is_quantifier(c[0]))
|
|
{
|
|
int min = 0;
|
|
int max = -1;
|
|
bool valid_quantifier = true;
|
|
if (c[0] == '?')
|
|
{
|
|
min = 0;
|
|
max = 1;
|
|
}
|
|
else if (c[0] == '+')
|
|
{
|
|
min = 1;
|
|
max = -1;
|
|
}
|
|
else if (c[0] == '*')
|
|
{
|
|
min = 0;
|
|
max = -1;
|
|
}
|
|
else if (c[0] == '{')
|
|
{
|
|
bool max_set = false;
|
|
const nrex_char* d = c;
|
|
while (true)
|
|
{
|
|
++d;
|
|
if (d[0] == '\0')
|
|
{
|
|
valid_quantifier = false;
|
|
break;
|
|
}
|
|
else if (d[0] == '}')
|
|
{
|
|
break;
|
|
}
|
|
else if (d[0] == ',')
|
|
{
|
|
max_set = true;
|
|
continue;
|
|
}
|
|
else if (d[0] < '0' || '9' < d[0])
|
|
{
|
|
valid_quantifier = false;
|
|
break;
|
|
}
|
|
if (max_set)
|
|
{
|
|
if (max < 0)
|
|
{
|
|
max = int(d[0] - '0');
|
|
}
|
|
else
|
|
{
|
|
max = max * 10 + int(d[0] - '0');
|
|
}
|
|
}
|
|
else
|
|
{
|
|
min = min * 10 + int(d[0] - '0');
|
|
}
|
|
}
|
|
if (!max_set)
|
|
{
|
|
max = min;
|
|
}
|
|
if (valid_quantifier)
|
|
{
|
|
c = d;
|
|
}
|
|
}
|
|
if (valid_quantifier)
|
|
{
|
|
if (stack.top()->back == NULL || !stack.top()->back->quantifiable)
|
|
{
|
|
NREX_COMPILE_ERROR("element not quantifiable");
|
|
}
|
|
nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier(min, max));
|
|
if (min == max)
|
|
{
|
|
if (stack.top()->back->length >= 0)
|
|
{
|
|
quant->length = max * stack.top()->back->length;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (nrex_has_lookbehind(stack))
|
|
{
|
|
NREX_COMPILE_ERROR("variable length quantifiers inside lookbehind not supported");
|
|
}
|
|
}
|
|
quant->child = stack.top()->swap_back(quant);
|
|
quant->child->previous = NULL;
|
|
quant->child->next = NULL;
|
|
quant->child->parent = quant;
|
|
if (c[1] == '?')
|
|
{
|
|
quant->greedy = false;
|
|
++c;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
stack.top()->add_child(NREX_NEW(nrex_node_char(c[0])));
|
|
}
|
|
}
|
|
else if (c[0] == '|')
|
|
{
|
|
if (nrex_has_lookbehind(stack))
|
|
{
|
|
NREX_COMPILE_ERROR("alternations inside lookbehind not supported");
|
|
}
|
|
stack.top()->add_childset();
|
|
}
|
|
else if (c[0] == '^' || c[0] == '$')
|
|
{
|
|
stack.top()->add_child(NREX_NEW(nrex_node_anchor((c[0] == '$'))));
|
|
}
|
|
else if (c[0] == '.')
|
|
{
|
|
stack.top()->add_child(NREX_NEW(nrex_node_shorthand('.')));
|
|
}
|
|
else if (c[0] == '\\')
|
|
{
|
|
if (nrex_is_shorthand(c[1]))
|
|
{
|
|
stack.top()->add_child(NREX_NEW(nrex_node_shorthand(c[1])));
|
|
++c;
|
|
}
|
|
else if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{'))
|
|
{
|
|
int ref = 0;
|
|
bool unclosed = false;
|
|
if (c[1] == 'g')
|
|
{
|
|
unclosed = true;
|
|
c = &c[2];
|
|
}
|
|
while ('0' <= c[1] && c[1] <= '9')
|
|
{
|
|
ref = ref * 10 + int(c[1] - '0');
|
|
++c;
|
|
}
|
|
if (c[1] == '}')
|
|
{
|
|
unclosed = false;
|
|
++c;
|
|
}
|
|
if (ref > _capturing || ref <= 0 || unclosed)
|
|
{
|
|
NREX_COMPILE_ERROR("backreference to non-existent capture");
|
|
}
|
|
if (nrex_has_lookbehind(stack))
|
|
{
|
|
NREX_COMPILE_ERROR("backreferences inside lookbehind not supported");
|
|
}
|
|
stack.top()->add_child(NREX_NEW(nrex_node_backreference(ref)));
|
|
}
|
|
else if (c[1] == 'b' || c[1] == 'B')
|
|
{
|
|
stack.top()->add_child(NREX_NEW(nrex_node_word_boundary(c[1] == 'B')));
|
|
++c;
|
|
}
|
|
else
|
|
{
|
|
const nrex_char* d = c;
|
|
nrex_char unescaped = nrex_unescape(d);
|
|
if (c == d)
|
|
{
|
|
NREX_COMPILE_ERROR("invalid escape token");
|
|
}
|
|
stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped)));
|
|
c = d;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
stack.top()->add_child(NREX_NEW(nrex_node_char(c[0])));
|
|
}
|
|
}
|
|
if (stack.size() > 1)
|
|
{
|
|
NREX_COMPILE_ERROR("unclosed group '('");
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int end) const
|
|
{
|
|
if (!_root)
|
|
{
|
|
return false;
|
|
}
|
|
nrex_search s(str, captures);
|
|
if (end >= offset)
|
|
{
|
|
s.end = end;
|
|
}
|
|
else
|
|
{
|
|
s.end = NREX_STRLEN(str);
|
|
}
|
|
for (int i = offset; i <= s.end; ++i)
|
|
{
|
|
for (int c = 0; c <= _capturing; ++c)
|
|
{
|
|
captures[c].start = 0;
|
|
captures[c].length = 0;
|
|
}
|
|
if (_root->test(&s, i) >= 0)
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|