From 439d43932133d32dcabd482f11842072d52b41e1 Mon Sep 17 00:00:00 2001 From: Zher Huei Lee Date: Sun, 23 Oct 2016 01:22:48 +0100 Subject: [PATCH 1/3] RegEx re-implemented as a module Re-wrote nrex as a module using godot-specific parts and new features: * Added string substitutions. * Named groups are now supported. * Removed use of mutable variables in RegEx. RegExMatch is returned instead. --- bin/tests/test_string.cpp | 15 +- doc/base/classes.xml | 126 +- drivers/SCsub | 1 - drivers/nrex/README.md | 75 - drivers/nrex/nrex.cpp | 1496 ----------------- drivers/nrex/nrex.hpp | 176 -- drivers/nrex/nrex_config.h | 12 - drivers/nrex/regex.cpp | 142 -- drivers/register_driver_types.cpp | 4 - {drivers/nrex => modules/regex}/SCsub | 2 +- modules/regex/config.py | 8 + modules/regex/regex.cpp | 1465 ++++++++++++++++ modules/regex/regex.h | 114 ++ .../regex/register_types.cpp | 42 +- modules/regex/register_types.h | 31 + 15 files changed, 1733 insertions(+), 1976 deletions(-) delete mode 100644 drivers/nrex/README.md delete mode 100644 drivers/nrex/nrex.cpp delete mode 100644 drivers/nrex/nrex.hpp delete mode 100644 drivers/nrex/nrex_config.h delete mode 100644 drivers/nrex/regex.cpp rename {drivers/nrex => modules/regex}/SCsub (50%) create mode 100644 modules/regex/config.py create mode 100644 modules/regex/regex.cpp create mode 100644 modules/regex/regex.h rename drivers/nrex/regex.h => modules/regex/register_types.cpp (72%) create mode 100644 modules/regex/register_types.h diff --git a/bin/tests/test_string.cpp b/bin/tests/test_string.cpp index 2e8f5c34944..4990c58896c 100644 --- a/bin/tests/test_string.cpp +++ b/bin/tests/test_string.cpp @@ -31,7 +31,6 @@ //#include "math_funcs.h" #include #include "os/os.h" -#include "drivers/nrex/regex.h" #include "core/io/ip_address.h" #include "test_string.h" @@ -462,18 +461,8 @@ bool test_25() { bool test_26() { - OS::get_singleton()->print("\n\nTest 26: RegEx\n"); - RegEx regexp("(.*):(.*)"); - - int res = regexp.find("name:password"); - printf("\tmatch: %s\n", (res>=0)?"true":"false"); - - printf("\t%i captures:\n", regexp.get_capture_count()); - for (int i = 0; i=0); + //TODO: Do replacement RegEx test + return true; }; struct test_27_data { diff --git a/doc/base/classes.xml b/doc/base/classes.xml index 5eb021f6c05..cafb1449158 100644 --- a/doc/base/classes.xml +++ b/doc/base/classes.xml @@ -32514,6 +32514,7 @@ would be read as [code]"(?:\\.|[^"])*"[/code] Currently supported features: * Capturing [code]()[/code] and non-capturing [code](?:)[/code] groups + * Named capturing groups [code](?P<name>)[/code] * Any character [code].[/code] * Shorthand character classes [code]\w \W \s \S \d \D[/code] * User-defined character classes such as [code][A-Za-z][/code] @@ -32522,7 +32523,7 @@ * Lazy (non-greedy) quantifiers [code]*?[/code] * Beginning [code]^[/code] and end [code]$[/code] anchors * Alternation [code]|[/code] - * Backreferences [code]\1[/code] and [code]\g{1}[/code] + * Backreferences [code]\1[/code], [code]\g{1}[/code], and [code]\g<name>[/code] * POSIX character classes [code][[:alnum:]][/code] * Lookahead [code](?=)[/code], [code](?!)[/code] and lookbehind [code](?<=)[/code], [code](?<!)[/code] * ASCII [code]\xFF[/code] and Unicode [code]\uFFFF[/code] code points (in a style similar to Python) @@ -32531,7 +32532,7 @@ - This method resets the state of the object, as it was freshly created. Namely, it unassigns the regular expression of this object, and forgets all captures made by the last [method find]. + This method resets the state of the object, as it was freshly created. Namely, it unassigns the regular expression of this object. @@ -32539,15 +32540,41 @@ - - - Compiles and assign the regular expression pattern to use. The limit on the number of capturing groups can be specified or made unlimited if negative. + Compiles and assign the regular expression pattern to use. - + + + Returns the number of numeric capturing groups. + + + + + + + Returns an array of names of named capturing groups. + + + + + + + Returns the expression used to compile the code. + + + + + + + Returns whether this object has a valid regular expression assigned. + + + + + @@ -32555,45 +32582,96 @@ - This method tries to find the pattern within the string, and returns the position where it was found. It also stores any capturing group (see [method get_capture]) for further retrieval. + Searches the text for the compiled pattern. Returns a [RegExMatch] container of the first matching reult if found, otherwise null. The starting point of the serch could be specified without moving the string start anchor. - + - + + + + + + + - Returns a captured group. A captured group is the part of a string that matches a part of the pattern delimited by parentheses (unless they are non-capturing parentheses [i](?:)[/i]). + Searches the specified text for the compiled pattern and returns the text with the result replaced. Escapes and backreferences such as [code]\1[/code] and [code]\g<name>[/code] are automatically expanded and resolved. If no change was found the unmodified text is returned instead. - + + + + + + + + + + + + + + + + + Using results from the search, returns the specified string with escapes and backreferences such as [code]\1[/code] and [code]\g<name>[/code] expanded and resolved + + + + + + + + + Returns the end position of the match in the string. An interger can be specified for numeric groups or a string for named groups. Returns -1 if that group wasn't found or doesn't exist. Defaults to 0 (whole pattern). + + + + + + + Returns an array of the results of the numeric groups. + + + - Returns the number of capturing groups. A captured group is the part of a string that matches a part of the pattern delimited by parentheses (unless they are non-capturing parentheses [i](?:)[/i]). + Returns the number of numeric capturing groups. - + + + + + Returns a dictionary containing the named capturing groups and their results. + + + + + + + Returns an array of names of named capturing groups. + + + - + + Returns the starting position of the match in the string. An interger can be specified for numeric groups or a string for named groups. Returns -1 if that group wasn't found or doesn't exist. Defaults to 0 (whole pattern). - - + + + + - Return a list of all the captures made by the regular expression. - - - - - - - Returns whether this object has a valid regular expression assigned. + Returns the result of the match in the string. An interger can be specified for numeric groups or a string for named groups. Returns -1 if that group wasn't found or doesn't exist. Defaults to 0 (whole pattern). diff --git a/drivers/SCsub b/drivers/SCsub index 1f1509efa87..ab2c991f24a 100644 --- a/drivers/SCsub +++ b/drivers/SCsub @@ -25,7 +25,6 @@ SConscript('gl_context/SCsub'); # Core dependencies SConscript("png/SCsub"); -SConscript("nrex/SCsub"); # Tools override # FIXME: Should likely be integrated in the tools/ codebase diff --git a/drivers/nrex/README.md b/drivers/nrex/README.md deleted file mode 100644 index 7a942b24520..00000000000 --- a/drivers/nrex/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# NREX: Node RegEx - -[![Build Status](https://travis-ci.org/leezh/nrex.svg?branch=master)](https://travis-ci.org/leezh/nrex) - -** Version 0.2 ** - -Small node-based regular expression library. It only does text pattern -matchhing, not replacement. To use add the files `nrex.hpp`, `nrex.cpp` -and `nrex_config.h` to your project and follow the example: - - nrex regex; - regex.compile("^(fo+)bar$"); - - nrex_result captures[regex.capture_size()]; - if (regex.match("foobar", captures)) - { - std::cout << captures[0].start << std::endl; - std::cout << captures[0].length << std::endl; - } - -More details about its use is documented in `nrex.hpp` - -Currently supported features: - * Capturing `()` and non-capturing `(?:)` groups - * Any character `.` (includes newlines) - * Shorthand caracter classes `\w\W\s\S\d\D` - * POSIX character classes such as `[[:alnum:]]` - * Bracket expressions such as `[A-Za-z]` - * Simple quantifiers `?`, `*` and `+` - * Range quantifiers `{0,1}` - * Lazy (non-greedy) quantifiers `*?` - * Begining `^` and end `$` anchors - * Word boundaries `\b` - * Alternation `|` - * ASCII `\xFF` code points - * Unicode `\uFFFF` code points - * Positive `(?=)` and negative `(?!)` lookahead - * Positive `(?<=)` and negative `(? -#include -#define NREX_ISALPHANUM iswalnum -#define NREX_ISSPACE iswspace -#define NREX_STRLEN wcslen -#else -#include -#include -#define NREX_ISALPHANUM isalnum -#define NREX_ISSPACE isspace -#define NREX_STRLEN strlen -#endif - -#ifdef NREX_THROW_ERROR -#define NREX_COMPILE_ERROR(M) throw nrex_compile_error(M) -#else -#define NREX_COMPILE_ERROR(M) reset(); return false -#endif - -#ifndef NREX_NEW -#define NREX_NEW(X) new X -#define NREX_NEW_ARRAY(X, N) new X[N] -#define NREX_DELETE(X) delete X -#define NREX_DELETE_ARRAY(X) delete[] X -#endif - -template -class nrex_array -{ - private: - T* _data; - unsigned int _reserved; - unsigned int _size; - public: - nrex_array() - : _data(NREX_NEW_ARRAY(T, 2)) - , _reserved(2) - , _size(0) - { - } - - nrex_array(unsigned int reserved) - : _data(NREX_NEW_ARRAY(T, reserved ? reserved : 1)) - , _reserved(reserved ? reserved : 1) - , _size(0) - { - } - - ~nrex_array() - { - NREX_DELETE_ARRAY(_data); - } - - unsigned int size() const - { - return _size; - } - - void reserve(unsigned int size) - { - if (size < _size) { - size = _size; - } - if (size == 0) { - size = 1; - } - T* old = _data; - _data = NREX_NEW_ARRAY(T, size); - _reserved = size; - for (unsigned int i = 0; i < _size; ++i) - { - _data[i] = old[i]; - } - NREX_DELETE_ARRAY(old); - } - - void push(T item) - { - if (_size == _reserved) - { - reserve(_reserved * 2); - } - _data[_size] = item; - _size++; - } - - const T& top() const - { - return _data[_size - 1]; - } - - const T& operator[] (unsigned int i) const - { - return _data[i]; - } - - void pop() - { - if (_size > 0) - { - --_size; - } - } -}; - -static int nrex_parse_hex(nrex_char c) -{ - if ('0' <= c && c <= '9') - { - return int(c - '0'); - } - else if ('a' <= c && c <= 'f') - { - return int(c - 'a') + 10; - } - else if ('A' <= c && c <= 'F') - { - return int(c - 'A') + 10; - } - return -1; -} - -static nrex_char nrex_unescape(const nrex_char*& c) -{ - switch (c[1]) - { - case '0': ++c; return '\0'; - case 'a': ++c; return '\a'; - case 'e': ++c; return '\e'; - case 'f': ++c; return '\f'; - case 'n': ++c; return '\n'; - case 'r': ++c; return '\r'; - case 't': ++c; return '\t'; - case 'v': ++c; return '\v'; - case 'b': ++c; return '\b'; - case 'x': - { - int point = 0; - for (int i = 2; i <= 3; ++i) - { - int res = nrex_parse_hex(c[i]); - if (res == -1) - { - return '\0'; - } - point = (point << 4) + res; - } - c = &c[3]; - return nrex_char(point); - } - case 'u': - { - int point = 0; - for (int i = 2; i <= 5; ++i) - { - int res = nrex_parse_hex(c[i]); - if (res == -1) - { - return '\0'; - } - point = (point << 4) + res; - } - c = &c[5]; - return nrex_char(point); - } - } - return (++c)[0]; -} - -struct nrex_search -{ - const nrex_char* str; - nrex_result* captures; - int end; - bool complete; - nrex_array lookahead_pos; - - nrex_char at(int pos) - { - return str[pos]; - } - - nrex_search(const nrex_char* str, nrex_result* captures, int lookahead) - : str(str) - , captures(captures) - , end(0) - , lookahead_pos(lookahead) - { - } -}; - -struct nrex_node -{ - nrex_node* next; - nrex_node* previous; - nrex_node* parent; - bool quantifiable; - int length; - - nrex_node(bool quantify = false) - : next(NULL) - , previous(NULL) - , parent(NULL) - , quantifiable(quantify) - , length(-1) - { - } - - virtual ~nrex_node() - { - if (next) - { - NREX_DELETE(next); - } - } - - virtual int test(nrex_search* s, int pos) const - { - return next ? next->test(s, pos) : -1; - } - - virtual int test_parent(nrex_search* s, int pos) const - { - if (next) - { - pos = next->test(s, pos); - } - if (pos >= 0) - { - s->complete = true; - } - if (parent && pos >= 0) - { - pos = parent->test_parent(s, pos); - } - if (pos < 0) - { - s->complete = false; - } - return pos; - } - - void increment_length(int amount, bool subtract = false) - { - if (amount >= 0 && length >= 0) - { - if (!subtract) - { - length += amount; - } - else - { - length -= amount; - } - } - else - { - length = -1; - } - if (parent) - { - parent->increment_length(amount, subtract); - } - } -}; - -enum nrex_group_type -{ - nrex_group_capture, - nrex_group_non_capture, - nrex_group_bracket, - nrex_group_look_ahead, - nrex_group_look_behind, -}; - -struct nrex_node_group : public nrex_node -{ - nrex_group_type type; - int id; - bool negate; - nrex_array childset; - nrex_node* back; - - nrex_node_group(nrex_group_type type, int id = 0) - : nrex_node(true) - , type(type) - , id(id) - , negate(false) - , back(NULL) - { - if (type != nrex_group_bracket) - { - length = 0; - } - else - { - length = 1; - } - if (type == nrex_group_look_ahead || type == nrex_group_look_behind) - { - quantifiable = false; - } - } - - virtual ~nrex_node_group() - { - for (unsigned int i = 0; i < childset.size(); ++i) - { - NREX_DELETE(childset[i]); - } - - } - - int test(nrex_search* s, int pos) const - { - int old_start; - if (type == nrex_group_capture) - { - old_start = s->captures[id].start; - s->captures[id].start = pos; - } - for (unsigned int i = 0; i < childset.size(); ++i) - { - s->complete = false; - int offset = 0; - if (type == nrex_group_look_behind) - { - if (pos < length) - { - return -1; - } - offset = length; - } - if (type == nrex_group_look_ahead) - { - s->lookahead_pos.push(pos); - } - int res = childset[i]->test(s, pos - offset); - if (type == nrex_group_look_ahead) - { - s->lookahead_pos.pop(); - } - if (s->complete) - { - return res; - } - if (negate) - { - if (res < 0) - { - res = pos + 1; - } - else - { - return -1; - } - if (i + 1 < childset.size()) - { - continue; - } - } - if (res >= 0) - { - if (type == nrex_group_capture) - { - s->captures[id].length = res - pos; - } - else if (type == nrex_group_look_ahead || type == nrex_group_look_behind) - { - res = pos; - } - return next ? next->test(s, res) : res; - } - } - if (type == nrex_group_capture) - { - s->captures[id].start = old_start; - } - return -1; - } - - virtual int test_parent(nrex_search* s, int pos) const - { - if (type == nrex_group_capture) - { - s->captures[id].length = pos - s->captures[id].start; - } - if (type == nrex_group_look_ahead) - { - pos = s->lookahead_pos[id]; - } - return nrex_node::test_parent(s, pos); - } - - void add_childset() - { - if (childset.size() > 0 && type != nrex_group_bracket) - { - length = -1; - } - back = NULL; - } - - void add_child(nrex_node* node) - { - node->parent = this; - node->previous = back; - if (back && type != nrex_group_bracket) - { - back->next = node; - } - else - { - childset.push(node); - } - if (type != nrex_group_bracket) - { - increment_length(node->length); - } - back = node; - } - - nrex_node* swap_back(nrex_node* node) - { - if (!back) - { - add_child(node); - return NULL; - } - nrex_node* old = back; - if (!old->previous) - { - childset.pop(); - } - if (type != nrex_group_bracket) - { - increment_length(old->length, true); - } - back = old->previous; - add_child(node); - return old; - } - - void pop_back() - { - if (back) - { - nrex_node* old = back; - if (!old->previous) - { - childset.pop(); - } - if (type != nrex_group_bracket) - { - increment_length(old->length, true); - } - back = old->previous; - NREX_DELETE(old); - } - } -}; - -struct nrex_node_char : public nrex_node -{ - nrex_char ch; - - nrex_node_char(nrex_char c) - : nrex_node(true) - , ch(c) - { - length = 1; - } - - int test(nrex_search* s, int pos) const - { - if (s->end <= pos || 0 > pos || s->at(pos) != ch) - { - return -1; - } - return next ? next->test(s, pos + 1) : pos + 1; - } -}; - -struct nrex_node_range : public nrex_node -{ - nrex_char start; - nrex_char end; - - nrex_node_range(nrex_char s, nrex_char e) - : nrex_node(true) - , start(s) - , end(e) - { - length = 1; - } - - int test(nrex_search* s, int pos) const - { - if (s->end <= pos || 0 > pos) - { - return -1; - } - nrex_char c = s->at(pos); - if (c < start || end < c) - { - return -1; - } - return next ? next->test(s, pos + 1) : pos + 1; - } -}; - -enum nrex_class_type -{ - nrex_class_none, - nrex_class_alnum, - nrex_class_alpha, - nrex_class_blank, - nrex_class_cntrl, - nrex_class_digit, - nrex_class_graph, - nrex_class_lower, - nrex_class_print, - nrex_class_punct, - nrex_class_space, - nrex_class_upper, - nrex_class_xdigit, - nrex_class_word -}; - -static bool nrex_compare_class(const nrex_char** pos, const char* text) -{ - unsigned int i = 0; - for (i = 0; text[i] != '\0'; ++i) - { - if ((*pos)[i] != text[i]) - { - return false; - } - } - if ((*pos)[i++] != ':' || (*pos)[i] != ']') - { - return false; - } - *pos = &(*pos)[i]; - return true; -} - -#define NREX_COMPARE_CLASS(POS, NAME) if (nrex_compare_class(POS, #NAME)) return nrex_class_ ## NAME - -static nrex_class_type nrex_parse_class(const nrex_char** pos) -{ - NREX_COMPARE_CLASS(pos, alnum); - NREX_COMPARE_CLASS(pos, alpha); - NREX_COMPARE_CLASS(pos, blank); - NREX_COMPARE_CLASS(pos, cntrl); - NREX_COMPARE_CLASS(pos, digit); - NREX_COMPARE_CLASS(pos, graph); - NREX_COMPARE_CLASS(pos, lower); - NREX_COMPARE_CLASS(pos, print); - NREX_COMPARE_CLASS(pos, punct); - NREX_COMPARE_CLASS(pos, space); - NREX_COMPARE_CLASS(pos, upper); - NREX_COMPARE_CLASS(pos, xdigit); - NREX_COMPARE_CLASS(pos, word); - return nrex_class_none; -} - -struct nrex_node_class : public nrex_node -{ - nrex_class_type type; - - nrex_node_class(nrex_class_type t) - : nrex_node(true) - , type(t) - { - length = 1; - } - - int test(nrex_search* s, int pos) const - { - if (s->end <= pos || 0 > pos) - { - return -1; - } - if (!test_class(s->at(pos))) - { - return -1; - } - return next ? next->test(s, pos + 1) : pos + 1; - } - - bool test_class(nrex_char c) const - { - if ((0 <= c && c <= 0x1F) || c == 0x7F) - { - if (type == nrex_class_cntrl) - { - return true; - } - } - else if (c < 0x7F) - { - if (type == nrex_class_print) - { - return true; - } - else if (type == nrex_class_graph && c != ' ') - { - return true; - } - else if ('0' <= c && c <= '9') - { - switch (type) - { - case nrex_class_alnum: - case nrex_class_digit: - case nrex_class_xdigit: - case nrex_class_word: - return true; - default: - break; - } - } - else if ('A' <= c && c <= 'Z') - { - switch (type) - { - case nrex_class_alnum: - case nrex_class_alpha: - case nrex_class_upper: - case nrex_class_word: - return true; - case nrex_class_xdigit: - if (c <= 'F') - { - return true; - } - default: - break; - } - } - else if ('a' <= c && c <= 'z') - { - switch (type) - { - case nrex_class_alnum: - case nrex_class_alpha: - case nrex_class_lower: - case nrex_class_word: - return true; - case nrex_class_xdigit: - if (c <= 'f') - { - return true; - } - default: - break; - } - } - } - switch (c) - { - case ' ': - case '\t': - if (type == nrex_class_blank) - { - return true; - } - case '\r': - case '\n': - case '\f': - if (type == nrex_class_space) - { - return true; - } - break; - case '_': - if (type == nrex_class_word) - { - return true; - } - case ']': - case '[': - case '!': - case '"': - case '#': - case '$': - case '%': - case '&': - case '\'': - case '(': - case ')': - case '*': - case '+': - case ',': - case '.': - case '/': - case ':': - case ';': - case '<': - case '=': - case '>': - case '?': - case '@': - case '\\': - case '^': - case '`': - case '{': - case '|': - case '}': - case '~': - case '-': - if (type == nrex_class_punct) - { - return true; - } - break; - default: - break; - } - return false; - } -}; - -static bool nrex_is_shorthand(nrex_char repr) -{ - switch (repr) - { - case 'W': - case 'w': - case 'D': - case 'd': - case 'S': - case 's': - return true; - } - return false; -} - -struct nrex_node_shorthand : public nrex_node -{ - nrex_char repr; - - nrex_node_shorthand(nrex_char c) - : nrex_node(true) - , repr(c) - { - length = 1; - } - - int test(nrex_search* s, int pos) const - { - if (s->end <= pos || 0 > pos) - { - return -1; - } - bool found = false; - bool invert = false; - nrex_char c = s->at(pos); - switch (repr) - { - case '.': - found = true; - break; - case 'W': - invert = true; - case 'w': - if (c == '_' || NREX_ISALPHANUM(c)) - { - found = true; - } - break; - case 'D': - invert = true; - case 'd': - if ('0' <= c && c <= '9') - { - found = true; - } - break; - case 'S': - invert = true; - case 's': - if (NREX_ISSPACE(c)) - { - found = true; - } - break; - } - if (found == invert) - { - return -1; - } - return next ? next->test(s, pos + 1) : pos + 1; - } -}; - -static bool nrex_is_quantifier(nrex_char repr) -{ - switch (repr) - { - case '?': - case '*': - case '+': - case '{': - return true; - } - return false; -} - -struct nrex_node_quantifier : public nrex_node -{ - int min; - int max; - bool greedy; - nrex_node* child; - - nrex_node_quantifier(int min, int max) - : nrex_node() - , min(min) - , max(max) - , greedy(true) - , child(NULL) - { - } - - virtual ~nrex_node_quantifier() - { - if (child) - { - NREX_DELETE(child); - } - } - - int test(nrex_search* s, int pos) const - { - return test_step(s, pos, 0, pos); - } - - int test_step(nrex_search* s, int pos, int level, int start) const - { - if (pos > s->end) - { - return -1; - } - if (!greedy && level > min) - { - int res = pos; - if (next) - { - res = next->test(s, res); - } - if (s->complete) - { - return res; - } - if (res >= 0 && parent->test_parent(s, res) >= 0) - { - return res; - } - } - if (max >= 0 && level > max) - { - return -1; - } - if (level > 1 && level > min + 1 && pos == start) - { - return -1; - } - int res = pos; - if (level >= 1) - { - res = child->test(s, pos); - if (s->complete) - { - return res; - } - } - if (res >= 0) - { - int res_step = test_step(s, res, level + 1, start); - if (res_step >= 0) - { - return res_step; - } - else if (greedy && level >= min) - { - if (next) - { - res = next->test(s, res); - } - if (s->complete) - { - return res; - } - if (res >= 0 && parent->test_parent(s, res) >= 0) - { - return res; - } - } - } - return -1; - } - - virtual int test_parent(nrex_search* s, int pos) const - { - s->complete = false; - return pos; - } -}; - -struct nrex_node_anchor : public nrex_node -{ - bool end; - - nrex_node_anchor(bool end) - : nrex_node() - , end(end) - { - length = 0; - } - - int test(nrex_search* s, int pos) const - { - if (!end && pos != 0) - { - return -1; - } - else if (end && pos != s->end) - { - return -1; - } - return next ? next->test(s, pos) : pos; - } -}; - -struct nrex_node_word_boundary : public nrex_node -{ - bool inverse; - - nrex_node_word_boundary(bool inverse) - : nrex_node() - , inverse(inverse) - { - length = 0; - } - - int test(nrex_search* s, int pos) const - { - bool left = false; - bool right = false; - if (pos != 0) - { - nrex_char c = s->at(pos - 1); - if (c == '_' || NREX_ISALPHANUM(c)) - { - left = true; - } - } - if (pos != s->end) - { - nrex_char c = s->at(pos); - if (c == '_' || NREX_ISALPHANUM(c)) - { - right = true; - } - } - if ((left != right) == inverse) - { - return -1; - } - return next ? next->test(s, pos) : pos; - } -}; - -struct nrex_node_backreference : public nrex_node -{ - int ref; - - nrex_node_backreference(int ref) - : nrex_node(true) - , ref(ref) - { - length = -1; - } - - int test(nrex_search* s, int pos) const - { - nrex_result& r = s->captures[ref]; - for (int i = 0; i < r.length; ++i) - { - if (pos + i >= s->end) - { - return -1; - } - if (s->at(r.start + i) != s->at(pos + i)) - { - return -1; - } - } - return next ? next->test(s, pos + r.length) : pos + r.length; - } -}; - -bool nrex_has_lookbehind(nrex_array& stack) -{ - for (unsigned int i = 0; i < stack.size(); i++) - { - if (stack[i]->type == nrex_group_look_behind) - { - return true; - } - } - return false; -} - -nrex::nrex() - : _capturing(0) - , _lookahead_depth(0) - , _root(NULL) -{ -} - -nrex::nrex(const nrex_char* pattern, int captures) - : _capturing(0) - , _lookahead_depth(0) - , _root(NULL) -{ - compile(pattern, captures); -} - -nrex::~nrex() -{ - if (_root) - { - NREX_DELETE(_root); - } -} - -bool nrex::valid() const -{ - return (_root != NULL); -} - -void nrex::reset() -{ - _capturing = 0; - _lookahead_depth = 0; - if (_root) - { - NREX_DELETE(_root); - } - _root = NULL; -} - -int nrex::capture_size() const -{ - if (_root) - { - return _capturing + 1; - } - return 0; -} - -bool nrex::compile(const nrex_char* pattern, int captures) -{ - reset(); - nrex_node_group* root = NREX_NEW(nrex_node_group(nrex_group_capture, _capturing)); - nrex_array stack; - stack.push(root); - unsigned int lookahead_level = 0; - _root = root; - - for (const nrex_char* c = pattern; c[0] != '\0'; ++c) - { - if (c[0] == '(') - { - if (c[1] == '?') - { - if (c[2] == ':') - { - c = &c[2]; - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture)); - stack.top()->add_child(group); - stack.push(group); - } - else if (c[2] == '!' || c[2] == '=') - { - c = &c[2]; - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_ahead, lookahead_level++)); - group->negate = (c[0] == '!'); - stack.top()->add_child(group); - stack.push(group); - if (lookahead_level > _lookahead_depth) - { - _lookahead_depth = lookahead_level; - } - } - else if (c[2] == '<' && (c[3] == '!' || c[3] == '=')) - { - c = &c[3]; - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_behind)); - group->negate = (c[0] == '!'); - stack.top()->add_child(group); - stack.push(group); - } - else - { - NREX_COMPILE_ERROR("unrecognised qualifier for group"); - } - } - else if (captures >= 0 && _capturing < captures) - { - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_capture, ++_capturing)); - stack.top()->add_child(group); - stack.push(group); - } - else - { - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture)); - stack.top()->add_child(group); - stack.push(group); - } - } - else if (c[0] == ')') - { - if (stack.size() > 1) - { - if (stack.top()->type == nrex_group_look_ahead) - { - --lookahead_level; - } - stack.pop(); - } - else - { - NREX_COMPILE_ERROR("unexpected ')'"); - } - } - else if (c[0] == '[') - { - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_bracket)); - stack.top()->add_child(group); - if (c[1] == '^') - { - group->negate = true; - ++c; - } - bool first_child = true; - nrex_char previous_child; - bool previous_child_single = false; - while (true) - { - group->add_childset(); - ++c; - if (c[0] == '\0') - { - NREX_COMPILE_ERROR("unclosed bracket expression '['"); - } - if (c[0] == '[' && c[1] == ':') - { - const nrex_char* d = &c[2]; - nrex_class_type cls = nrex_parse_class(&d); - if (cls != nrex_class_none) - { - c = d; - group->add_child(NREX_NEW(nrex_node_class(cls))); - previous_child_single = false; - } - else - { - group->add_child(NREX_NEW(nrex_node_char('['))); - previous_child = '['; - previous_child_single = true; - } - } - else if (c[0] == ']' && !first_child) - { - break; - } - else if (c[0] == '\\') - { - if (nrex_is_shorthand(c[1])) - { - group->add_child(NREX_NEW(nrex_node_shorthand(c[1]))); - ++c; - previous_child_single = false; - } - else - { - const nrex_char* d = c; - nrex_char unescaped = nrex_unescape(d); - if (c == d) - { - NREX_COMPILE_ERROR("invalid escape token"); - } - group->add_child(NREX_NEW(nrex_node_char(unescaped))); - c = d; - previous_child = unescaped; - previous_child_single = true; - } - } - else if (previous_child_single && c[0] == '-') - { - bool is_range = false; - nrex_char next; - if (c[1] != '\0' && c[1] != ']') - { - if (c[1] == '\\') - { - const nrex_char* d = ++c; - next = nrex_unescape(d); - if (c == d) - { - NREX_COMPILE_ERROR("invalid escape token in range"); - } - } - else - { - next = c[1]; - ++c; - } - is_range = true; - } - if (is_range) - { - if (next < previous_child) - { - NREX_COMPILE_ERROR("text range out of order"); - } - group->pop_back(); - group->add_child(NREX_NEW(nrex_node_range(previous_child, next))); - previous_child_single = false; - } - else - { - group->add_child(NREX_NEW(nrex_node_char(c[0]))); - previous_child = c[0]; - previous_child_single = true; - } - } - else - { - group->add_child(NREX_NEW(nrex_node_char(c[0]))); - previous_child = c[0]; - previous_child_single = true; - } - first_child = false; - } - } - else if (nrex_is_quantifier(c[0])) - { - int min = 0; - int max = -1; - bool valid_quantifier = true; - if (c[0] == '?') - { - min = 0; - max = 1; - } - else if (c[0] == '+') - { - min = 1; - max = -1; - } - else if (c[0] == '*') - { - min = 0; - max = -1; - } - else if (c[0] == '{') - { - bool max_set = false; - const nrex_char* d = c; - while (true) - { - ++d; - if (d[0] == '\0') - { - valid_quantifier = false; - break; - } - else if (d[0] == '}') - { - break; - } - else if (d[0] == ',') - { - max_set = true; - continue; - } - else if (d[0] < '0' || '9' < d[0]) - { - valid_quantifier = false; - break; - } - if (max_set) - { - if (max < 0) - { - max = int(d[0] - '0'); - } - else - { - max = max * 10 + int(d[0] - '0'); - } - } - else - { - min = min * 10 + int(d[0] - '0'); - } - } - if (!max_set) - { - max = min; - } - if (valid_quantifier) - { - c = d; - } - } - if (valid_quantifier) - { - if (stack.top()->back == NULL || !stack.top()->back->quantifiable) - { - NREX_COMPILE_ERROR("element not quantifiable"); - } - nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier(min, max)); - if (min == max) - { - if (stack.top()->back->length >= 0) - { - quant->length = max * stack.top()->back->length; - } - } - else - { - if (nrex_has_lookbehind(stack)) - { - NREX_COMPILE_ERROR("variable length quantifiers inside lookbehind not supported"); - } - } - quant->child = stack.top()->swap_back(quant); - quant->child->previous = NULL; - quant->child->next = NULL; - quant->child->parent = quant; - if (c[1] == '?') - { - quant->greedy = false; - ++c; - } - } - else - { - stack.top()->add_child(NREX_NEW(nrex_node_char(c[0]))); - } - } - else if (c[0] == '|') - { - if (nrex_has_lookbehind(stack)) - { - NREX_COMPILE_ERROR("alternations inside lookbehind not supported"); - } - stack.top()->add_childset(); - } - else if (c[0] == '^' || c[0] == '$') - { - stack.top()->add_child(NREX_NEW(nrex_node_anchor((c[0] == '$')))); - } - else if (c[0] == '.') - { - stack.top()->add_child(NREX_NEW(nrex_node_shorthand('.'))); - } - else if (c[0] == '\\') - { - if (nrex_is_shorthand(c[1])) - { - stack.top()->add_child(NREX_NEW(nrex_node_shorthand(c[1]))); - ++c; - } - else if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) - { - int ref = 0; - bool unclosed = false; - if (c[1] == 'g') - { - unclosed = true; - c = &c[2]; - } - while ('0' <= c[1] && c[1] <= '9') - { - ref = ref * 10 + int(c[1] - '0'); - ++c; - } - if (c[1] == '}') - { - unclosed = false; - ++c; - } - if (ref > _capturing || ref <= 0 || unclosed) - { - NREX_COMPILE_ERROR("backreference to non-existent capture"); - } - if (nrex_has_lookbehind(stack)) - { - NREX_COMPILE_ERROR("backreferences inside lookbehind not supported"); - } - stack.top()->add_child(NREX_NEW(nrex_node_backreference(ref))); - } - else if (c[1] == 'b' || c[1] == 'B') - { - stack.top()->add_child(NREX_NEW(nrex_node_word_boundary(c[1] == 'B'))); - ++c; - } - else - { - const nrex_char* d = c; - nrex_char unescaped = nrex_unescape(d); - if (c == d) - { - NREX_COMPILE_ERROR("invalid escape token"); - } - stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped))); - c = d; - } - } - else - { - stack.top()->add_child(NREX_NEW(nrex_node_char(c[0]))); - } - } - if (stack.size() > 1) - { - NREX_COMPILE_ERROR("unclosed group '('"); - } - return true; -} - -bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int end) const -{ - if (!_root) - { - return false; - } - nrex_search s(str, captures, _lookahead_depth); - if (end >= offset) - { - s.end = end; - } - else - { - s.end = NREX_STRLEN(str); - } - for (int i = offset; i <= s.end; ++i) - { - for (int c = 0; c <= _capturing; ++c) - { - captures[c].start = 0; - captures[c].length = 0; - } - if (_root->test(&s, i) >= 0) - { - return true; - } - } - return false; -} diff --git a/drivers/nrex/nrex.hpp b/drivers/nrex/nrex.hpp deleted file mode 100644 index d30b7d01029..00000000000 --- a/drivers/nrex/nrex.hpp +++ /dev/null @@ -1,176 +0,0 @@ -// NREX: Node RegEx -// Version 0.2 -// -// Copyright (c) 2015-2016, Zher Huei Lee -// All rights reserved. -// -// This software is provided 'as-is', without any express or implied -// warranty. In no event will the authors be held liable for any damages -// arising from the use of this software. -// -// Permission is granted to anyone to use this software for any purpose, -// including commercial applications, and to alter it and redistribute it -// freely, subject to the following restrictions: -// -// 1. The origin of this software must not be misrepresented; you must not -// claim that you wrote the original software. If you use this software -// in a product, an acknowledgment in the product documentation would -// be appreciated but is not required. -// -// 2. Altered source versions must be plainly marked as such, and must not -// be misrepresented as being the original software. -// -// 3. This notice may not be removed or altered from any source -// distribution. -// - -#ifndef NREX_HPP -#define NREX_HPP - -#include "nrex_config.h" - -#ifdef NREX_UNICODE -typedef wchar_t nrex_char; -#else -typedef char nrex_char; -#endif - -/*! - * \brief Struct to contain the range of a capture result - * - * The range provided is relative to the begining of the searched string. - * - * \see nrex_node::match() - */ -struct nrex_result -{ - public: - int start; /*!< Start of text range */ - int length; /*!< Length of text range */ -}; - -class nrex_node; - -/*! - * \brief Holds the compiled regex pattern - */ -class nrex -{ - private: - unsigned int _capturing; - unsigned int _lookahead_depth; - nrex_node* _root; - public: - - /*! - * \brief Initialises an empty regex container - */ - nrex(); - - /*! - * \brief Initialises and compiles the regex pattern - * - * This calls nrex::compile() with the same arguments. To check whether - * the compilation was successfull, use nrex::valid(). - * - * If the NREX_THROW_ERROR was defined it would automatically throw a - * runtime error nrex_compile_error if it encounters a problem when - * parsing the pattern. - * - * \param pattern The regex pattern - * \param captures The maximum number of capture groups to allow. Any - * extra would be converted to non-capturing groups. - * If negative, no limit would be imposed. Defaults - * to 9. - * - * \see nrex::compile() - */ - nrex(const nrex_char* pattern, int captures = 9); - - ~nrex(); - - /*! - * \brief Removes the compiled regex and frees up the memory - */ - void reset(); - - /*! - * \brief Checks if there is a compiled regex being stored - * \return True if present, False if not present - */ - bool valid() const; - - /*! - * \brief Provides number of captures the compiled regex uses - * - * This is used to provide the array size of the captures needed for - * nrex::match() to work. The size is actually the number of capture - * groups + one for the matching of the entire pattern. This can be - * capped using the extra argument given in nrex::compile() - * (default 10). - * - * \return The number of captures - */ - int capture_size() const; - - /*! - * \brief Compiles the provided regex pattern - * - * This automatically removes the existing compiled regex if already - * present. - * - * If the NREX_THROW_ERROR was defined it would automatically throw a - * runtime error nrex_compile_error if it encounters a problem when - * parsing the pattern. - * - * \param pattern The regex pattern - * \param captures The maximum number of capture groups to allow. Any - * extra would be converted to non-capturing groups. - * If negative, no limit would be imposed. Defaults - * to 9. - * \return True if the pattern was succesfully compiled - */ - bool compile(const nrex_char* pattern, int captures = 9); - - /*! - * \brief Uses the pattern to search through the provided string - * \param str The text to search through. It only needs to be - * null terminated if the end point is not provided. - * This also determines the starting anchor. - * \param captures The array of results to store the capture results. - * The size of that array needs to be the same as the - * size given in nrex::capture_size(). As it matches - * the function fills the array with the results. 0 is - * the result for the entire pattern, 1 and above - * corresponds to the regex capture group if present. - * \param offset The starting point of the search. This does not move - * the starting anchor. Defaults to 0. - * \param end The end point of the search. This also determines - * the ending anchor. If a number less than the offset - * is provided, the search would be done until null - * termination. Defaults to -1. - * \return True if a match was found. False otherwise. - */ - bool match(const nrex_char* str, nrex_result* captures, int offset = 0, int end = -1) const; -}; - -#ifdef NREX_THROW_ERROR - -#include - -class nrex_compile_error : std::runtime_error -{ - public: - nrex_compile_error(const char* message) - : std::runtime_error(message) - { - } - - ~nrex_compile_error() throw() - { - } -}; - -#endif - -#endif // NREX_HPP diff --git a/drivers/nrex/nrex_config.h b/drivers/nrex/nrex_config.h deleted file mode 100644 index 540f34f8b49..00000000000 --- a/drivers/nrex/nrex_config.h +++ /dev/null @@ -1,12 +0,0 @@ -// Godot-specific configuration -// To use this, replace nrex_config.h - -#include "core/os/memory.h" - -#define NREX_UNICODE -//#define NREX_THROW_ERROR - -#define NREX_NEW(X) memnew(X) -#define NREX_NEW_ARRAY(X, N) memnew_arr(X, N) -#define NREX_DELETE(X) memdelete(X) -#define NREX_DELETE_ARRAY(X) memdelete_arr(X) diff --git a/drivers/nrex/regex.cpp b/drivers/nrex/regex.cpp deleted file mode 100644 index 7bf14d14ad2..00000000000 --- a/drivers/nrex/regex.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/*************************************************************************/ -/* regex.cpp */ -/*************************************************************************/ -/* This file is part of: */ -/* GODOT ENGINE */ -/* http://www.godotengine.org */ -/*************************************************************************/ -/* Copyright (c) 2007-2016 Juan Linietsky, Ariel Manzur. */ -/* */ -/* Permission is hereby granted, free of charge, to any person obtaining */ -/* a copy of this software and associated documentation files (the */ -/* "Software"), to deal in the Software without restriction, including */ -/* without limitation the rights to use, copy, modify, merge, publish, */ -/* distribute, sublicense, and/or sell copies of the Software, and to */ -/* permit persons to whom the Software is furnished to do so, subject to */ -/* the following conditions: */ -/* */ -/* The above copyright notice and this permission notice shall be */ -/* included in all copies or substantial portions of the Software. */ -/* */ -/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ -/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ -/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ -/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ -/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ -/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ -/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -/*************************************************************************/ -#include "regex.h" -#include "nrex.hpp" -#include "core/os/memory.h" - -void RegEx::_bind_methods() { - - ObjectTypeDB::bind_method(_MD("compile","pattern", "capture"),&RegEx::compile, DEFVAL(9)); - ObjectTypeDB::bind_method(_MD("find","text","start","end"),&RegEx::find, DEFVAL(0), DEFVAL(-1)); - ObjectTypeDB::bind_method(_MD("clear"),&RegEx::clear); - ObjectTypeDB::bind_method(_MD("is_valid"),&RegEx::is_valid); - ObjectTypeDB::bind_method(_MD("get_capture_count"),&RegEx::get_capture_count); - ObjectTypeDB::bind_method(_MD("get_capture","capture"),&RegEx::get_capture); - ObjectTypeDB::bind_method(_MD("get_capture_start","capture"),&RegEx::get_capture_start); - ObjectTypeDB::bind_method(_MD("get_captures"),&RegEx::_bind_get_captures); - -}; - -StringArray RegEx::_bind_get_captures() const { - - StringArray ret; - int count = get_capture_count(); - for (int i=0; i(); } void unregister_core_driver_types() { diff --git a/drivers/nrex/SCsub b/modules/regex/SCsub similarity index 50% rename from drivers/nrex/SCsub rename to modules/regex/SCsub index ee39fd26317..08824067616 100644 --- a/drivers/nrex/SCsub +++ b/modules/regex/SCsub @@ -2,6 +2,6 @@ Import('env') -env.add_source_files(env.drivers_sources, "*.cpp") +env.add_source_files(env.modules_sources, "*.cpp") Export('env') diff --git a/modules/regex/config.py b/modules/regex/config.py new file mode 100644 index 00000000000..667b5d8ba65 --- /dev/null +++ b/modules/regex/config.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python + +def can_build(platform): + return True + +def configure(env): + pass + diff --git a/modules/regex/regex.cpp b/modules/regex/regex.cpp new file mode 100644 index 00000000000..8f26d764c42 --- /dev/null +++ b/modules/regex/regex.cpp @@ -0,0 +1,1465 @@ +/*************************************************************************/ +/* regex.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* http://www.godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2016 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#include "regex.h" +#include +#include + +static int RegEx_hex2int(const CharType c) +{ + if ('0' <= c && c <= '9') + return int(c - '0'); + else if ('a' <= c && c <= 'f') + return int(c - 'a') + 10; + else if ('A' <= c && c <= 'F') + return int(c - 'A') + 10; + return -1; +} + +struct RegExSearch { + + Ref match; + const CharType* str; + int end; + int eof; + + // For standard quantifier behaviour, test_parent is used to check the + // rest of the pattern. If the pattern matches, to prevent the parent + // from testing again, the complete flag is used as a shortcut out. + bool complete; + + // With lookahead, the position needs to rewind to its starting position + // when test_parent is used. Due to functional programming, this state + // has to be kept as a parameter. + Vector lookahead_pos; + + CharType at(int p_pos) { + return str[p_pos]; + } + + RegExSearch(Ref& p_match, int p_end, int p_lookahead) : match(p_match) { + + str = p_match->string.c_str(); + end = p_end; + eof = p_match->string.length(); + complete = false; + lookahead_pos.resize(p_lookahead); + } + +}; + +struct RegExNode { + + RegExNode* next; + RegExNode* previous; + RegExNode* parent; + bool quantifiable; + int length; + + RegExNode() { + + next = NULL; + previous = NULL; + parent = NULL; + quantifiable = false; + length = -1; + } + + virtual ~RegExNode() { + + if (next) + memdelete(next); + } + + virtual int test(RegExSearch& s, int pos) const { + + return next ? next->test(s, pos) : -1; + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + if (next) + pos = next->test(s, pos); + + if (pos >= 0) { + s.complete = true; + if (parent) + pos = parent->test_parent(s, pos); + } + + if (pos < 0) + s.complete = false; + + return pos; + } + + void increment_length(int amount, bool subtract = false) { + + if (amount >= 0 && length >= 0) { + if (!subtract) + length += amount; + else + length -= amount; + } else { + length = -1; + } + + if (parent) + parent->increment_length(amount, subtract); + + } + +}; + +struct RegExNodeChar : public RegExNode { + + CharType ch; + + RegExNodeChar(CharType p_char) { + + length = 1; + quantifiable = true; + ch = p_char; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos || s.at(pos) != ch) + return -1; + + return next ? next->test(s, pos + 1) : pos + 1; + } + + static CharType parse_escape(const CharType*& c) { + + int point = 0; + switch (c[1]) { + case 'x': + for (int i = 2; i <= 3; ++i) { + int res = RegEx_hex2int(c[i]); + if (res == -1) + return '\0'; + point = (point << 4) + res; + } + c = &c[3]; + return CharType(point); + case 'u': + for (int i = 2; i <= 5; ++i) { + int res = RegEx_hex2int(c[i]); + if (res == -1) + return '\0'; + point = (point << 4) + res; + } + c = &c[5]; + return CharType(point); + case '0': ++c; return '\0'; + case 'a': ++c; return '\a'; + case 'e': ++c; return '\e'; + case 'f': ++c; return '\f'; + case 'n': ++c; return '\n'; + case 'r': ++c; return '\r'; + case 't': ++c; return '\t'; + case 'v': ++c; return '\v'; + case 'b': ++c; return '\b'; + default: break; + } + return (++c)[0]; + } +}; + +struct RegExNodeRange : public RegExNode { + + CharType start; + CharType end; + + RegExNodeRange(CharType p_start, CharType p_end) { + + length = 1; + quantifiable = true; + start = p_start; + end = p_end; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos) + return -1; + + CharType c = s.at(pos); + if (c < start || end < c) + return -1; + + return next ? next->test(s, pos + 1) : pos + 1; + } +}; + +struct RegExNodeShorthand : public RegExNode { + + CharType repr; + + RegExNodeShorthand(CharType p_repr) { + + length = 1; + quantifiable = true; + repr = p_repr; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos) + return -1; + + bool found = false; + bool invert = false; + CharType c = s.at(pos); + switch (repr) { + case '.': + found = true; + break; + case 'W': + invert = true; + case 'w': + found = (c == '_' || iswalnum(c) != 0); + break; + case 'D': + invert = true; + case 'd': + found = ('0' <= c && c <= '9'); + break; + case 'S': + invert = true; + case 's': + found = (iswspace(c) != 0); + break; + default: + break; + } + + if (found == invert) + return -1; + + return next ? next->test(s, pos + 1) : pos + 1; + } +}; + +struct RegExNodeClass : public RegExNode { + + enum Type { + Type_none, + Type_alnum, + Type_alpha, + Type_ascii, + Type_blank, + Type_cntrl, + Type_digit, + Type_graph, + Type_lower, + Type_print, + Type_punct, + Type_space, + Type_upper, + Type_xdigit, + Type_word + }; + + Type type; + + bool test_class(CharType c) const { + + static Vector REGEX_NODE_SPACE = String(" \t\r\n\f"); + static Vector REGEX_NODE_PUNCT = String("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"); + + switch (type) { + case Type_alnum: + if ('0' <= c && c <= '9') return true; + if ('a' <= c && c <= 'z') return true; + if ('A' <= c && c <= 'Z') return true; + return false; + case Type_alpha: + if ('a' <= c && c <= 'z') return true; + if ('A' <= c && c <= 'Z') return true; + return false; + case Type_ascii: + return (0x00 <= c && c <= 0x7F); + case Type_blank: + return (c == ' ' || c == '\t'); + case Type_cntrl: + return ((0x00 <= c && c <= 0x1F) || c == 0x7F); + case Type_digit: + return ('0' <= c && c <= '9'); + case Type_graph: + return (0x20 < c && c < 0x7F); + case Type_lower: + return ('a' <= c && c <= 'z'); + case Type_print: + return (0x1F < c && c < 0x1F); + case Type_punct: + return (REGEX_NODE_PUNCT.find(c) >= 0); + case Type_space: + return (REGEX_NODE_SPACE.find(c) >= 0); + case Type_upper: + return ('A' <= c && c <= 'Z'); + case Type_xdigit: + if ('0' <= c && c <= '9') return true; + if ('a' <= c && c <= 'f') return true; + if ('A' <= c && c <= 'F') return true; + return false; + case Type_word: + if ('0' <= c && c <= '9') return true; + if ('a' <= c && c <= 'z') return true; + if ('A' <= c && c <= 'Z') return true; + return (c == '_'); + default: + return false; + } + return false; + } + + RegExNodeClass(Type p_type) { + + length = 1; + quantifiable = true; + type = p_type; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos) + return -1; + + if (!test_class(s.at(pos))) + return -1; + + return next ? next->test(s, pos + 1) : pos + 1; + } + +#define REGEX_CMP_CLASS(POS, NAME) if (cmp_class(POS, #NAME)) return Type_ ## NAME + + static Type parse_type(const CharType*& p_pos) { + + REGEX_CMP_CLASS(p_pos, alnum); + REGEX_CMP_CLASS(p_pos, alpha); + REGEX_CMP_CLASS(p_pos, ascii); + REGEX_CMP_CLASS(p_pos, blank); + REGEX_CMP_CLASS(p_pos, cntrl); + REGEX_CMP_CLASS(p_pos, digit); + REGEX_CMP_CLASS(p_pos, graph); + REGEX_CMP_CLASS(p_pos, lower); + REGEX_CMP_CLASS(p_pos, print); + REGEX_CMP_CLASS(p_pos, punct); + REGEX_CMP_CLASS(p_pos, space); + REGEX_CMP_CLASS(p_pos, upper); + REGEX_CMP_CLASS(p_pos, xdigit); + REGEX_CMP_CLASS(p_pos, word); + return Type_none; + } + + static bool cmp_class(const CharType*& p_pos, const char* p_text) { + + unsigned int i = 0; + for (i = 0; p_text[i] != '\0'; ++i) + if (p_pos[i] != p_text[i]) + return false; + + if (p_pos[i++] != ':' || p_pos[i] != ']') + return false; + + p_pos = &p_pos[i]; + return true; + } +}; + +struct RegExNodeAnchorStart : public RegExNode { + + RegExNodeAnchorStart() { + + length = 0; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (pos != 0) + return -1; + + return next ? next->test(s, pos) : pos; + } +}; + +struct RegExNodeAnchorEnd : public RegExNode { + + RegExNodeAnchorEnd() { + + length = 0; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (pos != s.eof) + return -1; + + return next ? next->test(s, pos) : pos; + } +}; + +struct RegExNodeWordBoundary : public RegExNode { + + bool inverse; + + RegExNodeWordBoundary(bool p_inverse) { + + length = 0; + inverse = p_inverse; + } + + virtual int test(RegExSearch& s, int pos) const { + + bool left = false; + bool right = false; + + if (pos != 0) { + CharType c = s.at(pos - 1); + if (c == '_' || iswalnum(c)) + left = true; + } + + if (pos != s.eof) { + CharType c = s.at(pos); + if (c == '_' || iswalnum(c)) + right = true; + } + + if ((left == right) != inverse) + return -1; + + return next ? next->test(s, pos) : pos; + } +}; + +struct RegExNodeQuantifier : public RegExNode { + + int min; + int max; + bool greedy; + RegExNode* child; + + RegExNodeQuantifier(int p_min, int p_max) { + + min = p_min; + max = p_max; + greedy = true; + child = NULL; + } + + ~RegExNodeQuantifier() { + + if (child) + memdelete(child); + } + + virtual int test(RegExSearch& s, int pos) const { + + return test_step(s, pos, 0, pos); + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + s.complete = false; + return pos; + } + + int test_step(RegExSearch& s, int pos, int level, int start) const { + + if (pos > s.end) + return -1; + + if (!greedy && level > min) { + int res = next ? next->test(s, pos) : pos; + if (s.complete) + return res; + + if (res >= 0 && parent->test_parent(s, res) >= 0) + return res; + } + + if (max >= 0 && level > max) + return -1; + + int res = pos; + if (level >= 1) { + if (level > min + 1 && pos == start) + return -1; + + res = child->test(s, pos); + if (s.complete) + return res; + } + + if (res >= 0) { + + int res_step = test_step(s, res, level + 1, start); + if (res_step >= 0) + return res_step; + + if (greedy && level >= min) { + if (next) + res = next->test(s, res); + if (s.complete) + return res; + + if (res >= 0 && parent->test_parent(s, res) >= 0) + return res; + } + } + return -1; + } +}; + +struct RegExNodeBackReference : public RegExNode { + + int id; + + RegExNodeBackReference(int p_id) { + + length = -1; + quantifiable = true; + id = p_id; + } + + virtual int test(RegExSearch& s, int pos) const { + + RegExMatch::Group& ref = s.match->captures[id]; + for (int i = 0; i < ref.length; ++i) { + + if (pos + i >= s.end) + return -1; + + if (s.at(ref.start + i) != s.at(pos + i)) + return -1; + } + return next ? next->test(s, pos + ref.length) : pos + ref.length; + } +}; + + +struct RegExNodeGroup : public RegExNode { + + bool inverse; + bool reset_pos; + Vector childset; + RegExNode* back; + + RegExNodeGroup() { + + length = 0; + quantifiable = true; + inverse = false; + reset_pos = false; + back = NULL; + } + + virtual ~RegExNodeGroup() { + + for (int i = 0; i < childset.size(); ++i) + memdelete(childset[i]); + } + + virtual int test(RegExSearch& s, int pos) const { + + for (int i = 0; i < childset.size(); ++i) { + + s.complete = false; + + int res = childset[i]->test(s, pos); + + if (s.complete) + return res; + + if (inverse) { + if (res < 0) + res = pos + 1; + else + return -1; + + if (i + 1 < childset.size()) + continue; + } + + if (res >= 0) { + if (reset_pos) + res = pos; + return next ? next->test(s, res) : res; + } + } + return -1; + } + + void add_child(RegExNode* node) { + + node->parent = this; + node->previous = back; + + if (back) + back->next = node; + else + childset.push_back(node); + + increment_length(node->length); + + back = node; + } + + void add_childset() { + + if (childset.size() > 0) + length = -1; + back = NULL; + } + + RegExNode* swap_back(RegExNode* node) { + + RegExNode* old = back; + + if (old) { + if (!old->previous) + childset.remove(childset.size() - 1); + back = old->previous; + increment_length(old->length, true); + } + + add_child(node); + + return old; + } +}; + +struct RegExNodeCapturing : public RegExNodeGroup { + + int id; + + RegExNodeCapturing(int p_id = 0) { + + id = p_id; + } + + virtual int test(RegExSearch& s, int pos) const { + + RegExMatch::Group& ref = s.match->captures[id]; + int old_start = ref.start; + ref.start = pos; + + int res = RegExNodeGroup::test(s, pos); + + if (res >= 0) { + if (!s.complete) + ref.length = res - pos; + } else { + ref.start = old_start; + } + + return res; + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + RegExMatch::Group& ref = s.match->captures[id]; + ref.length = pos - ref.start; + return RegExNode::test_parent(s, pos); + } + + static Variant parse_name(const CharType*& c, bool p_allow_numeric) { + + if (c[1] == '0') { + return -1; + } else if ('1' <= c[1] && c[1] <= '9') { + if (!p_allow_numeric) + return -1; + int res = (++c)[0] - '0'; + while ('0' <= c[1] && c[1] <= '9') + res = res * 10 + int((++c)[0] - '0'); + if ((++c)[0] != '>') + return -1; + return res; + } else if (iswalnum(c[1])) { + String res(++c, 1); + while (iswalnum(c[1])) + res += String(++c, 1); + if ((++c)[0] != '>') + return -1; + return res; + } + return -1; + } +}; + +struct RegExNodeLookAhead : public RegExNodeGroup { + + int id; + + RegExNodeLookAhead(bool p_inverse, int p_id = 0) { + + quantifiable = false; + inverse = p_inverse; + reset_pos = true; + id = p_id; + } + + virtual int test(RegExSearch& s, int pos) const { + + s.lookahead_pos[id] = pos; + return RegExNodeGroup::test(s, pos); + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + return RegExNode::test_parent(s, s.lookahead_pos[id]); + } +}; + +struct RegExNodeLookBehind : public RegExNodeGroup { + + RegExNodeLookBehind(bool p_inverse, int p_id = 0) { + + quantifiable = false; + inverse = p_inverse; + reset_pos = true; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (pos < length) + return -1; + return RegExNodeGroup::test(s, pos - length); + } +}; + +struct RegExNodeBracket : public RegExNode { + + bool inverse; + Vector children; + + RegExNodeBracket() { + + length = 1; + quantifiable = true; + inverse = false; + } + + virtual ~RegExNodeBracket() { + + for (int i = 0; i < children.size(); ++i) + memdelete(children[i]); + } + + virtual int test(RegExSearch& s, int pos) const { + + for (int i = 0; i < children.size(); ++i) { + + int res = children[i]->test(s, pos); + + if (inverse) { + if (res < 0) + res = pos + 1; + else + return -1; + + if (i + 1 < children.size()) + continue; + } + + if (res >= 0) + return next ? next->test(s, res) : res; + } + return -1; + } + + void add_child(RegExNode* node) { + + node->parent = this; + children.push_back(node); + } + + void pop_back() { + + memdelete(children[children.size() - 1]); + children.remove(children.size() - 1); + } +}; + +#define REGEX_EXPAND_FAIL(MSG)\ +{\ + ERR_PRINT(MSG);\ + return String();\ +} + +String RegExMatch::expand(const String& p_template) const { + + String res; + for (const CharType* c = p_template.c_str(); *c != '\0'; ++c) { + if (c[0] == '\\') { + if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) { + + int ref = 0; + bool unclosed = false; + + if (c[1] == 'g') { + unclosed = true; + c = &c[2]; + } + + while ('0' <= c[1] && c[1] <= '9') { + ref = ref * 10 + int(c[1] - '0'); + ++c; + } + + if (unclosed) { + if (c[1] != '}') + REGEX_EXPAND_FAIL("unclosed backreference '{'"); + ++c; + } + + res += get_string(ref); + + } else if (c[1] =='g' && c[2] == '<') { + + const CharType* d = &c[2]; + + Variant name = RegExNodeCapturing::parse_name(d, true); + if (name == Variant(-1)) + REGEX_EXPAND_FAIL("unrecognised character for group name"); + + c = d; + + res += get_string(name); + + } else { + + const CharType* d = c; + CharType ch = RegExNodeChar::parse_escape(d); + if (c == d) + REGEX_EXPAND_FAIL("invalid escape token"); + res += String(&ch, 1); + c = d; + } + } else { + res += String(c, 1); + } + } + return res; +} + +int RegExMatch::get_group_count() const { + + int count = 0; + for (int i = 1; i < captures.size(); ++i) + if (captures[i].name.get_type() == Variant::INT) + ++count; + return count; +} + +Array RegExMatch::get_group_array() const { + + Array res; + for (int i = 1; i < captures.size(); ++i) { + const RegExMatch::Group& capture = captures[i]; + if (capture.name.get_type() != Variant::INT) + continue; + + if (capture.start >= 0) + res.push_back(string.substr(capture.start, capture.length)); + else + res.push_back(String()); + } + return res; +} + +Array RegExMatch::get_names() const { + + Array res; + for (int i = 1; i < captures.size(); ++i) + if (captures[i].name.get_type() == Variant::STRING) + res.push_back(captures[i].name); + return res; +} + +Dictionary RegExMatch::get_name_dict() const { + + Dictionary res; + for (int i = 1; i < captures.size(); ++i) { + const RegExMatch::Group& capture = captures[i]; + if (capture.name.get_type() != Variant::STRING) + continue; + + if (capture.start >= 0) + res[capture.name] = string.substr(capture.start, capture.length); + else + res[capture.name] = String(); + } + return res; +} + +String RegExMatch::get_string(const Variant& p_name) const { + + for (int i = 0; i < captures.size(); ++i) { + + const RegExMatch::Group& capture = captures[i]; + + if (capture.name != p_name) + continue; + + if (capture.start == -1) + return String(); + + return string.substr(capture.start, capture.length); + } + return String(); +} + +int RegExMatch::get_start(const Variant& p_name) const { + + for (int i = 0; i < captures.size(); ++i) + if (captures[i].name == p_name) + return captures[i].start; + return -1; +} + +int RegExMatch::get_end(const Variant& p_name) const { + + for (int i = 0; i < captures.size(); ++i) + if (captures[i].name == p_name) + return captures[i].start + captures[i].length; + return -1; +} + +RegExMatch::RegExMatch() { + +} + +static bool RegEx_is_shorthand(CharType ch) { + + switch (ch) { + case 'w': + case 'W': + case 'd': + case 'D': + case 's': + case 'S': + return true; + default: + break; + } + return false; +} + +#define REGEX_COMPILE_FAIL(MSG)\ +{\ + ERR_PRINT(MSG);\ + clear();\ + return FAILED;\ +} + +Error RegEx::compile(const String& p_pattern) { + + if (pattern == p_pattern) + return OK; + + clear(); + pattern = p_pattern; + group_names.push_back(0); + RegExNodeGroup* root_group = memnew(RegExNodeCapturing(0)); + root = root_group; + Vector stack; + stack.push_back(root_group); + int lookahead_level = 0; + int numeric_groups = 0; + const int numeric_max = 9; + + for (const CharType* c = p_pattern.c_str(); *c != '\0'; ++c) { + + switch (c[0]) { + case '(': + if (c[1] == '?') { + + RegExNodeGroup* group = NULL; + switch (c[2]) { + case ':': + c = &c[2]; + group = memnew(RegExNodeGroup()); + break; + case '!': + case '=': + group = memnew(RegExNodeLookAhead((c[2] == '!'), lookahead_level++)); + if (lookahead_depth < lookahead_level) + lookahead_depth = lookahead_level; + c = &c[2]; + break; + case '<': + if (c[3] == '!' || c[3] == '=') { + group = memnew(RegExNodeLookBehind((c[3] == '!'), lookahead_level++)); + c = &c[3]; + } + break; + case 'P': + if (c[3] == '<') { + const CharType* d = &c[3]; + Variant name = RegExNodeCapturing::parse_name(d, false); + if (name == Variant(-1)) + REGEX_COMPILE_FAIL("unrecognised character for group name"); + group = memnew(RegExNodeCapturing(group_names.size())); + group_names.push_back(name); + c = d; + } + default: + break; + } + if (!group) + REGEX_COMPILE_FAIL("unrecognised qualifier for group"); + stack[0]->add_child(group); + stack.insert(0, group); + + } else if (numeric_groups < numeric_max) { + + RegExNodeCapturing* group = memnew(RegExNodeCapturing(group_names.size())); + group_names.push_back(++numeric_groups); + stack[0]->add_child(group); + stack.insert(0, group); + + } else { + + RegExNodeGroup* group = memnew(RegExNodeGroup()); + stack[0]->add_child(group); + stack.insert(0, group); + } + break; + case ')': + if (stack.size() == 1) + REGEX_COMPILE_FAIL("unexpected ')'"); + stack.remove(0); + break; + case '\\': + if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) { + + int ref = 0; + bool unclosed = false; + + if (c[1] == 'g') { + unclosed = true; + c = &c[2]; + } + + while ('0' <= c[1] && c[1] <= '9') { + ref = ref * 10 + int(c[1] - '0'); + ++c; + } + + if (unclosed) { + if (c[1] != '}') + REGEX_COMPILE_FAIL("unclosed backreference '{'"); + ++c; + } + + if (ref > numeric_groups || ref <= 0) + REGEX_COMPILE_FAIL("backreference not found"); + + for (int i = 0; i < stack.size(); ++i) + if (dynamic_cast(stack[i])) + REGEX_COMPILE_FAIL("backreferences inside lookbehind not supported"); + + for (int i = 0; i < group_names.size(); ++i) { + if (group_names[i].get_type() == Variant::INT && int(group_names[i]) == ref) { + ref = group_names[i]; + break; + } + } + + stack[0]->add_child(memnew(RegExNodeBackReference(ref))); + + } if (c[1] =='g' && c[2] == '<') { + + const CharType* d = &c[2]; + + Variant name = RegExNodeCapturing::parse_name(d, true); + if (name == Variant(-1)) + REGEX_COMPILE_FAIL("unrecognised character for group name"); + + c = d; + + for (int i = 0; i < stack.size(); ++i) + if (dynamic_cast(stack[i])) + REGEX_COMPILE_FAIL("backreferences inside lookbehind not supported"); + + int ref = -1; + + for (int i = 0; i < group_names.size(); ++i) { + if (group_names[i].get_type() == Variant::INT && int(group_names[i]) == ref) { + ref = group_names[i]; + break; + } + } + + if (ref == -1) + REGEX_COMPILE_FAIL("backreference not found"); + + stack[0]->add_child(memnew(RegExNodeBackReference(ref))); + + } else if (c[1] == 'b' || c[1] == 'B') { + + stack[0]->add_child(memnew(RegExNodeWordBoundary(*(++c) == 'B'))); + + } else if (RegEx_is_shorthand(c[1])) { + + stack[0]->add_child(memnew(RegExNodeShorthand(*(++c)))); + + } else { + + const CharType* d = c; + CharType ch = RegExNodeChar::parse_escape(d); + if (c == d) + REGEX_COMPILE_FAIL("invalid escape token"); + stack[0]->add_child(memnew(RegExNodeChar(ch))); + c = d; + + } + break; + case '[': + { + RegExNodeBracket* bracket = memnew(RegExNodeBracket()); + stack[0]->add_child(bracket); + if (c[1] == '^') { + bracket->inverse = true; + ++c; + } + bool first_child = true; + CharType previous_child; + bool previous_child_single = false; + while (true) { + ++c; + if (!first_child && c[0] == ']') { + + break; + + } else if (c[0] == '\0') { + + REGEX_COMPILE_FAIL("unclosed bracket expression '['"); + + } else if (c[0] == '\\') { + + if (RegEx_is_shorthand(c[1])) { + bracket->add_child(memnew(RegExNodeShorthand(*(++c)))); + } else { + const CharType* d = c; + CharType ch = RegExNodeChar::parse_escape(d); + if (c == d) + REGEX_COMPILE_FAIL("invalid escape token"); + bracket->add_child(memnew(RegExNodeChar(ch))); + c = d; + previous_child = ch; + previous_child_single = true; + } + + } else if (c[0] == ']' && c[1] == ':') { + + const CharType* d = &c[2]; + RegExNodeClass::Type type = RegExNodeClass::parse_type(d); + if (type != RegExNodeClass::Type_none) { + + c = d; + previous_child_single = false; + + } else { + + bracket->add_child(memnew(RegExNodeChar('['))); + previous_child = '['; + previous_child_single = true; + } + } else if (previous_child_single && c[0] == '-') { + + if (c[1] != '\0' && c[1] != ']') { + + CharType next; + + if (c[1] == '\\') { + const CharType* d = ++c; + next = RegExNodeChar::parse_escape(d); + if (c == d) + REGEX_COMPILE_FAIL("invalid escape token"); + } else { + next = *(++c); + } + + if (next < previous_child) + REGEX_COMPILE_FAIL("text range out of order"); + + bracket->pop_back(); + bracket->add_child(memnew(RegExNodeRange(previous_child, next))); + previous_child_single = false; + } else { + + bracket->add_child(memnew(RegExNodeChar('-'))); + previous_child = '-'; + previous_child_single = true; + } + } else { + + bracket->add_child(memnew(RegExNodeChar(c[0]))); + previous_child = c[0]; + previous_child_single = true; + } + first_child = false; + } + } + break; + case '|': + for (int i = 0; i < stack.size(); ++i) + if (dynamic_cast(stack[i])) + REGEX_COMPILE_FAIL("alternations inside lookbehind not supported"); + stack[0]->add_childset(); + break; + case '^': + stack[0]->add_child(memnew(RegExNodeAnchorStart())); + break; + case '$': + stack[0]->add_child(memnew(RegExNodeAnchorEnd())); + break; + case '.': + stack[0]->add_child(memnew(RegExNodeShorthand('.'))); + break; + case '?': + case '*': + case '+': + case '{': + { + int min_val = 0; + int max_val = -1; + bool valid = true; + const CharType* d = c; + bool max_set = true; + switch (c[0]) { + case '?': + min_val = 0; + max_val = 1; + break; + case '*': + min_val = 0; + max_val = -1; + break; + case '+': + min_val = 1; + max_val = -1; + break; + case '{': + max_set = false; + while (valid) { + ++d; + if (d[0] == '}') { + break; + } else if (d[0] == ',') { + max_set = true; + } else if ('0' <= d[0] && d[0] <= '9') { + if (max_set) { + if (max_val < 0) + max_val = int(d[0] - '0'); + else + max_val = max_val * 10 + int(d[0] - '0'); + } else { + min_val = min_val * 10 + int(d[0] - '0'); + } + } else { + valid = false; + } + } + break; + default: + break; + } + + if (!max_set) + max_val = min_val; + + if (valid) { + + c = d; + + if (stack[0]->back == NULL || !stack[0]->back->quantifiable) + REGEX_COMPILE_FAIL("element not quantifiable"); + + if (min_val != max_val) + for (int i = 0; i < stack.size(); ++i) + if (dynamic_cast(stack[i])) + REGEX_COMPILE_FAIL("variable length quantifiers inside lookbehind not supported"); + + RegExNodeQuantifier* quant = memnew(RegExNodeQuantifier(min_val, max_val)); + quant->child = stack[0]->swap_back(quant); + quant->child->previous = NULL; + quant->child->parent = quant; + + if (min_val == max_val && quant->child->length >= 0) + quant->length = max_val * quant->child->length; + + if (c[1] == '?') { + quant->greedy = false; + ++c; + } + break; + } + } + default: + stack[0]->add_child(memnew(RegExNodeChar(c[0]))); + break; + } + } + if (stack.size() > 1) + REGEX_COMPILE_FAIL("unclosed group '('"); + return OK; +} + +Ref RegEx::search(const String& p_text, int p_start, int p_end) const { + + Ref res = memnew(RegExMatch()); + + for (int i = 0; i < group_names.size(); ++i) { + RegExMatch::Group group; + group.name = group_names[i]; + res->captures.push_back(group); + } + + res->string = p_text; + + if (p_end < p_start || p_end > p_text.length()) + p_end = p_text.length(); + + RegExSearch s(res, p_end, lookahead_depth); + + for (int i = p_start; i <= s.end; ++i) { + for (int c = 0; c < group_names.size(); ++c) { + res->captures[c].start = -1; + res->captures[c].length = 0; + } + if (root->test(s, i) >= 0) + break; + } + + if (res->captures[0].start >= 0) + return res; + return NULL; +} + +String RegEx::sub(const String& p_text, const String& p_template, int p_start, int p_end) const { + + Ref m = search(p_text, p_start, p_end); + RegExMatch::Group& s = m->captures[0]; + if (s.start >= 0) { + String res = p_text.substr(0, s.start) + m->expand(p_template); + int end = s.start + s.length; + if (end < p_text.length()) + res += p_text.substr(end, p_text.length() - end); + return res; + } + return p_text; +} + +void RegEx::clear() { + + if (root) + memdelete(root); + + pattern.clear(); + group_names.clear(); + lookahead_depth = 0; +} + +bool RegEx::is_valid() const { + + return (root != NULL); +} + +String RegEx::get_pattern() const { + + return pattern; +} + +int RegEx::get_group_count() const { + + int count = 0; + for (int i = 1; i < group_names.size(); ++i) + if (group_names[i].get_type() == Variant::INT) + ++count; + return count; +} + +Array RegEx::get_names() const { + + Array res; + for (int i = 1; i < group_names.size(); ++i) + if (group_names[i].get_type() == Variant::STRING) + res.push_back(group_names[i]); + return res; +} + +RegEx::RegEx() { + + root = NULL; + lookahead_depth = 0; +} + +RegEx::RegEx(const String& p_pattern) { + + root = NULL; + compile(p_pattern); +} + +RegEx::~RegEx() { + + if (root) + memdelete(root); +} + +void RegExMatch::_bind_methods() { + + ObjectTypeDB::bind_method(_MD("expand","template"),&RegExMatch::expand); + ObjectTypeDB::bind_method(_MD("get_group_count"),&RegExMatch::get_group_count); + ObjectTypeDB::bind_method(_MD("get_group_array"),&RegExMatch::get_group_array); + ObjectTypeDB::bind_method(_MD("get_names"),&RegExMatch::get_names); + ObjectTypeDB::bind_method(_MD("get_name_dict"),&RegExMatch::get_name_dict); + ObjectTypeDB::bind_method(_MD("get_string","name"),&RegExMatch::get_string, DEFVAL(0)); + ObjectTypeDB::bind_method(_MD("get_start","name"),&RegExMatch::get_start, DEFVAL(0)); + ObjectTypeDB::bind_method(_MD("get_end","name"),&RegExMatch::get_end, DEFVAL(0)); +} + +void RegEx::_bind_methods() { + + ObjectTypeDB::bind_method(_MD("clear"),&RegEx::clear); + ObjectTypeDB::bind_method(_MD("compile","pattern"),&RegEx::compile); + ObjectTypeDB::bind_method(_MD("search","text","start","end"),&RegEx::search, DEFVAL(0), DEFVAL(-1)); + ObjectTypeDB::bind_method(_MD("sub","text","template","start","end"),&RegEx::sub, DEFVAL(0), DEFVAL(-1)); + ObjectTypeDB::bind_method(_MD("is_valid"),&RegEx::is_valid); + ObjectTypeDB::bind_method(_MD("get_pattern"),&RegEx::get_pattern); + ObjectTypeDB::bind_method(_MD("get_group_count"),&RegEx::get_group_count); + ObjectTypeDB::bind_method(_MD("get_names"),&RegEx::get_names); +} + diff --git a/modules/regex/regex.h b/modules/regex/regex.h new file mode 100644 index 00000000000..283368c34fc --- /dev/null +++ b/modules/regex/regex.h @@ -0,0 +1,114 @@ +/*************************************************************************/ +/* regex.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* http://www.godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2016 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#ifndef REGEX_H +#define REGEX_H + +#include "core/vector.h" +#include "core/ustring.h" +#include "core/dictionary.h" +#include "core/reference.h" +#include "core/resource.h" + +class RegExNode; + +class RegExMatch : public Reference { + + OBJ_TYPE(RegExMatch, Reference); + + struct Group { + Variant name; + int start; + int length; + }; + + Vector captures; + String string; + + friend class RegEx; + friend class RegExSearch; + friend class RegExNodeCapturing; + friend class RegExNodeBackReference; + +protected: + + static void _bind_methods(); + +public: + + String expand(const String& p_template) const; + + int get_group_count() const; + Array get_group_array() const; + + Array get_names() const; + Dictionary get_name_dict() const; + + String get_string(const Variant& p_name) const; + int get_start(const Variant& p_name) const; + int get_end(const Variant& p_name) const; + + RegExMatch(); + +}; + +class RegEx : public Reference { + + OBJ_TYPE(RegEx, Reference); + + RegExNode* root; + Vector group_names; + String pattern; + int lookahead_depth; + +protected: + + static void _bind_methods(); + +public: + + void clear(); + Error compile(const String& p_pattern); + + Ref search(const String& p_text, int p_start = 0, int p_end = -1) const; + String sub(const String& p_text, const String& p_template, int p_start = 0, int p_end = -1) const; + + bool is_valid() const; + String get_pattern() const; + int get_group_count() const; + Array get_names() const; + + RegEx(); + RegEx(const String& p_pattern); + ~RegEx(); + +}; + +#endif // REGEX_H + diff --git a/drivers/nrex/regex.h b/modules/regex/register_types.cpp similarity index 72% rename from drivers/nrex/regex.h rename to modules/regex/register_types.cpp index 74495442c70..050cf3efff0 100644 --- a/drivers/nrex/regex.h +++ b/modules/regex/register_types.cpp @@ -1,5 +1,5 @@ /*************************************************************************/ -/* regex.h */ +/* register_types.cpp */ /*************************************************************************/ /* This file is part of: */ /* GODOT ENGINE */ @@ -26,40 +26,18 @@ /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /*************************************************************************/ -#ifndef REGEX_H -#define REGEX_H -#include "ustring.h" -#include "vector.h" -#include "core/reference.h" -#include "nrex.hpp" +#include "register_types.h" +#include "object_type_db.h" +#include "regex.h" -class RegEx : public Reference { +void register_regex_types() { - OBJ_TYPE(RegEx, Reference); + ObjectTypeDB::register_type(); + ObjectTypeDB::register_type(); +} - mutable String text; - mutable Vector captures; - nrex exp; +void unregister_regex_types() { -protected: +} - static void _bind_methods(); - StringArray _bind_get_captures() const; - -public: - - void clear(); - bool is_valid() const; - int get_capture_count() const; - int get_capture_start(int capture) const; - String get_capture(int capture) const; - Error compile(const String& p_pattern, int capture = 9); - int find(const String& p_text, int p_start = 0, int p_end = -1) const; - - RegEx(); - RegEx(const String& p_pattern); - ~RegEx(); -}; - -#endif // REGEX_H diff --git a/modules/regex/register_types.h b/modules/regex/register_types.h new file mode 100644 index 00000000000..df3b508e144 --- /dev/null +++ b/modules/regex/register_types.h @@ -0,0 +1,31 @@ +/*************************************************************************/ +/* register_types.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* http://www.godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2016 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +void register_regex_types(); +void unregister_regex_types(); From c3b4686082bc92c70886ee848064009c8f628193 Mon Sep 17 00:00:00 2001 From: Zher Huei Lee Date: Mon, 24 Oct 2016 22:13:26 +0100 Subject: [PATCH 2/3] Added global sub and bounds checking to RegEx --- doc/base/classes.xml | 14 +++++----- modules/regex/regex.cpp | 57 ++++++++++++++++++++++++++++++++--------- modules/regex/regex.h | 2 +- 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/doc/base/classes.xml b/doc/base/classes.xml index cafb1449158..47fa1deb551 100644 --- a/doc/base/classes.xml +++ b/doc/base/classes.xml @@ -32582,7 +32582,7 @@ - Searches the text for the compiled pattern. Returns a [RegExMatch] container of the first matching reult if found, otherwise null. The starting point of the serch could be specified without moving the string start anchor. + Searches the text for the compiled pattern. Returns a [RegExMatch] container of the first matching reult if found, otherwise null. The region to search within can be specified without modifying where the start and end anchor would be. @@ -32590,14 +32590,16 @@ - + - + - + + + - Searches the specified text for the compiled pattern and returns the text with the result replaced. Escapes and backreferences such as [code]\1[/code] and [code]\g<name>[/code] are automatically expanded and resolved. If no change was found the unmodified text is returned instead. + Searches the text for the compiled pattern and replaces it with the specified string. Escapes and backreferences such as [code]\1[/code] and [code]\g<name>[/code] expanded and resolved. By default only the first instance is replaced but it can be changed for all instances (global replacement). The region to search within can be specified without modifying where the start and end anchor would be. @@ -32616,7 +32618,7 @@ - Using results from the search, returns the specified string with escapes and backreferences such as [code]\1[/code] and [code]\g<name>[/code] expanded and resolved + Using results from the search, returns the specified string with escapes and backreferences such as [code]\1[/code] and [code]\g<name>[/code] expanded and resolved. diff --git a/modules/regex/regex.cpp b/modules/regex/regex.cpp index 8f26d764c42..388e6dfdecc 100644 --- a/modules/regex/regex.cpp +++ b/modules/regex/regex.cpp @@ -1340,6 +1340,12 @@ Error RegEx::compile(const String& p_pattern) { Ref RegEx::search(const String& p_text, int p_start, int p_end) const { + ERR_FAIL_COND_V(!is_valid(), NULL); + ERR_FAIL_COND_V(p_start < 0, NULL); + ERR_FAIL_COND_V(p_start >= p_text.length(), NULL); + ERR_FAIL_COND_V(p_end > p_text.length(), NULL); + ERR_FAIL_COND_V(p_end != -1 && p_end < p_start, NULL); + Ref res = memnew(RegExMatch()); for (int i = 0; i < group_names.size(); ++i) { @@ -1350,7 +1356,7 @@ Ref RegEx::search(const String& p_text, int p_start, int p_end) cons res->string = p_text; - if (p_end < p_start || p_end > p_text.length()) + if (p_end == -1) p_end = p_text.length(); RegExSearch s(res, p_end, lookahead_depth); @@ -1369,18 +1375,45 @@ Ref RegEx::search(const String& p_text, int p_start, int p_end) cons return NULL; } -String RegEx::sub(const String& p_text, const String& p_template, int p_start, int p_end) const { +String RegEx::sub(const String& p_text, const String& p_replacement, bool p_all, int p_start, int p_end) const { - Ref m = search(p_text, p_start, p_end); - RegExMatch::Group& s = m->captures[0]; - if (s.start >= 0) { - String res = p_text.substr(0, s.start) + m->expand(p_template); - int end = s.start + s.length; - if (end < p_text.length()) - res += p_text.substr(end, p_text.length() - end); - return res; + ERR_FAIL_COND_V(!is_valid(), p_text); + ERR_FAIL_COND_V(p_start < 0, p_text); + ERR_FAIL_COND_V(p_start >= p_text.length(), p_text); + ERR_FAIL_COND_V(p_end > p_text.length(), p_text); + ERR_FAIL_COND_V(p_end != -1 && p_end < p_start, p_text); + + String text = p_text; + int start = p_start; + + if (p_end == -1) + p_end = p_text.length(); + + while (start < text.length() && (p_all || start == p_start)) { + + Ref m = search(text, start, p_end); + + RegExMatch::Group& s = m->captures[0]; + + if (s.start < 0) + break; + + String res = text.substr(0, s.start) + m->expand(p_replacement); + + start = res.length(); + + if (s.length == 0) + ++start; + + int sub_end = s.start + s.length; + if (sub_end < text.length()) + res += text.substr(sub_end, text.length() - sub_end); + + p_end += res.length() - text.length(); + + text = res; } - return p_text; + return text; } void RegEx::clear() { @@ -1456,7 +1489,7 @@ void RegEx::_bind_methods() { ObjectTypeDB::bind_method(_MD("clear"),&RegEx::clear); ObjectTypeDB::bind_method(_MD("compile","pattern"),&RegEx::compile); ObjectTypeDB::bind_method(_MD("search","text","start","end"),&RegEx::search, DEFVAL(0), DEFVAL(-1)); - ObjectTypeDB::bind_method(_MD("sub","text","template","start","end"),&RegEx::sub, DEFVAL(0), DEFVAL(-1)); + ObjectTypeDB::bind_method(_MD("sub","text","replacement","all","start","end"),&RegEx::sub, DEFVAL(false), DEFVAL(0), DEFVAL(-1)); ObjectTypeDB::bind_method(_MD("is_valid"),&RegEx::is_valid); ObjectTypeDB::bind_method(_MD("get_pattern"),&RegEx::get_pattern); ObjectTypeDB::bind_method(_MD("get_group_count"),&RegEx::get_group_count); diff --git a/modules/regex/regex.h b/modules/regex/regex.h index 283368c34fc..8d31b847732 100644 --- a/modules/regex/regex.h +++ b/modules/regex/regex.h @@ -97,7 +97,7 @@ public: Error compile(const String& p_pattern); Ref search(const String& p_text, int p_start = 0, int p_end = -1) const; - String sub(const String& p_text, const String& p_template, int p_start = 0, int p_end = -1) const; + String sub(const String& p_text, const String& p_replacement, bool p_all = false, int p_start = 0, int p_end = -1) const; bool is_valid() const; String get_pattern() const; From 9a5ce099f1c3559cc46b923d4e192a7be781163c Mon Sep 17 00:00:00 2001 From: Zher Huei Lee Date: Wed, 26 Oct 2016 13:05:00 +0100 Subject: [PATCH 3/3] Changed RegEx to inherit Resource --- modules/regex/regex.cpp | 8 ++++++-- modules/regex/regex.h | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/modules/regex/regex.cpp b/modules/regex/regex.cpp index 388e6dfdecc..a0f4b4934cc 100644 --- a/modules/regex/regex.cpp +++ b/modules/regex/regex.cpp @@ -985,7 +985,9 @@ static bool RegEx_is_shorthand(CharType ch) { Error RegEx::compile(const String& p_pattern) { - if (pattern == p_pattern) + ERR_FAIL_COND_V(p_pattern.length() == 0, FAILED); + + if (pattern == p_pattern && root) return OK; clear(); @@ -1421,7 +1423,7 @@ void RegEx::clear() { if (root) memdelete(root); - pattern.clear(); + root = NULL; group_names.clear(); lookahead_depth = 0; } @@ -1494,5 +1496,7 @@ void RegEx::_bind_methods() { ObjectTypeDB::bind_method(_MD("get_pattern"),&RegEx::get_pattern); ObjectTypeDB::bind_method(_MD("get_group_count"),&RegEx::get_group_count); ObjectTypeDB::bind_method(_MD("get_names"),&RegEx::get_names); + + ADD_PROPERTY(PropertyInfo(Variant::STRING, "pattern"), _SCS("compile"), _SCS("get_pattern")); } diff --git a/modules/regex/regex.h b/modules/regex/regex.h index 8d31b847732..803aa72b3fe 100644 --- a/modules/regex/regex.h +++ b/modules/regex/regex.h @@ -78,9 +78,9 @@ public: }; -class RegEx : public Reference { +class RegEx : public Resource { - OBJ_TYPE(RegEx, Reference); + OBJ_TYPE(RegEx, Resource); RegExNode* root; Vector group_names;