commit
a7e37586f3
3 changed files with 113 additions and 43 deletions
|
@ -1,6 +1,8 @@
|
||||||
# NREX: Node RegEx
|
# NREX: Node RegEx
|
||||||
|
|
||||||
Version 0.1
|
[![Build Status](https://travis-ci.org/leezh/nrex.svg?branch=master)](https://travis-ci.org/leezh/nrex)
|
||||||
|
|
||||||
|
** Version 0.2 **
|
||||||
|
|
||||||
Small node-based regular expression library. It only does text pattern
|
Small node-based regular expression library. It only does text pattern
|
||||||
matchhing, not replacement. To use add the files `nrex.hpp`, `nrex.cpp`
|
matchhing, not replacement. To use add the files `nrex.hpp`, `nrex.cpp`
|
||||||
|
@ -38,7 +40,7 @@ Currently supported features:
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
Copyright (c) 2015, Zher Huei Lee
|
Copyright (c) 2015-2016, Zher Huei Lee
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
This software is provided 'as-is', without any express or implied
|
This software is provided 'as-is', without any express or implied
|
||||||
|
@ -59,3 +61,15 @@ freely, subject to the following restrictions:
|
||||||
|
|
||||||
3. This notice may not be removed or altered from any source
|
3. This notice may not be removed or altered from any source
|
||||||
distribution.
|
distribution.
|
||||||
|
|
||||||
|
|
||||||
|
# Changes
|
||||||
|
|
||||||
|
## Version 0.2 (2016-08-04)
|
||||||
|
* Fixed capturing groups matching to invalid results
|
||||||
|
* Fixed parents of recursive quantifiers not expanding properly
|
||||||
|
* Fixed LookAhead sometimes adding to result
|
||||||
|
* More verbose unit testing
|
||||||
|
|
||||||
|
## Version 0.1 (2015-12-04)
|
||||||
|
* Initial release
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
// NREX: Node RegEx
|
// NREX: Node RegEx
|
||||||
// Version 0.1
|
// Version 0.2
|
||||||
//
|
//
|
||||||
// Copyright (c) 2015, Zher Huei Lee
|
// Copyright (c) 2015-2016, Zher Huei Lee
|
||||||
// All rights reserved.
|
// All rights reserved.
|
||||||
//
|
//
|
||||||
// This software is provided 'as-is', without any express or implied
|
// This software is provided 'as-is', without any express or implied
|
||||||
|
@ -68,6 +68,13 @@ class nrex_array
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
nrex_array(unsigned int size)
|
||||||
|
: _data(NREX_NEW_ARRAY(T, size))
|
||||||
|
, _reserved(size)
|
||||||
|
, _size(0)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
~nrex_array()
|
~nrex_array()
|
||||||
{
|
{
|
||||||
NREX_DELETE_ARRAY(_data);
|
NREX_DELETE_ARRAY(_data);
|
||||||
|
@ -100,7 +107,7 @@ class nrex_array
|
||||||
_size++;
|
_size++;
|
||||||
}
|
}
|
||||||
|
|
||||||
T& top()
|
const T& top() const
|
||||||
{
|
{
|
||||||
return _data[_size - 1];
|
return _data[_size - 1];
|
||||||
}
|
}
|
||||||
|
@ -189,17 +196,19 @@ struct nrex_search
|
||||||
nrex_result* captures;
|
nrex_result* captures;
|
||||||
int end;
|
int end;
|
||||||
bool complete;
|
bool complete;
|
||||||
|
nrex_array<int> lookahead_pos;
|
||||||
|
|
||||||
nrex_char at(int pos)
|
nrex_char at(int pos)
|
||||||
{
|
{
|
||||||
return str[pos];
|
return str[pos];
|
||||||
}
|
}
|
||||||
|
|
||||||
nrex_search(const nrex_char* str, nrex_result* captures)
|
nrex_search(const nrex_char* str, nrex_result* captures, int lookahead)
|
||||||
: str(str)
|
: str(str)
|
||||||
, captures(captures)
|
, captures(captures)
|
||||||
, end(0)
|
, end(0)
|
||||||
{
|
{
|
||||||
|
lookahead_pos.reserve(lookahead);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -239,13 +248,17 @@ struct nrex_node
|
||||||
{
|
{
|
||||||
pos = next->test(s, pos);
|
pos = next->test(s, pos);
|
||||||
}
|
}
|
||||||
|
if (pos >= 0)
|
||||||
|
{
|
||||||
|
s->complete = true;
|
||||||
|
}
|
||||||
if (parent && pos >= 0)
|
if (parent && pos >= 0)
|
||||||
{
|
{
|
||||||
pos = parent->test_parent(s, pos);
|
pos = parent->test_parent(s, pos);
|
||||||
}
|
}
|
||||||
if (pos >= 0)
|
if (pos < 0)
|
||||||
{
|
{
|
||||||
s->complete = true;
|
s->complete = false;
|
||||||
}
|
}
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
@ -274,25 +287,31 @@ struct nrex_node
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum nrex_group_type
|
||||||
|
{
|
||||||
|
nrex_group_capture,
|
||||||
|
nrex_group_non_capture,
|
||||||
|
nrex_group_bracket,
|
||||||
|
nrex_group_look_ahead,
|
||||||
|
nrex_group_look_behind,
|
||||||
|
};
|
||||||
|
|
||||||
struct nrex_node_group : public nrex_node
|
struct nrex_node_group : public nrex_node
|
||||||
{
|
{
|
||||||
static const int NonCapture = -1;
|
nrex_group_type type;
|
||||||
static const int Bracket = -2;
|
int id;
|
||||||
static const int LookAhead = -3;
|
|
||||||
static const int LookBehind = -4;
|
|
||||||
|
|
||||||
int mode;
|
|
||||||
bool negate;
|
bool negate;
|
||||||
nrex_array<nrex_node*> childset;
|
nrex_array<nrex_node*> childset;
|
||||||
nrex_node* back;
|
nrex_node* back;
|
||||||
|
|
||||||
nrex_node_group(int mode)
|
nrex_node_group(nrex_group_type type, int id = 0)
|
||||||
: nrex_node(true)
|
: nrex_node(true)
|
||||||
, mode(mode)
|
, type(type)
|
||||||
|
, id(id)
|
||||||
, negate(false)
|
, negate(false)
|
||||||
, back(NULL)
|
, back(NULL)
|
||||||
{
|
{
|
||||||
if (mode != Bracket)
|
if (type != nrex_group_bracket)
|
||||||
{
|
{
|
||||||
length = 0;
|
length = 0;
|
||||||
}
|
}
|
||||||
|
@ -300,7 +319,7 @@ struct nrex_node_group : public nrex_node
|
||||||
{
|
{
|
||||||
length = 1;
|
length = 1;
|
||||||
}
|
}
|
||||||
if (mode == LookAhead || mode == LookBehind)
|
if (type == nrex_group_look_ahead || type == nrex_group_look_behind)
|
||||||
{
|
{
|
||||||
quantifiable = false;
|
quantifiable = false;
|
||||||
}
|
}
|
||||||
|
@ -317,15 +336,17 @@ struct nrex_node_group : public nrex_node
|
||||||
|
|
||||||
int test(nrex_search* s, int pos) const
|
int test(nrex_search* s, int pos) const
|
||||||
{
|
{
|
||||||
if (mode >= 0)
|
int old_start;
|
||||||
|
if (type == nrex_group_capture)
|
||||||
{
|
{
|
||||||
s->captures[mode].start = pos;
|
old_start = s->captures[id].start;
|
||||||
|
s->captures[id].start = pos;
|
||||||
}
|
}
|
||||||
for (unsigned int i = 0; i < childset.size(); ++i)
|
for (unsigned int i = 0; i < childset.size(); ++i)
|
||||||
{
|
{
|
||||||
s->complete = false;
|
s->complete = false;
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
if (mode == LookBehind)
|
if (type == nrex_group_look_behind)
|
||||||
{
|
{
|
||||||
if (pos < length)
|
if (pos < length)
|
||||||
{
|
{
|
||||||
|
@ -333,7 +354,15 @@ struct nrex_node_group : public nrex_node
|
||||||
}
|
}
|
||||||
offset = length;
|
offset = length;
|
||||||
}
|
}
|
||||||
|
if (type == nrex_group_look_ahead)
|
||||||
|
{
|
||||||
|
s->lookahead_pos.push(pos);
|
||||||
|
}
|
||||||
int res = childset[i]->test(s, pos - offset);
|
int res = childset[i]->test(s, pos - offset);
|
||||||
|
if (type == nrex_group_look_ahead)
|
||||||
|
{
|
||||||
|
s->lookahead_pos.pop();
|
||||||
|
}
|
||||||
if (s->complete)
|
if (s->complete)
|
||||||
{
|
{
|
||||||
return res;
|
return res;
|
||||||
|
@ -355,32 +384,40 @@ struct nrex_node_group : public nrex_node
|
||||||
}
|
}
|
||||||
if (res >= 0)
|
if (res >= 0)
|
||||||
{
|
{
|
||||||
if (mode >= 0)
|
if (type == nrex_group_capture)
|
||||||
{
|
{
|
||||||
s->captures[mode].length = res - pos;
|
s->captures[id].length = res - pos;
|
||||||
}
|
}
|
||||||
else if (mode == LookAhead || mode == LookBehind)
|
else if (type == nrex_group_look_ahead || type == nrex_group_look_behind)
|
||||||
{
|
{
|
||||||
res = pos;
|
res = pos;
|
||||||
}
|
}
|
||||||
return next ? next->test(s, res) : res;
|
return next ? next->test(s, res) : res;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (type == nrex_group_capture)
|
||||||
|
{
|
||||||
|
s->captures[id].start = old_start;
|
||||||
|
}
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual int test_parent(nrex_search* s, int pos) const
|
virtual int test_parent(nrex_search* s, int pos) const
|
||||||
{
|
{
|
||||||
if (mode >= 0)
|
if (type == nrex_group_capture)
|
||||||
{
|
{
|
||||||
s->captures[mode].length = pos - s->captures[mode].start;
|
s->captures[id].length = pos - s->captures[id].start;
|
||||||
|
}
|
||||||
|
if (type == nrex_group_look_ahead)
|
||||||
|
{
|
||||||
|
pos = s->lookahead_pos[id];
|
||||||
}
|
}
|
||||||
return nrex_node::test_parent(s, pos);
|
return nrex_node::test_parent(s, pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_childset()
|
void add_childset()
|
||||||
{
|
{
|
||||||
if (childset.size() > 0 && mode != Bracket)
|
if (childset.size() > 0 && type != nrex_group_bracket)
|
||||||
{
|
{
|
||||||
length = -1;
|
length = -1;
|
||||||
}
|
}
|
||||||
|
@ -391,7 +428,7 @@ struct nrex_node_group : public nrex_node
|
||||||
{
|
{
|
||||||
node->parent = this;
|
node->parent = this;
|
||||||
node->previous = back;
|
node->previous = back;
|
||||||
if (back && mode != Bracket)
|
if (back && type != nrex_group_bracket)
|
||||||
{
|
{
|
||||||
back->next = node;
|
back->next = node;
|
||||||
}
|
}
|
||||||
|
@ -399,7 +436,7 @@ struct nrex_node_group : public nrex_node
|
||||||
{
|
{
|
||||||
childset.push(node);
|
childset.push(node);
|
||||||
}
|
}
|
||||||
if (mode != Bracket)
|
if (type != nrex_group_bracket)
|
||||||
{
|
{
|
||||||
increment_length(node->length);
|
increment_length(node->length);
|
||||||
}
|
}
|
||||||
|
@ -418,7 +455,7 @@ struct nrex_node_group : public nrex_node
|
||||||
{
|
{
|
||||||
childset.pop();
|
childset.pop();
|
||||||
}
|
}
|
||||||
if (mode != Bracket)
|
if (type != nrex_group_bracket)
|
||||||
{
|
{
|
||||||
increment_length(old->length, true);
|
increment_length(old->length, true);
|
||||||
}
|
}
|
||||||
|
@ -436,7 +473,7 @@ struct nrex_node_group : public nrex_node
|
||||||
{
|
{
|
||||||
childset.pop();
|
childset.pop();
|
||||||
}
|
}
|
||||||
if (mode != Bracket)
|
if (type != nrex_group_bracket)
|
||||||
{
|
{
|
||||||
increment_length(old->length, true);
|
increment_length(old->length, true);
|
||||||
}
|
}
|
||||||
|
@ -887,6 +924,12 @@ struct nrex_node_quantifier : public nrex_node
|
||||||
}
|
}
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual int test_parent(nrex_search* s, int pos) const
|
||||||
|
{
|
||||||
|
s->complete = false;
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct nrex_node_anchor : public nrex_node
|
struct nrex_node_anchor : public nrex_node
|
||||||
|
@ -986,7 +1029,7 @@ bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack)
|
||||||
{
|
{
|
||||||
for (unsigned int i = 0; i < stack.size(); i++)
|
for (unsigned int i = 0; i < stack.size(); i++)
|
||||||
{
|
{
|
||||||
if (stack[i]->mode == nrex_node_group::LookBehind)
|
if (stack[i]->type == nrex_group_look_behind)
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -996,12 +1039,14 @@ bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack)
|
||||||
|
|
||||||
nrex::nrex()
|
nrex::nrex()
|
||||||
: _capturing(0)
|
: _capturing(0)
|
||||||
|
, _lookahead_depth(0)
|
||||||
, _root(NULL)
|
, _root(NULL)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
nrex::nrex(const nrex_char* pattern, int captures)
|
nrex::nrex(const nrex_char* pattern, int captures)
|
||||||
: _capturing(0)
|
: _capturing(0)
|
||||||
|
, _lookahead_depth(0)
|
||||||
, _root(NULL)
|
, _root(NULL)
|
||||||
{
|
{
|
||||||
compile(pattern, captures);
|
compile(pattern, captures);
|
||||||
|
@ -1023,6 +1068,7 @@ bool nrex::valid() const
|
||||||
void nrex::reset()
|
void nrex::reset()
|
||||||
{
|
{
|
||||||
_capturing = 0;
|
_capturing = 0;
|
||||||
|
_lookahead_depth = 0;
|
||||||
if (_root)
|
if (_root)
|
||||||
{
|
{
|
||||||
NREX_DELETE(_root);
|
NREX_DELETE(_root);
|
||||||
|
@ -1042,9 +1088,10 @@ int nrex::capture_size() const
|
||||||
bool nrex::compile(const nrex_char* pattern, int captures)
|
bool nrex::compile(const nrex_char* pattern, int captures)
|
||||||
{
|
{
|
||||||
reset();
|
reset();
|
||||||
nrex_node_group* root = NREX_NEW(nrex_node_group(_capturing));
|
nrex_node_group* root = NREX_NEW(nrex_node_group(nrex_group_capture, _capturing));
|
||||||
nrex_array<nrex_node_group*> stack;
|
nrex_array<nrex_node_group*> stack;
|
||||||
stack.push(root);
|
stack.push(root);
|
||||||
|
unsigned int lookahead_level = 0;
|
||||||
_root = root;
|
_root = root;
|
||||||
|
|
||||||
for (const nrex_char* c = pattern; c[0] != '\0'; ++c)
|
for (const nrex_char* c = pattern; c[0] != '\0'; ++c)
|
||||||
|
@ -1056,22 +1103,26 @@ bool nrex::compile(const nrex_char* pattern, int captures)
|
||||||
if (c[2] == ':')
|
if (c[2] == ':')
|
||||||
{
|
{
|
||||||
c = &c[2];
|
c = &c[2];
|
||||||
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture));
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture));
|
||||||
stack.top()->add_child(group);
|
stack.top()->add_child(group);
|
||||||
stack.push(group);
|
stack.push(group);
|
||||||
}
|
}
|
||||||
else if (c[2] == '!' || c[2] == '=')
|
else if (c[2] == '!' || c[2] == '=')
|
||||||
{
|
{
|
||||||
c = &c[2];
|
c = &c[2];
|
||||||
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookAhead));
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_ahead, lookahead_level++));
|
||||||
group->negate = (c[0] == '!');
|
group->negate = (c[0] == '!');
|
||||||
stack.top()->add_child(group);
|
stack.top()->add_child(group);
|
||||||
stack.push(group);
|
stack.push(group);
|
||||||
|
if (lookahead_level > _lookahead_depth)
|
||||||
|
{
|
||||||
|
_lookahead_depth = lookahead_level;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (c[2] == '<' && (c[3] == '!' || c[3] == '='))
|
else if (c[2] == '<' && (c[3] == '!' || c[3] == '='))
|
||||||
{
|
{
|
||||||
c = &c[3];
|
c = &c[3];
|
||||||
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookBehind));
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_behind));
|
||||||
group->negate = (c[0] == '!');
|
group->negate = (c[0] == '!');
|
||||||
stack.top()->add_child(group);
|
stack.top()->add_child(group);
|
||||||
stack.push(group);
|
stack.push(group);
|
||||||
|
@ -1083,13 +1134,13 @@ bool nrex::compile(const nrex_char* pattern, int captures)
|
||||||
}
|
}
|
||||||
else if (captures >= 0 && _capturing < captures)
|
else if (captures >= 0 && _capturing < captures)
|
||||||
{
|
{
|
||||||
nrex_node_group* group = NREX_NEW(nrex_node_group(++_capturing));
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_capture, ++_capturing));
|
||||||
stack.top()->add_child(group);
|
stack.top()->add_child(group);
|
||||||
stack.push(group);
|
stack.push(group);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture));
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture));
|
||||||
stack.top()->add_child(group);
|
stack.top()->add_child(group);
|
||||||
stack.push(group);
|
stack.push(group);
|
||||||
}
|
}
|
||||||
|
@ -1098,6 +1149,10 @@ bool nrex::compile(const nrex_char* pattern, int captures)
|
||||||
{
|
{
|
||||||
if (stack.size() > 1)
|
if (stack.size() > 1)
|
||||||
{
|
{
|
||||||
|
if (stack.top()->type == nrex_group_look_ahead)
|
||||||
|
{
|
||||||
|
--lookahead_level;
|
||||||
|
}
|
||||||
stack.pop();
|
stack.pop();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -1107,7 +1162,7 @@ bool nrex::compile(const nrex_char* pattern, int captures)
|
||||||
}
|
}
|
||||||
else if (c[0] == '[')
|
else if (c[0] == '[')
|
||||||
{
|
{
|
||||||
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::Bracket));
|
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_bracket));
|
||||||
stack.top()->add_child(group);
|
stack.top()->add_child(group);
|
||||||
if (c[1] == '^')
|
if (c[1] == '^')
|
||||||
{
|
{
|
||||||
|
@ -1410,7 +1465,7 @@ bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int en
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
nrex_search s(str, captures);
|
nrex_search s(str, captures, _lookahead_depth);
|
||||||
if (end >= offset)
|
if (end >= offset)
|
||||||
{
|
{
|
||||||
s.end = end;
|
s.end = end;
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
// NREX: Node RegEx
|
// NREX: Node RegEx
|
||||||
// Version 0.1
|
// Version 0.2
|
||||||
//
|
//
|
||||||
// Copyright (c) 2015, Zher Huei Lee
|
// Copyright (c) 2015-2016, Zher Huei Lee
|
||||||
// All rights reserved.
|
// All rights reserved.
|
||||||
//
|
//
|
||||||
// This software is provided 'as-is', without any express or implied
|
// This software is provided 'as-is', without any express or implied
|
||||||
|
@ -57,7 +57,8 @@ class nrex_node;
|
||||||
class nrex
|
class nrex
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
int _capturing;
|
unsigned int _capturing;
|
||||||
|
unsigned int _lookahead_depth;
|
||||||
nrex_node* _root;
|
nrex_node* _root;
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue