From d0ddf150d9c7207f87409b5899297dae0fc0b708 Mon Sep 17 00:00:00 2001 From: Zher Huei Lee Date: Fri, 4 Dec 2015 21:18:41 +0000 Subject: [PATCH 1/2] updated the RegEx library nrex to v0.1 After implementing unit testing to nrex I caught and fixed some errors so it should behave more like Python's RegEx In addition, I've added version numbering so it should be able to tell if the library needs updating. Here are a list of changes: - Fixed zero count quantifiers failing. - Fixed infinite recursion if quantifying zero length token. - Fixed `$` (as a string pattern on its own) not matching. - Fixed look behind rewinding beyond the start of the string. - Added support for alternative back reference format `\g{1}` similar to Python. This allows digits to be used immediately after back references. - Number of capture groups are still limited to 9 by default but can now be manually set, with option for no limit at all. (Python has no limit) - Curly bracket quantifiers `{0}` no longer interpreted as a literal string if previous token is not quantifiable. (Python behaviour) --- drivers/nrex/README.md | 4 +- drivers/nrex/nrex.cpp | 101 +++++++++++++++++++++++++++-------------- drivers/nrex/nrex.hpp | 41 ++++++++++++++--- drivers/nrex/regex.cpp | 6 +-- drivers/nrex/regex.h | 2 +- 5 files changed, 108 insertions(+), 46 deletions(-) diff --git a/drivers/nrex/README.md b/drivers/nrex/README.md index 951b301c1ed..9ff67992dca 100644 --- a/drivers/nrex/README.md +++ b/drivers/nrex/README.md @@ -1,5 +1,7 @@ # NREX: Node RegEx +Version 0.1 + Small node-based regular expression library. It only does text pattern matchhing, not replacement. To use add the files `nrex.hpp`, `nrex.cpp` and `nrex_config.h` to your project and follow the example: @@ -32,7 +34,7 @@ Currently supported features: * Unicode `\uFFFF` code points * Positive `(?=)` and negative `(?!)` lookahead * Positive `(?<=)` and negative `(?test(s, pos - offset); @@ -450,7 +459,7 @@ struct nrex_node_char : public nrex_node int test(nrex_search* s, int pos) const { - if (s->end == pos || s->at(pos) != ch) + if (s->end <= pos || 0 > pos || s->at(pos) != ch) { return -1; } @@ -473,7 +482,7 @@ struct nrex_node_range : public nrex_node int test(nrex_search* s, int pos) const { - if (s->end == pos) + if (s->end <= pos || 0 > pos) { return -1; } @@ -555,7 +564,7 @@ struct nrex_node_class : public nrex_node int test(nrex_search* s, int pos) const { - if (s->end == pos) + if (s->end <= pos || 0 > pos) { return -1; } @@ -727,7 +736,7 @@ struct nrex_node_shorthand : public nrex_node int test(nrex_search* s, int pos) const { - if (s->end == pos) + if (s->end <= pos || 0 > pos) { return -1; } @@ -811,16 +820,12 @@ struct nrex_node_quantifier : public nrex_node int test(nrex_search* s, int pos) const { - return test_step(s, pos, 1); + return test_step(s, pos, 0, pos); } - int test_step(nrex_search* s, int pos, int level) const + int test_step(nrex_search* s, int pos, int level, int start) const { - if (max == 0) - { - return pos; - } - if ((max >= 1 && level > max) || pos > s->end) + if (pos > s->end) { return -1; } @@ -840,14 +845,26 @@ struct nrex_node_quantifier : public nrex_node return res; } } - int res = child->test(s, pos); - if (s->complete) + if (max >= 0 && level > max) { - return res; + return -1; + } + if (level > 1 && level > min + 1 && pos == start) + { + return -1; + } + int res = pos; + if (level >= 1) + { + res = child->test(s, pos); + if (s->complete) + { + return res; + } } if (res >= 0) { - int res_step = test_step(s, res, level + 1); + int res_step = test_step(s, res, level + 1, start); if (res_step >= 0) { return res_step; @@ -983,6 +1000,13 @@ nrex::nrex() { } +nrex::nrex(const nrex_char* pattern, int captures) + : _capturing(0) + , _root(NULL) +{ + compile(pattern, captures); +} + nrex::~nrex() { if (_root) @@ -1008,10 +1032,14 @@ void nrex::reset() int nrex::capture_size() const { - return _capturing + 1; + if (_root) + { + return _capturing + 1; + } + return 0; } -bool nrex::compile(const nrex_char* pattern, bool extended) +bool nrex::compile(const nrex_char* pattern, int captures) { reset(); nrex_node_group* root = NREX_NEW(nrex_node_group(_capturing)); @@ -1053,7 +1081,7 @@ bool nrex::compile(const nrex_char* pattern, bool extended) NREX_COMPILE_ERROR("unrecognised qualifier for group"); } } - else if ((!extended && _capturing < 9) || (extended && _capturing < 99)) + else if (captures >= 0 && _capturing < captures) { nrex_node_group* group = NREX_NEW(nrex_node_group(++_capturing)); stack.top()->add_child(group); @@ -1190,15 +1218,6 @@ bool nrex::compile(const nrex_char* pattern, bool extended) } else if (nrex_is_quantifier(c[0])) { - if (stack.top()->back == NULL || !stack.top()->back->quantifiable) - { - if (c[0] == '{') - { - stack.top()->add_child(NREX_NEW(nrex_node_char('{'))); - continue; - } - NREX_COMPILE_ERROR("element not quantifiable"); - } int min = 0; int max = -1; bool valid_quantifier = true; @@ -1270,6 +1289,10 @@ bool nrex::compile(const nrex_char* pattern, bool extended) } if (valid_quantifier) { + if (stack.top()->back == NULL || !stack.top()->back->quantifiable) + { + NREX_COMPILE_ERROR("element not quantifiable"); + } nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier(min, max)); if (min == max) { @@ -1323,20 +1346,26 @@ bool nrex::compile(const nrex_char* pattern, bool extended) stack.top()->add_child(NREX_NEW(nrex_node_shorthand(c[1]))); ++c; } - else if ('1' <= c[1] && c[1] <= '9') + else if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) { int ref = 0; - if (extended && '0' <= c[2] && c[2] <= '9') + bool unclosed = false; + if (c[1] == 'g') { - ref = int(c[1] - '0') * 10 + int(c[2] - '0'); + unclosed = true; c = &c[2]; } - else + while ('0' <= c[1] && c[1] <= '9') { - ref = int(c[1] - '0'); + ref = ref * 10 + int(c[1] - '0'); ++c; } - if (ref > _capturing) + if (c[1] == '}') + { + unclosed = false; + ++c; + } + if (ref > _capturing || ref <= 0 || unclosed) { NREX_COMPILE_ERROR("backreference to non-existent capture"); } @@ -1377,6 +1406,10 @@ bool nrex::compile(const nrex_char* pattern, bool extended) bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int end) const { + if (!_root) + { + return false; + } nrex_search s(str, captures); if (end >= offset) { @@ -1386,7 +1419,7 @@ bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int en { s.end = NREX_STRLEN(str); } - for (int i = offset; i < s.end; ++i) + for (int i = offset; i <= s.end; ++i) { for (int c = 0; c <= _capturing; ++c) { diff --git a/drivers/nrex/nrex.hpp b/drivers/nrex/nrex.hpp index e26a61c39ae..44e950c5178 100644 --- a/drivers/nrex/nrex.hpp +++ b/drivers/nrex/nrex.hpp @@ -1,4 +1,5 @@ // NREX: Node RegEx +// Version 0.1 // // Copyright (c) 2015, Zher Huei Lee // All rights reserved. @@ -59,7 +60,32 @@ class nrex int _capturing; nrex_node* _root; public: + + /*! + * \brief Initialises an empty regex container + */ nrex(); + + /*! + * \brief Initialises and compiles the regex pattern + * + * This calls nrex::compile() with the same arguments. To check whether + * the compilation was successfull, use nrex::valid(). + * + * If the NREX_THROW_ERROR was defined it would automatically throw a + * runtime error nrex_compile_error if it encounters a problem when + * parsing the pattern. + * + * \param pattern The regex pattern + * \param captures The maximum number of capture groups to allow. Any + * extra would be converted to non-capturing groups. + * If negative, no limit would be imposed. Defaults + * to 9. + * + * \see nrex::compile() + */ + nrex(const nrex_char* pattern, int captures = 9); + ~nrex(); /*! @@ -78,9 +104,9 @@ class nrex * * This is used to provide the array size of the captures needed for * nrex::match() to work. The size is actually the number of capture - * groups + one for the matching of the entire pattern. The result is - * always capped at 10 or 100, depending on the extend option given in - * nrex::compile() (default 10). + * groups + one for the matching of the entire pattern. This can be + * capped using the extra argument given in nrex::compile() + * (default 10). * * \return The number of captures */ @@ -97,12 +123,13 @@ class nrex * parsing the pattern. * * \param pattern The regex pattern - * \param extended If true, raises the limit on number of capture - * groups and back-references to 99. Otherwise limited - * to 9. Defaults to false. + * \param captures The maximum number of capture groups to allow. Any + * extra would be converted to non-capturing groups. + * If negative, no limit would be imposed. Defaults + * to 9. * \return True if the pattern was succesfully compiled */ - bool compile(const nrex_char* pattern, bool extended = false); + bool compile(const nrex_char* pattern, int captures = 9); /*! * \brief Uses the pattern to search through the provided string diff --git a/drivers/nrex/regex.cpp b/drivers/nrex/regex.cpp index 246384b10a6..e8578221a99 100644 --- a/drivers/nrex/regex.cpp +++ b/drivers/nrex/regex.cpp @@ -15,7 +15,7 @@ void RegEx::_bind_methods() { - ObjectTypeDB::bind_method(_MD("compile","pattern", "expanded"),&RegEx::compile, DEFVAL(true)); + ObjectTypeDB::bind_method(_MD("compile","pattern", "capture"),&RegEx::compile, DEFVAL(9)); ObjectTypeDB::bind_method(_MD("find","text","start","end"),&RegEx::find, DEFVAL(0), DEFVAL(-1)); ObjectTypeDB::bind_method(_MD("clear"),&RegEx::clear); ObjectTypeDB::bind_method(_MD("is_valid"),&RegEx::is_valid); @@ -68,11 +68,11 @@ String RegEx::get_capture(int capture) const { } -Error RegEx::compile(const String& p_pattern, bool expanded) { +Error RegEx::compile(const String& p_pattern, int capture) { clear(); - exp.compile(p_pattern.c_str(), expanded); + exp.compile(p_pattern.c_str(), capture); ERR_FAIL_COND_V( !exp.valid(), FAILED ); diff --git a/drivers/nrex/regex.h b/drivers/nrex/regex.h index be52da81495..76aab2aea65 100644 --- a/drivers/nrex/regex.h +++ b/drivers/nrex/regex.h @@ -36,7 +36,7 @@ public: bool is_valid() const; int get_capture_count() const; String get_capture(int capture) const; - Error compile(const String& p_pattern, bool expanded = false); + Error compile(const String& p_pattern, int capture = 9); int find(const String& p_text, int p_start = 0, int p_end = -1) const; RegEx(); From d957749179f7310995e994b45df72291a57bd0d7 Mon Sep 17 00:00:00 2001 From: Zher Huei Lee Date: Fri, 4 Dec 2015 22:19:53 +0000 Subject: [PATCH 2/2] updated nrex documentation --- doc/base/classes.xml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/base/classes.xml b/doc/base/classes.xml index fd5df945bab..1b1f1d9f3f8 100644 --- a/doc/base/classes.xml +++ b/doc/base/classes.xml @@ -26945,7 +26945,7 @@ This method controls whether the position between two cached points is interpola Lazy (non-greedy) quantifiers [code]*?[/code] Begining [code]^[/code] and end [code]$[/code] anchors Alternation [code]|[/code] - Backreferences [code]\1[/code] to [code]\9[/code] + Backreferences [code]\1[/code] and [code]\g{1}[/code] POSIX character classes [code][[:alnum:]][/code] Lookahead [code](?=)[/code], [code](?!)[/code] and lookbehind [code](?<=)[/code], [code](?<!)[/code] ASCII [code]\xFF[/code] and Unicode [code]\uFFFF[/code] code points (in a style similar to Python) @@ -26957,9 +26957,10 @@ This method controls whether the position between two cached points is interpola - + + Compiles and assign the regular expression pattern to use. The limit on the number of capturing groups can be specified or made unlimited if negative.