mattbucknall / subreg Goto Github PK

View Code? Open in Web Editor NEW

28.0 28.0 3.0 66 KB

Small footprint regular expression engine written in ANSI C - no dependencies.

License: MIT License

C 67.58% CMake 2.00% C++ 30.42%

subreg's People

Contributors

Stargazers

Watchers

Forkers

boohyung tdahlmann

subreg's Issues

regex="(\D)+" doesn't work (bug fix solution)

To fix this bug, please change subreg.c's parse_literal(state_t* state)'s following code:

      case 'D':   result = invert_match(match_digit(c));   break;
      case 'H':   result = invert_match(match_hexadecimal(c)); break;
      case 'S':   result = invert_match(match_whitespace(c)); break;
      case 'W':   result = invert_match(match_word(c));  break;

... to this:

      case 'D':   result = invert_match(match_digit(c));  if (!c) result = 0;    break;
      case 'H':   result = invert_match(match_hexadecimal(c)); if (!c) result = 0;  break;
      case 'S':   result = invert_match(match_whitespace(c)); if (!c) result = 0;    break;
      case 'W':   result = invert_match(match_word(c));  if (!c) result = 0;     break;

regex="(.)H(.)e(.)l(.)l(.)o(.)" doesn't capture input="1H2e3l4l5o6"

This issue is because the program currently tries to match only the longest possible string for repetition blocks: (*) and (+). To fix this problem, we need to try every possible count for each repetition. in subreg.c, please add:

typedef struct Regex_repetition_history Regex_repetition_history;

struct Regex_repetition_history
{  const char* regex_position;
   char block_type; /* ?, *, or + */
   int cur_count;
};

static Regex_repetition_history regex_repetition_history_list[MAX_CAPTURE];
static int regex_repetition_history_count = 0;

Then add the function:

static int get_history_index(const char* regex_position)
{
   for (int i = 0; i < regex_repetition_history_count; i++)
   {  if (regex_repetition_history_list[i].regex_position == regex_position)
      {  return i;
      }
   }
   return -1;
}

Then replace the function static int parse_repetition(state_t* state) with the following:

static int parse_repetition(state_t* state)
{
   const char* regex_begin;
   const char* regex_end;
   const char* check_point;
   int result;
   char rc;
   char is_new = 0;
   regex_begin = state->regex;
   check_point = state->input; /* ADD !!! */

   int history_index = get_history_index(regex_begin);
   /* ()? block was matched in the previous run, so this time we skip it */
   if (history_index >= 0 && regex_repetition_history_list[history_index].cur_count == 0)
   {  result = skip_block(state);
      return result;
   }

   result = parse_literal(state);
   if ( is_bad_result(result) || is_end(state->regex[0]) ) return result;

   rc = state->regex[0];

   /* block type match checking */
   if (history_index >= 0 && regex_repetition_history_list[history_index].block_type != rc)
   {  //printf("Error: block type mismatch\n");
      return -10;
   }

   /* add the new ()?, ()+, ()* block to the repeitition list */
   if (history_index == -1 && (rc == '?' || rc == '+' || rc == '*'))
   {  regex_repetition_history_list[regex_repetition_history_count].regex_position = regex_begin;
      regex_repetition_history_list[regex_repetition_history_count].block_type = rc;
      history_index = regex_repetition_history_count;
      regex_repetition_history_count++;
      is_new = 1;
   }

   if ( rc == '?' )
   {     
      if ( !is_match_result(result) ) /* ADD !!! */
      {  state->input = check_point;
         regex_repetition_history_list[regex_repetition_history_count].cur_count = 0;
      }
      else
         regex_repetition_history_list[regex_repetition_history_count].cur_count = 1;
      state->regex++;
      return SUBREG_RESULT_INTERNAL_MATCH;
   }
   else if ( rc == '+' )
   {
      if ( !is_match_result(result) )
      {
         regex_repetition_history_list[regex_repetition_history_count].cur_count = 0;
         return SUBREG_RESULT_NO_MATCH;
      }
   }
   else if ( rc == '*' )
   {
      if ( !is_match_result(result) )
      {
         state->input = check_point; /* ADD !!! */
         state->regex++;
         regex_repetition_history_list[regex_repetition_history_count].cur_count = 0;
         return SUBREG_RESULT_INTERNAL_MATCH;
      }
   }
   else return result;

   regex_end = state->regex + 1;

   int cur_count = 1;

   if (is_new || regex_repetition_history_list[history_index].cur_count == INT_MAX)
   {  is_new = 1;
      regex_repetition_history_list[history_index].cur_count = 1;
   }
   while (is_new || cur_count < regex_repetition_history_list[history_index].cur_count)
   {
      state->regex = regex_begin;
      check_point = state->input;
      result = parse_literal(state);
      if ( is_bad_result(result) ) return result;

      if ( !is_match_result(result) ) { state->input = check_point; break;}
      cur_count++;
      if (is_new)
         regex_repetition_history_list[history_index].cur_count++;
   }
   state->regex = regex_end;
   return SUBREG_RESULT_INTERNAL_MATCH;
}

Then finally, in function int subreg_match(), after calling result = parse_expr(&state);, add the following:

   while (!result)
   {
      char is_decreased = 0;

      for (int i = regex_repetition_history_count - 1; i >= 0; i--)
         if (regex_repetition_history_list[i].cur_count == INT_MAX)
            regex_repetition_history_list[i].cur_count = 0;

      for (int i = regex_repetition_history_count - 1; i >= 0; i--)
      {  if (regex_repetition_history_list[i].cur_count > 0)
         {  regex_repetition_history_list[i].cur_count--;
            is_decreased = 1;
            break;
         }
         else
         {  regex_repetition_history_list[i].cur_count = INT_MAX;
         }
      }
      if (!is_decreased)
         break;
      state.capture_index = 1;
      state.depth = 0;
      state.regex = regex;
      state.input = input;

      result = parse_expr(&state);
   }

BUG: Regex "B(AAC)?AAD" can't capture input "BAAD"

This error for (* and ?) occurs because the program doesn't rewind the input when * or ? doesn't match initially. So we need checkpoints not only for the regex, also for the input.

To fix this, inside function static int parse_repetition(state_t* state), at at the very beginning:

const char* check_point = state->input;

and inside the if ( rc == '?' ) block, add:

      if ( !is_match_result(result) ) /* ADD !!! */
         state->input = check_point;

and inside the else if ( rc == '*' ) { if ( !is_match_result(result) ) block, add:

state->input = check_point;

Adding support for range matches such as [0-9a-z] (bug fix solution)

Hi,

I'm a PhD student who is trying to leverage your subreg library to build a regex app in SGX. I think your library is great because it supports capture groups. However, I realized your library doesn't allow the regular expression syntax using square brackets, such as [0-9a-z]. I wonder if the range matches like this is innately not supported by your code.... If so, should we implement it by ourselves?

Non-consuming regular expression (?=) and negating regular expression (?!)

Hi,

I have another question. I have to implement negation, union, and intersection in Javascript. We can do union by doing something like (A|B), which is supported by subreg. However, subreg doesn't seem to support non-consuming regex (?=) and negating regex (?!). Am I correct? If so, I will implement it share my proposed code with you. I have to start working on it today, so please let me know if you see this message. Thanks.

^ $, and Capture group indexing

Hi,

I have miscellaneous issues to discuss.

^ and $ denote the beginning and end of a match. However, in subreg, all inputs seem to be implicitly treated to have ^ at the beginning and $ at the end. I think it might be better if we let each match to be a substring of an input like Javascript does. In this case, we could have extra capture groups that match the prefix & postfix from an input that don't belong to the match (so that we don't lose any match information).
In subreg, the capture group indexing seems to be little confusing in case of ()+ and ()*. Subreg counts each instance of such repetitive match to be a unique capture group. However, Javascript treatd only the last match of a repetitive match to be the final capture group of that repetitive bracket. I think it might be good to follow the Javascript convention.

I added a bunch of code, and because they are now complex I can't describe patches by text... But please feel free to refer to my test cases that support the above two features:

./subreg-main "(?:(.)*|BYE)" "BYE"
Capture Group [0]: BYE
Capture Group [1]: E


./subreg-main "(?:^(.)$|BYE)" "BYE"
Capture Group [0]: BYE


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "BYE"
Capture Group [0]: BYE


./subreg-main "(^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "BYE"
Capture Group [0]: BYE
Capture Group [1]: BYE


./subreg-main "(^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "HELLO"
Capture Group [0]: HELLO
Capture Group [1]: HELLO


./subreg-main "(.)*H(.)*E(.)*L(.)*L(.)*O(.)*" "HELLO"
Capture Group [0]: HELLO


./subreg-main "(^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "HELLO"
Capture Group [0]: HELLO
Capture Group [1]: HELLO


./subreg-main "(^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "BYE"
Capture Group [0]: BYE
Capture Group [1]: BYE


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "BYE"
Capture Group [0]: BYE


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "HELLO"
Capture Group [0]: HELLO


./subreg-main "((HI)|(BYE))+" "HIBYEHI"
Capture Group [0]: HIBYEHI
Capture Group [1]: HI
Capture Group [2]: HI
Capture Group [3]: BYE


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "H...H...E...L...L...O..."
Capture Group [0]: H...H...E...L...L...O...
Capture Group [1]: .
Capture Group [2]: .
Capture Group [3]: .
Capture Group [4]: .
Capture Group [5]: .
Capture Group [6]: .


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "..H...H...E...L...L...O..."
Capture Group [0]: ..H...H...E...L...L...O...
Capture Group [1]: .
Capture Group [2]: .
Capture Group [3]: .
Capture Group [4]: .
Capture Group [5]: .
Capture Group [6]: .



./subreg-main "Verizon" "Verizon"
Capture Group [0]: Verizon


./subreg-main "Verizon" "aaaVerizon"
Capture Group [0]: Verizon
Capture Group [BEFORE]: aaa


./subreg-main "Verizon" "Verizonbbb"
Capture Group [0]: Verizon
Capture Group [AFTER]: bbb


./subreg-main "Verizon" "aaaVerizonbbb"
Capture Group [0]: Verizon
Capture Group [BEFORE]: aaa
Capture Group [AFTER]: bbb

./subreg-main '((((?!H))*)*)' 'H'
Capture Groups = 0
Capture Group [AFTER]: H

./subreg-main '(^(?:(?![a-z]).)*[0-9]$)(.*)' 'A9'
Capture Group [0]: A9
Capture Group [1]: A9

./subreg-main '^(?:(?!(?:a)+).)*$' 'A9'
Capture Group [0]: A9


./subreg-main '((^(?:(?!(?:[a-z])+).)*$)(.*))(.*)' 'A9'
Capture Group [0]: A9
Capture Group [1]: A9
Capture Group [2]: A9

MSVC signed/unsigned conflict between state_t.max_depth and state_t.depth

changing max_depth to int instead of unsigned int fixes the warning, nothing serious but might change for clean compile in the future.

typedef struct
{
    const char* regex;
    const char* input;
    subreg_capture_t* captures;
    unsigned int max_captures;
    /*unsigned*/ int max_depth;
    unsigned int capture_index;
    int depth;
    
} state_t;

BUG: regex="(ab|cd)+c" doesn't match input="abc"

The way to fix it is in function static int parse_repetition(state_t* state), change the for-loop block

   for (;;)
   {  
      state->regex = regex_begin;
      result = parse_literal(state);
      if ( is_bad_result(result) ) return result;
      
      if ( !is_match_result(result) ) break;
   }

to the following:

   for (;;)
   {  
      state->regex = regex_begin;
      const char* input_checkpoint = state->input;
      result = parse_literal(state);
      if ( is_bad_result(result) ) return result;
      
      if ( !is_match_result(result) ) { state->input = input_checkpoint; break;}
   }

mattbucknall / subreg Goto Github PK

subreg's People

Contributors

Stargazers

Watchers

Forkers

subreg's Issues

regex="(\D)+" doesn't work (bug fix solution)

regex="(.)H(.)e(.)l(.)l(.)o(.)" doesn't capture input="1H2e3l4l5o6"

BUG: Regex "B(AAC)?AAD" can't capture input "BAAD"

Adding support for range matches such as [0-9a-z] (bug fix solution)

Non-consuming regular expression (?=) and negating regular expression (?!)

^ $, and Capture group indexing

MSVC signed/unsigned conflict between state_t.max_depth and state_t.depth

BUG: regex="(ab|cd)+c" doesn't match input="abc"

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent