Giter Club home page Giter Club logo

subreg's People

Contributors

mattbucknall avatar

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar  avatar

subreg's Issues

regex="(\D)+" doesn't work (bug fix solution)

To fix this bug, please change subreg.c's parse_literal(state_t* state)'s following code:

      case 'D':   result = invert_match(match_digit(c));   break;
      case 'H':   result = invert_match(match_hexadecimal(c)); break;
      case 'S':   result = invert_match(match_whitespace(c)); break;
      case 'W':   result = invert_match(match_word(c));  break;

... to this:

      case 'D':   result = invert_match(match_digit(c));  if (!c) result = 0;    break;
      case 'H':   result = invert_match(match_hexadecimal(c)); if (!c) result = 0;  break;
      case 'S':   result = invert_match(match_whitespace(c)); if (!c) result = 0;    break;
      case 'W':   result = invert_match(match_word(c));  if (!c) result = 0;     break;

regex="(.*)H(.*)e(.*)l(.*)l(.*)o(.*)" doesn't capture input="1H2e3l4l5o6"

This issue is because the program currently tries to match only the longest possible string for repetition blocks: (*) and (+). To fix this problem, we need to try every possible count for each repetition. in subreg.c, please add:

typedef struct Regex_repetition_history Regex_repetition_history;

struct Regex_repetition_history
{  const char* regex_position;
   char block_type; /* ?, *, or + */
   int cur_count;
};

static Regex_repetition_history regex_repetition_history_list[MAX_CAPTURE];
static int regex_repetition_history_count = 0;

Then add the function:

static int get_history_index(const char* regex_position)
{
   for (int i = 0; i < regex_repetition_history_count; i++)
   {  if (regex_repetition_history_list[i].regex_position == regex_position)
      {  return i;
      }
   }
   return -1;
}

Then replace the function static int parse_repetition(state_t* state) with the following:

static int parse_repetition(state_t* state)
{
   const char* regex_begin;
   const char* regex_end;
   const char* check_point;
   int result;
   char rc;
   char is_new = 0;
   regex_begin = state->regex;
   check_point = state->input; /* ADD !!! */

   int history_index = get_history_index(regex_begin);
   /* ()? block was matched in the previous run, so this time we skip it */
   if (history_index >= 0 && regex_repetition_history_list[history_index].cur_count == 0)
   {  result = skip_block(state);
      return result;
   }

   result = parse_literal(state);
   if ( is_bad_result(result) || is_end(state->regex[0]) ) return result;

   rc = state->regex[0];

   /* block type match checking */
   if (history_index >= 0 && regex_repetition_history_list[history_index].block_type != rc)
   {  //printf("Error: block type mismatch\n");
      return -10;
   }

   /* add the new ()?, ()+, ()* block to the repeitition list */
   if (history_index == -1 && (rc == '?' || rc == '+' || rc == '*'))
   {  regex_repetition_history_list[regex_repetition_history_count].regex_position = regex_begin;
      regex_repetition_history_list[regex_repetition_history_count].block_type = rc;
      history_index = regex_repetition_history_count;
      regex_repetition_history_count++;
      is_new = 1;
   }

   if ( rc == '?' )
   {     
      if ( !is_match_result(result) ) /* ADD !!! */
      {  state->input = check_point;
         regex_repetition_history_list[regex_repetition_history_count].cur_count = 0;
      }
      else
         regex_repetition_history_list[regex_repetition_history_count].cur_count = 1;
      state->regex++;
      return SUBREG_RESULT_INTERNAL_MATCH;
   }
   else if ( rc == '+' )
   {
      if ( !is_match_result(result) )
      {
         regex_repetition_history_list[regex_repetition_history_count].cur_count = 0;
         return SUBREG_RESULT_NO_MATCH;
      }
   }
   else if ( rc == '*' )
   {
      if ( !is_match_result(result) )
      {
         state->input = check_point; /* ADD !!! */
         state->regex++;
         regex_repetition_history_list[regex_repetition_history_count].cur_count = 0;
         return SUBREG_RESULT_INTERNAL_MATCH;
      }
   }
   else return result;

   regex_end = state->regex + 1;

   int cur_count = 1;

   if (is_new || regex_repetition_history_list[history_index].cur_count == INT_MAX)
   {  is_new = 1;
      regex_repetition_history_list[history_index].cur_count = 1;
   }
   while (is_new || cur_count < regex_repetition_history_list[history_index].cur_count)
   {
      state->regex = regex_begin;
      check_point = state->input;
      result = parse_literal(state);
      if ( is_bad_result(result) ) return result;

      if ( !is_match_result(result) ) { state->input = check_point; break;}
      cur_count++;
      if (is_new)
         regex_repetition_history_list[history_index].cur_count++;
   }
   state->regex = regex_end;
   return SUBREG_RESULT_INTERNAL_MATCH;
}

Then finally, in function int subreg_match(), after calling result = parse_expr(&state);, add the following:

   while (!result)
   {
      char is_decreased = 0;

      for (int i = regex_repetition_history_count - 1; i >= 0; i--)
         if (regex_repetition_history_list[i].cur_count == INT_MAX)
            regex_repetition_history_list[i].cur_count = 0;

      for (int i = regex_repetition_history_count - 1; i >= 0; i--)
      {  if (regex_repetition_history_list[i].cur_count > 0)
         {  regex_repetition_history_list[i].cur_count--;
            is_decreased = 1;
            break;
         }
         else
         {  regex_repetition_history_list[i].cur_count = INT_MAX;
         }
      }
      if (!is_decreased)
         break;
      state.capture_index = 1;
      state.depth = 0;
      state.regex = regex;
      state.input = input;

      result = parse_expr(&state);
   }

BUG: Regex "B(AAC)?AAD" can't capture input "BAAD"

This error for (* and ?) occurs because the program doesn't rewind the input when * or ? doesn't match initially. So we need checkpoints not only for the regex, also for the input.

To fix this, inside function static int parse_repetition(state_t* state), at at the very beginning:

const char* check_point = state->input;

and inside the if ( rc == '?' ) block, add:

      if ( !is_match_result(result) ) /* ADD !!! */
         state->input = check_point;

and inside the else if ( rc == '*' ) { if ( !is_match_result(result) ) block, add:

state->input = check_point;

Adding support for range matches such as [0-9a-z] (bug fix solution)

Hi,

I'm a PhD student who is trying to leverage your subreg library to build a regex app in SGX. I think your library is great because it supports capture groups. However, I realized your library doesn't allow the regular expression syntax using square brackets, such as [0-9a-z]. I wonder if the range matches like this is innately not supported by your code.... If so, should we implement it by ourselves?

Non-consuming regular expression (?=) and negating regular expression (?!)

Hi,

I have another question. I have to implement negation, union, and intersection in Javascript. We can do union by doing something like (A|B), which is supported by subreg. However, subreg doesn't seem to support non-consuming regex (?=) and negating regex (?!). Am I correct? If so, I will implement it share my proposed code with you. I have to start working on it today, so please let me know if you see this message. Thanks.

^ $, and Capture group indexing

Hi,

I have miscellaneous issues to discuss.

  1. ^ and $ denote the beginning and end of a match. However, in subreg, all inputs seem to be implicitly treated to have ^ at the beginning and $ at the end. I think it might be better if we let each match to be a substring of an input like Javascript does. In this case, we could have extra capture groups that match the prefix & postfix from an input that don't belong to the match (so that we don't lose any match information).

  2. In subreg, the capture group indexing seems to be little confusing in case of ()+ and ()*. Subreg counts each instance of such repetitive match to be a unique capture group. However, Javascript treatd only the last match of a repetitive match to be the final capture group of that repetitive bracket. I think it might be good to follow the Javascript convention.

I added a bunch of code, and because they are now complex I can't describe patches by text... But please feel free to refer to my test cases that support the above two features:

./subreg-main "(?:(.)*|BYE)" "BYE"
Capture Group [0]: BYE
Capture Group [1]: E


./subreg-main "(?:^(.)$|BYE)" "BYE"
Capture Group [0]: BYE


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "BYE"
Capture Group [0]: BYE


./subreg-main "(^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "BYE"
Capture Group [0]: BYE
Capture Group [1]: BYE


./subreg-main "(^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "HELLO"
Capture Group [0]: HELLO
Capture Group [1]: HELLO


./subreg-main "(.)*H(.)*E(.)*L(.)*L(.)*O(.)*" "HELLO"
Capture Group [0]: HELLO


./subreg-main "(^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "HELLO"
Capture Group [0]: HELLO
Capture Group [1]: HELLO


./subreg-main "(^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "BYE"
Capture Group [0]: BYE
Capture Group [1]: BYE


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "BYE"
Capture Group [0]: BYE


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "HELLO"
Capture Group [0]: HELLO


./subreg-main "((HI)|(BYE))+" "HIBYEHI"
Capture Group [0]: HIBYEHI
Capture Group [1]: HI
Capture Group [2]: HI
Capture Group [3]: BYE


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "H...H...E...L...L...O..."
Capture Group [0]: H...H...E...L...L...O...
Capture Group [1]: .
Capture Group [2]: .
Capture Group [3]: .
Capture Group [4]: .
Capture Group [5]: .
Capture Group [6]: .


./subreg-main "(?:^(.)*H(.)*E(.)*L(.)*L(.)*O(.)*$|BYE)" "..H...H...E...L...L...O..."
Capture Group [0]: ..H...H...E...L...L...O...
Capture Group [1]: .
Capture Group [2]: .
Capture Group [3]: .
Capture Group [4]: .
Capture Group [5]: .
Capture Group [6]: .



./subreg-main "Verizon" "Verizon"
Capture Group [0]: Verizon


./subreg-main "Verizon" "aaaVerizon"
Capture Group [0]: Verizon
Capture Group [BEFORE]: aaa


./subreg-main "Verizon" "Verizonbbb"
Capture Group [0]: Verizon
Capture Group [AFTER]: bbb


./subreg-main "Verizon" "aaaVerizonbbb"
Capture Group [0]: Verizon
Capture Group [BEFORE]: aaa
Capture Group [AFTER]: bbb

./subreg-main '((((?!H))*)*)' 'H'
Capture Groups = 0
Capture Group [AFTER]: H

./subreg-main '(^(?:(?![a-z]).)*[0-9]$)(.*)' 'A9'
Capture Group [0]: A9
Capture Group [1]: A9

./subreg-main '^(?:(?!(?:a)+).)*$' 'A9'
Capture Group [0]: A9


./subreg-main '((^(?:(?!(?:[a-z])+).)*$)(.*))(.*)' 'A9'
Capture Group [0]: A9
Capture Group [1]: A9
Capture Group [2]: A9

MSVC signed/unsigned conflict between state_t.max_depth and state_t.depth

changing max_depth to int instead of unsigned int fixes the warning, nothing serious but might change for clean compile in the future.

typedef struct
{
    const char* regex;
    const char* input;
    subreg_capture_t* captures;
    unsigned int max_captures;
    /*unsigned*/ int max_depth;
    unsigned int capture_index;
    int depth;
    
} state_t;

BUG: regex="(ab|cd)+c" doesn't match input="abc"

The way to fix it is in function static int parse_repetition(state_t* state), change the for-loop block

   for (;;)
   {  
      state->regex = regex_begin;
      result = parse_literal(state);
      if ( is_bad_result(result) ) return result;
      
      if ( !is_match_result(result) ) break;
   }

to the following:

   for (;;)
   {  
      state->regex = regex_begin;
      const char* input_checkpoint = state->input;
      result = parse_literal(state);
      if ( is_bad_result(result) ) return result;
      
      if ( !is_match_result(result) ) { state->input = input_checkpoint; break;}
   }

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    ๐Ÿ–– Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. ๐Ÿ“Š๐Ÿ“ˆ๐ŸŽ‰

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.