linguist/samples/Jison/lex.jison


// `%nonassoc` tells the parser compiler (JISON) that these tokens cannot occur more than once,
// i.e. input like '//a' (tokens '/', '/' and 'a') is not a legal input while '/a' (tokens '/' and 'a')
// *is* legal input for this grammar.

%nonassoc '/' '/!'

// Likewise for `%left`: this informs the LALR(1) grammar compiler (JISON) that these tokens
// *can* occur repeatedly, e.g. 'a?*' and even 'a**' are considered legal inputs given this
// grammar!
//
// Token `RANGE_REGEX` may seem the odd one out here but really isn't: given the `regex_base`
// choice/rule `regex_base range_regex`, which is recursive, this grammar tells JISON that
// any input matching a sequence like `regex_base range_regex range_regex` *is* legal.
// If you do not want that to be legal, you MUST adjust the grammar rule set you match your
// actual intent.

%left '*' '+' '?' RANGE_REGEX


%%

lex
    : definitions include '%%' rules '%%' EOF
        {{ $$ = {macros: $1, rules: $4};
          if ($2) $$.actionInclude = $2;
          return $$; }}
    | definitions include '%%' rules EOF
        {{ $$ = {macros: $1, rules: $4};
          if ($2) $$.actionInclude = $2;
          return $$; }}
    ;

include
    : action
    |
    ;

definitions
    : definitions definition
        { $$ = $1; $$.concat($2); }
    | definition
        { $$ = [$1]; }
    ;

definition
    : name regex
        { $$ = [$1, $2]; }
    ;

name
    : NAME
        { $$ = yytext; }
    ;

rules
    : rules rule
        { $$ = $1; $$.push($2); }
    | rule
        { $$ = [$1]; }
    ;

rule
    : regex action
        { $$ = [$1, $2]; }
    ;

action
    : ACTION
        { $$ = yytext; }
    ;

regex
    : start_caret regex_list end_dollar
        { $$ = $1+$2+$3; }
    ;

start_caret
    : '^'
        { $$ = '^'; }
    |
        { $$ = ''; }
    ;

end_dollar
    : '$'
        { $$ = '$'; }
    |
        { $$ = ''; }
    ;

regex_list
    : regex_list '|' regex_chain
        { $$ = $1+'|'+$3; }
    | regex_chain
    ;

regex_chain
    : regex_chain regex_base
        { $$ = $1+$2;}
    | regex_base
        { $$ = $1;}
    ;

regex_base
    : '(' regex_list ')'
        { $$ = '('+$2+')'; }
    | regex_base '+'
        { $$ = $1+'+'; }
    | regex_base '*'
        { $$ = $1+'*'; }
    | regex_base '?'
        { $$ = $1+'?'; }
    | '/' regex_base
        { $$ = '(?=' + $regex_base + ')'; }
    | '/!' regex_base
        { $$ = '(?!' + $regex_base + ')'; }
    | name_expansion
    | regex_base range_regex
        { $$ = $1+$2; }
    | any_group_regex
    | '.'
        { $$ = '.'; }
    | string
    ;

name_expansion
    : '{' name '}'
        {{ $$ = '{'+$2+'}'; }}
    ;

any_group_regex
    : ANY_GROUP_REGEX
        { $$ = yytext; }
    ;

range_regex
    : RANGE_REGEX
        { $$ = yytext; }
    ;

string
    : STRING_LIT
        { $$ = yy.prepareString(yytext.substr(1, yyleng-2)); }
    ;