linguist/samples/Erlang/lfe_scan.xrl

%% Copyright (c) 2008-2013 Robert Virding
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.

%% File    : lfe_scan.xrl
%% Author  : Robert Virding
%% Purpose : Token definitions for Lisp Flavoured Erlang.

Definitions.
%% Character-class macros. Of these, the rules in this file reference only
%% D, H, SYM, SSYM and WS.
%% Digit sets for base 2, 8, 10, 16 and 36.
B    = [01]
O    = [0-7]
D    = [0-9]
H    = [0-9a-fA-F]
B36  = [0-9a-zA-Z]
%% Upper-case letter, lower-case letter, and either of the two.
U    = [A-Z]
L    = [a-z]
A    = ({U}|{L})
%% Delimiters: brackets, parentheses, braces, double quote, semicolon and
%% every character from \000 up to and including space.
DEL  = [][()}{";\000-\s]
%% Symbol character: anything that is not a delimiter and not in \177-\237.
SYM  = [^][()}{";\000-\s\177-\237]
%% Symbol start character: as SYM, but additionally excluding the vertical
%% bar, sharpsign, backquote, single quote and comma.
SSYM = [^][()}{"|;#`',\000-\s\177-\237]
%% Whitespace: one character from \000 up to space, or a ";" comment that
%% runs to the end of the line.
WS   = ([\000-\s]|;[^\n]*)

Rules.
%% Bracketed Comments using #| foo |#
%% The pattern matches "#", optional digits and "|", then runs on to the
%% first "|#" that follows. The action drops the first two characters of
%% the match and passes the rest to block_comment/1, which either skips
%% the comment or reports an error when the text contains another "#|".
#{D}*\|[^\|]*\|+([^#\|][^\|]*\|+)*# :
        block_comment(string:substr(TokenChars, 3)).

%% Separators
%% Each rule emits a two-element token {Atom,Line}; the atom names the
%% separator that was matched.
'               :    {token,{'\'',TokenLine}}.
`               :    {token,{'`',TokenLine}}.
,               :    {token,{',',TokenLine}}.
,@              :    {token,{',@',TokenLine}}.
\.              :    {token,{'.',TokenLine}}.
%% A single bracket, parenthesis or brace becomes the atom made from that
%% one character.
[][()}{]        :    {token,{list_to_atom(TokenChars),TokenLine}}.

%% Sharpsign forms. The patterns accept digits between "#" and the
%% following character, but the emitted token does not carry them.
#{D}*[bB]\(     :    {token,{'#B(',TokenLine}}.
#{D}*[mM]\(     :    {token,{'#M(',TokenLine}}.
#{D}*\(         :    {token,{'#(',TokenLine}}.
#{D}*\.         :    {token,{'#.',TokenLine}}.

#{D}*`          :    {token,{'#`',TokenLine}}.
#{D}*;          :    {token,{'#;',TokenLine}}.
#{D}*,          :    {token,{'#,',TokenLine}}.
#{D}*,@         :    {token,{'#,@',TokenLine}}.

%% Characters
%% A character is written as sharpsign-backslash followed either by "x"
%% and hex digits or by any one character that "." matches. skip_past/3
%% drops everything up to and including the backslash, and char_token/2
%% converts what is left.
#{D}*\\(x{H}+|.) :   char_token(skip_past(TokenChars, $\\, $\\), TokenLine).

%% Based numbers
%% "#*" and "#b" read base 2, "#o" base 8, "#d" base 10 and "#x" base 16.
%% skip_past/3 drops everything up to and including the marker character,
%% leaving the optional sign and the digits for base_token/3.
#{D}*\*{SYM}+   :    base_token(skip_past(TokenChars, $*, $*), 2, TokenLine).
#{D}*[bB]{SYM}+ :    base_token(skip_past(TokenChars, $b, $B), 2, TokenLine).
#{D}*[oO]{SYM}+ :    base_token(skip_past(TokenChars, $o, $O), 8, TokenLine).
#{D}*[dD]{SYM}+ :    base_token(skip_past(TokenChars, $d, $D), 10, TokenLine).
#{D}*[xX]{SYM}+ :    base_token(skip_past(TokenChars, $x, $X), 16, TokenLine).
%% "#<digits>r" takes the base from the decimal digits that follow "#".
#{D}*[rR]{SYM}+ :
        %% Scan over digit chars to get base.
        %% base1/3 stops at the "r"/"R"; the match drops that character and
        %% binds the text after it to Ds. When no digits precede the "r"
        %% Base is 0, which base_token/3 rejects as an illegal base.
        {Base,[_|Ds]} = base1(tl(TokenChars), 10, 0),
        base_token(Ds, Base, TokenLine).

%% String
%% The body is any mix of "\x<hex>;" escapes, a backslash followed by one
%% character, and characters other than double quote and backslash.
"(\\x{H}+;|\\.|[^"\\])*" :
        %% Strip quotes.
        %% chars/1 then expands the escape sequences in the body.
        S = string:substr(TokenChars, 2, TokenLen - 2),
        {token,{string,TokenLine,chars(S)}}.
%% Binary string
%% Same body syntax as a string, prefixed with a sharpsign. The expanded
%% characters are encoded as a UTF-8 binary.
#"(\\x{H}+;|\\.|[^"\\])*" :
        %% Strip the leading sharpsign-quote and the trailing quote.
        S = string:substr(TokenChars, 3, TokenLen - 3),
        %% characters_to_binary/3 returns an error or incomplete tuple,
        %% not a binary, when a hex escape expands to a value that cannot
        %% be encoded. Report that as a scan error instead of putting the
        %% tuple inside the token.
        case unicode:characters_to_binary(chars(S), utf8, utf8) of
            Bin when is_binary(Bin) -> {token,{binary,TokenLine,Bin}};
            _ -> {error,"illegal binary string"}
        end.
%% Symbols
%% A quoted symbol is written between vertical bars. Its body accepts the
%% same escapes as a string body, with the vertical bar taking the place
%% of the double quote as terminator.
\|(\\x{H}+;|\\.|[^|\\])*\| :
        %% Strip quotes.
        %% chars/1 expands the escapes before symbol_token/2 builds the
        %% symbol.
        S = string:substr(TokenChars, 2, TokenLen - 2),
        symbol_token(chars(S), TokenLine).
%% Funs
%% Sharpsign single-quote, a symbol, a slash and decimal digits. The token
%% carries the text after the sharpsign single-quote as an unparsed string.
#'{SSYM}{SYM}*/{D}+ :
        %% Strip sharpsign single-quote.
        FunStr = string:substr(TokenChars,3),
        {token,{'#\'',TokenLine,FunStr}}.
%% Atoms (in the Lisp sense): integers and floats are scanned here.
%% Both rules emit {token,{number,Line,Value}}. The conversion is wrapped
%% in try so that a value the BIF rejects becomes a scanner error rather
%% than a crash.
[+-]?{D}+       :
        try list_to_integer(TokenChars) of
            I -> {token,{number,TokenLine,I}}
        catch
            error:_ -> {error,"illegal integer"}
        end.
%% A float needs digits on both sides of the point; the exponent is optional.
[+-]?{D}+\.{D}+([eE][+-]?{D}+)? :
        try list_to_float(TokenChars) of
            F -> {token,{number,TokenLine,F}}
        catch
            error:_ -> {error,"illegal float"}
        end.
%% Unquoted symbol: one symbol start character followed by any number of
%% symbol characters. Text that matches this rule and an earlier rule at
%% the same length goes to the earlier rule (leex prefers the first one).
{SSYM}{SYM}*    :
        symbol_token(TokenChars, TokenLine).
%% Runs of whitespace and ";" line comments produce no token.
{WS}+           :    skip_token.

Erlang code.
%% Copyright (c) 2008-2013 Robert Virding
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.

%% File    : lfe_scan.erl
%% Author  : Robert Virding
%% Purpose : Token definitions for Lisp Flavoured Erlang.

-export([start_symbol_char/1,symbol_char/1]).

-import(string, [substr/2,substr/3]).

%% start_symbol_char(Char) -> true | false.
%% symbol_char(Char) -> true | false.
%%  Character predicates for unquoted symbols. symbol_char/1 says
%%  whether Char may appear inside a symbol; start_symbol_char/1
%%  additionally rejects the characters that may not begin one.

start_symbol_char(C) when C =:= $#; C =:= $`; C =:= $,; C =:= $|; C =:= $' -> %'
    false;
start_symbol_char(C) ->
    symbol_char(C).

%% Brackets, parentheses, braces, double quote and semicolon never belong
%% to a symbol. Any other character qualifies when it lies above space and
%% at or below "~", or above \240.
%% NOTE(review): SYM in the Definitions section excludes \177-\237 while
%% the test here is C > $\240, so the character \240 is treated
%% differently by the two -- confirm which is intended.
symbol_char(C) when C =:= $(; C =:= $); C =:= $[; C =:= $];
                    C =:= ${; C =:= $}; C =:= $;; C =:= $" -> %"
    false;
symbol_char(C) when C > $\s, C =< $~ ->
    true;
symbol_char(C) ->
    C > $\240.

%% symbol_token(Chars, Line) -> {token,{symbol,Line,Symbol}} | {error,E}.
%%  Build a symbol token from a list of characters. When list_to_atom/1
%%  rejects the list, the result is {error,"illegal symbol"}.
%%  NOTE: each distinct symbol scanned creates an atom, and atoms are
%%  never garbage collected.

symbol_token(Cs, L) ->
    %% try/of keeps only the conversion itself inside the protected part.
    try list_to_atom(Cs) of
        S -> {token,{symbol,L,S}}
    catch
        error:_ -> {error,"illegal symbol"}
    end.

%% base_token(Chars, Base, Line) ->
%%     {token,{number,Line,Integer}} | {error,E}.
%%  Convert a string of Base digits, with an optional leading sign
%%  character, into a number token. Only bases between 2 and 36 are
%%  allowed, and at least one digit must be present.

base_token(_, B, _) when B < 2; B > 36 ->
    {error,"illegal number base"};
base_token([$+|Cs], B, L) -> base_token(Cs, B, +1, L);
base_token([$-|Cs], B, L) -> base_token(Cs, B, -1, L);
base_token(Cs, B, L) -> base_token(Cs, B, +1, L).

%% base_token(Digits, Base, Sign, Line) ->
%%     {token,{number,Line,Integer}} | {error,E}.
%%  Digits must be non-empty and consist only of digits valid in Base.

base_token([], _B, _S, _L) ->
    %% A bare sign such as "#b+" leaves no digits; without this clause
    %% it would scan as the number 0.
    {error,"illegal based number"};
base_token(Cs, B, S, L) ->
    case base1(Cs, B, 0) of
        {N,[]} -> {token,{number,L,S*N}};
        {_,_} -> {error,"illegal based number"}    %Trailing non-digits
    end.

%% base1(Chars, Base, SoFar) -> {Number,RestChars}.
%%  Accumulate the leading characters of Chars that are digits in Base
%%  (0-9, then a-z or A-Z for the values from 10 up) on top of SoFar.
%%  Scanning stops at the first character that is not such a digit, or
%%  at the end of the list; the unconsumed characters are returned.

base1([C|Cs], Base, SoFar) when C >= $0, C =< $9, C < Base + $0 ->
    Next = SoFar * Base + (C - $0),
    base1(Cs, Base, Next);
base1([C|Cs], Base, SoFar) when C >= $a, C =< $z, C < Base + $a - 10 ->
    Next = SoFar * Base + (C - $a + 10),
    base1(Cs, Base, Next);
base1([C|Cs], Base, SoFar) when C >= $A, C =< $Z, C < Base + $A - 10 ->
    Next = SoFar * Base + (C - $A + 10),
    base1(Cs, Base, Next);
base1([C|Cs], _Base, SoFar) -> {SoFar,[C|Cs]};
base1([], _Base, N) -> {N,[]}.

-define(IS_UNICODE(C), ((C >= 0) and (C =< 16#10FFFF))).

%% char_token(InputChars, Line) -> {token,{number,L,N}} | {error,E}.
%%  Turn the text that follows the sharpsign-backslash of a character
%%  literal into a number token. A lone character stands for itself.
%%  An "x" followed by more characters is read as a hex code, which
%%  must consume the whole text and lie in the Unicode range.

char_token([Char], Line) ->
    {token,{number,Line,Char}};
char_token([$x|[_|_]=HexChars], Line) ->
    case base1(HexChars, 16, 0) of
        {Code,[]} when ?IS_UNICODE(Code) -> {token,{number,Line,Code}};
        _ -> {error,"illegal character"}
    end.

%% chars(InputChars) -> Chars.
%%  Expand the escape sequences in the body of a string or quoted
%%  symbol. A backslash, "x", hex digits and a closing ";" become the
%%  character with that code. Every other backslash pair is translated
%%  by escape_char/1. The input is taken to be well formed already.

chars([]) -> [];
chars([$\\,$x,Digit|More]) ->
    HexTail = [Digit|More],
    %% Only attempt the hex conversion when a hex digit follows the "x".
    case hex_char(Digit) andalso base1(HexTail, 16, 0) of
        {Code,[$;|Rest]} -> [Code|chars(Rest)];
        %% Not a complete hex escape: treat "\x" as an ordinary escape.
        _NoHexEscape -> [escape_char($x)|chars(HexTail)]
    end;
chars([$\\,Escaped|Rest]) -> [escape_char(Escaped)|chars(Rest)];
chars([Plain|Rest]) -> [Plain|chars(Rest)].

%% hex_char(Char) -> true | false.
%%  Test whether Char is a hexadecimal digit: 0-9, a-f or A-F.

hex_char(C) when C >= $0, C =< $9;
                 C >= $a, C =< $f;
                 C >= $A, C =< $F ->
    true;
hex_char(_) ->
    false.

%% escape_char(Char) -> Char.
%%  Map the character that follows a backslash to the character it
%%  denotes. Characters without a special meaning stand for themselves.

escape_char(C) ->
    case C of
        $b -> 8;                        %\b = BS
        $t -> 9;                        %\t = TAB
        $n -> 10;                       %\n = LF
        $v -> 11;                       %\v = VT
        $f -> 12;                       %\f = FF
        $r -> 13;                       %\r = CR
        $e -> 27;                       %\e = ESC
        $s -> 32;                       %\s = SPC
        $d -> 127;                      %\d = DEL
        _ -> C
    end.

%% Block Comment:
%%  block_comment(Chars) -> skip_token | {error,E}.
%%  Chars is the text of a block comment after its opening marker.
%%  Nested block comments are not supported, so a comment that opens
%%  another one gets a descriptive error instead of being skipped.

block_comment(TokenChars) ->
    %% string:str/2 returns 0 when the opening marker does not occur.
    case string:str(TokenChars, "#|") =:= 0 of
        true -> skip_token;
        false -> {error, "illegal nested block comment"}
    end.

%% skip_past(String, Char1, Char2) -> String.
%%  Return what follows the first occurrence of Char1 or Char2 in
%%  String, or the empty list when neither character occurs.

skip_past([], _, _) -> [];
skip_past([C1|Rest], C1, _) -> Rest;
skip_past([C2|Rest], _, C2) -> Rest;
skip_past([_|Rest], C1, C2) -> skip_past(Rest, C1, C2).