Files
linguist/samples/Erlang/lfe_scan.xrl
2016-01-04 12:17:31 +00:00

257 lines
8.8 KiB
Erlang

%% Copyright (c) 2008-2013 Robert Virding
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%% File : lfe_scan.xrl
%% Author : Robert Virding
%% Purpose : Token definitions for Lisp Flavoured Erlang.
Definitions.
%% Named character classes used by the rules below.
%% Binary, octal, decimal and hexadecimal digits.
B = [01]
O = [0-7]
D = [0-9]
H = [0-9a-fA-F]
%% Alphanumerics, i.e. the digit characters of bases up to 36.
B36 = [0-9a-zA-Z]
%% Upper case letters, lower case letters, and either of them.
U = [A-Z]
L = [a-z]
A = ({U}|{L})
%% Delimiters: brackets, double quote, semicolon and every character
%% from \000 up to and including space.
DEL = [][()}{";\000-\s]
%% Symbol characters: anything that is not a delimiter and not in the
%% range \177-\237.
SYM = [^][()}{";\000-\s\177-\237]
%% Symbol start characters: as SYM but additionally excluding | # ` '
%% and comma.
SSYM = [^][()}{"|;#`',\000-\s\177-\237]
%% Whitespace: one character up to and including space, or a ; comment
%% running to the end of the line.
WS = ([\000-\s]|;[^\n]*)
Rules.
%% Bracketed Comments using #| foo |#
%% The whole comment is matched as a single token. The action drops
%% its first two characters and passes the rest to block_comment/1,
%% which decides between skipping it and reporting a nested comment.
#{D}*\|[^\|]*\|+([^#\|][^\|]*\|+)*# :
block_comment(string:substr(TokenChars, 3)).
%% Separators
%% Each of these yields a {Name,Line} token with no value; the name is
%% the matched separator itself.
' : {token,{'\'',TokenLine}}.
` : {token,{'`',TokenLine}}.
, : {token,{',',TokenLine}}.
,@ : {token,{',@',TokenLine}}.
\. : {token,{'.',TokenLine}}.
%% A single bracket character; the token name is built from the
%% matched character.
[][()}{] : {token,{list_to_atom(TokenChars),TokenLine}}.
%% Sharpsign forms. The patterns accept digits between # and the
%% dispatch character, but those digits do not appear in the token.
#{D}*[bB]\( : {token,{'#B(',TokenLine}}.
#{D}*[mM]\( : {token,{'#M(',TokenLine}}.
#{D}*\( : {token,{'#(',TokenLine}}.
#{D}*\. : {token,{'#.',TokenLine}}.
#{D}*` : {token,{'#`',TokenLine}}.
#{D}*; : {token,{'#;',TokenLine}}.
#{D}*, : {token,{'#,',TokenLine}}.
#{D}*,@ : {token,{'#,@',TokenLine}}.
%% Characters
%% A backslash after the sharpsign, followed either by x and hex
%% digits or by any single character. skip_past/3 removes everything
%% up to and including the backslash before char_token/2 is called.
#{D}*\\(x{H}+|.) : char_token(skip_past(TokenChars, $\\, $\\), TokenLine).
%% Based numbers
%% The character after the sharpsign selects the base: * and b/B give
%% 2, o/O give 8, d/D give 10 and x/X give 16. skip_past/3 drops
%% everything up to and including that character, so base_token/3
%% only sees what follows it.
#{D}*\*{SYM}+ : base_token(skip_past(TokenChars, $*, $*), 2, TokenLine).
#{D}*[bB]{SYM}+ : base_token(skip_past(TokenChars, $b, $B), 2, TokenLine).
#{D}*[oO]{SYM}+ : base_token(skip_past(TokenChars, $o, $O), 8, TokenLine).
#{D}*[dD]{SYM}+ : base_token(skip_past(TokenChars, $d, $D), 10, TokenLine).
#{D}*[xX]{SYM}+ : base_token(skip_past(TokenChars, $x, $X), 16, TokenLine).
%% Explicit radix: the base is written in decimal between # and r/R.
#{D}*[rR]{SYM}+ :
%% Scan over digit chars to get base.
%% tl/1 removes the leading #; the match then discards the r/R that
%% stopped the decimal scan and keeps the characters after it.
{Base,[_|Ds]} = base1(tl(TokenChars), 10, 0),
base_token(Ds, Base, TokenLine).
%% String
%% A double-quoted string. The body may contain \x followed by hex
%% digits and a semicolon, a backslash followed by any character, or
%% any character other than the double quote and the backslash.
"(\\x{H}+;|\\.|[^"\\])*" :
%% Strip quotes.
S = string:substr(TokenChars, 2, TokenLen - 2),
{token,{string,TokenLine,chars(S)}}.
%% Binary string
%% Same body as a string, but introduced by a sharpsign. The decoded
%% characters are turned into a binary.
#"(\\x{H}+;|\\.|[^"\\])*" :
%% Strip the leading sharpsign and quote, and the trailing quote.
S = string:substr(TokenChars, 3, TokenLen - 3),
Bin = unicode:characters_to_binary(chars(S), utf8, utf8),
{token,{binary,TokenLine,Bin}}.
%% Symbols
%% A symbol quoted with vertical bars. The body accepts the same
%% escapes as a string and any character other than | and backslash.
\|(\\x{H}+;|\\.|[^|\\])*\| :
%% Strip quotes.
S = string:substr(TokenChars, 2, TokenLen - 2),
symbol_token(chars(S), TokenLine).
%% Funs
%% Sharpsign single-quote, a symbol, a slash and decimal digits. The
%% token value is the text after the two leading characters, kept as
%% a string.
#'{SSYM}{SYM}*/{D}+ :
%% Strip sharpsign single-quote.
FunStr = string:substr(TokenChars,3),
{token,{'#\'',TokenLine,FunStr}}.
%% Atoms
%% Optionally signed integers and floats become number tokens. Text
%% such as 123 also matches the general symbol rule below; leex takes
%% the longest match and, for matches of equal length, the rule given
%% first, so the number rules must stay ahead of the symbol rule.
[+-]?{D}+ :
case catch {ok,list_to_integer(TokenChars)} of
{ok,I} -> {token,{number,TokenLine,I}};
_ -> {error,"illegal integer"}
end.
[+-]?{D}+\.{D}+([eE][+-]?{D}+)? :
case catch {ok,list_to_float(TokenChars)} of
{ok,F} -> {token,{number,TokenLine,F}};
_ -> {error,"illegal float"}
end.
%% Everything else that starts with a symbol start character is a
%% symbol.
{SSYM}{SYM}* :
symbol_token(TokenChars, TokenLine).
%% Whitespace and ; line comments produce no token.
{WS}+ : skip_token.
Erlang code.
%% Copyright (c) 2008-2013 Robert Virding
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%% File : lfe_scan.erl
%% Author : Robert Virding
%% Purpose : Token definitions for Lisp Flavoured Erlang.
-export([start_symbol_char/1,symbol_char/1]).
-import(string, [substr/2,substr/3]).
%% start_symbol_char(Char) -> true | false.
%% symbol_char(Char) -> true | false.
%%  Predicates telling whether Char may start a symbol and whether it
%%  may appear inside one. The start set is the symbol set without
%%  the characters # ` ' , and | (the symbol quote character).

start_symbol_char(C) ->
    (not lists:member(C, "#`',|")) andalso symbol_char(C).

%% Brackets, the double quote and the semicolon never belong to a
%% symbol. Any other character qualifies when it lies above space and
%% up to and including ~, or strictly above \240.
symbol_char(C) ->
    IsDelimiter = lists:member(C, "()[]{}\";"),
    InAsciiRange = (C > $\s) andalso (C =< $~),
    (not IsDelimiter) andalso (InAsciiRange orelse (C > $\240)).
%% symbol_token(Chars, Line) -> {token,{symbol,Line,Symbol}} | {error,E}.
%%  Turn Chars into an atom and wrap it in a symbol token. When the
%%  atom cannot be created the result is an error tuple instead.
symbol_token(Cs, L) ->
    try list_to_atom(Cs) of
        Symbol -> {token,{symbol,L,Symbol}}
    catch
        %% Any failure to build the atom is reported the same way.
        _:_ -> {error,"illegal symbol"}
    end.
%% base_token(Chars, Base, Line) -> {token,{number,Line,Integer}} | {error,E}.
%%  Convert a string of Base characters into a number token. We only
%%  allow a base between 2 and 36, and an optional sign character
%%  first. At least one digit must follow the optional sign, so a lone
%%  sign is rejected instead of being read as zero.
base_token(_, B, _) when B < 2; B > 36 ->
    {error,"illegal number base"};
base_token([$+|Cs], B, L) -> base_token(Cs, B, +1, L);
base_token([$-|Cs], B, L) -> base_token(Cs, B, -1, L);
base_token(Cs, B, L) -> base_token(Cs, B, +1, L).

%% base_token(Digits, Base, Sign, Line) -> {token,{number,Line,Integer}} | {error,E}.
%%  Every character of Digits must be a digit of Base.
base_token([], _B, _S, _L) ->
    %% No digits at all: base1/3 would return 0 for the empty string.
    {error,"illegal based number"};
base_token(Cs, B, S, L) ->
    case base1(Cs, B, 0) of
        {N,[]} -> {token,{number,L,S*N}};
        {_,_} -> {error,"illegal based number"}
    end.
%% base1(Chars, Base, SoFar) -> {Number,RestChars}.
%%  Accumulate the leading digits of Chars, read in Base, on top of
%%  SoFar. Scanning stops at the first character that is not a digit
%%  of that base; that character and everything after it are returned
%%  as RestChars.
base1([], _Base, Acc) -> {Acc,[]};
base1([C|Rest]=Chars, Base, Acc) ->
    case base_digit(C) of
        Digit when is_integer(Digit), Digit < Base ->
            base1(Rest, Base, Acc * Base + Digit);
        _NotADigit ->
            {Acc,Chars}
    end.

%% base_digit(Char) -> Value | none.
%%  Numeric value of a digit character. Letters count from 10 upwards
%%  in either case.
base_digit(C) when C >= $0, C =< $9 -> C - $0;
base_digit(C) when C >= $a, C =< $z -> C - $a + 10;
base_digit(C) when C >= $A, C =< $Z -> C - $A + 10;
base_digit(_) -> none.
-define(IS_UNICODE(C), ((C >= 0) andalso (C =< 16#10FFFF))).

%% char_token(InputChars, Line) -> {token,{number,L,N}} | {error,E}.
%%  Convert the text of a character literal into the corresponding
%%  character code. A single character stands for itself. An x
%%  followed by more characters is read as a hex number, which must
%%  consist of hex digits only and lie in the unicode range.
char_token([Char], Line) ->
    {token,{number,Line,Char}};
char_token([$x|[_|_]=Hex], Line) ->
    hex_char_token(base1(Hex, 16, 0), Line).

%% Accept the scanned hex value only when no characters were left over
%% and the value is a unicode code.
hex_char_token({Code,[]}, Line) when ?IS_UNICODE(Code) ->
    {token,{number,Line,Code}};
hex_char_token(_, _Line) ->
    {error,"illegal character"}.
%% chars(InputChars) -> Chars.
%%  Convert the body of a string or quoted symbol into the characters
%%  it denotes by resolving backslash escapes. We know that the input
%%  string is correct.
chars([]) -> [];
chars([$\\,$x,H|T]) ->
    Tail = [H|T],
    %% Only a run of hex digits closed by a semicolon is a hex escape.
    %% The hex_char/1 test keeps a bare "\x;" from being read as
    %% character 0.
    case hex_char(H) andalso base1(Tail, 16, 0) of
        {Code,[$;|Rest]} -> [Code|chars(Rest)];
        _NoHexEscape -> [escape_char($x)|chars(Tail)]
    end;
chars([$\\,Esc|T]) -> [escape_char(Esc)|chars(T)];
chars([Ch|T]) -> [Ch|chars(T)].
%% hex_char(Char) -> true | false.
%%  True when Char is a hexadecimal digit in either case.
hex_char(C) ->
    (C >= $0 andalso C =< $9) orelse
    (C >= $a andalso C =< $f) orelse
    (C >= $A andalso C =< $F).
%% escape_char(Char) -> Char.
%%  Map the character written after a backslash to the character it
%%  stands for. Characters without a special meaning stand for
%%  themselves.
escape_char(C) ->
    case C of
        $b -> $\b;                              %\b = BS
        $t -> $\t;                              %\t = TAB
        $n -> $\n;                              %\n = LF
        $v -> $\v;                              %\v = VT
        $f -> $\f;                              %\f = FF
        $r -> $\r;                              %\r = CR
        $e -> $\e;                              %\e = ESC
        $s -> $\s;                              %\s = SPC
        $d -> $\d;                              %\d = DEL
        _ -> C
    end.
%% block_comment(Chars) -> skip_token | {error,E}.
%%  Chars is the text of a block comment with its leading characters
%%  already removed by the caller. Nested block comments cannot
%%  currently be processed without a rebuild of the parser, so a "#|"
%%  found inside the comment gives a sensible error rather than an
%%  unhelpful failure on the '#|'. A comment without one is skipped.
block_comment([$#,$||_]) ->
    {error, "illegal nested block comment"};
block_comment([_|Rest]) ->
    block_comment(Rest);
block_comment([]) ->
    skip_token.                                 % No nesting found
%% skip_until(String, Char1, Char2) -> String.
%% skip_past(String, Char1, Char2) -> String.
%%  skip_past/3 returns what follows the first occurrence of Char1 or
%%  Char2 in String, or the empty list when neither occurs.
%%  skip_until/3 is commented out:

%% skip_until([C|_]=Cs, C1, C2) when C =:= C1 ; C =:= C2 -> Cs;
%% skip_until([_|Cs], C1, C2) -> skip_until(Cs, C1, C2);
%% skip_until([], _, _) -> [].

skip_past(Chars, C1, C2) ->
    NotWanted = fun (C) -> C =/= C1 andalso C =/= C2 end,
    case lists:dropwhile(NotWanted, Chars) of
        [_Found|Rest] -> Rest;
        [] -> []
    end.