Files
linguist/ext/linguist/tokenizer.l
Ashe Connor c9b3d19c6f Lexer crash fix (#3900)
* input may return 0 for EOF

Stops overruns into fread from nothing.

* remove two trailing contexts

* fix up sgml tokens
2017-11-10 22:11:32 +11:00

120 lines
3.9 KiB
Plaintext

%{
#include "linguist.h"
#define feed_token(tok, typ) do { \
yyextra->token = (tok); \
yyextra->type = (typ); \
} while (0)
#define eat_until_eol() do { \
int c; \
while ((c = input(yyscanner)) != '\n' && c != EOF && c); \
if (c == EOF || !c) \
return 0; \
} while (0)
#define eat_until_unescaped(q) do { \
int c; \
while ((c = input(yyscanner)) != EOF && c) { \
if (c == '\n') \
break; \
if (c == '\\') { \
c = input(yyscanner); \
if (c == EOF || !c) \
return 0; \
} else if (c == q) \
break; \
} \
if (c == EOF || !c) \
return 0; \
} while (0)
%}
%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
%%
^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
const char *off = strrchr(yytext, ' ');
if (!off)
off = yytext;
else
++off;
feed_token(strdup(off), SHEBANG_TOKEN);
eat_until_eol();
return 1;
}
^#![ \t]*[[:alpha:]_\/]+ {
const char *off = strrchr(yytext, '/');
if (!off)
off = yytext;
else
++off;
if (strcmp(off, "env") == 0) {
eat_until_eol();
} else {
feed_token(strdup(off), SHEBANG_TOKEN);
eat_until_eol();
return 1;
}
}
^[ \t]*(\/\/|--|\#|%|\")" ".* { /* nothing */ }
"/*" { BEGIN(c_comment); }
/* See below for xml_comment start. */
"{-" { BEGIN(haskell_comment); }
"(*" { BEGIN(ocaml_comment); }
"\"\"\"" { BEGIN(python_dcomment); }
"'''" { BEGIN(python_scomment); }
<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n { /* nothing */ }
<c_comment>"*/" { BEGIN(INITIAL); }
<xml_comment>"-->" { BEGIN(INITIAL); }
<haskell_comment>"-}" { BEGIN(INITIAL); }
<ocaml_comment>"*)" { BEGIN(INITIAL); }
<python_dcomment>"\"\"\"" { BEGIN(INITIAL); }
<python_scomment>"'''" { BEGIN(INITIAL); }
\"\"|'' { /* nothing */ }
\" { eat_until_unescaped('"'); }
' { eat_until_unescaped('\''); }
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
\<[[:alnum:]_!./?-]+ {
if (strcmp(yytext, "<!--") == 0) {
BEGIN(xml_comment);
} else {
feed_token(strdup(yytext), SGML_TOKEN);
BEGIN(sgml);
return 1;
}
}
<sgml>[[:alnum:]_]+=\" { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('"'); return 1; }
<sgml>[[:alnum:]_]+=' { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('\''); return 1; }
<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
<sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
<sgml>\> { BEGIN(INITIAL); }
<sgml>.|\n { /* nothing */ }
;|\{|\}|\(|\)|\[|\] { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
[[:alnum:]_.@#/*]+ {
if (strncmp(yytext, "/*", 2) == 0) {
if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
/* nothing */
} else {
BEGIN(c_comment);
}
} else {
feed_token(strdup(yytext), REGULAR_TOKEN);
return 1;
}
}
\<\<?|\+|\-|\*|\/|%|&&?|\|\|? { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
.|\n { /* nothing */ }
%%