Files
linguist/ext/linguist/tokenizer.l
Ashe Connor 99eaf5faf9 Replace the tokenizer with a flex-based scanner (#3846)
* Lex everything except SGML, multiline, SHEBANG

* Prepend SHEBANG#! to tokens

* Support SGML tag/attribute extraction

* Multiline comments

* WIP cont'd; productionifying

* Compile before test

* Add extension to gemspec

* Add flex task to build lexer

* Reentrant extra data storage

* regenerate lexer

* use prefix

* rebuild lexer on linux

* Optimise a number of operations:

* Don't read and split the entire file if we only ever use the first/last n
  lines

* Only consider the first 50KiB when using heuristics/classifying.  This can
  save a *lot* of time; running a large number of regexes over 1MiB of text
  takes a while.

* Memoize File.size/read/stat; re-reading in a 500KiB file every time `data` is
  called adds up a lot.

* Use single regex for C++

* act like #lines

* [1][-2..-1] => nil, ffs

* k may not be set
2017-10-31 11:06:56 +11:00

120 lines
3.9 KiB
Plaintext

%{
#include "linguist.h"
/* Hand a token back to the caller: store the token string and its type
 * in the scanner's extra data (struct tokenizer_extra *).  `tok` must be
 * heap-allocated — every call site passes strdup'd text. */
#define feed_token(tok, typ) do { \
yyextra->token = (tok); \
yyextra->type = (typ); \
} while (0)
/* Discard input through the end of the current line; if EOF arrives
 * before a newline, terminate the scanner. */
#define eat_until_eol() do { \
int c; \
while ((c = input(yyscanner)) != '\n' && c != EOF); \
if (c == EOF) \
yyterminate(); \
} while (0)
/* Discard input until the unescaped quote character `q`.  A backslash
 * escapes the following character.  A bare newline also ends the scan
 * (quoted literals are not expected to span lines); EOF terminates the
 * scanner. */
#define eat_until_unescaped(q) do { \
int c; \
while ((c = input(yyscanner)) != EOF) { \
if (c == '\n') \
break; \
if (c == '\\') { \
c = input(yyscanner); \
if (c == EOF) \
yyterminate(); \
} else if (c == q) \
break; \
} \
if (c == EOF) \
yyterminate(); \
} while (0)
%}
/* Reentrant scanner, all externals prefixed linguist_yy, per-scanner
 * state carried in `struct tokenizer_extra *` (see feed_token above).
 * `nodefault` + `warn` make flex flag unmatched input; the catch-all
 * `.|\n` rule at the end of the rules section satisfies them.
 * The %x conditions are exclusive states for multiline comments and
 * SGML tag interiors. */
%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
%%
/* env-style shebang ("#!...env [NAME=value ...] interp"): the emitted
   SHEBANG token is the last space-separated word of the match, i.e. the
   interpreter name after env and any variable assignments. */
^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
const char *off = strrchr(yytext, ' ');
if (!off)
off = yytext;
else
++off;
feed_token(strdup(off), SHEBANG_TOKEN);
eat_until_eol();
return 1;
}
/* Direct shebang ("#!/path/to/interp"): emit the path basename as the
   SHEBANG token — unless the basename is "env" (bare env with no
   interpreter argument), in which case the line is just skipped. */
^#![ \t]*[[:alpha:]_\/]+ {
const char *off = strrchr(yytext, '/');
if (!off)
off = yytext;
else
++off;
if (strcmp(off, "env") == 0) {
eat_until_eol();
} else {
feed_token(strdup(off), SHEBANG_TOKEN);
eat_until_eol();
return 1;
}
}
/* Whole-line comments (//, --, #, %, ") followed by a space: discard. */
^[ \t]*(\/\/|--|\#|%|\")" ".* { /* nothing */ }
/* Multiline comment openers: enter the matching exclusive condition;
   the body is swallowed by the combined rule below. */
"/*" { BEGIN(c_comment); }
/* See below for xml_comment start. */
"{-" { BEGIN(haskell_comment); }
"(*" { BEGIN(ocaml_comment); }
"\"\"\"" { BEGIN(python_dcomment); }
"'''" { BEGIN(python_scomment); }
<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n { /* nothing */ }
/* Each closer returns to the INITIAL condition. */
<c_comment>"*/" { BEGIN(INITIAL); }
<xml_comment>"-->" { BEGIN(INITIAL); }
<haskell_comment>"-}" { BEGIN(INITIAL); }
<ocaml_comment>"*)" { BEGIN(INITIAL); }
<python_dcomment>"\"\"\"" { BEGIN(INITIAL); }
<python_scomment>"'''" { BEGIN(INITIAL); }
/* Empty string literals: nothing to record. */
\"\"|'' { /* nothing */ }
/* Quoted literals: skip to the closing unescaped quote (a bare newline
   also ends the skip — see eat_until_unescaped). */
\" { eat_until_unescaped('"'); }
' { eat_until_unescaped('\''); }
/* Numeric literals, hex or decimal, with optional integer/float
   suffixes: ignored. */
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
/* SGML/XML tag open: "<name" where trailing context requires either an
   immediate ">" or a space plus at most 2048 non-tag characters and
   ">".  Only the "<name" part is consumed into yytext.  "<!--" starts
   an XML comment instead; any other tag is emitted as an SGML token and
   we enter the sgml condition to pick up its attributes. */
\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}> {
if (strcmp(yytext, "<!--") == 0) {
BEGIN(xml_comment);
} else {
feed_token(strdup(yytext), SGML_TOKEN);
BEGIN(sgml);
return 1;
}
}
/* Inside a tag: attr="..." / attr='...' emits "attr=" and skips the
   quoted value; attr=value truncates the copy just after "=" so only
   "attr=" is emitted; a bare name is emitted as-is; ">" leaves the tag;
   anything else is discarded. */
<sgml>[[:alnum:]_]+=/\" { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; }
<sgml>[[:alnum:]_]+=/' { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; }
<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
<sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
<sgml>\> { BEGIN(INITIAL); }
<sgml>.|\n { /* nothing */ }
/* Structural punctuation is emitted one token per character. */
;|\{|\}|\(|\)|\[|\] { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
/* Identifier-like runs (may include ., @, #, /, *).  A run beginning
   with slash-star is a C comment: if the same run also ends with
   star-slash it is a complete one-token comment and is dropped,
   otherwise we enter c_comment; all other runs are emitted. */
[[:alnum:]_.@#/*]+ {
if (strncmp(yytext, "/*", 2) == 0) {
if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
/* nothing */
} else {
BEGIN(c_comment);
}
} else {
feed_token(strdup(yytext), REGULAR_TOKEN);
return 1;
}
}
/* Common operators are emitted; any other character is dropped by the
   catch-all rule (required because of %option nodefault). */
\<\<?|\+|\-|\*|\/|%|&&?|\|\|? { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
.|\n { /* nothing */ }
%%