Lexer crash fix (#3900)

* input() may return 0 (not just EOF) at end of input

Stops the lexer from running past the end of input into an fread with nothing left to read.

* remove two trailing contexts

* fix up sgml tokens
This commit is contained in:
Ashe Connor
2017-11-10 22:11:32 +11:00
committed by GitHub
parent 0f4955e5d5
commit c9b3d19c6f
4 changed files with 260 additions and 6325 deletions

View File

@@ -47,15 +47,10 @@ task :samples => :compile do
File.write 'lib/linguist/samples.json', json
end
FLEX_MIN_VER = [2, 5, 39]
task :flex do
if `flex -V` !~ /^flex (\d+)\.(\d+)\.(\d+)/
if `flex -V` !~ /^flex \d+\.\d+\.\d+/
fail "flex not detected"
end
maj, min, rev = $1.to_i, $2.to_i, $3.to_i
if maj < FLEX_MIN_VER[0] || (maj == FLEX_MIN_VER[0] && (min < FLEX_MIN_VER[1] || (min == FLEX_MIN_VER[1] && rev < FLEX_MIN_VER[2])))
fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER.join(".")}"
end
system "cd ext/linguist && flex tokenizer.l"
end

File diff suppressed because it is too large Load Diff

View File

@@ -11,7 +11,7 @@
#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
#define YY_FLEX_SUBMINOR_VERSION 39
#define YY_FLEX_SUBMINOR_VERSION 35
#if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA
#endif
@@ -49,6 +49,7 @@ typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t;
typedef uint64_t flex_uint64_t;
#else
typedef signed char flex_int8_t;
typedef short int flex_int16_t;
@@ -56,6 +57,7 @@ typedef int flex_int32_t;
typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t;
#endif /* ! C99 */
/* Limits of integral types. */
#ifndef INT8_MIN
@@ -86,8 +88,6 @@ typedef unsigned int flex_uint32_t;
#define UINT32_MAX (4294967295U)
#endif
#endif /* ! C99 */
#endif /* ! FLEXINT_H */
#ifdef __cplusplus
@@ -130,15 +130,7 @@ typedef void* yyscan_t;
/* Size of default input buffer. */
#ifndef YY_BUF_SIZE
#ifdef __ia64__
/* On IA-64, the buffer size is 16k, not 8k.
* Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
* Ditto for the __ia64__ case accordingly.
*/
#define YY_BUF_SIZE 32768
#else
#define YY_BUF_SIZE 16384
#endif /* __ia64__ */
#endif
#ifndef YY_TYPEDEF_YY_BUFFER_STATE
@@ -277,10 +269,6 @@ int linguist_yyget_lineno (yyscan_t yyscanner );
void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
int linguist_yyget_column (yyscan_t yyscanner );
void linguist_yyset_column (int column_no ,yyscan_t yyscanner );
/* Macros after this point can all be overridden by user definitions in
* section 1.
*/
@@ -307,12 +295,7 @@ static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
/* Amount of stuff to slurp up with each read. */
#ifndef YY_READ_BUF_SIZE
#ifdef __ia64__
/* On IA-64, the buffer size is 16k, not 8k */
#define YY_READ_BUF_SIZE 16384
#else
#define YY_READ_BUF_SIZE 8192
#endif /* __ia64__ */
#endif
/* Number of entries by which start-condition stack grows. */
@@ -345,9 +328,9 @@ extern int linguist_yylex (yyscan_t yyscanner);
#undef YY_DECL
#endif
#line 117 "tokenizer.l"
#line 118 "tokenizer.l"
#line 352 "lex.linguist_yy.h"
#line 335 "lex.linguist_yy.h"
#undef linguist_yyIN_HEADER
#endif /* linguist_yyHEADER_H */

View File

@@ -9,25 +9,25 @@
#define eat_until_eol() do { \
int c; \
while ((c = input(yyscanner)) != '\n' && c != EOF); \
if (c == EOF) \
yyterminate(); \
while ((c = input(yyscanner)) != '\n' && c != EOF && c); \
if (c == EOF || !c) \
return 0; \
} while (0)
#define eat_until_unescaped(q) do { \
int c; \
while ((c = input(yyscanner)) != EOF) { \
while ((c = input(yyscanner)) != EOF && c) { \
if (c == '\n') \
break; \
if (c == '\\') { \
c = input(yyscanner); \
if (c == EOF) \
yyterminate(); \
if (c == EOF || !c) \
return 0; \
} else if (c == q) \
break; \
} \
if (c == EOF) \
yyterminate(); \
if (c == EOF || !c) \
return 0; \
} while (0)
%}
@@ -84,7 +84,7 @@
\" { eat_until_unescaped('"'); }
' { eat_until_unescaped('\''); }
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}> {
\<[[:alnum:]_!./?-]+ {
if (strcmp(yytext, "<!--") == 0) {
BEGIN(xml_comment);
} else {
@@ -93,8 +93,8 @@
return 1;
}
}
<sgml>[[:alnum:]_]+=/\" { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; }
<sgml>[[:alnum:]_]+=/' { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; }
<sgml>[[:alnum:]_]+=\" { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('"'); return 1; }
<sgml>[[:alnum:]_]+=' { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('\''); return 1; }
<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
<sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
<sgml>\> { BEGIN(INITIAL); }