Lexer crash fix (#3900)

* input() may return 0 (not only EOF) at end of input

This stops the lexer from running past the end of input and calling fread with nothing left to read.

* remove two trailing contexts

* fix up sgml tokens
This commit is contained in:
Ashe Connor
2017-11-10 22:11:32 +11:00
committed by GitHub
parent 0f4955e5d5
commit c9b3d19c6f
4 changed files with 260 additions and 6325 deletions

View File

@@ -47,15 +47,10 @@ task :samples => :compile do
File.write 'lib/linguist/samples.json', json File.write 'lib/linguist/samples.json', json
end end
FLEX_MIN_VER = [2, 5, 39]
task :flex do task :flex do
if `flex -V` !~ /^flex (\d+)\.(\d+)\.(\d+)/ if `flex -V` !~ /^flex \d+\.\d+\.\d+/
fail "flex not detected" fail "flex not detected"
end end
maj, min, rev = $1.to_i, $2.to_i, $3.to_i
if maj < FLEX_MIN_VER[0] || (maj == FLEX_MIN_VER[0] && (min < FLEX_MIN_VER[1] || (min == FLEX_MIN_VER[1] && rev < FLEX_MIN_VER[2])))
fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER.join(".")}"
end
system "cd ext/linguist && flex tokenizer.l" system "cd ext/linguist && flex tokenizer.l"
end end

File diff suppressed because it is too large Load Diff

View File

@@ -11,7 +11,7 @@
#define FLEX_SCANNER #define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2 #define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5 #define YY_FLEX_MINOR_VERSION 5
#define YY_FLEX_SUBMINOR_VERSION 39 #define YY_FLEX_SUBMINOR_VERSION 35
#if YY_FLEX_SUBMINOR_VERSION > 0 #if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA #define FLEX_BETA
#endif #endif
@@ -49,6 +49,7 @@ typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t; typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t; typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t; typedef uint32_t flex_uint32_t;
typedef uint64_t flex_uint64_t;
#else #else
typedef signed char flex_int8_t; typedef signed char flex_int8_t;
typedef short int flex_int16_t; typedef short int flex_int16_t;
@@ -56,6 +57,7 @@ typedef int flex_int32_t;
typedef unsigned char flex_uint8_t; typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t; typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t; typedef unsigned int flex_uint32_t;
#endif /* ! C99 */
/* Limits of integral types. */ /* Limits of integral types. */
#ifndef INT8_MIN #ifndef INT8_MIN
@@ -86,8 +88,6 @@ typedef unsigned int flex_uint32_t;
#define UINT32_MAX (4294967295U) #define UINT32_MAX (4294967295U)
#endif #endif
#endif /* ! C99 */
#endif /* ! FLEXINT_H */ #endif /* ! FLEXINT_H */
#ifdef __cplusplus #ifdef __cplusplus
@@ -130,15 +130,7 @@ typedef void* yyscan_t;
/* Size of default input buffer. */ /* Size of default input buffer. */
#ifndef YY_BUF_SIZE #ifndef YY_BUF_SIZE
#ifdef __ia64__
/* On IA-64, the buffer size is 16k, not 8k.
* Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
* Ditto for the __ia64__ case accordingly.
*/
#define YY_BUF_SIZE 32768
#else
#define YY_BUF_SIZE 16384 #define YY_BUF_SIZE 16384
#endif /* __ia64__ */
#endif #endif
#ifndef YY_TYPEDEF_YY_BUFFER_STATE #ifndef YY_TYPEDEF_YY_BUFFER_STATE
@@ -277,10 +269,6 @@ int linguist_yyget_lineno (yyscan_t yyscanner );
void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner ); void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
int linguist_yyget_column (yyscan_t yyscanner );
void linguist_yyset_column (int column_no ,yyscan_t yyscanner );
/* Macros after this point can all be overridden by user definitions in /* Macros after this point can all be overridden by user definitions in
* section 1. * section 1.
*/ */
@@ -307,12 +295,7 @@ static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
/* Amount of stuff to slurp up with each read. */ /* Amount of stuff to slurp up with each read. */
#ifndef YY_READ_BUF_SIZE #ifndef YY_READ_BUF_SIZE
#ifdef __ia64__
/* On IA-64, the buffer size is 16k, not 8k */
#define YY_READ_BUF_SIZE 16384
#else
#define YY_READ_BUF_SIZE 8192 #define YY_READ_BUF_SIZE 8192
#endif /* __ia64__ */
#endif #endif
/* Number of entries by which start-condition stack grows. */ /* Number of entries by which start-condition stack grows. */
@@ -345,9 +328,9 @@ extern int linguist_yylex (yyscan_t yyscanner);
#undef YY_DECL #undef YY_DECL
#endif #endif
#line 117 "tokenizer.l" #line 118 "tokenizer.l"
#line 352 "lex.linguist_yy.h" #line 335 "lex.linguist_yy.h"
#undef linguist_yyIN_HEADER #undef linguist_yyIN_HEADER
#endif /* linguist_yyHEADER_H */ #endif /* linguist_yyHEADER_H */

View File

@@ -9,25 +9,25 @@
#define eat_until_eol() do { \ #define eat_until_eol() do { \
int c; \ int c; \
while ((c = input(yyscanner)) != '\n' && c != EOF); \ while ((c = input(yyscanner)) != '\n' && c != EOF && c); \
if (c == EOF) \ if (c == EOF || !c) \
yyterminate(); \ return 0; \
} while (0) } while (0)
#define eat_until_unescaped(q) do { \ #define eat_until_unescaped(q) do { \
int c; \ int c; \
while ((c = input(yyscanner)) != EOF) { \ while ((c = input(yyscanner)) != EOF && c) { \
if (c == '\n') \ if (c == '\n') \
break; \ break; \
if (c == '\\') { \ if (c == '\\') { \
c = input(yyscanner); \ c = input(yyscanner); \
if (c == EOF) \ if (c == EOF || !c) \
yyterminate(); \ return 0; \
} else if (c == q) \ } else if (c == q) \
break; \ break; \
} \ } \
if (c == EOF) \ if (c == EOF || !c) \
yyterminate(); \ return 0; \
} while (0) } while (0)
%} %}
@@ -84,7 +84,7 @@
\" { eat_until_unescaped('"'); } \" { eat_until_unescaped('"'); }
' { eat_until_unescaped('\''); } ' { eat_until_unescaped('\''); }
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ } (0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}> { \<[[:alnum:]_!./?-]+ {
if (strcmp(yytext, "<!--") == 0) { if (strcmp(yytext, "<!--") == 0) {
BEGIN(xml_comment); BEGIN(xml_comment);
} else { } else {
@@ -93,8 +93,8 @@
return 1; return 1;
} }
} }
<sgml>[[:alnum:]_]+=/\" { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; } <sgml>[[:alnum:]_]+=\" { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('"'); return 1; }
<sgml>[[:alnum:]_]+=/' { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; } <sgml>[[:alnum:]_]+=' { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('\''); return 1; }
<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; } <sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
<sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; } <sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
<sgml>\> { BEGIN(INITIAL); } <sgml>\> { BEGIN(INITIAL); }