Lexer crash fix (#3900)

* input may return 0 for EOF. Stops overruns into fread from nothing.
* remove two trailing contexts
* fix up sgml tokens
2026-02-11 02:39:34 +00:00 · 2017-11-10 22:11:32 +11:00
parent 0f4955e5d5
commit c9b3d19c6f
4 changed files with 260 additions and 6325 deletions
--- a/7
+++ b/7
@@ -47,15 +47,10 @@ task :samples => :compile do
  File.write 'lib/linguist/samples.json', json
 end
 FLEX_MIN_VER = [2, 5, 39]
 task :flex do
-  if `flex -V` !~ /^flex (\d+)\.(\d+)\.(\d+)/
+  if `flex -V` !~ /^flex \d+\.\d+\.\d+/
    fail "flex not detected"
  end
  maj, min, rev = $1.to_i, $2.to_i, $3.to_i
  if maj < FLEX_MIN_VER[0] || (maj == FLEX_MIN_VER[0] && (min < FLEX_MIN_VER[1] || (min == FLEX_MIN_VER[1] && rev < FLEX_MIN_VER[2])))
    fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER.join(".")}"
  end
  system "cd ext/linguist && flex tokenizer.l"
 end
--- a/ext/linguist/lex.linguist_yy.c
+++ b/ext/linguist/lex.linguist_yy.c
--- a/ext/linguist/lex.linguist_yy.h
+++ b/ext/linguist/lex.linguist_yy.h
@@ -11,7 +11,7 @@
 #define FLEX_SCANNER
 #define YY_FLEX_MAJOR_VERSION 2
 #define YY_FLEX_MINOR_VERSION 5
-#define YY_FLEX_SUBMINOR_VERSION 39
+#define YY_FLEX_SUBMINOR_VERSION 35
 #if YY_FLEX_SUBMINOR_VERSION > 0
 #define FLEX_BETA
 #endif
@@ -49,6 +49,7 @@ typedef int16_t flex_int16_t;
 typedef uint16_t flex_uint16_t;
 typedef int32_t flex_int32_t;
 typedef uint32_t flex_uint32_t;
 typedef uint64_t flex_uint64_t;
 #else
 typedef signed char flex_int8_t;
 typedef short int flex_int16_t;
@@ -56,6 +57,7 @@ typedef int flex_int32_t;
 typedef unsigned char flex_uint8_t; 
 typedef unsigned short int flex_uint16_t;
 typedef unsigned int flex_uint32_t;
 #endif /* ! C99 */
 /* Limits of integral types. */
 #ifndef INT8_MIN
@@ -86,8 +88,6 @@ typedef unsigned int flex_uint32_t;
 #define UINT32_MAX             (4294967295U)
 #endif
 #endif /* ! C99 */
 #endif /* ! FLEXINT_H */
 #ifdef __cplusplus
@@ -130,15 +130,7 @@ typedef void* yyscan_t;
 /* Size of default input buffer. */
 #ifndef YY_BUF_SIZE
-#ifdef __ia64__
-/* On IA-64, the buffer size is 16k, not 8k.
- * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
- * Ditto for the __ia64__ case accordingly.
- */
-#define YY_BUF_SIZE 32768
-#else
 #define YY_BUF_SIZE 16384
-#endif /* __ia64__ */
 #endif
 #ifndef YY_TYPEDEF_YY_BUFFER_STATE
@@ -277,10 +269,6 @@ int linguist_yyget_lineno (yyscan_t yyscanner );
 void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
 int linguist_yyget_column  (yyscan_t yyscanner );
 void linguist_yyset_column (int column_no ,yyscan_t yyscanner );
 /* Macros after this point can all be overridden by user definitions in
 * section 1.
 */
@@ -307,12 +295,7 @@ static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
 /* Amount of stuff to slurp up with each read. */
 #ifndef YY_READ_BUF_SIZE
-#ifdef __ia64__
-/* On IA-64, the buffer size is 16k, not 8k */
-#define YY_READ_BUF_SIZE 16384
-#else
 #define YY_READ_BUF_SIZE 8192
-#endif /* __ia64__ */
 #endif
 /* Number of entries by which start-condition stack grows. */
@@ -345,9 +328,9 @@ extern int linguist_yylex (yyscan_t yyscanner);
 #undef YY_DECL
 #endif
-#line 117 "tokenizer.l"
+#line 118 "tokenizer.l"
-#line 352 "lex.linguist_yy.h"
+#line 335 "lex.linguist_yy.h"
 #undef linguist_yyIN_HEADER
 #endif /* linguist_yyHEADER_H */
--- a/ext/linguist/tokenizer.l
+++ b/ext/linguist/tokenizer.l
@@ -9,25 +9,25 @@
 #define eat_until_eol() do { \
    int c; \
-    while ((c = input(yyscanner)) != '\n' && c != EOF); \
+    while ((c = input(yyscanner)) != '\n' && c != EOF && c); \
-    if (c == EOF) \
+    if (c == EOF || !c) \
-      yyterminate(); \
+      return 0; \
  } while (0)
 #define eat_until_unescaped(q) do { \
    int c; \
-    while ((c = input(yyscanner)) != EOF) { \
+    while ((c = input(yyscanner)) != EOF && c) { \
      if (c == '\n') \
        break; \
      if (c == '\\') { \
        c = input(yyscanner); \
-        if (c == EOF) \
+        if (c == EOF || !c) \
-          yyterminate(); \
+          return 0; \
      } else if (c == q) \
        break; \
    } \
-    if (c == EOF) \
+    if (c == EOF || !c) \
-      yyterminate(); \
+      return 0; \
  } while (0)
 %}
@@ -84,7 +84,7 @@
 \"                                { eat_until_unescaped('"'); }
 '                                 { eat_until_unescaped('\''); }
 (0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
-\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}>               {
+\<[[:alnum:]_!./?-]+              {
    if (strcmp(yytext, "<!--") == 0) {
     BEGIN(xml_comment);
    } else {
@@ -93,8 +93,8 @@
      return 1;
    }
  }
-<sgml>[[:alnum:]_]+=/\"           { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; }
+<sgml>[[:alnum:]_]+=\"            { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('"'); return 1; }
-<sgml>[[:alnum:]_]+=/'            { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; }
+<sgml>[[:alnum:]_]+='             { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('\''); return 1; }
 <sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
 <sgml>[[:alnum:]_]+               { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
 <sgml>\>                          { BEGIN(INITIAL); }