Replace the tokenizer with a flex-based scanner (#3846)

* Lex everything except SGML, multiline, SHEBANG * Prepend SHEBANG#! to tokens * Support SGML tag/attribute extraction * Multiline comments * WIP cont'd; productionifying * Compile before test * Add extension to gemspec * Add flex task to build lexer * Reentrant extra data storage * regenerate lexer * use prefix * rebuild lexer on linux * Optimise a number of operations: * Don't read and split the entire file if we only ever use the first/last n lines * Only consider the first 50KiB when using heuristics/classifying. This can save a *lot* of time; running a large number of regexes over 1MiB of text takes a while. * Memoize File.size/read/stat; re-reading in a 500KiB file every time `data` is called adds up a lot. * Use single regex for C++ * act like #lines * [1][-2..-1] => nil, ffs * k may not be set
2026-07-27 20:11:59 +00:00 · 2017-10-31 11:06:56 +11:00
parent 21babbceb1
commit 99eaf5faf9
15 changed files with 8914 additions and 202 deletions
@@ -0,0 +1,3 @@
+# Build configuration for the native `linguist` extension.
+require 'mkmf'
+# Let the builder point at custom locations via mkmf's --with-linguist-* options.
+dir_config('linguist')
+# Write the Makefile for the extension target 'linguist/linguist'.
+create_makefile('linguist/linguist')
@@ -0,0 +1,353 @@
+#ifndef linguist_yyHEADER_H
+#define linguist_yyHEADER_H 1
+#define linguist_yyIN_HEADER 1
+
+#line 6 "lex.linguist_yy.h"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 39
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types. 
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif	/* defined (__STDC__) */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+   are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+/* Per-buffer bookkeeping for the flex-generated scanner. Callers hold it
+ * through the YY_BUFFER_STATE pointer typedef. This file is generated by
+ * flex; prefer regenerating over editing by hand.
+ */
+struct yy_buffer_state
+	{
+	/* NOTE(review): presumably the stream the buffer was created from
+	 * (see linguist_yy_create_buffer) - confirm against the scanner source.
+	 */
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	yy_size_t yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+    
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	/* NOTE(review): status code for this buffer; its possible values are
+	 * defined in the generated scanner source, which is not visible here.
+	 */
+	int yy_buffer_status;
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
+void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void linguist_yypop_buffer_state (yyscan_t yyscanner );
+
+YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
+
+void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
+void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
+void linguist_yyfree (void * ,yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+#define sgml 1
+#define c_comment 2
+#define xml_comment 3
+#define haskell_comment 4
+#define ocaml_comment 5
+#define python_dcomment 6
+#define python_scomment 7
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#define YY_EXTRA_TYPE struct tokenizer_extra *
+
+int linguist_yylex_init (yyscan_t* scanner);
+
+int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int linguist_yylex_destroy (yyscan_t yyscanner );
+
+int linguist_yyget_debug (yyscan_t yyscanner );
+
+void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
+
+void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *linguist_yyget_in (yyscan_t yyscanner );
+
+void linguist_yyset_in  (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *linguist_yyget_out (yyscan_t yyscanner );
+
+void linguist_yyset_out  (FILE * out_str ,yyscan_t yyscanner );
+
+yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
+
+char *linguist_yyget_text (yyscan_t yyscanner );
+
+int linguist_yyget_lineno (yyscan_t yyscanner );
+
+void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
+
+int linguist_yyget_column  (yyscan_t yyscanner );
+
+void linguist_yyset_column (int column_no ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int linguist_yywrap (yyscan_t yyscanner );
+#else
+extern int linguist_yywrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int linguist_yylex (yyscan_t yyscanner);
+
+#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 117 "tokenizer.l"
+
+
+#line 352 "lex.linguist_yy.h"
+#undef linguist_yyIN_HEADER
+#endif /* linguist_yyHEADER_H */
@@ -0,0 +1,64 @@
+#include "ruby.h"
+#include "linguist.h"
+#include "lex.linguist_yy.h"
+
+/*
+ * End-of-input hook for the generated scanner. Always reports 1, i.e. there
+ * is no further input to continue with once the current buffer is exhausted.
+ */
+int linguist_yywrap(yyscan_t yyscanner) {
+	(void) yyscanner; /* the scanner handle is not needed here */
+	return 1;
+}
+
+/*
+ * Tokenizer#extract_tokens(data) -> Array of String
+ *
+ * Runs the flex scanner over at most the first MAX_SCAN_BYTES bytes of
+ * rb_data and returns the tokens it reports, in order of appearance:
+ *   - REGULAR_TOKEN: the token text as-is;
+ *   - SHEBANG_TOKEN: the text prefixed with "SHEBANG#!";
+ *   - SGML_TOKEN:    the text with ">" appended.
+ *
+ * self    - receiver (unused).
+ * rb_data - the source text; must be a String (TypeError otherwise).
+ *
+ * Token strings handed over by the scanner are heap-allocated by it and are
+ * freed here once copied into a Ruby String. Raises NoMemoryError if the
+ * scanner cannot be created or if it could not allocate a token.
+ */
+static VALUE rb_tokenizer_extract_tokens(VALUE self, VALUE rb_data) {
+	enum { MAX_SCAN_BYTES = 100000 };
+	YY_BUFFER_STATE buf;
+	yyscan_t scanner;
+	struct tokenizer_extra extra;
+	VALUE ary, s;
+	long len;
+	int r;
+
+	Check_Type(rb_data, T_STRING);
+
+	len = RSTRING_LEN(rb_data);
+	if (len > MAX_SCAN_BYTES)
+		len = MAX_SCAN_BYTES;
+
+	/* A non-zero result means no scanner was set up; using the handle
+	 * afterwards would dereference an invalid pointer. */
+	if (linguist_yylex_init_extra(&extra, &scanner) != 0)
+		rb_memerror();
+	buf = linguist_yy_scan_bytes(RSTRING_PTR(rb_data), (yy_size_t) len, scanner);
+
+	ary = rb_ary_new();
+	do {
+		extra.type = NO_ACTION;
+		extra.token = NULL;
+		r = linguist_yylex(scanner);
+
+		if (extra.type != NO_ACTION && extra.token == NULL) {
+			/* The scanner reported a token but its strdup() failed.
+			 * Release the scanner before raising so it is not leaked. */
+			linguist_yy_delete_buffer(buf, scanner);
+			linguist_yylex_destroy(scanner);
+			rb_memerror();
+		}
+
+		switch (extra.type) {
+		case NO_ACTION:
+			break;
+		case REGULAR_TOKEN:
+			rb_ary_push(ary, rb_str_new2(extra.token));
+			break;
+		case SHEBANG_TOKEN:
+			s = rb_str_new2("SHEBANG#!");
+			rb_str_cat2(s, extra.token);
+			rb_ary_push(ary, s);
+			break;
+		case SGML_TOKEN:
+			s = rb_str_new2(extra.token);
+			rb_str_cat2(s, ">");
+			rb_ary_push(ary, s);
+			break;
+		default:
+			/* Unknown kind: drop the token text. */
+			break;
+		}
+		/* Every reported token is owned by us; free(NULL) is a no-op. */
+		free(extra.token);
+	} while (r);
+
+	linguist_yy_delete_buffer(buf, scanner);
+	linguist_yylex_destroy(scanner);
+
+	return ary;
+}
+
+/*
+ * Extension initialiser, exported with default visibility. Defines the
+ * Linguist module, the Linguist::Tokenizer class (superclass Object) and its
+ * one-argument instance method `extract_tokens`.
+ */
+__attribute__((visibility("default"))) void Init_linguist() {
+	VALUE linguist_module = rb_define_module("Linguist");
+	VALUE tokenizer_class =
+		rb_define_class_under(linguist_module, "Tokenizer", rb_cObject);
+
+	rb_define_method(tokenizer_class, "extract_tokens",
+	                 rb_tokenizer_extract_tokens, 1);
+}
@@ -0,0 +1,11 @@
+enum tokenizer_type {
+  NO_ACTION,
+  REGULAR_TOKEN,
+  SHEBANG_TOKEN,
+  SGML_TOKEN,
+};
+
+struct tokenizer_extra {
+  char *token;
+  enum tokenizer_type type;
+};
@@ -0,0 +1,119 @@
+%{
+
+#include "linguist.h"
+
+/*
+ * Publish one scan result through the scanner's extra data: the token text
+ * pointer and its kind. Only usable where yyextra is available, i.e. inside
+ * scanner actions.
+ */
+#define feed_token(tok, typ) \
+  do { \
+    yyextra->type = (typ); \
+    yyextra->token = (tok); \
+  } while (0)
+
+/*
+ * Discard input up to and including the next newline. If the input runs out
+ * first, end the scan via yyterminate().
+ *
+ * Depending on the flex release that generated the scanner, input() reports
+ * end of input as EOF or as 0. Both are treated as end of input here;
+ * checking for EOF alone would loop forever with a scanner whose input()
+ * returns 0. As a consequence a NUL byte in the skipped text also ends the
+ * scan.
+ */
+#define eat_until_eol() do { \
+    int c; \
+    while ((c = input(yyscanner)) != '\n' && c != EOF && c != 0) \
+      ; \
+    if (c == EOF || c == 0) \
+      yyterminate(); \
+  } while (0)
+
+/*
+ * Discard input up to and including the next occurrence of the character q
+ * that is not preceded by a backslash, or up to and including the next
+ * newline, whichever comes first. A backslash consumes the character after
+ * it. If the input runs out first, end the scan via yyterminate().
+ *
+ * Depending on the flex release that generated the scanner, input() reports
+ * end of input as EOF or as 0. Both are treated as end of input here;
+ * checking for EOF alone would loop forever with a scanner whose input()
+ * returns 0. As a consequence a NUL byte in the skipped text also ends the
+ * scan.
+ */
+#define eat_until_unescaped(q) do { \
+    int c; \
+    while ((c = input(yyscanner)) != EOF && c != 0) { \
+      if (c == '\n') \
+        break; \
+      if (c == '\\') { \
+        c = input(yyscanner); \
+        if (c == EOF || c == 0) \
+          yyterminate(); \
+      } else if (c == (q)) \
+        break; \
+    } \
+    if (c == EOF || c == 0) \
+      yyterminate(); \
+  } while (0)
+
+%}
+
+%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
+%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
+
+%%
+
+  /* A #! line that starts its interpreter through env, optionally with
+   * NAME=value assignments in between. Reports the interpreter name as a
+   * SHEBANG_TOKEN and discards the rest of the line. */
+^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
+    /* The interpreter name is the trailing run of non-blank characters.
+     * The pattern accepts a tab as well as a space in front of it, so stop
+     * at either one; searching for a space only would return the wrong
+     * text for a tab-separated line. */
+    const char *off = yytext + yyleng;
+    while (off > yytext && off[-1] != ' ' && off[-1] != '\t')
+      --off;
+    feed_token(strdup(off), SHEBANG_TOKEN);
+    eat_until_eol();
+    return 1;
+  }
+
+  /* A #! line naming its interpreter directly. The text after the last
+   * slash (or the whole match when there is no slash) is reported as a
+   * SHEBANG_TOKEN, unless it is exactly env, in which case nothing is
+   * reported. Either way the rest of the line is discarded. */
+^#![ \t]*[[:alpha:]_\/]+ {
+    const char *slash = strrchr(yytext, '/');
+    const char *name = slash ? slash + 1 : yytext;
+
+    if (strcmp(name, "env") != 0) {
+      feed_token(strdup(name), SHEBANG_TOKEN);
+      eat_until_eol();
+      return 1;
+    }
+    eat_until_eol();
+  }
+
+  /* Single-line comments: optional leading blanks, then one of the markers
+   * (two slashes, two dashes, hash, percent or a double quote) followed by
+   * a space. The rest of the line is discarded. */
+^[ \t]*(\/\/|--|\#|%|\")" ".*   { /* nothing */ }
+
+  /* Openers that switch the scanner into a multi-line comment or
+   * triple-quoted string state. */
+"/*"                              { BEGIN(c_comment); }
+  /* See below for xml_comment start. */
+"{-"                              { BEGIN(haskell_comment); }
+"(*"                              { BEGIN(ocaml_comment); }
+"\"\"\""                          { BEGIN(python_dcomment); }
+"'''"                             { BEGIN(python_scomment); }
+
+  /* Inside any of those states every character, newlines included, is
+   * discarded. The closers below are longer than one character, so flex's
+   * longest-match rule prefers them over this catch-all. */
+<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n { /* nothing */ }
+<c_comment>"*/"                   { BEGIN(INITIAL); }
+<xml_comment>"-->"                { BEGIN(INITIAL); }
+<haskell_comment>"-}"             { BEGIN(INITIAL); }
+<ocaml_comment>"*)"               { BEGIN(INITIAL); }
+<python_dcomment>"\"\"\""         { BEGIN(INITIAL); }
+<python_scomment>"'''"            { BEGIN(INITIAL); }
+
+  /* Empty string literals are dropped. */
+\"\"|''                           { /* nothing */ }
+  /* An opening quote: skip through the matching unescaped quote or the end
+   * of the line, whichever comes first. */
+\"                                { eat_until_unescaped('"'); }
+'                                 { eat_until_unescaped('\''); }
+  /* Numeric literals (hex or decimal, with optional suffix or exponent)
+   * are dropped. */
+(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
+  /* Start of an SGML/XML tag: an opening angle bracket plus the following
+   * run of characters up to a blank or angle bracket. The trailing context
+   * requires a closing bracket, either directly or after a space and up to
+   * 2048 further characters on the same line. */
+\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}>               {
+    if (strcmp(yytext, "<!--") == 0) {
+     /* XML comment opener: skip everything until the closer. */
+     BEGIN(xml_comment);
+    } else {
+      /* Report the tag opener and switch to attribute scanning. */
+      feed_token(strdup(yytext), SGML_TOKEN);
+      BEGIN(sgml);
+      return 1;
+    }
+  }
+  /* Attribute with a quoted value: report the name including the equals
+   * sign, consume the opening quote with input(), then skip the value. */
+<sgml>[[:alnum:]_]+=/\"           { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; }
+<sgml>[[:alnum:]_]+=/'            { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; }
+  /* Attribute with an unquoted value: the copy is cut right after the
+   * equals sign, so only the name and the sign are reported. */
+<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
+  /* Attribute name without a value. */
+<sgml>[[:alnum:]_]+               { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
+  /* A closing bracket ends the tag; anything else inside it is discarded. */
+<sgml>\>                          { BEGIN(INITIAL); }
+<sgml>.|\n                        { /* nothing */ }
+  /* Punctuation: each of these characters is a token of its own. */
+;|\{|\}|\(|\)|\[|\]               { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
+  /* Runs of word-like characters. Slash and star belong to the class, so a
+   * run can begin with a C comment opener; that case is handled inside. */
+[[:alnum:]_.@#/*]+                {
+    if (strncmp(yytext, "/*", 2) == 0) {
+      if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
+        /* The run is a complete comment (opener and closer): drop it. */
+        /* nothing */
+      } else {
+        /* Comment opener without a closer in this run: skip until it ends. */
+        BEGIN(c_comment);
+      }
+    } else {
+      feed_token(strdup(yytext), REGULAR_TOKEN);
+      return 1;
+    }
+  }
+  /* Operators are reported as tokens. */
+\<\<?|\+|\-|\*|\/|%|&&?|\|\|?     { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
+  /* Anything not matched above is discarded. */
+.|\n                              { /* nothing */ }
+
+%%
+