Mirror of https://github.com/KevinMidboe/linguist.git (synced 2025-10-29 17:50:22 +00:00)
Compare commits

1 commit

Comparing vmg/empty-...revert-384

| Author | SHA1 | Date |
|---|---|---|
|  | 0698b0f36e |  |
.gitignore (3 lines changed, vendored)
@@ -8,6 +8,3 @@ lib/linguist/samples.json
 /node_modules
 test/fixtures/ace_modes.json
 /vendor/gems/
-/tmp
-*.bundle
-*.so
Rakefile (23 lines changed)
@@ -1,7 +1,6 @@
 require 'bundler/setup'
 require 'rake/clean'
 require 'rake/testtask'
-require 'rake/extensiontask'
 require 'yaml'
 require 'yajl'
 require 'open-uri'
@@ -11,14 +10,8 @@ task :default => :test
 
 Rake::TestTask.new
 
-gem_spec = Gem::Specification.load('github-linguist.gemspec')
-
-Rake::ExtensionTask.new('linguist', gem_spec) do |ext|
-  ext.lib_dir = File.join('lib', 'linguist')
-end
-
 # Extend test task to check for samples and fetch latest Ace modes
-task :test => [:compile, :check_samples, :fetch_ace_modes]
+task :test => [:check_samples, :fetch_ace_modes]
 
 desc "Check that we have samples.json generated"
 task :check_samples do
@@ -41,24 +34,12 @@ task :fetch_ace_modes do
   end
 end
 
-task :samples => :compile do
+task :samples do
   require 'linguist/samples'
   json = Yajl.dump(Linguist::Samples.data, :pretty => true)
   File.write 'lib/linguist/samples.json', json
 end
 
-FLEX_MIN_VER = [2, 5, 39]
-task :flex do
-  if `flex -V` !~ /^flex (\d+)\.(\d+)\.(\d+)/
-    fail "flex not detected"
-  end
-  maj, min, rev = $1.to_i, $2.to_i, $3.to_i
-  if maj < FLEX_MIN_VER[0] || (maj == FLEX_MIN_VER[0] && (min < FLEX_MIN_VER[1] || (min == FLEX_MIN_VER[1] && rev < FLEX_MIN_VER[2])))
-    fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER.join(".")}"
-  end
-  system "cd ext/linguist && flex tokenizer.l"
-end
-
 task :build_gem => :samples do
   rm_rf "grammars"
   sh "script/convert-grammars"
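As an aside, the removed :flex guard compares version components by hand; Ruby's Array#<=> performs the same lexicographic comparison more directly. A minimal standalone sketch (not part of this diff):

FLEX_MIN_VER = [2, 5, 39]

# Returns true when the `flex -V` output reports at least FLEX_MIN_VER.
def flex_new_enough?(version_output)
  return false unless version_output =~ /^flex (\d+)\.(\d+)\.(\d+)/
  # Array#<=> compares element-wise, so [2, 6, 0] sorts after [2, 5, 39].
  ([$1.to_i, $2.to_i, $3.to_i] <=> FLEX_MIN_VER) >= 0
end

flex_new_enough?("flex 2.6.4")  # => true
flex_new_enough?("flex 2.5.35") # => false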
ext/linguist/extconf.rb (deleted)
@@ -1,3 +0,0 @@
require 'mkmf'
dir_config('linguist')
create_makefile('linguist/linguist')
File diff suppressed because it is too large
ext/linguist/lex.linguist_yy.h (deleted)
@@ -1,353 +0,0 @@
#ifndef linguist_yyHEADER_H
#define linguist_yyHEADER_H 1
#define linguist_yyIN_HEADER 1

#line 6 "lex.linguist_yy.h"

#define  YY_INT_ALIGNED short int

/* A lexical scanner generated by flex */

#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
#define YY_FLEX_SUBMINOR_VERSION 39
#if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA
#endif

/* First, we deal with  platform-specific or compiler-specific issues. */

/* begin standard C headers. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>

/* end standard C headers. */

/* flex integer type definitions */

#ifndef FLEXINT_H
#define FLEXINT_H

/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L

/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
 * if you want the limit (max/min) macros for int types.
 */
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS 1
#endif

#include <inttypes.h>
typedef int8_t flex_int8_t;
typedef uint8_t flex_uint8_t;
typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t;
#else
typedef signed char flex_int8_t;
typedef short int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t;

/* Limits of integral types. */
#ifndef INT8_MIN
#define INT8_MIN               (-128)
#endif
#ifndef INT16_MIN
#define INT16_MIN              (-32767-1)
#endif
#ifndef INT32_MIN
#define INT32_MIN              (-2147483647-1)
#endif
#ifndef INT8_MAX
#define INT8_MAX               (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX              (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX              (2147483647)
#endif
#ifndef UINT8_MAX
#define UINT8_MAX              (255U)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX             (65535U)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX             (4294967295U)
#endif

#endif /* ! C99 */

#endif /* ! FLEXINT_H */

#ifdef __cplusplus

/* The "const" storage-class-modifier is valid. */
#define YY_USE_CONST

#else	/* ! __cplusplus */

/* C99 requires __STDC__ to be defined as 1. */
#if defined (__STDC__)

#define YY_USE_CONST

#endif	/* defined (__STDC__) */
#endif	/* ! __cplusplus */

#ifdef YY_USE_CONST
#define yyconst const
#else
#define yyconst
#endif

/* An opaque pointer. */
#ifndef YY_TYPEDEF_YY_SCANNER_T
#define YY_TYPEDEF_YY_SCANNER_T
typedef void* yyscan_t;
#endif

/* For convenience, these vars (plus the bison vars far below)
   are macros in the reentrant scanner. */
#define yyin yyg->yyin_r
#define yyout yyg->yyout_r
#define yyextra yyg->yyextra_r
#define yyleng yyg->yyleng_r
#define yytext yyg->yytext_r
#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
#define yy_flex_debug yyg->yy_flex_debug_r

/* Size of default input buffer. */
#ifndef YY_BUF_SIZE
#ifdef __ia64__
/* On IA-64, the buffer size is 16k, not 8k.
 * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
 * Ditto for the __ia64__ case accordingly.
 */
#define YY_BUF_SIZE 32768
#else
#define YY_BUF_SIZE 16384
#endif /* __ia64__ */
#endif

#ifndef YY_TYPEDEF_YY_BUFFER_STATE
#define YY_TYPEDEF_YY_BUFFER_STATE
typedef struct yy_buffer_state *YY_BUFFER_STATE;
#endif

#ifndef YY_TYPEDEF_YY_SIZE_T
#define YY_TYPEDEF_YY_SIZE_T
typedef size_t yy_size_t;
#endif

#ifndef YY_STRUCT_YY_BUFFER_STATE
#define YY_STRUCT_YY_BUFFER_STATE
struct yy_buffer_state
	{
	FILE *yy_input_file;

	char *yy_ch_buf;		/* input buffer */
	char *yy_buf_pos;		/* current position in input buffer */

	/* Size of input buffer in bytes, not including room for EOB
	 * characters.
	 */
	yy_size_t yy_buf_size;

	/* Number of characters read into yy_ch_buf, not including EOB
	 * characters.
	 */
	yy_size_t yy_n_chars;

	/* Whether we "own" the buffer - i.e., we know we created it,
	 * and can realloc() it to grow it, and should free() it to
	 * delete it.
	 */
	int yy_is_our_buffer;

	/* Whether this is an "interactive" input source; if so, and
	 * if we're using stdio for input, then we want to use getc()
	 * instead of fread(), to make sure we stop fetching input after
	 * each newline.
	 */
	int yy_is_interactive;

	/* Whether we're considered to be at the beginning of a line.
	 * If so, '^' rules will be active on the next match, otherwise
	 * not.
	 */
	int yy_at_bol;

    int yy_bs_lineno; /**< The line count. */
    int yy_bs_column; /**< The column count. */

	/* Whether to try to fill the input buffer when we reach the
	 * end of it.
	 */
	int yy_fill_buffer;

	int yy_buffer_status;

	};
#endif /* !YY_STRUCT_YY_BUFFER_STATE */

void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
void linguist_yypop_buffer_state (yyscan_t yyscanner );

YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );

void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
void linguist_yyfree (void * ,yyscan_t yyscanner );

/* Begin user sect3 */

#define yytext_ptr yytext_r

#ifdef YY_HEADER_EXPORT_START_CONDITIONS
#define INITIAL 0
#define sgml 1
#define c_comment 2
#define xml_comment 3
#define haskell_comment 4
#define ocaml_comment 5
#define python_dcomment 6
#define python_scomment 7

#endif

#ifndef YY_NO_UNISTD_H
/* Special case for "unistd.h", since it is non-ANSI. We include it way
 * down here because we want the user's section 1 to have been scanned first.
 * The user has a chance to override it with an option.
 */
#include <unistd.h>
#endif

#define YY_EXTRA_TYPE struct tokenizer_extra *

int linguist_yylex_init (yyscan_t* scanner);

int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);

/* Accessor methods to globals.
   These are made visible to non-reentrant scanners for convenience. */

int linguist_yylex_destroy (yyscan_t yyscanner );

int linguist_yyget_debug (yyscan_t yyscanner );

void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );

YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );

void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );

FILE *linguist_yyget_in (yyscan_t yyscanner );

void linguist_yyset_in  (FILE * in_str ,yyscan_t yyscanner );

FILE *linguist_yyget_out (yyscan_t yyscanner );

void linguist_yyset_out  (FILE * out_str ,yyscan_t yyscanner );

yy_size_t linguist_yyget_leng (yyscan_t yyscanner );

char *linguist_yyget_text (yyscan_t yyscanner );

int linguist_yyget_lineno (yyscan_t yyscanner );

void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );

int linguist_yyget_column  (yyscan_t yyscanner );

void linguist_yyset_column (int column_no ,yyscan_t yyscanner );

/* Macros after this point can all be overridden by user definitions in
 * section 1.
 */

#ifndef YY_SKIP_YYWRAP
#ifdef __cplusplus
extern "C" int linguist_yywrap (yyscan_t yyscanner );
#else
extern int linguist_yywrap (yyscan_t yyscanner );
#endif
#endif

#ifndef yytext_ptr
static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
#endif

#ifdef YY_NEED_STRLEN
static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
#endif

#ifndef YY_NO_INPUT

#endif

/* Amount of stuff to slurp up with each read. */
#ifndef YY_READ_BUF_SIZE
#ifdef __ia64__
/* On IA-64, the buffer size is 16k, not 8k */
#define YY_READ_BUF_SIZE 16384
#else
#define YY_READ_BUF_SIZE 8192
#endif /* __ia64__ */
#endif

/* Number of entries by which start-condition stack grows. */
#ifndef YY_START_STACK_INCR
#define YY_START_STACK_INCR 25
#endif

/* Default declaration of generated scanner - a define so the user can
 * easily add parameters.
 */
#ifndef YY_DECL
#define YY_DECL_IS_OURS 1

extern int linguist_yylex (yyscan_t yyscanner);

#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
#endif /* !YY_DECL */

/* yy_get_previous_state - get the state just before the EOB char was reached */

#undef YY_NEW_FILE
#undef YY_FLUSH_BUFFER
#undef yy_set_bol
#undef yy_new_buffer
#undef yy_set_interactive
#undef YY_DO_BEFORE_ACTION

#ifdef YY_DECL_IS_OURS
#undef YY_DECL_IS_OURS
#undef YY_DECL
#endif

#line 117 "tokenizer.l"


#line 352 "lex.linguist_yy.h"
#undef linguist_yyIN_HEADER
#endif /* linguist_yyHEADER_H */
ext/linguist/linguist.c (deleted)
@@ -1,64 +0,0 @@
#include "ruby.h"
#include "linguist.h"
#include "lex.linguist_yy.h"

int linguist_yywrap(yyscan_t yyscanner) {
	return 1;
}

static VALUE rb_tokenizer_extract_tokens(VALUE self, VALUE rb_data) {
	YY_BUFFER_STATE buf;
	yyscan_t scanner;
	struct tokenizer_extra extra;
	VALUE ary, s;
	long len;
	int r;

	Check_Type(rb_data, T_STRING);

	len = RSTRING_LEN(rb_data);
	if (len > 100000)
		len = 100000;

	linguist_yylex_init_extra(&extra, &scanner);
	buf = linguist_yy_scan_bytes(RSTRING_PTR(rb_data), (int) len, scanner);

	ary = rb_ary_new();
	do {
		extra.type = NO_ACTION;
		extra.token = NULL;
		r = linguist_yylex(scanner);
		switch (extra.type) {
		case NO_ACTION:
			break;
		case REGULAR_TOKEN:
			rb_ary_push(ary, rb_str_new2(extra.token));
			free(extra.token);
			break;
		case SHEBANG_TOKEN:
			s = rb_str_new2("SHEBANG#!");
			rb_str_cat2(s, extra.token);
			rb_ary_push(ary, s);
			free(extra.token);
			break;
		case SGML_TOKEN:
			s = rb_str_new2(extra.token);
			rb_str_cat2(s, ">");
			rb_ary_push(ary, s);
			free(extra.token);
			break;
		}
	} while (r);

	linguist_yy_delete_buffer(buf, scanner);
	linguist_yylex_destroy(scanner);

	return ary;
}

__attribute__((visibility("default"))) void Init_linguist() {
	VALUE rb_mLinguist = rb_define_module("Linguist");
	VALUE rb_cTokenizer = rb_define_class_under(rb_mLinguist, "Tokenizer", rb_cObject);

	rb_define_method(rb_cTokenizer, "extract_tokens", rb_tokenizer_extract_tokens, 1);
}
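Before this revert, Init_linguist above registered the scanner as Linguist::Tokenizer#extract_tokens, capping input at 100,000 bytes and prefixing shebang tokens with "SHEBANG#!". A rough sketch of how the binding was called (illustrative input; the output shape is inferred from the C switch above):

require 'linguist/tokenizer'

Linguist::Tokenizer.new.extract_tokens("#!/usr/bin/env ruby\nputs 'hi'")
# => ["SHEBANG#!ruby", "puts"]   (the string literal is skipped by the lexer)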
ext/linguist/linguist.h (deleted)
@@ -1,11 +0,0 @@
enum tokenizer_type {
  NO_ACTION,
  REGULAR_TOKEN,
  SHEBANG_TOKEN,
  SGML_TOKEN,
};

struct tokenizer_extra {
  char *token;
  enum tokenizer_type type;
};
ext/linguist/tokenizer.l (deleted)
@@ -1,119 +0,0 @@
%{

#include "linguist.h"

#define feed_token(tok, typ) do { \
    yyextra->token = (tok); \
    yyextra->type = (typ); \
  } while (0)

#define eat_until_eol() do { \
    int c; \
    while ((c = input(yyscanner)) != '\n' && c != EOF); \
    if (c == EOF) \
      yyterminate(); \
  } while (0)

#define eat_until_unescaped(q) do { \
    int c; \
    while ((c = input(yyscanner)) != EOF) { \
      if (c == '\n') \
        break; \
      if (c == '\\') { \
        c = input(yyscanner); \
        if (c == EOF) \
          yyterminate(); \
      } else if (c == q) \
        break; \
    } \
    if (c == EOF) \
      yyterminate(); \
  } while (0)

%}

%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment

%%

^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
    const char *off = strrchr(yytext, ' ');
    if (!off)
      off = yytext;
    else
      ++off;
    feed_token(strdup(off), SHEBANG_TOKEN);
    eat_until_eol();
    return 1;
  }

^#![ \t]*[[:alpha:]_\/]+ {
    const char *off = strrchr(yytext, '/');
    if (!off)
      off = yytext;
    else
      ++off;
    if (strcmp(off, "env") == 0) {
      eat_until_eol();
    } else {
      feed_token(strdup(off), SHEBANG_TOKEN);
      eat_until_eol();
      return 1;
    }
  }

^[ \t]*(\/\/|--|\#|%|\")" ".*   { /* nothing */ }

"/*"                              { BEGIN(c_comment); }
  /* See below for xml_comment start. */
"{-"                              { BEGIN(haskell_comment); }
"(*"                              { BEGIN(ocaml_comment); }
"\"\"\""                          { BEGIN(python_dcomment); }
"'''"                             { BEGIN(python_scomment); }

<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n { /* nothing */ }
<c_comment>"*/"                   { BEGIN(INITIAL); }
<xml_comment>"-->"                { BEGIN(INITIAL); }
<haskell_comment>"-}"             { BEGIN(INITIAL); }
<ocaml_comment>"*)"               { BEGIN(INITIAL); }
<python_dcomment>"\"\"\""         { BEGIN(INITIAL); }
<python_scomment>"'''"            { BEGIN(INITIAL); }

\"\"|''                           { /* nothing */ }
\"                                { eat_until_unescaped('"'); }
'                                 { eat_until_unescaped('\''); }
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}>               {
    if (strcmp(yytext, "<!--") == 0) {
     BEGIN(xml_comment);
    } else {
      feed_token(strdup(yytext), SGML_TOKEN);
      BEGIN(sgml);
      return 1;
    }
  }
<sgml>[[:alnum:]_]+=/\"           { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; }
<sgml>[[:alnum:]_]+=/'            { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; }
<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
<sgml>[[:alnum:]_]+               { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
<sgml>\>                          { BEGIN(INITIAL); }
<sgml>.|\n                        { /* nothing */ }
;|\{|\}|\(|\)|\[|\]               { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
[[:alnum:]_.@#/*]+                {
    if (strncmp(yytext, "/*", 2) == 0) {
      if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
        /* nothing */
      } else {
        BEGIN(c_comment);
      }
    } else {
      feed_token(strdup(yytext), REGULAR_TOKEN);
      return 1;
    }
  }
\<\<?|\+|\-|\*|\/|%|&&?|\|\|?     { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
.|\n                              { /* nothing */ }

%%
github-linguist.gemspec
@@ -10,9 +10,8 @@ Gem::Specification.new do |s|
   s.homepage = "https://github.com/github/linguist"
   s.license  = "MIT"
 
-  s.files = Dir['lib/**/*'] + Dir['ext/**/*'] + Dir['grammars/*'] + ['LICENSE']
+  s.files = Dir['lib/**/*'] + Dir['grammars/*'] + ['LICENSE']
   s.executables = ['linguist', 'git-linguist']
-  s.extensions = ['ext/linguist/extconf.rb']
 
   s.add_dependency 'charlock_holmes', '~> 0.7.5'
   s.add_dependency 'escape_utils',    '~> 1.1.0'
@@ -20,7 +19,6 @@ Gem::Specification.new do |s|
   s.add_dependency 'rugged',          '>= 0.25.1'
 
   s.add_development_dependency 'minitest', '>= 5.0'
-  s.add_development_dependency 'rake-compiler', '~> 0.9'
   s.add_development_dependency 'mocha'
   s.add_development_dependency 'plist', '~>3.1'
   s.add_development_dependency 'pry'
lib/linguist/blob_helper.rb
@@ -275,8 +275,10 @@ module Linguist
           # also--importantly--without having to duplicate many (potentially
           # large) strings.
           begin
-            data.split(encoded_newlines_re, -1)
+            encoded_newlines = ["\r\n", "\r", "\n"].
+              map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) }
+
+            data.split(Regexp.union(encoded_newlines), -1)
           rescue Encoding::ConverterNotFoundError
             # The data is not splittable in the detected encoding.  Assume it's
             # one big line.
@@ -287,51 +289,6 @@ module Linguist
         end
     end
 
-    def encoded_newlines_re
-      @encoded_newlines_re ||= Regexp.union(["\r\n", "\r", "\n"].
-                                              map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
-
-    end
-
-    def first_lines(n)
-      return lines[0...n] if defined? @lines
-      return [] unless viewable? && data
-
-      i, c = 0, 0
-      while c < n && j = data.index(encoded_newlines_re, i)
-        i = j + $&.length
-        c += 1
-      end
-      data[0...i].split(encoded_newlines_re, -1)
-    end
-
-    def last_lines(n)
-      if defined? @lines
-        if n >= @lines.length
-          @lines
-        else
-          lines[-n..-1]
-        end
-      end
-      return [] unless viewable? && data
-
-      no_eol = true
-      i, c = data.length, 0
-      k = i
-      while c < n && j = data.rindex(encoded_newlines_re, i - 1)
-        if c == 0 && j + $&.length == i
-          no_eol = false
-          n += 1
-        end
-        i = j
-        k = j + $&.length
-        c += 1
-      end
-      r = data[k..-1].split(encoded_newlines_re, -1)
-      r.pop if !no_eol
-      r
-    end
-
     # Public: Get number of lines of code
     #
     # Requires Blob#data
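The replacement splits on a Regexp.union of the three newline encodings; union preserves order, so "\r\n" is tried before "\r" and a CRLF counts as one separator. A quick sketch of the behavior (plain ASCII data assumed):

encoded_newlines = ["\r\n", "\r", "\n"]

"a\r\nb\rc\nd".split(Regexp.union(encoded_newlines), -1)
# => ["a", "b", "c", "d"]

# The -1 limit keeps trailing empty fields, so a final newline is preserved:
"x\n".split(Regexp.union(encoded_newlines), -1)
# => ["x", ""]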
lib/linguist/classifier.rb
@@ -3,8 +3,6 @@ require 'linguist/tokenizer'
 module Linguist
   # Language bayesian classifier.
   class Classifier
-    CLASSIFIER_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use the classifier to detect language of the blob.
     #
     # blob               - An object that quacks like a blob.
@@ -19,7 +17,7 @@ module Linguist
     # Returns an Array of Language objects, most probable first.
     def self.call(blob, possible_languages)
       language_names = possible_languages.map(&:name)
-      classify(Samples.cache, blob.data[0...CLASSIFIER_CONSIDER_BYTES], language_names).map do |name, _|
+      classify(Samples.cache, blob.data, language_names).map do |name, _|
         Language[name] # Return the actual Language objects
       end
     end
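With the byte cap gone, Classifier.call now feeds the blob's entire data to classify. A sketch of invoking it; the Struct stand-in for a blob is an assumption for illustration, not part of the API:

require 'linguist'

FakeBlob = Struct.new(:name, :data)  # anything that "quacks like a blob"

blob = FakeBlob.new("hello.rb", "puts 'Hello, world'\n")
candidates = [Linguist::Language["Ruby"], Linguist::Language["Python"]]

# Returns the candidate Language objects, most probable first.
Linguist::Classifier.call(blob, candidates)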
lib/linguist/file_blob.rb
@@ -23,21 +23,21 @@ module Linguist
     #
     # Returns a String like '100644'
     def mode
-      @mode ||= File.stat(@fullpath).mode.to_s(8)
+      File.stat(@fullpath).mode.to_s(8)
     end
 
     # Public: Read file contents.
     #
     # Returns a String.
     def data
-      @data ||= File.read(@fullpath)
+      File.read(@fullpath)
     end
 
     # Public: Get byte size
     #
     # Returns an Integer.
     def size
-      @size ||= File.size(@fullpath)
+      File.size(@fullpath)
     end
   end
 end
lib/linguist/heuristics.rb
@@ -1,8 +1,6 @@
 module Linguist
   # A collection of simple heuristics that can be used to better analyze languages.
   class Heuristics
-    HEURISTICS_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use heuristics to detect language of the blob.
     #
     # blob               - An object that quacks like a blob.
@@ -16,7 +14,7 @@ module Linguist
     #
     # Returns an Array of languages, or empty if none matched or were inconclusive.
     def self.call(blob, candidates)
-      data = blob.data[0...HEURISTICS_CONSIDER_BYTES]
+      data = blob.data
 
       @heuristics.each do |heuristic|
         if heuristic.matches?(blob.name, candidates)
@@ -74,14 +72,6 @@ module Linguist
 
     # Common heuristics
     ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
-    CPlusPlusRegex = Regexp.union(
-        /^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/,
-        /^\s*template\s*</,
-        /^[ \t]*try/,
-        /^[ \t]*catch\s*\(/,
-        /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/,
-        /^[ \t]*(private|public|protected):$/,
-        /std::\w+/)
 
     disambiguate ".as" do |data|
       if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data)
@@ -229,7 +219,8 @@ module Linguist
     disambiguate ".h" do |data|
       if ObjectiveCRegex.match(data)
         Language["Objective-C"]
-      elsif CPlusPlusRegex.match(data)
+      elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
+        /^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
         Language["C++"]
       end
     end
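The inlined .h disambiguation can be exercised directly; in this sketch the Struct blob stub is an assumption for illustration:

require 'linguist'

HeaderBlob = Struct.new(:name, :data)

blob = HeaderBlob.new("list.h", "#include <vector>\nstd::vector<int> v;\n")
candidates = [Linguist::Language["C"], Linguist::Language["C++"], Linguist::Language["Objective-C"]]

# /std::\w+/ (among others) matches, so the header resolves to C++.
Linguist::Heuristics.call(blob, candidates)
# => [Linguist::Language["C++"]]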
lib/linguist/strategy/modeline.rb
@@ -109,8 +109,8 @@ module Linguist
       # Returns an Array with one Language if the blob has a Vim or Emacs modeline
       # that matches a Language name or alias. Returns an empty array if no match.
       def self.call(blob, _ = nil)
-        header = blob.first_lines(SEARCH_SCOPE).join("\n")
-        footer = blob.last_lines(SEARCH_SCOPE).join("\n")
+        header = blob.lines.first(SEARCH_SCOPE).join("\n")
+        footer = blob.lines.last(SEARCH_SCOPE).join("\n")
         Array(Language.find_by_alias(modeline(header + footer)))
       end
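After this change the strategy reads blob.lines directly instead of the removed first_lines/last_lines helpers. A sketch with a stub blob (the Struct is an illustrative assumption):

require 'linguist'

ModelineBlob = Struct.new(:lines)

blob = ModelineBlob.new(["# vim: set filetype=ruby:", "puts 1"])

# The Vim modeline resolves through Language.find_by_alias("ruby").
Linguist::Strategy::Modeline.call(blob)
# => [Linguist::Language["Ruby"]]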
lib/linguist/tokenizer.rb
@@ -1,5 +1,4 @@
 require 'strscan'
-require 'linguist/linguist'
 
 module Linguist
   # Generic programming language tokenizer.
@@ -16,5 +15,191 @@ module Linguist
     def self.tokenize(data)
       new.extract_tokens(data)
     end
+
+    # Read up to 100KB
+    BYTE_LIMIT = 100_000
+
+    # Start state on token, ignore anything till the next newline
+    SINGLE_LINE_COMMENTS = [
+      '//', # C
+      '--', # Ada, Haskell, AppleScript
+      '#',  # Ruby
+      '%',  # Tex
+      '"',  # Vim
+    ]
+
+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
+    MULTI_LINE_COMMENTS = [
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)'],    # Coq
+      ['"""', '"""'],  # Python
+      ["'''", "'''"]   # Python
+    ]
+
+    START_SINGLE_LINE_COMMENT =  Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "\s*#{Regexp.escape(c)} "
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT =  Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0])
+    }.join("|"))
+
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
+    def extract_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+
+        # Single line comment
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
+          s.skip_until(/\n|\Z/)
+
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          # tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          # tokens << close_token
+
+        # Skip single or double quoted strings
+        elsif s.scan(/"/)
+          if s.peek(1) == "\""
+            s.getch
+          else
+            s.skip_until(/(?<!\\)"/)
+          end
+        elsif s.scan(/'/)
+          if s.peek(1) == "'"
+            s.getch
+          else
+            s.skip_until(/(?<!\\)'/)
+          end
+
+        # Skip number literals
+        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
+
+        # SGML style brackets
+        elsif token = s.scan(/<[^\s<>][^<>]*>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
+        # Common programming punctuation
+        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
+          tokens << token
+
+        # Regular token
+        elsif token = s.scan(/[\w\.@#\/\*]+/)
+          tokens << token
+
+        # Common operators
+        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+          tokens << token
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
+
+    # Internal: Extract normalized shebang command token.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    #   extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
+    #   # => "awk"
+    #
+    # Returns String token or nil it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        if script == 'env'
+          s.scan(/\s+/)
+          s.scan(/.*=[^\s]+\s+/)
+          script = s.scan(/\S+/)
+        end
+        script = script[/[^\d]+/, 0] if script
+        return script
+      end
+
+      nil
+    end
+
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href="]
+    #
+    # Returns Array of token Strings.
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        # Emit start token
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        # Emit attributes with trailing =
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          # Then skip over attribute value
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        # Emit lone attributes
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        # Stop at the end of the tag
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
   end
 end
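The restored pure-Ruby tokenizer behaves as its inline docs describe; a brief sketch using the documented examples:

require 'linguist/tokenizer'

Linguist::Tokenizer.tokenize("printf('Hello')")
# => ["printf", "(", ")"]   (the string literal is skipped)

Linguist::Tokenizer.new.extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
# => "awk"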