Replace the tokenizer with a flex-based scanner (#3846)

* Lex everything except SGML, multiline, SHEBANG * Prepend SHEBANG#! to tokens * Support SGML tag/attribute extraction * Multiline comments * WIP cont'd; productionifying * Compile before test * Add extension to gemspec * Add flex task to build lexer * Reentrant extra data storage * regenerate lexer * use prefix * rebuild lexer on linux * Optimise a number of operations: * Don't read and split the entire file if we only ever use the first/last n lines * Only consider the first 50KiB when using heuristics/classifying. This can save a *lot* of time; running a large number of regexes over 1MiB of text takes a while. * Memoize File.size/read/stat; re-reading in a 500KiB file every time `data` is called adds up a lot. * Use single regex for C++ * act like #lines * [1][-2..-1] => nil, ffs * k may not be set
2026-07-04 08:41:57 +00:00 · 2017-10-31 11:06:56 +11:00
parent 21babbceb1
commit 99eaf5faf9
15 changed files with 8914 additions and 202 deletions
--- a/23
+++ b/23
@@ -1,6 +1,7 @@
 require 'bundler/setup'
 require 'rake/clean'
 require 'rake/testtask'
+require 'rake/extensiontask'
 require 'yaml'
 require 'yajl'
 require 'open-uri'
@@ -10,8 +11,14 @@ task :default => :test

 Rake::TestTask.new

+gem_spec = Gem::Specification.load('github-linguist.gemspec')
+# Set up Rake::ExtensionTask for the 'linguist' extension from the gemspec, with lib/linguist as lib_dir.
+Rake::ExtensionTask.new('linguist', gem_spec) do |ext|
+  ext.lib_dir = File.join('lib', 'linguist')
+end
+
 # Extend test task to check for samples and fetch latest Ace modes
-task :test => [:check_samples, :fetch_ace_modes]
+task :test => [:compile, :check_samples, :fetch_ace_modes]

 desc "Check that we have samples.json generated"
 task :check_samples do
@@ -34,12 +41,24 @@ task :fetch_ace_modes do
  end
 end

-task :samples do
+task :samples => :compile do
  require 'linguist/samples'
  json = Yajl.dump(Linguist::Samples.data, :pretty => true)
  File.write 'lib/linguist/samples.json', json
 end

+# Regenerate the lexer from ext/linguist/tokenizer.l; needs flex >= FLEX_MIN_VER.
+FLEX_MIN_VER = [2, 5, 39].freeze
+task :flex do
+  # "2>&1" routes the probe through the shell, so a missing flex gives unmatched
+  # output (and the message below) instead of an Errno::ENOENT from the backticks.
+  fail "flex not detected" unless `flex -V 2>&1` =~ /^flex (\d+)\.(\d+)\.(\d+)/
+  if ([$1.to_i, $2.to_i, $3.to_i] <=> FLEX_MIN_VER) < 0 # Array#<=> compares element by element
+    fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER.join(".")}"
+  end
+  sh "cd ext/linguist && flex tokenizer.l" # sh (unlike system) fails the task on a non-zero exit
+end
+
 task :build_gem => :samples do
  rm_rf "grammars"
  sh "script/convert-grammars"