mirror of
https://github.com/KevinMidboe/linguist.git
synced 2026-01-10 11:25:32 +00:00
Replace the tokenizer with a flex-based scanner (#3846)
* Lex everything except SGML, multiline, SHEBANG
* Prepend SHEBANG#! to tokens
* Support SGML tag/attribute extraction
* Multiline comments
* WIP cont'd; productionifying
* Compile before test
* Add extension to gemspec
* Add flex task to build lexer
* Reentrant extra data storage
* regenerate lexer
* use prefix
* rebuild lexer on linux
* Optimise a number of operations:
  * Don't read and split the entire file if we only ever use the first/last n lines
  * Only consider the first 50KiB when using heuristics/classifying. This can save a *lot* of time; running a large number of regexes over 1MiB of text takes a while.
  * Memoize File.size/read/stat; re-reading in a 500KiB file every time `data` is called adds up a lot.
* Use single regex for C++
* act like #lines
* [1][-2..-1] => nil, ffs
* k may not be set
This commit is contained in:
23
Rakefile
23
Rakefile
@@ -1,6 +1,7 @@
|
||||
require 'bundler/setup'
|
||||
require 'rake/clean'
|
||||
require 'rake/testtask'
|
||||
require 'rake/extensiontask'
|
||||
require 'yaml'
|
||||
require 'yajl'
|
||||
require 'open-uri'
|
||||
@@ -10,8 +11,14 @@ task :default => :test
|
||||
|
||||
Rake::TestTask.new
|
||||
|
||||
# Load the gem's specification and register the 'linguist' extension task
# against it, with the task's lib_dir pointed at lib/linguist.
# NOTE(review): presumably this is what provides the :compile task the other
# tasks depend on — confirm against rake-compiler.
gem_spec = Gem::Specification.load('github-linguist.gemspec')

Rake::ExtensionTask.new('linguist', gem_spec) { |extension| extension.lib_dir = File.join('lib', 'linguist') }
|
||||
|
||||
# Extend test task to check for samples and fetch latest Ace modes
|
||||
task :test => [:check_samples, :fetch_ace_modes]
|
||||
task :test => [:compile, :check_samples, :fetch_ace_modes]
|
||||
|
||||
desc "Check that we have samples.json generated"
|
||||
task :check_samples do
|
||||
@@ -34,12 +41,24 @@ task :fetch_ace_modes do
|
||||
end
|
||||
end
|
||||
|
||||
task :samples do
|
||||
task :samples => :compile do
|
||||
require 'linguist/samples'
|
||||
json = Yajl.dump(Linguist::Samples.data, :pretty => true)
|
||||
File.write 'lib/linguist/samples.json', json
|
||||
end
|
||||
|
||||
# Oldest flex release, as [major, minor, revision], accepted by the :flex task.
FLEX_MIN_VER = [2, 5, 39]

# Regenerates the lexer by running flex on ext/linguist/tokenizer.l.
#
# Fails the task when flex cannot be found, when its reported version is older
# than FLEX_MIN_VER, or when the flex run itself exits with an error.
task :flex do
  version_banner =
    begin
      `flex -V`
    rescue Errno::ENOENT
      # Backticks raise when the executable is not on PATH; treat that the same
      # as unrecognisable output so the friendly message below is reported.
      ''
    end

  version_match = version_banner.match(/^flex (\d+)\.(\d+)\.(\d+)/)
  fail "flex not detected" unless version_match

  installed = version_match.captures.map(&:to_i)

  # Array#<=> compares element by element, i.e. major, then minor, then revision.
  if (installed <=> FLEX_MIN_VER) < 0
    fail "building linguist's lexer requires at least flex #{FLEX_MIN_VER.join(".")}"
  end

  # `sh` aborts the task on a non-zero exit status; `system` would silently
  # ignore a failed flex run.
  sh "cd ext/linguist && flex tokenizer.l"
end
|
||||
|
||||
task :build_gem => :samples do
|
||||
rm_rf "grammars"
|
||||
sh "script/convert-grammars"
|
||||
|
||||
Reference in New Issue
Block a user