Files
linguist/Rakefile
Brandon Keepers 02aeb4f895 Merge remote-tracking branch 'origin/master' into just-yajl
* origin/master: (42 commits)
  its always greener
  that new green shell
  Removing stale extension
  Update README.md
  Add moon interpreter for MoonScript
  Bumping version for 3.4.1 release
  Use text.html.erb scope for HTML+ERB files
  Add sample .dyalog file for file type APL
  Added extra Papyrus sample files.
  Add sample Papyrus script
  Add Papyrus support
  Add LOLCODE support
  Add ProGuard config files to vendored files
  Recognise *.dyalog as APL sources
  Assign a bunch more TextMate scopes
  CI step for samples
  Add .command as a Shell file extension
  CI config
  Vendored gems
  Update cibuild
  ...

Conflicts:
	Rakefile
2014-10-31 18:03:03 -04:00

149 lines
4.0 KiB
Ruby

require 'bundler/setup'
require 'rake/clean'
require 'rake/testtask'
require 'yaml'
require 'yajl'
# Running `rake` with no arguments runs the test suite.
task :default => [:test]

# Defines the :test task.
Rake::TestTask.new

# The test task depends on the samples database being present.
task :test => [:check_samples]

desc "Check that we have samples.json generated"
task :check_samples do
  # Only generate the samples database when it is missing.
  Rake::Task[:samples].invoke unless File.exist?('lib/linguist/samples.json')
end
# Serializes Linguist::Samples.data as pretty-printed JSON into
# lib/linguist/samples.json, overwriting any existing file.
task :samples do
  require 'linguist/samples'

  samples_path = 'lib/linguist/samples.json'
  pretty_json = Yajl.dump(Linguist::Samples.data, :pretty => true)
  File.open(samples_path, 'w') { |io| io.write(pretty_json) }
end
# Builds the gem after regenerating the samples database.
#
# lib/linguist/languages.yml is converted to lib/linguist/languages.json for
# the duration of `gem build` and the JSON file is removed afterwards.
task :build_gem => :samples do
  languages_json = "lib/linguist/languages.json"
  languages = YAML.load_file("lib/linguist/languages.yml")

  begin
    File.write(languages_json, Yajl.dump(languages))
    `gem build github-linguist.gemspec`
  ensure
    # Remove the temporary JSON file even when the write or the build raised,
    # so a failed build does not leave a stale generated file in lib/.
    File.delete(languages_json) if File.exist?(languages_json)
  end
end
namespace :benchmark do
  # Directory that receives the generated result files.
  benchmark_path = "benchmark/results"

  # $ bundle exec rake benchmark:generate CORPUS=path/to/samples
  #
  # Detects the language of every regular file under CORPUS (default:
  # "samples") and writes a { relative_path => language } JSON document to
  # benchmark/results/<corpus basename>-<short sha>[-unstaged].json.
  desc "Generate results for"
  task :generate do
    ref = `git rev-parse HEAD`.strip[0,8]
    corpus = File.expand_path(ENV["CORPUS"] || "samples")

    require 'linguist/language'

    results = {}
    Dir.glob("#{corpus}/**/*").each do |file|
      next unless File.file?(file)

      # Strip only the leading corpus prefix; gsub would also remove any later
      # occurrence of the same path fragment inside the relative path.
      filename = file.sub("#{corpus}/", "")
      results[filename] = Linguist::FileBlob.new(file).language
    end

    # Ensure results directory exists
    FileUtils.mkdir_p(benchmark_path)

    # Write results
    # `git status --porcelain` prints nothing when there is nothing to commit
    # and no untracked files. Matching the human-readable message is fragile:
    # recent Git versions print "working tree clean" rather than
    # "working directory clean".
    clean = `git status --porcelain`.strip.empty?
    suffix = clean ? "" : "-unstaged"
    result_filename = "#{benchmark_path}/#{File.basename(corpus)}-#{ref}#{suffix}.json"

    File.write(result_filename, results.to_json)
    puts "wrote #{result_filename}"
  end

  # $ bundle exec rake benchmark:compare REFERENCE=path/to/reference.json CANDIDATE=path/to/candidate.json
  #
  # Loads two result files and prints, for every language whose file count
  # differs between them, its share of each result set as a percentage.
  desc "Compare results"
  task :compare do
    reference_file = ENV["REFERENCE"]
    candidate_file = ENV["CANDIDATE"]
    unless reference_file && candidate_file
      abort "usage: rake benchmark:compare REFERENCE=path/to/reference.json CANDIDATE=path/to/candidate.json"
    end

    reference = Yajl.load(File.read(reference_file))
    reference_counts = Hash.new(0)
    reference.each_value { |language| reference_counts[language] += 1 }

    candidate = Yajl.load(File.read(candidate_file))
    candidate_counts = Hash.new(0)
    candidate.each_value { |language| candidate_counts[language] += 1 }

    # Share of `count` in `total` as a percentage; 0.0 for an empty result set
    # instead of NaN.
    percent = lambda { |count, total| total.zero? ? 0.0 : 100 * count / total.to_f }

    changes = diff(reference_counts, candidate_counts)

    if changes.any?
      changes.each do |language, (before, after)|
        before_percent = percent.call(before, reference.size)
        after_percent = percent.call(after, candidate.size)
        # A nil language key is reported as 'unknown'.
        puts "%s changed from %.1f%% to %.1f%%" % [language || 'unknown', before_percent, after_percent]
      end
    else
      puts "No changes"
    end
  end
end
namespace :classifier do
  # Number of gist files to classify before the test run stops.
  LIMIT = 1_000

  # Downloads files from public gists, classifies each one with
  # Linguist::Classifier and prints a running "correct:incorrect percent%"
  # tally, using the language reported by the gist API as the expected answer.
  desc "Run classifier against #{LIMIT} public gists"
  task :test do
    require 'open-uri'
    require 'linguist/classifier'
    require 'linguist/samples'

    total, correct, incorrect = 0, 0, 0
    # Unbuffered output so the in-place progress line updates immediately.
    $stdout.sync = true

    each_public_gist do |_gist_url, file_url, file_language|
      # Files without a reported language, or plain text, are not scored.
      next if file_language.nil? || file_language == 'Text'

      begin
        # URI#open instead of Kernel#open: the URL comes from an API response,
        # and Kernel#open would run a value starting with "|" as a shell
        # command. Kernel#open also no longer opens URLs on Ruby 3.
        data = URI.parse(file_url).open { |io| io.read }
        guessed_language, _score = Linguist::Classifier.classify(Linguist::Samples.cache, data).first

        total += 1
        if guessed_language == file_language
          correct += 1
        else
          incorrect += 1
        end

        # "\r\e[0K" returns to column 0 and clears the line before redrawing.
        print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100]
        $stdout.flush
      rescue URI::InvalidURIError
        # Best effort: a file whose raw URL cannot be parsed is skipped.
      else
        break if total >= LIMIT
      end
    end
    puts ""
  end

  # Yields (gist url, raw file url, file language) for every file of every
  # gist returned by the public gists API, following the rel="next" URL from
  # the Link response header until no further page is advertised.
  def each_public_gist
    require 'open-uri'

    url = "https://api.github.com/gists/public"
    while url
      gists = nil
      next_url = nil
      # Block form closes the response once it has been read.
      URI.parse(url).open do |resp|
        # to_s guards against a response that carries no Link header at all.
        next_url = resp.meta['link'].to_s[/<([^>]+)>; rel="next"/, 1]
        gists = Yajl.load(resp.read)
      end

      gists.each do |gist|
        gist['files'].each_value do |attrs|
          yield gist['url'], attrs['raw_url'], attrs['language']
        end
      end

      # nil when there is no next page, which ends the loop.
      url = next_url
    end
  end
end
# Compares two hashes key by key.
#
# a - the "before" Hash.
# b - the "after" Hash.
#
# Returns a Hash mapping every key (from either input) whose values differ to
# a two-element Array [a[key], b[key]]. Lookups go through Hash#[], so each
# hash's default value is used for keys it does not contain.
def diff(a, b)
  changed = {}
  all_keys = a.keys | b.keys
  all_keys.each do |key|
    before = a[key]
    after = b[key]
    next if before == after

    changed[key] = [before, after]
  end
  changed
end