mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 09:40:21 +00:00
Both JSON and Yajl were listed as dependencies. Pygments.rb already requires yajl, so let's just use that instead of using both.
148 lines
4.0 KiB
Ruby
148 lines
4.0 KiB
Ruby
require 'rake/clean'
|
|
require 'rake/testtask'
|
|
require 'yaml'
|
|
require 'yajl'
|
|
|
|
task :default => :test
|
|
|
|
Rake::TestTask.new
|
|
|
|
# Extend test task to check for samples
|
|
task :test => :check_samples
|
|
|
|
desc "Check that we have samples.json generated"
|
|
task :check_samples do
|
|
unless File.exist?('lib/linguist/samples.json')
|
|
Rake::Task[:samples].invoke
|
|
end
|
|
end
|
|
|
|
task :samples do
|
|
require 'linguist/samples'
|
|
json = Yajl.dump(Linguist::Samples.data, :pretty => true)
|
|
File.write 'lib/linguist/samples.json', json
|
|
end
|
|
|
|
task :build_gem => :samples do
|
|
languages = YAML.load_file("lib/linguist/languages.yml")
|
|
File.write("lib/linguist/languages.json", Yajl.dump(languages))
|
|
`gem build github-linguist.gemspec`
|
|
File.delete("lib/linguist/languages.json")
|
|
end
|
|
|
|
namespace :benchmark do
|
|
benchmark_path = "benchmark/results"
|
|
|
|
# $ bundle exec rake benchmark:generate CORPUS=path/to/samples
|
|
desc "Generate results for"
|
|
task :generate do
|
|
ref = `git rev-parse HEAD`.strip[0,8]
|
|
|
|
corpus = File.expand_path(ENV["CORPUS"] || "samples")
|
|
|
|
require 'linguist/language'
|
|
|
|
results = Hash.new
|
|
Dir.glob("#{corpus}/**/*").each do |file|
|
|
next unless File.file?(file)
|
|
filename = file.gsub("#{corpus}/", "")
|
|
results[filename] = Linguist::FileBlob.new(file).language
|
|
end
|
|
|
|
# Ensure results directory exists
|
|
FileUtils.mkdir_p("benchmark/results")
|
|
|
|
# Write results
|
|
if `git status`.include?('working directory clean')
|
|
result_filename = "benchmark/results/#{File.basename(corpus)}-#{ref}.json"
|
|
else
|
|
result_filename = "benchmark/results/#{File.basename(corpus)}-#{ref}-unstaged.json"
|
|
end
|
|
|
|
File.write(result_filename, results.to_json)
|
|
puts "wrote #{result_filename}"
|
|
end
|
|
|
|
# $ bundle exec rake benchmark:compare REFERENCE=path/to/reference.json CANDIDATE=path/to/candidate.json
|
|
desc "Compare results"
|
|
task :compare do
|
|
reference_file = ENV["REFERENCE"]
|
|
candidate_file = ENV["CANDIDATE"]
|
|
|
|
reference = Yajl.load(File.read(reference_file))
|
|
reference_counts = Hash.new(0)
|
|
reference.each { |filename, language| reference_counts[language] += 1 }
|
|
|
|
candidate = Yajl.load(File.read(candidate_file))
|
|
candidate_counts = Hash.new(0)
|
|
candidate.each { |filename, language| candidate_counts[language] += 1 }
|
|
|
|
changes = diff(reference_counts, candidate_counts)
|
|
|
|
if changes.any?
|
|
changes.each do |language, (before, after)|
|
|
before_percent = 100 * before / reference.size.to_f
|
|
after_percent = 100 * after / candidate.size.to_f
|
|
puts "%s changed from %.1f%% to %.1f%%" % [language || 'unknown', before_percent, after_percent]
|
|
end
|
|
else
|
|
puts "No changes"
|
|
end
|
|
end
|
|
end
|
|
|
|
namespace :classifier do
|
|
LIMIT = 1_000
|
|
|
|
desc "Run classifier against #{LIMIT} public gists"
|
|
task :test do
|
|
require 'linguist/classifier'
|
|
require 'linguist/samples'
|
|
|
|
total, correct, incorrect = 0, 0, 0
|
|
$stdout.sync = true
|
|
|
|
each_public_gist do |gist_url, file_url, file_language|
|
|
next if file_language.nil? || file_language == 'Text'
|
|
begin
|
|
data = open(file_url).read
|
|
guessed_language, score = Linguist::Classifier.classify(Linguist::Samples.cache, data).first
|
|
|
|
total += 1
|
|
guessed_language == file_language ? correct += 1 : incorrect += 1
|
|
|
|
print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100]
|
|
$stdout.flush
|
|
rescue URI::InvalidURIError
|
|
else
|
|
break if total >= LIMIT
|
|
end
|
|
end
|
|
puts ""
|
|
end
|
|
|
|
def each_public_gist
|
|
require 'open-uri'
|
|
url = "https://api.github.com/gists/public"
|
|
|
|
loop do
|
|
resp = open(url)
|
|
url = resp.meta['link'][/<([^>]+)>; rel="next"/, 1]
|
|
gists = Yajl.load(resp.read)
|
|
|
|
for gist in gists
|
|
for filename, attrs in gist['files']
|
|
yield gist['url'], attrs['raw_url'], attrs['language']
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
|
|
def diff(a, b)
|
|
(a.keys | b.keys).each_with_object({}) do |key, diff|
|
|
diff[key] = [a[key], b[key]] unless a[key] == b[key]
|
|
end
|
|
end
|