# Files
# linguist/Rakefile
# 2014-07-23 11:30:25 -05:00
#
# 229 lines
# 6.5 KiB
# Ruby

require 'json'
require 'rake/clean'
require 'rake/testtask'
require 'yaml'
require 'pry'
# Running `rake` with no task name runs whatever :default depends on: :test.
task default: :test

# Instantiate Rake's test task with no name and no configuration block.
# Presumably this registers the `test` task that :default depends on -- confirm.
Rake::TestTask.new
# Regenerates lib/linguist/samples.json from Linguist::Samples.data,
# encoded as pretty-printed JSON via Yajl.
task :samples do
  require 'linguist/samples'
  require 'yajl'

  pretty_json = Yajl::Encoder.encode(Linguist::Samples.data, :pretty => true)

  File.open('lib/linguist/samples.json', 'w') do |out|
    out.write(pretty_json)
  end
end
# Builds the gem from github-linguist.gemspec.
#
# lib/linguist/languages.yml is dumped to a temporary languages.json next to it
# for the duration of the build. The JSON file is removed afterwards even when
# the build command raises, and a non-zero exit status from `gem build` aborts
# the task instead of being silently ignored.
task :build_gem do
  json_path = "lib/linguist/languages.json"

  languages = YAML.load_file("lib/linguist/languages.yml")
  File.write(json_path, JSON.dump(languages))

  output = nil
  succeeded = false
  begin
    output = `gem build github-linguist.gemspec`
    succeeded = $?.success?
  ensure
    # The generated JSON must never be left behind in the working tree.
    File.delete(json_path)
  end

  abort("gem build failed:\n#{output}") unless succeeded
end
namespace :benchmark do
# Dependencies shared by the benchmark tasks.
# NOTE(review): these statements sit directly in the namespace block rather than
# inside a task, so they presumably run whenever the Rakefile is loaded (for
# every rake invocation, not only benchmark:*) -- confirm this is intended.
require 'git'
require 'linguist/language'
# Presumably provides Hash#deep_diff, which benchmark:results calls -- verify.
require './lib/linguist/diff'
# Handle on the repository in the current directory; benchmark:run uses it to
# check status, create/checkout/delete branches and hard-reset.
git = Git.open('.')
# Benchmarks two commits against each other.
#
# Reads the commit range from ENV['compare'] in the form
# "<reference>...<compare>". For each side it creates a temporary branch
# "tmp_<ref>", hard-resets it to that commit and executes benchmark:index.
# Afterwards it switches back to the branch that was checked out at the start,
# deletes the temporary branches and executes benchmark:results.
#
# Aborts when the range is missing/malformed or when the working tree has
# changed files. The branch restore and temp-branch cleanup run in an `ensure`
# so a failure while indexing does not leave the repository on a temp branch.
desc "Testin'"
task :run do
  reference, compare = ENV['compare'].to_s.split('...')
  if reference.to_s.empty? || compare.to_s.empty?
    abort("Usage: rake benchmark:run compare=<reference>...<compare>")
  end

  puts "Comparing #{reference}...#{compare}"

  abort("Unstaged changes - aborting") if git.status.changed.any?

  # Get the current branch
  # Would like to get this from the Git gem
  current_branch = `git rev-parse --abbrev-ref HEAD`.strip

  # Names of the temporary branches that were actually checked out, so the
  # cleanup below only touches branches this task created.
  temp_branches = []

  begin
    # Index the reference commit on its own temporary branch.
    reference_branch = "tmp_#{reference}"
    puts "Creating branch #{reference_branch}"
    git.branch(reference_branch).checkout
    temp_branches << reference_branch
    git.reset_hard(reference)
    Rake::Task["benchmark:index"].execute(:commit => reference)

    # Index the compare commit on its own temporary branch.
    compare_branch = "tmp_#{compare}"
    puts ""
    puts "Creating temporary branch #{compare_branch}"
    git.branch(compare_branch).checkout
    temp_branches << compare_branch
    git.reset_hard(compare)
    Rake::Task["benchmark:index"].execute(:commit => compare)
  ensure
    # Always go back to where we started and drop the temp branches.
    # `uniq` avoids deleting the same branch twice when both refs are equal.
    git.branch(current_branch).checkout
    temp_branches.uniq.each { |name| git.branch(name).delete }
  end

  # Compare and print results
  Rake::Task["benchmark:results"].execute
end
# Classifies every regular file found one level below benchmark/samples/<dir>/
# and writes the outcome to benchmark/results/<commit>_output.json.
#
# The JSON maps each sample directory path to a hash of
# file basename => detected language name, or "No language" when
# Linguist::FileBlob#language returns nothing.
#
# args[:commit] - label used in the output file name (benchmark:run passes the
#                 commit ref it is currently indexing).
#
# The output directory is created when missing, which also covers refs that
# contain a "/" and therefore map to a nested path.
desc "Build benchmark index"
task :index, [:commit] do |_task, args|
  require 'fileutils'
  require 'linguist/language'

  results = {}

  Dir.glob('benchmark/samples/*').each do |lang|
    puts ""
    puts "Starting with #{lang}"
    results[lang] = {}

    Dir.glob("#{lang}/*").each do |file|
      next unless File.file?(file)

      puts " #{file}"

      language = Linguist::FileBlob.new(file, Dir.pwd).language
      results[lang][File.basename(file)] = language ? language.name : "No language"
    end
  end

  output_path = "benchmark/results/#{args[:commit]}_output.json"
  FileUtils.mkdir_p(File.dirname(output_path))
  File.open(output_path, "w") { |f| f.write(results.to_json) }
end
# Prints a report comparing two result files written by benchmark:index.
#
# Reads the commit range from ENV['compare'] ("<reference>...<compare>") and
# loads benchmark/results/<ref>_output.json for both sides. The report has
# three parts:
#   1. files of the reference run whose classification differs from the name
#      of the sample directory they live in,
#   2. the same for the compare run,
#   3. per sample directory, how many classifications changed between the two
#      runs, as a percentage of the files the reference run saw there.
#
# Aborts when the range is missing/malformed or either result file is absent.
desc "Compare results"
task :results do
  reference, compare = ENV['compare'].to_s.split('...')
  if reference.to_s.empty? || compare.to_s.empty?
    abort("Usage: rake benchmark:results compare=<reference>...<compare>")
  end

  reference_classifications_file = "benchmark/results/#{reference}_output.json"
  compare_classifications_file = "benchmark/results/#{compare}_output.json"

  unless File.exist?(reference_classifications_file) && File.exist?(compare_classifications_file)
    abort("No result files to compare")
  end

  reference_classifications = JSON.parse(File.read(reference_classifications_file))
  compare_classifications = JSON.parse(File.read(compare_classifications_file))

  # Reports samples whose detected language differs from their directory name.
  # Kept as a lambda so the task does not define an extra global method.
  report_misclassifications = lambda do |label, classifications|
    puts ""
    puts "Potential misclassifications for #{label}"

    classifications.each do |lang, files|
      language_name = lang.split('/').last
      # FIXME Don't want to report stuff from these dirs for now
      next if ['Binary', 'Text'].include?(language_name)

      files.each do |name, classification|
        next if classification == language_name
        puts " #{name} is classified as #{classification} but #{language_name} was expected"
      end
    end
  end

  report_misclassifications.call(reference, reference_classifications)
  report_misclassifications.call(compare, compare_classifications)

  puts ""
  puts "Changes between #{reference}...#{compare}"

  # NOTE(review): deep_diff is not defined in this file; the code below assumes
  # it yields {dir => {file => [old, new]}} -- verify against lib/linguist/diff.
  changes = reference_classifications.deep_diff(compare_classifications)

  # Are there any differences in the linguist classification?
  if changes.any?
    changes.each do |lang, files|
      previous_count = reference_classifications[lang].size

      # Count how many files moved to each new classification.
      summary = files.each_with_object(Hash.new(0)) do |(_name, change), counts|
        counts[change.last] += 1
      end

      puts "#{lang}"

      # Share of the directory's files that changed, as a real percentage
      # (the ratio is scaled by 100 before being printed with a "%" sign).
      summary.each do |new_lang, count|
        percent = count * 100.0 / previous_count
        puts " #{format('%.2f', percent)}% change to #{new_lang} (#{count} files)"
      end
    end
  else
    puts " No changes"
  end
end
end
namespace :classifier do
# Maximum number of gist files to classify in one run.
LIMIT = 1_000

# Downloads files from public gists and checks how often
# Linguist::Classifier's top guess equals the language reported for the file.
# Files with no reported language, or reported as 'Text', are skipped.
# Progress is printed in place as "correct:incorrect percent%".
#
# Files are fetched with URI#read (mixed in by open-uri) rather than
# Kernel#open: the latter no longer opens URLs on Ruby 3+, and on older Rubies
# it would run a command for a string starting with "|".
desc "Run classifier against #{LIMIT} public gists"
task :test do
  require 'linguist/classifier'
  require 'linguist/samples'
  require 'open-uri'

  total, correct, incorrect = 0, 0, 0
  $stdout.sync = true

  each_public_gist do |_gist_url, file_url, file_language|
    next if file_language.nil? || file_language == 'Text'

    begin
      data = URI.parse(file_url).read
      # NOTE(review): the :samples task reads Linguist::Samples.data while this
      # uses Linguist::Samples::DATA -- confirm both are still defined.
      guessed_language, _score = Linguist::Classifier.classify(Linguist::Samples::DATA, data).first

      total += 1
      if guessed_language == file_language
        correct += 1
      else
        incorrect += 1
      end

      print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100]
      $stdout.flush
    rescue URI::InvalidURIError
      # Unparseable raw URL: skip this file without counting it.
    else
      break if total >= LIMIT
    end
  end

  puts ""
end
# Walks the GitHub public gists listing page by page.
#
# Yields, for every file of every gist: the gist's 'url', the file's 'raw_url'
# and the file's 'language' (as found in the parsed JSON).
#
# The next page is taken from the rel="next" entry of the response's Link
# header; iteration stops when there is no such entry (or no Link header).
# Pages are fetched with URI#open from open-uri in block form so each response
# is closed once read; Kernel#open is avoided because it no longer opens URLs
# on Ruby 3+.
def each_public_gist
  require 'open-uri'
  require 'json'

  next_url = "https://api.github.com/gists/public"

  while next_url
    body, link_header = URI.parse(next_url).open do |resp|
      [resp.read, resp.meta['link']]
    end

    # nil.to_s is "", so a missing header simply yields no next page.
    next_url = link_header.to_s[/<([^>]+)>; rel="next"/, 1]

    JSON.parse(body).each do |gist|
      gist['files'].each do |_filename, attrs|
        yield gist['url'], attrs['raw_url'], attrs['language']
      end
    end
  end
end
end