diff --git a/.gitignore b/.gitignore
index 97ef7367..391e05a0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 Gemfile.lock
 .bundle/
 vendor/
+benchmark/
diff --git a/Rakefile b/Rakefile
index e89e75a9..9c3aa2ee 100644
--- a/Rakefile
+++ b/Rakefile
@@ -2,6 +2,7 @@ require 'json'
 require 'rake/clean'
 require 'rake/testtask'
 require 'yaml'
+require 'pry'
 
 task :default => :test
 
@@ -22,6 +23,73 @@ task :build_gem do
   File.delete("lib/linguist/languages.json")
 end
 
+namespace :benchmark do
+  # Directory (relative to the current directory) where result files are written.
+  benchmark_path = "benchmark/results"
+
+  # $ bundle exec rake benchmark:generate CORPUS=path/to/samples
+  desc "Generate language detection results for a corpus of files"
+  task :generate do
+    ref = `git rev-parse HEAD`.strip[0,8]
+
+    corpus = File.expand_path(ENV["CORPUS"] || "samples")
+
+    require 'linguist/language'
+
+    # Map each file (path relative to the corpus root) to its detected language.
+    results = Hash.new
+    Dir.glob("#{corpus}/**/*").each do |file|
+      next unless File.file?(file)
+      # sub, not gsub: only the leading corpus prefix must be removed.
+      filename = file.sub("#{corpus}/", "")
+      results[filename] = Linguist::FileBlob.new(file).language
+    end
+
+    # Ensure results directory exists
+    FileUtils.mkdir_p(benchmark_path)
+
+    # Tag results produced from a dirty checkout. --porcelain output is empty
+    # for a clean tree and, unlike the human-readable message, is stable
+    # across git versions and locales.
+    suffix = `git status --porcelain`.strip.empty? ? "" : "-unstaged"
+    result_filename = "#{benchmark_path}/#{File.basename(corpus)}-#{ref}#{suffix}.json"
+
+    File.write(result_filename, results.to_json)
+    puts "wrote #{result_filename}"
+  end
+
+  # $ bundle exec rake benchmark:compare REFERENCE=path/to/reference.json CANDIDATE=path/to/candidate.json
+  desc "Compare results"
+  task :compare do
+    reference_file = ENV["REFERENCE"]
+    candidate_file = ENV["CANDIDATE"]
+    unless reference_file && candidate_file
+      abort "usage: rake benchmark:compare REFERENCE=path/to/reference.json CANDIDATE=path/to/candidate.json"
+    end
+
+    reference = JSON.parse(File.read(reference_file))
+    reference_counts = Hash.new(0)
+    reference.each_value { |language| reference_counts[language] += 1 }
+
+    candidate = JSON.parse(File.read(candidate_file))
+    candidate_counts = Hash.new(0)
+    candidate.each_value { |language| candidate_counts[language] += 1 }
+
+    changes = diff(reference_counts, candidate_counts)
+
+    if changes.any?
+      # Report each language's share of its own result set, before and after.
+      changes.each do |language, (before, after)|
+        before_percent = 100 * before / reference.size.to_f
+        after_percent = 100 * after / candidate.size.to_f
+        puts "%s changed from %.1f%% to %.1f%%" % [language || 'unknown', before_percent, after_percent]
+      end
+    else
+      puts "No changes"
+    end
+  end
+end
+
 namespace :classifier do
   LIMIT = 1_000
 
@@ -71,3 +139,17 @@ namespace :classifier do
     end
   end
 end
+
+
+# Compare two hashes key by key.
+#
+# a - reference Hash
+# b - candidate Hash
+#
+# Returns a Hash mapping each key whose value differs between a and b to
+# [a[key], b[key]]. A key missing from one hash reads as that hash's default.
+def diff(a, b)
+  (a.keys | b.keys).each_with_object({}) do |key, changes|
+    changes[key] = [a[key], b[key]] unless a[key] == b[key]
+  end
+end
diff --git a/github-linguist.gemspec b/github-linguist.gemspec
index 936550f3..382b6cae 100644
--- a/github-linguist.gemspec
+++ b/github-linguist.gemspec
@@ -21,6 +21,7 @@ Gem::Specification.new do |s|
 
   s.add_development_dependency 'json'
   s.add_development_dependency 'mocha'
+  s.add_development_dependency 'pry'
   s.add_development_dependency 'rake'
   s.add_development_dependency 'yajl-ruby'
 end