From dab75f6f97940362ca1fe05a00efc8ede415bf8b Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Wed, 10 Sep 2014 15:47:44 -0500 Subject: [PATCH] Rework benchmarking script to avoid git operations $ git checkout master $ bundle exec rake benchmark:generate CORPUS=~/Downloads/samples-9 wrote benchmark/results/samples-9-8cdb8ed4.json $ git checkout branch-name $ bx rake benchmark:generate CORPUS=~/Downloads/samples-9 wrote benchmark/results/samples-9-8d8020dd.json $ bx rake benchmark:compare REFERENCE=benchmark/results/samples-9-8cdb8ed4.json CANDIDATE=benchmark/results/samples-9-8d8020dd.json LanguageA changed from 95.9% to 0.0% LanguageB changed from 4.0% to 99.9% --- Rakefile | 170 +++++++++++++------------------------------------------ 1 file changed, 39 insertions(+), 131 deletions(-) diff --git a/Rakefile b/Rakefile index 53f5e924..4d52d197 100644 --- a/Rakefile +++ b/Rakefile @@ -23,156 +23,57 @@ task :build_gem do File.delete("lib/linguist/languages.json") end -# Want to do: rake benchmark:compare refs=20154eb04...6ed0a05b4 -# If output for 20154eb04 or 6ed0a05b4 doesn't exist then should throw error -# Classification outputs for each commit need to be generated before the comparison can be done -# With something like: rake benchmark:generate ref=20154eb04 - namespace :benchmark do - require 'git' benchmark_path = "benchmark/results" - git = Git.open('.') - - desc "Compare outputs" - task :compare do - reference, compare = ENV['refs'].split('...') - puts "Comparing #{reference}...#{compare}" - - # Abort if there are uncommitted changes - abort("Uncommitted changes -- aborting") if git.status.changed.any? - - [reference, compare].each do |ref| - abort("No output file for #{ref}, run 'rake benchmark:generate ref=#{ref}'") unless File.exist?("#{benchmark_path}/#{ref}.json") - end - end - - desc "Generate classification summary for given ref" + # $ rake benchmark:generate CORPUS=path/to/samples + desc "Generate results for" task :generate do - ref = ENV['ref'] - abort("Must specify a commit ref, e.g. 'rake benchmark:generate ref=08819f82'") unless ref - abort("Unstaged changes - aborting") if git.status.changed.any? - - # Get the current branch - # Would like to get this from the Git gem - current_branch = `git rev-parse --abbrev-ref HEAD`.strip - - puts "Checking out #{ref}" - git.checkout(ref) - - # RUN BENCHMARK - # Go through benchmark/samples/LANG dirs - # For each Language - - Rake::Task["benchmark:index"].execute(:commit => ref) - - # Checkout original branch - git.checkout(current_branch) - end - - desc "Build benchmark index" - task :index, [:commit] do |t, args| + ref = `git rev-parse HEAD`.strip[0,8] + corpus = File.expand_path(ENV["CORPUS"] || "samples") require 'linguist/language' + results = Hash.new - languages = Dir.glob('benchmark/samples/*') - - languages.each do |lang| - puts "" - puts "Starting with #{lang}" - results[lang] = {} - files = Dir.glob("#{lang}/*") - files.each do |file| - next unless File.file?(file) - puts " #{file}" - - blob = Linguist::FileBlob.new(file, Dir.pwd) - result = blob.language - - filename = File.basename(file) - if result.nil? # No results - results[lang][filename] = "No language" - else - results[lang][filename] = result.name - end - end + Dir.glob("#{corpus}/**/*").each do |file| + next unless File.file?(file) + filename = file.gsub("#{corpus}/", "") + results[filename] = Linguist::FileBlob.new(file).language end - File.open("benchmark/results/#{args[:commit]}.json", "w") {|f| f.write(results.to_json) } + # Ensure results directory exists + FileUtils.mkdir_p("benchmark/results") + + # Write results + result_filename = "benchmark/results/#{File.basename(corpus)}-#{ref}.json" + File.write(result_filename, results.to_json) + puts "wrote #{result_filename}" end + # $ rake benchmark:compare REFERENCE=path/to/reference.json CANDIDATE=path/to/candidate.json desc "Compare results" - task :results do - # Deep diffing - require './lib/linguist/diff' + task :compare do + reference_file = ENV["REFERENCE"] + candidate_file = ENV["CANDIDATE"] - reference, compare = ENV['refs'].split('...') + reference = JSON.parse(File.read(reference_file)) + reference_counts = Hash.new(0) + reference.each { |filename, language| reference_counts[language] += 1 } - reference_classifications_file = "benchmark/results/#{reference}.json" - compare_classifications_file = "benchmark/results/#{compare}.json" + candidate = JSON.parse(File.read(candidate_file)) + candidate_counts = Hash.new(0) + candidate.each { |filename, language| candidate_counts[language] += 1 } - # DO COMPARISON... - abort("No result files to compare") unless (File.exist?(reference_classifications_file) && File.exist?(compare_classifications_file)) - reference_classifications = JSON.parse(File.read(reference_classifications_file)) - compare_classifications = JSON.parse(File.read(compare_classifications_file)) + changes = diff(reference_counts, candidate_counts) - # Check if samples don't match current classification - puts "" - puts "Potential misclassifications for #{reference}" - reference_classifications.each do |lang, files| - language_name = lang.split('/').last - - files.each do |name, classification| - # FIXME Don't want to report stuff from these dirs for now - next if ['Binary', 'Text'].include?(language_name) - unless classification == language_name - puts " #{name} is classified as #{classification} but #{language_name} was expected" - end - end - end - - # Check if samples don't match current classification - # TODO DRY this up. - puts "" - puts "Potential misclassifications for #{compare}" - compare_classifications.each do |lang, files| - language_name = lang.split('/').last - - files.each do |name, classification| - # FIXME Don't want to report stuff from these dirs for now - next if ['Binary', 'Text'].include?(language_name) - unless classification == language_name - puts " #{name} is classified as #{classification} but #{language_name} was expected" - end - end - end - - puts "" - puts "Changes between #{reference}...#{compare}" - changes = reference_classifications.deep_diff(compare_classifications) - - # Are there any differences in the linguist classification? if changes.any? - changes.each do |lang, files| - previous_count = reference_classifications[lang].size - - # Count the number of changed classifications (language and number) - summary = changes[lang].inject(Hash.new(0)) do |result, (key, val)| - new_lang = val.last - result[new_lang] += 1 - result - end - - puts "#{lang}" - - # Work out the percentage change - summary.each do |new_lang, count| - percent = count / previous_count.to_f - puts " #{sprintf("%.2f", percent)}% change to #{new_lang} (count files)" - end + changes.each do |language, (before, after)| + before_percent = 100 * before / reference.size.to_f + after_percent = 100 * after / candidate.size.to_f + puts "%s changed from %.1f%% to %.1f%%" % [language || 'unknown', before_percent, after_percent] end else - puts " No changes" + puts "No changes" end end end @@ -226,3 +127,10 @@ namespace :classifier do end end end + + +def diff(a, b) + (a.keys | b.keys).each_with_object({}) do |key, diff| + diff[key] = [a[key], b[key]] unless a[key] == b[key] + end +end