mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 17:50:22 +00:00
Rework benchmarking script to avoid git operations
$ git checkout master
$ bundle exec rake benchmark:generate CORPUS=~/Downloads/samples-9
wrote benchmark/results/samples-9-8cdb8ed4.json
$ git checkout branch-name
$ bundle exec rake benchmark:generate CORPUS=~/Downloads/samples-9
wrote benchmark/results/samples-9-8d8020dd.json
$ bundle exec rake benchmark:compare \
    REFERENCE=benchmark/results/samples-9-8cdb8ed4.json \
    CANDIDATE=benchmark/results/samples-9-8d8020dd.json
LanguageA changed from 95.9% to 0.0%
LanguageB changed from 4.0% to 99.9%
This commit is contained in:
166
Rakefile
166
Rakefile
@@ -23,153 +23,54 @@ task :build_gem do
|
|||||||
File.delete("lib/linguist/languages.json")
|
File.delete("lib/linguist/languages.json")
|
||||||
end
|
end
|
||||||
|
|
||||||
# Want to do: rake benchmark:compare refs=20154eb04...6ed0a05b4
|
|
||||||
# If output for 20154eb04 or 6ed0a05b4 doesn't exist then should throw error
|
|
||||||
# Classification outputs for each commit need to be generated before the comparison can be done
|
|
||||||
# With something like: rake benchmark:generate ref=20154eb04
|
|
||||||
|
|
||||||
namespace :benchmark do
|
namespace :benchmark do
|
||||||
require 'git'
|
|
||||||
benchmark_path = "benchmark/results"
|
benchmark_path = "benchmark/results"
|
||||||
|
|
||||||
git = Git.open('.')
|
# $ rake benchmark:generate CORPUS=path/to/samples
|
||||||
|
desc "Generate results for"
|
||||||
desc "Compare outputs"
|
|
||||||
task :compare do
|
|
||||||
reference, compare = ENV['refs'].split('...')
|
|
||||||
puts "Comparing #{reference}...#{compare}"
|
|
||||||
|
|
||||||
# Abort if there are uncommitted changes
|
|
||||||
abort("Uncommitted changes -- aborting") if git.status.changed.any?
|
|
||||||
|
|
||||||
[reference, compare].each do |ref|
|
|
||||||
abort("No output file for #{ref}, run 'rake benchmark:generate ref=#{ref}'") unless File.exist?("#{benchmark_path}/#{ref}.json")
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
desc "Generate classification summary for given ref"
|
|
||||||
task :generate do
|
task :generate do
|
||||||
ref = ENV['ref']
|
ref = `git rev-parse HEAD`.strip[0,8]
|
||||||
abort("Must specify a commit ref, e.g. 'rake benchmark:generate ref=08819f82'") unless ref
|
corpus = File.expand_path(ENV["CORPUS"] || "samples")
|
||||||
abort("Unstaged changes - aborting") if git.status.changed.any?
|
|
||||||
|
|
||||||
# Get the current branch
|
|
||||||
# Would like to get this from the Git gem
|
|
||||||
current_branch = `git rev-parse --abbrev-ref HEAD`.strip
|
|
||||||
|
|
||||||
puts "Checking out #{ref}"
|
|
||||||
git.checkout(ref)
|
|
||||||
|
|
||||||
# RUN BENCHMARK
|
|
||||||
# Go through benchmark/samples/LANG dirs
|
|
||||||
# For each Language
|
|
||||||
|
|
||||||
Rake::Task["benchmark:index"].execute(:commit => ref)
|
|
||||||
|
|
||||||
# Checkout original branch
|
|
||||||
git.checkout(current_branch)
|
|
||||||
end
|
|
||||||
|
|
||||||
desc "Build benchmark index"
|
|
||||||
task :index, [:commit] do |t, args|
|
|
||||||
|
|
||||||
require 'linguist/language'
|
require 'linguist/language'
|
||||||
|
|
||||||
results = Hash.new
|
results = Hash.new
|
||||||
languages = Dir.glob('benchmark/samples/*')
|
Dir.glob("#{corpus}/**/*").each do |file|
|
||||||
|
|
||||||
languages.each do |lang|
|
|
||||||
puts ""
|
|
||||||
puts "Starting with #{lang}"
|
|
||||||
results[lang] = {}
|
|
||||||
files = Dir.glob("#{lang}/*")
|
|
||||||
files.each do |file|
|
|
||||||
next unless File.file?(file)
|
next unless File.file?(file)
|
||||||
puts " #{file}"
|
filename = file.gsub("#{corpus}/", "")
|
||||||
|
results[filename] = Linguist::FileBlob.new(file).language
|
||||||
blob = Linguist::FileBlob.new(file, Dir.pwd)
|
|
||||||
result = blob.language
|
|
||||||
|
|
||||||
filename = File.basename(file)
|
|
||||||
if result.nil? # No results
|
|
||||||
results[lang][filename] = "No language"
|
|
||||||
else
|
|
||||||
results[lang][filename] = result.name
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
File.open("benchmark/results/#{args[:commit]}.json", "w") {|f| f.write(results.to_json) }
|
# Ensure results directory exists
|
||||||
|
FileUtils.mkdir_p("benchmark/results")
|
||||||
|
|
||||||
|
# Write results
|
||||||
|
result_filename = "benchmark/results/#{File.basename(corpus)}-#{ref}.json"
|
||||||
|
File.write(result_filename, results.to_json)
|
||||||
|
puts "wrote #{result_filename}"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# $ rake benchmark:compare REFERENCE=path/to/reference.json CANDIDATE=path/to/candidate.json
|
||||||
desc "Compare results"
|
desc "Compare results"
|
||||||
task :results do
|
task :compare do
|
||||||
# Deep diffing
|
reference_file = ENV["REFERENCE"]
|
||||||
require './lib/linguist/diff'
|
candidate_file = ENV["CANDIDATE"]
|
||||||
|
|
||||||
reference, compare = ENV['refs'].split('...')
|
reference = JSON.parse(File.read(reference_file))
|
||||||
|
reference_counts = Hash.new(0)
|
||||||
|
reference.each { |filename, language| reference_counts[language] += 1 }
|
||||||
|
|
||||||
reference_classifications_file = "benchmark/results/#{reference}.json"
|
candidate = JSON.parse(File.read(candidate_file))
|
||||||
compare_classifications_file = "benchmark/results/#{compare}.json"
|
candidate_counts = Hash.new(0)
|
||||||
|
candidate.each { |filename, language| candidate_counts[language] += 1 }
|
||||||
|
|
||||||
# DO COMPARISON...
|
changes = diff(reference_counts, candidate_counts)
|
||||||
abort("No result files to compare") unless (File.exist?(reference_classifications_file) && File.exist?(compare_classifications_file))
|
|
||||||
reference_classifications = JSON.parse(File.read(reference_classifications_file))
|
|
||||||
compare_classifications = JSON.parse(File.read(compare_classifications_file))
|
|
||||||
|
|
||||||
# Check if samples don't match current classification
|
|
||||||
puts ""
|
|
||||||
puts "Potential misclassifications for #{reference}"
|
|
||||||
reference_classifications.each do |lang, files|
|
|
||||||
language_name = lang.split('/').last
|
|
||||||
|
|
||||||
files.each do |name, classification|
|
|
||||||
# FIXME Don't want to report stuff from these dirs for now
|
|
||||||
next if ['Binary', 'Text'].include?(language_name)
|
|
||||||
unless classification == language_name
|
|
||||||
puts " #{name} is classified as #{classification} but #{language_name} was expected"
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Check if samples don't match current classification
|
|
||||||
# TODO DRY this up.
|
|
||||||
puts ""
|
|
||||||
puts "Potential misclassifications for #{compare}"
|
|
||||||
compare_classifications.each do |lang, files|
|
|
||||||
language_name = lang.split('/').last
|
|
||||||
|
|
||||||
files.each do |name, classification|
|
|
||||||
# FIXME Don't want to report stuff from these dirs for now
|
|
||||||
next if ['Binary', 'Text'].include?(language_name)
|
|
||||||
unless classification == language_name
|
|
||||||
puts " #{name} is classified as #{classification} but #{language_name} was expected"
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
puts ""
|
|
||||||
puts "Changes between #{reference}...#{compare}"
|
|
||||||
changes = reference_classifications.deep_diff(compare_classifications)
|
|
||||||
|
|
||||||
# Are there any differences in the linguist classification?
|
|
||||||
if changes.any?
|
if changes.any?
|
||||||
changes.each do |lang, files|
|
changes.each do |language, (before, after)|
|
||||||
previous_count = reference_classifications[lang].size
|
before_percent = 100 * before / reference.size.to_f
|
||||||
|
after_percent = 100 * after / candidate.size.to_f
|
||||||
# Count the number of changed classifications (language and number)
|
puts "%s changed from %.1f%% to %.1f%%" % [language || 'unknown', before_percent, after_percent]
|
||||||
summary = changes[lang].inject(Hash.new(0)) do |result, (key, val)|
|
|
||||||
new_lang = val.last
|
|
||||||
result[new_lang] += 1
|
|
||||||
result
|
|
||||||
end
|
|
||||||
|
|
||||||
puts "#{lang}"
|
|
||||||
|
|
||||||
# Work out the percentage change
|
|
||||||
summary.each do |new_lang, count|
|
|
||||||
percent = count / previous_count.to_f
|
|
||||||
puts " #{sprintf("%.2f", percent)}% change to #{new_lang} (count files)"
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
puts "No changes"
|
puts "No changes"
|
||||||
@@ -226,3 +127,10 @@ namespace :classifier do
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Compute the per-key differences between two hashes.
#
# a - the reference Hash (benchmark:compare passes language => file count).
# b - the candidate Hash with the same shape.
#
# Every key present in either hash is looked up with Hash#[] on both sides, so
# a key missing from one side is reported with that hash's default value
# (0 for the Hash.new(0) counters built by benchmark:compare, nil for a plain
# Hash).
#
# Returns a Hash mapping each key whose values differ to
# [value_in_a, value_in_b]. Keys with equal values are omitted, so an empty
# Hash means there are no changes.
def diff(a, b)
  # The accumulator is named `changes` rather than `diff` so that it does not
  # shadow this method's own name inside the block.
  (a.keys | b.keys).each_with_object({}) do |key, changes|
    changes[key] = [a[key], b[key]] unless a[key] == b[key]
  end
end
|
||||||
|
|||||||
Reference in New Issue
Block a user