require 'json'
require 'rake/clean'
require 'rake/testtask'
require 'yaml'
require 'pry'

task :default => :test

Rake::TestTask.new

# Regenerate lib/linguist/samples.json from Linguist::Samples.data,
# pretty-printed with Yajl.
task :samples do
  require 'linguist/samples'
  require 'yajl'
  data = Linguist::Samples.data
  json = Yajl::Encoder.encode(data, :pretty => true)
  File.open('lib/linguist/samples.json', 'w') { |io| io.write json }
end

# Build the gem. A JSON dump of languages.yml is written next to it for the
# duration of the build and removed afterwards.
task :build_gem do
  languages = YAML.load_file("lib/linguist/languages.yml")
  File.write("lib/linguist/languages.json", JSON.dump(languages))
  begin
    `gem build github-linguist.gemspec`
  ensure
    # Remove the temporary file even if the build step raised.
    File.delete("lib/linguist/languages.json")
  end
end

# Want to do: rake benchmark:compare refs=20154eb04...6ed0a05b4
# If output for 20154eb04 or 6ed0a05b4 doesn't exist then should throw error
# Classification outputs for each commit need to be generated before the
# comparison can be done.
# With something like: rake benchmark:generate ref=20154eb04
namespace :benchmark do
  require 'git'

  # Directory holding one "<ref>.json" classification summary per benchmarked ref.
  benchmark_path = "benchmark/results"

  git = Git.open('.')

  # Splits ENV['refs'] (expected form "<reference>...<compare>") into its two
  # refs. Aborts with a usage message when the variable is missing or does not
  # contain both refs.
  #
  # task_name - name of the calling task, used only in the usage message.
  #
  # Returns a two-element Array: [reference, compare].
  def benchmark_refs(task_name)
    reference, compare = ENV['refs'].to_s.split('...')
    if reference.to_s.empty? || compare.to_s.empty?
      abort("Must specify two refs, e.g. 'rake benchmark:#{task_name} refs=20154eb04...6ed0a05b4'")
    end
    [reference, compare]
  end

  # Prints every sample whose classification differs from the name of the
  # directory it was found in.
  #
  # ref             - ref the classifications belong to (used in the heading).
  # classifications - Hash of sample directory path => { filename => language name }.
  def report_misclassifications(ref, classifications)
    puts ""
    puts "Potential misclassifications for #{ref}"
    classifications.each do |lang, files|
      language_name = lang.split('/').last
      # FIXME Don't want to report stuff from these dirs for now
      next if ['Binary', 'Text'].include?(language_name)

      files.each do |name, classification|
        next if classification == language_name
        puts " #{name} is classified as #{classification} but #{language_name} was expected"
      end
    end
  end

  desc "Compare outputs"
  task :compare do
    reference, compare = benchmark_refs(:compare)
    puts "Comparing #{reference}...#{compare}"

    # Abort if there are uncommitted changes
    abort("Uncommitted changes -- aborting") if git.status.changed.any?

    [reference, compare].each do |ref|
      abort("No output file for #{ref}, run 'rake benchmark:generate ref=#{ref}'") unless File.exist?("#{benchmark_path}/#{ref}.json")
    end
  end

  desc "Generate classification summary for given ref"
  task :generate do
    ref = ENV['ref']
    abort("Must specify a commit ref, e.g. 'rake benchmark:generate ref=08819f82'") unless ref
    abort("Unstaged changes - aborting") if git.status.changed.any?

    # Get the current branch
    # Would like to get this from the Git gem
    current_branch = `git rev-parse --abbrev-ref HEAD`.strip

    puts "Checking out #{ref}"
    git.checkout(ref)

    begin
      # RUN BENCHMARK
      # Go through benchmark/samples/LANG dirs
      # For each Language
      Rake::Task["benchmark:index"].execute(:commit => ref)
    ensure
      # Checkout original branch, even when the benchmark run failed, so the
      # working tree is not left on the benchmarked ref.
      git.checkout(current_branch)
    end
  end

  desc "Build benchmark index"
  task :index, [:commit] do |_t, args|
    require 'linguist/language'

    # Sample directory path => { filename => detected language name }.
    results = {}

    Dir.glob('benchmark/samples/*').each do |lang|
      puts ""
      puts "Starting with #{lang}"
      results[lang] = {}

      Dir.glob("#{lang}/*").each do |file|
        next unless File.file?(file)
        puts " #{file}"

        blob = Linguist::FileBlob.new(file, Dir.pwd)
        result = blob.language

        # A nil language is recorded with a placeholder string.
        results[lang][File.basename(file)] = result.nil? ? "No language" : result.name
      end
    end

    File.open("#{benchmark_path}/#{args[:commit]}.json", "w") { |f| f.write(results.to_json) }
  end

  desc "Compare results"
  task :results do
    # Deep diffing
    require './lib/linguist/diff'

    reference, compare = benchmark_refs(:results)
    reference_classifications_file = "#{benchmark_path}/#{reference}.json"
    compare_classifications_file = "#{benchmark_path}/#{compare}.json"

    # DO COMPARISON...
    unless File.exist?(reference_classifications_file) && File.exist?(compare_classifications_file)
      abort("No result files to compare")
    end

    reference_classifications = JSON.parse(File.read(reference_classifications_file))
    compare_classifications = JSON.parse(File.read(compare_classifications_file))

    # Check if samples don't match current classification
    report_misclassifications(reference, reference_classifications)
    report_misclassifications(compare, compare_classifications)

    puts ""
    puts "Changes between #{reference}...#{compare}"
    changes = reference_classifications.deep_diff(compare_classifications)

    # Are there any differences in the linguist classification?
    if changes.any?
      # NOTE(review): assumes every changed directory also exists in the
      # reference results and that each diff entry ends with the new
      # classification -- confirm against lib/linguist/diff.
      changes.each do |lang, files|
        previous_count = reference_classifications[lang].size

        # Count the number of changed classifications (language and number)
        summary = files.each_with_object(Hash.new(0)) do |(_name, change), counts|
          counts[change.last] += 1
        end

        puts "#{lang}"

        # Work out the percentage change
        summary.each do |new_lang, count|
          # Scale the fraction to a real percentage; it is printed with a "%" suffix.
          percent = count * 100.0 / previous_count
          puts " #{sprintf("%.2f", percent)}% change to #{new_lang} (#{count} files)"
        end
      end
    else
      puts " No changes"
    end
  end
end

namespace :classifier do
  # Number of classified gist files after which the test run stops.
  LIMIT = 1_000

  desc "Run classifier against #{LIMIT} public gists"
  task :test do
    require 'linguist/classifier'
    require 'linguist/samples'
    require 'open-uri'

    total, correct, incorrect = 0, 0, 0
    $stdout.sync = true

    each_public_gist do |_gist_url, file_url, file_language|
      next if file_language.nil? || file_language == 'Text'

      begin
        # Block form closes the handle once the body has been read.
        data = URI.open(file_url, &:read)
        guessed_language, _score = Linguist::Classifier.classify(Linguist::Samples::DATA, data).first

        total += 1
        if guessed_language == file_language
          correct += 1
        else
          incorrect += 1
        end

        # Rewrite the progress line in place: "correct:incorrect accuracy%".
        print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f / total.to_f) * 100]
        $stdout.flush
      rescue URI::InvalidURIError
        # Files with an unusable raw URL are skipped without being counted.
      else
        break if total >= LIMIT
      end
    end
    puts ""
  end

  # Walks the public gists API page by page and yields once per gist file.
  #
  # Yields the gist URL, the file's raw URL and the file's reported language
  # (taken from the 'url', 'raw_url' and 'language' fields of the response).
  #
  # Iteration ends when a response carries no rel="next" Link header, or
  # earlier if the caller breaks out of the block.
  def each_public_gist
    require 'open-uri'
    require 'json'

    url = "https://api.github.com/gists/public"

    while url
      gists = nil
      URI.open(url) do |resp|
        # The Link header may be absent; to_s keeps the lookup nil-safe and
        # leaves url nil when there is no further page.
        url = resp.meta['link'].to_s[/<([^>]+)>; rel="next"/, 1]
        gists = JSON.parse(resp.read)
      end

      gists.each do |gist|
        gist['files'].each do |_filename, attrs|
          yield gist['url'], attrs['raw_url'], attrs['language']
        end
      end
    end
  end
end