require 'json'
require 'rake/clean'
require 'rake/testtask'
require 'yaml'
require 'pry'

task :default => :test

Rake::TestTask.new

# Serializes Linguist::Samples.data as pretty-printed JSON into
# lib/linguist/samples.json.
task :samples do
  require 'linguist/samples'
  require 'yajl'
  data = Linguist::Samples.data
  json = Yajl::Encoder.encode(data, :pretty => true)
  File.open('lib/linguist/samples.json', 'w') { |io| io.write json }
end

# Builds the gem with a temporary JSON dump of languages.yml on disk.
# The JSON file is removed afterwards, even if the build command raises.
task :build_gem do
  languages_json = "lib/linguist/languages.json"
  languages = YAML.load_file("lib/linguist/languages.yml")
  File.write(languages_json, JSON.dump(languages))
  begin
    `gem build github-linguist.gemspec`
  ensure
    File.delete(languages_json)
  end
end

namespace :benchmark do
  require 'git'
  require 'linguist/language'
  require './lib/linguist/diff'

  git = Git.open('.')

  # Reads the two refs to compare from the `compare` environment variable,
  # which must look like "<reference>...<compare>". Aborts with a usage
  # message instead of failing on nil when it is missing or malformed.
  comparison_refs = lambda do
    reference, compare = ENV['compare'].to_s.split('...')
    if [reference, compare].any? { |ref| ref.nil? || ref.empty? }
      abort("Missing refs - pass compare=<reference>...<compare>")
    end
    [reference, compare]
  end

  # Prints every sample whose classification differs from the name of the
  # directory it lives in. `classifications` maps a sample directory path to
  # a hash of { file name => classified language name }.
  report_misclassifications = lambda do |label, classifications|
    puts ""
    puts "Potential misclassifications for #{label}"
    classifications.each do |lang, files|
      language_name = lang.split('/').last
      # FIXME Don't want to report stuff from these dirs for now
      next if ['Binary', 'Text'].include?(language_name)

      files.each do |name, classification|
        next if classification == language_name
        puts " #{name} is classified as #{classification} but #{language_name} was expected"
      end
    end
  end

  # Indexes the samples at two commits (each on its own temporary branch)
  # and then prints the comparison. Expects compare=<reference>...<compare>.
  desc "Testin'"
  task :run do
    reference, compare = comparison_refs.call
    puts "Comparing #{reference}...#{compare}"
    abort("Unstaged changes - aborting") if git.status.changed.any?

    # Get the current branch
    # Would like to get this from the Git gem
    current_branch = `git rev-parse --abbrev-ref HEAD`.strip

    # Only branches that were actually checked out are recorded, so the
    # cleanup below never tries to delete a branch that was not created.
    temp_branches = []

    begin
      # Create tmp branch for reference commit
      puts "Creating branch tmp_#{reference}"
      git.branch("tmp_#{reference}").checkout
      temp_branches << "tmp_#{reference}"
      git.reset_hard(reference)

      # Index benchmark/samples as classified at the reference commit
      Rake::Task["benchmark:index"].execute(:commit => reference)

      # Create tmp branch for compare commit
      puts ""
      puts "Creating temporary branch tmp_#{compare}"
      git.branch("tmp_#{compare}").checkout
      temp_branches << "tmp_#{compare}"
      git.reset_hard(compare)

      # Index again at the compare commit
      Rake::Task["benchmark:index"].execute(:commit => compare)
    ensure
      # Always return to the starting branch and drop the temporary ones,
      # even when indexing fails part-way through.
      git.branch(current_branch).checkout
      temp_branches.uniq.each { |name| git.branch(name).delete }
    end

    # COMPARE AND PRINT RESULTS
    Rake::Task["benchmark:results"].execute
  end

  # Classifies every regular file under benchmark/samples/<dir>/ and writes
  # { dir => { file name => language name } } as JSON to
  # benchmark/results/<commit>_output.json.
  desc "Build benchmark index"
  task :index, [:commit] do |_t, args|
    require 'linguist/language'

    results = {}
    Dir.glob('benchmark/samples/*').each do |lang|
      puts ""
      puts "Starting with #{lang}"
      results[lang] = {}

      Dir.glob("#{lang}/*").each do |file|
        next unless File.file?(file)
        puts " #{file}"

        language = Linguist::FileBlob.new(file, Dir.pwd).language
        # A nil language means the blob could not be classified
        results[lang][File.basename(file)] = language ? language.name : "No language"
      end
    end

    File.write("benchmark/results/#{args[:commit]}_output.json", results.to_json)
  end

  # Loads the two result files named by compare=<reference>...<compare>,
  # reports potential misclassifications in each, then summarizes what
  # changed between them.
  desc "Compare results"
  task :results do
    reference, compare = comparison_refs.call

    reference_classifications_file = "benchmark/results/#{reference}_output.json"
    compare_classifications_file = "benchmark/results/#{compare}_output.json"

    unless File.exist?(reference_classifications_file) && File.exist?(compare_classifications_file)
      abort("No result files to compare")
    end

    reference_classifications = JSON.parse(File.read(reference_classifications_file))
    compare_classifications = JSON.parse(File.read(compare_classifications_file))

    # Check if samples don't match current classification
    report_misclassifications.call(reference, reference_classifications)
    report_misclassifications.call(compare, compare_classifications)

    puts ""
    puts "Changes between #{reference}...#{compare}"
    # deep_diff is provided by ./lib/linguist/diff (not shown here);
    # presumably each changed entry ends with the new classification,
    # which is what `val.last` below relies on - confirm against that file.
    changes = reference_classifications.deep_diff(compare_classifications)

    # Are there any differences in the linguist classification?
    if changes.any?
      changes.each do |lang, files|
        previous_count = reference_classifications[lang].size

        # Count the number of changed classifications per new language
        summary = files.each_with_object(Hash.new(0)) do |(_name, val), counts|
          counts[val.last] += 1
        end

        puts lang

        # Share of this directory's files that moved to each new language,
        # expressed as a real percentage (0-100), plus the raw file count.
        summary.each do |new_lang, count|
          percent = count * 100.0 / previous_count
          puts " #{sprintf("%.2f", percent)}% change to #{new_lang} (#{count} files)"
        end
      end
    else
      puts " No changes"
    end
  end
end

namespace :classifier do
  LIMIT = 1_000

  desc "Run classifier against #{LIMIT} public gists"
  task :test do
    require 'linguist/classifier'
    require 'linguist/samples'

    total, correct, incorrect = 0, 0, 0
    $stdout.sync = true

    each_public_gist do |_gist_url, file_url, file_language|
      next if file_language.nil? || file_language == 'Text'

      begin
        data = open_gist_url(file_url).read
        guessed_language, _score = Linguist::Classifier.classify(Linguist::Samples::DATA, data).first

        total += 1
        guessed_language == file_language ? correct += 1 : incorrect += 1

        # Rewrite the same terminal line with the running tally
        print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100]
        $stdout.flush
      rescue URI::InvalidURIError
        # Skip files whose raw URL cannot be parsed
      else
        break if total >= LIMIT
      end
    end
    puts ""
  end

  # Opens a remote URL through open-uri and returns the IO-like response.
  # Prefers URI.open, because Kernel#open no longer handles URLs on Ruby 3.0+
  # and would treat a string starting with "|" as a command to run. Falls
  # back to Kernel#open on Rubies where URI.open is not available.
  def open_gist_url(url)
    require 'open-uri'
    URI.respond_to?(:open) ? URI.open(url) : open(url)
  end

  # Yields (gist url, raw file url, file language) for every file of every
  # public gist, following the "next" link of each response until there is
  # none left.
  def each_public_gist
    require 'open-uri'
    require 'json'

    url = "https://api.github.com/gists/public"

    while url
      resp = open_gist_url(url)
      # The Link header is absent on the last page or on a single-page result
      link = resp.meta['link']
      url = link && link[/<([^>]+)>; rel="next"/, 1]
      gists = JSON.parse(resp.read)

      gists.each do |gist|
        gist['files'].each do |_filename, attrs|
          yield gist['url'], attrs['raw_url'], attrs['language']
        end
      end
    end
  end
end