diff --git a/Rakefile b/Rakefile
index 18c0e3d2..d51b7d34 100644
--- a/Rakefile
+++ b/Rakefile
@@ -17,3 +17,67 @@ end
 CLOBBER.include 'lib/linguist/classifier.yml'
 
 task :classifier => ['lib/linguist/classifier.yml']
+
+namespace :classifier do
+  # Number of gist files to classify before the test run stops.
+  LIMIT = 1_000
+
+  desc "Run classifier against #{LIMIT} public gists"
+  task :test do
+    require 'linguist/classifier'
+    require 'open-uri'
+
+    total, correct, incorrect = 0, 0, 0
+    $stdout.sync = true
+
+    each_public_gist do |_gist_url, file_url, file_language|
+      next if file_language.nil? || file_language == 'Text'
+
+      begin
+        # URI#open (from open-uri) only ever fetches a URL; Kernel#open would
+        # also accept "|cmd" strings, and these URLs come from the network.
+        data = URI.parse(file_url).open { |io| io.read }
+      rescue URI::InvalidURIError, OpenURI::HTTPError
+        next # skip files whose raw URL is malformed or cannot be fetched
+      end
+
+      guessed_language, _score = Linguist::Classifier.instance.classify(data).first
+      next if guessed_language.nil? # classifier produced no candidates
+
+      total += 1
+      if guessed_language.name == file_language
+        correct += 1
+      else
+        incorrect += 1
+      end
+
+      # "\r\e[0K" rewinds and clears the line so the tally updates in place.
+      print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100]
+      break if total >= LIMIT
+    end
+    puts ""
+  end
+
+  # Yields (gist_url, raw_file_url, language) for every file of every public
+  # gist, following the API's rel="next" pagination links until none is left.
+  def each_public_gist
+    require 'open-uri'
+    require 'json'
+
+    url = "https://api.github.com/gists/public"
+
+    while url
+      gists = URI.parse(url).open do |resp|
+        # No Link header, or no rel="next" entry, leaves url nil and ends the loop.
+        url = resp.meta['link'].to_s[/<([^>]+)>; rel="next"/, 1]
+        JSON.parse(resp.read)
+      end
+
+      gists.each do |gist|
+        gist['files'].each do |_filename, attrs|
+          yield gist['url'], attrs['raw_url'], attrs['language']
+        end
+      end
+    end
+  end
+end
diff --git a/linguist.gemspec b/linguist.gemspec
index fbc46003..a48aaefa 100644
--- a/linguist.gemspec
+++ b/linguist.gemspec
@@ -12,5 +12,6 @@ Gem::Specification.new do |s|
   s.add_dependency 'escape_utils', '~> 0.2.3'
   s.add_dependency 'mime-types',   '~> 1.18'
   s.add_dependency 'pygments.rb',  '~> 0.2.11'
+  s.add_development_dependency 'json'
   s.add_development_dependency 'rake'
 end