Files
linguist/Rakefile
Adam Roben 046fb18980 Add github-linguist-grammars gem
The purpose of this gem is to package up the language grammars that are
used for syntax highlighting on github.com. The grammars are TextMate,
Sublime Text, or Atom language grammars, converted to JSON and given the
filename SCOPE.json, where SCOPE is the language scope that the grammar
defines.

The github-linguist-grammars gem packages up all the grammars, and also
exports a Linguist::Grammars.path method to locate the directory
containing the grammars.

To build the gem, simply run `rake build_grammars_gem`. The grammars.yml
file lists all the repositories we download grammars from, as well as
which scopes are defined by each repository. The
script/download-grammars script takes that list and downloads and
processes the grammars into the format expected by the gem.
2014-11-13 11:03:53 -05:00

155 lines
4.1 KiB
Ruby

require 'bundler/setup'
require 'rake/clean'
require 'rake/testtask'
require 'yaml'
require 'yajl'
task :default => :test
Rake::TestTask.new
# Extend test task to check for samples
task :test => :check_samples
desc "Check that we have samples.json generated"
task :check_samples do
unless File.exist?('lib/linguist/samples.json')
Rake::Task[:samples].invoke
end
end
task :samples do
require 'linguist/samples'
json = Yajl.dump(Linguist::Samples.data, :pretty => true)
File.write 'lib/linguist/samples.json', json
end
task :build_gem => :samples do
languages = YAML.load_file("lib/linguist/languages.yml")
File.write("lib/linguist/languages.json", Yajl.dump(languages))
`gem build github-linguist.gemspec`
File.delete("lib/linguist/languages.json")
end
task :build_grammars_gem do
rm_rf "grammars"
sh "script/download-grammars"
sh "gem", "build", "github-linguist-grammars.gemspec"
end
namespace :benchmark do
benchmark_path = "benchmark/results"
# $ bundle exec rake benchmark:generate CORPUS=path/to/samples
desc "Generate results for"
task :generate do
ref = `git rev-parse HEAD`.strip[0,8]
corpus = File.expand_path(ENV["CORPUS"] || "samples")
require 'linguist/language'
results = Hash.new
Dir.glob("#{corpus}/**/*").each do |file|
next unless File.file?(file)
filename = file.gsub("#{corpus}/", "")
results[filename] = Linguist::FileBlob.new(file).language
end
# Ensure results directory exists
FileUtils.mkdir_p("benchmark/results")
# Write results
if `git status`.include?('working directory clean')
result_filename = "benchmark/results/#{File.basename(corpus)}-#{ref}.json"
else
result_filename = "benchmark/results/#{File.basename(corpus)}-#{ref}-unstaged.json"
end
File.write(result_filename, results.to_json)
puts "wrote #{result_filename}"
end
# $ bundle exec rake benchmark:compare REFERENCE=path/to/reference.json CANDIDATE=path/to/candidate.json
desc "Compare results"
task :compare do
reference_file = ENV["REFERENCE"]
candidate_file = ENV["CANDIDATE"]
reference = Yajl.load(File.read(reference_file))
reference_counts = Hash.new(0)
reference.each { |filename, language| reference_counts[language] += 1 }
candidate = Yajl.load(File.read(candidate_file))
candidate_counts = Hash.new(0)
candidate.each { |filename, language| candidate_counts[language] += 1 }
changes = diff(reference_counts, candidate_counts)
if changes.any?
changes.each do |language, (before, after)|
before_percent = 100 * before / reference.size.to_f
after_percent = 100 * after / candidate.size.to_f
puts "%s changed from %.1f%% to %.1f%%" % [language || 'unknown', before_percent, after_percent]
end
else
puts "No changes"
end
end
end
namespace :classifier do
LIMIT = 1_000
desc "Run classifier against #{LIMIT} public gists"
task :test do
require 'linguist/classifier'
require 'linguist/samples'
total, correct, incorrect = 0, 0, 0
$stdout.sync = true
each_public_gist do |gist_url, file_url, file_language|
next if file_language.nil? || file_language == 'Text'
begin
data = open(file_url).read
guessed_language, score = Linguist::Classifier.classify(Linguist::Samples.cache, data).first
total += 1
guessed_language == file_language ? correct += 1 : incorrect += 1
print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100]
$stdout.flush
rescue URI::InvalidURIError
else
break if total >= LIMIT
end
end
puts ""
end
def each_public_gist
require 'open-uri'
url = "https://api.github.com/gists/public"
loop do
resp = open(url)
url = resp.meta['link'][/<([^>]+)>; rel="next"/, 1]
gists = Yajl.load(resp.read)
for gist in gists
for filename, attrs in gist['files']
yield gist['url'], attrs['raw_url'], attrs['language']
end
end
end
end
end
def diff(a, b)
(a.keys | b.keys).each_with_object({}) do |key, diff|
diff[key] = [a[key], b[key]] unless a[key] == b[key]
end
end