Merge pull request #1537 from github/drop-samples.json

Ignore samples.json
This commit is contained in:
Arfon Smith
2014-09-18 14:30:15 -05:00
9 changed files with 28 additions and 74021 deletions

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ Gemfile.lock
.bundle/
vendor/
benchmark/
lib/linguist/samples.json

View File

@@ -2,8 +2,6 @@ before_install:
- git fetch origin master:master
- git fetch origin v2.0.0:v2.0.0
- sudo apt-get install libicu-dev -y
before_script:
- bundle exec rake samples
rvm:
- 1.9.3
- 2.0.0

View File

@@ -102,10 +102,6 @@ We try to only add languages once they have some usage on GitHub, so please note
Almost all bug fixes or new language additions should come with some additional code samples. Just drop them under [`samples/`](https://github.com/github/linguist/tree/master/samples) in the correct subdirectory and our test suite will automatically test them. In most cases you shouldn't need to add any new assertions.
To update the `samples.json` after adding new files to [`samples/`](https://github.com/github/linguist/tree/master/samples):
bundle exec rake samples
### A note on language extensions
Linguist has a number of methods available to it for identifying the language of a particular file. The initial lookup is based upon the extension of the file; possible file extensions are defined in an array called `extensions`. Take a look at this example for `Perl`:

View File

@@ -8,6 +8,16 @@ task :default => :test
Rake::TestTask.new
# Extend test task to check for samples
task :test => :check_samples
desc "Check that we have samples.json generated"
task :check_samples do
unless File.exist?('lib/linguist/samples.json')
Rake::Task[:samples].invoke
end
end
task :samples do
require 'linguist/samples'
require 'yajl'
@@ -16,7 +26,7 @@ task :samples do
File.open('lib/linguist/samples.json', 'w') { |io| io.write json }
end
task :build_gem do
task :build_gem => :samples do
languages = YAML.load_file("lib/linguist/languages.yml")
File.write("lib/linguist/languages.json", JSON.dump(languages))
`gem build github-linguist.gemspec`
@@ -99,7 +109,7 @@ namespace :classifier do
next if file_language.nil? || file_language == 'Text'
begin
data = open(file_url).read
guessed_language, score = Linguist::Classifier.classify(Linguist::Samples::DATA, data).first
guessed_language, score = Linguist::Classifier.classify(Linguist::Samples.cache, data).first
total += 1
guessed_language == file_language ? correct += 1 : incorrect += 1

View File

@@ -136,7 +136,7 @@ module Linguist
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
determined.first
# Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names).first
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
Language[classified[0]]
end
@@ -510,9 +510,9 @@ module Linguist
end
end
extensions = Samples::DATA['extnames']
interpreters = Samples::DATA['interpreters']
filenames = Samples::DATA['filenames']
extensions = Samples.cache['extnames']
interpreters = Samples.cache['interpreters']
filenames = Samples.cache['filenames']
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
languages_yml = File.expand_path("../languages.yml", __FILE__)

File diff suppressed because it is too large Load Diff

View File

@@ -17,9 +17,11 @@ module Linguist
PATH = File.expand_path('../samples.json', __FILE__)
# Hash of serialized samples object
if File.exist?(PATH)
serializer = defined?(JSON) ? JSON : YAML
DATA = serializer.load(File.read(PATH))
def self.cache
@cache ||= begin
serializer = defined?(JSON) ? JSON : YAML
serializer.load(File.read(PATH))
end
end
# Public: Iterate over each sample.

View File

@@ -44,12 +44,12 @@ class TestClassifier < Test::Unit::TestCase
end
def test_instance_classify_empty
results = Classifier.classify(Samples::DATA, "")
results = Classifier.classify(Samples.cache, "")
assert results.first[1] < 0.5, results.first.inspect
end
def test_instance_classify_nil
assert_equal [], Classifier.classify(Samples::DATA, nil)
assert_equal [], Classifier.classify(Samples.cache, nil)
end
def test_classify_ambiguous_languages
@@ -58,7 +58,7 @@ class TestClassifier < Test::Unit::TestCase
languages = Language.find_by_filename(sample[:path]).map(&:name)
next unless languages.length > 1
results = Classifier.classify(Samples::DATA, File.read(sample[:path]), languages)
results = Classifier.classify(Samples.cache, File.read(sample[:path]), languages)
assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}"
end
end

View File

@@ -8,7 +8,7 @@ class TestSamples < Test::Unit::TestCase
include Linguist
def test_up_to_date
assert serialized = Samples::DATA
assert serialized = Samples.cache
assert latest = Samples.data
# Just warn, it shouldn't scare people off by breaking the build.
@@ -29,7 +29,7 @@ class TestSamples < Test::Unit::TestCase
end
def test_verify
assert data = Samples::DATA
assert data = Samples.cache
assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c }
assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c }
@@ -38,7 +38,7 @@ class TestSamples < Test::Unit::TestCase
# Check that there aren't samples with extensions that aren't explicitly defined in languages.yml
def test_parity
extensions = Samples::DATA['extnames']
extensions = Samples.cache['extnames']
languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__)
languages = YAML.load_file(languages_yml)