Merge pull request #1537 from github/drop-samples.json

Ignore samples.json
This commit is contained in:
Arfon Smith
2014-09-18 14:30:15 -05:00
9 changed files with 28 additions and 74021 deletions

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ Gemfile.lock
.bundle/ .bundle/
vendor/ vendor/
benchmark/ benchmark/
lib/linguist/samples.json

View File

@@ -2,8 +2,6 @@ before_install:
- git fetch origin master:master - git fetch origin master:master
- git fetch origin v2.0.0:v2.0.0 - git fetch origin v2.0.0:v2.0.0
- sudo apt-get install libicu-dev -y - sudo apt-get install libicu-dev -y
before_script:
- bundle exec rake samples
rvm: rvm:
- 1.9.3 - 1.9.3
- 2.0.0 - 2.0.0

View File

@@ -102,10 +102,6 @@ We try to only add languages once they have some usage on GitHub, so please note
Almost all bug fixes or new language additions should come with some additional code samples. Just drop them under [`samples/`](https://github.com/github/linguist/tree/master/samples) in the correct subdirectory and our test suite will automatically test them. In most cases you shouldn't need to add any new assertions. Almost all bug fixes or new language additions should come with some additional code samples. Just drop them under [`samples/`](https://github.com/github/linguist/tree/master/samples) in the correct subdirectory and our test suite will automatically test them. In most cases you shouldn't need to add any new assertions.
To update the `samples.json` after adding new files to [`samples/`](https://github.com/github/linguist/tree/master/samples):
bundle exec rake samples
### A note on language extensions ### A note on language extensions
Linguist has a number of methods available to it for identifying the language of a particular file. The initial lookup is based upon the extension of the file; possible file extensions are defined in an array called `extensions`. Take a look at this example for `Perl`: Linguist has a number of methods available to it for identifying the language of a particular file. The initial lookup is based upon the extension of the file; possible file extensions are defined in an array called `extensions`. Take a look at this example for `Perl`:

View File

@@ -8,6 +8,16 @@ task :default => :test
Rake::TestTask.new Rake::TestTask.new
# Extend test task to check for samples
task :test => :check_samples
desc "Check that we have samples.json generated"
task :check_samples do
unless File.exist?('lib/linguist/samples.json')
Rake::Task[:samples].invoke
end
end
task :samples do task :samples do
require 'linguist/samples' require 'linguist/samples'
require 'yajl' require 'yajl'
@@ -16,7 +26,7 @@ task :samples do
File.open('lib/linguist/samples.json', 'w') { |io| io.write json } File.open('lib/linguist/samples.json', 'w') { |io| io.write json }
end end
task :build_gem do task :build_gem => :samples do
languages = YAML.load_file("lib/linguist/languages.yml") languages = YAML.load_file("lib/linguist/languages.yml")
File.write("lib/linguist/languages.json", JSON.dump(languages)) File.write("lib/linguist/languages.json", JSON.dump(languages))
`gem build github-linguist.gemspec` `gem build github-linguist.gemspec`
@@ -99,7 +109,7 @@ namespace :classifier do
next if file_language.nil? || file_language == 'Text' next if file_language.nil? || file_language == 'Text'
begin begin
data = open(file_url).read data = open(file_url).read
guessed_language, score = Linguist::Classifier.classify(Linguist::Samples::DATA, data).first guessed_language, score = Linguist::Classifier.classify(Linguist::Samples.cache, data).first
total += 1 total += 1
guessed_language == file_language ? correct += 1 : incorrect += 1 guessed_language == file_language ? correct += 1 : incorrect += 1

View File

@@ -136,7 +136,7 @@ module Linguist
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty? elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
determined.first determined.first
# Lastly, fall back to the probabilistic classifier. # Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names).first elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based of the string language name (i.e., first element of `#classify`) # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
Language[classified[0]] Language[classified[0]]
end end
@@ -510,9 +510,9 @@ module Linguist
end end
end end
extensions = Samples::DATA['extnames'] extensions = Samples.cache['extnames']
interpreters = Samples::DATA['interpreters'] interpreters = Samples.cache['interpreters']
filenames = Samples::DATA['filenames'] filenames = Samples.cache['filenames']
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__)) popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
languages_yml = File.expand_path("../languages.yml", __FILE__) languages_yml = File.expand_path("../languages.yml", __FILE__)

File diff suppressed because it is too large Load Diff

View File

@@ -17,9 +17,11 @@ module Linguist
PATH = File.expand_path('../samples.json', __FILE__) PATH = File.expand_path('../samples.json', __FILE__)
# Hash of serialized samples object # Hash of serialized samples object
if File.exist?(PATH) def self.cache
serializer = defined?(JSON) ? JSON : YAML @cache ||= begin
DATA = serializer.load(File.read(PATH)) serializer = defined?(JSON) ? JSON : YAML
serializer.load(File.read(PATH))
end
end end
# Public: Iterate over each sample. # Public: Iterate over each sample.

View File

@@ -44,12 +44,12 @@ class TestClassifier < Test::Unit::TestCase
end end
def test_instance_classify_empty def test_instance_classify_empty
results = Classifier.classify(Samples::DATA, "") results = Classifier.classify(Samples.cache, "")
assert results.first[1] < 0.5, results.first.inspect assert results.first[1] < 0.5, results.first.inspect
end end
def test_instance_classify_nil def test_instance_classify_nil
assert_equal [], Classifier.classify(Samples::DATA, nil) assert_equal [], Classifier.classify(Samples.cache, nil)
end end
def test_classify_ambiguous_languages def test_classify_ambiguous_languages
@@ -58,7 +58,7 @@ class TestClassifier < Test::Unit::TestCase
languages = Language.find_by_filename(sample[:path]).map(&:name) languages = Language.find_by_filename(sample[:path]).map(&:name)
next unless languages.length > 1 next unless languages.length > 1
results = Classifier.classify(Samples::DATA, File.read(sample[:path]), languages) results = Classifier.classify(Samples.cache, File.read(sample[:path]), languages)
assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}" assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}"
end end
end end

View File

@@ -8,7 +8,7 @@ class TestSamples < Test::Unit::TestCase
include Linguist include Linguist
def test_up_to_date def test_up_to_date
assert serialized = Samples::DATA assert serialized = Samples.cache
assert latest = Samples.data assert latest = Samples.data
# Just warn, it shouldn't scare people off by breaking the build. # Just warn, it shouldn't scare people off by breaking the build.
@@ -29,7 +29,7 @@ class TestSamples < Test::Unit::TestCase
end end
def test_verify def test_verify
assert data = Samples::DATA assert data = Samples.cache
assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c } assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c }
assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c } assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c }
@@ -38,7 +38,7 @@ class TestSamples < Test::Unit::TestCase
# Check that there aren't samples with extensions that aren't explicitly defined in languages.yml # Check that there aren't samples with extensions that aren't explicitly defined in languages.yml
def test_parity def test_parity
extensions = Samples::DATA['extnames'] extensions = Samples.cache['extnames']
languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__) languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__)
languages = YAML.load_file(languages_yml) languages = YAML.load_file(languages_yml)