Merge pull request #1537 from github/drop-samples.json

Ignore samples.json
2025-12-28 21:01:00 +00:00 · 2014-09-18 14:30:15 -05:00
parent 29bbf50900 036855072e
commit 950882be78
9 changed files with 28 additions and 74021 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ Gemfile.lock
 .bundle/
 vendor/
 benchmark/
 lib/linguist/samples.json
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,8 +2,6 @@ before_install:
  - git fetch origin master:master
  - git fetch origin v2.0.0:v2.0.0
  - sudo apt-get install libicu-dev -y
 before_script:
  - bundle exec rake samples
 rvm:
  - 1.9.3
  - 2.0.0
--- a/README.md
+++ b/README.md
@@ -102,10 +102,6 @@ We try to only add languages once they have some usage on GitHub, so please note
 Almost all bug fixes or new language additions should come with some additional code samples. Just drop them under [`samples/`](https://github.com/github/linguist/tree/master/samples) in the correct subdirectory and our test suite will automatically test them. In most cases you shouldn't need to add any new assertions.
 To update the `samples.json` after adding new files to [`samples/`](https://github.com/github/linguist/tree/master/samples):
    bundle exec rake samples
 ### A note on language extensions
 Linguist has a number of methods available to it for identifying the language of a particular file. The initial lookup is based upon the extension of the file; possible file extensions are defined in an array called `extensions`. Take a look at this example for `Perl`:
--- a/14
+++ b/14
@@ -8,6 +8,16 @@ task :default => :test
 Rake::TestTask.new
 # Extend test task to check for samples
 task :test => :check_samples
 desc "Check that we have samples.json generated"
 task :check_samples do
  unless File.exist?('lib/linguist/samples.json')
    Rake::Task[:samples].invoke
  end
 end
 task :samples do
  require 'linguist/samples'
  require 'yajl'
@@ -16,7 +26,7 @@ task :samples do
  File.open('lib/linguist/samples.json', 'w') { |io| io.write json }
 end
-task :build_gem do
+task :build_gem => :samples do
  languages = YAML.load_file("lib/linguist/languages.yml")
  File.write("lib/linguist/languages.json", JSON.dump(languages))
  `gem build github-linguist.gemspec`
@@ -99,7 +109,7 @@ namespace :classifier do
      next if file_language.nil? || file_language == 'Text'
      begin
        data = open(file_url).read
-        guessed_language, score = Linguist::Classifier.classify(Linguist::Samples::DATA, data).first
+        guessed_language, score = Linguist::Classifier.classify(Linguist::Samples.cache, data).first
        total += 1
        guessed_language == file_language ? correct += 1 : incorrect += 1
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -136,7 +136,7 @@ module Linguist
        elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
          determined.first
        # Lastly, fall back to the probabilistic classifier.
-        elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names).first
+        elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
          # Return the actual Language object based on the string language name (i.e., first element of `#classify`)
          Language[classified[0]]
        end
@@ -510,9 +510,9 @@ module Linguist
    end
  end
-  extensions = Samples::DATA['extnames']
+  extensions = Samples.cache['extnames']
-  interpreters = Samples::DATA['interpreters']
+  interpreters = Samples.cache['interpreters']
-  filenames = Samples::DATA['filenames']
+  filenames = Samples.cache['filenames']
  popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
  languages_yml = File.expand_path("../languages.yml", __FILE__)
--- a/lib/linguist/samples.json
+++ b/lib/linguist/samples.json
--- a/lib/linguist/samples.rb
+++ b/lib/linguist/samples.rb
@@ -17,9 +17,11 @@ module Linguist
    PATH = File.expand_path('../samples.json', __FILE__)
    # Hash of serialized samples object
-    if File.exist?(PATH)
+    def self.cache
-      serializer = defined?(JSON) ? JSON : YAML
+      @cache ||= begin
-      DATA = serializer.load(File.read(PATH))
+        serializer = defined?(JSON) ? JSON : YAML
        serializer.load(File.read(PATH))
      end
    end
    # Public: Iterate over each sample.
--- a/test/test_classifier.rb
+++ b/test/test_classifier.rb
@@ -44,12 +44,12 @@ class TestClassifier < Test::Unit::TestCase
  end
  def test_instance_classify_empty
-    results = Classifier.classify(Samples::DATA, "")
+    results = Classifier.classify(Samples.cache, "")
    assert results.first[1] < 0.5, results.first.inspect
  end
  def test_instance_classify_nil
-    assert_equal [], Classifier.classify(Samples::DATA, nil)
+    assert_equal [], Classifier.classify(Samples.cache, nil)
  end
  def test_classify_ambiguous_languages
@@ -58,7 +58,7 @@ class TestClassifier < Test::Unit::TestCase
      languages = Language.find_by_filename(sample[:path]).map(&:name)
      next unless languages.length > 1
-      results = Classifier.classify(Samples::DATA, File.read(sample[:path]), languages)
+      results = Classifier.classify(Samples.cache, File.read(sample[:path]), languages)
      assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}"
    end
  end
--- a/test/test_samples.rb
+++ b/test/test_samples.rb
@@ -8,7 +8,7 @@ class TestSamples < Test::Unit::TestCase
  include Linguist
  def test_up_to_date
-    assert serialized = Samples::DATA
+    assert serialized = Samples.cache
    assert latest = Samples.data
    # Just warn, it shouldn't scare people off by breaking the build.
@@ -29,7 +29,7 @@ class TestSamples < Test::Unit::TestCase
  end
  def test_verify
-    assert data = Samples::DATA
+    assert data = Samples.cache
    assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c }
    assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c }
@@ -38,7 +38,7 @@ class TestSamples < Test::Unit::TestCase
  # Check that there aren't samples with extensions that aren't explicitly defined in languages.yml
  def test_parity
-    extensions = Samples::DATA['extnames']
+    extensions = Samples.cache['extnames']
    languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__)
    languages = YAML.load_file(languages_yml)