mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-12-08 20:38:47 +00:00
Merge pull request #1537 from github/drop-samples.json
Ignore samples.json
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@ Gemfile.lock
|
||||
.bundle/
|
||||
vendor/
|
||||
benchmark/
|
||||
lib/linguist/samples.json
|
||||
|
||||
@@ -2,8 +2,6 @@ before_install:
|
||||
- git fetch origin master:master
|
||||
- git fetch origin v2.0.0:v2.0.0
|
||||
- sudo apt-get install libicu-dev -y
|
||||
before_script:
|
||||
- bundle exec rake samples
|
||||
rvm:
|
||||
- 1.9.3
|
||||
- 2.0.0
|
||||
|
||||
@@ -102,10 +102,6 @@ We try to only add languages once they have some usage on GitHub, so please note
|
||||
|
||||
Almost all bug fixes or new language additions should come with some additional code samples. Just drop them under [`samples/`](https://github.com/github/linguist/tree/master/samples) in the correct subdirectory and our test suite will automatically test them. In most cases you shouldn't need to add any new assertions.
|
||||
|
||||
To update the `samples.json` after adding new files to [`samples/`](https://github.com/github/linguist/tree/master/samples):
|
||||
|
||||
bundle exec rake samples
|
||||
|
||||
### A note on language extensions
|
||||
|
||||
Linguist has a number of methods available to it for identifying the language of a particular file. The initial lookup is based upon the extension of the file; possible file extensions are defined in an array called `extensions`. Take a look at this example for `Perl`:
|
||||
|
||||
14
Rakefile
14
Rakefile
@@ -8,6 +8,16 @@ task :default => :test
|
||||
|
||||
Rake::TestTask.new
|
||||
|
||||
# Extend test task to check for samples
|
||||
task :test => :check_samples
|
||||
|
||||
desc "Check that we have samples.json generated"
|
||||
task :check_samples do
|
||||
unless File.exist?('lib/linguist/samples.json')
|
||||
Rake::Task[:samples].invoke
|
||||
end
|
||||
end
|
||||
|
||||
task :samples do
|
||||
require 'linguist/samples'
|
||||
require 'yajl'
|
||||
@@ -16,7 +26,7 @@ task :samples do
|
||||
File.open('lib/linguist/samples.json', 'w') { |io| io.write json }
|
||||
end
|
||||
|
||||
task :build_gem do
|
||||
task :build_gem => :samples do
|
||||
languages = YAML.load_file("lib/linguist/languages.yml")
|
||||
File.write("lib/linguist/languages.json", JSON.dump(languages))
|
||||
`gem build github-linguist.gemspec`
|
||||
@@ -99,7 +109,7 @@ namespace :classifier do
|
||||
next if file_language.nil? || file_language == 'Text'
|
||||
begin
|
||||
data = open(file_url).read
|
||||
guessed_language, score = Linguist::Classifier.classify(Linguist::Samples::DATA, data).first
|
||||
guessed_language, score = Linguist::Classifier.classify(Linguist::Samples.cache, data).first
|
||||
|
||||
total += 1
|
||||
guessed_language == file_language ? correct += 1 : incorrect += 1
|
||||
|
||||
@@ -136,7 +136,7 @@ module Linguist
|
||||
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
|
||||
determined.first
|
||||
# Lastly, fall back to the probabilistic classifier.
|
||||
elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names).first
|
||||
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
||||
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
|
||||
Language[classified[0]]
|
||||
end
|
||||
@@ -510,9 +510,9 @@ module Linguist
|
||||
end
|
||||
end
|
||||
|
||||
extensions = Samples::DATA['extnames']
|
||||
interpreters = Samples::DATA['interpreters']
|
||||
filenames = Samples::DATA['filenames']
|
||||
extensions = Samples.cache['extnames']
|
||||
interpreters = Samples.cache['interpreters']
|
||||
filenames = Samples.cache['filenames']
|
||||
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
|
||||
|
||||
languages_yml = File.expand_path("../languages.yml", __FILE__)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -17,9 +17,11 @@ module Linguist
|
||||
PATH = File.expand_path('../samples.json', __FILE__)
|
||||
|
||||
# Hash of serialized samples object
|
||||
if File.exist?(PATH)
|
||||
serializer = defined?(JSON) ? JSON : YAML
|
||||
DATA = serializer.load(File.read(PATH))
|
||||
def self.cache
|
||||
@cache ||= begin
|
||||
serializer = defined?(JSON) ? JSON : YAML
|
||||
serializer.load(File.read(PATH))
|
||||
end
|
||||
end
|
||||
|
||||
# Public: Iterate over each sample.
|
||||
|
||||
@@ -44,12 +44,12 @@ class TestClassifier < Test::Unit::TestCase
|
||||
end
|
||||
|
||||
def test_instance_classify_empty
|
||||
results = Classifier.classify(Samples::DATA, "")
|
||||
results = Classifier.classify(Samples.cache, "")
|
||||
assert results.first[1] < 0.5, results.first.inspect
|
||||
end
|
||||
|
||||
def test_instance_classify_nil
|
||||
assert_equal [], Classifier.classify(Samples::DATA, nil)
|
||||
assert_equal [], Classifier.classify(Samples.cache, nil)
|
||||
end
|
||||
|
||||
def test_classify_ambiguous_languages
|
||||
@@ -58,7 +58,7 @@ class TestClassifier < Test::Unit::TestCase
|
||||
languages = Language.find_by_filename(sample[:path]).map(&:name)
|
||||
next unless languages.length > 1
|
||||
|
||||
results = Classifier.classify(Samples::DATA, File.read(sample[:path]), languages)
|
||||
results = Classifier.classify(Samples.cache, File.read(sample[:path]), languages)
|
||||
assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}"
|
||||
end
|
||||
end
|
||||
|
||||
@@ -8,7 +8,7 @@ class TestSamples < Test::Unit::TestCase
|
||||
include Linguist
|
||||
|
||||
def test_up_to_date
|
||||
assert serialized = Samples::DATA
|
||||
assert serialized = Samples.cache
|
||||
assert latest = Samples.data
|
||||
|
||||
# Just warn, it shouldn't scare people off by breaking the build.
|
||||
@@ -29,7 +29,7 @@ class TestSamples < Test::Unit::TestCase
|
||||
end
|
||||
|
||||
def test_verify
|
||||
assert data = Samples::DATA
|
||||
assert data = Samples.cache
|
||||
|
||||
assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c }
|
||||
assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c }
|
||||
@@ -38,7 +38,7 @@ class TestSamples < Test::Unit::TestCase
|
||||
|
||||
# Check that there aren't samples with extensions that aren't explicitly defined in languages.yml
|
||||
def test_parity
|
||||
extensions = Samples::DATA['extnames']
|
||||
extensions = Samples.cache['extnames']
|
||||
languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__)
|
||||
languages = YAML.load_file(languages_yml)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user