mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-12-28 21:01:00 +00:00
Merge pull request #1537 from github/drop-samples.json
Ignore samples.json
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@ Gemfile.lock
|
|||||||
.bundle/
|
.bundle/
|
||||||
vendor/
|
vendor/
|
||||||
benchmark/
|
benchmark/
|
||||||
|
lib/linguist/samples.json
|
||||||
|
|||||||
@@ -2,8 +2,6 @@ before_install:
|
|||||||
- git fetch origin master:master
|
- git fetch origin master:master
|
||||||
- git fetch origin v2.0.0:v2.0.0
|
- git fetch origin v2.0.0:v2.0.0
|
||||||
- sudo apt-get install libicu-dev -y
|
- sudo apt-get install libicu-dev -y
|
||||||
before_script:
|
|
||||||
- bundle exec rake samples
|
|
||||||
rvm:
|
rvm:
|
||||||
- 1.9.3
|
- 1.9.3
|
||||||
- 2.0.0
|
- 2.0.0
|
||||||
|
|||||||
@@ -102,10 +102,6 @@ We try to only add languages once they have some usage on GitHub, so please note
|
|||||||
|
|
||||||
Almost all bug fixes or new language additions should come with some additional code samples. Just drop them under [`samples/`](https://github.com/github/linguist/tree/master/samples) in the correct subdirectory and our test suite will automatically test them. In most cases you shouldn't need to add any new assertions.
|
Almost all bug fixes or new language additions should come with some additional code samples. Just drop them under [`samples/`](https://github.com/github/linguist/tree/master/samples) in the correct subdirectory and our test suite will automatically test them. In most cases you shouldn't need to add any new assertions.
|
||||||
|
|
||||||
To update the `samples.json` after adding new files to [`samples/`](https://github.com/github/linguist/tree/master/samples):
|
|
||||||
|
|
||||||
bundle exec rake samples
|
|
||||||
|
|
||||||
### A note on language extensions
|
### A note on language extensions
|
||||||
|
|
||||||
Linguist has a number of methods available to it for identifying the language of a particular file. The initial lookup is based upon the extension of the file; possible file extensions are defined in an array called `extensions`. Take a look at this example for `Perl`:
|
Linguist has a number of methods available to it for identifying the language of a particular file. The initial lookup is based upon the extension of the file; possible file extensions are defined in an array called `extensions`. Take a look at this example for `Perl`:
|
||||||
|
|||||||
14
Rakefile
14
Rakefile
@@ -8,6 +8,16 @@ task :default => :test
|
|||||||
|
|
||||||
Rake::TestTask.new
|
Rake::TestTask.new
|
||||||
|
|
||||||
|
# Extend test task to check for samples
|
||||||
|
task :test => :check_samples
|
||||||
|
|
||||||
|
desc "Check that we have samples.json generated"
|
||||||
|
task :check_samples do
|
||||||
|
unless File.exist?('lib/linguist/samples.json')
|
||||||
|
Rake::Task[:samples].invoke
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
task :samples do
|
task :samples do
|
||||||
require 'linguist/samples'
|
require 'linguist/samples'
|
||||||
require 'yajl'
|
require 'yajl'
|
||||||
@@ -16,7 +26,7 @@ task :samples do
|
|||||||
File.open('lib/linguist/samples.json', 'w') { |io| io.write json }
|
File.open('lib/linguist/samples.json', 'w') { |io| io.write json }
|
||||||
end
|
end
|
||||||
|
|
||||||
task :build_gem do
|
task :build_gem => :samples do
|
||||||
languages = YAML.load_file("lib/linguist/languages.yml")
|
languages = YAML.load_file("lib/linguist/languages.yml")
|
||||||
File.write("lib/linguist/languages.json", JSON.dump(languages))
|
File.write("lib/linguist/languages.json", JSON.dump(languages))
|
||||||
`gem build github-linguist.gemspec`
|
`gem build github-linguist.gemspec`
|
||||||
@@ -99,7 +109,7 @@ namespace :classifier do
|
|||||||
next if file_language.nil? || file_language == 'Text'
|
next if file_language.nil? || file_language == 'Text'
|
||||||
begin
|
begin
|
||||||
data = open(file_url).read
|
data = open(file_url).read
|
||||||
guessed_language, score = Linguist::Classifier.classify(Linguist::Samples::DATA, data).first
|
guessed_language, score = Linguist::Classifier.classify(Linguist::Samples.cache, data).first
|
||||||
|
|
||||||
total += 1
|
total += 1
|
||||||
guessed_language == file_language ? correct += 1 : incorrect += 1
|
guessed_language == file_language ? correct += 1 : incorrect += 1
|
||||||
|
|||||||
@@ -136,7 +136,7 @@ module Linguist
|
|||||||
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
|
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
|
||||||
determined.first
|
determined.first
|
||||||
# Lastly, fall back to the probabilistic classifier.
|
# Lastly, fall back to the probabilistic classifier.
|
||||||
elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names).first
|
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
||||||
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
|
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
|
||||||
Language[classified[0]]
|
Language[classified[0]]
|
||||||
end
|
end
|
||||||
@@ -510,9 +510,9 @@ module Linguist
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
extensions = Samples::DATA['extnames']
|
extensions = Samples.cache['extnames']
|
||||||
interpreters = Samples::DATA['interpreters']
|
interpreters = Samples.cache['interpreters']
|
||||||
filenames = Samples::DATA['filenames']
|
filenames = Samples.cache['filenames']
|
||||||
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
|
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
|
||||||
|
|
||||||
languages_yml = File.expand_path("../languages.yml", __FILE__)
|
languages_yml = File.expand_path("../languages.yml", __FILE__)
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -17,9 +17,11 @@ module Linguist
|
|||||||
PATH = File.expand_path('../samples.json', __FILE__)
|
PATH = File.expand_path('../samples.json', __FILE__)
|
||||||
|
|
||||||
# Hash of serialized samples object
|
# Hash of serialized samples object
|
||||||
if File.exist?(PATH)
|
def self.cache
|
||||||
serializer = defined?(JSON) ? JSON : YAML
|
@cache ||= begin
|
||||||
DATA = serializer.load(File.read(PATH))
|
serializer = defined?(JSON) ? JSON : YAML
|
||||||
|
serializer.load(File.read(PATH))
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# Public: Iterate over each sample.
|
# Public: Iterate over each sample.
|
||||||
|
|||||||
@@ -44,12 +44,12 @@ class TestClassifier < Test::Unit::TestCase
|
|||||||
end
|
end
|
||||||
|
|
||||||
def test_instance_classify_empty
|
def test_instance_classify_empty
|
||||||
results = Classifier.classify(Samples::DATA, "")
|
results = Classifier.classify(Samples.cache, "")
|
||||||
assert results.first[1] < 0.5, results.first.inspect
|
assert results.first[1] < 0.5, results.first.inspect
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_instance_classify_nil
|
def test_instance_classify_nil
|
||||||
assert_equal [], Classifier.classify(Samples::DATA, nil)
|
assert_equal [], Classifier.classify(Samples.cache, nil)
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_classify_ambiguous_languages
|
def test_classify_ambiguous_languages
|
||||||
@@ -58,7 +58,7 @@ class TestClassifier < Test::Unit::TestCase
|
|||||||
languages = Language.find_by_filename(sample[:path]).map(&:name)
|
languages = Language.find_by_filename(sample[:path]).map(&:name)
|
||||||
next unless languages.length > 1
|
next unless languages.length > 1
|
||||||
|
|
||||||
results = Classifier.classify(Samples::DATA, File.read(sample[:path]), languages)
|
results = Classifier.classify(Samples.cache, File.read(sample[:path]), languages)
|
||||||
assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}"
|
assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ class TestSamples < Test::Unit::TestCase
|
|||||||
include Linguist
|
include Linguist
|
||||||
|
|
||||||
def test_up_to_date
|
def test_up_to_date
|
||||||
assert serialized = Samples::DATA
|
assert serialized = Samples.cache
|
||||||
assert latest = Samples.data
|
assert latest = Samples.data
|
||||||
|
|
||||||
# Just warn, it shouldn't scare people off by breaking the build.
|
# Just warn, it shouldn't scare people off by breaking the build.
|
||||||
@@ -29,7 +29,7 @@ class TestSamples < Test::Unit::TestCase
|
|||||||
end
|
end
|
||||||
|
|
||||||
def test_verify
|
def test_verify
|
||||||
assert data = Samples::DATA
|
assert data = Samples.cache
|
||||||
|
|
||||||
assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c }
|
assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c }
|
||||||
assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c }
|
assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c }
|
||||||
@@ -38,7 +38,7 @@ class TestSamples < Test::Unit::TestCase
|
|||||||
|
|
||||||
# Check that there aren't samples with extensions that aren't explicitly defined in languages.yml
|
# Check that there aren't samples with extensions that aren't explicitly defined in languages.yml
|
||||||
def test_parity
|
def test_parity
|
||||||
extensions = Samples::DATA['extnames']
|
extensions = Samples.cache['extnames']
|
||||||
languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__)
|
languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__)
|
||||||
languages = YAML.load_file(languages_yml)
|
languages = YAML.load_file(languages_yml)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user