mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 17:50:22 +00:00
Merge remote-tracking branch 'origin/master' into filename-matches-multiple-langages
* origin/master: Allow mime-types 2.x to be used with Linguist Upgrade to rugged 0.22.0b1 Mention that languages need to be quite popular fix vendor/cache Gemfile.lock is no longer considered generated Tests for BlobHelper#empty? remove reference to empty.js Remove more empty samples Bail earlier if the file is empty. Moving comments Use heuristics earlier to inform the rest of the classification process Removing inconsistency of `find_by_heuristics` (was sometimes returning nil and sometimes returning an empty array) Removing unused array of candidate languages. Reworking most heuristics to only return one match
This commit is contained in:
@@ -24,7 +24,7 @@ Great! You'll need to:
|
||||
0. Add a grammar for your language to [`grammars.yml`][grammars] by running `script/download-grammars --add URL`. Please only add grammars that have a license that permits redistribution.
|
||||
0. Add samples for your language to the [samples directory][samples].
|
||||
|
||||
We try only to add languages once they have some usage on GitHub, so please note in-the-wild usage examples in your pull request.
|
||||
We try only to add languages once they have some usage on GitHub, so please note in-the-wild usage examples in your pull request. In most cases we prefer that languages already be in use in hundreds of repositories before supporting them in Linguist.
|
||||
|
||||
[grammars]: /grammars.yml
|
||||
[languages]: /lib/linguist/languages.yml
|
||||
|
||||
@@ -15,8 +15,8 @@ Gem::Specification.new do |s|
|
||||
|
||||
s.add_dependency 'charlock_holmes', '~> 0.7.3'
|
||||
s.add_dependency 'escape_utils', '~> 1.0.1'
|
||||
s.add_dependency 'mime-types', '~> 1.19'
|
||||
s.add_dependency 'rugged', '~> 0.21.1b2'
|
||||
s.add_dependency 'mime-types', '>= 1.19'
|
||||
s.add_dependency 'rugged', '~> 0.22.0b1'
|
||||
|
||||
s.add_development_dependency 'mocha'
|
||||
s.add_development_dependency 'pry'
|
||||
|
||||
@@ -146,6 +146,13 @@ module Linguist
|
||||
end
|
||||
end
|
||||
|
||||
# Public: Is the blob empty?
|
||||
#
|
||||
# Return true or false
|
||||
def empty?
|
||||
data.nil? || data == ""
|
||||
end
|
||||
|
||||
# Public: Is the blob text?
|
||||
#
|
||||
# Return true or false
|
||||
|
||||
@@ -51,26 +51,25 @@ module Linguist
|
||||
#
|
||||
# Return true or false
|
||||
def generated?
|
||||
name == 'Gemfile.lock' ||
|
||||
minified_files? ||
|
||||
compiled_coffeescript? ||
|
||||
xcode_file? ||
|
||||
generated_parser? ||
|
||||
generated_net_docfile? ||
|
||||
generated_net_designer_file? ||
|
||||
generated_postscript? ||
|
||||
generated_protocol_buffer? ||
|
||||
generated_jni_header? ||
|
||||
composer_lock? ||
|
||||
node_modules? ||
|
||||
godeps? ||
|
||||
vcr_cassette? ||
|
||||
generated_by_zephir?
|
||||
minified_files? ||
|
||||
compiled_coffeescript? ||
|
||||
xcode_file? ||
|
||||
generated_parser? ||
|
||||
generated_net_docfile? ||
|
||||
generated_net_designer_file? ||
|
||||
generated_postscript? ||
|
||||
generated_protocol_buffer? ||
|
||||
generated_jni_header? ||
|
||||
composer_lock? ||
|
||||
node_modules? ||
|
||||
godeps? ||
|
||||
vcr_cassette? ||
|
||||
generated_by_zephir?
|
||||
end
|
||||
|
||||
# Internal: Is the blob an Xcode file?
|
||||
#
|
||||
# Generated if the file extension is an Xcode
|
||||
# Generated if the file extension is an Xcode
|
||||
# file extension.
|
||||
#
|
||||
# Returns true or false.
|
||||
@@ -265,4 +264,3 @@ module Linguist
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -13,26 +13,28 @@ module Linguist
|
||||
# Returns an array of Languages or []
|
||||
def self.find_by_heuristics(data, languages)
|
||||
if active?
|
||||
result = []
|
||||
|
||||
if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
|
||||
result = disambiguate_pl(data, languages)
|
||||
result = disambiguate_pl(data)
|
||||
end
|
||||
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
|
||||
result = disambiguate_ecl(data, languages)
|
||||
result = disambiguate_ecl(data)
|
||||
end
|
||||
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
|
||||
result = disambiguate_pro(data, languages)
|
||||
result = disambiguate_pro(data)
|
||||
end
|
||||
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
|
||||
result = disambiguate_cl(data, languages)
|
||||
result = disambiguate_cl(data)
|
||||
end
|
||||
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
|
||||
result = disambiguate_hack(data, languages)
|
||||
result = disambiguate_hack(data)
|
||||
end
|
||||
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
|
||||
result = disambiguate_sc(data, languages)
|
||||
result = disambiguate_sc(data)
|
||||
end
|
||||
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
|
||||
result = disambiguate_asc(data, languages)
|
||||
result = disambiguate_asc(data)
|
||||
end
|
||||
return result
|
||||
end
|
||||
@@ -42,28 +44,37 @@ module Linguist
|
||||
# We want to shortcut look for Objective-C _and_ now C++ too!
|
||||
#
|
||||
# Returns an array of Languages or []
|
||||
def self.disambiguate_c(data, languages)
|
||||
def self.disambiguate_c(data)
|
||||
matches = []
|
||||
matches << Language["Objective-C"] if data.include?("@interface")
|
||||
matches << Language["C++"] if data.include?("#include <cstdint>")
|
||||
if data.include?("@interface")
|
||||
matches << Language["Objective-C"]
|
||||
elsif data.include?("#include <cstdint>")
|
||||
matches << Language["C++"]
|
||||
end
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_pl(data, languages)
|
||||
def self.disambiguate_pl(data)
|
||||
matches = []
|
||||
matches << Language["Prolog"] if data.include?(":-")
|
||||
matches << Language["Perl"] if data.include?("use strict")
|
||||
if data.include?("use strict")
|
||||
matches << Language["Perl"]
|
||||
elsif data.include?(":-")
|
||||
matches << Language["Prolog"]
|
||||
end
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_ecl(data, languages)
|
||||
def self.disambiguate_ecl(data)
|
||||
matches = []
|
||||
matches << Language["Prolog"] if data.include?(":-")
|
||||
matches << Language["ECL"] if data.include?(":=")
|
||||
if data.include?(":-")
|
||||
matches << Language["Prolog"]
|
||||
elsif data.include?(":=")
|
||||
matches << Language["ECL"]
|
||||
end
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_pro(data, languages)
|
||||
def self.disambiguate_pro(data)
|
||||
matches = []
|
||||
if (data.include?(":-"))
|
||||
matches << Language["Prolog"]
|
||||
@@ -73,7 +84,7 @@ module Linguist
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_ts(data, languages)
|
||||
def self.disambiguate_ts(data)
|
||||
matches = []
|
||||
if (data.include?("</translation>"))
|
||||
matches << Language["XML"]
|
||||
@@ -83,21 +94,24 @@ module Linguist
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_cl(data, languages)
|
||||
def self.disambiguate_cl(data)
|
||||
matches = []
|
||||
matches << Language["Common Lisp"] if data.include?("(defun ")
|
||||
matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data)
|
||||
if data.include?("(defun ")
|
||||
matches << Language["Common Lisp"]
|
||||
elsif /\/\* |\/\/ |^\}/.match(data)
|
||||
matches << Language["OpenCL"]
|
||||
end
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_r(data, languages)
|
||||
def self.disambiguate_r(data)
|
||||
matches = []
|
||||
matches << Language["Rebol"] if /\bRebol\b/i.match(data)
|
||||
matches << Language["R"] if data.include?("<-")
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_hack(data, languages)
|
||||
def self.disambiguate_hack(data)
|
||||
matches = []
|
||||
if data.include?("<?hh")
|
||||
matches << Language["Hack"]
|
||||
@@ -107,7 +121,7 @@ module Linguist
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_sc(data, languages)
|
||||
def self.disambiguate_sc(data)
|
||||
matches = []
|
||||
if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
|
||||
matches << Language["SuperCollider"]
|
||||
@@ -118,7 +132,7 @@ module Linguist
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_asc(data, languages)
|
||||
def self.disambiguate_asc(data)
|
||||
matches = []
|
||||
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
|
||||
matches
|
||||
|
||||
@@ -100,12 +100,8 @@ module Linguist
|
||||
def self.detect(blob)
|
||||
name = blob.name.to_s
|
||||
|
||||
# Check if the blob is possibly binary and bail early; this is a cheap
|
||||
# test that uses the extension name to guess a binary mime type.
|
||||
#
|
||||
# We'll perform a more comprehensive test later which actually involves
|
||||
# looking for binary characters in the blob
|
||||
return nil if blob.likely_binary? || blob.binary?
|
||||
# Bail early if the blob is binary or empty.
|
||||
return nil if blob.likely_binary? || blob.binary? || blob.empty?
|
||||
|
||||
# A bit of an elegant hack. If the file is executable but extensionless,
|
||||
# append a "magic" extension so it can be classified with other
|
||||
@@ -124,16 +120,18 @@ module Linguist
|
||||
if possible_languages.length > 1
|
||||
data = blob.data
|
||||
possible_language_names = possible_languages.map(&:name)
|
||||
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
|
||||
|
||||
if heuristic_languages.size > 1
|
||||
possible_language_names = heuristic_languages.map(&:name)
|
||||
end
|
||||
|
||||
# Don't bother with binary contents or an empty file
|
||||
if data.nil? || data == ""
|
||||
nil
|
||||
# Check if there's a shebang line and use that as authoritative
|
||||
elsif (result = find_by_shebang(data)) && !result.empty?
|
||||
if (result = find_by_shebang(data)) && !result.empty?
|
||||
result.first
|
||||
# No shebang. Still more work to do. Try to find it with our heuristics.
|
||||
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
|
||||
determined.first
|
||||
elsif heuristic_languages.size == 1
|
||||
heuristic_languages.first
|
||||
# Lastly, fall back to the probabilistic classifier.
|
||||
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
||||
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
(function() {
|
||||
|
||||
}).call(this);
|
||||
@@ -193,8 +193,8 @@ class TestBlob < Test::Unit::TestCase
|
||||
assert blob("Binary/MainMenu.nib").generated?
|
||||
assert !blob("XML/project.pbxproj").generated?
|
||||
|
||||
# Gemfile.locks
|
||||
assert blob("Gemfile.lock").generated?
|
||||
# Gemfile.lock is NOT generated
|
||||
assert !blob("Gemfile.lock").generated?
|
||||
|
||||
# Generated .NET Docfiles
|
||||
assert blob("XML/net_docfile.xml").generated?
|
||||
@@ -226,7 +226,6 @@ class TestBlob < Test::Unit::TestCase
|
||||
assert !blob("PostScript/sierpinski.ps").generated?
|
||||
|
||||
# These examples are too basic to tell
|
||||
assert !blob("JavaScript/empty.js").generated?
|
||||
assert !blob("JavaScript/hello.js").generated?
|
||||
|
||||
assert blob("JavaScript/intro-old.js").generated?
|
||||
@@ -469,4 +468,13 @@ class TestBlob < Test::Unit::TestCase
|
||||
def test_minified_files_not_safe_to_highlight
|
||||
assert !blob("JavaScript/jquery-1.6.1.min.js").safe_to_colorize?
|
||||
end
|
||||
|
||||
def test_empty
|
||||
blob = Struct.new(:data) { include Linguist::BlobHelper }
|
||||
|
||||
assert blob.new("").empty?
|
||||
assert blob.new(nil).empty?
|
||||
refute blob.new(" ").empty?
|
||||
refute blob.new("nope").empty?
|
||||
end
|
||||
end
|
||||
|
||||
@@ -20,18 +20,18 @@ class TestHeuristcs < Test::Unit::TestCase
|
||||
Dir.glob("#{samples_path}/#{language_name}/#{file}")
|
||||
end
|
||||
|
||||
# Candidate languages = ["C++", "Objective-C"]
|
||||
def test_obj_c_by_heuristics
|
||||
languages = ["C++", "Objective-C"]
|
||||
# Only calling out '.h' filenames as these are the ones causing issues
|
||||
all_fixtures("Objective-C", "*.h").each do |fixture|
|
||||
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"), languages)
|
||||
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"))
|
||||
assert_equal Language["Objective-C"], results.first
|
||||
end
|
||||
end
|
||||
|
||||
# Candidate languages = ["C++", "Objective-C"]
|
||||
def test_cpp_by_heuristics
|
||||
languages = ["C++", "Objective-C"]
|
||||
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"), languages)
|
||||
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"))
|
||||
assert_equal Language["C++"], results.first
|
||||
end
|
||||
|
||||
@@ -41,57 +41,57 @@ class TestHeuristcs < Test::Unit::TestCase
|
||||
assert_equal Language["Objective-C"], match
|
||||
end
|
||||
|
||||
# Candidate languages = ["Perl", "Prolog"]
|
||||
def test_pl_prolog_by_heuristics
|
||||
languages = ["Perl", "Prolog"]
|
||||
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages)
|
||||
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"))
|
||||
assert_equal Language["Prolog"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["Perl", "Prolog"]
|
||||
def test_pl_perl_by_heuristics
|
||||
languages = ["Perl", "Prolog"]
|
||||
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"), languages)
|
||||
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"))
|
||||
assert_equal Language["Perl"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["ECL", "Prolog"]
|
||||
def test_ecl_prolog_by_heuristics
|
||||
languages = ["ECL", "Prolog"]
|
||||
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"), languages)
|
||||
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"))
|
||||
assert_equal Language["Prolog"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["ECL", "Prolog"]
|
||||
def test_ecl_ecl_by_heuristics
|
||||
languages = ["ECL", "Prolog"]
|
||||
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"), languages)
|
||||
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"))
|
||||
assert_equal Language["ECL"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["IDL", "Prolog"]
|
||||
def test_pro_prolog_by_heuristics
|
||||
languages = ["IDL", "Prolog"]
|
||||
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"), languages)
|
||||
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"))
|
||||
assert_equal Language["Prolog"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["IDL", "Prolog"]
|
||||
def test_pro_idl_by_heuristics
|
||||
languages = ["IDL", "Prolog"]
|
||||
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"), languages)
|
||||
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"))
|
||||
assert_equal Language["IDL"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["AGS Script", "AsciiDoc"]
|
||||
def test_asc_asciidoc_by_heuristics
|
||||
languages = ["AGS Script", "AsciiDoc"]
|
||||
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"), languages)
|
||||
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"))
|
||||
assert_equal Language["AsciiDoc"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["TypeScript", "XML"]
|
||||
def test_ts_typescript_by_heuristics
|
||||
languages = ["TypeScript", "XML"]
|
||||
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"), languages)
|
||||
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"))
|
||||
assert_equal Language["TypeScript"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["TypeScript", "XML"]
|
||||
def test_ts_xml_by_heuristics
|
||||
languages = ["TypeScript", "XML"]
|
||||
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"), languages)
|
||||
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"))
|
||||
assert_equal Language["XML"], results.first
|
||||
end
|
||||
|
||||
@@ -99,27 +99,27 @@ class TestHeuristcs < Test::Unit::TestCase
|
||||
languages = ["Common Lisp", "OpenCL"]
|
||||
languages.each do |language|
|
||||
all_fixtures(language).each do |fixture|
|
||||
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"), languages)
|
||||
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"))
|
||||
assert_equal Language[language], results.first
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Candidate languages = ["Hack", "PHP"]
|
||||
def test_hack_by_heuristics
|
||||
languages = ["Hack", "PHP"]
|
||||
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"), languages)
|
||||
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"))
|
||||
assert_equal Language["Hack"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["Scala", "SuperCollider"]
|
||||
def test_sc_supercollider_by_heuristics
|
||||
languages = ["Scala", "SuperCollider"]
|
||||
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"), languages)
|
||||
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"))
|
||||
assert_equal Language["SuperCollider"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["Scala", "SuperCollider"]
|
||||
def test_sc_scala_by_heuristics
|
||||
languages = ["Scala", "SuperCollider"]
|
||||
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"), languages)
|
||||
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"))
|
||||
assert_equal Language["Scala"], results.first
|
||||
end
|
||||
end
|
||||
|
||||
BIN
vendor/cache/rugged-0.21.1b2.gem
vendored
BIN
vendor/cache/rugged-0.21.1b2.gem
vendored
Binary file not shown.
BIN
vendor/cache/rugged-0.22.0b1.gem
vendored
Normal file
BIN
vendor/cache/rugged-0.22.0b1.gem
vendored
Normal file
Binary file not shown.
BIN
vendor/cache/yajl-ruby-1.1.0.gem
vendored
Normal file
BIN
vendor/cache/yajl-ruby-1.1.0.gem
vendored
Normal file
Binary file not shown.
Reference in New Issue
Block a user