Merge remote-tracking branch 'origin/master' into filename-matches-multiple-langages

* origin/master:
  Allow mime-types 2.x to be used with Linguist
  Upgrade to rugged 0.22.0b1
  Mention that languages need to be quite popular
  fix vendor/cache
  Gemfile.lock is no longer considered generated
  Tests for BlobHelper#empty?
  remove reference to empty.js
  Remove more empty samples
  Bail earlier if the file is empty.
  Moving comments
  Use heuristics earlier to inform the rest of the classification process
  Removing inconsistency of `find_by_heuristics` (was sometimes returning nil and sometimes returning an empty array)
  Removing unused array of candidate languages.
  Reworking most heuristics to only return one match
This commit is contained in:
Brandon Keepers
2014-11-18 14:09:15 -05:00
14 changed files with 114 additions and 92 deletions

View File

@@ -24,7 +24,7 @@ Great! You'll need to:
0. Add a grammar for your language to [`grammars.yml`][grammars] by running `script/download-grammars --add URL`. Please only add grammars that have a license that permits redistribution. 0. Add a grammar for your language to [`grammars.yml`][grammars] by running `script/download-grammars --add URL`. Please only add grammars that have a license that permits redistribution.
0. Add samples for your language to the [samples directory][samples]. 0. Add samples for your language to the [samples directory][samples].
We try only to add languages once they have some usage on GitHub, so please note in-the-wild usage examples in your pull request. We try only to add languages once they have some usage on GitHub, so please note in-the-wild usage examples in your pull request. In most cases we prefer that languages already be in use in hundreds of repositories before supporting them in Linguist.
[grammars]: /grammars.yml [grammars]: /grammars.yml
[languages]: /lib/linguist/languages.yml [languages]: /lib/linguist/languages.yml

View File

@@ -15,8 +15,8 @@ Gem::Specification.new do |s|
s.add_dependency 'charlock_holmes', '~> 0.7.3' s.add_dependency 'charlock_holmes', '~> 0.7.3'
s.add_dependency 'escape_utils', '~> 1.0.1' s.add_dependency 'escape_utils', '~> 1.0.1'
s.add_dependency 'mime-types', '~> 1.19' s.add_dependency 'mime-types', '>= 1.19'
s.add_dependency 'rugged', '~> 0.21.1b2' s.add_dependency 'rugged', '~> 0.22.0b1'
s.add_development_dependency 'mocha' s.add_development_dependency 'mocha'
s.add_development_dependency 'pry' s.add_development_dependency 'pry'

View File

@@ -146,6 +146,13 @@ module Linguist
end end
end end
# Public: Determine whether the blob has no content.
#
# A blob counts as empty when its data is nil or equal to the empty
# string; any other data (including whitespace-only data) is not empty.
#
# Returns true or false.
def empty?
  # Missing data is treated the same as zero-length data.
  return true if data.nil?

  data == ""
end
# Public: Is the blob text? # Public: Is the blob text?
# #
# Return true or false # Return true or false

View File

@@ -51,21 +51,20 @@ module Linguist
# #
# Return true or false # Return true or false
def generated? def generated?
name == 'Gemfile.lock' || minified_files? ||
minified_files? || compiled_coffeescript? ||
compiled_coffeescript? || xcode_file? ||
xcode_file? || generated_parser? ||
generated_parser? || generated_net_docfile? ||
generated_net_docfile? || generated_net_designer_file? ||
generated_net_designer_file? || generated_postscript? ||
generated_postscript? || generated_protocol_buffer? ||
generated_protocol_buffer? || generated_jni_header? ||
generated_jni_header? || composer_lock? ||
composer_lock? || node_modules? ||
node_modules? || godeps? ||
godeps? || vcr_cassette? ||
vcr_cassette? || generated_by_zephir?
generated_by_zephir?
end end
# Internal: Is the blob an Xcode file? # Internal: Is the blob an Xcode file?
@@ -265,4 +264,3 @@ module Linguist
end end
end end
end end

View File

@@ -13,26 +13,28 @@ module Linguist
# Returns an array of Languages or [] # Returns an array of Languages or []
def self.find_by_heuristics(data, languages) def self.find_by_heuristics(data, languages)
if active? if active?
result = []
if languages.all? { |l| ["Perl", "Prolog"].include?(l) } if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
result = disambiguate_pl(data, languages) result = disambiguate_pl(data)
end end
if languages.all? { |l| ["ECL", "Prolog"].include?(l) } if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
result = disambiguate_ecl(data, languages) result = disambiguate_ecl(data)
end end
if languages.all? { |l| ["IDL", "Prolog"].include?(l) } if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
result = disambiguate_pro(data, languages) result = disambiguate_pro(data)
end end
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) } if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
result = disambiguate_cl(data, languages) result = disambiguate_cl(data)
end end
if languages.all? { |l| ["Hack", "PHP"].include?(l) } if languages.all? { |l| ["Hack", "PHP"].include?(l) }
result = disambiguate_hack(data, languages) result = disambiguate_hack(data)
end end
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) } if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
result = disambiguate_sc(data, languages) result = disambiguate_sc(data)
end end
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) } if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
result = disambiguate_asc(data, languages) result = disambiguate_asc(data)
end end
return result return result
end end
@@ -42,28 +44,37 @@ module Linguist
# We want to shortcut look for Objective-C _and_ now C++ too! # We want to shortcut look for Objective-C _and_ now C++ too!
# #
# Returns an array of Languages or [] # Returns an array of Languages or []
def self.disambiguate_c(data, languages) def self.disambiguate_c(data)
matches = [] matches = []
matches << Language["Objective-C"] if data.include?("@interface") if data.include?("@interface")
matches << Language["C++"] if data.include?("#include <cstdint>") matches << Language["Objective-C"]
elsif data.include?("#include <cstdint>")
matches << Language["C++"]
end
matches matches
end end
def self.disambiguate_pl(data, languages) def self.disambiguate_pl(data)
matches = [] matches = []
matches << Language["Prolog"] if data.include?(":-") if data.include?("use strict")
matches << Language["Perl"] if data.include?("use strict") matches << Language["Perl"]
elsif data.include?(":-")
matches << Language["Prolog"]
end
matches matches
end end
def self.disambiguate_ecl(data, languages) def self.disambiguate_ecl(data)
matches = [] matches = []
matches << Language["Prolog"] if data.include?(":-") if data.include?(":-")
matches << Language["ECL"] if data.include?(":=") matches << Language["Prolog"]
elsif data.include?(":=")
matches << Language["ECL"]
end
matches matches
end end
def self.disambiguate_pro(data, languages) def self.disambiguate_pro(data)
matches = [] matches = []
if (data.include?(":-")) if (data.include?(":-"))
matches << Language["Prolog"] matches << Language["Prolog"]
@@ -73,7 +84,7 @@ module Linguist
matches matches
end end
def self.disambiguate_ts(data, languages) def self.disambiguate_ts(data)
matches = [] matches = []
if (data.include?("</translation>")) if (data.include?("</translation>"))
matches << Language["XML"] matches << Language["XML"]
@@ -83,21 +94,24 @@ module Linguist
matches matches
end end
def self.disambiguate_cl(data, languages) def self.disambiguate_cl(data)
matches = [] matches = []
matches << Language["Common Lisp"] if data.include?("(defun ") if data.include?("(defun ")
matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data) matches << Language["Common Lisp"]
elsif /\/\* |\/\/ |^\}/.match(data)
matches << Language["OpenCL"]
end
matches matches
end end
def self.disambiguate_r(data, languages) def self.disambiguate_r(data)
matches = [] matches = []
matches << Language["Rebol"] if /\bRebol\b/i.match(data) matches << Language["Rebol"] if /\bRebol\b/i.match(data)
matches << Language["R"] if data.include?("<-") matches << Language["R"] if data.include?("<-")
matches matches
end end
def self.disambiguate_hack(data, languages) def self.disambiguate_hack(data)
matches = [] matches = []
if data.include?("<?hh") if data.include?("<?hh")
matches << Language["Hack"] matches << Language["Hack"]
@@ -107,7 +121,7 @@ module Linguist
matches matches
end end
def self.disambiguate_sc(data, languages) def self.disambiguate_sc(data)
matches = [] matches = []
if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data)) if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
matches << Language["SuperCollider"] matches << Language["SuperCollider"]
@@ -118,7 +132,7 @@ module Linguist
matches matches
end end
def self.disambiguate_asc(data, languages) def self.disambiguate_asc(data)
matches = [] matches = []
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data) matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
matches matches

View File

@@ -100,12 +100,8 @@ module Linguist
def self.detect(blob) def self.detect(blob)
name = blob.name.to_s name = blob.name.to_s
# Check if the blob is possibly binary and bail early; this is a cheap # Bail early if the blob is binary or empty.
# test that uses the extension name to guess a binary binary mime type. return nil if blob.likely_binary? || blob.binary? || blob.empty?
#
# We'll perform a more comprehensive test later which actually involves
# looking for binary characters in the blob
return nil if blob.likely_binary? || blob.binary?
# A bit of an elegant hack. If the file is executable but extensionless, # A bit of an elegant hack. If the file is executable but extensionless,
# append a "magic" extension so it can be classified with other # append a "magic" extension so it can be classified with other
@@ -124,16 +120,18 @@ module Linguist
if possible_languages.length > 1 if possible_languages.length > 1
data = blob.data data = blob.data
possible_language_names = possible_languages.map(&:name) possible_language_names = possible_languages.map(&:name)
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
if heuristic_languages.size > 1
possible_language_names = heuristic_languages.map(&:name)
end
# Don't bother with binary contents or an empty file
if data.nil? || data == ""
nil
# Check if there's a shebang line and use that as authoritative # Check if there's a shebang line and use that as authoritative
elsif (result = find_by_shebang(data)) && !result.empty? if (result = find_by_shebang(data)) && !result.empty?
result.first result.first
# No shebang. Still more work to do. Try to find it with our heuristics. # No shebang. Still more work to do. Try to find it with our heuristics.
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty? elsif heuristic_languages.size == 1
determined.first heuristic_languages.first
# Lastly, fall back to the probabilistic classifier. # Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based of the string language name (i.e., first element of `#classify`) # Return the actual Language object based of the string language name (i.e., first element of `#classify`)

View File

@@ -1,3 +0,0 @@
(function() {
}).call(this);

View File

@@ -193,8 +193,8 @@ class TestBlob < Test::Unit::TestCase
assert blob("Binary/MainMenu.nib").generated? assert blob("Binary/MainMenu.nib").generated?
assert !blob("XML/project.pbxproj").generated? assert !blob("XML/project.pbxproj").generated?
# Gemfile.locks # Gemfile.lock is NOT generated
assert blob("Gemfile.lock").generated? assert !blob("Gemfile.lock").generated?
# Generated .NET Docfiles # Generated .NET Docfiles
assert blob("XML/net_docfile.xml").generated? assert blob("XML/net_docfile.xml").generated?
@@ -226,7 +226,6 @@ class TestBlob < Test::Unit::TestCase
assert !blob("PostScript/sierpinski.ps").generated? assert !blob("PostScript/sierpinski.ps").generated?
# These examples are too basic to tell # These examples are too basic to tell
assert !blob("JavaScript/empty.js").generated?
assert !blob("JavaScript/hello.js").generated? assert !blob("JavaScript/hello.js").generated?
assert blob("JavaScript/intro-old.js").generated? assert blob("JavaScript/intro-old.js").generated?
@@ -469,4 +468,13 @@ class TestBlob < Test::Unit::TestCase
def test_minified_files_not_safe_to_highlight def test_minified_files_not_safe_to_highlight
assert !blob("JavaScript/jquery-1.6.1.min.js").safe_to_colorize? assert !blob("JavaScript/jquery-1.6.1.min.js").safe_to_colorize?
end end
# BlobHelper#empty? must hold for "" and nil data, and must not hold for
# whitespace-only or ordinary text data.
def test_empty
  # Minimal stand-in blob: only exposes `data`, which is all empty? reads.
  blob_class = Struct.new(:data) { include Linguist::BlobHelper }

  ["", nil].each { |content| assert blob_class.new(content).empty? }
  [" ", "nope"].each { |content| refute blob_class.new(content).empty? }
end
end end

View File

@@ -20,18 +20,18 @@ class TestHeuristcs < Test::Unit::TestCase
Dir.glob("#{samples_path}/#{language_name}/#{file}") Dir.glob("#{samples_path}/#{language_name}/#{file}")
end end
# Candidate languages = ["C++", "Objective-C"]
def test_obj_c_by_heuristics def test_obj_c_by_heuristics
languages = ["C++", "Objective-C"]
# Only calling out '.h' filenames as these are the ones causing issues # Only calling out '.h' filenames as these are the ones causing issues
all_fixtures("Objective-C", "*.h").each do |fixture| all_fixtures("Objective-C", "*.h").each do |fixture|
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"), languages) results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"))
assert_equal Language["Objective-C"], results.first assert_equal Language["Objective-C"], results.first
end end
end end
# Candidate languages = ["C++", "Objective-C"]
def test_cpp_by_heuristics def test_cpp_by_heuristics
languages = ["C++", "Objective-C"] results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"))
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"), languages)
assert_equal Language["C++"], results.first assert_equal Language["C++"], results.first
end end
@@ -41,57 +41,57 @@ class TestHeuristcs < Test::Unit::TestCase
assert_equal Language["Objective-C"], match assert_equal Language["Objective-C"], match
end end
# Candidate languages = ["Perl", "Prolog"]
def test_pl_prolog_by_heuristics def test_pl_prolog_by_heuristics
languages = ["Perl", "Prolog"] results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"))
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages)
assert_equal Language["Prolog"], results.first assert_equal Language["Prolog"], results.first
end end
# Candidate languages = ["Perl", "Prolog"]
def test_pl_perl_by_heuristics def test_pl_perl_by_heuristics
languages = ["Perl", "Prolog"] results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"))
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"), languages)
assert_equal Language["Perl"], results.first assert_equal Language["Perl"], results.first
end end
# Candidate languages = ["ECL", "Prolog"]
def test_ecl_prolog_by_heuristics def test_ecl_prolog_by_heuristics
languages = ["ECL", "Prolog"] results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"))
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"), languages)
assert_equal Language["Prolog"], results.first assert_equal Language["Prolog"], results.first
end end
# Candidate languages = ["ECL", "Prolog"]
def test_ecl_ecl_by_heuristics def test_ecl_ecl_by_heuristics
languages = ["ECL", "Prolog"] results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"))
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"), languages)
assert_equal Language["ECL"], results.first assert_equal Language["ECL"], results.first
end end
# Candidate languages = ["IDL", "Prolog"]
def test_pro_prolog_by_heuristics def test_pro_prolog_by_heuristics
languages = ["IDL", "Prolog"] results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"))
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"), languages)
assert_equal Language["Prolog"], results.first assert_equal Language["Prolog"], results.first
end end
# Candidate languages = ["IDL", "Prolog"]
def test_pro_idl_by_heuristics def test_pro_idl_by_heuristics
languages = ["IDL", "Prolog"] results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"))
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"), languages)
assert_equal Language["IDL"], results.first assert_equal Language["IDL"], results.first
end end
# Candidate languages = ["AGS Script", "AsciiDoc"]
def test_asc_asciidoc_by_heuristics def test_asc_asciidoc_by_heuristics
languages = ["AGS Script", "AsciiDoc"] results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"))
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"), languages)
assert_equal Language["AsciiDoc"], results.first assert_equal Language["AsciiDoc"], results.first
end end
# Candidate languages = ["TypeScript", "XML"]
def test_ts_typescript_by_heuristics def test_ts_typescript_by_heuristics
languages = ["TypeScript", "XML"] results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"))
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"), languages)
assert_equal Language["TypeScript"], results.first assert_equal Language["TypeScript"], results.first
end end
# Candidate languages = ["TypeScript", "XML"]
def test_ts_xml_by_heuristics def test_ts_xml_by_heuristics
languages = ["TypeScript", "XML"] results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"))
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"), languages)
assert_equal Language["XML"], results.first assert_equal Language["XML"], results.first
end end
@@ -99,27 +99,27 @@ class TestHeuristcs < Test::Unit::TestCase
languages = ["Common Lisp", "OpenCL"] languages = ["Common Lisp", "OpenCL"]
languages.each do |language| languages.each do |language|
all_fixtures(language).each do |fixture| all_fixtures(language).each do |fixture|
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"), languages) results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"))
assert_equal Language[language], results.first assert_equal Language[language], results.first
end end
end end
end end
# Candidate languages = ["Hack", "PHP"]
def test_hack_by_heuristics def test_hack_by_heuristics
languages = ["Hack", "PHP"] results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"))
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"), languages)
assert_equal Language["Hack"], results.first assert_equal Language["Hack"], results.first
end end
# Candidate languages = ["Scala", "SuperCollider"]
def test_sc_supercollider_by_heuristics def test_sc_supercollider_by_heuristics
languages = ["Scala", "SuperCollider"] results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"))
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"), languages)
assert_equal Language["SuperCollider"], results.first assert_equal Language["SuperCollider"], results.first
end end
# Candidate languages = ["Scala", "SuperCollider"]
def test_sc_scala_by_heuristics def test_sc_scala_by_heuristics
languages = ["Scala", "SuperCollider"] results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"))
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"), languages)
assert_equal Language["Scala"], results.first assert_equal Language["Scala"], results.first
end end
end end

Binary file not shown.

BIN
vendor/cache/rugged-0.22.0b1.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/yajl-ruby-1.1.0.gem vendored Normal file

Binary file not shown.