mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-12-08 20:38:47 +00:00
Merge pull request #1674 from github/rework-heuristics
Rework heuristics
This commit is contained in:
@@ -146,6 +146,13 @@ module Linguist
|
||||
end
|
||||
end
|
||||
|
||||
# Public: Is the blob empty?
|
||||
#
|
||||
# Return true or false
|
||||
def empty?
|
||||
data.nil? || data == ""
|
||||
end
|
||||
|
||||
# Public: Is the blob text?
|
||||
#
|
||||
# Return true or false
|
||||
|
||||
@@ -13,26 +13,28 @@ module Linguist
|
||||
# Returns an array of Languages or []
|
||||
def self.find_by_heuristics(data, languages)
|
||||
if active?
|
||||
result = []
|
||||
|
||||
if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
|
||||
result = disambiguate_pl(data, languages)
|
||||
result = disambiguate_pl(data)
|
||||
end
|
||||
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
|
||||
result = disambiguate_ecl(data, languages)
|
||||
result = disambiguate_ecl(data)
|
||||
end
|
||||
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
|
||||
result = disambiguate_pro(data, languages)
|
||||
result = disambiguate_pro(data)
|
||||
end
|
||||
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
|
||||
result = disambiguate_cl(data, languages)
|
||||
result = disambiguate_cl(data)
|
||||
end
|
||||
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
|
||||
result = disambiguate_hack(data, languages)
|
||||
result = disambiguate_hack(data)
|
||||
end
|
||||
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
|
||||
result = disambiguate_sc(data, languages)
|
||||
result = disambiguate_sc(data)
|
||||
end
|
||||
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
|
||||
result = disambiguate_asc(data, languages)
|
||||
result = disambiguate_asc(data)
|
||||
end
|
||||
return result
|
||||
end
|
||||
@@ -42,28 +44,37 @@ module Linguist
|
||||
# We want to shortcut look for Objective-C _and_ now C++ too!
|
||||
#
|
||||
# Returns an array of Languages or []
|
||||
def self.disambiguate_c(data, languages)
|
||||
def self.disambiguate_c(data)
|
||||
matches = []
|
||||
matches << Language["Objective-C"] if data.include?("@interface")
|
||||
matches << Language["C++"] if data.include?("#include <cstdint>")
|
||||
if data.include?("@interface")
|
||||
matches << Language["Objective-C"]
|
||||
elsif data.include?("#include <cstdint>")
|
||||
matches << Language["C++"]
|
||||
end
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_pl(data, languages)
|
||||
def self.disambiguate_pl(data)
|
||||
matches = []
|
||||
matches << Language["Prolog"] if data.include?(":-")
|
||||
matches << Language["Perl"] if data.include?("use strict")
|
||||
if data.include?("use strict")
|
||||
matches << Language["Perl"]
|
||||
elsif data.include?(":-")
|
||||
matches << Language["Prolog"]
|
||||
end
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_ecl(data, languages)
|
||||
def self.disambiguate_ecl(data)
|
||||
matches = []
|
||||
matches << Language["Prolog"] if data.include?(":-")
|
||||
matches << Language["ECL"] if data.include?(":=")
|
||||
if data.include?(":-")
|
||||
matches << Language["Prolog"]
|
||||
elsif data.include?(":=")
|
||||
matches << Language["ECL"]
|
||||
end
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_pro(data, languages)
|
||||
def self.disambiguate_pro(data)
|
||||
matches = []
|
||||
if (data.include?(":-"))
|
||||
matches << Language["Prolog"]
|
||||
@@ -73,7 +84,7 @@ module Linguist
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_ts(data, languages)
|
||||
def self.disambiguate_ts(data)
|
||||
matches = []
|
||||
if (data.include?("</translation>"))
|
||||
matches << Language["XML"]
|
||||
@@ -83,21 +94,24 @@ module Linguist
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_cl(data, languages)
|
||||
def self.disambiguate_cl(data)
|
||||
matches = []
|
||||
matches << Language["Common Lisp"] if data.include?("(defun ")
|
||||
matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data)
|
||||
if data.include?("(defun ")
|
||||
matches << Language["Common Lisp"]
|
||||
elsif /\/\* |\/\/ |^\}/.match(data)
|
||||
matches << Language["OpenCL"]
|
||||
end
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_r(data, languages)
|
||||
def self.disambiguate_r(data)
|
||||
matches = []
|
||||
matches << Language["Rebol"] if /\bRebol\b/i.match(data)
|
||||
matches << Language["R"] if data.include?("<-")
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_hack(data, languages)
|
||||
def self.disambiguate_hack(data)
|
||||
matches = []
|
||||
if data.include?("<?hh")
|
||||
matches << Language["Hack"]
|
||||
@@ -107,7 +121,7 @@ module Linguist
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_sc(data, languages)
|
||||
def self.disambiguate_sc(data)
|
||||
matches = []
|
||||
if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
|
||||
matches << Language["SuperCollider"]
|
||||
@@ -118,7 +132,7 @@ module Linguist
|
||||
matches
|
||||
end
|
||||
|
||||
def self.disambiguate_asc(data, languages)
|
||||
def self.disambiguate_asc(data)
|
||||
matches = []
|
||||
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
|
||||
matches
|
||||
|
||||
@@ -100,12 +100,8 @@ module Linguist
|
||||
def self.detect(blob)
|
||||
name = blob.name.to_s
|
||||
|
||||
# Check if the blob is possibly binary and bail early; this is a cheap
|
||||
# test that uses the extension name to guess a binary binary mime type.
|
||||
#
|
||||
# We'll perform a more comprehensive test later which actually involves
|
||||
# looking for binary characters in the blob
|
||||
return nil if blob.likely_binary? || blob.binary?
|
||||
# Bail early if the blob is binary or empty.
|
||||
return nil if blob.likely_binary? || blob.binary? || blob.empty?
|
||||
|
||||
# A bit of an elegant hack. If the file is executable but extensionless,
|
||||
# append a "magic" extension so it can be classified with other
|
||||
@@ -124,16 +120,18 @@ module Linguist
|
||||
if possible_languages.length > 1
|
||||
data = blob.data
|
||||
possible_language_names = possible_languages.map(&:name)
|
||||
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
|
||||
|
||||
if heuristic_languages.size > 1
|
||||
possible_language_names = heuristic_languages.map(&:name)
|
||||
end
|
||||
|
||||
# Don't bother with binary contents or an empty file
|
||||
if data.nil? || data == ""
|
||||
nil
|
||||
# Check if there's a shebang line and use that as authoritative
|
||||
elsif (result = find_by_shebang(data)) && !result.empty?
|
||||
if (result = find_by_shebang(data)) && !result.empty?
|
||||
result.first
|
||||
# No shebang. Still more work to do. Try to find it with our heuristics.
|
||||
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
|
||||
determined.first
|
||||
elsif heuristic_languages.size == 1
|
||||
heuristic_languages.first
|
||||
# Lastly, fall back to the probabilistic classifier.
|
||||
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
||||
# Return the actual Language object based of the string language name (i.e., first element of `#classify`)
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
(function() {
|
||||
|
||||
}).call(this);
|
||||
@@ -226,7 +226,6 @@ class TestBlob < Test::Unit::TestCase
|
||||
assert !blob("PostScript/sierpinski.ps").generated?
|
||||
|
||||
# These examples are too basic to tell
|
||||
assert !blob("JavaScript/empty.js").generated?
|
||||
assert !blob("JavaScript/hello.js").generated?
|
||||
|
||||
assert blob("JavaScript/intro-old.js").generated?
|
||||
@@ -469,4 +468,13 @@ class TestBlob < Test::Unit::TestCase
|
||||
def test_minified_files_not_safe_to_highlight
|
||||
assert !blob("JavaScript/jquery-1.6.1.min.js").safe_to_colorize?
|
||||
end
|
||||
|
||||
def test_empty
|
||||
blob = Struct.new(:data) { include Linguist::BlobHelper }
|
||||
|
||||
assert blob.new("").empty?
|
||||
assert blob.new(nil).empty?
|
||||
refute blob.new(" ").empty?
|
||||
refute blob.new("nope").empty?
|
||||
end
|
||||
end
|
||||
|
||||
@@ -20,18 +20,18 @@ class TestHeuristcs < Test::Unit::TestCase
|
||||
Dir.glob("#{samples_path}/#{language_name}/#{file}")
|
||||
end
|
||||
|
||||
# Candidate languages = ["C++", "Objective-C"]
|
||||
def test_obj_c_by_heuristics
|
||||
languages = ["C++", "Objective-C"]
|
||||
# Only calling out '.h' filenames as these are the ones causing issues
|
||||
all_fixtures("Objective-C", "*.h").each do |fixture|
|
||||
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"), languages)
|
||||
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"))
|
||||
assert_equal Language["Objective-C"], results.first
|
||||
end
|
||||
end
|
||||
|
||||
# Candidate languages = ["C++", "Objective-C"]
|
||||
def test_cpp_by_heuristics
|
||||
languages = ["C++", "Objective-C"]
|
||||
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"), languages)
|
||||
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"))
|
||||
assert_equal Language["C++"], results.first
|
||||
end
|
||||
|
||||
@@ -41,57 +41,57 @@ class TestHeuristcs < Test::Unit::TestCase
|
||||
assert_equal Language["Objective-C"], match
|
||||
end
|
||||
|
||||
# Candidate languages = ["Perl", "Prolog"]
|
||||
def test_pl_prolog_by_heuristics
|
||||
languages = ["Perl", "Prolog"]
|
||||
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages)
|
||||
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"))
|
||||
assert_equal Language["Prolog"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["Perl", "Prolog"]
|
||||
def test_pl_perl_by_heuristics
|
||||
languages = ["Perl", "Prolog"]
|
||||
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"), languages)
|
||||
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"))
|
||||
assert_equal Language["Perl"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["ECL", "Prolog"]
|
||||
def test_ecl_prolog_by_heuristics
|
||||
languages = ["ECL", "Prolog"]
|
||||
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"), languages)
|
||||
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"))
|
||||
assert_equal Language["Prolog"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["ECL", "Prolog"]
|
||||
def test_ecl_ecl_by_heuristics
|
||||
languages = ["ECL", "Prolog"]
|
||||
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"), languages)
|
||||
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"))
|
||||
assert_equal Language["ECL"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["IDL", "Prolog"]
|
||||
def test_pro_prolog_by_heuristics
|
||||
languages = ["IDL", "Prolog"]
|
||||
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"), languages)
|
||||
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"))
|
||||
assert_equal Language["Prolog"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["IDL", "Prolog"]
|
||||
def test_pro_idl_by_heuristics
|
||||
languages = ["IDL", "Prolog"]
|
||||
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"), languages)
|
||||
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"))
|
||||
assert_equal Language["IDL"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["AGS Script", "AsciiDoc"]
|
||||
def test_asc_asciidoc_by_heuristics
|
||||
languages = ["AGS Script", "AsciiDoc"]
|
||||
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"), languages)
|
||||
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"))
|
||||
assert_equal Language["AsciiDoc"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["TypeScript", "XML"]
|
||||
def test_ts_typescript_by_heuristics
|
||||
languages = ["TypeScript", "XML"]
|
||||
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"), languages)
|
||||
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"))
|
||||
assert_equal Language["TypeScript"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["TypeScript", "XML"]
|
||||
def test_ts_xml_by_heuristics
|
||||
languages = ["TypeScript", "XML"]
|
||||
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"), languages)
|
||||
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"))
|
||||
assert_equal Language["XML"], results.first
|
||||
end
|
||||
|
||||
@@ -99,27 +99,27 @@ class TestHeuristcs < Test::Unit::TestCase
|
||||
languages = ["Common Lisp", "OpenCL"]
|
||||
languages.each do |language|
|
||||
all_fixtures(language).each do |fixture|
|
||||
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"), languages)
|
||||
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"))
|
||||
assert_equal Language[language], results.first
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Candidate languages = ["Hack", "PHP"]
|
||||
def test_hack_by_heuristics
|
||||
languages = ["Hack", "PHP"]
|
||||
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"), languages)
|
||||
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"))
|
||||
assert_equal Language["Hack"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["Scala", "SuperCollider"]
|
||||
def test_sc_supercollider_by_heuristics
|
||||
languages = ["Scala", "SuperCollider"]
|
||||
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"), languages)
|
||||
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"))
|
||||
assert_equal Language["SuperCollider"], results.first
|
||||
end
|
||||
|
||||
# Candidate languages = ["Scala", "SuperCollider"]
|
||||
def test_sc_scala_by_heuristics
|
||||
languages = ["Scala", "SuperCollider"]
|
||||
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"), languages)
|
||||
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"))
|
||||
assert_equal Language["Scala"], results.first
|
||||
end
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user