mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-12-08 20:38:47 +00:00
Merge pull request #1674 from github/rework-heuristics
Rework heuristics
This commit is contained in:
@@ -146,6 +146,13 @@ module Linguist
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Public: Is the blob empty?
|
||||||
|
#
|
||||||
|
# Return true or false
|
||||||
|
def empty?
|
||||||
|
data.nil? || data == ""
|
||||||
|
end
|
||||||
|
|
||||||
# Public: Is the blob text?
|
# Public: Is the blob text?
|
||||||
#
|
#
|
||||||
# Return true or false
|
# Return true or false
|
||||||
|
|||||||
@@ -13,26 +13,28 @@ module Linguist
|
|||||||
# Returns an array of Languages or []
|
# Returns an array of Languages or []
|
||||||
def self.find_by_heuristics(data, languages)
|
def self.find_by_heuristics(data, languages)
|
||||||
if active?
|
if active?
|
||||||
|
result = []
|
||||||
|
|
||||||
if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
|
if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
|
||||||
result = disambiguate_pl(data, languages)
|
result = disambiguate_pl(data)
|
||||||
end
|
end
|
||||||
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
|
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
|
||||||
result = disambiguate_ecl(data, languages)
|
result = disambiguate_ecl(data)
|
||||||
end
|
end
|
||||||
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
|
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
|
||||||
result = disambiguate_pro(data, languages)
|
result = disambiguate_pro(data)
|
||||||
end
|
end
|
||||||
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
|
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
|
||||||
result = disambiguate_cl(data, languages)
|
result = disambiguate_cl(data)
|
||||||
end
|
end
|
||||||
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
|
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
|
||||||
result = disambiguate_hack(data, languages)
|
result = disambiguate_hack(data)
|
||||||
end
|
end
|
||||||
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
|
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
|
||||||
result = disambiguate_sc(data, languages)
|
result = disambiguate_sc(data)
|
||||||
end
|
end
|
||||||
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
|
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
|
||||||
result = disambiguate_asc(data, languages)
|
result = disambiguate_asc(data)
|
||||||
end
|
end
|
||||||
return result
|
return result
|
||||||
end
|
end
|
||||||
@@ -42,28 +44,37 @@ module Linguist
|
|||||||
# We want to shortcut look for Objective-C _and_ now C++ too!
|
# We want to shortcut look for Objective-C _and_ now C++ too!
|
||||||
#
|
#
|
||||||
# Returns an array of Languages or []
|
# Returns an array of Languages or []
|
||||||
def self.disambiguate_c(data, languages)
|
def self.disambiguate_c(data)
|
||||||
matches = []
|
matches = []
|
||||||
matches << Language["Objective-C"] if data.include?("@interface")
|
if data.include?("@interface")
|
||||||
matches << Language["C++"] if data.include?("#include <cstdint>")
|
matches << Language["Objective-C"]
|
||||||
|
elsif data.include?("#include <cstdint>")
|
||||||
|
matches << Language["C++"]
|
||||||
|
end
|
||||||
matches
|
matches
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.disambiguate_pl(data, languages)
|
def self.disambiguate_pl(data)
|
||||||
matches = []
|
matches = []
|
||||||
matches << Language["Prolog"] if data.include?(":-")
|
if data.include?("use strict")
|
||||||
matches << Language["Perl"] if data.include?("use strict")
|
matches << Language["Perl"]
|
||||||
|
elsif data.include?(":-")
|
||||||
|
matches << Language["Prolog"]
|
||||||
|
end
|
||||||
matches
|
matches
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.disambiguate_ecl(data, languages)
|
def self.disambiguate_ecl(data)
|
||||||
matches = []
|
matches = []
|
||||||
matches << Language["Prolog"] if data.include?(":-")
|
if data.include?(":-")
|
||||||
matches << Language["ECL"] if data.include?(":=")
|
matches << Language["Prolog"]
|
||||||
|
elsif data.include?(":=")
|
||||||
|
matches << Language["ECL"]
|
||||||
|
end
|
||||||
matches
|
matches
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.disambiguate_pro(data, languages)
|
def self.disambiguate_pro(data)
|
||||||
matches = []
|
matches = []
|
||||||
if (data.include?(":-"))
|
if (data.include?(":-"))
|
||||||
matches << Language["Prolog"]
|
matches << Language["Prolog"]
|
||||||
@@ -73,7 +84,7 @@ module Linguist
|
|||||||
matches
|
matches
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.disambiguate_ts(data, languages)
|
def self.disambiguate_ts(data)
|
||||||
matches = []
|
matches = []
|
||||||
if (data.include?("</translation>"))
|
if (data.include?("</translation>"))
|
||||||
matches << Language["XML"]
|
matches << Language["XML"]
|
||||||
@@ -83,21 +94,24 @@ module Linguist
|
|||||||
matches
|
matches
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.disambiguate_cl(data, languages)
|
def self.disambiguate_cl(data)
|
||||||
matches = []
|
matches = []
|
||||||
matches << Language["Common Lisp"] if data.include?("(defun ")
|
if data.include?("(defun ")
|
||||||
matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data)
|
matches << Language["Common Lisp"]
|
||||||
|
elsif /\/\* |\/\/ |^\}/.match(data)
|
||||||
|
matches << Language["OpenCL"]
|
||||||
|
end
|
||||||
matches
|
matches
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.disambiguate_r(data, languages)
|
def self.disambiguate_r(data)
|
||||||
matches = []
|
matches = []
|
||||||
matches << Language["Rebol"] if /\bRebol\b/i.match(data)
|
matches << Language["Rebol"] if /\bRebol\b/i.match(data)
|
||||||
matches << Language["R"] if data.include?("<-")
|
matches << Language["R"] if data.include?("<-")
|
||||||
matches
|
matches
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.disambiguate_hack(data, languages)
|
def self.disambiguate_hack(data)
|
||||||
matches = []
|
matches = []
|
||||||
if data.include?("<?hh")
|
if data.include?("<?hh")
|
||||||
matches << Language["Hack"]
|
matches << Language["Hack"]
|
||||||
@@ -107,7 +121,7 @@ module Linguist
|
|||||||
matches
|
matches
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.disambiguate_sc(data, languages)
|
def self.disambiguate_sc(data)
|
||||||
matches = []
|
matches = []
|
||||||
if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
|
if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
|
||||||
matches << Language["SuperCollider"]
|
matches << Language["SuperCollider"]
|
||||||
@@ -118,7 +132,7 @@ module Linguist
|
|||||||
matches
|
matches
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.disambiguate_asc(data, languages)
|
def self.disambiguate_asc(data)
|
||||||
matches = []
|
matches = []
|
||||||
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
|
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
|
||||||
matches
|
matches
|
||||||
|
|||||||
@@ -100,12 +100,8 @@ module Linguist
|
|||||||
def self.detect(blob)
|
def self.detect(blob)
|
||||||
name = blob.name.to_s
|
name = blob.name.to_s
|
||||||
|
|
||||||
# Check if the blob is possibly binary and bail early; this is a cheap
|
# Bail early if the blob is binary or empty.
|
||||||
# test that uses the extension name to guess a binary binary mime type.
|
return nil if blob.likely_binary? || blob.binary? || blob.empty?
|
||||||
#
|
|
||||||
# We'll perform a more comprehensive test later which actually involves
|
|
||||||
# looking for binary characters in the blob
|
|
||||||
return nil if blob.likely_binary? || blob.binary?
|
|
||||||
|
|
||||||
# A bit of an elegant hack. If the file is executable but extensionless,
|
# A bit of an elegant hack. If the file is executable but extensionless,
|
||||||
# append a "magic" extension so it can be classified with other
|
# append a "magic" extension so it can be classified with other
|
||||||
@@ -124,16 +120,18 @@ module Linguist
|
|||||||
if possible_languages.length > 1
|
if possible_languages.length > 1
|
||||||
data = blob.data
|
data = blob.data
|
||||||
possible_language_names = possible_languages.map(&:name)
|
possible_language_names = possible_languages.map(&:name)
|
||||||
|
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
|
||||||
|
|
||||||
|
if heuristic_languages.size > 1
|
||||||
|
possible_language_names = heuristic_languages.map(&:name)
|
||||||
|
end
|
||||||
|
|
||||||
# Don't bother with binary contents or an empty file
|
|
||||||
if data.nil? || data == ""
|
|
||||||
nil
|
|
||||||
# Check if there's a shebang line and use that as authoritative
|
# Check if there's a shebang line and use that as authoritative
|
||||||
elsif (result = find_by_shebang(data)) && !result.empty?
|
if (result = find_by_shebang(data)) && !result.empty?
|
||||||
result.first
|
result.first
|
||||||
# No shebang. Still more work to do. Try to find it with our heuristics.
|
# No shebang. Still more work to do. Try to find it with our heuristics.
|
||||||
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
|
elsif heuristic_languages.size == 1
|
||||||
determined.first
|
heuristic_languages.first
|
||||||
# Lastly, fall back to the probabilistic classifier.
|
# Lastly, fall back to the probabilistic classifier.
|
||||||
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
||||||
# Return the actual Language object based of the string language name (i.e., first element of `#classify`)
|
# Return the actual Language object based of the string language name (i.e., first element of `#classify`)
|
||||||
|
|||||||
@@ -1,3 +0,0 @@
|
|||||||
(function() {
|
|
||||||
|
|
||||||
}).call(this);
|
|
||||||
@@ -226,7 +226,6 @@ class TestBlob < Test::Unit::TestCase
|
|||||||
assert !blob("PostScript/sierpinski.ps").generated?
|
assert !blob("PostScript/sierpinski.ps").generated?
|
||||||
|
|
||||||
# These examples are too basic to tell
|
# These examples are too basic to tell
|
||||||
assert !blob("JavaScript/empty.js").generated?
|
|
||||||
assert !blob("JavaScript/hello.js").generated?
|
assert !blob("JavaScript/hello.js").generated?
|
||||||
|
|
||||||
assert blob("JavaScript/intro-old.js").generated?
|
assert blob("JavaScript/intro-old.js").generated?
|
||||||
@@ -469,4 +468,13 @@ class TestBlob < Test::Unit::TestCase
|
|||||||
def test_minified_files_not_safe_to_highlight
|
def test_minified_files_not_safe_to_highlight
|
||||||
assert !blob("JavaScript/jquery-1.6.1.min.js").safe_to_colorize?
|
assert !blob("JavaScript/jquery-1.6.1.min.js").safe_to_colorize?
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_empty
|
||||||
|
blob = Struct.new(:data) { include Linguist::BlobHelper }
|
||||||
|
|
||||||
|
assert blob.new("").empty?
|
||||||
|
assert blob.new(nil).empty?
|
||||||
|
refute blob.new(" ").empty?
|
||||||
|
refute blob.new("nope").empty?
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -20,18 +20,18 @@ class TestHeuristcs < Test::Unit::TestCase
|
|||||||
Dir.glob("#{samples_path}/#{language_name}/#{file}")
|
Dir.glob("#{samples_path}/#{language_name}/#{file}")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["C++", "Objective-C"]
|
||||||
def test_obj_c_by_heuristics
|
def test_obj_c_by_heuristics
|
||||||
languages = ["C++", "Objective-C"]
|
|
||||||
# Only calling out '.h' filenames as these are the ones causing issues
|
# Only calling out '.h' filenames as these are the ones causing issues
|
||||||
all_fixtures("Objective-C", "*.h").each do |fixture|
|
all_fixtures("Objective-C", "*.h").each do |fixture|
|
||||||
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"), languages)
|
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"))
|
||||||
assert_equal Language["Objective-C"], results.first
|
assert_equal Language["Objective-C"], results.first
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["C++", "Objective-C"]
|
||||||
def test_cpp_by_heuristics
|
def test_cpp_by_heuristics
|
||||||
languages = ["C++", "Objective-C"]
|
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"))
|
||||||
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"), languages)
|
|
||||||
assert_equal Language["C++"], results.first
|
assert_equal Language["C++"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -41,57 +41,57 @@ class TestHeuristcs < Test::Unit::TestCase
|
|||||||
assert_equal Language["Objective-C"], match
|
assert_equal Language["Objective-C"], match
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["Perl", "Prolog"]
|
||||||
def test_pl_prolog_by_heuristics
|
def test_pl_prolog_by_heuristics
|
||||||
languages = ["Perl", "Prolog"]
|
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"))
|
||||||
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages)
|
|
||||||
assert_equal Language["Prolog"], results.first
|
assert_equal Language["Prolog"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["Perl", "Prolog"]
|
||||||
def test_pl_perl_by_heuristics
|
def test_pl_perl_by_heuristics
|
||||||
languages = ["Perl", "Prolog"]
|
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"))
|
||||||
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"), languages)
|
|
||||||
assert_equal Language["Perl"], results.first
|
assert_equal Language["Perl"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["ECL", "Prolog"]
|
||||||
def test_ecl_prolog_by_heuristics
|
def test_ecl_prolog_by_heuristics
|
||||||
languages = ["ECL", "Prolog"]
|
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"))
|
||||||
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"), languages)
|
|
||||||
assert_equal Language["Prolog"], results.first
|
assert_equal Language["Prolog"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["ECL", "Prolog"]
|
||||||
def test_ecl_ecl_by_heuristics
|
def test_ecl_ecl_by_heuristics
|
||||||
languages = ["ECL", "Prolog"]
|
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"))
|
||||||
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"), languages)
|
|
||||||
assert_equal Language["ECL"], results.first
|
assert_equal Language["ECL"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["IDL", "Prolog"]
|
||||||
def test_pro_prolog_by_heuristics
|
def test_pro_prolog_by_heuristics
|
||||||
languages = ["IDL", "Prolog"]
|
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"))
|
||||||
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"), languages)
|
|
||||||
assert_equal Language["Prolog"], results.first
|
assert_equal Language["Prolog"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["IDL", "Prolog"]
|
||||||
def test_pro_idl_by_heuristics
|
def test_pro_idl_by_heuristics
|
||||||
languages = ["IDL", "Prolog"]
|
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"))
|
||||||
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"), languages)
|
|
||||||
assert_equal Language["IDL"], results.first
|
assert_equal Language["IDL"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["AGS Script", "AsciiDoc"]
|
||||||
def test_asc_asciidoc_by_heuristics
|
def test_asc_asciidoc_by_heuristics
|
||||||
languages = ["AGS Script", "AsciiDoc"]
|
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"))
|
||||||
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"), languages)
|
|
||||||
assert_equal Language["AsciiDoc"], results.first
|
assert_equal Language["AsciiDoc"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["TypeScript", "XML"]
|
||||||
def test_ts_typescript_by_heuristics
|
def test_ts_typescript_by_heuristics
|
||||||
languages = ["TypeScript", "XML"]
|
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"))
|
||||||
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"), languages)
|
|
||||||
assert_equal Language["TypeScript"], results.first
|
assert_equal Language["TypeScript"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["TypeScript", "XML"]
|
||||||
def test_ts_xml_by_heuristics
|
def test_ts_xml_by_heuristics
|
||||||
languages = ["TypeScript", "XML"]
|
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"))
|
||||||
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"), languages)
|
|
||||||
assert_equal Language["XML"], results.first
|
assert_equal Language["XML"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -99,27 +99,27 @@ class TestHeuristcs < Test::Unit::TestCase
|
|||||||
languages = ["Common Lisp", "OpenCL"]
|
languages = ["Common Lisp", "OpenCL"]
|
||||||
languages.each do |language|
|
languages.each do |language|
|
||||||
all_fixtures(language).each do |fixture|
|
all_fixtures(language).each do |fixture|
|
||||||
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"), languages)
|
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"))
|
||||||
assert_equal Language[language], results.first
|
assert_equal Language[language], results.first
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["Hack", "PHP"]
|
||||||
def test_hack_by_heuristics
|
def test_hack_by_heuristics
|
||||||
languages = ["Hack", "PHP"]
|
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"))
|
||||||
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"), languages)
|
|
||||||
assert_equal Language["Hack"], results.first
|
assert_equal Language["Hack"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["Scala", "SuperCollider"]
|
||||||
def test_sc_supercollider_by_heuristics
|
def test_sc_supercollider_by_heuristics
|
||||||
languages = ["Scala", "SuperCollider"]
|
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"))
|
||||||
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"), languages)
|
|
||||||
assert_equal Language["SuperCollider"], results.first
|
assert_equal Language["SuperCollider"], results.first
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Candidate languages = ["Scala", "SuperCollider"]
|
||||||
def test_sc_scala_by_heuristics
|
def test_sc_scala_by_heuristics
|
||||||
languages = ["Scala", "SuperCollider"]
|
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"))
|
||||||
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"), languages)
|
|
||||||
assert_equal Language["Scala"], results.first
|
assert_equal Language["Scala"], results.first
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
Reference in New Issue
Block a user