Merge pull request #1674 from github/rework-heuristics

Rework heuristics
This commit is contained in:
Arfon Smith
2014-11-18 10:43:01 -06:00
8 changed files with 94 additions and 70 deletions

View File

@@ -146,6 +146,13 @@ module Linguist
end
end
# Public: Is the blob empty?
#
# Return true or false
def empty?
data.nil? || data == ""
end
# Public: Is the blob text?
#
# Return true or false

View File

@@ -13,26 +13,28 @@ module Linguist
# Returns an array of Languages or []
def self.find_by_heuristics(data, languages)
if active?
result = []
if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
result = disambiguate_pl(data, languages)
result = disambiguate_pl(data)
end
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
result = disambiguate_ecl(data, languages)
result = disambiguate_ecl(data)
end
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
result = disambiguate_pro(data, languages)
result = disambiguate_pro(data)
end
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
result = disambiguate_cl(data, languages)
result = disambiguate_cl(data)
end
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
result = disambiguate_hack(data, languages)
result = disambiguate_hack(data)
end
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
result = disambiguate_sc(data, languages)
result = disambiguate_sc(data)
end
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
result = disambiguate_asc(data, languages)
result = disambiguate_asc(data)
end
return result
end
@@ -42,28 +44,37 @@ module Linguist
# We want to shortcut look for Objective-C _and_ now C++ too!
#
# Returns an array of Languages or []
def self.disambiguate_c(data, languages)
def self.disambiguate_c(data)
matches = []
matches << Language["Objective-C"] if data.include?("@interface")
matches << Language["C++"] if data.include?("#include <cstdint>")
if data.include?("@interface")
matches << Language["Objective-C"]
elsif data.include?("#include <cstdint>")
matches << Language["C++"]
end
matches
end
def self.disambiguate_pl(data, languages)
def self.disambiguate_pl(data)
matches = []
matches << Language["Prolog"] if data.include?(":-")
matches << Language["Perl"] if data.include?("use strict")
if data.include?("use strict")
matches << Language["Perl"]
elsif data.include?(":-")
matches << Language["Prolog"]
end
matches
end
def self.disambiguate_ecl(data, languages)
def self.disambiguate_ecl(data)
matches = []
matches << Language["Prolog"] if data.include?(":-")
matches << Language["ECL"] if data.include?(":=")
if data.include?(":-")
matches << Language["Prolog"]
elsif data.include?(":=")
matches << Language["ECL"]
end
matches
end
def self.disambiguate_pro(data, languages)
def self.disambiguate_pro(data)
matches = []
if (data.include?(":-"))
matches << Language["Prolog"]
@@ -73,7 +84,7 @@ module Linguist
matches
end
def self.disambiguate_ts(data, languages)
def self.disambiguate_ts(data)
matches = []
if (data.include?("</translation>"))
matches << Language["XML"]
@@ -83,21 +94,24 @@ module Linguist
matches
end
def self.disambiguate_cl(data, languages)
def self.disambiguate_cl(data)
matches = []
matches << Language["Common Lisp"] if data.include?("(defun ")
matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data)
if data.include?("(defun ")
matches << Language["Common Lisp"]
elsif /\/\* |\/\/ |^\}/.match(data)
matches << Language["OpenCL"]
end
matches
end
def self.disambiguate_r(data, languages)
def self.disambiguate_r(data)
matches = []
matches << Language["Rebol"] if /\bRebol\b/i.match(data)
matches << Language["R"] if data.include?("<-")
matches
end
def self.disambiguate_hack(data, languages)
def self.disambiguate_hack(data)
matches = []
if data.include?("<?hh")
matches << Language["Hack"]
@@ -107,7 +121,7 @@ module Linguist
matches
end
def self.disambiguate_sc(data, languages)
def self.disambiguate_sc(data)
matches = []
if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
matches << Language["SuperCollider"]
@@ -118,7 +132,7 @@ module Linguist
matches
end
def self.disambiguate_asc(data, languages)
def self.disambiguate_asc(data)
matches = []
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
matches

View File

@@ -100,12 +100,8 @@ module Linguist
def self.detect(blob)
name = blob.name.to_s
# Check if the blob is possibly binary and bail early; this is a cheap
# test that uses the extension name to guess a binary binary mime type.
#
# We'll perform a more comprehensive test later which actually involves
# looking for binary characters in the blob
return nil if blob.likely_binary? || blob.binary?
# Bail early if the blob is binary or empty.
return nil if blob.likely_binary? || blob.binary? || blob.empty?
# A bit of an elegant hack. If the file is executable but extensionless,
# append a "magic" extension so it can be classified with other
@@ -124,16 +120,18 @@ module Linguist
if possible_languages.length > 1
data = blob.data
possible_language_names = possible_languages.map(&:name)
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
if heuristic_languages.size > 1
possible_language_names = heuristic_languages.map(&:name)
end
# Don't bother with binary contents or an empty file
if data.nil? || data == ""
nil
# Check if there's a shebang line and use that as authoritative
elsif (result = find_by_shebang(data)) && !result.empty?
if (result = find_by_shebang(data)) && !result.empty?
result.first
# No shebang. Still more work to do. Try to find it with our heuristics.
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
determined.first
elsif heuristic_languages.size == 1
heuristic_languages.first
# Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based of the string language name (i.e., first element of `#classify`)

View File

@@ -1,3 +0,0 @@
(function() {
}).call(this);

View File

@@ -226,7 +226,6 @@ class TestBlob < Test::Unit::TestCase
assert !blob("PostScript/sierpinski.ps").generated?
# These examples are too basic to tell
assert !blob("JavaScript/empty.js").generated?
assert !blob("JavaScript/hello.js").generated?
assert blob("JavaScript/intro-old.js").generated?
@@ -469,4 +468,13 @@ class TestBlob < Test::Unit::TestCase
def test_minified_files_not_safe_to_highlight
assert !blob("JavaScript/jquery-1.6.1.min.js").safe_to_colorize?
end
def test_empty
blob = Struct.new(:data) { include Linguist::BlobHelper }
assert blob.new("").empty?
assert blob.new(nil).empty?
refute blob.new(" ").empty?
refute blob.new("nope").empty?
end
end

View File

@@ -20,18 +20,18 @@ class TestHeuristcs < Test::Unit::TestCase
Dir.glob("#{samples_path}/#{language_name}/#{file}")
end
# Candidate languages = ["C++", "Objective-C"]
def test_obj_c_by_heuristics
languages = ["C++", "Objective-C"]
# Only calling out '.h' filenames as these are the ones causing issues
all_fixtures("Objective-C", "*.h").each do |fixture|
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"), languages)
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"))
assert_equal Language["Objective-C"], results.first
end
end
# Candidate languages = ["C++", "Objective-C"]
def test_cpp_by_heuristics
languages = ["C++", "Objective-C"]
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"), languages)
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"))
assert_equal Language["C++"], results.first
end
@@ -41,57 +41,57 @@ class TestHeuristcs < Test::Unit::TestCase
assert_equal Language["Objective-C"], match
end
# Candidate languages = ["Perl", "Prolog"]
def test_pl_prolog_by_heuristics
languages = ["Perl", "Prolog"]
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages)
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"))
assert_equal Language["Prolog"], results.first
end
# Candidate languages = ["Perl", "Prolog"]
def test_pl_perl_by_heuristics
languages = ["Perl", "Prolog"]
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"), languages)
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"))
assert_equal Language["Perl"], results.first
end
# Candidate languages = ["ECL", "Prolog"]
def test_ecl_prolog_by_heuristics
languages = ["ECL", "Prolog"]
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"), languages)
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"))
assert_equal Language["Prolog"], results.first
end
# Candidate languages = ["ECL", "Prolog"]
def test_ecl_ecl_by_heuristics
languages = ["ECL", "Prolog"]
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"), languages)
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"))
assert_equal Language["ECL"], results.first
end
# Candidate languages = ["IDL", "Prolog"]
def test_pro_prolog_by_heuristics
languages = ["IDL", "Prolog"]
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"), languages)
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"))
assert_equal Language["Prolog"], results.first
end
# Candidate languages = ["IDL", "Prolog"]
def test_pro_idl_by_heuristics
languages = ["IDL", "Prolog"]
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"), languages)
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"))
assert_equal Language["IDL"], results.first
end
# Candidate languages = ["AGS Script", "AsciiDoc"]
def test_asc_asciidoc_by_heuristics
languages = ["AGS Script", "AsciiDoc"]
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"), languages)
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"))
assert_equal Language["AsciiDoc"], results.first
end
# Candidate languages = ["TypeScript", "XML"]
def test_ts_typescript_by_heuristics
languages = ["TypeScript", "XML"]
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"), languages)
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"))
assert_equal Language["TypeScript"], results.first
end
# Candidate languages = ["TypeScript", "XML"]
def test_ts_xml_by_heuristics
languages = ["TypeScript", "XML"]
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"), languages)
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"))
assert_equal Language["XML"], results.first
end
@@ -99,27 +99,27 @@ class TestHeuristcs < Test::Unit::TestCase
languages = ["Common Lisp", "OpenCL"]
languages.each do |language|
all_fixtures(language).each do |fixture|
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"), languages)
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"))
assert_equal Language[language], results.first
end
end
end
# Candidate languages = ["Hack", "PHP"]
def test_hack_by_heuristics
languages = ["Hack", "PHP"]
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"), languages)
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"))
assert_equal Language["Hack"], results.first
end
# Candidate languages = ["Scala", "SuperCollider"]
def test_sc_supercollider_by_heuristics
languages = ["Scala", "SuperCollider"]
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"), languages)
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"))
assert_equal Language["SuperCollider"], results.first
end
# Candidate languages = ["Scala", "SuperCollider"]
def test_sc_scala_by_heuristics
languages = ["Scala", "SuperCollider"]
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"), languages)
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"))
assert_equal Language["Scala"], results.first
end
end