diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index d6d3dd30..c368b4d0 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -146,6 +146,13 @@ module Linguist end end + # Public: Is the blob empty? + # + # Return true or false + def empty? + data.nil? || data == "" + end + # Public: Is the blob text? # # Return true or false diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index c7519881..9c1d9948 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -13,26 +13,28 @@ module Linguist # Returns an array of Languages or [] def self.find_by_heuristics(data, languages) if active? + result = [] + if languages.all? { |l| ["Perl", "Prolog"].include?(l) } - result = disambiguate_pl(data, languages) + result = disambiguate_pl(data) end if languages.all? { |l| ["ECL", "Prolog"].include?(l) } - result = disambiguate_ecl(data, languages) + result = disambiguate_ecl(data) end if languages.all? { |l| ["IDL", "Prolog"].include?(l) } - result = disambiguate_pro(data, languages) + result = disambiguate_pro(data) end if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) } - result = disambiguate_cl(data, languages) + result = disambiguate_cl(data) end if languages.all? { |l| ["Hack", "PHP"].include?(l) } - result = disambiguate_hack(data, languages) + result = disambiguate_hack(data) end if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) } - result = disambiguate_sc(data, languages) + result = disambiguate_sc(data) end if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) } - result = disambiguate_asc(data, languages) + result = disambiguate_asc(data) end return result end @@ -42,28 +44,37 @@ module Linguist # We want to shortcut look for Objective-C _and_ now C++ too! # # Returns an array of Languages or [] - def self.disambiguate_c(data, languages) + def self.disambiguate_c(data) matches = [] - matches << Language["Objective-C"] if data.include?("@interface") - matches << Language["C++"] if data.include?("#include ") + if data.include?("@interface") + matches << Language["Objective-C"] + elsif data.include?("#include ") + matches << Language["C++"] + end matches end - def self.disambiguate_pl(data, languages) + def self.disambiguate_pl(data) matches = [] - matches << Language["Prolog"] if data.include?(":-") - matches << Language["Perl"] if data.include?("use strict") + if data.include?("use strict") + matches << Language["Perl"] + elsif data.include?(":-") + matches << Language["Prolog"] + end matches end - def self.disambiguate_ecl(data, languages) + def self.disambiguate_ecl(data) matches = [] - matches << Language["Prolog"] if data.include?(":-") - matches << Language["ECL"] if data.include?(":=") + if data.include?(":-") + matches << Language["Prolog"] + elsif data.include?(":=") + matches << Language["ECL"] + end matches end - def self.disambiguate_pro(data, languages) + def self.disambiguate_pro(data) matches = [] if (data.include?(":-")) matches << Language["Prolog"] @@ -73,7 +84,7 @@ module Linguist matches end - def self.disambiguate_ts(data, languages) + def self.disambiguate_ts(data) matches = [] if (data.include?("")) matches << Language["XML"] @@ -83,21 +94,24 @@ module Linguist matches end - def self.disambiguate_cl(data, languages) + def self.disambiguate_cl(data) matches = [] - matches << Language["Common Lisp"] if data.include?("(defun ") - matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data) + if data.include?("(defun ") + matches << Language["Common Lisp"] + elsif /\/\* |\/\/ |^\}/.match(data) + matches << Language["OpenCL"] + end matches end - def self.disambiguate_r(data, languages) + def self.disambiguate_r(data) matches = [] matches << Language["Rebol"] if /\bRebol\b/i.match(data) matches << Language["R"] if data.include?("<-") matches end - def self.disambiguate_hack(data, languages) + def self.disambiguate_hack(data) matches = [] if data.include?(" 1 data = blob.data possible_language_names = possible_languages.map(&:name) + heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names) + + if heuristic_languages.size > 1 + possible_language_names = heuristic_languages.map(&:name) + end - # Don't bother with binary contents or an empty file - if data.nil? || data == "" - nil # Check if there's a shebang line and use that as authoritative - elsif (result = find_by_shebang(data)) && !result.empty? + if (result = find_by_shebang(data)) && !result.empty? result.first # No shebang. Still more work to do. Try to find it with our heuristics. - elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty? - determined.first + elsif heuristic_languages.size == 1 + heuristic_languages.first # Lastly, fall back to the probabilistic classifier. elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first # Return the actual Language object based of the string language name (i.e., first element of `#classify`) diff --git a/samples/CoffeeScript/empty.coffee b/samples/CoffeeScript/empty.coffee deleted file mode 100644 index e69de29b..00000000 diff --git a/samples/JavaScript/empty.js b/samples/JavaScript/empty.js deleted file mode 100644 index f5e757a8..00000000 --- a/samples/JavaScript/empty.js +++ /dev/null @@ -1,3 +0,0 @@ -(function() { - -}).call(this); diff --git a/samples/TypeScript/empty.ts b/samples/TypeScript/empty.ts deleted file mode 100644 index e69de29b..00000000 diff --git a/test/test_blob.rb b/test/test_blob.rb index 1fed5357..b0bdc368 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -226,7 +226,6 @@ class TestBlob < Test::Unit::TestCase assert !blob("PostScript/sierpinski.ps").generated? # These examples are too basic to tell - assert !blob("JavaScript/empty.js").generated? assert !blob("JavaScript/hello.js").generated? assert blob("JavaScript/intro-old.js").generated? @@ -469,4 +468,13 @@ class TestBlob < Test::Unit::TestCase def test_minified_files_not_safe_to_highlight assert !blob("JavaScript/jquery-1.6.1.min.js").safe_to_colorize? end + + def test_empty + blob = Struct.new(:data) { include Linguist::BlobHelper } + + assert blob.new("").empty? + assert blob.new(nil).empty? + refute blob.new(" ").empty? + refute blob.new("nope").empty? + end end diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index e75fb470..3e04daf3 100644 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -20,18 +20,18 @@ class TestHeuristcs < Test::Unit::TestCase Dir.glob("#{samples_path}/#{language_name}/#{file}") end + # Candidate languages = ["C++", "Objective-C"] def test_obj_c_by_heuristics - languages = ["C++", "Objective-C"] # Only calling out '.h' filenames as these are the ones causing issues all_fixtures("Objective-C", "*.h").each do |fixture| - results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"), languages) + results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}")) assert_equal Language["Objective-C"], results.first end end + # Candidate languages = ["C++", "Objective-C"] def test_cpp_by_heuristics - languages = ["C++", "Objective-C"] - results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"), languages) + results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp")) assert_equal Language["C++"], results.first end @@ -41,57 +41,57 @@ class TestHeuristcs < Test::Unit::TestCase assert_equal Language["Objective-C"], match end + # Candidate languages = ["Perl", "Prolog"] def test_pl_prolog_by_heuristics - languages = ["Perl", "Prolog"] - results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages) + results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl")) assert_equal Language["Prolog"], results.first end + # Candidate languages = ["Perl", "Prolog"] def test_pl_perl_by_heuristics - languages = ["Perl", "Prolog"] - results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"), languages) + results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t")) assert_equal Language["Perl"], results.first end + # Candidate languages = ["ECL", "Prolog"] def test_ecl_prolog_by_heuristics - languages = ["ECL", "Prolog"] - results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"), languages) + results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl")) assert_equal Language["Prolog"], results.first end + # Candidate languages = ["ECL", "Prolog"] def test_ecl_ecl_by_heuristics - languages = ["ECL", "Prolog"] - results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"), languages) + results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl")) assert_equal Language["ECL"], results.first end + # Candidate languages = ["IDL", "Prolog"] def test_pro_prolog_by_heuristics - languages = ["IDL", "Prolog"] - results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"), languages) + results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro")) assert_equal Language["Prolog"], results.first end + # Candidate languages = ["IDL", "Prolog"] def test_pro_idl_by_heuristics - languages = ["IDL", "Prolog"] - results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"), languages) + results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro")) assert_equal Language["IDL"], results.first end + # Candidate languages = ["AGS Script", "AsciiDoc"] def test_asc_asciidoc_by_heuristics - languages = ["AGS Script", "AsciiDoc"] - results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"), languages) + results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc")) assert_equal Language["AsciiDoc"], results.first end + # Candidate languages = ["TypeScript", "XML"] def test_ts_typescript_by_heuristics - languages = ["TypeScript", "XML"] - results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"), languages) + results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts")) assert_equal Language["TypeScript"], results.first end + # Candidate languages = ["TypeScript", "XML"] def test_ts_xml_by_heuristics - languages = ["TypeScript", "XML"] - results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"), languages) + results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml")) assert_equal Language["XML"], results.first end @@ -99,27 +99,27 @@ class TestHeuristcs < Test::Unit::TestCase languages = ["Common Lisp", "OpenCL"] languages.each do |language| all_fixtures(language).each do |fixture| - results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"), languages) + results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}")) assert_equal Language[language], results.first end end end + # Candidate languages = ["Hack", "PHP"] def test_hack_by_heuristics - languages = ["Hack", "PHP"] - results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"), languages) + results = Heuristics.disambiguate_hack(fixture("Hack/funs.php")) assert_equal Language["Hack"], results.first end + # Candidate languages = ["Scala", "SuperCollider"] def test_sc_supercollider_by_heuristics - languages = ["Scala", "SuperCollider"] - results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"), languages) + results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc")) assert_equal Language["SuperCollider"], results.first end + # Candidate languages = ["Scala", "SuperCollider"] def test_sc_scala_by_heuristics - languages = ["Scala", "SuperCollider"] - results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"), languages) + results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc")) assert_equal Language["Scala"], results.first end end