mirror of
				https://github.com/KevinMidboe/linguist.git
				synced 2025-10-29 17:50:22 +00:00 
			
		
		
		
	Merge pull request #1674 from github/rework-heuristics
Rework heuristics
This commit is contained in:
		@@ -146,6 +146,13 @@ module Linguist
 | 
			
		||||
      end
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    # Public: Is the blob empty?
 | 
			
		||||
    #
 | 
			
		||||
    # Return true or false
 | 
			
		||||
    def empty?
 | 
			
		||||
      data.nil? || data == ""
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    # Public: Is the blob text?
 | 
			
		||||
    #
 | 
			
		||||
    # Return true or false
 | 
			
		||||
 
 | 
			
		||||
@@ -13,26 +13,28 @@ module Linguist
 | 
			
		||||
    # Returns an array of Languages or []
 | 
			
		||||
    def self.find_by_heuristics(data, languages)
 | 
			
		||||
      if active?
 | 
			
		||||
        result = []
 | 
			
		||||
 | 
			
		||||
        if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
 | 
			
		||||
          result = disambiguate_pl(data, languages)
 | 
			
		||||
          result = disambiguate_pl(data)
 | 
			
		||||
        end
 | 
			
		||||
        if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
 | 
			
		||||
          result = disambiguate_ecl(data, languages)
 | 
			
		||||
          result = disambiguate_ecl(data)
 | 
			
		||||
        end
 | 
			
		||||
        if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
 | 
			
		||||
          result = disambiguate_pro(data, languages)
 | 
			
		||||
          result = disambiguate_pro(data)
 | 
			
		||||
        end
 | 
			
		||||
        if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
 | 
			
		||||
          result = disambiguate_cl(data, languages)
 | 
			
		||||
          result = disambiguate_cl(data)
 | 
			
		||||
        end
 | 
			
		||||
        if languages.all? { |l| ["Hack", "PHP"].include?(l) }
 | 
			
		||||
          result = disambiguate_hack(data, languages)
 | 
			
		||||
          result = disambiguate_hack(data)
 | 
			
		||||
        end
 | 
			
		||||
        if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
 | 
			
		||||
          result = disambiguate_sc(data, languages)
 | 
			
		||||
          result = disambiguate_sc(data)
 | 
			
		||||
        end
 | 
			
		||||
        if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
 | 
			
		||||
          result = disambiguate_asc(data, languages)
 | 
			
		||||
          result = disambiguate_asc(data)
 | 
			
		||||
        end
 | 
			
		||||
        return result
 | 
			
		||||
      end
 | 
			
		||||
@@ -42,28 +44,37 @@ module Linguist
 | 
			
		||||
    # We want to shortcut look for Objective-C _and_ now C++ too!
 | 
			
		||||
    #
 | 
			
		||||
    # Returns an array of Languages or []
 | 
			
		||||
    def self.disambiguate_c(data, languages)
 | 
			
		||||
    def self.disambiguate_c(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      matches << Language["Objective-C"] if data.include?("@interface")
 | 
			
		||||
      matches << Language["C++"] if data.include?("#include <cstdint>")
 | 
			
		||||
      if data.include?("@interface")
 | 
			
		||||
        matches << Language["Objective-C"]
 | 
			
		||||
      elsif data.include?("#include <cstdint>")
 | 
			
		||||
        matches << Language["C++"]
 | 
			
		||||
      end
 | 
			
		||||
      matches
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def self.disambiguate_pl(data, languages)
 | 
			
		||||
    def self.disambiguate_pl(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      matches << Language["Prolog"] if data.include?(":-")
 | 
			
		||||
      matches << Language["Perl"] if data.include?("use strict")
 | 
			
		||||
      if data.include?("use strict")
 | 
			
		||||
        matches << Language["Perl"]
 | 
			
		||||
      elsif data.include?(":-")
 | 
			
		||||
        matches << Language["Prolog"]
 | 
			
		||||
      end
 | 
			
		||||
      matches
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def self.disambiguate_ecl(data, languages)
 | 
			
		||||
    def self.disambiguate_ecl(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      matches << Language["Prolog"] if data.include?(":-")
 | 
			
		||||
      matches << Language["ECL"] if data.include?(":=")
 | 
			
		||||
      if data.include?(":-")
 | 
			
		||||
        matches << Language["Prolog"]
 | 
			
		||||
      elsif data.include?(":=")
 | 
			
		||||
        matches << Language["ECL"]
 | 
			
		||||
      end
 | 
			
		||||
      matches
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def self.disambiguate_pro(data, languages)
 | 
			
		||||
    def self.disambiguate_pro(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      if (data.include?(":-"))
 | 
			
		||||
        matches << Language["Prolog"]
 | 
			
		||||
@@ -73,7 +84,7 @@ module Linguist
 | 
			
		||||
      matches
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def self.disambiguate_ts(data, languages)
 | 
			
		||||
    def self.disambiguate_ts(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      if (data.include?("</translation>"))
 | 
			
		||||
        matches << Language["XML"]
 | 
			
		||||
@@ -83,21 +94,24 @@ module Linguist
 | 
			
		||||
      matches
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def self.disambiguate_cl(data, languages)
 | 
			
		||||
    def self.disambiguate_cl(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      matches << Language["Common Lisp"] if data.include?("(defun ")
 | 
			
		||||
      matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data)
 | 
			
		||||
      if data.include?("(defun ")
 | 
			
		||||
        matches << Language["Common Lisp"]
 | 
			
		||||
      elsif /\/\* |\/\/ |^\}/.match(data)
 | 
			
		||||
        matches << Language["OpenCL"]
 | 
			
		||||
      end
 | 
			
		||||
      matches
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def self.disambiguate_r(data, languages)
 | 
			
		||||
    def self.disambiguate_r(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      matches << Language["Rebol"] if /\bRebol\b/i.match(data)
 | 
			
		||||
      matches << Language["R"] if data.include?("<-")
 | 
			
		||||
      matches
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def self.disambiguate_hack(data, languages)
 | 
			
		||||
    def self.disambiguate_hack(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      if data.include?("<?hh")
 | 
			
		||||
        matches << Language["Hack"]
 | 
			
		||||
@@ -107,7 +121,7 @@ module Linguist
 | 
			
		||||
      matches
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def self.disambiguate_sc(data, languages)
 | 
			
		||||
    def self.disambiguate_sc(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
 | 
			
		||||
        matches << Language["SuperCollider"]
 | 
			
		||||
@@ -118,7 +132,7 @@ module Linguist
 | 
			
		||||
      matches
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def self.disambiguate_asc(data, languages)
 | 
			
		||||
    def self.disambiguate_asc(data)
 | 
			
		||||
      matches = []
 | 
			
		||||
      matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
 | 
			
		||||
      matches
 | 
			
		||||
 
 | 
			
		||||
@@ -100,12 +100,8 @@ module Linguist
 | 
			
		||||
    def self.detect(blob)
 | 
			
		||||
      name = blob.name.to_s
 | 
			
		||||
 | 
			
		||||
      # Check if the blob is possibly binary and bail early; this is a cheap
 | 
			
		||||
      # test that uses the extension name to guess a binary binary mime type.
 | 
			
		||||
      #
 | 
			
		||||
      # We'll perform a more comprehensive test later which actually involves
 | 
			
		||||
      # looking for binary characters in the blob
 | 
			
		||||
      return nil if blob.likely_binary? || blob.binary?
 | 
			
		||||
      # Bail early if the blob is binary or empty.
 | 
			
		||||
      return nil if blob.likely_binary? || blob.binary? || blob.empty?
 | 
			
		||||
 | 
			
		||||
      # A bit of an elegant hack. If the file is executable but extensionless,
 | 
			
		||||
      # append a "magic" extension so it can be classified with other
 | 
			
		||||
@@ -124,16 +120,18 @@ module Linguist
 | 
			
		||||
      if possible_languages.length > 1
 | 
			
		||||
        data = blob.data
 | 
			
		||||
        possible_language_names = possible_languages.map(&:name)
 | 
			
		||||
        heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
 | 
			
		||||
 | 
			
		||||
        if heuristic_languages.size > 1
 | 
			
		||||
          possible_language_names = heuristic_languages.map(&:name)
 | 
			
		||||
        end
 | 
			
		||||
 | 
			
		||||
        # Don't bother with binary contents or an empty file
 | 
			
		||||
        if data.nil? || data == ""
 | 
			
		||||
          nil
 | 
			
		||||
        # Check if there's a shebang line and use that as authoritative
 | 
			
		||||
        elsif (result = find_by_shebang(data)) && !result.empty?
 | 
			
		||||
        if (result = find_by_shebang(data)) && !result.empty?
 | 
			
		||||
          result.first
 | 
			
		||||
        # No shebang. Still more work to do. Try to find it with our heuristics.
 | 
			
		||||
        elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
 | 
			
		||||
          determined.first
 | 
			
		||||
        elsif heuristic_languages.size == 1
 | 
			
		||||
          heuristic_languages.first
 | 
			
		||||
        # Lastly, fall back to the probabilistic classifier.
 | 
			
		||||
        elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
 | 
			
		||||
          # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +0,0 @@
 | 
			
		||||
(function() {
 | 
			
		||||
 | 
			
		||||
}).call(this);
 | 
			
		||||
@@ -226,7 +226,6 @@ class TestBlob < Test::Unit::TestCase
 | 
			
		||||
    assert !blob("PostScript/sierpinski.ps").generated?
 | 
			
		||||
 | 
			
		||||
    # These examples are too basic to tell
 | 
			
		||||
    assert !blob("JavaScript/empty.js").generated?
 | 
			
		||||
    assert !blob("JavaScript/hello.js").generated?
 | 
			
		||||
 | 
			
		||||
    assert blob("JavaScript/intro-old.js").generated?
 | 
			
		||||
@@ -469,4 +468,13 @@ class TestBlob < Test::Unit::TestCase
 | 
			
		||||
  def test_minified_files_not_safe_to_highlight
 | 
			
		||||
    assert !blob("JavaScript/jquery-1.6.1.min.js").safe_to_colorize?
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def test_empty
 | 
			
		||||
    blob = Struct.new(:data) { include Linguist::BlobHelper }
 | 
			
		||||
 | 
			
		||||
    assert blob.new("").empty?
 | 
			
		||||
    assert blob.new(nil).empty?
 | 
			
		||||
    refute blob.new(" ").empty?
 | 
			
		||||
    refute blob.new("nope").empty?
 | 
			
		||||
  end
 | 
			
		||||
end
 | 
			
		||||
 
 | 
			
		||||
@@ -20,18 +20,18 @@ class TestHeuristcs < Test::Unit::TestCase
 | 
			
		||||
    Dir.glob("#{samples_path}/#{language_name}/#{file}")
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["C++", "Objective-C"]
 | 
			
		||||
  def test_obj_c_by_heuristics
 | 
			
		||||
    languages = ["C++", "Objective-C"]
 | 
			
		||||
    # Only calling out '.h' filenames as these are the ones causing issues
 | 
			
		||||
    all_fixtures("Objective-C", "*.h").each do |fixture|
 | 
			
		||||
      results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"), languages)
 | 
			
		||||
      results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"))
 | 
			
		||||
      assert_equal Language["Objective-C"], results.first
 | 
			
		||||
    end
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["C++", "Objective-C"]
 | 
			
		||||
  def test_cpp_by_heuristics
 | 
			
		||||
    languages = ["C++", "Objective-C"]
 | 
			
		||||
    results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"))
 | 
			
		||||
    assert_equal Language["C++"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
@@ -41,57 +41,57 @@ class TestHeuristcs < Test::Unit::TestCase
 | 
			
		||||
    assert_equal Language["Objective-C"], match
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["Perl", "Prolog"]
 | 
			
		||||
  def test_pl_prolog_by_heuristics
 | 
			
		||||
    languages = ["Perl", "Prolog"]
 | 
			
		||||
    results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"))
 | 
			
		||||
    assert_equal Language["Prolog"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["Perl", "Prolog"]
 | 
			
		||||
  def test_pl_perl_by_heuristics
 | 
			
		||||
    languages = ["Perl", "Prolog"]
 | 
			
		||||
    results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"))
 | 
			
		||||
    assert_equal Language["Perl"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["ECL", "Prolog"]
 | 
			
		||||
  def test_ecl_prolog_by_heuristics
 | 
			
		||||
    languages = ["ECL", "Prolog"]
 | 
			
		||||
    results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"))
 | 
			
		||||
    assert_equal Language["Prolog"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["ECL", "Prolog"]
 | 
			
		||||
  def test_ecl_ecl_by_heuristics
 | 
			
		||||
    languages = ["ECL", "Prolog"]
 | 
			
		||||
    results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"))
 | 
			
		||||
    assert_equal Language["ECL"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["IDL", "Prolog"]
 | 
			
		||||
  def test_pro_prolog_by_heuristics
 | 
			
		||||
    languages = ["IDL", "Prolog"]
 | 
			
		||||
    results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"))
 | 
			
		||||
    assert_equal Language["Prolog"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["IDL", "Prolog"]
 | 
			
		||||
  def test_pro_idl_by_heuristics
 | 
			
		||||
    languages = ["IDL", "Prolog"]
 | 
			
		||||
    results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"))
 | 
			
		||||
    assert_equal Language["IDL"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["AGS Script", "AsciiDoc"]
 | 
			
		||||
  def test_asc_asciidoc_by_heuristics
 | 
			
		||||
    languages = ["AGS Script", "AsciiDoc"]
 | 
			
		||||
    results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"))
 | 
			
		||||
    assert_equal Language["AsciiDoc"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["TypeScript", "XML"]
 | 
			
		||||
  def test_ts_typescript_by_heuristics
 | 
			
		||||
    languages = ["TypeScript", "XML"]
 | 
			
		||||
    results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"))
 | 
			
		||||
    assert_equal Language["TypeScript"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["TypeScript", "XML"]
 | 
			
		||||
  def test_ts_xml_by_heuristics
 | 
			
		||||
    languages = ["TypeScript", "XML"]
 | 
			
		||||
    results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"))
 | 
			
		||||
    assert_equal Language["XML"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
@@ -99,27 +99,27 @@ class TestHeuristcs < Test::Unit::TestCase
 | 
			
		||||
    languages = ["Common Lisp", "OpenCL"]
 | 
			
		||||
    languages.each do |language|
 | 
			
		||||
      all_fixtures(language).each do |fixture|
 | 
			
		||||
        results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"), languages)
 | 
			
		||||
        results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"))
 | 
			
		||||
        assert_equal Language[language], results.first
 | 
			
		||||
      end
 | 
			
		||||
    end
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["Hack", "PHP"]
 | 
			
		||||
  def test_hack_by_heuristics
 | 
			
		||||
    languages = ["Hack", "PHP"]
 | 
			
		||||
    results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"))
 | 
			
		||||
    assert_equal Language["Hack"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["Scala", "SuperCollider"]
 | 
			
		||||
  def test_sc_supercollider_by_heuristics
 | 
			
		||||
    languages = ["Scala", "SuperCollider"]
 | 
			
		||||
    results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"))
 | 
			
		||||
    assert_equal Language["SuperCollider"], results.first
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Candidate languages = ["Scala", "SuperCollider"]
 | 
			
		||||
  def test_sc_scala_by_heuristics
 | 
			
		||||
    languages = ["Scala", "SuperCollider"]
 | 
			
		||||
    results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"), languages)
 | 
			
		||||
    results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"))
 | 
			
		||||
    assert_equal Language["Scala"], results.first
 | 
			
		||||
  end
 | 
			
		||||
end
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user