Start on basic heuristic approach

2025-10-29 17:50:22 +00:00 · 2013-12-14 23:01:04 -08:00
parent 6bd97c7fc7
commit 3bc17e822d
3 changed files with 58 additions and 1 deletions
--- a/lib/linguist.rb
+++ b/lib/linguist.rb
@@ -1,5 +1,6 @@
 require 'linguist/blob_helper'
 require 'linguist/generated'
+require 'linguist/heuristics'
 require 'linguist/language'
 require 'linguist/repository'
 require 'linguist/samples'
--- a/lib/linguist/heuristics.rb
+++ b/lib/linguist/heuristics.rb
@@ -0,0 +1,43 @@
+require 'linguist/tokenizer'
+
+module Linguist
+  # A collection of simple heuristics that can be used to better analyze languages.
+  class Heuristics
+    # Public: Given an array of String language names, apply all heuristics
+    # against the given data and return the languages that matched.
+    #
+    # data      - Array of tokens or String data to analyze.
+    # languages - Array of language name Strings (matched case-insensitively).
+    #
+    # Returns an Array of matches (possibly empty) as looked up via Language[],
+    # or nil when no heuristic applies to this set of languages.
+    def self.find_by_heuristics(data, languages)
+      names = languages.map(&:downcase)
+      if names.all? { |l| ["pod", "perl"].include?(l) }
+        disambiguate_pod(data, languages)
+      elsif names.all? { |l| ["objective-c", "c++"].include?(l) }
+        disambiguate_h(data, languages)
+      end
+    end
+
+    # .pod extensions are ambiguous between perl and pod.
+    #
+    # Returns an array of still-possible languages (empty if nothing matched).
+    def self.disambiguate_pod(data, languages)
+      matches = []
+      matches << Language["Perl"] if data.include?("my $")
+      matches
+    end
+
+    # .h extensions are ambiguous between C, C++, and Objective-C.
+    # We want to look for Objective-C.
+    #
+    # Returns an array of still-possible languages (empty if nothing matched).
+    def self.disambiguate_h(data, languages)
+      matches = []
+      matches << Language["Objective-C"] if data.include?("NSData *") && data.include?("@interface")
+      matches
+    end
+  end
+end
+
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -7,6 +7,7 @@ rescue LoadError
 end

 require 'linguist/classifier'
+require 'linguist/heuristics'
 require 'linguist/samples'

 module Linguist
@@ -113,20 +114,32 @@ module Linguist
        name += ".script!"
      end

+      # First try to find languages that match based on filename.
      possible_languages = find_by_filename(name)

+      # If there is more than one possible language with that extension (or no
+      # extension at all, in the case of extensionless scripts), we need to continue
+      # our detection work
      if possible_languages.length > 1
        data = data.call() if data.respond_to?(:call)
+        possible_language_names = possible_languages.map(&:name)

+        # Don't bother with emptiness
        if data.nil? || data == ""
          nil
+        # Check if there's a shebang line and use that as authoritative
        elsif (result = find_by_shebang(data)) && !result.empty?
          result.first
-        elsif classified = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
+        # No shebang. Still more work to do. Try to find it with our heuristics.
+        elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
+          determined.first
+        # Lastly, fall back to the probabilistic classifier.
+        elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names ).first
          # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
          Language[classified[0]]
        end
      else
+        # Simplest and most common case, we can just return the one match based on extension
        possible_languages.first
      end
    end