Merge pull request #1538 from github/1233-local

Detection based on the shebang (updated)
2026-01-11 03:45:31 +00:00 · 2014-11-26 14:47:12 -06:00
parent 7ccd8caf71 208a3ff480
commit 412af86cb8
5 changed files with 1580 additions and 22 deletions
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -111,35 +111,47 @@ module Linguist
        name += ".script!"
      end

-      # First try to find languages that match based on filename.
+      # Find languages that match based on filename.
      possible_languages = find_by_filename(name)

-      # If there is more than one possible language with that extension (or no
-      # extension at all, in the case of extensionless scripts), we need to continue
-      # our detection work
-      if possible_languages.length > 1
-        data = blob.data
-        possible_language_names = possible_languages.map(&:name)
-        heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
+      if possible_languages.length == 1
+        # Simplest and most common case, we can just return the one match based
+        # on extension
+        possible_languages.first

-        if heuristic_languages.size > 1
-          possible_language_names = heuristic_languages.map(&:name)
-        end
+      # If there is more than one possible language with that extension (or no
+      # extension at all, in the case of extensionless scripts), we need to
+      # continue our detection work
+      else
+        # Matches possible_languages.length == 0 || possible_languages.length > 1
+        data = blob.data

        # Check if there's a shebang line and use that as authoritative
        if (result = find_by_shebang(data)) && !result.empty?
-          result.first
-        # No shebang. Still more work to do. Try to find it with our heuristics.
-        elsif heuristic_languages.size == 1
-          heuristic_languages.first
-        # Lastly, fall back to the probabilistic classifier.
-        elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
-          # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
-          Language[classified[0]]
+          return result.first
+
+        # More than one language with that extension. We need to make a choice.
+        elsif possible_languages.length > 1
+
+          # First try heuristics
+
+          possible_language_names = possible_languages.map(&:name)
+          heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
+
+          # If there are multiple possible languages returned from heuristics
+          # then reduce language candidates for Bayesian classifier here.
+          if heuristic_languages.size > 1
+            possible_language_names = heuristic_languages.map(&:name)
+          end
+
+          if heuristic_languages.size == 1
+            return heuristic_languages.first
+          # Lastly, fall back to the probabilistic classifier.
+          elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
+            # Return the actual Language object based on the string language name (i.e., first element of `#classify`)
+            return Language[classified[0]]
+          end
        end
-      else
-        # Simplest and most common case, we can just return the one match based on extension
-        possible_languages.first
      end
    end