Merge pull request #1538 from github/1233-local

Detection based on the shebang (updated)
2025-10-29 17:50:22 +00:00 · 2014-11-26 14:47:12 -06:00
parent 7ccd8caf71 208a3ff480
commit 412af86cb8
5 changed files with 1580 additions and 22 deletions
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -111,35 +111,47 @@ module Linguist
        name += ".script!"
      end

-      # First try to find languages that match based on filename.
+      # Find languages that match based on filename.
      possible_languages = find_by_filename(name)

-      # If there is more than one possible language with that extension (or no
-      # extension at all, in the case of extensionless scripts), we need to continue
-      # our detection work
-      if possible_languages.length > 1
-        data = blob.data
-        possible_language_names = possible_languages.map(&:name)
-        heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
+      if possible_languages.length == 1
+        # Simplest and most common case, we can just return the one match based
+        # on extension
+        possible_languages.first

-        if heuristic_languages.size > 1
-          possible_language_names = heuristic_languages.map(&:name)
-        end
+      # If there is more than one possible language with that extension (or no
+      # extension at all, in the case of extensionless scripts), we need to
+      # continue our detection work
+      else
+        # Matches possible_languages.length == 0 || possible_languages.length > 1
+        data = blob.data

        # Check if there's a shebang line and use that as authoritative
        if (result = find_by_shebang(data)) && !result.empty?
-          result.first
-        # No shebang. Still more work to do. Try to find it with our heuristics.
-        elsif heuristic_languages.size == 1
-          heuristic_languages.first
-        # Lastly, fall back to the probabilistic classifier.
-        elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
-          # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
-          Language[classified[0]]
+          return result.first
+
+        # More than one language with that extension. We need to make a choice.
+        elsif possible_languages.length > 1
+
+          # First try heuristics
+
+          possible_language_names = possible_languages.map(&:name)
+          heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
+
+          # If there are multiple possible languages returned from heuristics
+          # then reduce language candidates for Bayesian classifier here.
+          if heuristic_languages.size > 1
+            possible_language_names = heuristic_languages.map(&:name)
+          end
+
+          if heuristic_languages.size == 1
+            return heuristic_languages.first
+          # Lastly, fall back to the probabilistic classifier.
+          elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
+            # Return the actual Language object based on the string language name (i.e., first element of `#classify`)
+            return Language[classified[0]]
+          end
        end
-      else
-        # Simplest and most common case, we can just return the one match based on extension
-        possible_languages.first
      end
    end

--- a/samples/PHP/drupal.script!
+++ b/samples/PHP/drupal.script!
--- a/test/fixtures/Python/run_tests.module
+++ b/test/fixtures/Python/run_tests.module
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+import sys, os
+
+# Set the current working directory to the directory where this script is located
+os.chdir(os.path.abspath(os.path.dirname(sys.argv[0])))
+
+#### Set the name of the application here and moose directory relative to the application
+app_name = 'stork'
+
+MODULE_DIR = os.path.abspath('..')
+MOOSE_DIR = os.path.abspath(os.path.join(MODULE_DIR, '..'))
+#### See if MOOSE_DIR is already in the environment instead
+if os.environ.has_key("MOOSE_DIR"):
+  MOOSE_DIR = os.environ['MOOSE_DIR']
+
+sys.path.append(os.path.join(MOOSE_DIR, 'python'))
+import path_tool
+path_tool.activate_module('TestHarness')
+
+from TestHarness import TestHarness
+# Run the tests!
+TestHarness.buildAndRun(sys.argv, app_name, MOOSE_DIR)
--- a/test/fixtures/Shell/mintleaf.module
+++ b/test/fixtures/Shell/mintleaf.module
--- a/test/test_blob.rb
+++ b/test/test_blob.rb
@@ -465,6 +465,25 @@ class TestBlob < Test::Unit::TestCase
      assert blob.language, "No language for #{sample[:path]}"
      assert_equal sample[:language], blob.language.name, blob.name
    end
+
+    # Test language detection for files which shouldn't be used as samples
+    root = File.expand_path('../fixtures', __FILE__)
+    Dir.entries(root).each do |language|
+      next unless File.file?(language)
+
+      # Each directory contains test files of a language
+      dirname = File.join(root, language)
+      Dir.entries(dirname).each do |filename|
+        next unless File.file?(filename)
+        
+        # By default, blob searches for the file in the samples;
+        # thus, we need to give it the absolute path
+        filepath = File.join(dirname, filename)
+        blob = blob(filepath)
+        assert blob.language, "No language for #{filepath}"
+        assert_equal language, blob.language.name, blob.name
+      end
+    end
  end

  def test_minified_files_not_safe_to_highlight