Merge remote-tracking branch 'origin/master' into interpreters-in-samples

* origin/master:
  byebug requires ruby 2.0
  Remove test for removed extension
  Merge branch 'master' into 1233-local
  Removing pry runtime dependency
  Moving to fixtures
  Language detection test for non-sample files
  Refactoring of Language.detect
  Try shebang detection if the extension is unknown
  Change unknown extension of PHP sample file
This commit is contained in:
Brandon Keepers
2014-11-26 16:25:15 -05:00
7 changed files with 1581 additions and 24 deletions

View File

@@ -111,35 +111,47 @@ module Linguist
name += ".script!"
end
# First try to find languages that match based on filename.
# Find languages that match based on filename.
possible_languages = find_by_filename(name)
# If there is more than one possible language with that extension (or no
# extension at all, in the case of extensionless scripts), we need to continue
# our detection work
if possible_languages.length > 1
data = blob.data
possible_language_names = possible_languages.map(&:name)
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
if possible_languages.length == 1
# Simplest and most common case, we can just return the one match based
# on extension
possible_languages.first
if heuristic_languages.size > 1
possible_language_names = heuristic_languages.map(&:name)
end
# If there is more than one possible language with that extension (or no
# extension at all, in the case of extensionless scripts), we need to
# continue our detection work
else
# Matches possible_languages.length == 0 || possible_languages.length > 1
data = blob.data
# Check if there's a shebang line and use that as authoritative
if (result = find_by_shebang(data)) && !result.empty?
result.first
# No shebang. Still more work to do. Try to find it with our heuristics.
elsif heuristic_languages.size == 1
heuristic_languages.first
# Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based on the string language name (i.e., the first element of the first `#classify` result)
Language[classified[0]]
return result.first
# More than one language with that extension. We need to make a choice.
elsif possible_languages.length > 1
# First try heuristics
possible_language_names = possible_languages.map(&:name)
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
# If there are multiple possible languages returned from heuristics
# then reduce language candidates for Bayesian classifier here.
if heuristic_languages.size > 1
possible_language_names = heuristic_languages.map(&:name)
end
if heuristic_languages.size == 1
return heuristic_languages.first
# Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based on the string language name (i.e., the first element of the first `#classify` result)
return Language[classified[0]]
end
end
else
# Simplest and most common case, we can just return the one match based on extension
possible_languages.first
end
end