mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 17:50:22 +00:00
Merge pull request #1538 from github/1233-local
Detection based on the shebang (updated)
This commit is contained in:
@@ -111,35 +111,47 @@ module Linguist
|
||||
name += ".script!"
|
||||
end
|
||||
|
||||
# First try to find languages that match based on filename.
|
||||
# Find languages that match based on filename.
|
||||
possible_languages = find_by_filename(name)
|
||||
|
||||
# If there is more than one possible language with that extension (or no
|
||||
# extension at all, in the case of extensionless scripts), we need to continue
|
||||
# our detection work
|
||||
if possible_languages.length > 1
|
||||
data = blob.data
|
||||
possible_language_names = possible_languages.map(&:name)
|
||||
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
|
||||
if possible_languages.length == 1
|
||||
# Simplest and most common case, we can just return the one match based
|
||||
# on extension
|
||||
possible_languages.first
|
||||
|
||||
if heuristic_languages.size > 1
|
||||
possible_language_names = heuristic_languages.map(&:name)
|
||||
end
|
||||
# If there is more than one possible language with that extension (or no
|
||||
# extension at all, in the case of extensionless scripts), we need to
|
||||
# continue our detection work
|
||||
else
|
||||
# Matches possible_languages.length == 0 || possible_languages.length > 1
|
||||
data = blob.data
|
||||
|
||||
# Check if there's a shebang line and use that as authoritative
|
||||
if (result = find_by_shebang(data)) && !result.empty?
|
||||
result.first
|
||||
# No shebang. Still more work to do. Try to find it with our heuristics.
|
||||
elsif heuristic_languages.size == 1
|
||||
heuristic_languages.first
|
||||
# Lastly, fall back to the probabilistic classifier.
|
||||
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
||||
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
|
||||
Language[classified[0]]
|
||||
return result.first
|
||||
|
||||
# More than one language with that extension. We need to make a choice.
|
||||
elsif possible_languages.length > 1
|
||||
|
||||
# First try heuristics
|
||||
|
||||
possible_language_names = possible_languages.map(&:name)
|
||||
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
|
||||
|
||||
# If there are multiple possible languages returned from heuristics
|
||||
# then reduce language candidates for Bayesian classifier here.
|
||||
if heuristic_languages.size > 1
|
||||
possible_language_names = heuristic_languages.map(&:name)
|
||||
end
|
||||
|
||||
if heuristic_languages.size == 1
|
||||
return heuristic_languages.first
|
||||
# Lastly, fall back to the probabilistic classifier.
|
||||
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
||||
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
|
||||
return Language[classified[0]]
|
||||
end
|
||||
end
|
||||
else
|
||||
# Simplest and most common case, we can just return the one match based on extension
|
||||
possible_languages.first
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
Reference in New Issue
Block a user