mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-12-29 13:21:01 +00:00
Extract strategies for detecting the language
This commit is contained in:
@@ -3,6 +3,10 @@ module Linguist
|
|||||||
class Heuristics
|
class Heuristics
|
||||||
ACTIVE = true
|
ACTIVE = true
|
||||||
|
|
||||||
|
# Public: Strategy entry point so Heuristics can be used as a step in the
# language detection pipeline.
#
# blob      - An object responding to `data` with the file contents.
# languages - Array of candidate languages from earlier strategies; each
#             must respond to `name`.
#
# Returns the Array produced by find_by_heuristics, or nil when no heuristic
# matched (nil or empty result), so the caller keeps its previous candidates
# instead of treating "no match" as a final answer.
def self.call(blob, languages)
  # find_by_heuristics works on String language names, not Language objects.
  matches = find_by_heuristics(blob.data, languages.map(&:name))
  matches unless matches.nil? || matches.empty?
end
|
||||||
|
|
||||||
# Public: Given an array of String language names,
|
# Public: Given an array of String language names,
|
||||||
# apply heuristics against the given data and return an array
|
# apply heuristics against the given data and return an array
|
||||||
# of matching languages, or nil.
|
# of matching languages, or nil.
|
||||||
|
|||||||
@@ -93,6 +93,17 @@ module Linguist
|
|||||||
end
|
end
|
||||||
|
|
||||||
require 'linguist/strategy/filename'
|
require 'linguist/strategy/filename'
|
||||||
|
require 'linguist/strategy/shebang'
|
||||||
|
require 'linguist/strategy/classifier'
|
||||||
|
# Ordered list of detection strategies. Each entry responds to
# `call(blob, languages)`; `detect` tries them in sequence, handing each one
# the candidates produced so far.
STRATEGIES = [
  Linguist::Strategy::Filename,
  # Don't bother with binary contents or an empty file. Returning an empty
  # array (rather than nil) ends detection with no language.
  lambda { |blob, _languages| [] if blob.data.nil? || blob.data == "" },
  # Check if there's a shebang line and use that as authoritative
  Linguist::Strategy::Shebang,
  Linguist::Heuristics,
  Linguist::Strategy::Classifier
]
|
||||||
|
|
||||||
# Public: Detects the Language of the blob.
|
# Public: Detects the Language of the blob.
|
||||||
#
|
#
|
||||||
@@ -101,40 +112,20 @@ module Linguist
|
|||||||
#
|
#
|
||||||
# Returns Language or nil.
|
# Returns Language or nil.
|
||||||
def self.detect(blob)
|
def self.detect(blob)
|
||||||
# Check if the blob is possibly binary and bail early; this is a cheap
|
# Check if the blob is possibly binary and bail early.
|
||||||
# test that uses the extension name to guess a binary binary mime type.
|
|
||||||
#
|
|
||||||
# We'll perform a more comprehensive test later which actually involves
|
|
||||||
# looking for binary characters in the blob
|
|
||||||
return nil if blob.likely_binary? || blob.binary?
|
return nil if blob.likely_binary? || blob.binary?
|
||||||
|
|
||||||
possible_languages = Linguist::Strategy::Filename.new.call(blob)
|
STRATEGIES.reduce([]) do |languages, strategy|
|
||||||
|
if candidates = strategy.call(blob, languages)
|
||||||
# If there is more than one possible language with that extension (or no
|
if candidates.size > 1
|
||||||
# extension at all, in the case of extensionless scripts), we need to continue
|
candidates
|
||||||
# our detection work
|
else
|
||||||
if possible_languages.length > 1
|
break candidates
|
||||||
data = blob.data
|
end
|
||||||
possible_language_names = possible_languages.map(&:name)
|
else
|
||||||
|
languages
|
||||||
# Don't bother with binary contents or an empty file
|
|
||||||
if data.nil? || data == ""
|
|
||||||
nil
|
|
||||||
# Check if there's a shebang line and use that as authoritative
|
|
||||||
elsif (result = find_by_shebang(data)) && !result.empty?
|
|
||||||
result.first
|
|
||||||
# No shebang. Still more work to do. Try to find it with our heuristics.
|
|
||||||
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
|
|
||||||
determined.first
|
|
||||||
# Lastly, fall back to the probabilistic classifier.
|
|
||||||
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
|
||||||
# Return the actual Language object based of the string language name (i.e., first element of `#classify`)
|
|
||||||
Language[classified[0]]
|
|
||||||
end
|
end
|
||||||
else
|
end.first
|
||||||
# Simplest and most common case, we can just return the one match based on extension
|
|
||||||
possible_languages.first
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Public: Get all Languages
|
# Public: Get all Languages
|
||||||
|
|||||||
12
lib/linguist/strategy/classifier.rb
Normal file
12
lib/linguist/strategy/classifier.rb
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
module Linguist
  module Strategy
    # Detection strategy that ranks the remaining candidates with the
    # probabilistic classifier.
    class Classifier
      # Public: Classify the blob's contents among the candidate languages.
      #
      # blob      - An object responding to `data` with the file contents.
      # languages - Array of candidate languages; each must respond to `name`.
      #
      # Returns an Array of Language objects, in the order returned by
      # Linguist::Classifier.classify.
      def self.call(blob, languages)
        # The classifier is keyed by String language names.
        candidate_names = languages.map(&:name)
        Linguist::Classifier.classify(Samples.cache, blob.data, candidate_names).map do |name, _|
          # The first element of each `#classify` result is the language name;
          # return the actual Language object for it.
          Language[name]
        end
      end
    end
  end
end
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
module Linguist
|
module Linguist
|
||||||
module Strategy
|
module Strategy
|
||||||
class Filename
|
class Filename
|
||||||
def call(blob)
|
def self.call(blob, _)
|
||||||
name = blob.name.to_s
|
name = blob.name.to_s
|
||||||
|
|
||||||
# A bit of an elegant hack. If the file is executable but extensionless,
|
# A bit of an elegant hack. If the file is executable but extensionless,
|
||||||
@@ -13,7 +13,7 @@ module Linguist
|
|||||||
end
|
end
|
||||||
|
|
||||||
# First try to find languages that match based on filename.
|
# First try to find languages that match based on filename.
|
||||||
possible_languages = Language.find_by_filename(name)
|
Language.find_by_filename(name)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
9
lib/linguist/strategy/shebang.rb
Normal file
9
lib/linguist/strategy/shebang.rb
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
module Linguist
  module Strategy
    # Detection strategy that looks up languages from the blob's shebang line.
    class Shebang
      # Public: Find languages matching the shebang in the blob's contents.
      #
      # blob - An object responding to `data` with the file contents.
      # _    - Candidates from earlier strategies (unused).
      #
      # Returns the result of Language.find_by_shebang, or nil when it found
      # nothing (nil or empty), so a missing shebang does not end detection
      # and the caller can fall through to the next strategy.
      def self.call(blob, _)
        found = Language.find_by_shebang(blob.data)
        found unless found.nil? || found.empty?
      end
    end
  end
end
|
||||||
Reference in New Issue
Block a user