mirror of
https://github.com/KevinMidboe/linguist.git
synced 2026-01-28 12:16:14 +00:00
Merge pull request #1663 from github/strategies
Refactor detection into strategies
This commit is contained in:
@@ -3,6 +3,25 @@ require 'linguist/tokenizer'
|
|||||||
module Linguist
|
module Linguist
|
||||||
# Language bayesian classifier.
|
# Language bayesian classifier.
|
||||||
class Classifier
|
class Classifier
|
||||||
|
# Public: Use the classifier to detect language of the blob.
|
||||||
|
#
|
||||||
|
# blob - An object that quacks like a blob.
|
||||||
|
# possible_languages - Array of Language objects
|
||||||
|
#
|
||||||
|
# Examples
|
||||||
|
#
|
||||||
|
# Classifier.call(FileBlob.new("path/to/file"), [
|
||||||
|
# Language["Ruby"], Language["Python"]
|
||||||
|
# ])
|
||||||
|
#
|
||||||
|
# Returns an Array of Language objects, most probable first.
|
||||||
|
def self.call(blob, possible_languages)
  language_names = possible_languages.map(&:name)

  # classify yields pairs whose first element is the language name; swap
  # each pair for the actual Language object, keeping classify's ordering.
  classify(Samples.cache, blob.data, language_names).map do |name, _|
    Language[name]
  end
end
|
||||||
|
|
||||||
# Public: Train classifier that data is a certain language.
|
# Public: Train classifier that data is a certain language.
|
||||||
#
|
#
|
||||||
# db - Hash classifier database object
|
# db - Hash classifier database object
|
||||||
|
|||||||
@@ -3,6 +3,23 @@ module Linguist
|
|||||||
class Heuristics
|
class Heuristics
|
||||||
ACTIVE = true
|
ACTIVE = true
|
||||||
|
|
||||||
|
# Public: Use heuristics to detect language of the blob.
|
||||||
|
#
|
||||||
|
# blob - An object that quacks like a blob.
|
||||||
|
# possible_languages - Array of Language objects
|
||||||
|
#
|
||||||
|
# Examples
|
||||||
|
#
|
||||||
|
# Heuristics.call(FileBlob.new("path/to/file"), [
|
||||||
|
# Language["Ruby"], Language["Python"]
|
||||||
|
# ])
|
||||||
|
#
|
||||||
|
# Returns an Array with one Language if a heuristic matched, or empty if
|
||||||
|
# none matched or were inconclusive.
|
||||||
|
def self.call(blob, languages)
  # find_by_heuristics is documented as returning an array of matching
  # languages "or nil". This strategy promises an Array (empty when nothing
  # matched), so normalise a nil result instead of leaking it to the caller.
  find_by_heuristics(blob.data, languages.map(&:name)) || []
end
|
||||||
|
|
||||||
# Public: Given an array of String language names,
|
# Public: Given an array of String language names,
|
||||||
# apply heuristics against the given data and return an array
|
# apply heuristics against the given data and return an array
|
||||||
# of matching languages, or nil.
|
# of matching languages, or nil.
|
||||||
|
|||||||
@@ -10,6 +10,8 @@ require 'linguist/heuristics'
|
|||||||
require 'linguist/samples'
|
require 'linguist/samples'
|
||||||
require 'linguist/file_blob'
|
require 'linguist/file_blob'
|
||||||
require 'linguist/blob_helper'
|
require 'linguist/blob_helper'
|
||||||
|
require 'linguist/strategy/filename'
|
||||||
|
require 'linguist/strategy/shebang'
|
||||||
|
|
||||||
module Linguist
|
module Linguist
|
||||||
# Language names that are recognizable by GitHub. Defined languages
|
# Language names that are recognizable by GitHub. Defined languages
|
||||||
@@ -91,6 +93,13 @@ module Linguist
|
|||||||
language
|
language
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Detection strategies, in the order `detect` tries them. Each entry
# responds to `call(blob, languages)`.
STRATEGIES = [
  Linguist::Strategy::Filename,
  Linguist::Strategy::Shebang,
  Linguist::Heuristics,
  Linguist::Classifier
]
|
||||||
|
|
||||||
# Public: Detects the Language of the blob.
|
# Public: Detects the Language of the blob.
|
||||||
#
|
#
|
||||||
# blob - an object that includes the Linguist `BlobHelper` interface;
|
# blob - an object that includes the Linguist `BlobHelper` interface;
|
||||||
@@ -98,61 +107,22 @@ module Linguist
|
|||||||
#
|
#
|
||||||
# Returns Language or nil.
|
# Returns Language or nil.
|
||||||
def self.detect(blob)
|
def self.detect(blob)
|
||||||
name = blob.name.to_s
|
|
||||||
|
|
||||||
# Bail early if the blob is binary or empty.
|
# Bail early if the blob is binary or empty.
|
||||||
return nil if blob.likely_binary? || blob.binary? || blob.empty?
|
return nil if blob.likely_binary? || blob.binary? || blob.empty?
|
||||||
|
|
||||||
# A bit of an elegant hack. If the file is executable but extensionless,
|
# Call each strategy until one candidate is returned.
|
||||||
# append a "magic" extension so it can be classified with other
|
STRATEGIES.reduce([]) do |languages, strategy|
|
||||||
# languages that have shebang scripts.
|
candidates = strategy.call(blob, languages)
|
||||||
extensions = FileBlob.new(name).extensions
|
if candidates.size == 1
|
||||||
if extensions.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
|
return candidates.first
|
||||||
name += ".script!"
|
elsif candidates.size > 1
|
||||||
end
|
# More than one candidate was found, pass them to the next strategy.
|
||||||
|
candidates
|
||||||
# Find languages that match based on filename.
|
else
|
||||||
possible_languages = find_by_filename(name)
|
# No candidates were found, pass on languages from the previous strategy.
|
||||||
|
languages
|
||||||
if possible_languages.length == 1
|
|
||||||
# Simplest and most common case, we can just return the one match based
|
|
||||||
# on extension
|
|
||||||
possible_languages.first
|
|
||||||
|
|
||||||
# If there is more than one possible language with that extension (or no
|
|
||||||
# extension at all, in the case of extensionless scripts), we need to
|
|
||||||
# continue our detection work
|
|
||||||
else
|
|
||||||
# Matches possible_languages.length == 0 || possible_languages.length > 1
|
|
||||||
data = blob.data
|
|
||||||
|
|
||||||
# Check if there's a shebang line and use that as authoritative
|
|
||||||
if (result = find_by_shebang(data)) && !result.empty?
|
|
||||||
return result.first
|
|
||||||
|
|
||||||
# More than one language with that extension. We need to make a choice.
|
|
||||||
elsif possible_languages.length > 1
|
|
||||||
|
|
||||||
# First try heuristics
|
|
||||||
|
|
||||||
possible_language_names = possible_languages.map(&:name)
|
|
||||||
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
|
|
||||||
|
|
||||||
# If there are multiple possible languages returned from heuristics
|
|
||||||
# then reduce language candidates for Bayesian classifier here.
|
|
||||||
if heuristic_languages.size > 1
|
|
||||||
possible_language_names = heuristic_languages.map(&:name)
|
|
||||||
end
|
|
||||||
|
|
||||||
if heuristic_languages.size == 1
|
|
||||||
return heuristic_languages.first
|
|
||||||
# Lastly, fall back to the probabilistic classifier.
|
|
||||||
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
|
|
||||||
# Return the actual Language object based of the string language name (i.e., first element of `#classify`)
|
|
||||||
return Language[classified[0]]
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end.first
|
||||||
end
|
end
|
||||||
|
|
||||||
# Public: Get all Languages
|
# Public: Get all Languages
|
||||||
|
|||||||
20
lib/linguist/strategy/filename.rb
Normal file
20
lib/linguist/strategy/filename.rb
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
module Linguist
  module Strategy
    # Detects language based on filename and/or extension.
    class Filename
      # Public: Use the blob's filename to look up candidate languages.
      #
      # blob - An object that responds to `name` and `mode`.
      # _    - Candidates from a previous strategy; ignored here.
      #
      # Returns the result of Language.find_by_filename for the blob's name
      # (with the magic ".script!" suffix appended when applicable).
      def self.call(blob, _)
        name = blob.name.to_s

        # A bit of an elegant hack. If the file is executable but extensionless,
        # append a "magic" extension so it can be classified with other
        # languages that have shebang scripts.
        #
        # The mode is parsed as an octal string; the file counts as executable
        # when both bits of the 05 mask are set.
        extensions = FileBlob.new(name).extensions
        if extensions.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
          name += ".script!"
        end

        Language.find_by_filename(name)
      end
    end
  end
end
|
||||||
10
lib/linguist/strategy/shebang.rb
Normal file
10
lib/linguist/strategy/shebang.rb
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
module Linguist
  module Strategy
    # Check if there's a shebang line and use that as authoritative.
    class Shebang
      # Public: Use the blob's contents to look up languages by shebang.
      #
      # blob - An object that responds to `data`.
      # _    - Candidates from a previous strategy; ignored here.
      #
      # Returns the result of Language.find_by_shebang for the blob's data.
      def self.call(blob, _)
        Language.find_by_shebang(blob.data)
      end
    end
  end
end
|
||||||
Reference in New Issue
Block a user