diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb
index 5370bdd8..89a0df2f 100644
--- a/lib/linguist/classifier.rb
+++ b/lib/linguist/classifier.rb
@@ -3,6 +3,25 @@ require 'linguist/tokenizer'
 module Linguist
   # Language bayesian classifier.
   class Classifier
+    # Public: Use the classifier to detect language of the blob.
+    #
+    # blob               - An object that quacks like a blob.
+    # possible_languages - Array of Language objects
+    #
+    # Examples
+    #
+    #   Classifier.call(FileBlob.new("path/to/file"), [
+    #     Language["Ruby"], Language["Python"]
+    #   ])
+    #
+    # Returns an Array of Language objects, most probable first.
+    def self.call(blob, possible_languages)
+      language_names = possible_languages.map(&:name)
+      classify(Samples.cache, blob.data, language_names).map do |name, _|
+        Language[name] # Return the actual Language objects
+      end
+    end
+
     # Public: Train classifier that data is a certain language.
     #
     # db - Hash classifier database object
diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb
index d2aca7fe..0930305c 100644
--- a/lib/linguist/heuristics.rb
+++ b/lib/linguist/heuristics.rb
@@ -3,6 +3,23 @@ module Linguist
   class Heuristics
     ACTIVE = true
 
+    # Public: Use heuristics to detect language of the blob.
+    #
+    # blob      - An object that quacks like a blob.
+    # languages - Array of candidate Language objects
+    #
+    # Examples
+    #
+    #   Heuristics.call(FileBlob.new("path/to/file"), [
+    #     Language["Ruby"], Language["Python"]
+    #   ])
+    #
+    # Returns an Array of matching Language objects, or an empty Array if no
+    # heuristic matched, the result was inconclusive, or the lookup gave nil.
+    def self.call(blob, languages)
+      Array(find_by_heuristics(blob.data, languages.map(&:name)))
+    end
+
     # Public: Given an array of String language names,
     # apply heuristics against the given data and return an array
     # of matching languages, or nil.
diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb
index daa1d003..2353edd1 100644
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -10,6 +10,8 @@ require 'linguist/heuristics'
 require 'linguist/samples'
 require 'linguist/file_blob'
 require 'linguist/blob_helper'
+require 'linguist/strategy/filename'
+require 'linguist/strategy/shebang'
 
 module Linguist
   # Language names that are recognizable by GitHub. Defined languages
@@ -91,6 +93,14 @@ module Linguist
       language
     end
 
+    # Detection strategies, tried in this order by Language.detect.
+    STRATEGIES = [
+      Linguist::Strategy::Filename,
+      Linguist::Strategy::Shebang,
+      Linguist::Heuristics,
+      Linguist::Classifier
+    ].freeze
+
     # Public: Detects the Language of the blob.
     #
     # blob - an object that includes the Linguist `BlobHelper` interface;
@@ -98,61 +108,23 @@ module Linguist
     #
     # Returns Language or nil.
     def self.detect(blob)
-      name = blob.name.to_s
-
       # Bail early if the blob is binary or empty.
       return nil if blob.likely_binary? || blob.binary? || blob.empty?
 
-      # A bit of an elegant hack. If the file is executable but extensionless,
-      # append a "magic" extension so it can be classified with other
-      # languages that have shebang scripts.
-      extensions = FileBlob.new(name).extensions
-      if extensions.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
-        name += ".script!"
-      end
-
-      # Find languages that match based on filename.
-      possible_languages = find_by_filename(name)
-
-      if possible_languages.length == 1
-        # Simplest and most common case, we can just return the one match based
-        # on extension
-        possible_languages.first
-
-      # If there is more than one possible language with that extension (or no
-      # extension at all, in the case of extensionless scripts), we need to
-      # continue our detection work
-      else
-        # Matches possible_languages.length == 0 || possible_languages.length > 0
-        data = blob.data
-
-        # Check if there's a shebang line and use that as authoritative
-        if (result = find_by_shebang(data)) && !result.empty?
-          return result.first
-
-        # More than one language with that extension. We need to make a choice.
-        elsif possible_languages.length > 1
-
-          # First try heuristics
-
-          possible_language_names = possible_languages.map(&:name)
-          heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
-
-          # If there are multiple possible languages returned from heuristics
-          # then reduce language candidates for Bayesian classifier here.
-          if heuristic_languages.size > 1
-            possible_language_names = heuristic_languages.map(&:name)
-          end
-
-          if heuristic_languages.size == 1
-            return heuristic_languages.first
-          # Lastly, fall back to the probabilistic classifier.
-          elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
-            # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
-            return Language[classified[0]]
-          end
+      # Call each strategy until one candidate is returned.
+      STRATEGIES.reduce([]) do |languages, strategy|
+        # Strategies may hand back nil instead of an Array; treat nil as no candidates.
+        candidates = Array(strategy.call(blob, languages))
+        if candidates.size == 1
+          return candidates.first
+        elsif candidates.size > 1
+          # More than one candidate was found, pass them to the next strategy.
+          candidates
+        else
+          # No candidates were found, pass on languages from the previous strategy.
+          languages
         end
-      end
+      end.first
     end
 
     # Public: Get all Languages
diff --git a/lib/linguist/strategy/filename.rb b/lib/linguist/strategy/filename.rb
new file mode 100644
index 00000000..e682863b
--- /dev/null
+++ b/lib/linguist/strategy/filename.rb
@@ -0,0 +1,27 @@
+module Linguist
+  module Strategy
+    # Detects language based on filename and/or extension
+    class Filename
+      # Public: Use the filename to detect language of the blob.
+      #
+      # blob - An object that quacks like a blob.
+      # _    - Ignored; present so all strategies share one call signature.
+      #
+      # Returns whatever Language.find_by_filename yields for the name.
+      def self.call(blob, _)
+        name = blob.name.to_s
+
+        # A bit of an elegant hack. If the file is executable but extensionless,
+        # append a "magic" extension so it can be classified with other
+        # languages that have shebang scripts.
+        extensions = FileBlob.new(name).extensions
+        # The mode is an octal string; the 05 mask requires both low bits 4 and 1.
+        if extensions.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
+          name += ".script!"
+        end
+
+        Language.find_by_filename(name)
+      end
+    end
+  end
+end
diff --git a/lib/linguist/strategy/shebang.rb b/lib/linguist/strategy/shebang.rb
new file mode 100644
index 00000000..dd5bc38b
--- /dev/null
+++ b/lib/linguist/strategy/shebang.rb
@@ -0,0 +1,16 @@
+module Linguist
+  module Strategy
+    # Check if there's a shebang line and use that as authoritative
+    class Shebang
+      # Public: Use the shebang line to detect language of the blob.
+      #
+      # blob - An object that quacks like a blob.
+      # _    - Ignored; present so all strategies share one call signature.
+      #
+      # Returns whatever Language.find_by_shebang yields for the blob's data.
+      def self.call(blob, _)
+        Language.find_by_shebang(blob.data)
+      end
+    end
+  end
+end