From 8d7b4f81b44d578fba4d162dc3a57635df7c5cb3 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Sun, 2 Nov 2014 22:15:52 -0500 Subject: [PATCH 1/9] Extract filename strategy --- lib/linguist/language.rb | 15 +++------------ lib/linguist/strategy/filename.rb | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 12 deletions(-) create mode 100644 lib/linguist/strategy/filename.rb diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 49c4a6be..3e8f39d3 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -92,6 +92,8 @@ module Linguist language end + require 'linguist/strategy/filename' + # Public: Detects the Language of the blob. # # blob - an object that includes the Linguist `BlobHelper` interface; @@ -99,8 +101,6 @@ module Linguist # # Returns Language or nil. def self.detect(blob) - name = blob.name.to_s - # Check if the blob is possibly binary and bail early; this is a cheap # test that uses the extension name to guess a binary binary mime type. # @@ -108,16 +108,7 @@ module Linguist # looking for binary characters in the blob return nil if blob.likely_binary? || blob.binary? - # A bit of an elegant hack. If the file is executable but extensionless, - # append a "magic" extension so it can be classified with other - # languages that have shebang scripts. - extension = FileBlob.new(name).extension - if extension.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05 - name += ".script!" - end - - # First try to find languages that match based on filename. 
- possible_languages = find_by_filename(name) + possible_languages = Linguist::Strategy::Filename.new.call(blob) # If there is more than one possible language with that extension (or no # extension at all, in the case of extensionless scripts), we need to continue diff --git a/lib/linguist/strategy/filename.rb b/lib/linguist/strategy/filename.rb new file mode 100644 index 00000000..57c544ec --- /dev/null +++ b/lib/linguist/strategy/filename.rb @@ -0,0 +1,20 @@ +module Linguist + module Strategy + class Filename + def call(blob) + name = blob.name.to_s + + # A bit of an elegant hack. If the file is executable but extensionless, + # append a "magic" extension so it can be classified with other + # languages that have shebang scripts. + extension = FileBlob.new(name).extension + if extension.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05 + name += ".script!" + end + + # First try to find languages that match based on filename. + possible_languages = Language.find_by_filename(name) + end + end + end +end From fd32938cd8620bc1055960133c514c866b4d5a03 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Mon, 3 Nov 2014 08:15:20 -0500 Subject: [PATCH 2/9] Extract strategies for detecting the language --- lib/linguist/heuristics.rb | 4 +++ lib/linguist/language.rb | 53 ++++++++++++----------------- lib/linguist/strategy/classifier.rb | 12 +++++++ lib/linguist/strategy/filename.rb | 4 +-- lib/linguist/strategy/shebang.rb | 9 +++++ 5 files changed, 49 insertions(+), 33 deletions(-) create mode 100644 lib/linguist/strategy/classifier.rb create mode 100644 lib/linguist/strategy/shebang.rb diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index fe7f69be..247d2bbf 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -3,6 +3,10 @@ module Linguist class Heuristics ACTIVE = true + def self.call(blob, languages) + find_by_heuristics(blob.data, langauges) + end + # Public: Given an array of String language names, # apply heuristics 
against the given data and return an array # of matching languages, or nil. diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 3e8f39d3..58881125 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -93,6 +93,17 @@ module Linguist end require 'linguist/strategy/filename' + require 'linguist/strategy/shebang' + require 'linguist/strategy/classifier' + STRATEGIES = [ + Linguist::Strategy::Filename, + # Don't bother with binary contents or an empty file + lambda {|blob, langauges| [] if blob.data.nil? || blob.data == "" }, + # Check if there's a shebang line and use that as authoritative + Linguist::Strategy::Shebang, + Linguist::Heuristics, + Linguist::Strategy::Classifier + ] # Public: Detects the Language of the blob. # @@ -101,40 +112,20 @@ module Linguist # # Returns Language or nil. def self.detect(blob) - # Check if the blob is possibly binary and bail early; this is a cheap - # test that uses the extension name to guess a binary binary mime type. - # - # We'll perform a more comprehensive test later which actually involves - # looking for binary characters in the blob + # Check if the blob is possibly binary and bail early. return nil if blob.likely_binary? || blob.binary? - possible_languages = Linguist::Strategy::Filename.new.call(blob) - - # If there is more than one possible language with that extension (or no - # extension at all, in the case of extensionless scripts), we need to continue - # our detection work - if possible_languages.length > 1 - data = blob.data - possible_language_names = possible_languages.map(&:name) - - # Don't bother with binary contents or an empty file - if data.nil? || data == "" - nil - # Check if there's a shebang line and use that as authoritative - elsif (result = find_by_shebang(data)) && !result.empty? - result.first - # No shebang. Still more work to do. Try to find it with our heuristics. 
- elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty? - determined.first - # Lastly, fall back to the probabilistic classifier. - elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first - # Return the actual Language object based of the string language name (i.e., first element of `#classify`) - Language[classified[0]] + STRATEGIES.reduce([]) do |languages, strategy| + if candidates = strategy.call(blob, languages) + if candidates.size > 1 + candidates + else + break candidates + end + else + languages end - else - # Simplest and most common case, we can just return the one match based on extension - possible_languages.first - end + end.first end # Public: Get all Languages diff --git a/lib/linguist/strategy/classifier.rb b/lib/linguist/strategy/classifier.rb new file mode 100644 index 00000000..e362d9fb --- /dev/null +++ b/lib/linguist/strategy/classifier.rb @@ -0,0 +1,12 @@ +module Linguist + module Strategy + class Classifier + def self.call(blob, languages) + Linguist::Classifier.classify(Samples.cache, blob.data, possible_language_names).map do |name| + # Return the actual Language object based of the string language name (i.e., first element of `#classify`) + Language[name] + end + end + end + end +end diff --git a/lib/linguist/strategy/filename.rb b/lib/linguist/strategy/filename.rb index 57c544ec..3a3dcca9 100644 --- a/lib/linguist/strategy/filename.rb +++ b/lib/linguist/strategy/filename.rb @@ -1,7 +1,7 @@ module Linguist module Strategy class Filename - def call(blob) + def self.call(blob, _) name = blob.name.to_s # A bit of an elegant hack. If the file is executable but extensionless, @@ -13,7 +13,7 @@ module Linguist end # First try to find languages that match based on filename. 
- possible_languages = Language.find_by_filename(name) + Language.find_by_filename(name) end end end diff --git a/lib/linguist/strategy/shebang.rb b/lib/linguist/strategy/shebang.rb new file mode 100644 index 00000000..244e4c85 --- /dev/null +++ b/lib/linguist/strategy/shebang.rb @@ -0,0 +1,9 @@ +module Linguist + module Strategy + class Shebang + def self.call(blob, _) + Language.find_by_shebang(blob.data) + end + end + end +end From 815337299a9cfc988460ff02924b303bd3d89bd9 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Mon, 3 Nov 2014 08:21:46 -0500 Subject: [PATCH 3/9] Extract empty blob strategy --- lib/linguist/language.rb | 6 +++--- lib/linguist/strategy/empty_blob.rb | 10 ++++++++++ lib/linguist/strategy/shebang.rb | 1 + 3 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 lib/linguist/strategy/empty_blob.rb diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 58881125..17b7ddad 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -93,13 +93,13 @@ module Linguist end require 'linguist/strategy/filename' + require 'linguist/strategy/empty_blob' require 'linguist/strategy/shebang' require 'linguist/strategy/classifier' + STRATEGIES = [ Linguist::Strategy::Filename, - # Don't bother with binary contents or an empty file - lambda {|blob, langauges| [] if blob.data.nil? || blob.data == "" }, - # Check if there's a shebang line and use that as authoritative + Linguist::Strategy::EmptyBlob, Linguist::Strategy::Shebang, Linguist::Heuristics, Linguist::Strategy::Classifier diff --git a/lib/linguist/strategy/empty_blob.rb b/lib/linguist/strategy/empty_blob.rb new file mode 100644 index 00000000..cc30477d --- /dev/null +++ b/lib/linguist/strategy/empty_blob.rb @@ -0,0 +1,10 @@ +module Linguist + module Strategy + class EmptyBlob + def self.call(blob, langauges) + # Don't bother with binary contents or an empty file + [] if blob.data.nil? 
|| blob.data == "" + end + end + end +end diff --git a/lib/linguist/strategy/shebang.rb b/lib/linguist/strategy/shebang.rb index 244e4c85..dd5bc38b 100644 --- a/lib/linguist/strategy/shebang.rb +++ b/lib/linguist/strategy/shebang.rb @@ -1,5 +1,6 @@ module Linguist module Strategy + # Check if there's a shebang line and use that as authoritative class Shebang def self.call(blob, _) Language.find_by_shebang(blob.data) From 74fa4b9b7555507bd4788145eb6a82ce04aed02b Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Mon, 3 Nov 2014 08:54:11 -0500 Subject: [PATCH 4/9] docs --- lib/linguist/language.rb | 3 +++ lib/linguist/strategy/classifier.rb | 3 ++- lib/linguist/strategy/empty_blob.rb | 3 ++- lib/linguist/strategy/filename.rb | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 17b7ddad..e53037a5 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -115,11 +115,14 @@ module Linguist # Check if the blob is possibly binary and bail early. return nil if blob.likely_binary? || blob.binary? 
+ # Call each strategy until 0 or 1 candidates are returned STRATEGIES.reduce([]) do |languages, strategy| if candidates = strategy.call(blob, languages) if candidates.size > 1 + # More than one candidate was found, return them for the next strategy candidates else + # 1 or 0 candidates, stop trying strategies break candidates end else diff --git a/lib/linguist/strategy/classifier.rb b/lib/linguist/strategy/classifier.rb index e362d9fb..0bdbae69 100644 --- a/lib/linguist/strategy/classifier.rb +++ b/lib/linguist/strategy/classifier.rb @@ -1,8 +1,9 @@ module Linguist module Strategy + # Detect language using the bayesian classifier class Classifier def self.call(blob, languages) - Linguist::Classifier.classify(Samples.cache, blob.data, possible_language_names).map do |name| + Linguist::Classifier.classify(Samples.cache, blob.data, laguages.map(&:name)).map do |name| # Return the actual Language object based of the string language name (i.e., first element of `#classify`) Language[name] end diff --git a/lib/linguist/strategy/empty_blob.rb b/lib/linguist/strategy/empty_blob.rb index cc30477d..a43b6c9d 100644 --- a/lib/linguist/strategy/empty_blob.rb +++ b/lib/linguist/strategy/empty_blob.rb @@ -1,8 +1,9 @@ module Linguist module Strategy + # Stops detection if the blob contents are empty class EmptyBlob def self.call(blob, langauges) - # Don't bother with binary contents or an empty file + # Return empty array to stop detection [] if blob.data.nil? || blob.data == "" end end diff --git a/lib/linguist/strategy/filename.rb b/lib/linguist/strategy/filename.rb index 3a3dcca9..163d96e9 100644 --- a/lib/linguist/strategy/filename.rb +++ b/lib/linguist/strategy/filename.rb @@ -1,5 +1,6 @@ module Linguist module Strategy + # Detects language based on filename and/or extension class Filename def self.call(blob, _) name = blob.name.to_s @@ -12,7 +13,6 @@ module Linguist name += ".script!" end - # First try to find languages that match based on filename. 
Language.find_by_filename(name) end end From a4081498f89a4b570dec8586eeadd100fce470a5 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 27 Nov 2014 10:55:03 -0500 Subject: [PATCH 5/9] Remove unneeded empty blob check --- lib/linguist/language.rb | 2 -- lib/linguist/strategy/empty_blob.rb | 11 ----------- 2 files changed, 13 deletions(-) delete mode 100644 lib/linguist/strategy/empty_blob.rb diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 4b2651cc..c4194ea2 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -92,13 +92,11 @@ module Linguist end require 'linguist/strategy/filename' - require 'linguist/strategy/empty_blob' require 'linguist/strategy/shebang' require 'linguist/strategy/classifier' STRATEGIES = [ Linguist::Strategy::Filename, - Linguist::Strategy::EmptyBlob, Linguist::Strategy::Shebang, Linguist::Heuristics, Linguist::Strategy::Classifier diff --git a/lib/linguist/strategy/empty_blob.rb b/lib/linguist/strategy/empty_blob.rb deleted file mode 100644 index a43b6c9d..00000000 --- a/lib/linguist/strategy/empty_blob.rb +++ /dev/null @@ -1,11 +0,0 @@ -module Linguist - module Strategy - # Stops detection if the blob contents are empty - class EmptyBlob - def self.call(blob, langauges) - # Return empty array to stop detection - [] if blob.data.nil? 
|| blob.data == "" - end - end - end -end From c1a97373139c2d46369e15d4f92c608c76a67614 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 27 Nov 2014 11:12:47 -0500 Subject: [PATCH 6/9] Try strategies until one language is returned --- lib/linguist/heuristics.rb | 2 +- lib/linguist/language.rb | 24 +++++++++++------------- lib/linguist/strategy/classifier.rb | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index d6488c7f..574b280c 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -4,7 +4,7 @@ module Linguist ACTIVE = true def self.call(blob, languages) - find_by_heuristics(blob.data, langauges.map(&:name)) + find_by_heuristics(blob.data, languages.map(&:name)) end # Public: Given an array of String language names, diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index c4194ea2..40570356 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -10,6 +10,9 @@ require 'linguist/heuristics' require 'linguist/samples' require 'linguist/file_blob' require 'linguist/blob_helper' +require 'linguist/strategy/filename' +require 'linguist/strategy/shebang' +require 'linguist/strategy/classifier' module Linguist # Language names that are recognizable by GitHub. Defined languages @@ -91,10 +94,6 @@ module Linguist language end - require 'linguist/strategy/filename' - require 'linguist/strategy/shebang' - require 'linguist/strategy/classifier' - STRATEGIES = [ Linguist::Strategy::Filename, Linguist::Strategy::Shebang, @@ -112,17 +111,16 @@ module Linguist # Bail early if the blob is binary or empty. return nil if blob.likely_binary? || blob.binary? || blob.empty? 
- # Call each strategy until 0 or 1 candidates are returned + # Call each strategy until one candidate is returned STRATEGIES.reduce([]) do |languages, strategy| - if candidates = strategy.call(blob, languages) - if candidates.size > 1 - # More than one candidate was found, return them for the next strategy - candidates - else - # 1 or 0 candidates, stop trying strategies - break candidates - end + candidates = strategy.call(blob, languages) + if candidates.size == 1 + return candidates.first + elsif candidates.size > 1 + # More than one candidate was found, pass them to the next strategy + candidates else + # Strategy couldn't find any candidates, so pass on the original list languages end end.first diff --git a/lib/linguist/strategy/classifier.rb b/lib/linguist/strategy/classifier.rb index 0bdbae69..86a516b4 100644 --- a/lib/linguist/strategy/classifier.rb +++ b/lib/linguist/strategy/classifier.rb @@ -3,7 +3,7 @@ module Linguist # Detect language using the bayesian classifier class Classifier def self.call(blob, languages) - Linguist::Classifier.classify(Samples.cache, blob.data, laguages.map(&:name)).map do |name| + Linguist::Classifier.classify(Samples.cache, blob.data, languages.map(&:name)).map do |name, _| # Return the actual Language object based of the string language name (i.e., first element of `#classify`) Language[name] end From bf4baff3632943e96ac6475b5703e8a777ae75f6 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 27 Nov 2014 11:29:38 -0500 Subject: [PATCH 7/9] Move call method into existing Classifier class --- lib/linguist/classifier.rb | 19 +++++++++++++++++++ lib/linguist/language.rb | 3 +-- lib/linguist/strategy/classifier.rb | 13 ------------- 3 files changed, 20 insertions(+), 15 deletions(-) delete mode 100644 lib/linguist/strategy/classifier.rb diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb index 5370bdd8..0515577b 100644 --- a/lib/linguist/classifier.rb +++ b/lib/linguist/classifier.rb @@ -3,6 +3,25 @@ 
require 'linguist/tokenizer' module Linguist # Language bayesian classifier. class Classifier + # Public: Use the classifier to detect language of the blob. + # + # blob - An object that quacks like a blob. + # possible_languages - Array of + # + # Examples + # + # Classifier.call(FileBlob.new("path/to/file"), [ + # Language["Ruby"], Language["Python"] + # ]) + # + # Returns an Array of possible lanuages, most probable first. + def self.call(blob, possible_languages) + language_names = possible_languages.map(&:name) + classify(Samples.cache, blob.data, language_names).map do |name, _| + Language[name] # Return the actual Language objects + end + end + # Public: Train classifier that data is a certain language. # # db - Hash classifier database object diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 40570356..07972019 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -12,7 +12,6 @@ require 'linguist/file_blob' require 'linguist/blob_helper' require 'linguist/strategy/filename' require 'linguist/strategy/shebang' -require 'linguist/strategy/classifier' module Linguist # Language names that are recognizable by GitHub. Defined languages @@ -98,7 +97,7 @@ module Linguist Linguist::Strategy::Filename, Linguist::Strategy::Shebang, Linguist::Heuristics, - Linguist::Strategy::Classifier + Linguist::Classifier ] # Public: Detects the Language of the blob. 
diff --git a/lib/linguist/strategy/classifier.rb b/lib/linguist/strategy/classifier.rb deleted file mode 100644 index 86a516b4..00000000 --- a/lib/linguist/strategy/classifier.rb +++ /dev/null @@ -1,13 +0,0 @@ -module Linguist - module Strategy - # Detect language using the bayesian classifier - class Classifier - def self.call(blob, languages) - Linguist::Classifier.classify(Samples.cache, blob.data, languages.map(&:name)).map do |name, _| - # Return the actual Language object based of the string language name (i.e., first element of `#classify`) - Language[name] - end - end - end - end -end From e42ccf0d82723843c3541cc9d8f7c8fc054a9b51 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 27 Nov 2014 11:40:48 -0500 Subject: [PATCH 8/9] docs --- lib/linguist/classifier.rb | 4 ++-- lib/linguist/heuristics.rb | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb index 0515577b..89a0df2f 100644 --- a/lib/linguist/classifier.rb +++ b/lib/linguist/classifier.rb @@ -6,7 +6,7 @@ module Linguist # Public: Use the classifier to detect language of the blob. # # blob - An object that quacks like a blob. - # possible_languages - Array of + # possible_languages - Array of Language objects # # Examples # @@ -14,7 +14,7 @@ module Linguist # Language["Ruby"], Language["Python"] # ]) # - # Returns an Array of possible lanuages, most probable first. + # Returns an Array of Language objects, most probable first. def self.call(blob, possible_languages) language_names = possible_languages.map(&:name) classify(Samples.cache, blob.data, language_names).map do |name, _| diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 574b280c..0930305c 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -3,6 +3,19 @@ module Linguist class Heuristics ACTIVE = true + # Public: Use heuristics to detect language of the blob. 
+ # + # blob - An object that quacks like a blob. + # possible_languages - Array of Language objects + # + # Examples + # + # Heuristics.call(FileBlob.new("path/to/file"), [ + # Language["Ruby"], Language["Python"] + # ]) + # + # Returns an Array with one Language if a heuristic matched, or empty if + # none matched or were inconclusive. def self.call(blob, languages) find_by_heuristics(blob.data, languages.map(&:name)) end From 577fb95384e154d698971884bcfd8a50d7983102 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 17:36:14 -0600 Subject: [PATCH 9/9] Tweak docs --- lib/linguist/language.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 07972019..2353edd1 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -110,16 +110,16 @@ module Linguist # Bail early if the blob is binary or empty. return nil if blob.likely_binary? || blob.binary? || blob.empty? - # Call each strategy until one candidate is returned + # Call each strategy until one candidate is returned. STRATEGIES.reduce([]) do |languages, strategy| candidates = strategy.call(blob, languages) if candidates.size == 1 return candidates.first elsif candidates.size > 1 - # More than one candidate was found, pass them to the next strategy + # More than one candidate was found, pass them to the next strategy. candidates else - # Strategy couldn't find any candidates, so pass on the original list + # No candidates were found, pass on languages from the previous strategy. languages end end.first