From 3bc17e822da6223a783ad222448f81c687110756 Mon Sep 17 00:00:00 2001
From: Ted Nyman
Date: Sat, 14 Dec 2013 23:01:04 -0800
Subject: [PATCH] Start on basic heuristic approach

---
 lib/linguist.rb            |  1 +
 lib/linguist/heuristics.rb | 61 ++++++++++++++++++++++++++++++++++++++
 lib/linguist/language.rb   | 15 +++++++++-
 3 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 lib/linguist/heuristics.rb

diff --git a/lib/linguist.rb b/lib/linguist.rb
index e717fb67..ad8337c8 100644
--- a/lib/linguist.rb
+++ b/lib/linguist.rb
@@ -1,5 +1,6 @@
 require 'linguist/blob_helper'
 require 'linguist/generated'
+require 'linguist/heuristics'
 require 'linguist/language'
 require 'linguist/repository'
 require 'linguist/samples'
diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb
new file mode 100644
index 00000000..16090b51
--- /dev/null
+++ b/lib/linguist/heuristics.rb
@@ -0,0 +1,61 @@
+require 'linguist/tokenizer'
+
+module Linguist
+  # A collection of simple heuristics that can be used to better analyze languages.
+  class Heuristics
+    # Public: Given an array of String language names, apply the heuristic
+    # for that set of candidates against the given data and return an array
+    # of matching languages.
+    #
+    # data      - Array of tokens or String data to analyze. The checks are
+    #             substring searches, so they are only meaningful for a String.
+    # languages - Array of language name Strings to restrict to. Names are
+    #             compared case-insensitively.
+    #
+    # Returns an Array of Languages, or [] when no heuristic applies or matches.
+    def self.find_by_heuristics(data, languages)
+      # `[].all?` is true, so without this guard an empty candidate list
+      # would be treated as a pod/perl conflict.
+      return [] if data.nil? || languages.empty?
+
+      names = languages.map(&:downcase)
+      if names.all? { |l| %w[pod perl].include?(l) }
+        disambiguate_pod(data, languages)
+      elsif names.all? { |l| %w[objective-c c++ c].include?(l) }
+        disambiguate_h(data, languages)
+      else
+        []
+      end
+    end
+
+    # Internal: Initialize a Heuristics class
+    def initialize
+    end
+
+    # Internal: .pod extensions are ambiguous between Perl and Pod.
+    #
+    # data      - Array of tokens or String data to analyze.
+    # languages - Array of candidate language name Strings (currently unused).
+    #
+    # Returns an Array holding the Perl Language if `my $` is found, or [].
+    def self.disambiguate_pod(data, languages)
+      matches = []
+      matches << Language["Perl"] if data.include?("my $")
+      matches
+    end
+
+    # Internal: .h extensions are ambiguous between C, C++, and Objective-C.
+    # We want to look for Objective-C.
+    #
+    # data      - Array of tokens or String data to analyze.
+    # languages - Array of candidate language name Strings (currently unused).
+    #
+    # Returns an Array holding the Objective-C Language if both `NSData *`
+    # and `@interface` are found, or [].
+    def self.disambiguate_h(data, languages)
+      matches = []
+      matches << Language["Objective-C"] if data.include?("NSData *") && data.include?("@interface")
+      matches
+    end
+  end
+end
diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb
index 4e118807..0408f17f 100644
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -7,6 +7,7 @@ rescue LoadError
 end
 
 require 'linguist/classifier'
+require 'linguist/heuristics'
 require 'linguist/samples'
 
 module Linguist
@@ -113,20 +114,32 @@ module Linguist
         name += ".script!"
       end
 
+      # First try to find languages that match based on filename.
       possible_languages = find_by_filename(name)
 
+      # If there is more than one possible language with that extension (or no
+      # extension at all, in the case of extensionless scripts), we need to continue
+      # our detection work
       if possible_languages.length > 1
         data = data.call() if data.respond_to?(:call)
+        possible_language_names = possible_languages.map(&:name)
 
+        # Don't bother with emptiness
         if data.nil? || data == ""
           nil
+        # Check if there's a shebang line and use that as authoritative
         elsif (result = find_by_shebang(data)) && !result.empty?
           result.first
-        elsif classified = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
+        # No shebang. Still more work to do. Try to find it with our heuristics.
+        elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
+          determined.first
+        # Lastly, fall back to the probabilistic classifier.
+        elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names).first
           # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
           Language[classified[0]]
         end
       else
+        # Simplest and most common case, we can just return the one match based on extension
         possible_languages.first
       end
     end