diff --git a/README.md b/README.md index 80ab56c0..1300d1ef 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,11 @@ Linguist defines the list of all languages known to GitHub in a [yaml file](http Most languages are detected by their file extension. This is the fastest and most common situation. -For disambiguating between files with common extensions, we use a [Bayesian classifier](https://github.com/github/linguist/blob/master/lib/linguist/classifier.rb). For an example, this helps us tell the difference between `.h` files which could be either C, C++, or Obj-C. +For disambiguating between files with common extensions, we first apply +some common-sense heuristics to pick out obvious languages. After that, we use a +[Bayesian +classifier](https://github.com/github/linguist/blob/master/lib/linguist/classifier.rb). +For example, this process can help us tell the difference between `.h` files which could be either C, C++, or Obj-C. In the actual GitHub app we deal with `Grit::Blob` objects. For testing, there is a simple `FileBlob` API. @@ -31,7 +35,7 @@ We typically run on a pre-release version of Pygments, [pygments.rb](https://git ### Stats -The Language Graph you see on every repository is built by aggregating the languages of each file in that repository. +The Language Graph you see on every repository is built by aggregating the languages of each file in that repository. The top language in the graph determines the project's primary language. Collectively, these stats make up the [Top Languages](https://github.com/languages) page. 
The repository stats API, accessed through `#languages`, can be used on a directory: diff --git a/lib/linguist.rb b/lib/linguist.rb index e717fb67..ad8337c8 100644 --- a/lib/linguist.rb +++ b/lib/linguist.rb @@ -1,5 +1,6 @@ require 'linguist/blob_helper' require 'linguist/generated' +require 'linguist/heuristics' require 'linguist/language' require 'linguist/repository' require 'linguist/samples' diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb new file mode 100644 index 00000000..7a40503a --- /dev/null +++ b/lib/linguist/heuristics.rb @@ -0,0 +1,28 @@ +module Linguist + # A collection of simple heuristics that can be used to better analyze languages. + class Heuristics + # Public: Given an array of String language names, + # apply heuristics against the given data and return an array + # of matching languages, or nil. + # + # data - Array of tokens or String data to analyze. + # languages - Array of language name Strings to restrict to. + # + # Returns an Array of Languages (possibly empty), or nil if `languages` is not limited to Objective-C and C++. + def self.find_by_heuristics(data, languages) + if languages.all? { |l| ["Objective-C", "C++"].include?(l) } + disambiguate_h(data, languages) + end + end + + # .h extensions are ambiguous between C, C++, and Objective-C. + # We want to shortcut the lookup for Objective-C. + # + # Returns an array of Languages or [] + def self.disambiguate_h(data, languages) + matches = [] + matches << Language["Objective-C"] if data.include?("@interface") + matches + end + end +end diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 8dc89096..0408f17f 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -7,6 +7,7 @@ rescue LoadError end require 'linguist/classifier' +require 'linguist/heuristics' require 'linguist/samples' module Linguist @@ -113,19 +114,32 @@ module Linguist name += ".script!" end + # First try to find languages that match based on filename. 
possible_languages = find_by_filename(name) + # If there is more than one possible language with that extension (or no + # extension at all, in the case of extensionless scripts), we need to continue + # our detection work if possible_languages.length > 1 data = data.call() if data.respond_to?(:call) + possible_language_names = possible_languages.map(&:name) + # Don't bother with emptiness if data.nil? || data == "" nil + # Check if there's a shebang line and use that as authoritative elsif (result = find_by_shebang(data)) && !result.empty? result.first - elsif classified = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first + # No shebang. Still more work to do. Try to find it with our heuristics. + elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty? + determined.first + # Lastly, fall back to the probabilistic classifier. + elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names ).first + # Return the actual Language object based on the string language name (i.e., first element of `#classify`) Language[classified[0]] end else + # Simplest and most common case, we can just return the one match based on extension possible_languages.first end end diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb new file mode 100644 index 00000000..871e9878 --- /dev/null +++ b/test/test_heuristics.rb @@ -0,0 +1,28 @@ +require 'linguist/heuristics' +require 'linguist/language' +require 'linguist/samples' + +require 'test/unit' + +class TestHeuristcs < Test::Unit::TestCase + include Linguist + + def samples_path + File.expand_path("../../samples", __FILE__) + end + + def fixture(name) + File.read(File.join(samples_path, name)) + end + + def test_find_by_heuristics + languages = ["C++", "Objective-C"] + results = Heuristics.find_by_heuristics(fixture("Objective-C/StyleViewController.h"), languages) + assert_equal Language["Objective-C"], results.first + end + + def 
test_detect_still_works_if_nothing_matches + match = Language.detect("Hello.m", fixture("Objective-C/hello.m")) + assert_equal Language["Objective-C"], match + end +end