Merge pull request #839 from github/heuristics

Introduce heuristics
This commit is contained in:
Ted Nyman
2013-12-15 20:20:50 -08:00
5 changed files with 78 additions and 3 deletions

View File

@@ -10,7 +10,11 @@ Linguist defines the list of all languages known to GitHub in a [yaml file](http
Most languages are detected by their file extension. This is the fastest and most common situation.
For disambiguating between files with common extensions, we use a [Bayesian classifier](https://github.com/github/linguist/blob/master/lib/linguist/classifier.rb). For an example, this helps us tell the difference between `.h` files which could be either C, C++, or Obj-C.
For disambiguating between files with common extensions, we first apply
some common-sense heuristics to pick out obvious languages. After that, we use a
[Bayesian
classifier](https://github.com/github/linguist/blob/master/lib/linguist/classifier.rb).
For example, this process helps us tell apart `.h` files, which could be C, C++, or Obj-C.
In the actual GitHub app we deal with `Grit::Blob` objects. For testing, there is a simple `FileBlob` API.

View File

@@ -1,5 +1,6 @@
require 'linguist/blob_helper'
require 'linguist/generated'
require 'linguist/heuristics'
require 'linguist/language'
require 'linguist/repository'
require 'linguist/samples'

View File

@@ -0,0 +1,28 @@
module Linguist
  # A collection of simple heuristics that can be used to better analyze languages.
  class Heuristics
    # Public: Given an array of String language names, apply heuristics
    # against the given data and return an array of matching languages.
    #
    # data      - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns an Array of Languages, or [] when no heuristic applies or matches.
    def self.find_by_heuristics(data, languages)
      # The only heuristic so far targets candidate sets made up solely of
      # Objective-C and/or C++.
      if languages.all? { |l| ["Objective-C", "C++"].include?(l) }
        disambiguate_h(data, languages)
      else
        # Always hand back an Array (never nil), as documented, so callers
        # can safely call #empty? / #first on the result.
        []
      end
    end

    # .h extensions are ambiguous between C, C++, and Objective-C.
    # We want a shortcut that looks for Objective-C.
    #
    # data      - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns an Array of Languages or []
    def self.disambiguate_h(data, languages)
      matches = []
      # Only report Objective-C when it is actually one of the candidates;
      # this also covers an empty candidate list, for which `all?` above is
      # vacuously true.
      if languages.include?("Objective-C") && data.include?("@interface")
        matches << Language["Objective-C"]
      end
      matches
    end
  end
end

View File

@@ -7,6 +7,7 @@ rescue LoadError
end
require 'linguist/classifier'
require 'linguist/heuristics'
require 'linguist/samples'
module Linguist
@@ -113,19 +114,32 @@ module Linguist
name += ".script!"
end
# First try to find languages that match based on filename.
possible_languages = find_by_filename(name)
# If there is more than one possible language with that extension (or no
# extension at all, in the case of extensionless scripts), we need to continue
# our detection work
if possible_languages.length > 1
data = data.call() if data.respond_to?(:call)
possible_language_names = possible_languages.map(&:name)
# Don't bother with emptiness
if data.nil? || data == ""
nil
# Check if there's a shebang line and use that as authoritative
elsif (result = find_by_shebang(data)) && !result.empty?
result.first
elsif classified = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
# No shebang. Still more work to do. Try to find it with our heuristics.
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
determined.first
# Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names ).first
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
Language[classified[0]]
end
else
# Simplest and most common case, we can just return the one match based on extension
possible_languages.first
end
end

28
test/test_heuristics.rb Normal file
View File

@@ -0,0 +1,28 @@
require 'linguist/heuristics'
require 'linguist/language'
require 'linguist/samples'
require 'test/unit'
# Tests for Linguist::Heuristics and its integration with Language.detect.
class TestHeuristcs < Test::Unit::TestCase
  include Linguist

  # Absolute path of the samples directory, resolved relative to this file.
  def samples_path
    File.expand_path("../../samples", __FILE__)
  end

  # Reads the contents of a sample file.
  #
  # name - String path relative to the samples directory.
  #
  # Returns the file contents as a String.
  def fixture(name)
    sample_file = File.join(samples_path, name)
    File.read(sample_file)
  end

  # An Objective-C header should be picked out of the C++/Objective-C pair.
  def test_find_by_heuristics
    candidates = ["C++", "Objective-C"]
    header = fixture("Objective-C/StyleViewController.h")
    found = Heuristics.find_by_heuristics(header, candidates)

    assert_equal Language["Objective-C"], found.first
  end

  # Detection must still return a language when no heuristic produces a match.
  def test_detect_still_works_if_nothing_matches
    source = fixture("Objective-C/hello.m")
    detected = Language.detect("Hello.m", source)

    assert_equal Language["Objective-C"], detected
  end
end