mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 17:50:22 +00:00
@@ -10,7 +10,11 @@ Linguist defines the list of all languages known to GitHub in a [yaml file](http
|
|||||||
|
|
||||||
Most languages are detected by their file extension. This is the fastest and most common situation.
|
Most languages are detected by their file extension. This is the fastest and most common situation.
|
||||||
|
|
||||||
For disambiguating between files with common extensions, we use a [Bayesian classifier](https://github.com/github/linguist/blob/master/lib/linguist/classifier.rb). For an example, this helps us tell the difference between `.h` files which could be either C, C++, or Obj-C.
|
For disambiguating between files with common extensions, we first apply
|
||||||
|
some common-sense heuristics to pick out obvious languages. After that, we use a
|
||||||
|
[Bayesian
|
||||||
|
classifier](https://github.com/github/linguist/blob/master/lib/linguist/classifier.rb).
|
||||||
|
For example, this process can help us tell the difference between `.h` files, which could be either C, C++, or Objective-C.
|
||||||
|
|
||||||
In the actual GitHub app we deal with `Grit::Blob` objects. For testing, there is a simple `FileBlob` API.
|
In the actual GitHub app we deal with `Grit::Blob` objects. For testing, there is a simple `FileBlob` API.
|
||||||
|
|
||||||
@@ -31,7 +35,7 @@ We typically run on a pre-release version of Pygments, [pygments.rb](https://git
|
|||||||
|
|
||||||
### Stats
|
### Stats
|
||||||
|
|
||||||
The Language Graph you see on every repository is built by aggregating the languages of each file in that repository.
|
The Language Graph you see on every repository is built by aggregating the languages of each file in that repository.
|
||||||
The top language in the graph determines the project's primary language. Collectively, these stats make up the [Top Languages](https://github.com/languages) page.
|
The top language in the graph determines the project's primary language. Collectively, these stats make up the [Top Languages](https://github.com/languages) page.
|
||||||
|
|
||||||
The repository stats API, accessed through `#languages`, can be used on a directory:
|
The repository stats API, accessed through `#languages`, can be used on a directory:
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
require 'linguist/blob_helper'
|
require 'linguist/blob_helper'
|
||||||
require 'linguist/generated'
|
require 'linguist/generated'
|
||||||
|
require 'linguist/heuristics'
|
||||||
require 'linguist/language'
|
require 'linguist/language'
|
||||||
require 'linguist/repository'
|
require 'linguist/repository'
|
||||||
require 'linguist/samples'
|
require 'linguist/samples'
|
||||||
|
|||||||
28
lib/linguist/heuristics.rb
Normal file
28
lib/linguist/heuristics.rb
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
module Linguist
|
||||||
|
# A collection of simple heuristics that can be used to better analyze languages.
|
||||||
|
class Heuristics
|
||||||
|
# Public: Given an array of String language names,
|
||||||
|
# apply heuristics against the given data and return an array
|
||||||
|
# of matching languages, or nil.
|
||||||
|
#
|
||||||
|
# data - Array of tokens or String data to analyze.
|
||||||
|
# languages - Array of language name Strings to restrict to.
|
||||||
|
#
|
||||||
|
# Returns an Array of Languages, or nil when no heuristic applies to the given languages.
|
||||||
|
def self.find_by_heuristics(data, languages)
|
||||||
|
if languages.all? { |l| ["Objective-C", "C++"].include?(l) }
|
||||||
|
disambiguate_h(data, languages)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# .h extensions are ambiguous between C, C++, and Objective-C.
|
||||||
|
# We want to take a shortcut by looking for Objective-C first.
|
||||||
|
#
|
||||||
|
# Returns an array of Languages or []
|
||||||
|
def self.disambiguate_h(data, languages)
|
||||||
|
matches = []
|
||||||
|
matches << Language["Objective-C"] if data.include?("@interface")
|
||||||
|
matches
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
@@ -7,6 +7,7 @@ rescue LoadError
|
|||||||
end
|
end
|
||||||
|
|
||||||
require 'linguist/classifier'
|
require 'linguist/classifier'
|
||||||
|
require 'linguist/heuristics'
|
||||||
require 'linguist/samples'
|
require 'linguist/samples'
|
||||||
|
|
||||||
module Linguist
|
module Linguist
|
||||||
@@ -113,19 +114,32 @@ module Linguist
|
|||||||
name += ".script!"
|
name += ".script!"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# First try to find languages that match based on filename.
|
||||||
possible_languages = find_by_filename(name)
|
possible_languages = find_by_filename(name)
|
||||||
|
|
||||||
|
# If there is more than one possible language with that extension (or no
|
||||||
|
# extension at all, in the case of extensionless scripts), we need to continue
|
||||||
|
# our detection work
|
||||||
if possible_languages.length > 1
|
if possible_languages.length > 1
|
||||||
data = data.call() if data.respond_to?(:call)
|
data = data.call() if data.respond_to?(:call)
|
||||||
|
possible_language_names = possible_languages.map(&:name)
|
||||||
|
|
||||||
|
# Don't bother with emptiness
|
||||||
if data.nil? || data == ""
|
if data.nil? || data == ""
|
||||||
nil
|
nil
|
||||||
|
# Check if there's a shebang line and use that as authoritative
|
||||||
elsif (result = find_by_shebang(data)) && !result.empty?
|
elsif (result = find_by_shebang(data)) && !result.empty?
|
||||||
result.first
|
result.first
|
||||||
elsif classified = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
|
# No shebang. Still more work to do. Try to find it with our heuristics.
|
||||||
|
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
|
||||||
|
determined.first
|
||||||
|
# Lastly, fall back to the probabilistic classifier.
|
||||||
|
elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names ).first
|
||||||
|
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
|
||||||
Language[classified[0]]
|
Language[classified[0]]
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
|
# Simplest and most common case, we can just return the one match based on extension
|
||||||
possible_languages.first
|
possible_languages.first
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
28
test/test_heuristics.rb
Normal file
28
test/test_heuristics.rb
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
require 'linguist/heuristics'
|
||||||
|
require 'linguist/language'
|
||||||
|
require 'linguist/samples'
|
||||||
|
|
||||||
|
require 'test/unit'
|
||||||
|
|
||||||
|
class TestHeuristcs < Test::Unit::TestCase
|
||||||
|
include Linguist
|
||||||
|
|
||||||
|
def samples_path
|
||||||
|
File.expand_path("../../samples", __FILE__)
|
||||||
|
end
|
||||||
|
|
||||||
|
def fixture(name)
|
||||||
|
File.read(File.join(samples_path, name))
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_find_by_heuristics
|
||||||
|
languages = ["C++", "Objective-C"]
|
||||||
|
results = Heuristics.find_by_heuristics(fixture("Objective-C/StyleViewController.h"), languages)
|
||||||
|
assert_equal Language["Objective-C"], results.first
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_detect_still_works_if_nothing_matches
|
||||||
|
match = Language.detect("Hello.m", fixture("Objective-C/hello.m"))
|
||||||
|
assert_equal Language["Objective-C"], match
|
||||||
|
end
|
||||||
|
end
|
||||||
Reference in New Issue
Block a user