From 6bd97c7fc73671d818e8f3a081ad768f2c30ba7b Mon Sep 17 00:00:00 2001 From: Ted Nyman Date: Sat, 14 Dec 2013 18:51:34 -0800 Subject: [PATCH 1/6] Bit of docs --- lib/linguist/language.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 8dc89096..4e118807 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -123,6 +123,7 @@ module Linguist elsif (result = find_by_shebang(data)) && !result.empty? result.first elsif classified = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first + # Return the actual Language object based of the string language name (i.e., first element of `#classify`) Language[classified[0]] end else From 3bc17e822da6223a783ad222448f81c687110756 Mon Sep 17 00:00:00 2001 From: Ted Nyman Date: Sat, 14 Dec 2013 23:01:04 -0800 Subject: [PATCH 2/6] Start on basic heuristic approach --- lib/linguist.rb | 1 + lib/linguist/heuristics.rb | 43 ++++++++++++++++++++++++++++++++++++++ lib/linguist/language.rb | 15 ++++++++++++- 3 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 lib/linguist/heuristics.rb diff --git a/lib/linguist.rb b/lib/linguist.rb index e717fb67..ad8337c8 100644 --- a/lib/linguist.rb +++ b/lib/linguist.rb @@ -1,5 +1,6 @@ require 'linguist/blob_helper' require 'linguist/generated' +require 'linguist/heuristics' require 'linguist/language' require 'linguist/repository' require 'linguist/samples' diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb new file mode 100644 index 00000000..16090b51 --- /dev/null +++ b/lib/linguist/heuristics.rb @@ -0,0 +1,43 @@ +require 'linguist/tokenizer' + +module Linguist + # A collection of simple heuristics that can be used to better analysis languages. + class Heuristics + # Public: Given an array of String language names, a + # apply all heuristics against the given data and return an array + # of matching languages, or nil. 
+ # data - Array of tokens or String data to analyze. + # languages - Array of language name Strings to restrict to. + + # Returns an array of language name Strings, or [] + def self.find_by_heuristics(data, languages) + if languages.all? { |l| ["pod", "perl"].include?(l) } + disambiguate_pod(data, languages) + elsif languages.all? { |l| ["objective-c", "c++"].include?(l) } + disambiguate_h(data, languages) + end + end + + # Internal: Initialize a Heuristics class + def initialize + end + + # .pod extensions are ambigious between perl and pod. + # + # Returns an array of still-possible languages, or nil + def self.disambiguate_pod(data, languages) + matches = [] + matches << Language["Perl"] if data.includes?("my $") + matches + end + + # .h extensions are ambigious between C, C++, and Objective-C. + # We want to look for Objective-C. + def self.disambiguate_h(data, languages) + matches = [] + matches << Language["Objective-C"] if data.includes?("NSData *") && data.includes?("@interface") + matches + end + end +end + diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 4e118807..0408f17f 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -7,6 +7,7 @@ rescue LoadError end require 'linguist/classifier' +require 'linguist/heuristics' require 'linguist/samples' module Linguist @@ -113,20 +114,32 @@ module Linguist name += ".script!" end + # First try to find languages that match based on filename. possible_languages = find_by_filename(name) + # If there is more than one possible language with that extension (or no + # extension at all, in the case of extensionless scripts), we need to continue + # our detection work if possible_languages.length > 1 data = data.call() if data.respond_to?(:call) + possible_language_names = possible_languages.map(&:name) + # Don't bother with emptiness if data.nil? 
|| data == "" nil + # Check if there's a shebang line and use that as authoritative elsif (result = find_by_shebang(data)) && !result.empty? result.first - elsif classified = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first + # No shebang. Still more work to do. Try to find it with our heuristics. + elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty? + determined.first + # Lastly, fall back to the probabilistic classifier. + elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names ).first # Return the actual Language object based of the string language name (i.e., first element of `#classify`) Language[classified[0]] end else + # Simplest and most common case, we can just return the one match based on extension possible_languages.first end end From 0c668ee1795e3112531740e79394155c948c23df Mon Sep 17 00:00:00 2001 From: Ted Nyman Date: Sun, 15 Dec 2013 12:25:47 -0800 Subject: [PATCH 3/6] Some README updates --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 80ab56c0..91f898e5 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,11 @@ Linguist defines the list of all languages known to GitHub in a [yaml file](http Most languages are detected by their file extension. This is the fastest and most common situation. -For disambiguating between files with common extensions, we use a [Bayesian classifier](https://github.com/github/linguist/blob/master/lib/linguist/classifier.rb). For an example, this helps us tell the difference between `.h` files which could be either C, C++, or Obj-C. +For disambiguating between files with common extensions, we first apply +some common-sense heuristics to pick out obvious languages. After that, we use a +[Bayesian +classifier](https://github.com/github/linguist/blob/master/lib/linguist/classifier.rb). 
+For an example, this process us tell the difference between `.h` files which could be either C, C++, or Obj-C. In the actual GitHub app we deal with `Grit::Blob` objects. For testing, there is a simple `FileBlob` API. @@ -31,7 +35,7 @@ We typically run on a pre-release version of Pygments, [pygments.rb](https://git ### Stats -The Language Graph you see on every repository is built by aggregating the languages of each file in that repository. +The Language Graph you see on every repository is built by aggregating the languages of each file in that repository. The top language in the graph determines the project's primary language. Collectively, these stats make up the [Top Languages](https://github.com/languages) page. The repository stats API, accessed through `#languages`, can be used on a directory: From b3c6c85387e8416b89d4504287b4a10a9711b986 Mon Sep 17 00:00:00 2001 From: Ted Nyman Date: Sun, 15 Dec 2013 20:06:59 -0800 Subject: [PATCH 4/6] Start on basic heuristics tests --- lib/linguist/heuristics.rb | 8 ++++---- test/test_heuristics.rb | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 test/test_heuristics.rb diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 16090b51..8871f118 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -11,9 +11,9 @@ module Linguist # Returns an array of language name Strings, or [] def self.find_by_heuristics(data, languages) - if languages.all? { |l| ["pod", "perl"].include?(l) } + if languages.all? { |l| ["pod", "perl"].include?(l.downcase) } disambiguate_pod(data, languages) - elsif languages.all? { |l| ["objective-c", "c++"].include?(l) } + elsif languages.all? 
{ |l| ["objective-c", "c++"].include?(l.downcase) } disambiguate_h(data, languages) end end @@ -27,7 +27,7 @@ module Linguist # Returns an array of still-possible languages, or nil def self.disambiguate_pod(data, languages) matches = [] - matches << Language["Perl"] if data.includes?("my $") + matches << Language["Perl"] if data.include?("my $") matches end @@ -35,7 +35,7 @@ module Linguist # We want to look for Objective-C. def self.disambiguate_h(data, languages) matches = [] - matches << Language["Objective-C"] if data.includes?("NSData *") && data.includes?("@interface") + matches << Language["Objective-C"] if data.include?("@interface") matches end end diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb new file mode 100644 index 00000000..eb5f251c --- /dev/null +++ b/test/test_heuristics.rb @@ -0,0 +1,23 @@ +require 'linguist/heuristics' +require 'linguist/language' +require 'linguist/samples' + +require 'test/unit' + +class TestHeuristcs < Test::Unit::TestCase + include Linguist + + def samples_path + File.expand_path("../../samples", __FILE__) + end + + def fixture(name) + File.read(File.join(samples_path, name)) + end + + def test_find_by_heuristics + languages = ["C++", "Objective-C"] + results = Heuristics.find_by_heuristics(fixture("Objective-C/StyleViewController.h"), languages) + assert_equal Language["Objective-C"], results.first + end +end From 0626def6998f077cf6df5d910396e32a9483b22b Mon Sep 17 00:00:00 2001 From: Ted Nyman Date: Sun, 15 Dec 2013 20:15:19 -0800 Subject: [PATCH 5/6] Start with Objective-C --- lib/linguist/heuristics.rb | 22 ++++------------------ test/test_heuristics.rb | 5 +++++ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 8871f118..a89f7c70 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -6,33 +6,19 @@ module Linguist # Public: Given an array of String language names, a # apply all heuristics against the given 
data and return an array # of matching languages, or nil. + # # data - Array of tokens or String data to analyze. # languages - Array of language name Strings to restrict to. - + # # Returns an array of language name Strings, or [] def self.find_by_heuristics(data, languages) - if languages.all? { |l| ["pod", "perl"].include?(l.downcase) } - disambiguate_pod(data, languages) - elsif languages.all? { |l| ["objective-c", "c++"].include?(l.downcase) } + if languages.all? { |l| ["Objective-C", "C++"].include?(l) } disambiguate_h(data, languages) end end - # Internal: Initialize a Heuristics class - def initialize - end - - # .pod extensions are ambigious between perl and pod. - # - # Returns an array of still-possible languages, or nil - def self.disambiguate_pod(data, languages) - matches = [] - matches << Language["Perl"] if data.include?("my $") - matches - end - # .h extensions are ambigious between C, C++, and Objective-C. - # We want to look for Objective-C. + # We want to shortcut look for Objective-C. 
def self.disambiguate_h(data, languages) matches = [] matches << Language["Objective-C"] if data.include?("@interface") diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index eb5f251c..871e9878 100644 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -20,4 +20,9 @@ class TestHeuristcs < Test::Unit::TestCase results = Heuristics.find_by_heuristics(fixture("Objective-C/StyleViewController.h"), languages) assert_equal Language["Objective-C"], results.first end + + def test_detect_still_works_if_nothing_matches + match = Language.detect("Hello.m", fixture("Objective-C/hello.m")) + assert_equal Language["Objective-C"], match + end end From 17d0b1e02f49a52381576b107004de98805e015e Mon Sep 17 00:00:00 2001 From: Ted Nyman Date: Sun, 15 Dec 2013 20:17:30 -0800 Subject: [PATCH 6/6] More documentation --- README.md | 2 +- lib/linguist/heuristics.rb | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 91f898e5..1300d1ef 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ For disambiguating between files with common extensions, we first apply some common-sense heuristics to pick out obvious languages. After that, we use a [Bayesian classifier](https://github.com/github/linguist/blob/master/lib/linguist/classifier.rb). -For an example, this process us tell the difference between `.h` files which could be either C, C++, or Obj-C. +For an example, this process can help us tell the difference between `.h` files which could be either C, C++, or Obj-C. In the actual GitHub app we deal with `Grit::Blob` objects. For testing, there is a simple `FileBlob` API. diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index a89f7c70..7a40503a 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -1,16 +1,14 @@ -require 'linguist/tokenizer' - module Linguist # A collection of simple heuristics that can be used to better analysis languages. 
class Heuristics - # Public: Given an array of String language names, a - # apply all heuristics against the given data and return an array + # Public: Given an array of String language names, + # apply heuristics against the given data and return an array # of matching languages, or nil. # # data - Array of tokens or String data to analyze. # languages - Array of language name Strings to restrict to. # - # Returns an array of language name Strings, or [] + # Returns an array of Languages or [] def self.find_by_heuristics(data, languages) if languages.all? { |l| ["Objective-C", "C++"].include?(l) } disambiguate_h(data, languages) @@ -19,6 +17,8 @@ module Linguist # .h extensions are ambigious between C, C++, and Objective-C. # We want to shortcut look for Objective-C. + # + # Returns an array of Languages or [] def self.disambiguate_h(data, languages) matches = [] matches << Language["Objective-C"] if data.include?("@interface") @@ -26,4 +26,3 @@ module Linguist end end end -