From fbc0947420d3fd4d8aff0eb79ab479cc41936fd5 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 08:14:30 -0600 Subject: [PATCH 1/8] Not going back --- lib/linguist/heuristics.rb | 59 ++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 0930305c..283b7649 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -1,7 +1,6 @@ module Linguist # A collection of simple heuristics that can be used to better analyze languages. class Heuristics - ACTIVE = true # Public: Use heuristics to detect language of the blob. # @@ -29,38 +28,36 @@ module Linguist # # Returns an array of Languages or [] def self.find_by_heuristics(data, languages) - if active? - result = [] + result = [] - if languages.all? { |l| ["Perl", "Prolog"].include?(l) } - result = disambiguate_pl(data) - end - if languages.all? { |l| ["ECL", "Prolog"].include?(l) } - result = disambiguate_ecl(data) - end - if languages.all? { |l| ["IDL", "Prolog"].include?(l) } - result = disambiguate_pro(data) - end - if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) } - result = disambiguate_cl(data) - end - if languages.all? { |l| ["Hack", "PHP"].include?(l) } - result = disambiguate_hack(data) - end - if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) } - result = disambiguate_sc(data) - end - if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) } - result = disambiguate_asc(data) - end - if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) } - result = disambiguate_f(data) - end - if languages.all? { |l| ["F#", "Forth", "GLSL"].include?(l) } - result = disambiguate_fs(data) - end - return result + if languages.all? { |l| ["Perl", "Prolog"].include?(l) } + result = disambiguate_pl(data) end + if languages.all? { |l| ["ECL", "Prolog"].include?(l) } + result = disambiguate_ecl(data) + end + if languages.all? { |l| ["IDL", "Prolog"].include?(l) } + result = disambiguate_pro(data) + end + if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) } + result = disambiguate_cl(data) + end + if languages.all? { |l| ["Hack", "PHP"].include?(l) } + result = disambiguate_hack(data) + end + if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) } + result = disambiguate_sc(data) + end + if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) } + result = disambiguate_asc(data) + end + if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) } + result = disambiguate_f(data) + end + if languages.all? { |l| ["F#", "Forth", "GLSL"].include?(l) } + result = disambiguate_fs(data) + end + return result end # .h extensions are ambiguous between C, C++, and Objective-C. From 034cb250999a8cfaa402ecfbc63504c67c724cbf Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 09:43:59 -0600 Subject: [PATCH 2/8] Refactor heuristics --- lib/linguist/heuristics.rb | 212 +++++++++++++++---------------------- test/test_heuristics.rb | 112 ++++++++++---------- 2 files changed, 145 insertions(+), 179 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 283b7649..8033d722 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -1,7 +1,6 @@ module Linguist # A collection of simple heuristics that can be used to better analyze languages. class Heuristics - # Public: Use heuristics to detect language of the blob. # # blob - An object that quacks like a blob. @@ -16,48 +15,105 @@ module Linguist # Returns an Array with one Language if a heuristic matched, or empty if # none matched or were inconclusive. def self.call(blob, languages) - find_by_heuristics(blob.data, languages.map(&:name)) + data = blob.data + + @heuristics.each do |heuristic| + if heuristic.matches?(languages) + language = heuristic.call(data) + return [language] if language + end + end + + [] # No heuristics matched end - # Public: Given an array of String language names, - # apply heuristics against the given data and return an array - # of matching languages, or nil. - # - # data - Array of tokens or String data to analyze. - # languages - Array of language name Strings to restrict to. - # - # Returns an array of Languages or [] - def self.find_by_heuristics(data, languages) - result = [] + @heuristics = [] - if languages.all? { |l| ["Perl", "Prolog"].include?(l) } - result = disambiguate_pl(data) + def self.create(*languages, &heuristic) + @heuristics << new(languages, &heuristic) + end + + def initialize(languages, &heuristic) + @languages = languages + @heuristic = heuristic + end + + def matches?(candidates) + candidates.all? { |l| @languages.include?(l.name) } + end + + def call(data) + @heuristic.call(data) + end + + create "Perl", "Prolog" do |data| + if data.include?("use strict") + Language["Perl"] + elsif data.include?(":-") + Language["Prolog"] end - if languages.all? { |l| ["ECL", "Prolog"].include?(l) } - result = disambiguate_ecl(data) + end + + create "ECL", "Prolog" do |data| + if data.include?(":-") + Language["Prolog"] + elsif data.include?(":=") + Language["ECL"] end - if languages.all? { |l| ["IDL", "Prolog"].include?(l) } - result = disambiguate_pro(data) + end + + create "IDL", "Prolog" do |data| + if data.include?(":-") + Language["Prolog"] + else + Language["IDL"] end - if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) } - result = disambiguate_cl(data) + end + + create "Common Lisp", "OpenCL" do |data| + if data.include?("(defun ") + Language["Common Lisp"] + elsif /\/\* |\/\/ |^\}/.match(data) + Language["OpenCL"] end - if languages.all? { |l| ["Hack", "PHP"].include?(l) } - result = disambiguate_hack(data) + end + + create "Hack", "PHP" do |data| + if data.include?("")) @@ -114,16 +140,6 @@ module Linguist matches end - def self.disambiguate_cl(data) - matches = [] - if data.include?("(defun ") - matches << Language["Common Lisp"] - elsif /\/\* |\/\/ |^\}/.match(data) - matches << Language["OpenCL"] - end - matches - end - def self.disambiguate_r(data) matches = [] matches << Language["Rebol"] if /\bRebol\b/i.match(data) @@ -131,57 +147,5 @@ module Linguist matches end - def self.disambiguate_hack(data) - matches = [] - if data.include?(" "Prolog/turing.pl", + "Perl" => "Perl/perl-test.t", + }) end # Candidate languages = ["ECL", "Prolog"] def test_ecl_prolog_by_heuristics - results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl")) - assert_equal Language["Prolog"], results.first + results = Heuristics.call(file_blob("Prolog/or-constraint.ecl"), [Language["ECL"], Language["Prolog"]]) + assert_equal [Language["Prolog"]], results end # Candidate languages = ["ECL", "Prolog"] - def test_ecl_ecl_by_heuristics - results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl")) - assert_equal Language["ECL"], results.first + def test_ecl_prolog_by_heuristics + assert_heuristics({ + "ECL" => "ECL/sample.ecl", + "Prolog" => "Prolog/or-constraint.ecl" + }) end # Candidate languages = ["IDL", "Prolog"] - def test_pro_prolog_by_heuristics - results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro")) - assert_equal Language["Prolog"], results.first - end - - # Candidate languages = ["IDL", "Prolog"] - def test_pro_idl_by_heuristics - results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro")) - assert_equal Language["IDL"], results.first + def test_pro_prolog_idl_by_heuristics + assert_heuristics({ + "Prolog" => "Prolog/logic-problem.pro", + "IDL" => "IDL/mg_acosh.pro" + }) end # Candidate languages = ["AGS Script", "AsciiDoc"] def test_asc_asciidoc_by_heuristics - results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc")) - assert_equal Language["AsciiDoc"], results.first + assert_heuristics({ + "AsciiDoc" => "AsciiDoc/list.asc", + "AGS Script" => nil + }) end # Candidate languages = ["TypeScript", "XML"] @@ -91,49 +92,50 @@ class TestHeuristcs < Test::Unit::TestCase end def test_cl_by_heuristics - languages = ["Common Lisp", "OpenCL"] - languages.each do |language| - all_fixtures(language).each do |fixture| - results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}")) - assert_equal Language[language], results.first - end - end + assert_heuristics({ + "Common Lisp" => all_fixtures("Common Lisp"), + "OpenCL" => all_fixtures("OpenCL") + }) end def test_f_by_heuristics - languages = ["FORTRAN", "Forth"] - languages.each do |language| - all_fixtures(language).each do |fixture| - results = Heuristics.disambiguate_f(fixture("#{language}/#{File.basename(fixture)}")) - assert_equal Language[language], results.first - end - end + assert_heuristics({ + "FORTRAN" => all_fixtures("FORTRAN"), + "Forth" => all_fixtures("Forth") + }) end # Candidate languages = ["Hack", "PHP"] def test_hack_by_heuristics - results = Heuristics.disambiguate_hack(fixture("Hack/funs.php")) - assert_equal Language["Hack"], results.first + assert_heuristics({ + "Hack" => "Hack/funs.php", + "PHP" => "PHP/Model.php" + }) end # Candidate languages = ["Scala", "SuperCollider"] - def test_sc_supercollider_by_heuristics - results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc")) - assert_equal Language["SuperCollider"], results.first - end - - # Candidate languages = ["Scala", "SuperCollider"] - def test_sc_scala_by_heuristics - results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc")) - assert_equal Language["Scala"], results.first + def test_sc_supercollider_scala_by_heuristics + assert_heuristics({ + "SuperCollider" => "SuperCollider/WarpPreset.sc", + "Scala" => "Scala/node11.sc" + }) end def test_fs_by_heuristics - languages = ["F#", "Forth", "GLSL"] - languages.each do |language| - all_fixtures(language).each do |fixture| - results = Heuristics.disambiguate_fs(fixture("#{language}/#{File.basename(fixture)}")) - assert_equal Language[language], results.first + assert_heuristics({ + "F#" => all_fixtures("F#"), + "Forth" => all_fixtures("Forth"), + "GLSL" => all_fixtures("GLSL") + }) + end + + def assert_heuristics(hash) + candidates = hash.keys.map { |l| Language[l] } + + hash.each do |language, blobs| + Array(blobs).each do |blob| + result = Heuristics.call(file_blob(blob), candidates) + assert_equal [Language[language]], result end end end From bc66f558b91a280edabecd7e9825042a728086d2 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 12:17:52 -0600 Subject: [PATCH 3/8] Remove inactive heuristics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can add these back when we’re ready to enable them. --- lib/linguist/heuristics.rb | 32 -------------------------------- test/test_heuristics.rb | 27 --------------------------- 2 files changed, 59 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 8033d722..d23ea7f0 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -115,37 +115,5 @@ module Linguist Language["GLSL"] end end - - # .h extensions are ambiguous between C, C++, and Objective-C. - # We want to shortcut look for Objective-C _and_ now C++ too! - # - # Returns an array of Languages or [] - def self.disambiguate_c(data) - matches = [] - if data.include?("@interface") - matches << Language["Objective-C"] - elsif data.include?("#include ") - matches << Language["C++"] - end - matches - end - - def self.disambiguate_ts(data) - matches = [] - if (data.include?("")) - matches << Language["XML"] - else - matches << Language["TypeScript"] - end - matches - end - - def self.disambiguate_r(data) - matches = [] - matches << Language["Rebol"] if /\bRebol\b/i.match(data) - matches << Language["R"] if data.include?("<-") - matches - end - end end diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index ce50daa4..36c8ece1 100644 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -20,21 +20,6 @@ class TestHeuristcs < Test::Unit::TestCase Dir.glob("#{samples_path}/#{language_name}/#{file}") end - # Candidate languages = ["C++", "Objective-C"] - def test_obj_c_by_heuristics - # Only calling out '.h' filenames as these are the ones causing issues - all_fixtures("Objective-C", "*.h").each do |fixture| - results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}")) - assert_equal Language["Objective-C"], results.first - end - end - - # Candidate languages = ["C++", "Objective-C"] - def test_cpp_by_heuristics - results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp")) - assert_equal Language["C++"], results.first - end - def test_detect_still_works_if_nothing_matches blob = Linguist::FileBlob.new(File.join(samples_path, "Objective-C/hello.m")) match = Language.detect(blob) @@ -79,18 +64,6 @@ class TestHeuristcs < Test::Unit::TestCase }) end - # Candidate languages = ["TypeScript", "XML"] - def test_ts_typescript_by_heuristics - results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts")) - assert_equal Language["TypeScript"], results.first - end - - # Candidate languages = ["TypeScript", "XML"] - def test_ts_xml_by_heuristics - results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml")) - assert_equal Language["XML"], results.first - end - def test_cl_by_heuristics assert_heuristics({ "Common Lisp" => all_fixtures("Common Lisp"), From 26d789612be7b42a01c20b7f496e2fbbff5f1e8c Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 14:40:02 -0600 Subject: [PATCH 4/8] docs --- lib/linguist/heuristics.rb | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index d23ea7f0..4abd654b 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -27,21 +27,40 @@ module Linguist [] # No heuristics matched end - @heuristics = [] - + # Internal: Define a new heuristic. + # + # languages - String names of languages to disambiguate. + # heuristic - Block which takes data as an argument and returns a Language or nil. + # + # Examples + # + # create "Perl", "Prolog" do |data| + # if data.include?("use strict") + # Language["Perl"] + # elsif data.include?(":-") + # Language["Prolog"] + # end + # end + # def self.create(*languages, &heuristic) @heuristics << new(languages, &heuristic) end + # Internal: Array of defined heuristics + @heuristics = [] + + # Internal def initialize(languages, &heuristic) @languages = languages @heuristic = heuristic end + # Internal: Check if this heuristic matches the candidate languages. def matches?(candidates) candidates.all? { |l| @languages.include?(l.name) } end + # Internal: Perform the heuristic def call(data) @heuristic.call(data) end From b8685103d0226a2f01bc0b56a93740a46a5838bd Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 14:41:52 -0600 Subject: [PATCH 5/8] Rename .create to .disambiguate --- lib/linguist/heuristics.rb | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 4abd654b..7db25742 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -34,7 +34,7 @@ module Linguist # # Examples # - # create "Perl", "Prolog" do |data| + # disambiguate "Perl", "Prolog" do |data| # if data.include?("use strict") # Language["Perl"] # elsif data.include?(":-") @@ -42,7 +42,7 @@ module Linguist # end # end # - def self.create(*languages, &heuristic) + def self.disambiguate(*languages, &heuristic) @heuristics << new(languages, &heuristic) end @@ -65,7 +65,7 @@ module Linguist @heuristic.call(data) end - create "Perl", "Prolog" do |data| + disambiguate "Perl", "Prolog" do |data| if data.include?("use strict") Language["Perl"] elsif data.include?(":-") @@ -73,7 +73,7 @@ module Linguist end end - create "ECL", "Prolog" do |data| + disambiguate "ECL", "Prolog" do |data| if data.include?(":-") Language["Prolog"] elsif data.include?(":=") @@ -81,7 +81,7 @@ module Linguist end end - create "IDL", "Prolog" do |data| + disambiguate "IDL", "Prolog" do |data| if data.include?(":-") Language["Prolog"] else @@ -89,7 +89,7 @@ module Linguist end end - create "Common Lisp", "OpenCL" do |data| + disambiguate "Common Lisp", "OpenCL" do |data| if data.include?("(defun ") Language["Common Lisp"] elsif /\/\* |\/\/ |^\}/.match(data) @@ -97,7 +97,7 @@ module Linguist end end - create "Hack", "PHP" do |data| + disambiguate "Hack", "PHP" do |data| if data.include?(" Date: Fri, 28 Nov 2014 16:55:00 -0600 Subject: [PATCH 6/8] Allow disambiguate to return an Array --- lib/linguist/heuristics.rb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 7db25742..26c14bf6 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -18,9 +18,8 @@ module Linguist data = blob.data @heuristics.each do |heuristic| - if heuristic.matches?(languages) - language = heuristic.call(data) - return [language] if language + if heuristic.matches?(languages) && result = heuristic.call(data) + return Array(result) end end From c038b51941256ade29e4f35ac627d5d6fd3f2d00 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 17:03:01 -0600 Subject: [PATCH 7/8] Clean up heuristic logic --- lib/linguist/heuristics.rb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 26c14bf6..954a7d16 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -18,9 +18,7 @@ module Linguist data = blob.data @heuristics.each do |heuristic| - if heuristic.matches?(languages) && result = heuristic.call(data) - return Array(result) - end + return Array(heuristic.call(data)) if heuristic.matches?(languages) end [] # No heuristics matched From 770a1d4553a57965388a9db3671b86782ef2c212 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 17:07:15 -0600 Subject: [PATCH 8/8] update docs --- lib/linguist/heuristics.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 954a7d16..0b09bd1b 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -12,8 +12,7 @@ module Linguist # Language["Ruby"], Language["Python"] # ]) # - # Returns an Array with one Language if a heuristic matched, or empty if - # none matched or were inconclusive. + # Returns an Array of languages, or empty if none matched or were inconclusive. def self.call(blob, languages) data = blob.data