From dd8eaf2893030d808fdf10553f79ac75ae5f596c Mon Sep 17 00:00:00 2001 From: Lars Brinkhoff Date: Sun, 30 Aug 2015 12:32:33 +0200 Subject: [PATCH] Alphabetise heuristics. --- lib/linguist/heuristics.rb | 372 ++++++++++++++++++------------------- test/test_heuristics.rb | 208 ++++++++++----------- 2 files changed, 290 insertions(+), 290 deletions(-) diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb index 4d468bf0..dd1f8ee6 100644 --- a/lib/linguist/heuristics.rb +++ b/lib/linguist/heuristics.rb @@ -67,6 +67,16 @@ module Linguist # Common heuristics ObjectiveCRegex = /^[ \t]*@(interface|class|protocol|property|end|synchronized|selector|implementation)\b/ + disambiguate ".asc" do |data| + if /^(----[- ]BEGIN|ssh-(rsa|dss)) /.match(data) + Language["Public Key"] + elsif /^[=-]+(\s|\n)|{{[A-Za-z]/.match(data) + Language["AsciiDoc"] + elsif /^(\/\/.+|((import|export)\s+)?(function|int|float|char)\s+((room|repeatedly|on|game)_)?([A-Za-z]+[A-Za-z_0-9]+)\s*[;\(])/.match(data) + Language["AGS Script"] + end + end + disambiguate ".bb" do |data| if /^\s*; /.match(data) || data.include?("End Function") Language["BlitzBasic"] @@ -75,67 +85,9 @@ module Linguist end end - disambiguate ".cs" do |data| - if /![\w\s]+methodsFor: /.match(data) - Language["Smalltalk"] - elsif /^\s*namespace\s*[\w\.]+\s*{/.match(data) || /^\s*\/\//.match(data) - Language["C#"] - end - end - - disambiguate ".h" do |data| - if ObjectiveCRegex.match(data) - Language["Objective-C"] - elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) || - /^\s*template\s* ")) - Language["GAP"] - # Heads up - we don't usually write heuristics like this (with no regex match) - else - Language["Scilab"] + disambiguate ".ch" do |data| + if /^\s*#\s*(if|ifdef|ifndef|define|command|xcommand|translate|xtranslate|include|pragma|undef)\b/i.match(data) + Language["xBase"] end end @@ -149,29 +101,29 @@ module Linguist end end - disambiguate ".php" do |data| - if data.include?("/.match(data) || + /^\s*template\s*)/.match(data) + Language["Lex"] + elsif /^\.[a-z][a-z](\s|$)/i.match(data) + Language["Groff"] + elsif /^\((de|class|rel|code|data|must)\s/.match(data) + Language["PicoLisp"] + end + end + + disambiguate ".ls" do |data| + if /^\s*package\s*[\w\.\/\*\s]*\s*{/.match(data) + Language["LoomScript"] + else + Language["LiveScript"] + end + end + + disambiguate ".lsp", ".lisp" do |data| + if /^\s*\((defun|in-package|defpackage) /i.match(data) + Language["Common Lisp"] + elsif /^\s*\(define /.match(data) + Language["NewLisp"] + end + end + disambiguate ".m" do |data| if ObjectiveCRegex.match(data) Language["Objective-C"] @@ -213,41 +216,113 @@ module Linguist end end - disambiguate ".gs" do |data| - Language["Gosu"] if /^uses java\./.match(data) - end - - disambiguate ".ls" do |data| - if /^\s*package\s*[\w\.\/\*\s]*\s*{/.match(data) - Language["LoomScript"] - else - Language["LiveScript"] + disambiguate ".ml" do |data| + if /(^\s*module)|let rec |match\s+(\S+\s)+with/.match(data) + Language["OCaml"] + elsif /=> |case\s+(\S+\s)+of/.match(data) + Language["Standard ML"] end end - disambiguate ".lsp", ".lisp" do |data| - if /^\s*\((defun|in-package|defpackage) /i.match(data) - Language["Common Lisp"] - elsif /^\s*\(define /.match(data) + disambiguate ".mod" do |data| + if data.include?(' |case\s+(\S+\s)+of/.match(data) - Language["Standard ML"] - end - end - - disambiguate ".mod" do |data| - if data.include?(' ")) + Language["GAP"] + # Heads up - we don't usually write heuristics like this (with no regex match) else - Language["NewLisp"] - end - end - - disambiguate ".rs" do |data| - if /^(use |fn |mod |pub |macro_rules|impl|#!?\[)/.match(data) - Language["Rust"] - elsif /#include|#pragma\s+(rs|version)|__attribute__/.match(data) - Language["RenderScript"] - end - end - - disambiguate ".l" do |data| - if /\(def(un|macro)\s/.match(data) - Language["Common Lisp"] - elsif /^(%[%{}]xs|<.*>)/.match(data) - Language["Lex"] - elsif /^\.[a-z][a-z](\s|$)/i.match(data) - Language["Groff"] - elsif /^\((de|class|rel|code|data|must)\s/.match(data) - Language["PicoLisp"] - end - end - - disambiguate ".n" do |data| - if /^[.']/.match(data) - Language["Groff"] - elsif /^(module|namespace|using)\s/.match(data) - Language["Nemerle"] - end - end - - disambiguate ".ms" do |data| - if /^[.'][a-z][a-z](\s|$)/i.match(data) - Language["Groff"] - elsif /((^|\s)move?[. ])|\.(include|globa?l)\s/.match(data) - Language["GAS"] - end - end - - disambiguate ".ch" do |data| - if /^\s*#\s*(if|ifdef|ifndef|define|command|xcommand|translate|xtranslate|include|pragma|undef)\b/i.match(data) - Language["xBase"] - end - end - - disambiguate ".r" do |data| - if /\bRebol\b/i.match(data) - Language["Rebol"] - elsif data.include?("<-") - Language["R"] + Language["Scilab"] end end end diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index cca48675..524a522a 100644 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -33,6 +33,101 @@ class TestHeuristcs < Minitest::Test end end + def test_detect_still_works_if_nothing_matches + blob = Linguist::FileBlob.new(File.join(samples_path, "Objective-C/hello.m")) + match = Language.detect(blob) + assert_equal Language["Objective-C"], match + end + + # Candidate languages = ["AGS Script", "AsciiDoc", "Public Key"] + def test_asc_by_heuristics + assert_heuristics({ + "AsciiDoc" => all_fixtures("AsciiDoc", "*.asc"), + "AGS Script" => all_fixtures("AGS Script", "*.asc"), + "Public Key" => all_fixtures("Public Key", "*.asc") + }) + end + + def test_bb_by_heuristics + assert_heuristics({ + "BitBake" => all_fixtures("BitBake", "*.bb"), + "BlitzBasic" => all_fixtures("BlitzBasic", "*.bb") + }) + end + + def test_ch_by_heuristics + assert_heuristics({ + "xBase" => all_fixtures("xBase", ".ch") + }) + end + + def test_cl_by_heuristics + assert_heuristics({ + "Common Lisp" => all_fixtures("Common Lisp", "*.cl"), + "OpenCL" => all_fixtures("OpenCL", "*.cl") + }) + end + + def test_cs_by_heuristics + assert_heuristics({ + "C#" => all_fixtures("C#", "*.cs"), + "Smalltalk" => all_fixtures("Smalltalk", "*.cs") + }) + end + + # Candidate languages = ["ECL", "ECLiPSe"] + def test_ecl_by_heuristics + assert_heuristics({ + "ECL" => all_fixtures("ECL", "*.ecl"), + "ECLiPSe" => all_fixtures("ECLiPSe", "*.ecl") + }) + end + + def test_f_by_heuristics + assert_heuristics({ + "FORTRAN" => all_fixtures("FORTRAN", "*.f") + all_fixtures("FORTRAN", "*.for"), + "Forth" => all_fixtures("Forth", "*.f") + all_fixtures("Forth", "*.for") + }) + end + + def test_fr_by_heuristics + assert_heuristics({ + "Frege" => all_fixtures("Frege", "*.fr"), + "Forth" => all_fixtures("Forth", "*.fr"), + "Text" => all_fixtures("Text", "*.fr") + }) + end + + def test_fs_by_heuristics + assert_heuristics({ + "F#" => all_fixtures("F#", "*.fs"), + "Forth" => all_fixtures("Forth", "*.fs"), + "GLSL" => all_fixtures("GLSL", "*.fs") + }) + end + + # Candidate languages = ["Hack", "PHP"] + def test_hack_by_heuristics + assert_heuristics({ + "Hack" => all_fixtures("Hack", "*.php"), + "PHP" => all_fixtures("PHP", "*.php") + }) + end + + def test_ls_by_heuristics + assert_heuristics({ + "LiveScript" => all_fixtures("LiveScript", "*.ls"), + "LoomScript" => all_fixtures("LoomScript", "*.ls") + }) + end + + def test_lsp_by_heuristics + assert_heuristics({ + "Common Lisp" => all_fixtures("Common Lisp", "*.lsp") + all_fixtures("Common Lisp", "*.lisp"), + "NewLisp" => all_fixtures("NewLisp", "*.lsp") + all_fixtures("NewLisp", "*.lisp") + }) + end + # Candidate languages = ["C++", "Objective-C"] def test_obj_c_by_heuristics # Only calling out '.h' filenames as these are the ones causing issues @@ -43,12 +138,6 @@ class TestHeuristcs < Minitest::Test }) end - def test_detect_still_works_if_nothing_matches - blob = Linguist::FileBlob.new(File.join(samples_path, "Objective-C/hello.m")) - match = Language.detect(blob) - assert_equal Language["Objective-C"], match - end - # Candidate languages = ["Perl", "Perl6", "Prolog"] def test_pl_prolog_perl_by_heuristics assert_heuristics({ @@ -66,24 +155,6 @@ class TestHeuristcs < Minitest::Test }) end - # Candidate languages = ["Perl", "Perl6"] - def test_t_perl_by_heuristics - assert_heuristics({ - "Perl" => all_fixtures("Perl", "*.t"), - "Perl6" => ["Perl6/01-dash-uppercase-i.t", "Perl6/01-parse.t", "Perl6/advent2009-day16.t", - "Perl6/basic-open.t", "Perl6/calendar.t", "Perl6/for.t", "Perl6/hash.t", - "Perl6/listquote-whitespace.t"] - }) - end - - # Candidate languages = ["ECL", "ECLiPSe"] - def test_ecl_by_heuristics - assert_heuristics({ - "ECL" => all_fixtures("ECL", "*.ecl"), - "ECLiPSe" => all_fixtures("ECLiPSe", "*.ecl") - }) - end - # Candidate languages = ["IDL", "Prolog", "QMake", "INI"] def test_pro_by_heuristics assert_heuristics({ @@ -94,34 +165,10 @@ class TestHeuristcs < Minitest::Test }) end - # Candidate languages = ["AGS Script", "AsciiDoc", "Public Key"] - def test_asc_by_heuristics + def test_r_by_heuristics assert_heuristics({ - "AsciiDoc" => all_fixtures("AsciiDoc", "*.asc"), - "AGS Script" => all_fixtures("AGS Script", "*.asc"), - "Public Key" => all_fixtures("Public Key", "*.asc") - }) - end - - def test_cl_by_heuristics - assert_heuristics({ - "Common Lisp" => all_fixtures("Common Lisp", "*.cl"), - "OpenCL" => all_fixtures("OpenCL", "*.cl") - }) - end - - def test_f_by_heuristics - assert_heuristics({ - "FORTRAN" => all_fixtures("FORTRAN", "*.f") + all_fixtures("FORTRAN", "*.for"), - "Forth" => all_fixtures("Forth", "*.f") + all_fixtures("Forth", "*.for") - }) - end - - # Candidate languages = ["Hack", "PHP"] - def test_hack_by_heuristics - assert_heuristics({ - "Hack" => all_fixtures("Hack", "*.php"), - "PHP" => all_fixtures("PHP", "*.php") + "R" => all_fixtures("R", "*.r") + all_fixtures("R", "*.R"), + "Rebol" => all_fixtures("Rebol", "*.r") }) end @@ -133,47 +180,13 @@ class TestHeuristcs < Minitest::Test }) end - def test_fs_by_heuristics + # Candidate languages = ["Perl", "Perl6"] + def test_t_perl_by_heuristics assert_heuristics({ - "F#" => all_fixtures("F#", "*.fs"), - "Forth" => all_fixtures("Forth", "*.fs"), - "GLSL" => all_fixtures("GLSL", "*.fs") - }) - end - - def test_fr_by_heuristics - assert_heuristics({ - "Frege" => all_fixtures("Frege", "*.fr"), - "Forth" => all_fixtures("Forth", "*.fr"), - "Text" => all_fixtures("Text", "*.fr") - }) - end - - def test_bb_by_heuristics - assert_heuristics({ - "BitBake" => all_fixtures("BitBake", "*.bb"), - "BlitzBasic" => all_fixtures("BlitzBasic", "*.bb") - }) - end - - def test_lsp_by_heuristics - assert_heuristics({ - "Common Lisp" => all_fixtures("Common Lisp", "*.lsp") + all_fixtures("Common Lisp", "*.lisp"), - "NewLisp" => all_fixtures("NewLisp", "*.lsp") + all_fixtures("NewLisp", "*.lisp") - }) - end - - def test_cs_by_heuristics - assert_heuristics({ - "C#" => all_fixtures("C#", "*.cs"), - "Smalltalk" => all_fixtures("Smalltalk", "*.cs") - }) - end - - def test_ls_by_heuristics - assert_heuristics({ - "LiveScript" => all_fixtures("LiveScript", "*.ls"), - "LoomScript" => all_fixtures("LoomScript", "*.ls") + "Perl" => all_fixtures("Perl", "*.t"), + "Perl6" => ["Perl6/01-dash-uppercase-i.t", "Perl6/01-parse.t", "Perl6/advent2009-day16.t", + "Perl6/basic-open.t", "Perl6/calendar.t", "Perl6/for.t", "Perl6/hash.t", + "Perl6/listquote-whitespace.t"] }) end @@ -183,17 +196,4 @@ class TestHeuristcs < Minitest::Test "XML" => all_fixtures("XML", "*.ts") }) end - - def test_ch_by_heuristics - assert_heuristics({ - "xBase" => all_fixtures("xBase", ".ch") - }) - end - - def test_r_by_heuristics - assert_heuristics({ - "R" => all_fixtures("R", "*.r") + all_fixtures("R", "*.R"), - "Rebol" => all_fixtures("Rebol", "*.r") - }) - end end