Merge branch 'refactor-heuristics' into 1036-local

* refactor-heuristics: (43 commits)
  update docs
  Clean up heuristic logic
  Allow disambiguate to return an Array
  Rename .create to .disambiguate
  docs
  Remove inactive heuristics
  Refactor heuristics
  Not going back
  docs
  Move call method into existing Classifier class
  Try strategies until one language is returned
  Remove unneded empty blob check
  Add F# and GLSL samples.  Add Forth and GLSL extension .fs. Add heuristic to disambiguate between F#, Forth, and GLSL.
  byebug requires ruby 2.0
  Remove test for removed extension
  Fix typo in test
  add rake interpreter
  add python3 interpreter
  Remove old wrong_shebang.rb sample
  Add byebug
  ...

Conflicts:
	lib/linguist/heuristics.rb
	test/test_heuristics.rb
This commit is contained in:
Brandon Keepers
2014-11-28 17:58:00 -06:00
32 changed files with 2199 additions and 294 deletions

View File

@@ -1,162 +1,143 @@
module Linguist
# A collection of simple heuristics that can be used to better analyze languages.
class Heuristics
ACTIVE = true
# Public: Given an array of String language names,
# apply heuristics against the given data and return an array
# of matching languages, or nil.
# Public: Use heuristics to detect language of the blob.
#
# data - Array of tokens or String data to analyze.
# languages - Array of language name Strings to restrict to.
# blob - An object that quacks like a blob.
# possible_languages - Array of Language objects
#
# Returns an array of Languages or []
def self.find_by_heuristics(data, languages)
if active?
result = []
# Examples
#
# Heuristics.call(FileBlob.new("path/to/file"), [
# Language["Ruby"], Language["Python"]
# ])
#
# Returns an Array of languages, or empty if none matched or were inconclusive.
def self.call(blob, languages)
data = blob.data
if languages.all? { |l| ["Objective-C", "C++", "C"].include?(l) }
result = disambiguate_c(data)
end
if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
result = disambiguate_pl(data)
end
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
result = disambiguate_ecl(data)
end
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
result = disambiguate_pro(data)
end
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
result = disambiguate_cl(data)
end
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
result = disambiguate_hack(data)
end
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
result = disambiguate_sc(data)
end
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
result = disambiguate_asc(data)
end
if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) }
result = disambiguate_f(data)
end
return result
@heuristics.each do |heuristic|
return Array(heuristic.call(data)) if heuristic.matches?(languages)
end
[] # No heuristics matched
end
# .h extensions are ambiguous between C, C++, and Objective-C.
# We want to shortcut look for Objective-C _and_ now C++ too!
# Internal: Define a new heuristic.
#
# Returns an array of Languages or []
def self.disambiguate_c(data)
matches = []
# languages - String names of languages to disambiguate.
# heuristic - Block which takes data as an argument and returns a Language or nil.
#
# Examples
#
# disambiguate "Perl", "Prolog" do |data|
# if data.include?("use strict")
# Language["Perl"]
# elsif data.include?(":-")
# Language["Prolog"]
# end
# end
#
def self.disambiguate(*languages, &heuristic)
@heuristics << new(languages, &heuristic)
end
# Internal: Array of defined heuristics
@heuristics = []
# Internal
def initialize(languages, &heuristic)
@languages = languages
@heuristic = heuristic
end
# Internal: Check if this heuristic matches the candidate languages.
def matches?(candidates)
candidates.all? { |l| @languages.include?(l.name) }
end
# Internal: Perform the heuristic
def call(data)
@heuristic.call(data)
end
disambiguate "Objective-C", "C++", "C" do |data|
if (/@(interface|class|protocol|property|end|synchronised|selector|implementation)\b/.match(data))
matches << Language["Objective-C"]
Language["Objective-C"]
elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
/^\s*template\s*</.match(data) || /^[^@]class\s+\w+/.match(data) || /^[^@](private|public|protected):$/.match(data) || /std::.+$/.match(data))
matches << Language["C++"]
/^\s*template\s*</.match(data) || /^[^@]class\s+\w+/.match(data) || /^[^@](private|public|protected):$/.match(data) || /std::.+$/.match(data))
Language["C++"]
end
matches
end
def self.disambiguate_pl(data)
matches = []
disambiguate "Perl", "Prolog" do |data|
if data.include?("use strict")
matches << Language["Perl"]
Language["Perl"]
elsif data.include?(":-")
matches << Language["Prolog"]
Language["Prolog"]
end
matches
end
def self.disambiguate_ecl(data)
matches = []
disambiguate "ECL", "Prolog" do |data|
if data.include?(":-")
matches << Language["Prolog"]
Language["Prolog"]
elsif data.include?(":=")
matches << Language["ECL"]
Language["ECL"]
end
matches
end
def self.disambiguate_pro(data)
matches = []
if (data.include?(":-"))
matches << Language["Prolog"]
disambiguate "IDL", "Prolog" do |data|
if data.include?(":-")
Language["Prolog"]
else
matches << Language["IDL"]
Language["IDL"]
end
matches
end
def self.disambiguate_ts(data)
matches = []
if (data.include?("</translation>"))
matches << Language["XML"]
else
matches << Language["TypeScript"]
end
matches
end
def self.disambiguate_cl(data)
matches = []
disambiguate "Common Lisp", "OpenCL" do |data|
if data.include?("(defun ")
matches << Language["Common Lisp"]
Language["Common Lisp"]
elsif /\/\* |\/\/ |^\}/.match(data)
matches << Language["OpenCL"]
Language["OpenCL"]
end
matches
end
def self.disambiguate_r(data)
matches = []
matches << Language["Rebol"] if /\bRebol\b/i.match(data)
matches << Language["R"] if data.include?("<-")
matches
end
def self.disambiguate_hack(data)
matches = []
disambiguate "Hack", "PHP" do |data|
if data.include?("<?hh")
matches << Language["Hack"]
Language["Hack"]
elsif /<?[^h]/.match(data)
matches << Language["PHP"]
Language["PHP"]
end
matches
end
def self.disambiguate_sc(data)
matches = []
if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
matches << Language["SuperCollider"]
disambiguate "Scala", "SuperCollider" do |data|
if /\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data)
Language["SuperCollider"]
elsif /^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)
Language["Scala"]
end
if (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data))
matches << Language["Scala"]
end
matches
end
def self.disambiguate_asc(data)
matches = []
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
matches
disambiguate "AsciiDoc", "AGS Script" do |data|
Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
end
def self.disambiguate_f(data)
matches = []
disambiguate "FORTRAN", "Forth" do |data|
if /^: /.match(data)
matches << Language["Forth"]
Language["Forth"]
elsif /^([c*][^a-z]| subroutine\s)/i.match(data)
matches << Language["FORTRAN"]
Language["FORTRAN"]
end
matches
end
def self.active?
!!ACTIVE
disambiguate "F#", "Forth", "GLSL" do |data|
if /^(: |new-device)/.match(data)
Language["Forth"]
elsif /^(#light|import|let|module|namespace|open|type)/.match(data)
Language["F#"]
elsif /^(#include|#pragma|precision|uniform|varying|void)/.match(data)
Language["GLSL"]
end
end
end
end