Merge branch 'refactor-heuristics' into 1036-local

* refactor-heuristics: (43 commits)
  update docs
  Clean up heuristic logic
  Allow disambiguate to return an Array
  Rename .create to .disambiguate
  docs
  Remove inactive heuristics
  Refactor heuristics
  Not going back
  docs
  Move call method into existing Classifier class
  Try strategies until one language is returned
  Remove unneded empty blob check
  Add F# and GLSL samples.  Add Forth and GLSL extension .fs. Add heuristic to disambiguate between F#, Forth, and GLSL.
  byebug requires ruby 2.0
  Remove test for removed extension
  Fix typo in test
  add rake interpreter
  add python3 interpreter
  Remove old wrong_shebang.rb sample
  Add byebug
  ...

Conflicts:
	lib/linguist/heuristics.rb
	test/test_heuristics.rb
This commit is contained in:
Brandon Keepers
2014-11-28 17:58:00 -06:00
32 changed files with 2199 additions and 294 deletions

View File

@@ -12,7 +12,7 @@ This can usually be solved either by adding a new filename or file name extensio
Assuming your code is being detected as the right language (see above), in most cases this is due to a bug in the language grammar rather than a bug in Linguist. [`grammars.yml`][grammars] lists all the grammars we use for syntax highlighting on github.com. Find the one corresponding to your code's programming language and submit a bug report upstream.
You can also try to fix the bug yourself and submit a Pull Request. [This piece from TextMate's documentation](http://manual.macromates.com/en/language_grammars) offers a good introduction on how to work with TextMate-compatible grammars.
You can also try to fix the bug yourself and submit a Pull Request. [This piece from TextMate's documentation](http://manual.macromates.com/en/language_grammars) offers a good introduction on how to work with TextMate-compatible grammars. You can test grammars using [Lightshow](https://lightshow.githubapp.com).
Once the bug has been fixed upstream, please let us know and we'll pick it up for GitHub.

View File

@@ -2,3 +2,4 @@ source 'https://rubygems.org'
gemspec :name => "github-linguist"
gemspec :name => "github-linguist-grammars"
gem 'test-unit', require: false if RUBY_VERSION >= '2.2'
gem 'byebug' if RUBY_VERSION >= '2.0'

View File

@@ -5,8 +5,6 @@ http://svn.textmate.org/trunk/Review/Bundles/BlitzMax.tmbundle:
- source.blitzmax
http://svn.textmate.org/trunk/Review/Bundles/Cython.tmbundle:
- source.cython
http://svn.textmate.org/trunk/Review/Bundles/F%20Sharp.tmbundle:
- source.fsharp
http://svn.textmate.org/trunk/Review/Bundles/Forth.tmbundle:
- source.forth
http://svn.textmate.org/trunk/Review/Bundles/Parrot.tmbundle:
@@ -135,6 +133,8 @@ https://github.com/euler0/sublime-glsl/raw/master/GLSL.tmLanguage:
- source.glsl
https://github.com/fancy-lang/fancy-tmbundle:
- source.fancy
https://github.com/fsharp/fsharpbinding:
- source.fsharp
https://github.com/gingerbeardman/monkey.tmbundle:
- source.monkey
https://github.com/guillermooo/dart-sublime-bundle/raw/master/Dart.tmLanguage:

View File

@@ -3,6 +3,25 @@ require 'linguist/tokenizer'
module Linguist
# Language bayesian classifier.
class Classifier
# Public: Use the classifier to detect language of the blob.
#
# blob - An object that quacks like a blob.
# possible_languages - Array of Language objects
#
# Examples
#
# Classifier.call(FileBlob.new("path/to/file"), [
# Language["Ruby"], Language["Python"]
# ])
#
# Returns an Array of Language objects, most probable first.
def self.call(blob, possible_languages)
language_names = possible_languages.map(&:name)
classify(Samples.cache, blob.data, language_names).map do |name, _|
Language[name] # Return the actual Language objects
end
end
# Public: Train classifier that data is a certain language.
#
# db - Hash classifier database object

View File

@@ -57,14 +57,20 @@ module Linguist
#
# Returns a String.
def extension
# File.extname returns nil if the filename is an extension.
extension = File.extname(name)
basename = File.basename(name)
# Checks if the filename is an extension.
if extension.empty? && basename[0] == "."
basename
else
extension
extensions.last || ""
end
# Public: Return an array of the file extensions
#
# >> Linguist::FileBlob.new("app/views/things/index.html.erb").extensions
# => [".html.erb", ".erb"]
#
# Returns an Array
def extensions
basename, *segments = File.basename(name).split(".")
segments.map.with_index do |segment, index|
"." + segments[index..-1].join(".")
end
end
end

View File

@@ -1,162 +1,143 @@
module Linguist
# A collection of simple heuristics that can be used to better analyze languages.
class Heuristics
ACTIVE = true
# Public: Given an array of String language names,
# apply heuristics against the given data and return an array
# of matching languages, or nil.
# Public: Use heuristics to detect language of the blob.
#
# data - Array of tokens or String data to analyze.
# languages - Array of language name Strings to restrict to.
# blob - An object that quacks like a blob.
# possible_languages - Array of Language objects
#
# Returns an array of Languages or []
def self.find_by_heuristics(data, languages)
if active?
result = []
# Examples
#
# Heuristics.call(FileBlob.new("path/to/file"), [
# Language["Ruby"], Language["Python"]
# ])
#
# Returns an Array of languages, or empty if none matched or were inconclusive.
def self.call(blob, languages)
data = blob.data
if languages.all? { |l| ["Objective-C", "C++", "C"].include?(l) }
result = disambiguate_c(data)
end
if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
result = disambiguate_pl(data)
end
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
result = disambiguate_ecl(data)
end
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
result = disambiguate_pro(data)
end
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
result = disambiguate_cl(data)
end
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
result = disambiguate_hack(data)
end
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
result = disambiguate_sc(data)
end
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
result = disambiguate_asc(data)
end
if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) }
result = disambiguate_f(data)
end
return result
@heuristics.each do |heuristic|
return Array(heuristic.call(data)) if heuristic.matches?(languages)
end
[] # No heuristics matched
end
# .h extensions are ambiguous between C, C++, and Objective-C.
# We want to shortcut look for Objective-C _and_ now C++ too!
# Internal: Define a new heuristic.
#
# Returns an array of Languages or []
def self.disambiguate_c(data)
matches = []
# languages - String names of languages to disambiguate.
# heuristic - Block which takes data as an argument and returns a Language or nil.
#
# Examples
#
# disambiguate "Perl", "Prolog" do |data|
# if data.include?("use strict")
# Language["Perl"]
# elsif data.include?(":-")
# Language["Prolog"]
# end
# end
#
def self.disambiguate(*languages, &heuristic)
@heuristics << new(languages, &heuristic)
end
# Internal: Array of defined heuristics
@heuristics = []
# Internal
def initialize(languages, &heuristic)
@languages = languages
@heuristic = heuristic
end
# Internal: Check if this heuristic matches the candidate languages.
def matches?(candidates)
candidates.all? { |l| @languages.include?(l.name) }
end
# Internal: Perform the heuristic
def call(data)
@heuristic.call(data)
end
disambiguate "Objective-C", "C++", "C" do |data|
if (/@(interface|class|protocol|property|end|synchronised|selector|implementation)\b/.match(data))
matches << Language["Objective-C"]
Language["Objective-C"]
elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
/^\s*template\s*</.match(data) || /^[^@]class\s+\w+/.match(data) || /^[^@](private|public|protected):$/.match(data) || /std::.+$/.match(data))
matches << Language["C++"]
/^\s*template\s*</.match(data) || /^[^@]class\s+\w+/.match(data) || /^[^@](private|public|protected):$/.match(data) || /std::.+$/.match(data))
Language["C++"]
end
matches
end
def self.disambiguate_pl(data)
matches = []
disambiguate "Perl", "Prolog" do |data|
if data.include?("use strict")
matches << Language["Perl"]
Language["Perl"]
elsif data.include?(":-")
matches << Language["Prolog"]
Language["Prolog"]
end
matches
end
def self.disambiguate_ecl(data)
matches = []
disambiguate "ECL", "Prolog" do |data|
if data.include?(":-")
matches << Language["Prolog"]
Language["Prolog"]
elsif data.include?(":=")
matches << Language["ECL"]
Language["ECL"]
end
matches
end
def self.disambiguate_pro(data)
matches = []
if (data.include?(":-"))
matches << Language["Prolog"]
disambiguate "IDL", "Prolog" do |data|
if data.include?(":-")
Language["Prolog"]
else
matches << Language["IDL"]
Language["IDL"]
end
matches
end
def self.disambiguate_ts(data)
matches = []
if (data.include?("</translation>"))
matches << Language["XML"]
else
matches << Language["TypeScript"]
end
matches
end
def self.disambiguate_cl(data)
matches = []
disambiguate "Common Lisp", "OpenCL" do |data|
if data.include?("(defun ")
matches << Language["Common Lisp"]
Language["Common Lisp"]
elsif /\/\* |\/\/ |^\}/.match(data)
matches << Language["OpenCL"]
Language["OpenCL"]
end
matches
end
def self.disambiguate_r(data)
matches = []
matches << Language["Rebol"] if /\bRebol\b/i.match(data)
matches << Language["R"] if data.include?("<-")
matches
end
def self.disambiguate_hack(data)
matches = []
disambiguate "Hack", "PHP" do |data|
if data.include?("<?hh")
matches << Language["Hack"]
Language["Hack"]
elsif /<?[^h]/.match(data)
matches << Language["PHP"]
Language["PHP"]
end
matches
end
def self.disambiguate_sc(data)
matches = []
if (/\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data))
matches << Language["SuperCollider"]
disambiguate "Scala", "SuperCollider" do |data|
if /\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data)
Language["SuperCollider"]
elsif /^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)
Language["Scala"]
end
if (/^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data))
matches << Language["Scala"]
end
matches
end
def self.disambiguate_asc(data)
matches = []
matches << Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
matches
disambiguate "AsciiDoc", "AGS Script" do |data|
Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
end
def self.disambiguate_f(data)
matches = []
disambiguate "FORTRAN", "Forth" do |data|
if /^: /.match(data)
matches << Language["Forth"]
Language["Forth"]
elsif /^([c*][^a-z]| subroutine\s)/i.match(data)
matches << Language["FORTRAN"]
Language["FORTRAN"]
end
matches
end
def self.active?
!!ACTIVE
disambiguate "F#", "Forth", "GLSL" do |data|
if /^(: |new-device)/.match(data)
Language["Forth"]
elsif /^(#light|import|let|module|namespace|open|type)/.match(data)
Language["F#"]
elsif /^(#include|#pragma|precision|uniform|varying|void)/.match(data)
Language["GLSL"]
end
end
end
end

View File

@@ -10,6 +10,8 @@ require 'linguist/heuristics'
require 'linguist/samples'
require 'linguist/file_blob'
require 'linguist/blob_helper'
require 'linguist/strategy/filename'
require 'linguist/strategy/shebang'
module Linguist
# Language names that are recognizable by GitHub. Defined languages
@@ -91,6 +93,13 @@ module Linguist
language
end
STRATEGIES = [
Linguist::Strategy::Filename,
Linguist::Strategy::Shebang,
Linguist::Heuristics,
Linguist::Classifier
]
# Public: Detects the Language of the blob.
#
# blob - an object that includes the Linguist `BlobHelper` interface;
@@ -98,49 +107,22 @@ module Linguist
#
# Returns Language or nil.
def self.detect(blob)
name = blob.name.to_s
# Bail early if the blob is binary or empty.
return nil if blob.likely_binary? || blob.binary? || blob.empty?
# A bit of an elegant hack. If the file is executable but extensionless,
# append a "magic" extension so it can be classified with other
# languages that have shebang scripts.
extension = FileBlob.new(name).extension
if extension.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
name += ".script!"
end
# First try to find languages that match based on filename.
possible_languages = find_by_filename(name)
# If there is more than one possible language with that extension (or no
# extension at all, in the case of extensionless scripts), we need to continue
# our detection work
if possible_languages.length > 1
data = blob.data
possible_language_names = possible_languages.map(&:name)
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
if heuristic_languages.size > 1
possible_language_names = heuristic_languages.map(&:name)
# Call each strategy until one candidate is returned
STRATEGIES.reduce([]) do |languages, strategy|
candidates = strategy.call(blob, languages)
if candidates.size == 1
return candidates.first
elsif candidates.size > 1
# More than one candidate was found, pass them to the next strategy
candidates
else
# Strategy couldn't find any candidates, so pass on the original list
languages
end
# Check if there's a shebang line and use that as authoritative
if (result = find_by_shebang(data)) && !result.empty?
result.first
# No shebang. Still more work to do. Try to find it with our heuristics.
elsif heuristic_languages.size == 1
heuristic_languages.first
# Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based of the string language name (i.e., first element of `#classify`)
Language[classified[0]]
end
else
# Simplest and most common case, we can just return the one match based on extension
possible_languages.first
end
end.first
end
# Public: Get all Languages
@@ -190,8 +172,13 @@ module Linguist
# Returns all matching Languages or [] if none were found.
def self.find_by_filename(filename)
basename = File.basename(filename)
extname = FileBlob.new(filename).extension
(@filename_index[basename] + find_by_extension(extname)).compact.uniq
# find the first extension with language definitions
extname = FileBlob.new(filename).extensions.detect do |e|
!@extension_index[e].empty?
end
(@filename_index[basename] + @extension_index[extname]).compact.uniq
end
# Public: Look up Languages by file extension.

View File

@@ -558,6 +558,8 @@ Crystal:
- .cr
ace_mode: ruby
tm_scope: source.ruby
interpreters:
- crystal
Cucumber:
extensions:
@@ -735,6 +737,8 @@ Erlang:
- .es
- .escript
- .hrl
interpreters:
- escript
F#:
type: programming
@@ -814,6 +818,7 @@ Forth:
- .for
- .forth
- .frt
- .fs
Frege:
type: programming
@@ -867,6 +872,7 @@ GLSL:
- .fp
- .frag
- .frg
- .fs
- .fshader
- .geo
- .geom
@@ -930,6 +936,8 @@ Gnuplot:
- .gnuplot
- .plot
- .plt
interpreters:
- gnuplot
Go:
type: programming
@@ -1195,6 +1203,8 @@ Ioke:
color: "#078193"
extensions:
- .ik
interpreters:
- ioke
Isabelle:
type: programming
@@ -1702,6 +1712,8 @@ Nu:
filenames:
- Nukefile
tm_scope: source.scheme
interpreters:
- nush
NumPy:
group: Python
@@ -1888,6 +1900,8 @@ Parrot Assembly:
- pasm
extensions:
- .pasm
interpreters:
- parrot
tm_scope: none
Parrot Internal Representation:
@@ -1898,6 +1912,8 @@ Parrot Internal Representation:
- pir
extensions:
- .pir
interpreters:
- parrot
Pascal:
type: programming
@@ -1940,6 +1956,8 @@ Perl6:
- .p6m
- .pl6
- .pm6
interpreters:
- perl6
tm_scope: none
PigLatin:
@@ -2004,6 +2022,8 @@ Prolog:
- .ecl
- .pro
- .prolog
interpreters:
- swipl
Propeller Spin:
type: programming
@@ -2067,6 +2087,8 @@ Python:
- wscript
interpreters:
- python
- python2
- python3
Python traceback:
type: data
@@ -2087,6 +2109,8 @@ QMake:
extensions:
- .pro
- .pri
interpreters:
- qmake
R:
type: programming
@@ -2241,6 +2265,8 @@ Ruby:
- .watchr
interpreters:
- ruby
- macruby
- rake
filenames:
- .pryrc
- Appraisals
@@ -2327,6 +2353,8 @@ Scala:
- .scala
- .sbt
- .sc
interpreters:
- scala
Scaml:
group: HTML

View File

@@ -52,14 +52,16 @@ module Linguist
})
end
else
path = File.join(dirname, filename)
if File.extname(filename) == ""
raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
raise "#{path} is missing an extension, maybe it belongs in filenames/ subdir"
end
yield({
:path => File.join(dirname, filename),
:path => path,
:language => category,
:interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
:interpreter => Linguist.interpreter_from_shebang(File.read(path)),
:extname => File.extname(filename)
})
end
@@ -131,18 +133,19 @@ module Linguist
script = script == 'env' ? tokens[1] : script
# "python2.6" -> "python"
if script =~ /((?:\d+\.?)+)/
script.sub! $1, ''
end
# If script has an invalid shebang, we might get here
return unless script
# "python2.6" -> "python2"
script.sub! $1, '' if script =~ /(\.\d+)$/
# Check for multiline shebang hacks that call `exec`
if script == 'sh' &&
lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
script = $1
end
script
File.basename(script)
else
nil
end

View File

@@ -0,0 +1,20 @@
module Linguist
module Strategy
# Detects language based on filename and/or extension
class Filename
def self.call(blob, _)
name = blob.name.to_s
# A bit of an elegant hack. If the file is executable but extensionless,
# append a "magic" extension so it can be classified with other
# languages that have shebang scripts.
extensions = FileBlob.new(name).extensions
if extensions.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
name += ".script!"
end
Language.find_by_filename(name)
end
end
end
end

View File

@@ -0,0 +1,10 @@
module Linguist
module Strategy
# Check if there's a shebang line and use that as authoritative
class Shebang
def self.call(blob, _)
Language.find_by_shebang(blob.data)
end
end
end
end

15
samples/F#/sample.fs Normal file
View File

@@ -0,0 +1,15 @@
module Sample
open System
type Foo =
{
Bar : string
}
type Baz = interface end
let Sample1(xs : int list) : string =
xs
|> List.map (fun x -> string x)
|> String.concat ","

252
samples/Forth/core.fs Normal file
View File

@@ -0,0 +1,252 @@
: immediate lastxt @ dup c@ negate swap c! ;
: \ source nip >in ! ; immediate \ Copyright 2004, 2012 Lars Brinkhoff
: char \ ( "word" -- char )
bl-word here 1+ c@ ;
: ahead here 0 , ;
: resolve here swap ! ;
: ' bl-word here find 0branch [ ahead ] exit [ resolve ] 0 ;
: postpone-nonimmediate [ ' literal , ' compile, ] literal , ;
: create dovariable_code header, reveal ;
create postponers
' postpone-nonimmediate ,
' abort ,
' , ,
: word \ ( char "<chars>string<char>" -- caddr )
drop bl-word here ;
: postpone \ ( C: "word" -- )
bl word find 1+ cells postponers + @ execute ; immediate
: unresolved \ ( C: "word" -- orig )
postpone postpone postpone ahead ; immediate
: chars \ ( n1 -- n2 )
;
: else \ ( -- ) ( C: orig1 -- orig2 )
unresolved branch swap resolve ; immediate
: if \ ( flag -- ) ( C: -- orig )
unresolved 0branch ; immediate
: then \ ( -- ) ( C: orig -- )
resolve ; immediate
: [char] \ ( "word" -- )
char postpone literal ; immediate
: (does>) lastxt @ dodoes_code over >code ! r> swap >does ! ;
: does> postpone (does>) ; immediate
: begin \ ( -- ) ( C: -- dest )
here ; immediate
: while \ ( x -- ) ( C: dest -- orig dest )
unresolved 0branch swap ; immediate
: repeat \ ( -- ) ( C: orig dest -- )
postpone branch , resolve ; immediate
: until \ ( x -- ) ( C: dest -- )
postpone 0branch , ; immediate
: recurse lastxt @ compile, ; immediate
: pad \ ( -- addr )
here 1024 + ;
: parse \ ( char "string<char>" -- addr n )
pad >r begin
source? if <source 2dup <> else 0 0 then
while
r@ c! r> 1+ >r
repeat 2drop pad r> over - ;
: ( \ ( "string<paren>" -- )
[ char ) ] literal parse 2drop ; immediate
\ TODO: If necessary, refill and keep parsing.
: string, ( addr n -- )
here over allot align swap cmove ;
: (s") ( -- addr n ) ( R: ret1 -- ret2 )
r> dup @ swap cell+ 2dup + aligned >r swap ;
create squote 128 allot
: s" ( "string<quote>" -- addr n )
state @ if
postpone (s") [char] " parse dup , string,
else
[char] " parse >r squote r@ cmove squote r>
then ; immediate
: (abort") ( ... addr n -- ) ( R: ... -- )
cr type cr abort ;
: abort" ( ... x "string<quote>" -- ) ( R: ... -- )
postpone if postpone s" postpone (abort") postpone then ; immediate
\ ----------------------------------------------------------------------
( Core words. )
\ TODO: #
\ TODO: #>
\ TODO: #s
: and ( x y -- x&y ) nand invert ;
: * 1 2>r 0 swap begin r@ while
r> r> swap 2dup dup + 2>r and if swap over + swap then dup +
repeat r> r> 2drop drop ;
\ TODO: */mod
: +loop ( -- ) ( C: nest-sys -- )
postpone (+loop) postpone 0branch , postpone unloop ; immediate
: space bl emit ;
: ?.- dup 0 < if [char] - emit negate then ;
: digit [char] 0 + emit ;
: (.) base @ /mod ?dup if recurse then digit ;
: ." ( "string<quote>" -- ) postpone s" postpone type ; immediate
: . ( x -- ) ?.- (.) space ;
: postpone-number ( caddr -- )
0 0 rot count >number dup 0= if
2drop nip
postpone (literal) postpone (literal) postpone ,
postpone literal postpone ,
else
." Undefined: " type cr abort
then ;
' postpone-number postponers cell+ !
: / ( x y -- x/y ) /mod nip ;
: 0< ( n -- flag ) 0 < ;
: 1- ( n -- n-1 ) -1 + ;
: 2! ( x1 x2 addr -- ) swap over ! cell+ ! ;
: 2* ( n -- 2n ) dup + ;
\ Kernel: 2/
: 2@ ( addr -- x1 x2 ) dup cell+ @ swap @ ;
\ Kernel: 2drop
\ Kernel: 2dup
\ TODO: 2over ( x1 x2 x3 x4 -- x1 x2 x3 x4 x1 x2 )
\ 3 pick 3 pick ;
\ TODO: 2swap
\ TODO: <#
: abs ( n -- |n| )
dup 0< if negate then ;
\ TODO: accept
: c, ( n -- )
here c! 1 chars allot ;
: char+ ( n1 -- n2 )
1+ ;
: constant create , does> @ ;
: decimal ( -- )
10 base ! ;
: depth ( -- n )
data_stack 100 cells + 'SP @ - /cell / 2 - ;
: do ( n1 n2 -- ) ( R: -- loop-sys ) ( C: -- do-sys )
postpone 2>r here ; immediate
\ TODO: environment?
\ TODO: evaluate
\ TODO: fill
\ TODO: fm/mod )
\ TODO: hold
: j ( -- x1 ) ( R: x1 x2 x3 -- x1 x2 x3 )
'RP @ 3 cells + @ ;
\ TODO: leave
: loop ( -- ) ( C: nest-sys -- )
postpone 1 postpone (+loop)
postpone 0branch ,
postpone unloop ; immediate
: lshift begin ?dup while 1- swap dup + swap repeat ;
: rshift 1 begin over while dup + swap 1- swap repeat nip
2>r 0 1 begin r@ while
r> r> 2dup swap dup + 2>r and if swap over + swap then dup +
repeat r> r> 2drop drop ;
: max ( x y -- max[x,y] )
2dup > if drop else nip then ;
\ Kernel: min
\ TODO: mod
\ TODO: move
: (quit) ( R: ... -- )
return_stack 100 cells + 'RP !
0 'source-id ! tib ''source ! #tib ''#source !
postpone [
begin
refill
while
interpret state @ 0= if ." ok" cr then
repeat
bye ;
' (quit) ' quit >body cell+ !
\ TODO: s>d
\ TODO: sign
\ TODO: sm/rem
: spaces ( n -- )
0 do space loop ;
\ TODO: u.
: signbit ( -- n ) -1 1 rshift invert ;
: xor ( x y -- x^y ) 2dup nand >r r@ nand swap r> nand nand ;
: u< ( x y -- flag ) signbit xor swap signbit xor > ;
\ TODO: um/mod
: variable ( "word" -- )
create /cell allot ;
: ['] \ ( C: "word" -- )
' postpone literal ; immediate

48
samples/GLSL/recurse1.fs Normal file
View File

@@ -0,0 +1,48 @@
#version 330 core
// cross-unit recursion
void main() {}
// two-level recursion
float cbar(int);
void cfoo(float)
{
cbar(2);
}
// four-level, out of order
void CB();
void CD();
void CA() { CB(); }
void CC() { CD(); }
// high degree
void CBT();
void CDT();
void CAT() { CBT(); CBT(); CBT(); }
void CCT() { CDT(); CDT(); CBT(); }
// not recursive
void norA() {}
void norB() { norA(); }
void norC() { norA(); }
void norD() { norA(); }
void norE() { norB(); }
void norF() { norB(); }
void norG() { norE(); }
void norH() { norE(); }
void norI() { norE(); }
// not recursive, but with a call leading into a cycle if ignoring direction
void norcA() { }
void norcB() { norcA(); }
void norcC() { norcB(); }
void norcD() { norcC(); norcB(); } // head of cycle
void norcE() { norcD(); } // lead into cycle

View File

@@ -1,2 +0,0 @@
#!/usr/bin/env python
puts "Not Python"

22
test/fixtures/Python/run_tests.module vendored Normal file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env python
import sys, os
# Set the current working directory to the directory where this script is located
os.chdir(os.path.abspath(os.path.dirname(sys.argv[0])))
#### Set the name of the application here and moose directory relative to the application
app_name = 'stork'
MODULE_DIR = os.path.abspath('..')
MOOSE_DIR = os.path.abspath(os.path.join(MODULE_DIR, '..'))
#### See if MOOSE_DIR is already in the environment instead
if os.environ.has_key("MOOSE_DIR"):
MOOSE_DIR = os.environ['MOOSE_DIR']
sys.path.append(os.path.join(MOOSE_DIR, 'python'))
import path_tool
path_tool.activate_module('TestHarness')
from TestHarness import TestHarness
# Run the tests!
TestHarness.buildAndRun(sys.argv, app_name, MOOSE_DIR)

1505
test/fixtures/Shell/mintleaf.module vendored Normal file

File diff suppressed because it is too large Load Diff

4
test/helper.rb Normal file
View File

@@ -0,0 +1,4 @@
require "bundler/setup"
require "test/unit"
require "mocha/setup"
require "linguist"

View File

@@ -1,9 +1,4 @@
require 'linguist/file_blob'
require 'linguist/samples'
require 'test/unit'
require 'mocha/setup'
require 'mime/types'
require_relative "./helper"
class TestBlob < Test::Unit::TestCase
include Linguist
@@ -470,6 +465,25 @@ class TestBlob < Test::Unit::TestCase
assert blob.language, "No language for #{sample[:path]}"
assert_equal sample[:language], blob.language.name, blob.name
end
# Test language detection for files which shouldn't be used as samples
root = File.expand_path('../fixtures', __FILE__)
Dir.entries(root).each do |language|
next unless File.file?(language)
# Each directory contains test files of a language
dirname = File.join(root, language)
Dir.entries(dirname).each do |filename|
next unless File.file?(filename)
# By default blob search the file in the samples;
# thus, we need to give it the absolute path
filepath = File.join(dirname, filename)
blob = blob(filepath)
assert blob.language, "No language for #{filepath}"
assert_equal language, blob.language.name, blob.name
end
end
end
def test_minified_files_not_safe_to_highlight

View File

@@ -1,9 +1,4 @@
require 'linguist/classifier'
require 'linguist/language'
require 'linguist/samples'
require 'linguist/tokenizer'
require 'test/unit'
require_relative "./helper"
class TestClassifier < Test::Unit::TestCase
include Linguist

10
test/test_file_blob.rb Normal file
View File

@@ -0,0 +1,10 @@
require 'linguist/file_blob'
require 'test/unit'
class TestFileBlob < Test::Unit::TestCase
def test_extensions
assert_equal [".gitignore"], Linguist::FileBlob.new(".gitignore").extensions
assert_equal [".xml"], Linguist::FileBlob.new("build.xml").extensions
assert_equal [".html.erb", ".erb"], Linguist::FileBlob.new("dotted.dir/index.html.erb").extensions
end
end

View File

@@ -1,9 +1,4 @@
require 'linguist/heuristics'
require 'linguist/language'
require 'linguist/samples'
require 'linguist/file_blob'
require 'test/unit'
require_relative "./helper"
class TestHeuristcs < Test::Unit::TestCase
include Linguist
@@ -16,6 +11,11 @@ class TestHeuristcs < Test::Unit::TestCase
File.read(File.join(samples_path, name))
end
def file_blob(name)
path = File.exist?(name) ? name : File.join(samples_path, name)
FileBlob.new(path)
end
def all_fixtures(language_name, file="*")
Dir.glob("#{samples_path}/#{language_name}/#{file}")
end
@@ -23,24 +23,17 @@ class TestHeuristcs < Test::Unit::TestCase
# Candidate languages = ["C++", "Objective-C"]
def test_obj_c_by_heuristics
# Only calling out '.h' filenames as these are the ones causing issues
all_fixtures("Objective-C", "*.h").each do |fixture|
results = Heuristics.disambiguate_c(fixture("Objective-C/#{File.basename(fixture)}"))
assert_equal Language["Objective-C"], results.first, "Failed for #{File.basename(fixture)}"
end
end
# Candidate languages = ["C++", "Objective-C"]
def test_cpp_by_heuristics
results = Heuristics.disambiguate_c(fixture("C++/render_adapter.cpp"))
assert_equal Language["C++"], results.first
results = Heuristics.disambiguate_c(fixture("C++/ThreadedQueue.h"))
assert_equal Language["C++"], results.first
assert_heuristics({
"Objective-C" => all_fixtures("Objective-C", "*.h"),
"C++" => ["C++/render_adapter.cpp", "C++/ThreadedQueue.h"],
"C" => nil
})
end
def test_c_by_heuristics
languages = ["C++", "Objective-C", "C"]
results = Heuristics.disambiguate_c(fixture("C/ArrowLeft.h"))
assert_equal nil, results.first
languages = [Language["C++"], Language["Objective-C"], Language["C"]]
results = Heuristics.call(file_blob("C/ArrowLeft.h"), languages)
assert_equal [], results
end
def test_detect_still_works_if_nothing_matches
@@ -50,94 +43,89 @@ class TestHeuristcs < Test::Unit::TestCase
end
# Candidate languages = ["Perl", "Prolog"]
def test_pl_prolog_by_heuristics
results = Heuristics.disambiguate_pl(fixture("Prolog/turing.pl"))
assert_equal Language["Prolog"], results.first
end
# Candidate languages = ["Perl", "Prolog"]
def test_pl_perl_by_heuristics
results = Heuristics.disambiguate_pl(fixture("Perl/perl-test.t"))
assert_equal Language["Perl"], results.first
def test_pl_prolog_perl_by_heuristics
assert_heuristics({
"Prolog" => "Prolog/turing.pl",
"Perl" => "Perl/perl-test.t",
})
end
# Candidate languages = ["ECL", "Prolog"]
def test_ecl_prolog_by_heuristics
results = Heuristics.disambiguate_ecl(fixture("Prolog/or-constraint.ecl"))
assert_equal Language["Prolog"], results.first
results = Heuristics.call(file_blob("Prolog/or-constraint.ecl"), [Language["ECL"], Language["Prolog"]])
assert_equal [Language["Prolog"]], results
end
# Candidate languages = ["ECL", "Prolog"]
def test_ecl_ecl_by_heuristics
results = Heuristics.disambiguate_ecl(fixture("ECL/sample.ecl"))
assert_equal Language["ECL"], results.first
def test_ecl_prolog_by_heuristics
assert_heuristics({
"ECL" => "ECL/sample.ecl",
"Prolog" => "Prolog/or-constraint.ecl"
})
end
# Candidate languages = ["IDL", "Prolog"]
def test_pro_prolog_by_heuristics
results = Heuristics.disambiguate_pro(fixture("Prolog/logic-problem.pro"))
assert_equal Language["Prolog"], results.first
end
# Candidate languages = ["IDL", "Prolog"]
def test_pro_idl_by_heuristics
results = Heuristics.disambiguate_pro(fixture("IDL/mg_acosh.pro"))
assert_equal Language["IDL"], results.first
def test_pro_prolog_idl_by_heuristics
assert_heuristics({
"Prolog" => "Prolog/logic-problem.pro",
"IDL" => "IDL/mg_acosh.pro"
})
end
# Candidate languages = ["AGS Script", "AsciiDoc"]
def test_asc_asciidoc_by_heuristics
results = Heuristics.disambiguate_asc(fixture("AsciiDoc/list.asc"))
assert_equal Language["AsciiDoc"], results.first
end
# Candidate languages = ["TypeScript", "XML"]
def test_ts_typescript_by_heuristics
results = Heuristics.disambiguate_ts(fixture("TypeScript/classes.ts"))
assert_equal Language["TypeScript"], results.first
end
# Candidate languages = ["TypeScript", "XML"]
def test_ts_xml_by_heuristics
results = Heuristics.disambiguate_ts(fixture("XML/pt_BR.xml"))
assert_equal Language["XML"], results.first
assert_heuristics({
"AsciiDoc" => "AsciiDoc/list.asc",
"AGS Script" => nil
})
end
def test_cl_by_heuristics
languages = ["Common Lisp", "OpenCL"]
languages.each do |language|
all_fixtures(language).each do |fixture|
results = Heuristics.disambiguate_cl(fixture("#{language}/#{File.basename(fixture)}"))
assert_equal Language[language], results.first
end
end
assert_heuristics({
"Common Lisp" => all_fixtures("Common Lisp"),
"OpenCL" => all_fixtures("OpenCL")
})
end
def test_f_by_heuristics
languages = ["FORTRAN", "Forth"]
languages.each do |language|
all_fixtures(language).each do |fixture|
results = Heuristics.disambiguate_f(fixture("#{language}/#{File.basename(fixture)}"))
assert_equal Language[language], results.first
end
end
assert_heuristics({
"FORTRAN" => all_fixtures("FORTRAN"),
"Forth" => all_fixtures("Forth")
})
end
# Candidate languages = ["Hack", "PHP"]
def test_hack_by_heuristics
results = Heuristics.disambiguate_hack(fixture("Hack/funs.php"))
assert_equal Language["Hack"], results.first
assert_heuristics({
"Hack" => "Hack/funs.php",
"PHP" => "PHP/Model.php"
})
end
# Candidate languages = ["Scala", "SuperCollider"]
def test_sc_supercollider_by_heuristics
results = Heuristics.disambiguate_sc(fixture("SuperCollider/WarpPreset.sc"))
assert_equal Language["SuperCollider"], results.first
def test_sc_supercollider_scala_by_heuristics
assert_heuristics({
"SuperCollider" => "SuperCollider/WarpPreset.sc",
"Scala" => "Scala/node11.sc"
})
end
# Candidate languages = ["Scala", "SuperCollider"]
def test_sc_scala_by_heuristics
results = Heuristics.disambiguate_sc(fixture("Scala/node11.sc"))
assert_equal Language["Scala"], results.first
def test_fs_by_heuristics
assert_heuristics({
"F#" => all_fixtures("F#"),
"Forth" => all_fixtures("Forth"),
"GLSL" => all_fixtures("GLSL")
})
end
def assert_heuristics(hash)
candidates = hash.keys.map { |l| Language[l] }
hash.each do |language, blobs|
Array(blobs).each do |blob|
result = Heuristics.call(file_blob(blob), candidates)
assert_equal [Language[language]], result
end
end
end
end

View File

@@ -1,6 +1,4 @@
require 'linguist/language'
require 'test/unit'
require 'yaml'
require_relative "./helper"
class TestLanguage < Test::Unit::TestCase
include Linguist

View File

@@ -1,6 +1,4 @@
require 'linguist/md5'
require 'test/unit'
require_relative "./helper"
class TestMD5 < Test::Unit::TestCase
include Linguist

View File

@@ -1,5 +1,4 @@
require 'test/unit'
require 'yaml'
require_relative "./helper"
class TestPedantic < Test::Unit::TestCase
filename = File.expand_path("../../lib/linguist/languages.yml", __FILE__)

View File

@@ -1,6 +1,4 @@
require 'linguist/repository'
require 'linguist/lazy_blob'
require 'test/unit'
require_relative "./helper"
class TestRepository < Test::Unit::TestCase
def rugged_repository

View File

@@ -1,8 +1,5 @@
require 'linguist/samples'
require 'linguist/language'
require 'tempfile'
require 'yajl'
require 'test/unit'
require_relative "./helper"
require "tempfile"
class TestSamples < Test::Unit::TestCase
include Linguist
@@ -34,23 +31,29 @@ class TestSamples < Test::Unit::TestCase
assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c }
assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c }
assert_equal data['tokens_total'], data['tokens'].inject(0) { |n, (_, ts)| n += ts.inject(0) { |m, (_, c)| m += c } }
assert !data["interpreters"].empty?
end
# Check that there aren't samples with extensions that aren't explicitly defined in languages.yml
def test_parity
extensions = Samples.cache['extnames']
languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__)
languages = YAML.load_file(languages_yml)
languages.each do |name, options|
# Check that there aren't samples with extensions or interpreters that
# aren't explicitly defined in languages.yml
languages_yml = File.expand_path("../../lib/linguist/languages.yml", __FILE__)
YAML.load_file(languages_yml).each do |name, options|
define_method "test_samples_have_parity_with_languages_yml_for_#{name}" do
options['extensions'] ||= []
if extnames = extensions[name]
if extnames = Samples.cache['extnames'][name]
extnames.each do |extname|
next if extname == '.script!'
assert options['extensions'].include?(extname), "#{name} has a sample with extension (#{extname}) that isn't explicitly defined in languages.yml"
end
end
options['interpreters'] ||= []
if interpreters = Samples.cache['interpreters'][name]
interpreters.each do |interpreter|
# next if extname == '.script!'
assert options['interpreters'].include?(interpreter), "#{name} has a sample with an interpreter (#{interpreter}) that isn't explicitly defined in languages.yml"
end
end
end
end
@@ -79,4 +82,9 @@ class TestSamples < Test::Unit::TestCase
end
end
end
def test_shebang
assert_equal "crystal", Linguist.interpreter_from_shebang("#!/usr/bin/env bin/crystal")
assert_equal "python2", Linguist.interpreter_from_shebang("#!/usr/bin/python2.4")
end
end

View File

@@ -1,6 +1,4 @@
require 'linguist/tokenizer'
require 'test/unit'
require_relative "./helper"
class TestTokenizer < Test::Unit::TestCase
include Linguist

BIN
vendor/cache/byebug-3.5.1.gem vendored Normal file

Binary file not shown.

BIN
vendor/cache/columnize-0.8.9.gem vendored Normal file

Binary file not shown.

Binary file not shown.