Merge master

This commit is contained in:
Garen Torikian
2014-11-28 11:04:53 -08:00
34 changed files with 2071 additions and 97 deletions

View File

@@ -57,14 +57,20 @@ module Linguist
#
# Returns a String.
def extension
# File.extname returns an empty string if the filename is only an extension (e.g. ".gitignore").
extension = File.extname(name)
basename = File.basename(name)
# Checks if the filename is an extension.
if extension.empty? && basename[0] == "."
basename
else
extension
extensions.last || ""
end
# Public: Return an array of the file extensions
#
# >> Linguist::FileBlob.new("app/views/things/index.html.erb").extensions
# => [".html.erb", ".erb"]
#
# Returns an Array
def extensions
  # Split the base filename on dots and discard the first component (the
  # name itself); the remaining parts are the individual extension segments,
  # e.g. "index.html.erb" -> ["html", "erb"].
  segments = File.basename(name).split(".").drop(1)

  # For each starting position build the dotted suffix from that segment to
  # the end, so the result runs from the longest chain to the shortest:
  # [".html.erb", ".erb"]. A name without any dot yields an empty Array.
  segments.each_index.map do |index|
    "." + segments[index..-1].join(".")
  end
end
end

View File

@@ -39,6 +39,9 @@ module Linguist
if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) }
result = disambiguate_f(data)
end
if languages.all? { |l| ["F#", "Forth", "GLSL"].include?(l) }
result = disambiguate_fs(data)
end
return result
end
end
@@ -151,6 +154,18 @@ module Linguist
matches
end
def self.disambiguate_fs(data)
  # Line-start markers checked in priority order; the first pattern that
  # matches anywhere in `data` decides the language.
  candidates = [
    [/^(: |new-device)/, "Forth"],
    [/^(#light|import|let|module|namespace|open|type)/, "F#"],
    [/^(#include|#pragma|precision|uniform|varying|void)/, "GLSL"]
  ]

  winner = candidates.find { |pattern, _language_name| pattern.match(data) }

  # Returns a one-element Array holding the Language lookup result for the
  # winning name, or an empty Array when none of the patterns match.
  winner ? [Language[winner.last]] : []
end
def self.active?
  # Collapse the ACTIVE constant to a strict boolean: true for any truthy
  # value, false for nil/false.
  ACTIVE ? true : false
end

View File

@@ -106,40 +106,52 @@ module Linguist
# A bit of an elegant hack. If the file is executable but extensionless,
# append a "magic" extension so it can be classified with other
# languages that have shebang scripts.
extension = FileBlob.new(name).extension
if extension.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
extensions = FileBlob.new(name).extensions
if extensions.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
name += ".script!"
end
# First try to find languages that match based on filename.
# Find languages that match based on filename.
possible_languages = find_by_filename(name)
# If there is more than one possible language with that extension (or no
# extension at all, in the case of extensionless scripts), we need to continue
# our detection work
if possible_languages.length > 1
data = blob.data
possible_language_names = possible_languages.map(&:name)
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
if possible_languages.length == 1
# Simplest and most common case, we can just return the one match based
# on extension
possible_languages.first
if heuristic_languages.size > 1
possible_language_names = heuristic_languages.map(&:name)
end
# If there is more than one possible language with that extension (or no
# extension at all, in the case of extensionless scripts), we need to
# continue our detection work
else
# Matches possible_languages.length == 0 || possible_languages.length > 1
data = blob.data
# Check if there's a shebang line and use that as authoritative
if (result = find_by_shebang(data)) && !result.empty?
result.first
# No shebang. Still more work to do. Try to find it with our heuristics.
elsif heuristic_languages.size == 1
heuristic_languages.first
# Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
Language[classified[0]]
return result.first
# More than one language with that extension. We need to make a choice.
elsif possible_languages.length > 1
# First try heuristics
possible_language_names = possible_languages.map(&:name)
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
# If there are multiple possible languages returned from heuristics
# then reduce language candidates for Bayesian classifier here.
if heuristic_languages.size > 1
possible_language_names = heuristic_languages.map(&:name)
end
if heuristic_languages.size == 1
return heuristic_languages.first
# Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
return Language[classified[0]]
end
end
else
# Simplest and most common case, we can just return the one match based on extension
possible_languages.first
end
end
@@ -190,8 +202,13 @@ module Linguist
# Returns all matching Languages or [] if none were found.
def self.find_by_filename(filename)
  basename = File.basename(filename)

  # Walk the file's extension chain in the order FileBlob#extensions returns
  # it and keep the first extension that has at least one language registered.
  # NOTE(review): assumes @extension_index yields an empty collection (not
  # nil) for unknown keys, including nil when no extension matches — confirm
  # against the index's definition.
  extname = FileBlob.new(filename).extensions.detect do |e|
    !@extension_index[e].empty?
  end

  # Combine exact-filename matches with the matches for the chosen extension,
  # dropping nils and duplicates.
  (@filename_index[basename] + @extension_index[extname]).compact.uniq
end
# Public: Look up Languages by file extension.

View File

@@ -506,6 +506,7 @@ CoffeeScript:
extensions:
- .coffee
- ._coffee
- .cjsx
- .cson
- .iced
filenames:
@@ -607,6 +608,8 @@ Crystal:
- .cr
ace_mode: ruby
tm_scope: source.ruby
interpreters:
- crystal
Cucumber:
extensions:
@@ -806,7 +809,6 @@ Erlang:
- .es
- .escript
- .hrl
ace_mode: erlang
F#:
type: programming
@@ -892,7 +894,6 @@ Forth:
- .for
- .forth
- .frt
ace_mode: forth
Frege:
type: programming
@@ -952,6 +953,7 @@ GLSL:
- .fp
- .frag
- .frg
- .fs
- .fshader
- .geo
- .geom
@@ -1022,6 +1024,8 @@ Gnuplot:
- .gnuplot
- .plot
- .plt
interpreters:
- gnuplot
ace_mode: none
Go:
@@ -1313,6 +1317,8 @@ Ioke:
color: "#078193"
extensions:
- .ik
interpreters:
- ioke
ace_mode: none
Isabelle:
@@ -1868,6 +1874,8 @@ Nu:
- Nukefile
tm_scope: source.scheme
ace_mode: scheme
interpreters:
- nush
NumPy:
group: Python
@@ -2072,6 +2080,8 @@ Parrot Assembly:
- pasm
extensions:
- .pasm
interpreters:
- parrot
tm_scope: none
ace_mode: none
@@ -2083,6 +2093,8 @@ Parrot Internal Representation:
- pir
extensions:
- .pir
interpreters:
- parrot
ace_mode: none
Pascal:
@@ -2127,6 +2139,8 @@ Perl6:
- .p6m
- .pl6
- .pm6
interpreters:
- perl6
tm_scope: none
ace_mode: perl
@@ -2197,6 +2211,8 @@ Prolog:
- .ecl
- .pro
- .prolog
interpreters:
- swipl
ace_mode: prolog
Propeller Spin:
@@ -2266,6 +2282,8 @@ Python:
- wscript
interpreters:
- python
- python2
- python3
Python traceback:
type: data
@@ -2288,6 +2306,8 @@ QMake:
extensions:
- .pro
- .pri
interpreters:
- qmake
ace_mode: none
R:
@@ -2453,6 +2473,8 @@ Ruby:
- .watchr
interpreters:
- ruby
- macruby
- rake
filenames:
- .pryrc
- Appraisals
@@ -2545,6 +2567,8 @@ Scala:
- .scala
- .sbt
- .sc
interpreters:
- scala
Scaml:
group: HTML

View File

@@ -52,14 +52,16 @@ module Linguist
})
end
else
path = File.join(dirname, filename)
if File.extname(filename) == ""
raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
raise "#{path} is missing an extension, maybe it belongs in filenames/ subdir"
end
yield({
:path => File.join(dirname, filename),
:path => path,
:language => category,
:interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
:interpreter => Linguist.interpreter_from_shebang(File.read(path)),
:extname => File.extname(filename)
})
end
@@ -131,18 +133,19 @@ module Linguist
script = script == 'env' ? tokens[1] : script
# "python2.6" -> "python"
if script =~ /((?:\d+\.?)+)/
script.sub! $1, ''
end
# If script has an invalid shebang, we might get here
return unless script
# "python2.6" -> "python2"
script.sub! $1, '' if script =~ /(\.\d+)$/
# Check for multiline shebang hacks that call `exec`
if script == 'sh' &&
lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
script = $1
end
script
File.basename(script)
else
nil
end