mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-12-08 04:18:48 +00:00
Merge branch 'master' into more-687
This commit is contained in:
@@ -15,8 +15,8 @@ module Linguist
|
||||
#
|
||||
# Returns nothing.
|
||||
#
|
||||
# Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
|
||||
# per-language. See also dump_all_tokens, below.
|
||||
# Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token or
|
||||
# per-language. See also #dump_all_tokens, below.
|
||||
def self.train!(db, language, data)
|
||||
tokens = Tokenizer.tokenize(data)
|
||||
|
||||
@@ -151,10 +151,10 @@ module Linguist
|
||||
printf "%#{maxlen}s", ""
|
||||
puts " #" + languages.map { |lang| sprintf("%10s", lang) }.join
|
||||
|
||||
tokmap = Hash.new(0)
|
||||
tokens.each { |tok| tokmap[tok] += 1 }
|
||||
token_map = Hash.new(0)
|
||||
tokens.each { |tok| token_map[tok] += 1 }
|
||||
|
||||
tokmap.sort.each { |tok, count|
|
||||
token_map.sort.each { |tok, count|
|
||||
arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
|
||||
min = arr.map { |a,b| b }.min
|
||||
minlog = Math.log(min)
|
||||
|
||||
@@ -75,14 +75,16 @@ module Linguist
|
||||
|
||||
# Internal: Is the blob a minified file?
|
||||
#
|
||||
# Consider a file minified if it contains more than 5% spaces.
|
||||
# Consider a file minified if the average line length is
|
||||
# greater than 110 characters.
|
||||
#
|
||||
# Currently, only JS and CSS files are detected by this method.
|
||||
#
|
||||
# Returns true or false.
|
||||
def minified_files?
|
||||
return unless ['.js', '.css'].include? extname
|
||||
if data && data.length > 200
|
||||
(data.each_char.count{ |c| c <= ' ' } / data.length.to_f) < 0.05
|
||||
if lines.any?
|
||||
(lines.inject(0) { |n, l| n += l.length } / lines.length) > 110
|
||||
else
|
||||
false
|
||||
end
|
||||
@@ -191,8 +193,8 @@ module Linguist
|
||||
return false unless extname == '.h'
|
||||
return false unless lines.count > 2
|
||||
|
||||
return lines[0].include?("/* DO NOT EDIT THIS FILE - it is machine generated */")
|
||||
return lines[1].include?("#include <jni.h>")
|
||||
return lines[0].include?("/* DO NOT EDIT THIS FILE - it is machine generated */") &&
|
||||
lines[1].include?("#include <jni.h>")
|
||||
end
|
||||
|
||||
# node_modules/ can contain large amounts of files, in general not meant
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
require 'escape_utils'
|
||||
require 'pygments'
|
||||
require 'yaml'
|
||||
begin
|
||||
require 'json'
|
||||
rescue LoadError
|
||||
end
|
||||
|
||||
require 'linguist/classifier'
|
||||
require 'linguist/samples'
|
||||
@@ -487,7 +491,16 @@ module Linguist
|
||||
filenames = Samples::DATA['filenames']
|
||||
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
|
||||
|
||||
YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options|
|
||||
languages_yml = File.expand_path("../languages.yml", __FILE__)
|
||||
languages_json = File.expand_path("../languages.json", __FILE__)
|
||||
|
||||
if File.exist?(languages_json) && defined?(JSON)
|
||||
languages = JSON.load(File.read(languages_json))
|
||||
else
|
||||
languages = YAML.load_file(languages_yml)
|
||||
end
|
||||
|
||||
languages.each do |name, options|
|
||||
options['extensions'] ||= []
|
||||
options['interpreters'] ||= []
|
||||
options['filenames'] ||= []
|
||||
|
||||
@@ -29,13 +29,13 @@ ABAP:
|
||||
type: programming
|
||||
lexer: ABAP
|
||||
primary_extension: .abap
|
||||
|
||||
|
||||
ANTLR:
|
||||
type: programming
|
||||
color: "#9DC3FF"
|
||||
lexer: ANTLR
|
||||
primary_extension: .g4
|
||||
|
||||
|
||||
ASP:
|
||||
type: programming
|
||||
color: "#6a40fd"
|
||||
@@ -71,6 +71,7 @@ Ada:
|
||||
|
||||
Agda:
|
||||
type: programming
|
||||
color: "#467C91"
|
||||
primary_extension: .agda
|
||||
|
||||
ApacheConf:
|
||||
@@ -123,6 +124,15 @@ AutoHotkey:
|
||||
- ahk
|
||||
primary_extension: .ahk
|
||||
|
||||
AutoIt:
|
||||
type: programming
|
||||
color: "#36699B"
|
||||
aliases:
|
||||
- au3
|
||||
- AutoIt3
|
||||
- AutoItScript
|
||||
primary_extension: .au3
|
||||
|
||||
Awk:
|
||||
type: programming
|
||||
lexer: Awk
|
||||
@@ -178,6 +188,11 @@ Brainfuck:
|
||||
extensions:
|
||||
- .bf
|
||||
|
||||
Brightscript:
|
||||
type: programming
|
||||
lexer: Text only
|
||||
primary_extension: .brs
|
||||
|
||||
Bro:
|
||||
type: programming
|
||||
primary_extension: .bro
|
||||
@@ -378,11 +393,11 @@ D-ObjDump:
|
||||
type: data
|
||||
lexer: d-objdump
|
||||
primary_extension: .d-objdump
|
||||
|
||||
|
||||
DM:
|
||||
type: programming
|
||||
color: "#075ff1"
|
||||
lexer: Text only
|
||||
lexer: C++
|
||||
primary_extension: .dm
|
||||
aliases:
|
||||
- byond
|
||||
@@ -404,6 +419,7 @@ Darcs Patch:
|
||||
|
||||
Dart:
|
||||
type: programming
|
||||
color: "#98BAD6"
|
||||
primary_extension: .dart
|
||||
|
||||
DCPU-16 ASM:
|
||||
@@ -414,7 +430,7 @@ DCPU-16 ASM:
|
||||
- .dasm
|
||||
aliases:
|
||||
- dasm16
|
||||
|
||||
|
||||
Diff:
|
||||
primary_extension: .diff
|
||||
|
||||
@@ -463,7 +479,7 @@ Emacs Lisp:
|
||||
- elisp
|
||||
- emacs
|
||||
primary_extension: .el
|
||||
filenames:
|
||||
filenames:
|
||||
- .emacs
|
||||
extensions:
|
||||
- .emacs
|
||||
@@ -749,6 +765,8 @@ JSON:
|
||||
- .sublime_session
|
||||
- .sublime-settings
|
||||
- .sublime-workspace
|
||||
filenames:
|
||||
- composer.lock
|
||||
|
||||
Jade:
|
||||
group: HTML
|
||||
@@ -946,15 +964,17 @@ Matlab:
|
||||
Max:
|
||||
type: programming
|
||||
color: "#ce279c"
|
||||
lexer: Text only
|
||||
lexer: JSON
|
||||
aliases:
|
||||
- max/msp
|
||||
- maxmsp
|
||||
search_term: max/msp
|
||||
primary_extension: .mxt
|
||||
primary_extension: .maxpat
|
||||
extensions:
|
||||
- .maxhelp
|
||||
- .maxpat
|
||||
- .maxproj
|
||||
- .mxt
|
||||
- .pat
|
||||
|
||||
MiniD: # Legacy
|
||||
searchable: false
|
||||
@@ -995,6 +1015,12 @@ Nemerle:
|
||||
color: "#0d3c6e"
|
||||
primary_extension: .n
|
||||
|
||||
NetLogo:
|
||||
type: programming
|
||||
lexer: Common Lisp
|
||||
color: "#ff2b2b"
|
||||
primary_extension: .nlogo
|
||||
|
||||
Nginx:
|
||||
type: markup
|
||||
lexer: Nginx configuration file
|
||||
@@ -1488,6 +1514,7 @@ Tcsh:
|
||||
|
||||
TeX:
|
||||
type: markup
|
||||
color: "#3D6117"
|
||||
ace_mode: latex
|
||||
aliases:
|
||||
- latex
|
||||
@@ -1544,11 +1571,25 @@ Unified Parallel C:
|
||||
color: "#755223"
|
||||
primary_extension: .upc
|
||||
|
||||
UnrealScript:
|
||||
type: programming
|
||||
color: "#a54c4d"
|
||||
lexer: Java
|
||||
primary_extension: .uc
|
||||
|
||||
VHDL:
|
||||
type: programming
|
||||
lexer: vhdl
|
||||
color: "#543978"
|
||||
primary_extension: .vhdl
|
||||
extensions:
|
||||
- .vhd
|
||||
- .vhf
|
||||
- .vhi
|
||||
- .vho
|
||||
- .vhs
|
||||
- .vht
|
||||
- .vhw
|
||||
|
||||
Vala:
|
||||
type: programming
|
||||
@@ -1620,6 +1661,7 @@ XML:
|
||||
- .grxml
|
||||
- .jelly
|
||||
- .kml
|
||||
- .launch
|
||||
- .mxml
|
||||
- .plist
|
||||
- .pluginspec
|
||||
@@ -1629,6 +1671,7 @@ XML:
|
||||
- .rdf
|
||||
- .rss
|
||||
- .scxml
|
||||
- .srdf
|
||||
- .svg
|
||||
- .tmCommand
|
||||
- .tmLanguage
|
||||
@@ -1637,12 +1680,14 @@ XML:
|
||||
- .tmTheme
|
||||
- .tml
|
||||
- .ui
|
||||
- .urdf
|
||||
- .vxml
|
||||
- .wsdl
|
||||
- .wxi
|
||||
- .wxl
|
||||
- .wxs
|
||||
- .x3d
|
||||
- .xacro
|
||||
- .xaml
|
||||
- .xlf
|
||||
- .xliff
|
||||
@@ -1653,6 +1698,7 @@ XML:
|
||||
filenames:
|
||||
- .classpath
|
||||
- .project
|
||||
- phpunit.xml.dist
|
||||
|
||||
XProc:
|
||||
type: programming
|
||||
@@ -1694,6 +1740,7 @@ YAML:
|
||||
primary_extension: .yml
|
||||
extensions:
|
||||
- .reek
|
||||
- .rviz
|
||||
- .yaml
|
||||
|
||||
eC:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,8 @@
|
||||
require 'yaml'
|
||||
begin
|
||||
require 'json'
|
||||
rescue LoadError
|
||||
require 'yaml'
|
||||
end
|
||||
|
||||
require 'linguist/md5'
|
||||
require 'linguist/classifier'
|
||||
@@ -14,7 +18,8 @@ module Linguist
|
||||
|
||||
# Hash of serialized samples object
|
||||
if File.exist?(PATH)
|
||||
DATA = YAML.load_file(PATH)
|
||||
serializer = defined?(JSON) ? JSON : YAML
|
||||
DATA = serializer.load(File.read(PATH))
|
||||
end
|
||||
|
||||
# Public: Iterate over each sample.
|
||||
|
||||
@@ -139,7 +139,7 @@
|
||||
|
||||
# LICENSE, README, git config files
|
||||
- ^COPYING$
|
||||
- ^LICENSE$
|
||||
- LICENSE$
|
||||
- gitattributes$
|
||||
- gitignore$
|
||||
- gitmodules$
|
||||
|
||||
Reference in New Issue
Block a user