Store extnames in samples.yml

This commit is contained in:
Joshua Peek
2012-07-23 15:00:42 -05:00
parent 5cda67530d
commit afedf2557d
4 changed files with 214 additions and 62 deletions

View File

@@ -441,15 +441,14 @@ module Linguist
end
end
extensions = Samples.extensions
filenames = Samples.filenames
extensions = Samples::DATA['extnames'] rescue {} # TODO: BAH!
filenames = Samples::DATA['filenames'] rescue {} # TODO: BAH!
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options|
aliases = [name.downcase.gsub(/\s/, '-') ] + (options[:aliases] || [])
options['extensions'] ||= []
options['filenames'] ||= []
aliases.each do |name|
if extnames = extensions[name]
extnames.each do |extname|
if !options['extensions'].include?(extname)
@@ -469,7 +468,6 @@ module Linguist
end
end
end
end
lang = Language.create(
:name => name,

View File

@@ -184,14 +184,9 @@ C++:
primary_extension: .cpp
extensions:
- .c++
- .cc
- .cpp
- .cu
- .cxx
- .h
- .h++
- .hh
- .hpp
- .hxx
- .tcc
@@ -401,7 +396,6 @@ Emacs Lisp:
- emacs
primary_extension: .el
extensions:
- .el
- .emacs
Erlang:
@@ -544,8 +538,6 @@ Groovy Server Pages:
aliases:
- gsp
primary_extension: .gsp
extensions:
- .gsp
HTML:
type: markup
@@ -909,8 +901,6 @@ Parrot Internal Representation:
aliases:
- pir
primary_extension: .pir
extensions:
- .pir
Parrot Assembly:
group: Parrot
@@ -919,8 +909,6 @@ Parrot Assembly:
aliases:
- pasm
primary_extension: .pasm
extensions:
- .pasm
Perl:
type: programming

View File

@@ -56,36 +56,6 @@ module Linguist
nil
end
# Collect the extnames of all samples listed in samples/.
#
# Samples whose extname is nil or an empty String are left out.
#
# Returns a Hash keyed by sample language, each value being a Set of
# the extname Strings seen for that language.
def self.extensions
  by_language = {}
  each do |sample|
    extname = sample[:extname]
    # TODO: For now skip empty extnames
    next if extname.nil? || extname == ""
    # Create the language's Set on first sight, then add to it.
    (by_language[sample[:language]] ||= Set.new) << extname
  end
  by_language
end
# Collect the filenames of all samples listed in samples/.
#
# Samples that carry no filename (nil) are left out.
#
# Returns a Hash keyed by sample language, each value being a Set of
# the filename Strings seen for that language.
def self.filenames
  by_language = {}
  each do |sample|
    filename = sample[:filename]
    # TODO: For now skip samples without a filename
    next if filename.nil?
    # Create the language's Set on first sight, then add to it.
    (by_language[sample[:language]] ||= Set.new) << filename
  end
  by_language
end
# Public: Build Classifier from all samples.
#
# Returns trained Classifier.
@@ -94,12 +64,32 @@ module Linguist
require 'linguist/language'
db = {}
db['extnames'] = {}
db['filenames'] = {}
each do |sample|
language = Language.find_by_alias(sample[:language])
# TODO: For now skip empty extnames
if sample[:extname] && sample[:extname] != ""
db['extnames'][language.name] ||= []
if !db['extnames'][language.name].include?(sample[:extname])
db['extnames'][language.name] << sample[:extname]
end
end
# TODO: For now skip samples without a filename
if fn = sample[:filename]
db['filenames'][language.name] ||= []
db['filenames'][language.name] << fn
end
data = File.read(sample[:path])
Classifier.train!(db, language.name, data)
end
db['md5'] = MD5.hexdigest(db)
db
end
@@ -114,6 +104,22 @@ module Linguist
out << "md5: #{db['md5']}\n"
out << "extnames:\n"
db['extnames'].sort.each do |language, extnames|
out << " #{escape.call(language)}:\n"
extnames.sort.each do |extname|
out << " - #{escape.call(extname)}\n"
end
end
out << "filenames:\n"
db['filenames'].sort.each do |language, filenames|
out << " #{escape.call(language)}:\n"
filenames.sort.each do |filename|
out << " - #{escape.call(filename)}\n"
end
end
out << "languages_total: #{db['languages_total']}\n"
out << "tokens_total: #{db['tokens_total']}\n"

View File

@@ -1,4 +1,164 @@
md5: b445a8a3e3414d6b628939c347e7a4f3
md5: 58f0a3290964ee537940c3686559ba37
extnames:
"Apex":
- ".cls"
"AppleScript":
- ".applescript"
- ".scpt"
"Arduino":
- ".ino"
"AutoHotkey":
- ".ahk"
"C":
- ".c"
- ".h"
"C++":
- ".cc"
- ".cpp"
- ".cu"
- ".h"
- ".hpp"
"Ceylon":
- ".ceylon"
"CoffeeScript":
- ".coffee"
"Coq":
- ".v"
"Dart":
- ".dart"
"Delphi":
- ".dpr"
"Diff":
- ".patch"
"Emacs Lisp":
- ".el"
"GAS":
- ".s"
"Gosu":
- ".gs"
- ".gsp"
- ".gst"
- ".gsx"
- ".vark"
"Groovy":
- ".gradle"
- ".groovy"
"Groovy Server Pages":
- ".gsp"
"Haml":
- ".haml"
"Ioke":
- ".ik"
"Java":
- ".java"
"JavaScript":
- ".js"
"Julia":
- ".jl"
"Kotlin":
- ".kt"
"Logtalk":
- ".lgt"
"Markdown":
- ".md"
"Matlab":
- ".m"
"Nemerle":
- ".n"
"Nimrod":
- ".nim"
"Nu":
- ".nu"
"OCaml":
- ".ml"
"Objective-C":
- ".h"
- ".m"
"Opa":
- ".opa"
"OpenCL":
- ".cl"
"OpenEdge ABL":
- ".cls"
- ".p"
"PHP":
- ".module"
- ".php"
"Parrot Assembly":
- ".pasm"
"Parrot Internal Representation":
- ".pir"
"Perl":
- ".pl"
- ".pm"
- ".t"
"PowerShell":
- ".ps1"
- ".psm1"
"Prolog":
- ".pl"
"Python":
- ".py"
"R":
- ".R"
"Racket":
- ".rkt"
- ".scrbl"
"Rebol":
- ".r"
"Ruby":
- ".rake"
- ".rb"
"Rust":
- ".rs"
"SCSS":
- ".scss"
"Sass":
- ".sass"
"Scala":
- ".sbt"
- ".scala"
"Scheme":
- ".sps"
"Scilab":
- ".sce"
- ".sci"
- ".tst"
"Shell":
- ".bash"
- ".sh"
- ".zsh"
"Standard ML":
- ".sig"
- ".sml"
"SuperCollider":
- ".sc"
"TeX":
- ".cls"
"Tea":
- ".tea"
"Turing":
- ".t"
"VHDL":
- ".vhd"
"Verilog":
- ".v"
"Visual Basic":
- ".cls"
"XML":
- ".ant"
- ".ivy"
- ".xml"
"XQuery":
- ".xqm"
"XSLT":
- ".xslt"
filenames:
"Ruby":
- "Capfile"
- "Rakefile"
"Shell":
- "PKGBUILD"
languages_total: 243
tokens_total: 164127
languages: