Compare commits

..

60 Commits

Author SHA1 Message Date
Joshua Peek
3d7364877d Linguist 2.0.0 2012-06-21 11:28:24 -05:00
Joshua Peek
2a324c6289 Merge pull request #172 from github/bayesian
Bayesian Classifier
2012-06-21 09:26:54 -07:00
Joshua Peek
77a6a41fc3 Merge branch 'master' into bayesian 2012-06-21 11:25:34 -05:00
Joshua Peek
076bf7d0c8 Use classifier as primary method for disambiguation 2012-06-21 10:55:26 -05:00
Joshua Peek
540f2a0941 More matlab samples 2012-06-21 10:44:31 -05:00
Joshua Peek
497da86262 Strip tex and matlab leading inline comments 2012-06-21 10:38:28 -05:00
Joshua Peek
4b9b8a5058 Remove matlab file with bogus keywords 2012-06-21 10:25:30 -05:00
Joshua Peek
5568489123 Rebuild classifier db 2012-06-20 17:17:09 -05:00
Joshua Peek
5cdd5e206c Improve operator tokenizing 2012-06-20 17:16:53 -05:00
Joshua Peek
c353d3a050 Fix indent 2012-06-20 16:58:32 -05:00
Joshua Peek
6252f12175 Rebuild classifier db 2012-06-20 16:54:40 -05:00
Joshua Peek
0067f28246 YAML sucks 2012-06-20 16:54:29 -05:00
Joshua Peek
ac23d64d26 Merge branch 'master' into bayesian 2012-06-20 16:24:39 -05:00
Joshua Peek
9c9607e42c Log regexp and classifier guess mismatches 2012-06-20 16:20:59 -05:00
Joshua Peek
516a220d9f Verify classifer counts 2012-06-20 15:48:46 -05:00
Joshua Peek
7bcf90c527 Skip gc step for now 2012-06-20 15:13:06 -05:00
Joshua Peek
8c83cbe244 Merge branch 'master' into bayesian
Conflicts:
	linguist.gemspec
2012-06-20 14:56:15 -05:00
Joshua Peek
26f95507ef Test against real gist data 2012-06-20 14:55:13 -05:00
Joshua Peek
4324971cea Remove debug line 2012-06-20 14:11:23 -05:00
Joshua Peek
2672089154 Ensure language is loaded 2012-06-20 14:10:34 -05:00
Joshua Peek
48ecae0c95 Rebuild classifier db 2012-06-20 12:50:42 -05:00
Joshua Peek
5daaee88b4 Sort classifier yaml output 2012-06-20 12:50:05 -05:00
Joshua Peek
db9475f240 Rebuild classifier data 2012-06-20 11:27:18 -05:00
Joshua Peek
f68e94f181 Skip number literals 2012-06-20 11:26:14 -05:00
Joshua Peek
cb70572163 Rebuild classifier db 2012-06-20 11:19:36 -05:00
Joshua Peek
e9eae4e008 Skip pending tests 2012-06-20 11:19:02 -05:00
Joshua Peek
e33d8f3685 Merge branch 'master' into bayesian 2012-06-20 11:18:47 -05:00
Joshua Peek
645a87d02b Remove dead fixture test 2012-06-19 16:34:13 -05:00
Joshua Peek
4484011f08 Switch to log probabilities to avoid float underflows 2012-06-19 16:33:29 -05:00
Joshua Peek
c114d710f8 Test classifier on ambiguous languages 2012-06-19 16:32:56 -05:00
Joshua Peek
9810c693c3 Rebuild classifier db 2012-06-19 16:30:46 -05:00
Joshua Peek
c804d04072 Merge branch 'master' into bayesian 2012-06-19 16:29:01 -05:00
Joshua Peek
fdd81ce0be Merge branch 'master' into bayesian 2012-06-19 16:26:43 -05:00
Joshua Peek
176f6483d0 Ensure token probability is less than 1.0 2012-06-19 15:26:56 -05:00
Joshua Peek
ee6650f83f Fix doc typo 2012-06-19 15:23:23 -05:00
Joshua Peek
3fee3ac549 Rebuild classifier db 2012-06-19 15:23:06 -05:00
Joshua Peek
9d555862c3 Merge branch 'master' into bayesian 2012-06-19 15:02:02 -05:00
Joshua Peek
ddf3ec4a5b Warn if classifier instance is out of date 2012-06-19 14:32:04 -05:00
Joshua Peek
e2b0f6bb50 Depend classifier db on fixtures 2012-06-19 14:23:12 -05:00
Joshua Peek
4d5c9b951b Rebuild classifier 2012-06-19 14:22:42 -05:00
Joshua Peek
d566b35020 Allow classifer languages to be scoped 2012-06-19 14:21:42 -05:00
Joshua Peek
8f85a447de Allow tokens to be passed directly to classify 2012-06-19 14:17:27 -05:00
Joshua Peek
d0691988a9 More classifier docs 2012-06-19 14:15:10 -05:00
Joshua Peek
d9ecbf0c24 Doc sample class 2012-06-19 13:30:28 -05:00
Joshua Peek
d5fa8cbcb7 Refactor tokenizer test helper 2012-06-19 13:12:17 -05:00
Joshua Peek
555573071e More tokenizer docs 2012-06-19 13:09:23 -05:00
Joshua Peek
ecb2397e59 Merge branch 'master' into bayesian 2012-06-19 11:43:48 -05:00
Joshua Peek
12cfab6d50 Rebuild classifier data 2012-06-08 16:04:52 -05:00
Joshua Peek
8a75d4d208 GC classifier db 2012-06-08 16:04:43 -05:00
Joshua Peek
fd8b70ffa4 Rebuild classifier data 2012-06-08 15:49:35 -05:00
Joshua Peek
62498cf0e9 Merge branch 'master' into bayesian 2012-06-08 15:46:48 -05:00
Joshua Peek
543922c68a Rebuild classifier data 2012-06-08 14:48:04 -05:00
Joshua Peek
6f6dd8bc38 Improve tokenizing sgml tags 2012-06-08 14:46:16 -05:00
Joshua Peek
8351d55c56 Don't crash if classifier data is missing 2012-06-08 14:46:06 -05:00
Joshua Peek
9ecab364d1 Dump classifier results 2012-06-08 14:13:26 -05:00
Joshua Peek
0172623061 Add sample gathering class 2012-06-08 13:51:49 -05:00
Joshua Peek
e5ae9c328b Use language name as hash key 2012-06-08 13:43:57 -05:00
Joshua Peek
e0c777d995 Fix test name 2012-06-08 13:43:37 -05:00
Joshua Peek
f747b49347 Add simple classifier 2012-06-07 17:10:28 -05:00
Joshua Peek
e0cbe815a3 Add basic Tokenizer 2012-06-07 14:55:11 -05:00
14 changed files with 19725 additions and 57 deletions

View File

@@ -1,3 +1,4 @@
require 'rake/clean'
require 'rake/testtask'
task :default => :test
@@ -5,3 +6,63 @@ task :default => :test
# Standard unit-test task (`rake test`); warnings are enabled so the
# suite surfaces Ruby verbose-mode issues.
Rake::TestTask.new do |t|
t.warning = true
end
# Regenerate the persisted classifier db whenever any test fixture
# changes — the fixtures double as the classifier's training corpus.
file 'lib/linguist/classifier.yml' => Dir['test/fixtures/**/*'] do |f|
require 'linguist/sample'
# Train a fresh classifier from every sample fixture.
classifier = Linguist::Sample.classifier
# Serialize to the task's target path via YAML.
File.open(f.name, 'w') { |io| YAML.dump(classifier, io) }
end
# `rake clobber` removes the generated db.
CLOBBER.include 'lib/linguist/classifier.yml'
# `rake classifier` rebuilds the db if any fixture is newer than it.
task :classifier => ['lib/linguist/classifier.yml']
namespace :classifier do
# Number of successfully classified gist files to sample before stopping.
LIMIT = 1_000
desc "Run classifier against #{LIMIT} public gists"
task :test do
require 'linguist/classifier'
total, correct, incorrect = 0, 0, 0
# Unbuffered stdout so the in-place progress line updates immediately.
$stdout.sync = true
each_public_gist do |gist_url, file_url, file_language|
# Skip unlabeled files; 'Text' is too generic to score fairly.
next if file_language.nil? || file_language == 'Text'
begin
data = open(file_url).read
# classify returns results sorted best-first; take the top guess.
guessed_language, score = Linguist::Classifier.instance.classify(data).first
total += 1
guessed_language.name == file_language ? correct += 1 : incorrect += 1
# "\r\e[0K" rewinds and clears the line: a single-line progress meter.
print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100]
$stdout.flush
# Deliberately empty rescue: files with bad raw URLs are skipped.
rescue URI::InvalidURIError
# The else clause runs only when no exception was raised, so
# failed fetches never trigger the LIMIT cutoff.
else
break if total >= LIMIT
end
end
puts ""
end
# Internal: Yield (gist url, raw file url, language name) for every file
# of every public gist, following the API's Link-header pagination.
# Loops indefinitely; the caller is expected to break.
def each_public_gist
require 'open-uri'
require 'json'
url = "https://api.github.com/gists/public"
loop do
resp = open(url)
# Pull the rel="next" page URL out of the Link response header.
url = resp.meta['link'][/<([^>]+)>; rel="next"/, 1]
gists = JSON.parse(resp.read)
for gist in gists
for filename, attrs in gist['files']
yield gist['url'], attrs['raw_url'], attrs['language']
end
end
end
end
end

View File

@@ -1,6 +1,6 @@
Gem::Specification.new do |s|
s.name = 'github-linguist'
s.version = '1.0.0'
s.version = '2.0.0'
s.summary = "GitHub Language detection"
s.authors = "GitHub"
@@ -12,5 +12,6 @@ Gem::Specification.new do |s|
s.add_dependency 'escape_utils', '~> 0.2.3'
s.add_dependency 'mime-types', '~> 1.18'
s.add_dependency 'pygments.rb', '~> 0.2.13'
s.add_development_dependency 'json'
s.add_development_dependency 'rake'
end

View File

@@ -1,3 +1,4 @@
require 'linguist/classifier'
require 'linguist/language'
require 'linguist/mime'
require 'linguist/pathname'
@@ -453,8 +454,15 @@ module Linguist
# Returns a Language or nil.
def disambiguate_extension_language
if Language.ambiguous?(extname)
name = "guess_#{extname.sub(/^\./, '')}_language"
send(name) if respond_to?(name)
# name = "guess_#{extname.sub(/^\./, '')}_language"
# send(name) if respond_to?(name)
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
if possible_languages.any?
if result = Classifier.instance.classify(data, possible_languages).first
result[0]
end
end
end
end

183
lib/linguist/classifier.rb Normal file
View File

@@ -0,0 +1,183 @@
require 'linguist/language'
require 'linguist/tokenizer'
module Linguist
# Language bayesian classifier.
#
# Counts token occurrences per language from training samples, then
# scores unknown data with naive Bayes. Scores are sums of log
# probabilities (see #classify), which avoids float underflow on
# long token streams.
class Classifier
# Internal: Path to persisted classifier db.
PATH = File.expand_path('../classifier.yml', __FILE__)
# Public: Check if persisted db exists on disk.
#
# Returns Boolean.
def self.exist?
File.exist?(PATH)
end
# Public: Get persisted Classifier instance.
#
# Memoized; loads the YAML db from PATH on first access.
#
# Returns Classifier.
def self.instance
@instance ||= YAML.load_file(PATH)
end
# Public: Initialize a Classifier.
def initialize
# Total tokens seen across all languages.
@tokens_total = 0
# Total training samples seen across all languages.
@languages_total = 0
# language name => { token => count }
@tokens = Hash.new { |h, k| h[k] = Hash.new(0) }
# language name => total token count for that language
@language_tokens = Hash.new(0)
# language name => number of training samples
@languages = Hash.new(0)
end
# Public: Compare Classifier objects.
#
# other - Classifier object to compare to.
#
# NOTE: only the aggregate totals are compared (cheap staleness
# check), so two classifiers with equal totals but different token
# tables still compare equal.
#
# Returns Boolean.
def eql?(other)
# Lazy fast check counts only
other.is_a?(self.class) &&
@tokens_total == other.instance_variable_get(:@tokens_total) &&
@languages_total == other.instance_variable_get(:@languages_total)
end
alias_method :==, :eql?
# Public: Train classifier that data is a certain language.
#
# language - Language of data
# data - String contents of file
#
# Examples
#
# train(Language['Ruby'], "def hello; end")
#
# Returns nothing.
def train(language, data)
language = language.name
tokens = Tokenizer.new(data).tokens
tokens.each do |token|
@tokens[language][token] += 1
@language_tokens[language] += 1
@tokens_total += 1
end
@languages[language] += 1
@languages_total += 1
nil
end
# Public: Verify internal counts are consistent.
#
# The per-language sample counts, per-language token counts and the
# full token table must all agree with the cached totals.
#
# Returns Boolean.
def verify
@languages.inject(0) { |n, (l, c)| n += c } == @languages_total &&
@language_tokens.inject(0) { |n, (l, c)| n += c } == @tokens_total &&
@tokens.inject(0) { |n, (l, ts)| n += ts.inject(0) { |m, (t, c)| m += c } } == @tokens_total
end
# Public: Prune infrequent tokens.
#
# Currently a no-op placeholder: no pruning is performed.
#
# Returns receiver Classifier instance.
def gc
self
end
# Public: Guess language of data.
#
# data - Array of tokens or String data to analyze.
# languages - Array of Languages to restrict to.
#
# Examples
#
# classify("def hello; end")
# # => [ [Language['Ruby'], -43.2], [Language['Python'], -86.4], ... ]
#
# Returns sorted Array of result pairs. Each pair contains the
# Language and a Float score. The score is a sum of log
# probabilities, so it is typically negative; higher (closer to
# zero) ranks first.
def classify(tokens, languages = @languages.keys)
tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
scores = {}
languages.each do |language|
language_name = language.is_a?(Language) ? language.name : language
scores[language_name] = tokens_probability(tokens, language_name) +
language_probability(language_name)
end
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
end
# Internal: Probability of set of tokens in a language occurring - P(D | C)
#
# tokens - Array of String tokens.
# language - Language to check.
#
# Returns Float sum of log probabilities (<= 0.0), not a plain
# probability in [0, 1].
def tokens_probability(tokens, language)
tokens.inject(0.0) do |sum, token|
sum += Math.log(token_probability(token, language))
end
end
# Internal: Probability of token in language occurring - P(F | C)
#
# token - String token.
# language - Language to check.
#
# Tokens never seen in the language fall back to 1 / total token
# count — a smoothing floor so Math.log never receives zero.
#
# Returns Float between 0.0 and 1.0.
def token_probability(token, language)
if @tokens[language][token].to_f == 0.0
1 / @tokens_total.to_f
else
@tokens[language][token].to_f / @language_tokens[language].to_f
end
end
# Internal: Probability of a language occurring - P(C)
#
# language - Language to check.
#
# Returns Float log of the language's prior (<= 0.0).
def language_probability(language)
Math.log(@languages[language].to_f / @languages_total.to_f)
end
# Public: Serialize classifier to YAML.
#
# io - IO object to write the YAML document to.
#
# Hand-rolled emitter: every section is emitted with sorted keys so
# the persisted db is deterministic and diffs cleanly under version
# control.
#
# Returns nothing.
def to_yaml(io)
data = "--- !ruby/object:Linguist::Classifier\n"
data << "languages_total: #{@languages_total}\n"
data << "tokens_total: #{@tokens_total}\n"
data << "languages:\n"
@languages.sort.each do |language, count|
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
end
data << "language_tokens:\n"
@language_tokens.sort.each do |language, count|
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
end
data << "tokens:\n"
@tokens.sort.each do |language, tokens|
data << " #{{language => true}.to_yaml.lines.to_a[1].sub(/ true/, "")}"
tokens.sort.each do |token, count|
data << " #{{token => count}.to_yaml.lines.to_a[1]}"
end
end
io.write data
nil
end
end
# Eager load instance
Classifier.instance if Classifier.exist?
end

19013
lib/linguist/classifier.yml Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -26,7 +26,7 @@ module Linguist
@overrides.include?(extension)
end
# Include?: Return overridden extensions.
# Internal: Return overridden extensions.
#
# Returns extensions Array.
def self.overridden_extensions

74
lib/linguist/sample.rb Normal file
View File

@@ -0,0 +1,74 @@
require 'linguist/classifier'
require 'linguist/language'
module Linguist
# Model for accessing classifier training data.
class Sample
# Samples live in test/ for now, we'll eventually move them out
PATH = File.expand_path("../../../test/fixtures", __FILE__)
# Public: Iterate over each Sample.
#
# Walks every fixture directory (one per language alias) under PATH
# and yields a Sample for each file inside.
#
# &block - Yields Sample to block
#
# Returns nothing.
# Raises RuntimeError if a fixture directory name has no matching
# Language alias.
def self.each(&block)
Dir.entries(PATH).each do |category|
next if category == '.' || category == '..'
# Skip text and binary for now
# Possibly reconsider this later
next if category == 'text' || category == 'binary'
# Map directory name to a Language alias
language = Linguist::Language.find_by_alias(category)
raise "No language for #{category.inspect}" unless language
dirname = File.join(PATH, category)
Dir.entries(dirname).each do |filename|
next if filename == '.' || filename == '..'
yield new(File.join(dirname, filename), language)
end
end
nil
end
# Public: Build Classifier from all samples.
#
# Trains a fresh Classifier on every sample, then calls gc
# (currently a no-op) before returning it.
#
# Returns trained Classifier.
def self.classifier
classifier = Classifier.new
each { |sample| classifier.train(sample.language, sample.data) }
classifier.gc
end
# Internal: Initialize Sample.
#
# Samples should be initialized by Sample.each.
#
# path - String full path to file.
# language - Language of sample.
def initialize(path, language)
@path = path
@language = language
end
# Public: Get full path to file.
#
# Returns String.
attr_reader :path
# Public: Get sample language.
#
# Returns Language.
attr_reader :language
# Public: Read file contents.
#
# Reads from disk on every call; nothing is cached.
#
# Returns String.
def data
File.read(path)
end
end
end

157
lib/linguist/tokenizer.rb Normal file
View File

@@ -0,0 +1,157 @@
# FIX: this file uses StringScanner but never loaded it, so requiring
# 'linguist/tokenizer' on its own raised NameError unless some other file
# had already pulled in strscan.
require 'strscan'

module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Initialize a Tokenizer.
    #
    # data - String data to scan.
    def initialize(data)
      @data = data
    end

    # Public: Get source data.
    #
    # Returns String.
    attr_reader :data

    # Public: Extract tokens from data.
    #
    # Returns Array of token Strings.
    def tokens
      extract_tokens(data)
    end

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)
      tokens = []
      until s.eos?
        # Ruby single line comment — keep the marker, drop the text.
        if token = s.scan(/# /)
          tokens << "#"
          s.skip_until(/\n|\Z/)
        # C style single line comment
        elsif token = s.scan(/\/\/ /)
          tokens << "//"
          s.skip_until(/\n|\Z/)
        # Leading Tex or Matlab comments ("%" at start of a line)
        elsif token = s.scan(/\n%/)
          tokens << "%"
          s.skip_until(/\n|\Z/)
        # C multiline comments — keep both delimiters as tokens.
        elsif token = s.scan(/\/\*/)
          tokens << "/*"
          s.skip_until(/\*\//)
          tokens << "*/"
        # Haskell multiline comments
        elsif token = s.scan(/\{-/)
          tokens << "{-"
          s.skip_until(/-\}/)
          tokens << "-}"
        # XML multiline comments
        elsif token = s.scan(/<!--/)
          tokens << "<!--"
          s.skip_until(/-->/)
          tokens << "-->"
        # Skip single or double quoted strings; [^\\] lets the scan jump
        # over backslash-escaped quotes inside the literal.
        elsif s.scan(/"/)
          s.skip_until(/[^\\]"/)
        elsif s.scan(/'/)
          s.skip_until(/[^\\]'/)
        # Skip number literals (decimal and 0x hex)
        elsif s.scan(/(0x)?\d+/)
        # SGML style brackets — tokenized further by extract_sgml_tokens.
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }
        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)/)
          tokens << token
        # Regular token (identifiers, plus # @ . / * so things like
        # #include, @interface and paths survive as single tokens)
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token
        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token
        else
          # Anything unrecognized is consumed one char at a time.
          s.getch
        end
      end
      tokens
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)
      tokens = []
      until s.eos?
        # Emit start token — tag name normalized as "<name>" / "</name>".
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"
        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token
          # Then skip over attribute value (quoted or bare)
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end
        # Emit lone attributes (no value)
        elsif token = s.scan(/\w+/)
          tokens << token
        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate
        else
          s.getch
        end
      end
      tokens
    end
  end
end

9
test/fixtures/matlab/average.m vendored Normal file
View File

@@ -0,0 +1,9 @@
function y = average(x)
% AVERAGE Mean of vector elements.
% AVERAGE(X), where X is a vector, is the mean of vector
% elements. Nonvector input results in an error.
[m,n] = size(x);
if (~((m == 1) | (n == 1)) | (m == 1 & n == 1))
error('Input must be a vector')
end
y = sum(x)/length(x);

38
test/fixtures/matlab/make_filter.m vendored Normal file
View File

@@ -0,0 +1,38 @@
function [filtfcn, statefcn] = makeFilter(b, a)
% FILTFCN = MAKEFILTER(B, A) creates an IIR filtering
% function and returns it in the form of a function handle,
% FILTFCN. Each time you call FILTFCN with a new filter
% input value, it computes the corresponding new filter
% output value, updating its internal state vector at the
% same time.
%
% [FILTFCN, STATEFCN] = MAKEFILTER(B, A) also returns a
% function (in the form of a function handle, STATEFCN)
% that can return the filter's internal state. The internal
% state vector is in the form of a transposed direct form
% II delay line.
% Initialize state vector. To keep this example a bit
% simpler, assume that a and b have the same length.
% Also assume that a(1) is 1.
v = zeros(size(a));
filtfcn = @iirFilter;
statefcn = @getState;
function yn = iirFilter(xn)
% Update the state vector
v(1) = v(2) + b(1) * xn;
v(2:end-1) = v(3:end) + b(2:end-1) * xn - ...
a(2:end-1) * v(1);
v(end) = b(end) * xn - a(end) * v(1);
% Output is the first element of the state vector.
yn = v(1);
end
function vOut = getState
vOut = v;
end
end

View File

@@ -1,33 +0,0 @@
function ret = matlab_function2(A,B)
% Simple function that combines two values using function handles and displays
% the return value
% create function handles
fun1=@interface;
fun2=@implementation;
fun3=@property;
fun4=@synthesize;
% use function handles
ret = fun1(A)+fun2(A)+fun3(B)+fun4(B);
% Display the return value
disp('Return value in function');
disp(ret);
function A=interface(A)
% simple sub-function with same name Objective-C @keyword
A=2*A;
function A=implementation(A)
% simple sub-function with same name Objective-C @keyword
A=A^2;
function B=property(B)
% simple sub-function with same name Objective-C @keyword
B=2*B;
function B=synthesize(B)
% simple sub-function with same name Objective-C @keyword
B=B^2;

View File

@@ -1,4 +1,5 @@
require 'linguist/file_blob'
require 'linguist/sample'
require 'test/unit'
require 'mime/types'
@@ -24,23 +25,6 @@ class TestBlob < Test::Unit::TestCase
blob
end
def each_language_fixture
Dir["#{fixtures_path}/*"].each do |path|
name = File.basename(path)
if name == 'text' || name == 'binary'
next
else
assert language = Language.find_by_alias(name), "No language alias for #{name.inspect}"
end
Dir.entries(path).each do |filename|
next if filename == '.' || filename == '..'
yield language, blob(File.join(path, filename))
end
end
end
def test_name
assert_equal "foo.rb", blob("foo.rb").name
end
@@ -291,9 +275,9 @@ class TestBlob < Test::Unit::TestCase
end
def test_language
# Drop any files under test/fixtures/LANGUAGE
each_language_fixture do |language, blob|
assert_equal language, blob.language, blob.name
Sample.each do |sample|
blob = blob(sample.path)
assert_equal sample.language, blob.language, blob.name
end
end

82
test/test_classifier.rb Normal file
View File

@@ -0,0 +1,82 @@
require 'linguist/classifier'
require 'linguist/language'
require 'linguist/sample'
require 'linguist/tokenizer'
require 'test/unit'
# Unit tests for Linguist::Classifier: training, classification,
# restricted candidate sets, and consistency of the persisted db.
class TestClassifier < Test::Unit::TestCase
include Linguist
# Internal: Absolute path to test/fixtures.
def fixtures_path
File.expand_path("../fixtures", __FILE__)
end
# Internal: Read a fixture file relative to fixtures_path.
def fixture(name)
File.read(File.join(fixtures_path, name))
end
# A stale persisted db only warns; compare uses Classifier#eql?,
# which checks aggregate counts only.
def test_instance_freshness
# Just warn, it shouldn't scare people off by breaking the build.
unless Classifier.instance.eql?(Linguist::Sample.classifier)
warn "Classifier database is out of date. Run `bundle exec rake classifier`."
end
end
# Train on Ruby and Objective-C samples; an unseen Objective-C file
# should rank Objective-C first whether passed as a String or as a
# pre-tokenized Array.
def test_classify
classifier = Classifier.new
classifier.train Language["Ruby"], fixture("ruby/foo.rb")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
results = classifier.classify(fixture("objective-c/hello.m"))
assert_equal Language["Objective-C"], results.first[0]
tokens = Tokenizer.new(fixture("objective-c/hello.m")).tokens
results = classifier.classify(tokens)
assert_equal Language["Objective-C"], results.first[0]
end
# Restricting the candidate languages forces the winner to come from
# that set, even when it is the wrong language.
def test_restricted_classify
classifier = Classifier.new
classifier.train Language["Ruby"], fixture("ruby/foo.rb")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Objective-C"]])
assert_equal Language["Objective-C"], results.first[0]
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Ruby"]])
assert_equal Language["Ruby"], results.first[0]
end
def test_instance_classify_empty
results = Classifier.instance.classify("")
assert results.first[1] < 0.5, results.first.inspect
end
# The persisted db's internal counts must be self-consistent.
def test_verify
assert Classifier.instance.verify
end
# gc should run without raising (currently a no-op).
def test_gc
Classifier.instance.gc
end
# For every sample whose extension is shared by several languages,
# the persisted classifier should pick the sample's own language.
def test_classify_ambiguous_languages
Sample.each do |sample|
# TODO: These tests are pending
next if sample.path =~ /hello.h/
next if sample.path =~ /MainMenuViewController.h/
next unless sample.language.overrides.any?
extname = File.extname(sample.path)
languages = Language.all.select { |l| l.extensions.include?(extname) }
next unless languages.length > 1
results = Classifier.instance.classify(sample.data, languages)
assert_equal sample.language, results.first[0], "#{sample.path}\n#{results.inspect}"
end
end
end

91
test/test_tokenizer.rb Normal file
View File

@@ -0,0 +1,91 @@
require 'linguist/tokenizer'
require 'test/unit'
# Unit tests for Linguist::Tokenizer: literal/comment stripping, SGML
# tag handling, operator tokens, and whole-fixture token runs.
class TestTokenizer < Test::Unit::TestCase
include Linguist
# Internal: Absolute path to test/fixtures.
def fixtures_path
File.expand_path("../fixtures", __FILE__)
end
# Internal: Tokenize a raw String, or a fixture file when given a
# Symbol path such as :"c/hello.c".
def tokenize(data)
data = File.read(File.join(fixtures_path, data.to_s)) if data.is_a?(Symbol)
Tokenizer.new(data).tokens
end
# String literals (including escaped quotes) are dropped entirely.
def test_skip_string_literals
assert_equal %w(print), tokenize('print ""')
assert_equal %w(print), tokenize('print "Josh"')
assert_equal %w(print), tokenize("print 'Josh'")
assert_equal %w(print), tokenize('print "Hello \"Josh\""')
assert_equal %w(print), tokenize("print 'Hello \\'Josh\\''")
end
# Decimal and hex number literals are dropped.
def test_skip_number_literals
assert_equal %w(+), tokenize('1 + 1')
assert_equal %w(add \( \)), tokenize('add(123, 456)')
assert_equal %w(|), tokenize('0x01 | 0x10')
end
# Comment bodies are dropped but their delimiters remain as tokens.
def test_skip_comments
assert_equal %w(foo #), tokenize("foo # Comment")
assert_equal %w(foo # bar), tokenize("foo # Comment\nbar")
assert_equal %w(foo //), tokenize("foo // Comment")
assert_equal %w(foo /* */), tokenize("foo /* Comment */")
assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")
assert_equal %w(foo {- -}), tokenize("foo {- Comment -}")
assert_equal %w(% %), tokenize("2 % 10\n% Comment")
end
# Tags normalize to "<name>"; attributes keep a trailing "=" with
# their values dropped.
def test_sgml_tags
assert_equal %w(<html> </html>), tokenize("<html></html>")
assert_equal %w(<div> id </div>), tokenize("<div id></div>")
assert_equal %w(<div> id= </div>), tokenize("<div id=foo></div>")
assert_equal %w(<div> id class </div>), tokenize("<div id class></div>")
assert_equal %w(<div> id= </div>), tokenize("<div id=\"foo bar\"></div>")
assert_equal %w(<div> id= </div>), tokenize("<div id='foo bar'></div>")
assert_equal %w(<?xml> version=), tokenize("<?xml version=\"1.0\"?>")
end
# Operators survive as tokens while their numeric operands are dropped.
def test_operators
assert_equal %w(+), tokenize("1 + 1")
assert_equal %w(-), tokenize("1 - 1")
assert_equal %w(*), tokenize("1 * 1")
assert_equal %w(/), tokenize("1 / 1")
assert_equal %w(%), tokenize("2 % 5")
assert_equal %w(&), tokenize("1 & 1")
assert_equal %w(&&), tokenize("1 && 1")
assert_equal %w(|), tokenize("1 | 1")
assert_equal %w(||), tokenize("1 || 1")
assert_equal %w(<), tokenize("1 < 0x01")
assert_equal %w(<<), tokenize("1 << 0x01")
end
def test_c_tokens
assert_equal %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif), tokenize(:"c/hello.h")
assert_equal %w(#include <stdio.h> int main \( \) { printf \( \) ; return ; }), tokenize(:"c/hello.c")
end
def test_cpp_tokens
assert_equal %w(class Bar { protected char *name ; public void hello \( \) ; }), tokenize(:"cpp/bar.h")
assert_equal %w(#include <iostream> using namespace std ; int main \( \) { cout << << endl ; }), tokenize(:"cpp/hello.cpp")
end
def test_objective_c_tokens
assert_equal %w(#import <Foundation/Foundation.h> @interface Foo NSObject { } @end), tokenize(:"objective-c/Foo.h")
assert_equal %w(#import @implementation Foo @end), tokenize(:"objective-c/Foo.m")
assert_equal %w(#import <Cocoa/Cocoa.h> int main \( int argc char *argv \) { NSLog \( @ \) ; return ; }), tokenize(:"objective-c/hello.m")
end
def test_javascript_tokens
assert_equal %w( \( function \( \) { console.log \( \) ; } \) .call \( this \) ;), tokenize(:"javascript/hello.js")
end
def test_ruby_tokens
assert_equal %w(module Foo end), tokenize(:"ruby/foo.rb")
assert_equal %w(# /usr/bin/env ruby puts), tokenize(:"ruby/script.rb")
assert_equal %w(task default do puts end), tokenize(:"ruby/Rakefile")
end
end