Merge pull request #172 from github/bayesian

Bayesian Classifier
Joshua Peek
2012-06-21 09:26:54 -07:00
14 changed files with 19724 additions and 56 deletions

Rakefile (modified)

@@ -1,3 +1,4 @@
require 'rake/clean'
require 'rake/testtask'
task :default => :test
@@ -5,3 +6,63 @@ task :default => :test
Rake::TestTask.new do |t|
t.warning = true
end
file 'lib/linguist/classifier.yml' => Dir['test/fixtures/**/*'] do |f|
require 'yaml'
require 'linguist/sample'
classifier = Linguist::Sample.classifier
File.open(f.name, 'w') { |io| YAML.dump(classifier, io) }
end
CLOBBER.include 'lib/linguist/classifier.yml'
task :classifier => ['lib/linguist/classifier.yml']
namespace :classifier do
LIMIT = 1_000
desc "Run classifier against #{LIMIT} public gists"
task :test do
require 'linguist/classifier'
total, correct, incorrect = 0, 0, 0
$stdout.sync = true
each_public_gist do |gist_url, file_url, file_language|
next if file_language.nil? || file_language == 'Text'
begin
data = open(file_url).read
guessed_language, score = Linguist::Classifier.instance.classify(data).first
total += 1
guessed_language.name == file_language ? correct += 1 : incorrect += 1
print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100]
$stdout.flush
rescue URI::InvalidURIError
else
break if total >= LIMIT
end
end
puts ""
end
def each_public_gist
require 'open-uri'
require 'json'
url = "https://api.github.com/gists/public"
loop do
resp = open(url)
url = resp.meta['link'][/<([^>]+)>; rel="next"/, 1]
gists = JSON.parse(resp.read)
for gist in gists
for filename, attrs in gist['files']
yield gist['url'], attrs['raw_url'], attrs['language']
end
end
end
end
end
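
The gist walker pages through the API by following the Link response header; the regex one-liner above pulls the rel="next" URL out with a capture group. A standalone sketch of just that extraction, against a hypothetical header value:

    link = '<https://api.github.com/gists/public?page=2>; rel="next", ' \
           '<https://api.github.com/gists/public?page=100>; rel="last"'
    next_url = link[/<([^>]+)>; rel="next"/, 1]
    # => "https://api.github.com/gists/public?page=2"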

github-linguist.gemspec (modified)

@@ -12,5 +12,6 @@ Gem::Specification.new do |s|
s.add_dependency 'escape_utils', '~> 0.2.3'
s.add_dependency 'mime-types', '~> 1.18'
s.add_dependency 'pygments.rb', '~> 0.2.13'
s.add_development_dependency 'json'
s.add_development_dependency 'rake'
end

lib/linguist/blob_helper.rb (modified)

@@ -1,3 +1,4 @@
require 'linguist/classifier'
require 'linguist/language'
require 'linguist/mime'
require 'linguist/pathname'
@@ -453,8 +454,15 @@ module Linguist
# Returns a Language or nil.
def disambiguate_extension_language
if Language.ambiguous?(extname)
name = "guess_#{extname.sub(/^\./, '')}_language"
send(name) if respond_to?(name)
# name = "guess_#{extname.sub(/^\./, '')}_language"
# send(name) if respond_to?(name)
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
if possible_languages.any?
if result = Classifier.instance.classify(data, possible_languages).first
result[0]
end
end
end
end
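
With this fallback, a blob whose extension is claimed by several languages is classified by its content instead of falling through to nil. A minimal usage sketch (FileBlob.new(path) and the fixture path are assumptions based on the tests below):

    require 'linguist/file_blob'

    # .h is shared by C, C++ and Objective-C; the classifier breaks the
    # tie using the file's tokens.
    blob = Linguist::FileBlob.new("test/fixtures/objective-c/Foo.h")
    puts blob.language.name # => "Objective-C"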

lib/linguist/classifier.rb (new file, 183 lines)

@@ -0,0 +1,183 @@
require 'yaml'
require 'linguist/language'
require 'linguist/tokenizer'
module Linguist
# Language Bayesian classifier.
class Classifier
# Internal: Path to persisted classifier db.
PATH = File.expand_path('../classifier.yml', __FILE__)
# Public: Check if persisted db exists on disk.
#
# Returns Boolean.
def self.exist?
File.exist?(PATH)
end
# Public: Get persisted Classifier instance.
#
# Returns Classifier.
def self.instance
@instance ||= YAML.load_file(PATH)
end
# Public: Initialize a Classifier.
def initialize
@tokens_total = 0
@languages_total = 0
@tokens = Hash.new { |h, k| h[k] = Hash.new(0) }
@language_tokens = Hash.new(0)
@languages = Hash.new(0)
end
# Public: Compare Classifier objects.
#
# other - Classifier object to compare to.
#
# Returns Boolean.
def eql?(other)
# Lazy fast check: compare counts only
other.is_a?(self.class) &&
@tokens_total == other.instance_variable_get(:@tokens_total) &&
@languages_total == other.instance_variable_get(:@languages_total)
end
alias_method :==, :eql?
# Public: Train classifier that data is a certain language.
#
# language - Language of data
# data - String contents of file
#
# Examples
#
# train(Language['Ruby'], "def hello; end")
#
# Returns nothing.
def train(language, data)
language = language.name
tokens = Tokenizer.new(data).tokens
tokens.each do |token|
@tokens[language][token] += 1
@language_tokens[language] += 1
@tokens_total += 1
end
@languages[language] += 1
@languages_total += 1
nil
end
# Public: Verify internal counts are consistent.
#
# Returns Boolean.
def verify
@languages.inject(0) { |n, (l, c)| n += c } == @languages_total &&
@language_tokens.inject(0) { |n, (l, c)| n += c } == @tokens_total &&
@tokens.inject(0) { |n, (l, ts)| n += ts.inject(0) { |m, (t, c)| m += c } } == @tokens_total
end
# Public: Prune infrequent tokens.
#
# Currently a no-op placeholder so callers can rely on the API.
#
# Returns receiver Classifier instance.
def gc
self
end
# Public: Guess language of data.
#
# tokens    - Array of tokens or String data to analyze.
# languages - Array of Languages to restrict to.
#
# Examples
#
# classify("def hello; end")
# # => [ [Language['Ruby'], 0.90], [Language['Python'], 0.2], ... ]
#
# Returns Array of result pairs sorted by score, best first. Each
# pair contains the Language and a log probability Float score.
def classify(tokens, languages = @languages.keys)
tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
scores = {}
languages.each do |language|
language_name = language.is_a?(Language) ? language.name : language
scores[language_name] = tokens_probability(tokens, language_name) +
language_probability(language_name)
end
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
end
# Internal: Probability of a set of tokens occurring in a language - P(D | C)
#
# tokens - Array of String tokens.
# language - Language to check.
#
# Returns summed log probability as a negative Float.
def tokens_probability(tokens, language)
tokens.inject(0.0) do |sum, token|
sum += Math.log(token_probability(token, language))
end
end
# Internal: Probability of a token occurring in a language - P(F | C)
#
# token - String token.
# language - Language to check.
#
# Returns Float between 0.0 and 1.0.
def token_probability(token, language)
if @tokens[language][token].to_f == 0.0
1 / @tokens_total.to_f
else
@tokens[language][token].to_f / @language_tokens[language].to_f
end
end
# Internal: Probability of a language occurring - P(C)
#
# language - Language to check.
#
# Returns log probability as a negative Float.
def language_probability(language)
Math.log(@languages[language].to_f / @languages_total.to_f)
end
# Public: Serialize classifier to YAML.
#
# io - IO object to write the YAML document to.
#
# Returns nothing.
def to_yaml(io)
data = "--- !ruby/object:Linguist::Classifier\n"
data << "languages_total: #{@languages_total}\n"
data << "tokens_total: #{@tokens_total}\n"
data << "languages:\n"
@languages.sort.each do |language, count|
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
end
data << "language_tokens:\n"
@language_tokens.sort.each do |language, count|
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
end
data << "tokens:\n"
@tokens.sort.each do |language, tokens|
data << " #{{language => true}.to_yaml.lines.to_a[1].sub(/ true/, "")}"
tokens.sort.each do |token, count|
data << " #{{token => count}.to_yaml.lines.to_a[1]}"
end
end
io.write data
nil
end
end
# Eager load instance
Classifier.instance if Classifier.exist?
end
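
The score classify assigns is plain naive Bayes done in log space: log P(C) plus the sum of log P(token | C) over the document, with unseen tokens smoothed to 1 / tokens_total. Working in logs keeps hundreds of tiny probabilities from underflowing to 0.0. A toy rerun of the arithmetic with made-up counts (not the shipped database):

    # Two trained languages, so P(Ruby) = 0.5. Of 10 training tokens
    # overall, 8 were Ruby; "def" was seen 3 times, "zap" never.
    p_ruby           = 0.5
    p_def_given_ruby = 3.0 / 8.0   # seen: token count / language token count
    p_zap_given_ruby = 1.0 / 10.0  # unseen: 1 / tokens_total

    score = Math.log(p_ruby) + Math.log(p_def_given_ruby) + Math.log(p_zap_given_ruby)
    # => about -3.98. Scores are negative and "best" means closest to
    # zero, hence the descending sort in classify.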

lib/linguist/classifier.yml (new file, 19013 lines; diff suppressed because it is too large)

lib/linguist/language.rb (modified)

@@ -26,7 +26,7 @@ module Linguist
@overrides.include?(extension)
end
-# Include?: Return overridden extensions.
+# Internal: Return overridden extensions.
#
# Returns extensions Array.
def self.overridden_extensions

lib/linguist/sample.rb (new file, 74 lines)

@@ -0,0 +1,74 @@
require 'linguist/classifier'
require 'linguist/language'
module Linguist
# Model for accessing classifier training data.
class Sample
# Samples live in test/ for now; we'll eventually move them out
PATH = File.expand_path("../../../test/fixtures", __FILE__)
# Public: Iterate over each Sample.
#
# &block - Yields Sample to block
#
# Returns nothing.
def self.each(&block)
Dir.entries(PATH).each do |category|
next if category == '.' || category == '..'
# Skip text and binary for now
# Possibly reconsider this later
next if category == 'text' || category == 'binary'
# Map directory name to a Language alias
language = Linguist::Language.find_by_alias(category)
raise "No language for #{category.inspect}" unless language
dirname = File.join(PATH, category)
Dir.entries(dirname).each do |filename|
next if filename == '.' || filename == '..'
yield new(File.join(dirname, filename), language)
end
end
nil
end
# Public: Build Classifier from all samples.
#
# Returns trained Classifier.
def self.classifier
classifier = Classifier.new
each { |sample| classifier.train(sample.language, sample.data) }
classifier.gc
end
# Internal: Initialize Sample.
#
# Samples should be initialized by Sample.each.
#
# path - String full path to file.
# language - Language of sample.
def initialize(path, language)
@path = path
@language = language
end
# Public: Get full path to file.
#
# Returns String.
attr_reader :path
# Public: Get sample language.
#
# Returns Language.
attr_reader :language
# Public: Read file contents.
#
# Returns String.
def data
File.read(path)
end
end
end
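
Sample is what the Rakefile's file task drives; regenerating the database by hand amounts to (a sketch, assuming the fixture layout above):

    require 'yaml'
    require 'linguist/sample'

    classifier = Linguist::Sample.classifier # trains on every fixture
    File.open('lib/linguist/classifier.yml', 'w') do |io|
      YAML.dump(classifier, io) # routed through the custom to_yaml
    end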

lib/linguist/tokenizer.rb (new file, 157 lines)

@@ -0,0 +1,157 @@
module Linguist
# Generic programming language tokenizer.
#
# Tokens are designed for use in the language Bayes classifier.
# The tokenizer strips string and comment contents and preserves
# significant language symbols.
class Tokenizer
# Public: Initialize a Tokenizer.
#
# data - String data to scan.
def initialize(data)
@data = data
end
# Public: Get source data.
#
# Returns String.
attr_reader :data
# Public: Extract tokens from data.
#
# Returns Array of token Strings.
def tokens
extract_tokens(data)
end
# Internal: Extract generic tokens from data.
#
# data - String to scan.
#
# Examples
#
# extract_tokens("printf('Hello')")
# # => ['printf', '(', ')']
#
# Returns Array of token Strings.
def extract_tokens(data)
s = StringScanner.new(data)
tokens = []
until s.eos?
# Ruby single line comment
if token = s.scan(/# /)
tokens << "#"
s.skip_until(/\n|\Z/)
# C style single line comment
elsif token = s.scan(/\/\/ /)
tokens << "//"
s.skip_until(/\n|\Z/)
# Leading Tex or Matlab comments
elsif token = s.scan(/\n%/)
tokens << "%"
s.skip_until(/\n|\Z/)
# C multiline comments
elsif token = s.scan(/\/\*/)
tokens << "/*"
s.skip_until(/\*\//)
tokens << "*/"
# Haskell multiline comments
elsif token = s.scan(/\{-/)
tokens << "{-"
s.skip_until(/-\}/)
tokens << "-}"
# XML multiline comments
elsif token = s.scan(/<!--/)
tokens << "<!--"
s.skip_until(/-->/)
tokens << "-->"
# Skip single or double quoted strings
elsif s.scan(/"/)
s.skip_until(/[^\\]"/)
elsif s.scan(/'/)
s.skip_until(/[^\\]'/)
# Skip number literals
elsif s.scan(/(0x)?\d+/)
# SGML style brackets
elsif token = s.scan(/<[^\s<>][^<>]*>/)
extract_sgml_tokens(token).each { |t| tokens << t }
# Common programming punctuation
elsif token = s.scan(/;|\{|\}|\(|\)/)
tokens << token
# Regular token
elsif token = s.scan(/[\w\.@#\/\*]+/)
tokens << token
# Common operators
elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
tokens << token
else
s.getch
end
end
tokens
end
# Internal: Extract tokens from inside SGML tag.
#
# data - SGML tag String.
#
# Examples
#
# extract_sgml_tokens("<a href='' class=foo>")
# # => ["<a>", "href="]
#
# Returns Array of token Strings.
def extract_sgml_tokens(data)
s = StringScanner.new(data)
tokens = []
until s.eos?
# Emit start token
if token = s.scan(/<\/?[^\s>]+/)
tokens << "#{token}>"
# Emit attributes with trailing =
elsif token = s.scan(/\w+=/)
tokens << token
# Then skip over attribute value
if s.scan(/"/)
s.skip_until(/[^\\]"/)
elsif s.scan(/'/)
s.skip_until(/[^\\]'/)
else
s.skip_until(/\w+/)
end
# Emit lone attributes
elsif token = s.scan(/\w+/)
tokens << token
# Stop at the end of the tag
elsif s.scan(/>/)
s.terminate
else
s.getch
end
end
tokens
end
end
end
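
A quick check of what the scanner keeps and strips, per the rules above (a sketch run against this commit's Tokenizer):

    require 'linguist/tokenizer'

    Linguist::Tokenizer.new(%(printf("Hello %d", 42); // greet)).tokens
    # String and number literals and comment bodies are dropped; names,
    # punctuation and comment delimiters survive:
    # => ["printf", "(", ")", ";", "//"]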

test/fixtures/matlab/average.m (new vendored file, 9 lines)

@@ -0,0 +1,9 @@
function y = average(x)
% AVERAGE Mean of vector elements.
% AVERAGE(X), where X is a vector, is the mean of vector
% elements. Nonvector input results in an error.
[m,n] = size(x);
if (~((m == 1) | (n == 1)) | (m == 1 & n == 1))
error('Input must be a vector')
end
y = sum(x)/length(x);

test/fixtures/matlab/make_filter.m (new vendored file, 38 lines)

@@ -0,0 +1,38 @@
function [filtfcn, statefcn] = makeFilter(b, a)
% FILTFCN = MAKEFILTER(B, A) creates an IIR filtering
% function and returns it in the form of a function handle,
% FILTFCN. Each time you call FILTFCN with a new filter
% input value, it computes the corresponding new filter
% output value, updating its internal state vector at the
% same time.
%
% [FILTFCN, STATEFCN] = MAKEFILTER(B, A) also returns a
% function (in the form of a function handle, STATEFCN)
% that can return the filter's internal state. The internal
% state vector is in the form of a transposed direct form
% II delay line.
% Initialize state vector. To keep this example a bit
% simpler, assume that a and b have the same length.
% Also assume that a(1) is 1.
v = zeros(size(a));
filtfcn = @iirFilter;
statefcn = @getState;
function yn = iirFilter(xn)
% Update the state vector
v(1) = v(2) + b(1) * xn;
v(2:end-1) = v(3:end) + b(2:end-1) * xn - ...
a(2:end-1) * v(1);
v(end) = b(end) * xn - a(end) * v(1);
% Output is the first element of the state vector.
yn = v(1);
end
function vOut = getState
vOut = v;
end
end

Deleted MATLAB fixture (33 lines)

@@ -1,33 +0,0 @@
function ret = matlab_function2(A,B)
% Simple function that combines two values using function handles and displays
% the return value
% create function handles
fun1=@interface;
fun2=@implementation;
fun3=@property;
fun4=@synthesize;
% use function handles
ret = fun1(A)+fun2(A)+fun3(B)+fun4(B);
% Display the return value
disp('Return value in function');
disp(ret);
function A=interface(A)
% simple sub-function with same name Objective-C @keyword
A=2*A;
function A=implementation(A)
% simple sub-function with same name Objective-C @keyword
A=A^2;
function B=property(B)
% simple sub-function with same name Objective-C @keyword
B=2*B;
function B=synthesize(B)
% simple sub-function with same name Objective-C @keyword
B=B^2;

test/test_blob.rb (modified)

@@ -1,4 +1,5 @@
require 'linguist/file_blob'
require 'linguist/sample'
require 'test/unit'
require 'mime/types'
@@ -24,23 +25,6 @@ class TestBlob < Test::Unit::TestCase
blob
end
-def each_language_fixture
-Dir["#{fixtures_path}/*"].each do |path|
-name = File.basename(path)
-if name == 'text' || name == 'binary'
-next
-else
-assert language = Language.find_by_alias(name), "No language alias for #{name.inspect}"
-end
-Dir.entries(path).each do |filename|
-next if filename == '.' || filename == '..'
-yield language, blob(File.join(path, filename))
-end
-end
-end
def test_name
assert_equal "foo.rb", blob("foo.rb").name
end
@@ -291,9 +275,9 @@ class TestBlob < Test::Unit::TestCase
end
def test_language
# Drop any files under test/fixtures/LANGUAGE
-each_language_fixture do |language, blob|
-assert_equal language, blob.language, blob.name
+Sample.each do |sample|
+blob = blob(sample.path)
+assert_equal sample.language, blob.language, blob.name
end
end

test/test_classifier.rb (new file, 82 lines)

@@ -0,0 +1,82 @@
require 'linguist/classifier'
require 'linguist/language'
require 'linguist/sample'
require 'linguist/tokenizer'
require 'test/unit'
class TestClassifier < Test::Unit::TestCase
include Linguist
def fixtures_path
File.expand_path("../fixtures", __FILE__)
end
def fixture(name)
File.read(File.join(fixtures_path, name))
end
def test_instance_freshness
# Just warn; it shouldn't scare people off by breaking the build.
unless Classifier.instance.eql?(Linguist::Sample.classifier)
warn "Classifier database is out of date. Run `bundle exec rake classifier`."
end
end
def test_classify
classifier = Classifier.new
classifier.train Language["Ruby"], fixture("ruby/foo.rb")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
results = classifier.classify(fixture("objective-c/hello.m"))
assert_equal Language["Objective-C"], results.first[0]
tokens = Tokenizer.new(fixture("objective-c/hello.m")).tokens
results = classifier.classify(tokens)
assert_equal Language["Objective-C"], results.first[0]
end
def test_restricted_classify
classifier = Classifier.new
classifier.train Language["Ruby"], fixture("ruby/foo.rb")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Objective-C"]])
assert_equal Language["Objective-C"], results.first[0]
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Ruby"]])
assert_equal Language["Ruby"], results.first[0]
end
def test_instance_classify_empty
results = Classifier.instance.classify("")
assert results.first[1] < 0.5, results.first.inspect
end
def test_verify
assert Classifier.instance.verify
end
def test_gc
Classifier.instance.gc
end
def test_classify_ambiguous_languages
Sample.each do |sample|
# TODO: These tests are pending
next if sample.path =~ /hello.h/
next if sample.path =~ /MainMenuViewController.h/
next unless sample.language.overrides.any?
extname = File.extname(sample.path)
languages = Language.all.select { |l| l.extensions.include?(extname) }
next unless languages.length > 1
results = Classifier.instance.classify(sample.data, languages)
assert_equal sample.language, results.first[0], "#{sample.path}\n#{results.inspect}"
end
end
end

test/test_tokenizer.rb (new file, 91 lines)

@@ -0,0 +1,91 @@
require 'linguist/tokenizer'
require 'test/unit'
class TestTokenizer < Test::Unit::TestCase
include Linguist
def fixtures_path
File.expand_path("../fixtures", __FILE__)
end
def tokenize(data)
data = File.read(File.join(fixtures_path, data.to_s)) if data.is_a?(Symbol)
Tokenizer.new(data).tokens
end
def test_skip_string_literals
assert_equal %w(print), tokenize('print ""')
assert_equal %w(print), tokenize('print "Josh"')
assert_equal %w(print), tokenize("print 'Josh'")
assert_equal %w(print), tokenize('print "Hello \"Josh\""')
assert_equal %w(print), tokenize("print 'Hello \\'Josh\\''")
end
def test_skip_number_literals
assert_equal %w(+), tokenize('1 + 1')
assert_equal %w(add \( \)), tokenize('add(123, 456)')
assert_equal %w(|), tokenize('0x01 | 0x10')
end
def test_skip_comments
assert_equal %w(foo #), tokenize("foo # Comment")
assert_equal %w(foo # bar), tokenize("foo # Comment\nbar")
assert_equal %w(foo //), tokenize("foo // Comment")
assert_equal %w(foo /* */), tokenize("foo /* Comment */")
assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")
assert_equal %w(foo {- -}), tokenize("foo {- Comment -}")
assert_equal %w(% %), tokenize("2 % 10\n% Comment")
end
def test_sgml_tags
assert_equal %w(<html> </html>), tokenize("<html></html>")
assert_equal %w(<div> id </div>), tokenize("<div id></div>")
assert_equal %w(<div> id= </div>), tokenize("<div id=foo></div>")
assert_equal %w(<div> id class </div>), tokenize("<div id class></div>")
assert_equal %w(<div> id= </div>), tokenize("<div id=\"foo bar\"></div>")
assert_equal %w(<div> id= </div>), tokenize("<div id='foo bar'></div>")
assert_equal %w(<?xml> version=), tokenize("<?xml version=\"1.0\"?>")
end
def test_operators
assert_equal %w(+), tokenize("1 + 1")
assert_equal %w(-), tokenize("1 - 1")
assert_equal %w(*), tokenize("1 * 1")
assert_equal %w(/), tokenize("1 / 1")
assert_equal %w(%), tokenize("2 % 5")
assert_equal %w(&), tokenize("1 & 1")
assert_equal %w(&&), tokenize("1 && 1")
assert_equal %w(|), tokenize("1 | 1")
assert_equal %w(||), tokenize("1 || 1")
assert_equal %w(<), tokenize("1 < 0x01")
assert_equal %w(<<), tokenize("1 << 0x01")
end
def test_c_tokens
assert_equal %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif), tokenize(:"c/hello.h")
assert_equal %w(#include <stdio.h> int main \( \) { printf \( \) ; return ; }), tokenize(:"c/hello.c")
end
def test_cpp_tokens
assert_equal %w(class Bar { protected char *name ; public void hello \( \) ; }), tokenize(:"cpp/bar.h")
assert_equal %w(#include <iostream> using namespace std ; int main \( \) { cout << << endl ; }), tokenize(:"cpp/hello.cpp")
end
def test_objective_c_tokens
assert_equal %w(#import <Foundation/Foundation.h> @interface Foo NSObject { } @end), tokenize(:"objective-c/Foo.h")
assert_equal %w(#import @implementation Foo @end), tokenize(:"objective-c/Foo.m")
assert_equal %w(#import <Cocoa/Cocoa.h> int main \( int argc char *argv \) { NSLog \( @ \) ; return ; }), tokenize(:"objective-c/hello.m")
end
def test_javascript_tokens
assert_equal %w( \( function \( \) { console.log \( \) ; } \) .call \( this \) ;), tokenize(:"javascript/hello.js")
end
def test_ruby_tokens
assert_equal %w(module Foo end), tokenize(:"ruby/foo.rb")
assert_equal %w(# /usr/bin/env ruby puts), tokenize(:"ruby/script.rb")
assert_equal %w(task default do puts end), tokenize(:"ruby/Rakefile")
end
end