diff --git a/Rakefile b/Rakefile index 753cadf5..d99f8216 100644 --- a/Rakefile +++ b/Rakefile @@ -10,7 +10,8 @@ end file 'lib/linguist/samples.yml' => Dir['samples/**/*'] do |f| require 'linguist/samples' - File.open(f.name, 'w') { |io| Linguist::Samples.serialize_to_yaml(Linguist::Samples::DATA, io) } + yaml = Linguist::Samples.serialize_to_yaml(Linguist::Samples.data) + File.open(f.name, 'w') { |io| io.write yaml } end CLOBBER.include 'lib/linguist/samples.yml' @@ -31,7 +32,7 @@ namespace :classifier do next if file_language.nil? || file_language == 'Text' begin data = open(file_url).read - guessed_language, score = Linguist::Classifier.new(Samples::DATA).classify(data).first + guessed_language, score = Linguist::Classifier.classify(Samples::DATA, data).first total += 1 guessed_language == file_language ? correct += 1 : incorrect += 1 diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 39d62c0d..3e101e43 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -442,7 +442,7 @@ module Linguist if Language.ambiguous?(extname) possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name) if possible_languages.any? - if result = Classifier.new(Samples::DATA).classify(data, possible_languages).first + if result = Classifier.classify(Samples::DATA, data, possible_languages).first Language[result[0]] end end diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb index d50f75d7..ce92af30 100644 --- a/lib/linguist/classifier.rb +++ b/lib/linguist/classifier.rb @@ -3,56 +3,76 @@ require 'linguist/tokenizer' module Linguist # Language bayesian classifier. class Classifier - # Public: Initialize a Classifier.
- def initialize(attrs = {}) - @tokens_total = attrs['tokens_total'] || 0 - @languages_total = attrs['languages_total'] || 0 - @tokens = attrs['tokens'] || {} - @language_tokens = attrs['language_tokens'] || {} - @languages = attrs['languages'] || {} - end - # Public: Train classifier that data is a certain language. # + # db - Hash classifier database object (mutated in place) # language - String language of data # data - String contents of file # # Examples # - # train('Ruby', "def hello; end") + # Classifier.train!(db, 'Ruby', "def hello; end") # # Returns nothing. - def train(language, data) + def self.train!(db, language, data) tokens = Tokenizer.tokenize(data) + db['tokens_total'] ||= 0 + db['languages_total'] ||= 0 + db['tokens'] ||= {} + db['language_tokens'] ||= {} + db['languages'] ||= {} + tokens.each do |token| - @tokens[language] ||= {} - @tokens[language][token] ||= 0 - @tokens[language][token] += 1 - @language_tokens[language] ||= 0 - @language_tokens[language] += 1 - @tokens_total += 1 + db['tokens'][language] ||= {} + db['tokens'][language][token] ||= 0 + db['tokens'][language][token] += 1 + db['language_tokens'][language] ||= 0 + db['language_tokens'][language] += 1 + db['tokens_total'] += 1 end - @languages[language] ||= 0 - @languages[language] += 1 - @languages_total += 1 + db['languages'][language] ||= 0 + db['languages'][language] += 1 + db['languages_total'] += 1 nil end # Public: Guess language of data. # + # db - Hash of classifier tokens database. # data - Array of tokens or String data to analyze. # languages - Array of language name Strings to restrict to. # # Examples # - # classify("def hello; end") + # Classifier.classify(db, "def hello; end") # # => [ 'Ruby', 0.90], ['Python', 0.2], ... ] # # Returns sorted Array of result pairs. Each pair contains the # String language name and a Float score.
- def classify(tokens, languages = @languages.keys) + def self.classify(db, tokens, languages = nil) + languages ||= db['languages'].keys + new(db).classify(tokens, languages) + end + + # Internal: Initialize a Classifier from a Hash classifier database. + def initialize(db = {}) + @tokens_total = db['tokens_total'] + @languages_total = db['languages_total'] + @tokens = db['tokens'] + @language_tokens = db['language_tokens'] + @languages = db['languages'] + end + + # Internal: Guess language of data. + # + # tokens - Array of tokens or String data to analyze. + # languages - Array of language name Strings to restrict to. + # + # Returns sorted Array of result pairs. Each pair contains the + # String language name and a Float score. + def classify(tokens, languages) return [] if tokens.nil? tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String) @@ -99,18 +119,5 @@ module Linguist def language_probability(language) Math.log(@languages[language].to_f / @languages_total.to_f) end - - # Public: Returns serializable hash representation. - # - # Returns Hash. - def to_hash - { - 'tokens_total' => @tokens_total, - 'languages_total' => @languages_total, - 'tokens' => @tokens, - 'language_tokens' => @language_tokens, - 'languages' => @languages - } - end end end diff --git a/lib/linguist/samples.rb b/lib/linguist/samples.rb index 85d3193a..b8902c5f 100644 --- a/lib/linguist/samples.rb +++ b/lib/linguist/samples.rb @@ -22,7 +22,7 @@ module Linguist # # Returns Boolean. def self.outdated? - MD5.hexdigest(DATA) != MD5.hexdigest(classifier.to_hash) + MD5.hexdigest(DATA) != MD5.hexdigest(data) end # Public: Iterate over each sample. @@ -98,52 +98,50 @@ module Linguist # Public: Build Classifier from all samples. # # Returns trained Classifier.
- def self.classifier + def self.data require 'linguist/classifier' require 'linguist/language' - classifier = Classifier.new - each { |sample| + db = {} + each do |sample| language = Language.find_by_alias(sample[:language]) data = File.read(sample[:path]) - classifier.train(language.name, data) - } - classifier + Classifier.train!(db, language.name, data) + end + db end # Public: Serialize samples data to YAML. # - # data - Hash - # io - IO object to write to + # db - Hash classifier database object # - # Returns nothing. - def self.serialize_to_yaml(data, io) - data = "" + # Returns the serialized YAML String. + def self.serialize_to_yaml(db) + out = "" escape = lambda { |s| s.inspect.gsub(/\\#/, "\#") } - data << "languages_total: #{data['languages_total']}\n" - data << "tokens_total: #{data['tokens_total']}\n" + out << "languages_total: #{db['languages_total']}\n" + out << "tokens_total: #{db['tokens_total']}\n" - data << "languages:\n" - data['languages'].sort.each do |language, count| - data << " #{escape.call(language)}: #{count}\n" + out << "languages:\n" + db['languages'].sort.each do |language, count| + out << " #{escape.call(language)}: #{count}\n" end - data << "language_tokens:\n" - data['language_tokens'].sort.each do |language, count| - data << " #{escape.call(language)}: #{count}\n" + out << "language_tokens:\n" + db['language_tokens'].sort.each do |language, count| + out << " #{escape.call(language)}: #{count}\n" end - data << "tokens:\n" - data['tokens'].sort.each do |language, tokens| - data << " #{escape.call(language)}:\n" + out << "tokens:\n" + db['tokens'].sort.each do |language, tokens| + out << " #{escape.call(language)}:\n" tokens.sort.each do |token, count| - data << " #{escape.call(token)}: #{count}\n" + out << " #{escape.call(token)}: #{count}\n" end end - io.write data - nil + out end end end diff --git a/test/test_classifier.rb b/test/test_classifier.rb index 890696fb..33f3bae9 100644 --- a/test/test_classifier.rb +++ b/test/test_classifier.rb @@ -24,39 +24,39 @@ class
TestClassifier < Test::Unit::TestCase end def test_classify - classifier = Classifier.new - classifier.train "Ruby", fixture("ruby/foo.rb") - classifier.train "Objective-C", fixture("objective-c/Foo.h") - classifier.train "Objective-C", fixture("objective-c/Foo.m") + db = {} + Classifier.train! db, "Ruby", fixture("ruby/foo.rb") + Classifier.train! db, "Objective-C", fixture("objective-c/Foo.h") + Classifier.train! db, "Objective-C", fixture("objective-c/Foo.m") - results = classifier.classify(fixture("objective-c/hello.m")) + results = Classifier.classify(db, fixture("objective-c/hello.m")) assert_equal "Objective-C", results.first[0] tokens = Tokenizer.tokenize(fixture("objective-c/hello.m")) - results = classifier.classify(tokens) + results = Classifier.classify(db, tokens) assert_equal "Objective-C", results.first[0] end def test_restricted_classify - classifier = Classifier.new - classifier.train "Ruby", fixture("ruby/foo.rb") - classifier.train "Objective-C", fixture("objective-c/Foo.h") - classifier.train "Objective-C", fixture("objective-c/Foo.m") + db = {} + Classifier.train! db, "Ruby", fixture("ruby/foo.rb") + Classifier.train! db, "Objective-C", fixture("objective-c/Foo.h") + Classifier.train!
db, "Objective-C", fixture("objective-c/Foo.m") - results = classifier.classify(fixture("objective-c/hello.m"), ["Objective-C"]) + results = Classifier.classify(db, fixture("objective-c/hello.m"), ["Objective-C"]) assert_equal "Objective-C", results.first[0] - results = classifier.classify(fixture("objective-c/hello.m"), ["Ruby"]) + results = Classifier.classify(db, fixture("objective-c/hello.m"), ["Ruby"]) assert_equal "Ruby", results.first[0] end def test_instance_classify_empty - results = Classifier.new(Samples::DATA).classify("") + results = Classifier.classify(Samples::DATA, "") assert results.first[1] < 0.5, results.first.inspect end def test_instance_classify_nil - assert_equal [], Classifier.new(Samples::DATA).classify(nil) + assert_equal [], Classifier.classify(Samples::DATA, nil) end def test_verify @@ -76,7 +76,7 @@ class TestClassifier < Test::Unit::TestCase languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name) next unless languages.length > 1 - results = Classifier.new(Samples::DATA).classify(File.read(sample[:path]), languages) + results = Classifier.classify(Samples::DATA, File.read(sample[:path]), languages) assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}" end end