diff --git a/Rakefile b/Rakefile
index d093868d..cade0dc1 100644
--- a/Rakefile
+++ b/Rakefile
@@ -10,8 +10,7 @@ end
 
 file 'lib/linguist/samples.yml' => Dir['samples/**/*'] do |f|
   require 'linguist/sample'
-  classifier = Linguist::Sample.classifier
-  File.open(f.name, 'w') { |io| classifier.to_yaml(io) }
+  File.open(f.name, 'w') { |io| Linguist::Sample.serialize_to_yaml(Linguist::Sample.classifier.to_hash, io) }
 end
 
 CLOBBER.include 'lib/linguist/samples.yml'
@@ -32,7 +31,7 @@ namespace :classifier do
       next if file_language.nil? || file_language == 'Text'
       begin
         data = open(file_url).read
-        guessed_language, score = Linguist::Classifier.instance.classify(data).first
+        guessed_language, score = Linguist::Classifier.new(Linguist::Sample::DATA).classify(data).first
 
         total += 1
         guessed_language == file_language ? correct += 1 : incorrect += 1
diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb
index 1ea5b44d..b9326120 100644
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -2,6 +2,7 @@ require 'linguist/classifier'
 require 'linguist/language'
 require 'linguist/mime'
 require 'linguist/pathname'
+require 'linguist/sample'
 
 require 'charlock_holmes'
 require 'escape_utils'
@@ -441,7 +442,7 @@ module Linguist
       if Language.ambiguous?(extname)
         possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
         if possible_languages.any?
-          if result = Classifier.instance.classify(data, possible_languages).first
+          if result = Classifier.new(Sample::DATA).classify(data, possible_languages).first
             Language[result[0]]
           end
         end
diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb
index 2cbd1bd5..d50f75d7 100644
--- a/lib/linguist/classifier.rb
+++ b/lib/linguist/classifier.rb
@@ -3,23 +3,6 @@ require 'linguist/tokenizer'
 module Linguist
   # Language bayesian classifier.
   class Classifier
-    # Internal: Path to persisted classifier db.
- PATH = File.expand_path('../samples.yml', __FILE__) - - # Public: Check if persisted db exists on disk. - # - # Returns Boolean. - def self.exist? - File.exist?(PATH) - end - - # Public: Get persisted Classifier instance. - # - # Returns Classifier. - def self.instance - @instance ||= new(YAML.load_file(PATH)) - end - # Public: Initialize a Classifier. def initialize(attrs = {}) @tokens_total = attrs['tokens_total'] || 0 @@ -129,42 +112,5 @@ module Linguist 'languages' => @languages } end - - # Public: Serialize classifier to YAML. - # - # opts - Hash of YAML options. - # - # Returns nothing. - def to_yaml(io) - data = "" - escape = lambda { |s| s.inspect.gsub(/\\#/, "\#") } - - data << "languages_total: #{@languages_total}\n" - data << "tokens_total: #{@tokens_total}\n" - - data << "languages:\n" - @languages.sort.each do |language, count| - data << " #{escape.call(language)}: #{count}\n" - end - - data << "language_tokens:\n" - @language_tokens.sort.each do |language, count| - data << " #{escape.call(language)}: #{count}\n" - end - - data << "tokens:\n" - @tokens.sort.each do |language, tokens| - data << " #{escape.call(language)}:\n" - tokens.sort.each do |token, count| - data << " #{escape.call(token)}: #{count}\n" - end - end - - io.write data - nil - end end - - # Eager load instance - Classifier.instance if Classifier.exist? end diff --git a/lib/linguist/sample.rb b/lib/linguist/sample.rb index 77b1102e..4c3f5e9a 100644 --- a/lib/linguist/sample.rb +++ b/lib/linguist/sample.rb @@ -1,4 +1,5 @@ require 'set' +require 'yaml' module Linguist # Model for accessing classifier training data. @@ -6,6 +7,13 @@ module Linguist # Samples live in test/ for now, we'll eventually move them out PATH = File.expand_path("../../../samples", __FILE__) + YML = File.expand_path('../samples.yml', __FILE__) + if File.exist?(YML) + DATA = YAML.load_file(YML) + else + DATA = nil + end + # Public: Iterate over each sample. 
# # &block - Yields Sample to block
@@ -91,5 +99,40 @@ module Linguist
       }
       classifier
     end
+
+    # Public: Serialize samples data to YAML.
+    #
+    # data - Hash with string keys: languages_total, tokens_total, languages, language_tokens, tokens
+    # io   - IO object to write to
+    #
+    # Returns nothing.
+    def self.serialize_to_yaml(data, io)
+      yaml = ""
+      escape = lambda { |s| s.inspect.gsub(/\\#/, "\#") }
+
+      yaml << "languages_total: #{data['languages_total']}\n"
+      yaml << "tokens_total: #{data['tokens_total']}\n"
+
+      yaml << "languages:\n"
+      data['languages'].sort.each do |language, count|
+        yaml << "  #{escape.call(language)}: #{count}\n"
+      end
+
+      yaml << "language_tokens:\n"
+      data['language_tokens'].sort.each do |language, count|
+        yaml << "  #{escape.call(language)}: #{count}\n"
+      end
+
+      yaml << "tokens:\n"
+      data['tokens'].sort.each do |language, tokens|
+        yaml << "  #{escape.call(language)}:\n"
+        tokens.sort.each do |token, count|
+          yaml << "    #{escape.call(token)}: #{count}\n"
+        end
+      end
+
+      io.write yaml
+      nil
+    end
   end
 end
diff --git a/test/test_classifier.rb b/test/test_classifier.rb
index e648dce7..2d55aa07 100644
--- a/test/test_classifier.rb
+++ b/test/test_classifier.rb
@@ -18,7 +18,7 @@ class TestClassifier < Test::Unit::TestCase
   end
 
   def test_instance_freshness
-    serialized = Linguist::MD5.hexdigest(Classifier.instance.to_hash)
+    serialized = Linguist::MD5.hexdigest(Sample::DATA)
     latest = Linguist::MD5.hexdigest(Linguist::Sample.classifier.to_hash)
 
     # Just warn, it shouldn't scare people off by breaking the build.
@@ -55,16 +55,16 @@ class TestClassifier < Test::Unit::TestCase end def test_instance_classify_empty - results = Classifier.instance.classify("") + results = Classifier.new(Sample::DATA).classify("") assert results.first[1] < 0.5, results.first.inspect end def test_instance_classify_nil - assert_equal [], Classifier.instance.classify(nil) + assert_equal [], Classifier.new(Sample::DATA).classify(nil) end def test_verify - data = Classifier.instance.to_hash + data = Sample::DATA assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c } assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c } @@ -80,7 +80,7 @@ class TestClassifier < Test::Unit::TestCase languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name) next unless languages.length > 1 - results = Classifier.instance.classify(File.read(sample[:path]), languages) + results = Classifier.new(Sample::DATA).classify(File.read(sample[:path]), languages) assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}" end end