Load classifier db into sample data hash

This commit is contained in:
Joshua Peek
2012-07-23 13:13:52 -05:00
parent 97ae7c1a11
commit 0c9a947f39
5 changed files with 52 additions and 63 deletions

View File

@@ -2,6 +2,7 @@ require 'linguist/classifier'
require 'linguist/language'
require 'linguist/mime'
require 'linguist/pathname'
require 'linguist/sample'
require 'charlock_holmes'
require 'escape_utils'
@@ -441,7 +442,7 @@ module Linguist
if Language.ambiguous?(extname)
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
if possible_languages.any?
if result = Classifier.instance.classify(data, possible_languages).first
if result = Classifier.new(Sample::DATA).classify(data, possible_languages).first
Language[result[0]]
end
end

View File

@@ -3,23 +3,6 @@ require 'linguist/tokenizer'
module Linguist
# Language Bayesian classifier.
class Classifier
# Internal: Path to persisted classifier db.
PATH = File.expand_path('../samples.yml', __FILE__)
# Public: Check if the persisted classifier db exists on disk.
#
# Tests whether anything exists at PATH.
#
# Returns Boolean.
def self.exist?
File.exist?(PATH)
end
# Public: Get the Classifier built from the persisted db.
#
# The YAML file at PATH is loaded on the first call only; the resulting
# Classifier is memoized and handed back on every later call.
#
# Returns Classifier.
def self.instance
  return @instance if @instance

  @instance = new(YAML.load_file(PATH))
end
# Public: Initialize a Classifier.
def initialize(attrs = {})
@tokens_total = attrs['tokens_total'] || 0
@@ -129,42 +112,5 @@ module Linguist
'languages' => @languages
}
end
# Public: Serialize classifier to YAML.
#
# io - IO object to write the YAML document to.
#
# Entries of every mapping are written in sorted key order. The whole
# document is assembled first and handed to io in a single write.
#
# Returns nothing.
def to_yaml(io)
  lines = []
  # String#inspect escapes "#" in front of interpolation characters as
  # "\#"; strip that backslash again (presumably because "\#" is not a
  # valid escape inside a YAML double-quoted scalar).
  escape = lambda { |s| s.inspect.gsub(/\\#/, "#") }

  lines << "languages_total: #{@languages_total}\n"
  lines << "tokens_total: #{@tokens_total}\n"

  lines << "languages:\n"
  @languages.sort.each do |language, count|
    lines << "  #{escape.call(language)}: #{count}\n"
  end

  lines << "language_tokens:\n"
  @language_tokens.sort.each do |language, count|
    lines << "  #{escape.call(language)}: #{count}\n"
  end

  lines << "tokens:\n"
  @tokens.sort.each do |language, tokens|
    lines << "  #{escape.call(language)}:\n"
    # Token rows must be indented deeper than their language key so they
    # parse as that language's nested mapping.
    tokens.sort.each do |token, count|
      lines << "    #{escape.call(token)}: #{count}\n"
    end
  end

  io.write lines.join
  nil
end
end
# Eager load the persisted classifier instance while this file is being
# loaded, but only when the classifier db is present on disk.
Classifier.instance if Classifier.exist?
end

View File

@@ -1,4 +1,5 @@
require 'set'
require 'yaml'
module Linguist
# Model for accessing classifier training data.
@@ -6,6 +7,13 @@ module Linguist
# Samples live in test/ for now, we'll eventually move them out
PATH = File.expand_path("../../../samples", __FILE__)
# Location of the serialized samples db, next to this source file.
YML = File.expand_path('../samples.yml', __FILE__)

# Contents of the samples db as loaded by YAML, or nil when the file at
# YML does not exist.
DATA = File.exist?(YML) ? YAML.load_file(YML) : nil
# Public: Iterate over each sample.
#
# &block - Yields Sample to block
@@ -91,5 +99,40 @@ module Linguist
}
classifier
end
# Public: Serialize samples data to YAML.
#
# data - Hash with the keys 'languages_total', 'tokens_total',
#        'languages', 'language_tokens' and 'tokens'. 'tokens' maps each
#        language to a Hash of token => count.
# io   - IO object to write to
#
# Entries of every mapping are written in sorted key order. The whole
# document is assembled first and handed to io in a single write.
#
# Returns nothing.
def self.serialize_to_yaml(data, io)
  # The output buffer must not be called `data`: reusing the argument's
  # name would make every lookup below index the buffer instead of the
  # samples Hash.
  lines = []
  # String#inspect escapes "#" in front of interpolation characters as
  # "\#"; strip that backslash again (presumably because "\#" is not a
  # valid escape inside a YAML double-quoted scalar).
  escape = lambda { |s| s.inspect.gsub(/\\#/, "#") }

  lines << "languages_total: #{data['languages_total']}\n"
  lines << "tokens_total: #{data['tokens_total']}\n"

  lines << "languages:\n"
  data['languages'].sort.each do |language, count|
    lines << "  #{escape.call(language)}: #{count}\n"
  end

  lines << "language_tokens:\n"
  data['language_tokens'].sort.each do |language, count|
    lines << "  #{escape.call(language)}: #{count}\n"
  end

  lines << "tokens:\n"
  data['tokens'].sort.each do |language, tokens|
    lines << "  #{escape.call(language)}:\n"
    # Token rows must be indented deeper than their language key so they
    # parse as that language's nested mapping.
    tokens.sort.each do |token, count|
      lines << "    #{escape.call(token)}: #{count}\n"
    end
  end

  io.write lines.join
  nil
end
end
end