Mirror of https://github.com/KevinMidboe/linguist.git, synced 2025-10-29 17:50:22 +00:00

Commit: Load classifier db into sample data hash

Rakefile (5 changed lines)
@@ -10,8 +10,7 @@ end
 file 'lib/linguist/samples.yml' => Dir['samples/**/*'] do |f|
   require 'linguist/sample'
-  classifier = Linguist::Sample.classifier
-  File.open(f.name, 'w') { |io| classifier.to_yaml(io) }
+  File.open(f.name, 'w') { |io| Linguist::Sample.serialize_to_yaml(Linguist::Sample::DATA, io) }
 end

 CLOBBER.include 'lib/linguist/samples.yml'

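For reference, a minimal sketch of what this file task now does when it runs, assuming the repository layout implied by this diff and a working directory at the repo root:

    # Sketch only: regenerate lib/linguist/samples.yml by hand, mirroring the
    # rake file task above. Paths and names come from this diff.
    $LOAD_PATH.unshift File.expand_path('lib')
    require 'linguist/sample'

    File.open('lib/linguist/samples.yml', 'w') do |io|
      Linguist::Sample.serialize_to_yaml(Linguist::Sample::DATA, io)
    end

Note that, as the diff is rendered here, Sample::DATA is itself read from samples.yml when that file already exists, so the task effectively round-trips the on-disk db rather than rebuilding it from samples/.
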
@@ -32,7 +31,7 @@ namespace :classifier do
      next if file_language.nil? || file_language == 'Text'
      begin
        data = open(file_url).read
-       guessed_language, score = Linguist::Classifier.instance.classify(data).first
+       guessed_language, score = Linguist::Classifier.new(Sample::DATA).classify(data).first

        total += 1
        guessed_language == file_language ? correct += 1 : incorrect += 1

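The benchmark task above now builds a throwaway classifier from Sample::DATA instead of asking for the old singleton. A reduced, self-contained sketch of that loop; the file list and expected labels below are illustrative, not from the commit:

    # Sketch of the accuracy loop above, reduced to local files.
    require 'linguist/classifier'
    require 'linguist/sample'

    total = correct = 0
    { 'hello.rb' => 'Ruby', 'hello.py' => 'Python' }.each do |path, expected|
      data = File.read(path)
      guessed_language, _score = Linguist::Classifier.new(Linguist::Sample::DATA).classify(data).first
      total += 1
      correct += 1 if guessed_language == expected
    end
    puts "#{correct}/#{total} correct"
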
@@ -2,6 +2,7 @@ require 'linguist/classifier'
 require 'linguist/language'
 require 'linguist/mime'
 require 'linguist/pathname'
+require 'linguist/sample'

 require 'charlock_holmes'
 require 'escape_utils'

@@ -441,7 +442,7 @@ module Linguist
       if Language.ambiguous?(extname)
         possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
         if possible_languages.any?
-          if result = Classifier.instance.classify(data, possible_languages).first
+          if result = Classifier.new(Sample::DATA).classify(data, possible_languages).first
             Language[result[0]]
           end
         end

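This is the disambiguation path for ambiguous extensions: the candidate languages are narrowed by extension first, then the classifier picks among them. A minimal sketch of the same flow outside the blob code, assuming the classifier db has been generated; the '.h' extension and file name are illustrative assumptions:

    # Sketch: disambiguate an ambiguous extension the way the hunk above does.
    require 'linguist/classifier'
    require 'linguist/language'
    require 'linguist/sample'

    extname = '.h'
    data    = File.read('example.h')

    candidates = Linguist::Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
    if result = Linguist::Classifier.new(Linguist::Sample::DATA).classify(data, candidates).first
      puts Linguist::Language[result[0]].name   # best-scoring candidate
    end
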
@@ -3,23 +3,6 @@ require 'linguist/tokenizer'
 module Linguist
   # Language bayesian classifier.
   class Classifier
-    # Internal: Path to persisted classifier db.
-    PATH = File.expand_path('../samples.yml', __FILE__)
-
-    # Public: Check if persisted db exists on disk.
-    #
-    # Returns Boolean.
-    def self.exist?
-      File.exist?(PATH)
-    end
-
-    # Public: Get persisted Classifier instance.
-    #
-    # Returns Classifier.
-    def self.instance
-      @instance ||= new(YAML.load_file(PATH))
-    end
-
     # Public: Initialize a Classifier.
     def initialize(attrs = {})
       @tokens_total = attrs['tokens_total'] || 0

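The net effect of the removal above is that Classifier no longer knows about samples.yml at all. Callers that previously grabbed the memoized singleton now pass the training-data hash in explicitly, along these lines (a sketch, using only names that appear in this diff):

    # Before this commit (API removed above):
    #   classifier = Linguist::Classifier.instance    # read samples.yml itself
    # After, construction is explicit; the db hash comes from Linguist::Sample:
    require 'linguist/classifier'
    require 'linguist/sample'

    classifier = Linguist::Classifier.new(Linguist::Sample::DATA)
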
@@ -129,42 +112,5 @@ module Linguist
         'languages' => @languages
       }
     end
-
-    # Public: Serialize classifier to YAML.
-    #
-    # opts - Hash of YAML options.
-    #
-    # Returns nothing.
-    def to_yaml(io)
-      data = ""
-      escape = lambda { |s| s.inspect.gsub(/\\#/, "\#") }
-
-      data << "languages_total: #{@languages_total}\n"
-      data << "tokens_total: #{@tokens_total}\n"
-
-      data << "languages:\n"
-      @languages.sort.each do |language, count|
-        data << "  #{escape.call(language)}: #{count}\n"
-      end
-
-      data << "language_tokens:\n"
-      @language_tokens.sort.each do |language, count|
-        data << "  #{escape.call(language)}: #{count}\n"
-      end
-
-      data << "tokens:\n"
-      @tokens.sort.each do |language, tokens|
-        data << "  #{escape.call(language)}:\n"
-        tokens.sort.each do |token, count|
-          data << "    #{escape.call(token)}: #{count}\n"
-        end
-      end
-
-      io.write data
-      nil
-    end
   end
-
-  # Eager load instance
-  Classifier.instance if Classifier.exist?
 end

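The hash that to_hash builds, which the removed to_yaml serialized and which Sample.serialize_to_yaml now writes (see below), is a plain nested structure. Roughly, with keys taken from the code above and counts and tokens invented for illustration:

    # Approximate shape of the classifier db hash (Sample::DATA / samples.yml);
    # the numbers and token strings here are made up.
    {
      'tokens_total'    => 523_421,
      'languages_total' => 976,
      'languages'       => { 'Ruby' => 25, 'Python' => 21 },
      'language_tokens' => { 'Ruby' => 19_204, 'Python' => 17_043 },
      'tokens'          => { 'Ruby' => { 'def' => 512, 'end' => 533 } }
    }
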
@@ -1,4 +1,5 @@
 require 'set'
+require 'yaml'

 module Linguist
   # Model for accessing classifier training data.

@@ -6,6 +7,13 @@ module Linguist
     # Samples live in test/ for now, we'll eventually move them out
     PATH = File.expand_path("../../../samples", __FILE__)

+    YML = File.expand_path('../samples.yml', __FILE__)
+    if File.exist?(YML)
+      DATA = YAML.load_file(YML)
+    else
+      DATA = nil
+    end
+
     # Public: Iterate over each sample.
     #
     # &block - Yields Sample to block

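Because samples.yml is generated (and CLOBBERed) by the Rakefile, Sample::DATA can legitimately be nil in a fresh checkout. A small sketch of the guard a caller might want; the guard and its message are my suggestion, not part of the commit:

    # Sample::DATA is nil until lib/linguist/samples.yml exists, so check
    # before building a classifier from it.
    require 'linguist/classifier'
    require 'linguist/sample'

    if Linguist::Sample::DATA
      classifier = Linguist::Classifier.new(Linguist::Sample::DATA)
    else
      abort "lib/linguist/samples.yml missing; generate it with the rake task first"
    end
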
@@ -91,5 +99,40 @@
       }
       classifier
     end
+
+    # Public: Serialize samples data to YAML.
+    #
+    # data - Hash
+    # io   - IO object to write to
+    #
+    # Returns nothing.
+    def self.serialize_to_yaml(data, io)
+      data = ""
+      escape = lambda { |s| s.inspect.gsub(/\\#/, "\#") }
+
+      data << "languages_total: #{data['languages_total']}\n"
+      data << "tokens_total: #{data['tokens_total']}\n"
+
+      data << "languages:\n"
+      data['languages'].sort.each do |language, count|
+        data << "  #{escape.call(language)}: #{count}\n"
+      end
+
+      data << "language_tokens:\n"
+      data['language_tokens'].sort.each do |language, count|
+        data << "  #{escape.call(language)}: #{count}\n"
+      end
+
+      data << "tokens:\n"
+      data['tokens'].sort.each do |language, tokens|
+        data << "  #{escape.call(language)}:\n"
+        tokens.sort.each do |token, count|
+          data << "    #{escape.call(token)}: #{count}\n"
+        end
+      end
+
+      io.write data
+      nil
+    end
   end
 end

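One thing worth flagging in the new method as this page renders it: the `data = ""` on its first line rebinds the `data` parameter to an empty string before `data['languages_total']` and the other hash lookups run, so the method cannot serialize anything (String#[] returns nil and the `.sort.each` calls raise). A non-shadowing variant looks like the sketch below; the `out` buffer name is mine, everything else follows the diff:

    # Non-shadowing variant of Sample.serialize_to_yaml above.
    def self.serialize_to_yaml(data, io)
      out = ""
      escape = lambda { |s| s.inspect.gsub(/\\#/, "\#") }

      out << "languages_total: #{data['languages_total']}\n"
      out << "tokens_total: #{data['tokens_total']}\n"

      out << "languages:\n"
      data['languages'].sort.each { |language, count| out << "  #{escape.call(language)}: #{count}\n" }

      out << "language_tokens:\n"
      data['language_tokens'].sort.each { |language, count| out << "  #{escape.call(language)}: #{count}\n" }

      out << "tokens:\n"
      data['tokens'].sort.each do |language, tokens|
        out << "  #{escape.call(language)}:\n"
        tokens.sort.each { |token, count| out << "    #{escape.call(token)}: #{count}\n" }
      end

      io.write out
      nil
    end
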
@@ -18,7 +18,7 @@ class TestClassifier < Test::Unit::TestCase
   end

   def test_instance_freshness
-    serialized = Linguist::MD5.hexdigest(Classifier.instance.to_hash)
+    serialized = Linguist::MD5.hexdigest(Sample::DATA)
     latest = Linguist::MD5.hexdigest(Linguist::Sample.classifier.to_hash)

     # Just warn, it shouldn't scare people off by breaking the build.

@@ -55,16 +55,16 @@ class TestClassifier < Test::Unit::TestCase
   end

   def test_instance_classify_empty
-    results = Classifier.instance.classify("")
+    results = Classifier.new(Sample::DATA).classify("")
     assert results.first[1] < 0.5, results.first.inspect
   end

   def test_instance_classify_nil
-    assert_equal [], Classifier.instance.classify(nil)
+    assert_equal [], Classifier.new(Sample::DATA).classify(nil)
   end

   def test_verify
-    data = Classifier.instance.to_hash
+    data = Sample::DATA

     assert_equal data['languages_total'], data['languages'].inject(0) { |n, (_, c)| n += c }
     assert_equal data['tokens_total'], data['language_tokens'].inject(0) { |n, (_, c)| n += c }

@@ -80,7 +80,7 @@ class TestClassifier < Test::Unit::TestCase
       languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
       next unless languages.length > 1

-      results = Classifier.instance.classify(File.read(sample[:path]), languages)
+      results = Classifier.new(Sample::DATA).classify(File.read(sample[:path]), languages)
       assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}"
     end
   end

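test_verify now checks the on-disk data hash directly instead of a live classifier. The invariants it asserts boil down to the stored totals matching the per-language sums; as a standalone check (a sketch, reusing only names from this diff):

    # Standalone version of the consistency checks test_verify performs.
    require 'linguist/sample'

    data = Linguist::Sample::DATA
    sum  = lambda { |h| h.inject(0) { |n, (_, c)| n + c } }

    raise 'languages_total out of sync' unless data['languages_total'] == sum.call(data['languages'])
    raise 'tokens_total out of sync'    unless data['tokens_total']    == sum.call(data['language_tokens'])
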