Sample returns simple hash objects

This commit is contained in:
Joshua Peek
2012-07-20 16:17:37 -05:00
parent ee0ffa0516
commit eb2c07e511
3 changed files with 19 additions and 45 deletions

View File

@@ -1,13 +1,10 @@
require 'linguist/classifier'
require 'linguist/language'
module Linguist module Linguist
# Model for accessing classifier training data. # Model for accessing classifier training data.
class Sample module Sample
# Samples live in test/ for now, we'll eventually move them out # Samples live in test/ for now, we'll eventually move them out
PATH = File.expand_path("../../../samples", __FILE__) PATH = File.expand_path("../../../samples", __FILE__)
# Public: Iterate over each Sample. # Public: Iterate over each sample.
# #
# &block - Yields Sample to block # &block - Yields Sample to block
# #
@@ -20,14 +17,10 @@ module Linguist
# Possibly reconsider this later # Possibly reconsider this later
next if category == 'text' || category == 'binary' next if category == 'text' || category == 'binary'
# Map directory name to a Language alias
language = Linguist::Language.find_by_alias(category)
raise "No language for #{category.inspect}" unless language
dirname = File.join(PATH, category) dirname = File.join(PATH, category)
Dir.entries(dirname).each do |filename| Dir.entries(dirname).each do |filename|
next if filename == '.' || filename == '..' next if filename == '.' || filename == '..'
yield new(File.join(dirname, filename), language) yield({ :path => File.join(dirname, filename), :language => category })
end end
end end
@@ -38,37 +31,16 @@ module Linguist
# #
# Returns trained Classifier. # Returns trained Classifier.
def self.classifier def self.classifier
require 'linguist/classifier'
require 'linguist/language'
classifier = Classifier.new classifier = Classifier.new
each { |sample| classifier.train(sample.language.name, sample.data) } each { |sample|
language = Language.find_by_alias(sample[:language])
data = File.read(sample[:path])
classifier.train(language.name, data)
}
classifier.gc classifier.gc
end end
# Internal: Initialize Sample.
#
# Samples should be initialized by Sample.each.
#
# path - String full path to file.
# language - Language of sample.
def initialize(path, language)
@path = path
@language = language
end
# Public: Get full path to file.
#
# Returns String.
attr_reader :path
# Public: Get sample language.
#
# Returns Language.
attr_reader :language
# Public: Read file contents.
#
# Returns String.
def data
File.read(path)
end
end end
end end

View File

@@ -276,8 +276,9 @@ class TestBlob < Test::Unit::TestCase
def test_language def test_language
Sample.each do |sample| Sample.each do |sample|
blob = blob(sample.path) blob = blob(sample[:path])
assert_equal sample.language, blob.language, blob.name language = Linguist::Language.find_by_alias(sample[:language])
assert_equal language, blob.language, blob.name
end end
end end

View File

@@ -69,14 +69,15 @@ class TestClassifier < Test::Unit::TestCase
def test_classify_ambiguous_languages def test_classify_ambiguous_languages
Sample.each do |sample| Sample.each do |sample|
next unless sample.language.overrides.any? language = Linguist::Language.find_by_alias(sample[:language])
next unless language.overrides.any?
extname = File.extname(sample.path) extname = File.extname(sample[:path])
languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name) languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
next unless languages.length > 1 next unless languages.length > 1
results = Classifier.instance.classify(sample.data, languages) results = Classifier.instance.classify(File.read(sample[:path]), languages)
assert_equal sample.language.name, results.first[0], "#{sample.path}\n#{results.inspect}" assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}"
end end
end end
end end