diff --git a/lib/linguist/sample.rb b/lib/linguist/sample.rb index 20d76732..4b838626 100644 --- a/lib/linguist/sample.rb +++ b/lib/linguist/sample.rb @@ -1,13 +1,10 @@ -require 'linguist/classifier' -require 'linguist/language' - module Linguist # Model for accessing classifier training data. - class Sample + module Sample # Samples live in test/ for now, we'll eventually move them out PATH = File.expand_path("../../../samples", __FILE__) - # Public: Iterate over each Sample. + # Public: Iterate over each sample. # # &block - Yields Sample to block # @@ -20,14 +17,10 @@ module Linguist # Possibly reconsider this later next if category == 'text' || category == 'binary' - # Map directory name to a Language alias - language = Linguist::Language.find_by_alias(category) - raise "No language for #{category.inspect}" unless language - dirname = File.join(PATH, category) Dir.entries(dirname).each do |filename| next if filename == '.' || filename == '..' - yield new(File.join(dirname, filename), language) + yield({ :path => File.join(dirname, filename), :language => category }) end end @@ -38,37 +31,16 @@ module Linguist # # Returns trained Classifier. def self.classifier + require 'linguist/classifier' + require 'linguist/language' + classifier = Classifier.new - each { |sample| classifier.train(sample.language.name, sample.data) } + each { |sample| + language = Language.find_by_alias(sample[:language]) + data = File.read(sample[:path]) + classifier.train(language.name, data) + } classifier.gc end - - # Internal: Initialize Sample. - # - # Samples should be initialized by Sample.each. - # - # path - String full path to file. - # language - Language of sample. - def initialize(path, language) - @path = path - @language = language - end - - # Public: Get full path to file. - # - # Returns String. - attr_reader :path - - # Public: Get sample language. - # - # Returns Language. - attr_reader :language - - # Public: Read file contents. - # - # Returns String. - def data - File.read(path) - end end end diff --git a/test/test_blob.rb b/test/test_blob.rb index df13fc4b..fc62349d 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -276,8 +276,9 @@ class TestBlob < Test::Unit::TestCase def test_language Sample.each do |sample| - blob = blob(sample.path) - assert_equal sample.language, blob.language, blob.name + blob = blob(sample[:path]) + language = Linguist::Language.find_by_alias(sample[:language]) + assert_equal language, blob.language, blob.name end end diff --git a/test/test_classifier.rb b/test/test_classifier.rb index 33de385a..80df7e77 100644 --- a/test/test_classifier.rb +++ b/test/test_classifier.rb @@ -69,14 +69,15 @@ class TestClassifier < Test::Unit::TestCase def test_classify_ambiguous_languages Sample.each do |sample| - next unless sample.language.overrides.any? + language = Linguist::Language.find_by_alias(sample[:language]) + next unless language.overrides.any? - extname = File.extname(sample.path) + extname = File.extname(sample[:path]) languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name) next unless languages.length > 1 - results = Classifier.instance.classify(sample.data, languages) - assert_equal sample.language.name, results.first[0], "#{sample.path}\n#{results.inspect}" + results = Classifier.instance.classify(File.read(sample[:path]), languages) + assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}" end end end