mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 17:50:22 +00:00
Sample returns simple hash objects
This commit is contained in:
@@ -1,13 +1,10 @@
|
|||||||
require 'linguist/classifier'
|
|
||||||
require 'linguist/language'
|
|
||||||
|
|
||||||
module Linguist
|
module Linguist
|
||||||
# Model for accessing classifier training data.
|
# Model for accessing classifier training data.
|
||||||
class Sample
|
module Sample
|
||||||
# Samples live in test/ for now, we'll eventually move them out
|
# Samples live in test/ for now, we'll eventually move them out
|
||||||
PATH = File.expand_path("../../../samples", __FILE__)
|
PATH = File.expand_path("../../../samples", __FILE__)
|
||||||
|
|
||||||
# Public: Iterate over each Sample.
|
# Public: Iterate over each sample.
|
||||||
#
|
#
|
||||||
# &block - Yields Sample to block
|
# &block - Yields a Hash with :path and :language keys to block
|
||||||
#
|
#
|
||||||
@@ -20,14 +17,10 @@ module Linguist
|
|||||||
# Possibly reconsider this later
|
# Possibly reconsider this later
|
||||||
next if category == 'text' || category == 'binary'
|
next if category == 'text' || category == 'binary'
|
||||||
|
|
||||||
# Map directory name to a Language alias
|
|
||||||
language = Linguist::Language.find_by_alias(category)
|
|
||||||
raise "No language for #{category.inspect}" unless language
|
|
||||||
|
|
||||||
dirname = File.join(PATH, category)
|
dirname = File.join(PATH, category)
|
||||||
Dir.entries(dirname).each do |filename|
|
Dir.entries(dirname).each do |filename|
|
||||||
next if filename == '.' || filename == '..'
|
next if filename == '.' || filename == '..'
|
||||||
yield new(File.join(dirname, filename), language)
|
yield({ :path => File.join(dirname, filename), :language => category })
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -38,37 +31,16 @@ module Linguist
|
|||||||
#
|
#
|
||||||
# Returns trained Classifier.
|
# Returns trained Classifier.
|
||||||
def self.classifier
|
def self.classifier
|
||||||
|
require 'linguist/classifier'
|
||||||
|
require 'linguist/language'
|
||||||
|
|
||||||
classifier = Classifier.new
|
classifier = Classifier.new
|
||||||
each { |sample| classifier.train(sample.language.name, sample.data) }
|
each { |sample|
|
||||||
|
language = Language.find_by_alias(sample[:language])
|
||||||
|
data = File.read(sample[:path])
|
||||||
|
classifier.train(language.name, data)
|
||||||
|
}
|
||||||
classifier.gc
|
classifier.gc
|
||||||
end
|
end
|
||||||
|
|
||||||
# Internal: Initialize Sample.
|
|
||||||
#
|
|
||||||
# Samples should be initialized by Sample.each.
|
|
||||||
#
|
|
||||||
# path - String full path to file.
|
|
||||||
# language - Language of sample.
|
|
||||||
def initialize(path, language)
|
|
||||||
@path = path
|
|
||||||
@language = language
|
|
||||||
end
|
|
||||||
|
|
||||||
# Public: Get full path to file.
|
|
||||||
#
|
|
||||||
# Returns String.
|
|
||||||
attr_reader :path
|
|
||||||
|
|
||||||
# Public: Get sample language.
|
|
||||||
#
|
|
||||||
# Returns Language.
|
|
||||||
attr_reader :language
|
|
||||||
|
|
||||||
# Public: Read file contents.
|
|
||||||
#
|
|
||||||
# Returns String.
|
|
||||||
def data
|
|
||||||
File.read(path)
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -276,8 +276,9 @@ class TestBlob < Test::Unit::TestCase
|
|||||||
|
|
||||||
def test_language
|
def test_language
|
||||||
Sample.each do |sample|
|
Sample.each do |sample|
|
||||||
blob = blob(sample.path)
|
blob = blob(sample[:path])
|
||||||
assert_equal sample.language, blob.language, blob.name
|
language = Linguist::Language.find_by_alias(sample[:language])
|
||||||
|
assert_equal language, blob.language, blob.name
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -69,14 +69,15 @@ class TestClassifier < Test::Unit::TestCase
|
|||||||
|
|
||||||
def test_classify_ambiguous_languages
|
def test_classify_ambiguous_languages
|
||||||
Sample.each do |sample|
|
Sample.each do |sample|
|
||||||
next unless sample.language.overrides.any?
|
language = Linguist::Language.find_by_alias(sample[:language])
|
||||||
|
next unless language.overrides.any?
|
||||||
|
|
||||||
extname = File.extname(sample.path)
|
extname = File.extname(sample[:path])
|
||||||
languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
|
languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
|
||||||
next unless languages.length > 1
|
next unless languages.length > 1
|
||||||
|
|
||||||
results = Classifier.instance.classify(sample.data, languages)
|
results = Classifier.instance.classify(File.read(sample[:path]), languages)
|
||||||
assert_equal sample.language.name, results.first[0], "#{sample.path}\n#{results.inspect}"
|
assert_equal language.name, results.first[0], "#{sample[:path]}\n#{results.inspect}"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
Reference in New Issue
Block a user