diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb
index 021cc5ec..686ac8c9 100644
--- a/lib/linguist/classifier.rb
+++ b/lib/linguist/classifier.rb
@@ -34,6 +34,28 @@ module Linguist
       @languages_total += 1
     end
 
+    # Public: Prune tokens whose recorded count is exactly 1.
+    #
+    # Only languages whose @language_tokens count exceeds 20 are pruned.
+    # @language_tokens and @tokens_total are reduced by the number of
+    # tokens removed so the counters stay in sync with @tokens.
+    #
+    # Returns the receiver so the call can be chained.
+    def gc
+      @tokens.each do |language, tokens|
+        next unless @language_tokens[language] > 20
+
+        # delete_if drops entries safely while it walks the hash.
+        before = tokens.size
+        tokens.delete_if { |_name, count| count == 1 }
+        pruned = before - tokens.size
+
+        # Every pruned token had a count of exactly 1.
+        @language_tokens[language] -= pruned
+        @tokens_total -= pruned
+      end
+      self
+    end
+
     def classify(data)
       tokens = Tokenizer.new(data).tokens
 
diff --git a/lib/linguist/sample.rb b/lib/linguist/sample.rb
index 53800b97..a2248daa 100644
--- a/lib/linguist/sample.rb
+++ b/lib/linguist/sample.rb
@@ -29,7 +29,7 @@ module Linguist
     def self.classifier
       classifier = Classifier.new
       each { |sample| classifier.train(sample.language, sample.data) }
-      classifier
+      classifier.gc
     end
 
     def initialize(path, language)
diff --git a/test/test_classifier.rb b/test/test_classifier.rb
index 24254661..38c4df5c 100644
--- a/test/test_classifier.rb
+++ b/test/test_classifier.rb
@@ -15,7 +15,7 @@ class TestClassifier < Test::Unit::TestCase
     File.read(File.join(fixtures_path, name))
   end
 
-  def test_train_and_classify
+  def test_classify
     classifier = Classifier.new
     classifier.train Language["Ruby"], fixture("ruby/foo.rb")
     classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
@@ -30,6 +30,12 @@ class TestClassifier < Test::Unit::TestCase
     assert results.first[1] < 0.5, results.first.inspect
   end
 
+  def test_gc
+    classifier = Classifier.instance
+    # gc must hand back the same classifier so callers can chain on it.
+    assert_same classifier, classifier.gc
+  end
+
   # def test_instance_classify
   #   Sample.each do |sample|
   #     results = Classifier.instance.classify(sample.data)