From 2637d8dc55a228d7aec793016632aed2dbf052d3 Mon Sep 17 00:00:00 2001 From: Joshua Peek Date: Fri, 20 Jul 2012 15:14:58 -0500 Subject: [PATCH] Add tokenize helper to Tokenizer class --- lib/linguist/classifier.rb | 4 ++-- lib/linguist/tokenizer.rb | 18 ++++-------------- test/test_classifier.rb | 2 +- test/test_tokenizer.rb | 2 +- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb index 3aef234d..1e4084dc 100644 --- a/lib/linguist/classifier.rb +++ b/lib/linguist/classifier.rb @@ -55,7 +55,7 @@ module Linguist # Returns nothing. def train(language, data) language = language.name - tokens = Tokenizer.new(data).tokens + tokens = Tokenizer.tokenize(data) tokens.each do |token| @tokens[language][token] += 1 @@ -98,7 +98,7 @@ module Linguist # Language and a Float score. def classify(tokens, languages = @languages.keys) return [] if tokens.nil? - tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String) + tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String) scores = {} languages.each do |language| diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb index 83706064..520ed5c9 100644 --- a/lib/linguist/tokenizer.rb +++ b/lib/linguist/tokenizer.rb @@ -5,23 +5,13 @@ module Linguist # It strips any data strings or comments and preserves significant # language symbols. class Tokenizer - # Public: Initialize a Tokenizer. + # Public: Extract tokens from data # - # data - String data to scan. - def initialize(data) - @data = data - end - - # Public: Get source data. - # - # Returns String. - attr_reader :data - - # Public: Extract tokens from data. + # data - String to tokenize # # Returns Array of token Strings. 
- def tokens - extract_tokens(data) + def self.tokenize(data) + new.extract_tokens(data) end SINGLE_LINE_COMMENTS = [ diff --git a/test/test_classifier.rb b/test/test_classifier.rb index 81725bc0..ac7fbb0d 100644 --- a/test/test_classifier.rb +++ b/test/test_classifier.rb @@ -32,7 +32,7 @@ class TestClassifier < Test::Unit::TestCase results = classifier.classify(fixture("objective-c/hello.m")) assert_equal Language["Objective-C"], results.first[0] - tokens = Tokenizer.new(fixture("objective-c/hello.m")).tokens + tokens = Tokenizer.tokenize(fixture("objective-c/hello.m")) results = classifier.classify(tokens) assert_equal Language["Objective-C"], results.first[0] end diff --git a/test/test_tokenizer.rb b/test/test_tokenizer.rb index bbd43872..00142b5c 100644 --- a/test/test_tokenizer.rb +++ b/test/test_tokenizer.rb @@ -11,7 +11,7 @@ class TestTokenizer < Test::Unit::TestCase def tokenize(data) data = File.read(File.join(samples_path, data.to_s)) if data.is_a?(Symbol) - Tokenizer.new(data).tokens + Tokenizer.tokenize(data) end def test_skip_string_literals