Add tokenize helper to Tokenizer class

This commit is contained in:
Joshua Peek
2012-07-20 15:14:58 -05:00
parent 79be8b8c67
commit 2637d8dc55
4 changed files with 8 additions and 18 deletions

View File

@@ -55,7 +55,7 @@ module Linguist
# Returns nothing. # Returns nothing.
def train(language, data) def train(language, data)
language = language.name language = language.name
tokens = Tokenizer.new(data).tokens tokens = Tokenizer.tokenize(data)
tokens.each do |token| tokens.each do |token|
@tokens[language][token] += 1 @tokens[language][token] += 1
@@ -98,7 +98,7 @@ module Linguist
# Language and a Float score. # Language and a Float score.
def classify(tokens, languages = @languages.keys) def classify(tokens, languages = @languages.keys)
return [] if tokens.nil? return [] if tokens.nil?
tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String) tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
scores = {} scores = {}
languages.each do |language| languages.each do |language|

View File

@@ -5,23 +5,13 @@ module Linguist
# It strips any data strings or comments and preserves significant # It strips any data strings or comments and preserves significant
# language symbols. # language symbols.
class Tokenizer class Tokenizer
# Public: Initialize a Tokenizer. # Public: Extract tokens from data
# #
# data - String data to scan. # data - String to tokenize
def initialize(data)
@data = data
end
# Public: Get source data.
#
# Returns String.
attr_reader :data
# Public: Extract tokens from data.
# #
# Returns Array of token Strings. # Returns Array of token Strings.
def tokens def self.tokenize(data)
extract_tokens(data) new.extract_tokens(data)
end end
SINGLE_LINE_COMMENTS = [ SINGLE_LINE_COMMENTS = [

View File

@@ -32,7 +32,7 @@ class TestClassifier < Test::Unit::TestCase
results = classifier.classify(fixture("objective-c/hello.m")) results = classifier.classify(fixture("objective-c/hello.m"))
assert_equal Language["Objective-C"], results.first[0] assert_equal Language["Objective-C"], results.first[0]
tokens = Tokenizer.new(fixture("objective-c/hello.m")).tokens tokens = Tokenizer.tokenize(fixture("objective-c/hello.m"))
results = classifier.classify(tokens) results = classifier.classify(tokens)
assert_equal Language["Objective-C"], results.first[0] assert_equal Language["Objective-C"], results.first[0]
end end

View File

@@ -11,7 +11,7 @@ class TestTokenizer < Test::Unit::TestCase
def tokenize(data) def tokenize(data)
data = File.read(File.join(samples_path, data.to_s)) if data.is_a?(Symbol) data = File.read(File.join(samples_path, data.to_s)) if data.is_a?(Symbol)
Tokenizer.new(data).tokens Tokenizer.tokenize(data)
end end
def test_skip_string_literals def test_skip_string_literals