Add tokenize helper to Tokenizer class

This commit is contained in:
Joshua Peek
2012-07-20 15:14:58 -05:00
parent 79be8b8c67
commit 2637d8dc55
4 changed files with 8 additions and 18 deletions

View File

@@ -55,7 +55,7 @@ module Linguist
# Returns nothing.
def train(language, data)
language = language.name
tokens = Tokenizer.new(data).tokens
tokens = Tokenizer.tokenize(data)
tokens.each do |token|
@tokens[language][token] += 1
@@ -98,7 +98,7 @@ module Linguist
# Language and a Float score.
def classify(tokens, languages = @languages.keys)
return [] if tokens.nil?
tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
scores = {}
languages.each do |language|

View File

@@ -5,23 +5,13 @@ module Linguist
# It strips any data strings or comments and preserves significant
# language symbols.
class Tokenizer
# Public: Initialize a Tokenizer.
# Public: Extract tokens from data
#
# data - String data to scan.
def initialize(data)
@data = data
end
# Public: Get source data.
#
# Returns String.
attr_reader :data
# Public: Extract tokens from data.
# data - String to tokenize
#
# Returns Array of token Strings.
def tokens
extract_tokens(data)
def self.tokenize(data)
new.extract_tokens(data)
end
SINGLE_LINE_COMMENTS = [

View File

@@ -32,7 +32,7 @@ class TestClassifier < Test::Unit::TestCase
results = classifier.classify(fixture("objective-c/hello.m"))
assert_equal Language["Objective-C"], results.first[0]
tokens = Tokenizer.new(fixture("objective-c/hello.m")).tokens
tokens = Tokenizer.tokenize(fixture("objective-c/hello.m"))
results = classifier.classify(tokens)
assert_equal Language["Objective-C"], results.first[0]
end

View File

@@ -11,7 +11,7 @@ class TestTokenizer < Test::Unit::TestCase
def tokenize(data)
data = File.read(File.join(samples_path, data.to_s)) if data.is_a?(Symbol)
Tokenizer.new(data).tokens
Tokenizer.tokenize(data)
end
def test_skip_string_literals