mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-12-08 20:38:47 +00:00
Add tokenize helper to Tokenizer class
This commit is contained in:
@@ -55,7 +55,7 @@ module Linguist
|
||||
# Returns nothing.
|
||||
def train(language, data)
|
||||
language = language.name
|
||||
tokens = Tokenizer.new(data).tokens
|
||||
tokens = Tokenizer.tokenize(data)
|
||||
|
||||
tokens.each do |token|
|
||||
@tokens[language][token] += 1
|
||||
@@ -98,7 +98,7 @@ module Linguist
|
||||
# Language and a Float score.
|
||||
def classify(tokens, languages = @languages.keys)
|
||||
return [] if tokens.nil?
|
||||
tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
|
||||
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
|
||||
|
||||
scores = {}
|
||||
languages.each do |language|
|
||||
|
||||
@@ -5,23 +5,13 @@ module Linguist
|
||||
# It strips any data strings or comments and preserves significant
|
||||
# language symbols.
|
||||
class Tokenizer
|
||||
# Public: Initialize a Tokenizer.
|
||||
# Public: Extract tokens from data
|
||||
#
|
||||
# data - String data to scan.
|
||||
def initialize(data)
|
||||
@data = data
|
||||
end
|
||||
|
||||
# Public: Get source data.
|
||||
#
|
||||
# Returns String.
|
||||
attr_reader :data
|
||||
|
||||
# Public: Extract tokens from data.
|
||||
# data - String to tokenize
|
||||
#
|
||||
# Returns Array of token Strings.
|
||||
def tokens
|
||||
extract_tokens(data)
|
||||
def self.tokenize(data)
|
||||
new.extract_tokens(data)
|
||||
end
|
||||
|
||||
SINGLE_LINE_COMMENTS = [
|
||||
|
||||
@@ -32,7 +32,7 @@ class TestClassifier < Test::Unit::TestCase
|
||||
results = classifier.classify(fixture("objective-c/hello.m"))
|
||||
assert_equal Language["Objective-C"], results.first[0]
|
||||
|
||||
tokens = Tokenizer.new(fixture("objective-c/hello.m")).tokens
|
||||
tokens = Tokenizer.tokenize(fixture("objective-c/hello.m"))
|
||||
results = classifier.classify(tokens)
|
||||
assert_equal Language["Objective-C"], results.first[0]
|
||||
end
|
||||
|
||||
@@ -11,7 +11,7 @@ class TestTokenizer < Test::Unit::TestCase
|
||||
|
||||
def tokenize(data)
|
||||
data = File.read(File.join(samples_path, data.to_s)) if data.is_a?(Symbol)
|
||||
Tokenizer.new(data).tokens
|
||||
Tokenizer.tokenize(data)
|
||||
end
|
||||
|
||||
def test_skip_string_literals
|
||||
|
||||
Reference in New Issue
Block a user