Add tokenize helper to Tokenizer class
--- a/lib/linguist/classifier.rb
+++ b/lib/linguist/classifier.rb
@@ -55,7 +55,7 @@ module Linguist
     # Returns nothing.
     def train(language, data)
       language = language.name
-      tokens = Tokenizer.new(data).tokens
+      tokens = Tokenizer.tokenize(data)
 
       tokens.each do |token|
         @tokens[language][token] += 1
@@ -98,7 +98,7 @@ module Linguist
     # Language and a Float score.
     def classify(tokens, languages = @languages.keys)
       return [] if tokens.nil?
-      tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
+      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
 
       scores = {}
       languages.each do |language|
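Both hunks swap the two-step Tokenizer.new(data).tokens for the new one-shot Tokenizer.tokenize(data). The classify hunk also shows why callers can keep passing either raw source or pre-extracted tokens: a String is normalized through the helper, while an Array passes straight through. Below is a minimal runnable sketch of that dispatch; the whitespace scan is only a stand-in for Linguist's real string- and comment-aware extraction.

class Tokenizer
  def self.tokenize(data)
    new.extract_tokens(data)
  end

  # Toy extraction; the real scanner skips strings and comments.
  def extract_tokens(data)
    data.scan(/\S+/)
  end
end

# classify normalizes a String into tokens and leaves Arrays alone,
# exactly the guard the hunk above rewrites.
def classify(tokens)
  return [] if tokens.nil?
  tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
  tokens # the real method goes on to score tokens per language
end

p classify("def foo; end")        #=> ["def", "foo;", "end"]
p classify(["def", "foo", "end"]) #=> passes through unchanged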
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -5,23 +5,13 @@ module Linguist
   # It strips any data strings or comments and preserves significant
   # language symbols.
   class Tokenizer
-    # Public: Initialize a Tokenizer.
+    # Public: Extract tokens from data
     #
-    # data - String data to scan.
-    def initialize(data)
-      @data = data
-    end
-
-    # Public: Get source data.
-    #
-    # Returns String.
-    attr_reader :data
-
-    # Public: Extract tokens from data.
+    # data - String to tokenize
     #
     # Returns Array of token Strings.
-    def tokens
-      extract_tokens(data)
+    def self.tokenize(data)
+      new.extract_tokens(data)
     end
 
     SINGLE_LINE_COMMENTS = [
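This hunk carries the actual refactor: Tokenizer loses its constructor and attr_reader, becoming stateless, and a class-level tokenize helper builds the throwaway instance itself via new.extract_tokens(data). A self-contained sketch of the pattern, again with a toy extract_tokens standing in for the real StringScanner-based extraction:

class Tokenizer
  # Public: Extract tokens from data without callers touching new.
  #
  # data - String to tokenize
  #
  # Returns Array of token Strings.
  def self.tokenize(data)
    new.extract_tokens(data)
  end

  # Stand-in for Linguist's scanner, which strips strings and comments.
  def extract_tokens(data)
    data.scan(/\S+/)
  end
end

p Tokenizer.tokenize("puts 'hi'") #=> ["puts", "'hi'"]

The instance method survives the change, so the class method is pure convenience; only the data-holding state is gone, which is what lets every call site collapse to a single expression.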
--- a/test/test_classifier.rb
+++ b/test/test_classifier.rb
@@ -32,7 +32,7 @@ class TestClassifier < Test::Unit::TestCase
     results = classifier.classify(fixture("objective-c/hello.m"))
     assert_equal Language["Objective-C"], results.first[0]
 
-    tokens = Tokenizer.new(fixture("objective-c/hello.m")).tokens
+    tokens = Tokenizer.tokenize(fixture("objective-c/hello.m"))
     results = classifier.classify(tokens)
     assert_equal Language["Objective-C"], results.first[0]
   end
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -11,7 +11,7 @@ class TestTokenizer < Test::Unit::TestCase
 
   def tokenize(data)
     data = File.read(File.join(samples_path, data.to_s)) if data.is_a?(Symbol)
-    Tokenizer.new(data).tokens
+    Tokenizer.tokenize(data)
   end
 
   def test_skip_string_literals
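The test updates are mechanical call-site changes; the TestTokenizer helper keeps its Symbol-to-sample convenience and simply delegates to the class method. A self-contained sketch of how the updated helper reads under Test::Unit (the nested Tokenizer and the asserted values are illustrative, not Linguist's actual fixtures):

require "test/unit"

class TestTokenizerSketch < Test::Unit::TestCase
  # Stand-in so the sketch runs on its own; see the real class above.
  class Tokenizer
    def self.tokenize(data)
      data.scan(/\S+/)
    end
  end

  # Mirrors the updated helper: one call, no intermediate instance.
  def tokenize(data)
    Tokenizer.tokenize(data)
  end

  def test_tokenize_returns_token_array
    assert_equal %w[a = 1], tokenize("a = 1")
  end
end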