mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-12-08 20:38:47 +00:00
Add tokenize helper to Tokenizer class
This commit is contained in:
@@ -55,7 +55,7 @@ module Linguist
|
||||
# Returns nothing.
|
||||
def train(language, data)
|
||||
language = language.name
|
||||
tokens = Tokenizer.new(data).tokens
|
||||
tokens = Tokenizer.tokenize(data)
|
||||
|
||||
tokens.each do |token|
|
||||
@tokens[language][token] += 1
|
||||
@@ -98,7 +98,7 @@ module Linguist
|
||||
# Language and a Float score.
|
||||
def classify(tokens, languages = @languages.keys)
|
||||
return [] if tokens.nil?
|
||||
tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
|
||||
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
|
||||
|
||||
scores = {}
|
||||
languages.each do |language|
|
||||
|
||||
@@ -5,23 +5,13 @@ module Linguist
|
||||
# It strips any data strings or comments and preserves significant
|
||||
# language symbols.
|
||||
class Tokenizer
|
||||
# Public: Initialize a Tokenizer.
|
||||
# Public: Extract tokens from data
|
||||
#
|
||||
# data - String data to scan.
|
||||
def initialize(data)
|
||||
@data = data
|
||||
end
|
||||
|
||||
# Public: Get source data.
|
||||
#
|
||||
# Returns String.
|
||||
attr_reader :data
|
||||
|
||||
# Public: Extract tokens from data.
|
||||
# data - String to tokenize
|
||||
#
|
||||
# Returns Array of token Strings.
|
||||
def tokens
|
||||
extract_tokens(data)
|
||||
def self.tokenize(data)
|
||||
new.extract_tokens(data)
|
||||
end
|
||||
|
||||
SINGLE_LINE_COMMENTS = [
|
||||
|
||||
Reference in New Issue
Block a user