mirror of
				https://github.com/KevinMidboe/linguist.git
				synced 2025-10-29 17:50:22 +00:00 
			
		
		
		
	Add tokenize helper to Tokenizer class
This commit is contained in:
		@@ -55,7 +55,7 @@ module Linguist
 | 
			
		||||
    # Returns nothing.
 | 
			
		||||
    def train(language, data)
 | 
			
		||||
      language = language.name
 | 
			
		||||
      tokens   = Tokenizer.new(data).tokens
 | 
			
		||||
      tokens   = Tokenizer.tokenize(data)
 | 
			
		||||
 | 
			
		||||
      tokens.each do |token|
 | 
			
		||||
        @tokens[language][token] += 1
 | 
			
		||||
@@ -98,7 +98,7 @@ module Linguist
 | 
			
		||||
    # Language and a Float score.
 | 
			
		||||
    def classify(tokens, languages = @languages.keys)
 | 
			
		||||
      return [] if tokens.nil?
 | 
			
		||||
      tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
 | 
			
		||||
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
 | 
			
		||||
 | 
			
		||||
      scores = {}
 | 
			
		||||
      languages.each do |language|
 | 
			
		||||
 
 | 
			
		||||
@@ -5,23 +5,13 @@ module Linguist
 | 
			
		||||
  # It strips any data strings or comments and preserves significant
 | 
			
		||||
  # language symbols.
 | 
			
		||||
  class Tokenizer
 | 
			
		||||
    # Public: Initialize a Tokenizer.
 | 
			
		||||
    # Public: Extract tokens from data
 | 
			
		||||
    #
 | 
			
		||||
    # data - String data to scan.
 | 
			
		||||
    def initialize(data)
 | 
			
		||||
      @data = data
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    # Public: Get source data.
 | 
			
		||||
    #
 | 
			
		||||
    # Returns String.
 | 
			
		||||
    attr_reader :data
 | 
			
		||||
 | 
			
		||||
    # Public: Extract tokens from data.
 | 
			
		||||
    # data - String to tokenize
 | 
			
		||||
    #
 | 
			
		||||
    # Returns Array of token Strings.
 | 
			
		||||
    def tokens
 | 
			
		||||
      extract_tokens(data)
 | 
			
		||||
    def self.tokenize(data)
 | 
			
		||||
      new.extract_tokens(data)
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    SINGLE_LINE_COMMENTS = [
 | 
			
		||||
 
 | 
			
		||||
@@ -32,7 +32,7 @@ class TestClassifier < Test::Unit::TestCase
 | 
			
		||||
    results = classifier.classify(fixture("objective-c/hello.m"))
 | 
			
		||||
    assert_equal Language["Objective-C"], results.first[0]
 | 
			
		||||
 | 
			
		||||
    tokens  = Tokenizer.new(fixture("objective-c/hello.m")).tokens
 | 
			
		||||
    tokens  = Tokenizer.tokenize(fixture("objective-c/hello.m"))
 | 
			
		||||
    results = classifier.classify(tokens)
 | 
			
		||||
    assert_equal Language["Objective-C"], results.first[0]
 | 
			
		||||
  end
 | 
			
		||||
 
 | 
			
		||||
@@ -11,7 +11,7 @@ class TestTokenizer < Test::Unit::TestCase
 | 
			
		||||
 | 
			
		||||
  def tokenize(data)
 | 
			
		||||
    data = File.read(File.join(samples_path, data.to_s)) if data.is_a?(Symbol)
 | 
			
		||||
    Tokenizer.new(data).tokens
 | 
			
		||||
    Tokenizer.tokenize(data)
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def test_skip_string_literals
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user