	Add basic Tokenizer
lib/linguist/tokenizer.rb (new file, 77 lines)
@@ -0,0 +1,77 @@
require 'strscan'

module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language Bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    def initialize(data)
      @data = data
    end

    # Get source data.
    #
    # Returns String.
    attr_reader :data

    # Extract tokens from data.
    #
    # Returns Array of Strings.
    def tokens
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        # Ruby single line comment
        if token = s.scan(/# /)
          tokens << "#"
          s.skip_until(/\n|\Z/)

        # C style single line comment
        elsif token = s.scan(/\/\/ /)
          tokens << "//"
          s.skip_until(/\n|\Z/)

        # C multiline comments
        elsif token = s.scan(/\/\*/)
          tokens << "/*"
          s.skip_until(/\*\//)
          tokens << "*/"

        # Haskell multiline comments
        elsif token = s.scan(/\{-/)
          tokens << "{-"
          s.skip_until(/-\}/)
          tokens << "-}"

        # XML multiline comments
        elsif token = s.scan(/<!--/)
          tokens << "<!--"
          s.skip_until(/-->/)
          tokens << "-->"

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          s.skip_until(/[^\\]"/)
        elsif s.scan(/'/)
          s.skip_until(/[^\\]'/)

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/<>]+/)
          tokens << token

        else
          s.getch
        end
      end

      tokens
    end
  end
end
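For reference, a minimal usage sketch; the expected results simply restate assertions from test/test_tokenizer.rb below, nothing beyond them:

require 'linguist/tokenizer'

Linguist::Tokenizer.new("foo # Comment\nbar").tokens
# => ["foo", "#", "bar"]

Linguist::Tokenizer.new('print "Hello \"Josh\""').tokens
# => ["print"]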
test/test_tokenizer.rb (new file, 60 lines)
@@ -0,0 +1,60 @@
require 'linguist/tokenizer'

require 'test/unit'

class TestTokenizer < Test::Unit::TestCase
  include Linguist

  def fixtures_path
    File.expand_path("../fixtures", __FILE__)
  end

  def tokenize(name)
    data = File.read(File.join(fixtures_path, name))
    Tokenizer.new(data).tokens
  end

  def test_skip_strings
    assert_equal %w(print), Tokenizer.new('print ""').tokens
    assert_equal %w(print), Tokenizer.new('print "Josh"').tokens
    assert_equal %w(print), Tokenizer.new("print 'Josh'").tokens
    assert_equal %w(print), Tokenizer.new('print "Hello \"Josh\""').tokens
    assert_equal %w(print), Tokenizer.new("print 'Hello \\'Josh\\''").tokens
  end

  def test_skip_comments
    assert_equal %w(foo #), Tokenizer.new("foo # Comment").tokens
    assert_equal %w(foo # bar), Tokenizer.new("foo # Comment\nbar").tokens
    assert_equal %w(foo //), Tokenizer.new("foo // Comment").tokens
    assert_equal %w(foo /* */), Tokenizer.new("foo /* Comment */").tokens
    assert_equal %w(foo /* */), Tokenizer.new("foo /* \nComment\n */").tokens
    assert_equal %w(foo <!-- -->), Tokenizer.new("foo <!-- Comment -->").tokens
    assert_equal %w(foo {- -}), Tokenizer.new("foo {- Comment -}").tokens
  end

  def test_c_tokens
    assert_equal %w(#include <stdio.h> int main \( \) { printf \( \) ; return 0 ; }), tokenize("c/hello.c")
    assert_equal %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif), tokenize("c/hello.h")
  end

  def test_cpp_tokens
    assert_equal %w(class Bar { protected char name ; public void hello \( \) ; }), tokenize("cpp/bar.h")
    assert_equal %w(#include <iostream> using namespace std ; int main \( \) { cout << << endl ; }), tokenize("cpp/hello.cpp")
  end

  def test_objective_c_tokens
    assert_equal %w(#import <Foundation/Foundation.h> @interface Foo NSObject { } @end), tokenize("objective-c/Foo.h")
    assert_equal %w(#import @implementation Foo @end), tokenize("objective-c/Foo.m")
    assert_equal %w(#import <Cocoa/Cocoa.h> int main \( int argc char argv \) { NSLog \( @ \) ; return 0 ; }), tokenize("objective-c/hello.m")
  end

  def test_javascript_tokens
    assert_equal %w( \( function \( \) { console.log \( \) ; } \) .call \( this \) ;), tokenize("javascript/hello.js")
  end

  def test_ruby_tokens
    assert_equal %w(module Foo end), tokenize("ruby/foo.rb")
    assert_equal %w(# /usr/bin/env ruby puts), tokenize("ruby/script.rb")
    assert_equal %w(task default do puts end), tokenize("ruby/Rakefile")
  end
end
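Assuming the conventional layout above (lib/ on the load path) and that the fixture files referenced by tokenize (e.g. c/hello.c, ruby/Rakefile) exist under test/fixtures/ (they are referenced but not shown in this diff), the suite should run directly with Test::Unit:

ruby -Ilib test/test_tokenizer.rb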