Improve operator tokenizing

This commit is contained in:
Joshua Peek
2012-06-20 17:16:53 -05:00
parent c353d3a050
commit 5cdd5e206c
2 changed files with 24 additions and 6 deletions

View File

@@ -74,18 +74,22 @@ module Linguist
s.skip_until(/[^\\]'/)
# Skip number literals
elsif s.scan(/\d+/)
elsif s.scan(/(0x)?\d+/)
# SGML style brackets
elsif token = s.scan(/<[^>]+>/)
elsif token = s.scan(/<[^\s<>][^<>]*>/)
extract_sgml_tokens(token).each { |t| tokens << t }
# Common programming punctuation
elsif token = s.scan(/;|\{|\}|\(|\)|<<?|\+/)
elsif token = s.scan(/;|\{|\}|\(|\)/)
tokens << token
# Regular token
elsif token = s.scan(/[\w\.@#\/]+/)
elsif token = s.scan(/[\w\.@#\/\*]+/)
tokens << token
# Common operators
elsif token = s.scan(/<<?|\+|\-|\*|\/|&&?|\|\|?/)
tokens << token
else

View File

@@ -25,6 +25,7 @@ class TestTokenizer < Test::Unit::TestCase
# Numeric literals, both plain decimal and 0x-prefixed, are expected to be
# absent from the token list; only the surrounding operator, identifier and
# punctuation tokens remain.
def test_skip_number_literals
  # Each entry pairs an input snippet with the tokens expected back.
  samples = [
    ['1 + 1', %w(+)],
    ['add(123, 456)', %w(add \( \))],
    ['0x01 | 0x10', %w(|)]
  ]

  samples.each do |snippet, expected_tokens|
    assert_equal expected_tokens, tokenize(snippet)
  end
end
def test_skip_comments
@@ -47,20 +48,33 @@ class TestTokenizer < Test::Unit::TestCase
assert_equal %w(<?xml> version=), tokenize("<?xml version=\"1.0\"?>")
end
# Every operator placed between two numeric operands is expected to come
# back as the single token of the snippet.
def test_operators
  # [expected operator token, snippet fed to the tokenizer]
  operator_cases = [
    ["+",  "1 + 1"],
    ["-",  "1 - 1"],
    ["*",  "1 * 1"],
    ["/",  "1 / 1"],
    ["&",  "1 & 1"],
    ["&&", "1 && 1"],
    ["|",  "1 | 1"],
    ["||", "1 || 1"],
    # The last two cases use a hex literal as the right-hand operand.
    ["<",  "1 < 0x01"],
    ["<<", "1 << 0x01"]
  ]

  operator_cases.each do |operator, snippet|
    assert_equal [operator], tokenize(snippet)
  end
end
# Checks the token lists produced for the two C samples.
# NOTE(review): tokenize is handed a symbol here; presumably the test helper
# resolves it to a sample file on disk -- confirm against the helper.
def test_c_tokens
  header_tokens = %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif)
  assert_equal header_tokens, tokenize(:"c/hello.h")

  program_tokens = %w(#include <stdio.h> int main \( \) { printf \( \) ; return ; })
  assert_equal program_tokens, tokenize(:"c/hello.c")
end
# Checks the token lists produced for the two C++ samples.
# NOTE(review): tokenize is handed a symbol here; presumably the test helper
# resolves it to a sample file on disk -- confirm against the helper.
def test_cpp_tokens
  # Only one expectation per sample: the pointer star is expected to stay
  # attached to the identifier (`*name`). A second assertion expecting a bare
  # `name` for the same input contradicted this one and could never pass
  # alongside it, so it has been dropped.
  assert_equal %w(class Bar { protected char *name ; public void hello \( \) ; }), tokenize(:"cpp/bar.h")
  assert_equal %w(#include <iostream> using namespace std ; int main \( \) { cout << << endl ; }), tokenize(:"cpp/hello.cpp")
end
# Checks the token lists produced for the three Objective-C samples.
# NOTE(review): tokenize is handed a symbol here; presumably the test helper
# resolves it to a sample file on disk -- confirm against the helper.
def test_objective_c_tokens
  assert_equal %w(#import <Foundation/Foundation.h> @interface Foo NSObject { } @end), tokenize(:"objective-c/Foo.h")
  assert_equal %w(#import @implementation Foo @end), tokenize(:"objective-c/Foo.m")
  # Only one expectation for hello.m: the pointer star is expected to stay
  # attached to the identifier (`*argv`). A second assertion expecting a bare
  # `argv` for the same input contradicted this one and could never pass
  # alongside it, so it has been dropped.
  assert_equal %w(#import <Cocoa/Cocoa.h> int main \( int argc char *argv \) { NSLog \( @ \) ; return ; }), tokenize(:"objective-c/hello.m")
end
def test_javascript_tokens