diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb index f5f12981..0df2c56f 100644 --- a/lib/linguist/tokenizer.rb +++ b/lib/linguist/tokenizer.rb @@ -74,18 +74,22 @@ module Linguist s.skip_until(/[^\\]'/) # Skip number literals - elsif s.scan(/\d+/) + elsif s.scan(/(0x)?\d+/) # SGML style brackets - elsif token = s.scan(/<[^>]+>/) + elsif token = s.scan(/<[^\s<>][^<>]*>/) extract_sgml_tokens(token).each { |t| tokens << t } # Common programming punctuation - elsif token = s.scan(/;|\{|\}|\(|\)|< version=), tokenize("") end + def test_operators + assert_equal %w(+), tokenize("1 + 1") + assert_equal %w(-), tokenize("1 - 1") + assert_equal %w(*), tokenize("1 * 1") + assert_equal %w(/), tokenize("1 / 1") + assert_equal %w(&), tokenize("1 & 1") + assert_equal %w(&&), tokenize("1 && 1") + assert_equal %w(|), tokenize("1 | 1") + assert_equal %w(||), tokenize("1 || 1") + assert_equal %w(<), tokenize("1 < 0x01") + assert_equal %w(<<), tokenize("1 << 0x01") + end + def test_c_tokens assert_equal %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif), tokenize(:"c/hello.h") assert_equal %w(#include int main \( \) { printf \( \) ; return ; }), tokenize(:"c/hello.c") end def test_cpp_tokens - assert_equal %w(class Bar { protected char name ; public void hello \( \) ; }), tokenize(:"cpp/bar.h") + assert_equal %w(class Bar { protected char *name ; public void hello \( \) ; }), tokenize(:"cpp/bar.h") assert_equal %w(#include using namespace std ; int main \( \) { cout << << endl ; }), tokenize(:"cpp/hello.cpp") end def test_objective_c_tokens assert_equal %w(#import @interface Foo NSObject { } @end), tokenize(:"objective-c/Foo.h") assert_equal %w(#import @implementation Foo @end), tokenize(:"objective-c/Foo.m") - assert_equal %w(#import int main \( int argc char argv \) { NSLog \( @ \) ; return ; }), tokenize(:"objective-c/hello.m") + assert_equal %w(#import int main \( int argc char *argv \) { NSLog \( @ \) ; return ; }), tokenize(:"objective-c/hello.m") end def test_javascript_tokens