diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb
index b6b6612f..829c64f5 100644
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -18,6 +18,10 @@ module Linguist
     #
     # Returns Array of Strings.
     def tokens
+      extract_tokens(data)
+    end
+
+    def extract_tokens(data)
       s = StringScanner.new(data)
 
       tokens = []
@@ -55,12 +59,16 @@ module Linguist
         elsif s.scan(/'/)
           s.skip_until(/[^\\]'/)
 
+        # SGML style brackets
+        elsif token = s.scan(/<[^>]+>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
         # Common programming punctuation
-        elsif token = s.scan(/;|\{|\}|\(|\)/)
+        elsif token = s.scan(/;|\{|\}|\(|\)|</)
           tokens << token
 
         # Regular token
-        elsif token = s.scan(/[\w\.@#\/<>]+/)
+        elsif token = s.scan(/[\w\.@#\/]+/)
           tokens << token
 
         else
@@ -70,5 +78,39 @@ module Linguist
 
       tokens
     end
+
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
   end
 end
diff --git a/test/test_tokenizer.rb b/test/test_tokenizer.rb
index 9cfbcec3..9b0ec3e8 100644
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -32,6 +32,16 @@ class TestTokenizer < Test::Unit::TestCase
     assert_equal %w(foo {- -}), Tokenizer.new("foo {- Comment -}").tokens
   end
 
+  def test_sgml_tags
+    assert_equal %w(<tag>), Tokenizer.new("<tag>").tokens
+    assert_equal %w(