Improve tokenizing sgml tags

This commit is contained in:
Joshua Peek
2012-06-08 14:46:16 -05:00
parent 8351d55c56
commit 6f6dd8bc38
2 changed files with 54 additions and 2 deletions

View File

@@ -18,6 +18,10 @@ module Linguist
# #
# Returns Array of Strings. # Returns Array of Strings.
def tokens def tokens
extract_tokens(data)
end
def extract_tokens(data)
s = StringScanner.new(data) s = StringScanner.new(data)
tokens = [] tokens = []
@@ -55,12 +59,16 @@ module Linguist
elsif s.scan(/'/) elsif s.scan(/'/)
s.skip_until(/[^\\]'/) s.skip_until(/[^\\]'/)
# SGML style brackets
elsif token = s.scan(/<[^>]+>/)
extract_sgml_tokens(token).each { |t| tokens << t }
# Common programming punctuation # Common programming punctuation
elsif token = s.scan(/;|\{|\}|\(|\)/) elsif token = s.scan(/;|\{|\}|\(|\)|<<?/)
tokens << token tokens << token
# Regular token # Regular token
elsif token = s.scan(/[\w\.@#\/<>]+/) elsif token = s.scan(/[\w\.@#\/]+/)
tokens << token tokens << token
else else
@@ -70,5 +78,39 @@ module Linguist
tokens tokens
end end
# Internal: Extract tokens from the inside of a single SGML style tag.
#
# Emits the tag name (normalized with a closing ">") and the attribute
# names. Attribute values are skipped so they never end up as tokens.
#
# data - String holding one tag, e.g. "<div id='foo'>".
#
# Returns Array of token Strings.
def extract_sgml_tokens(data)
  s = StringScanner.new(data)

  tokens = []

  until s.eos?
    # Emit the opening/closing tag name, e.g. "<div" becomes "<div>"
    if token = s.scan(/<\/?[^\s>]+/)
      tokens << "#{token}>"

    # Emit attribute names that carry a value, keeping the trailing =
    elsif token = s.scan(/\w+=/)
      tokens << token

      # Then skip over the attribute value
      if s.scan(/"/)
        # An immediately closing quote is an empty value. The escape-aware
        # pattern needs one character before the quote, so without this
        # check it would run on into the next attribute's value.
        s.skip_until(/[^\\]"/) unless s.skip(/"/)
      elsif s.scan(/'/)
        s.skip_until(/[^\\]'/) unless s.skip(/'/)
      else
        # Unquoted values run up to the next whitespace or the end of the
        # tag; consume the whole value, not just its first word.
        s.skip(/\s*[^\s>]+/)
      end

    # Emit lone attributes that have no value
    elsif token = s.scan(/\w+/)
      tokens << token

    # Stop at the end of the tag
    elsif s.scan(/>/)
      s.terminate

    # Anything else (whitespace, stray punctuation) is dropped
    else
      s.getch
    end
  end

  tokens
end
end end
end end

View File

@@ -32,6 +32,16 @@ class TestTokenizer < Test::Unit::TestCase
assert_equal %w(foo {- -}), Tokenizer.new("foo {- Comment -}").tokens assert_equal %w(foo {- -}), Tokenizer.new("foo {- Comment -}").tokens
end end
# SGML tags tokenize to the normalized tag names plus attribute names;
# attribute values never show up in the token list.
def test_sgml_tags
  cases = [
    ["<html></html>",                 %w(<html> </html>)],
    ["<div id></div>",                %w(<div> id </div>)],
    ["<div id=foo></div>",            %w(<div> id= </div>)],
    ["<div id class></div>",          %w(<div> id class </div>)],
    ["<div id=\"foo bar\"></div>",    %w(<div> id= </div>)],
    ["<div id='foo bar'></div>",      %w(<div> id= </div>)],
    ["<?xml version=\"1.0\"?>",       %w(<?xml> version=)]
  ]

  cases.each do |source, expected|
    assert_equal expected, Tokenizer.new(source).tokens
  end
end
def test_c_tokens def test_c_tokens
assert_equal %w(#include <stdio.h> int main \( \) { printf \( \) ; return 0 ; }), tokenize("c/hello.c") assert_equal %w(#include <stdio.h> int main \( \) { printf \( \) ; return 0 ; }), tokenize("c/hello.c")
assert_equal %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif), tokenize("c/hello.h") assert_equal %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif), tokenize("c/hello.h")