diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb
index b6b6612f..829c64f5 100644
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -18,6 +18,10 @@ module Linguist
     #
     # Returns Array of Strings.
     def tokens
+      extract_tokens(data)
+    end
+
+    def extract_tokens(data)
       s = StringScanner.new(data)
 
       tokens = []
@@ -55,12 +59,16 @@ module Linguist
         elsif s.scan(/'/)
           s.skip_until(/[^\\]'/)
 
+        # SGML style brackets
+        elsif token = s.scan(/<[^>]+>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
         # Common programming punctuation
-        elsif token = s.scan(/;|\{|\}|\(|\)/)
+        elsif token = s.scan(/;|\{|\}|\(|\)|</)
           tokens << token
 
         # Regular token
-        elsif token = s.scan(/[\w\.@#\/<>]+/)
+        elsif token = s.scan(/[\w\.@#\/]+/)
           tokens << token
 
         else
@@ -70,5 +78,39 @@ module Linguist
 
       tokens
     end
+
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
   end
 end
diff --git a/test/test_tokenizer.rb b/test/test_tokenizer.rb
index 9cfbcec3..9b0ec3e8 100644
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -32,6 +32,16 @@ class TestTokenizer < Test::Unit::TestCase
     assert_equal %w(foo {- -}), Tokenizer.new("foo {- Comment -}").tokens
   end
 
+  def test_sgml_tags
+    assert_equal %w(<tag>), Tokenizer.new("<tag>").tokens
+    assert_equal %w(