diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb
index 829c64f5..4e641667 100644
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -5,22 +5,35 @@ module Linguist
# It strips any data strings or comments and preserves significant
# language symbols.
class Tokenizer
+ # Public: Initialize a Tokenizer.
+ #
+ # data - String data to scan.
def initialize(data)
@data = data
end
- # Get source data.
+ # Public: Get source data.
#
# Returns String.
attr_reader :data
- # Extract tokens from data.
+ # Public: Extract tokens from data.
#
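+ # Examples
+ #
+ # Tokenizer.new("printf('Hello')").tokens
+ # # => ['printf', '(', ')']
+ #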
- # Returns Array of Strings.
+ # Returns Array of token Strings.
def tokens
extract_tokens(data)
end
+ # Internal: Extract generic tokens from data.
+ #
+ # data - String to scan.
+ #
+ # Examples
+ #
+ # extract_tokens("printf('Hello')")
+ # # => ['printf', '(', ')']
+ #
+ # Returns Array of token Strings.
def extract_tokens(data)
s = StringScanner.new(data)
@@ -54,6 +67,7 @@ module Linguist
s.skip_until(/-->/)
tokens << "-->"
+ # Skip single or double quoted strings
elsif s.scan(/"/)
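+ # Skip ahead to the first quote not preceded by a backslash,
+ # i.e. the unescaped closing quote of the string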
s.skip_until(/[^\\]"/)
elsif s.scan(/'/)
@@ -79,18 +93,31 @@ module Linguist
tokens
end
+ # Internal: Extract tokens from inside SGML tag.
+ #
+ # data - SGML tag String.
+ #
+ # Examples
+ #
+ # extract_sgml_tokens("<a href='#'>")
+ # # => ["<a>", "href="]
+ #
+ # Returns Array of token Strings.
def extract_sgml_tokens(data)
s = StringScanner.new(data)
tokens = []
until s.eos?
+ # Emit start token
if token = s.scan(/<\/?[^\s>]+/)
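+ # The scan stops before any attributes, so ">" is appended
+ # to emit a normalized tag token such as "<a>"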
tokens << "#{token}>"
+ # Emit attributes with trailing =
elsif token = s.scan(/\w+=/)
tokens << token
+ # Then skip over attribute value
if s.scan(/"/)
s.skip_until(/[^\\]"/)
elsif s.scan(/'/)
@@ -99,9 +126,11 @@ module Linguist
s.skip_until(/\w+/)
end
+ # Emit lone attributes
elsif token = s.scan(/\w+/)
tokens << token
+ # Stop at the end of the tag
elsif s.scan(/>/)
s.terminate