More tokenizer docs

This commit is contained in:
Joshua Peek
2012-06-19 13:09:23 -05:00
parent ecb2397e59
commit 555573071e

View File

@@ -5,22 +5,35 @@ module Linguist
# It strips any data strings or comments and preserves significant
# language symbols.
class Tokenizer
# Public: Initialize a Tokenizer.
#
# data - String data to scan.
def initialize(data)
@data = data
end
# Public: Get source data.
#
# Returns String.
attr_reader :data
# Public: Extract tokens from data.
#
# Returns Array of token Strings.
def tokens
extract_tokens(data)
end
# Internal: Extract generic tokens from data.
#
# data - String to scan.
#
# Examples
#
# extract_tokens("printf('Hello')")
# # => ['printf', '(', ')']
#
# Returns Array of token Strings.
def extract_tokens(data)
s = StringScanner.new(data)
@@ -54,6 +67,7 @@ module Linguist
s.skip_until(/-->/)
tokens << "-->"
# Skip single or double quoted strings
elsif s.scan(/"/)
s.skip_until(/[^\\]"/)
elsif s.scan(/'/)
@@ -79,18 +93,31 @@ module Linguist
tokens
end
# Internal: Extract tokens from inside SGML tag.
#
# data - SGML tag String.
#
# Examples
#
# extract_sgml_tokens("<a href='' class=foo>")
# # => ["<a>", "href="]
#
# Returns Array of token Strings.
def extract_sgml_tokens(data)
s = StringScanner.new(data)
tokens = []
until s.eos?
# Emit start token
if token = s.scan(/<\/?[^\s>]+/)
tokens << "#{token}>"
# Emit attributes with trailing =
elsif token = s.scan(/\w+=/)
tokens << token
# Then skip over attribute value
if s.scan(/"/)
s.skip_until(/[^\\]"/)
elsif s.scan(/'/)
@@ -99,9 +126,11 @@ module Linguist
s.skip_until(/\w+/)
end
# Emit lone attributes
elsif token = s.scan(/\w+/)
tokens << token
# Stop at the end of the tag
elsif s.scan(/>/)
s.terminate