Mirror of https://github.com/KevinMidboe/linguist.git (synced 2025-10-29 17:50:22 +00:00)
More tokenizer docs
@@ -5,22 +5,35 @@ module Linguist
   # It strips any data strings or comments and preserves significant
   # language symbols.
   class Tokenizer
+    # Public: Initialize a Tokenizer.
+    #
+    # data - String data to scan.
     def initialize(data)
       @data = data
     end
 
-    # Get source data.
+    # Public: Get source data.
     #
     # Returns String.
     attr_reader :data
 
-    # Extract tokens from data.
+    # Public: Extract tokens from data.
     #
-    # Returns Array of Strings.
+    # Returns Array of token Strings.
     def tokens
       extract_tokens(data)
     end
 
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
     def extract_tokens(data)
       s = StringScanner.new(data)
 
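The hunk above documents the Tokenizer's public surface: new(data), the data reader, and tokens. As a quick orientation, here is a minimal usage sketch in Ruby; the require path is an assumption (it is not shown in this diff), and the expected output simply restates the Examples block added above.

    require 'linguist/tokenizer'   # assumed require path, not shown in the diff

    tokenizer = Linguist::Tokenizer.new("printf('Hello')")
    tokenizer.data    # => "printf('Hello')"      (Public: Get source data)
    tokenizer.tokens  # => ["printf", "(", ")"]   (per the extract_tokens example)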
@@ -54,6 +67,7 @@ module Linguist
           s.skip_until(/-->/)
           tokens << "-->"
 
+        # Skip single or double quoted strings
         elsif s.scan(/"/)
           s.skip_until(/[^\\]"/)
         elsif s.scan(/'/)
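The new comment marks the string-skipping branch: after the opening quote is consumed, skip_until(/[^\\]"/) jumps past everything up to the first quote that is not preceded by a backslash, so escaped quotes inside the literal do not end the skip. A standalone StringScanner sketch of that idea (illustrative only, not Linguist code):

    require 'strscan'

    s = StringScanner.new(%q{"a \" quoted" rest})
    s.scan(/"/)             # consume the opening quote, as in the elsif above
    s.skip_until(/[^\\]"/)  # skip to the first unescaped closing quote
    s.rest                  # => " rest"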
@@ -79,18 +93,31 @@ module Linguist
       tokens
     end
 
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href="]
+    #
+    # Returns Array of token Strings.
     def extract_sgml_tokens(data)
       s = StringScanner.new(data)
 
       tokens = []
 
       until s.eos?
+        # Emit start token
         if token = s.scan(/<\/?[^\s>]+/)
           tokens << "#{token}>"
 
+        # Emit attributes with trailing =
         elsif token = s.scan(/\w+=/)
           tokens << token
 
+          # Then skip over attribute value
           if s.scan(/"/)
             s.skip_until(/[^\\]"/)
           elsif s.scan(/'/)
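extract_sgml_tokens is documented as Internal, but assuming it is not marked private (this diff does not show the visibility), it can be called directly to see the contract spelled out in the Examples block:

    tokenizer = Linguist::Tokenizer.new("")
    tokenizer.extract_sgml_tokens("<a href='' class=foo>")
    # => ["<a>", "href="]   (output as given in the Examples comment above)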
@@ -99,9 +126,11 @@ module Linguist
             s.skip_until(/\w+/)
           end
 
+        # Emit lone attributes
         elsif token = s.scan(/\w+/)
           tokens << token
 
+        # Stop at the end of the tag
         elsif s.scan(/>/)
           s.terminate
 
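The last two comments cover the tail of the SGML loop: bare attribute names are emitted as-is, and a closing '>' ends the scan via StringScanner#terminate, which moves the scan position to the end of the input so the surrounding until s.eos? loop exits. A tiny illustration of that terminate behaviour (plain StringScanner, not Linguist code):

    require 'strscan'

    s = StringScanner.new("> trailing text")
    s.scan(/>/)    # the final elsif matches the closing '>'
    s.terminate    # jump to end of input
    s.eos?         # => true, so the until loop stops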