mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 17:50:22 +00:00
More tokenizer docs
This commit is contained in:
@@ -5,22 +5,35 @@ module Linguist
|
||||
# It strips any data strings or comments and preserves significant
|
||||
# language symbols.
|
||||
class Tokenizer
|
||||
# Public: Initialize a Tokenizer.
|
||||
#
|
||||
# data - String data to scan.
|
||||
def initialize(data)
|
||||
@data = data
|
||||
end
|
||||
|
||||
# Get source data.
|
||||
# Public: Get source data.
|
||||
#
|
||||
# Returns String.
|
||||
attr_reader :data
|
||||
|
||||
# Extract tokens from data.
|
||||
# Public: Extract tokens from data.
|
||||
#
|
||||
# Returns Array of Strings.
|
||||
# Returns Array of token Strings.
|
||||
def tokens
|
||||
extract_tokens(data)
|
||||
end
|
||||
|
||||
# Internal: Extract generic tokens from data.
|
||||
#
|
||||
# data - String to scan.
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# extract_tokens("printf('Hello')")
|
||||
# # => ['printf', '(', ')']
|
||||
#
|
||||
# Returns Array of token Strings.
|
||||
def extract_tokens(data)
|
||||
s = StringScanner.new(data)
|
||||
|
||||
@@ -54,6 +67,7 @@ module Linguist
|
||||
s.skip_until(/-->/)
|
||||
tokens << "-->"
|
||||
|
||||
# Skip single or double quoted strings
|
||||
elsif s.scan(/"/)
|
||||
s.skip_until(/[^\\]"/)
|
||||
elsif s.scan(/'/)
|
||||
@@ -79,18 +93,31 @@ module Linguist
|
||||
tokens
|
||||
end
|
||||
|
||||
# Internal: Extract tokens from inside SGML tag.
|
||||
#
|
||||
# data - SGML tag String.
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# extract_sgml_tokens("<a href='' class=foo>")
|
||||
# # => ["<a>", "href="]
|
||||
#
|
||||
# Returns Array of token Strings.
|
||||
def extract_sgml_tokens(data)
|
||||
s = StringScanner.new(data)
|
||||
|
||||
tokens = []
|
||||
|
||||
until s.eos?
|
||||
# Emit start token
|
||||
if token = s.scan(/<\/?[^\s>]+/)
|
||||
tokens << "#{token}>"
|
||||
|
||||
# Emit attributes with trailing =
|
||||
elsif token = s.scan(/\w+=/)
|
||||
tokens << token
|
||||
|
||||
# Then skip over attribute value
|
||||
if s.scan(/"/)
|
||||
s.skip_until(/[^\\]"/)
|
||||
elsif s.scan(/'/)
|
||||
@@ -99,9 +126,11 @@ module Linguist
|
||||
s.skip_until(/\w+/)
|
||||
end
|
||||
|
||||
# Emit lone attributes
|
||||
elsif token = s.scan(/\w+/)
|
||||
tokens << token
|
||||
|
||||
# Stop at the end of the tag
|
||||
elsif s.scan(/>/)
|
||||
s.terminate
|
||||
|
||||
|
||||
Reference in New Issue
Block a user