From 555573071e9d4ac0748ec9ffc8add8b3edba5817 Mon Sep 17 00:00:00 2001
From: Joshua Peek
Date: Tue, 19 Jun 2012 13:09:23 -0500
Subject: [PATCH] More tokenizer docs

---
 lib/linguist/tokenizer.rb | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb
index 829c64f5..4e641667 100644
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -5,22 +5,35 @@ module Linguist
   # It strips any data strings or comments and preserves significant
   # language symbols.
   class Tokenizer
+    # Public: Initialize a Tokenizer.
+    #
+    # data - String data to scan.
     def initialize(data)
       @data = data
     end
 
-    # Get source data.
+    # Public: Get source data.
     #
     # Returns String.
     attr_reader :data
 
-    # Extract tokens from data.
+    # Public: Extract tokens from data.
     #
-    # Returns Array of Strings.
+    # Returns Array of token Strings.
     def tokens
       extract_tokens(data)
     end
 
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
     def extract_tokens(data)
       s = StringScanner.new(data)
 
@@ -54,6 +67,7 @@ module Linguist
           s.skip_until(/-->/)
           tokens << "-->"
 
+        # Skip single or double quoted strings
         elsif s.scan(/"/)
           s.skip_until(/[^\\]"/)
         elsif s.scan(/'/)
@@ -79,18 +93,31 @@ module Linguist
       tokens
     end
 
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href=''>")
+    #   # => ["<a>", "href="]
+    #
+    # Returns Array of token Strings.
     def extract_sgml_tokens(data)
       s = StringScanner.new(data)
 
       tokens = []
 
      until s.eos?
+        # Emit start token
         if token = s.scan(/<\/?[^\s>]+/)
           tokens << "#{token}>"
 
+        # Emit attributes with trailing =
         elsif token = s.scan(/\w+=/)
           tokens << token
 
+          # Then skip over attribute value
           if s.scan(/"/)
             s.skip_until(/[^\\]"/)
           elsif s.scan(/'/)
@@ -99,9 +126,11 @@ module Linguist
             s.skip_until(/\w+/)
           end
 
+        # Emit lone attributes
         elsif token = s.scan(/\w+/)
           tokens << token
 
+        # Stop at the end of the tag
         elsif s.scan(/>/)
           s.terminate
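
For reference, a quick usage sketch of the public API these docs describe.
The return values are copied from the docstring examples the patch adds; the
"<a href=''>" input is a stand-in for any simple SGML tag, and the second
call assumes extract_tokens routes tag-shaped input through
extract_sgml_tokens, as the docstrings suggest.

    require 'linguist/tokenizer'

    # Public entry point: construct with the source data, then extract tokens.
    # String contents are skipped; only significant symbols are kept.
    Linguist::Tokenizer.new("printf('Hello')").tokens
    # => ['printf', '(', ')']

    # SGML-style input yields tag and attribute-name tokens; attribute
    # values, like string literals, are skipped rather than emitted.
    Linguist::Tokenizer.new("<a href=''>").tokens
    # => ["<a>", "href="]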