More tokenizer docs

2026-02-14 20:29:32 +00:00 · 2012-06-19 13:09:23 -05:00
parent ecb2397e59
commit 555573071e
1 changed files with 32 additions and 3 deletions
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -5,22 +5,35 @@ module Linguist
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
+    # Public: Initialize a Tokenizer.
+    #
+    # data - String data to scan.
    def initialize(data)
      @data = data
    end

-    # Get source data.
+    # Public: Get source data.
    #
    # Returns String.
    attr_reader :data

-    # Extract tokens from data.
+    # Public: Extract tokens from data.
    #
-    # Returns Array of Strings.
+    # Returns Array of token Strings.
    def tokens
      extract_tokens(data)
    end

+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

@@ -54,6 +67,7 @@ module Linguist
          s.skip_until(/-->/)
          tokens << "-->"

+        # Skip single or double quoted strings
        elsif s.scan(/"/)
          s.skip_until(/[^\\]"/)
        elsif s.scan(/'/)
@@ -79,18 +93,31 @@ module Linguist
      tokens
    end

+    # Internal: Extract tokens from inside an SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href=", "class="]
+    #
+    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
+        # Emit opening or closing tag name token (e.g. "<a>" or "</a>")
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

+        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

+          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
@@ -99,9 +126,11 @@ module Linguist
            s.skip_until(/\w+/)
          end

+        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

+        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate