Skip emitting comment tokens

2025-12-08 20:38:47 +00:00 · 2012-08-20 10:34:07 -05:00
parent 657adaabec
commit 220108857c
2 changed files with 17 additions and 14 deletions
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -16,12 +16,15 @@ module Linguist
      new.extract_tokens(data)
    end
    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # Tex
    ]
    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
@@ -30,7 +33,7 @@ module Linguist
    ]
    START_SINGLE_LINE_COMMENT =  Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
-      "^\s*#{Regexp.escape(c)} "
+      "\s*#{Regexp.escape(c)} "
    }.join("|"))
    START_MULTI_LINE_COMMENT =  Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
@@ -58,16 +61,16 @@ module Linguist
          end
        # Single line comment
-        elsif token = s.scan(START_SINGLE_LINE_COMMENT)
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
-          tokens << token.strip
+          # tokens << token.strip
          s.skip_until(/\n|\Z/)
        # Multiline comments
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
-          tokens << token
+          # tokens << token
          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
-          tokens << close_token
+          # tokens << close_token
        # Skip single or double quoted strings
        elsif s.scan(/"/)
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -34,15 +34,15 @@ class TestTokenizer < Test::Unit::TestCase
  end
  def test_skip_comments
-    assert_equal %w(foo #), tokenize("foo\n# Comment")
+    assert_equal %w(foo), tokenize("foo\n# Comment")
-    assert_equal %w(foo # bar), tokenize("foo\n# Comment\nbar")
+    assert_equal %w(foo bar), tokenize("foo\n# Comment\nbar")
-    assert_equal %w(foo //), tokenize("foo\n// Comment")
+    assert_equal %w(foo), tokenize("foo\n// Comment")
-    assert_equal %w(foo /* */), tokenize("foo /* Comment */")
+    assert_equal %w(foo), tokenize("foo /* Comment */")
-    assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
+    assert_equal %w(foo), tokenize("foo /* \nComment\n */")
-    assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")
+    assert_equal %w(foo), tokenize("foo <!-- Comment -->")
-    assert_equal %w(foo {- -}), tokenize("foo {- Comment -}")
+    assert_equal %w(foo), tokenize("foo {- Comment -}")
-    assert_equal %w(foo \(* *\)), tokenize("foo (* Comment *)")
+    assert_equal %w(foo), tokenize("foo (* Comment *)")
-    assert_equal %w(% %), tokenize("2 % 10\n% Comment")
+    assert_equal %w(%), tokenize("2 % 10\n% Comment")
  end
  def test_sgml_tags