Changed tokenizer number literals to be more encompassing

The number-literal rule now skips hexadecimal and C-style literals.
2026-02-13 19:59:34 +00:00 · 2015-02-20 14:08:39 +11:00
parent d28f5e87c0
commit 885b5aab41
2 changed files with 5 additions and 1 deletions
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -94,7 +94,7 @@ module Linguist
          end

        # Skip number literals
-        elsif s.scan(/(0x)?\d(\d|\.)*/)
+        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -25,6 +25,10 @@ class TestTokenizer < Minitest::Test
    assert_equal %w(add \( \)), tokenize('add(123, 456)')
    assert_equal %w(|), tokenize('0x01 | 0x10')
    assert_equal %w(*), tokenize('500.42 * 1.0')
+    assert_equal %w(), tokenize('1.23e-04')
+    assert_equal %w(), tokenize('1.0f')
+    assert_equal %w(), tokenize('1234ULL')
+    assert_equal %w(G1 X55 Y5 F2000), tokenize('G1 X55 Y5 F2000')
  end

  def test_skip_comments