Merge pull request #1211 from alindeman/multibyte_line_count

Counts the number of lines correctly for files with certain multibyte encodings
2026-06-21 19:59:29 +00:00 · 2014-05-22 11:27:35 -04:00
parent 93d7aa3d07 09a33f8daa
commit 6a192dae63
4 changed files with 41 additions and 1 deletions
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -241,7 +241,25 @@ module Linguist
    def lines
      @lines ||=
        if viewable? && data
-          data.split(/\r\n|\r|\n/, -1)
+          # `data` is usually encoded as ASCII-8BIT even when the content has
          # been detected as a different encoding. However, we are not allowed
          # to change the encoding of `data` because we've made the implicit
          # guarantee that each entry in `lines` is encoded the same way as
          # `data`.
          #
          # Instead, we re-encode each possible newline sequence as the
          # detected encoding, then force them back to the encoding of `data`
          # (usually a binary encoding like ASCII-8BIT). This means that the
          # byte sequence will match how newlines are likely encoded in the
          # file, but we don't have to change the encoding of `data` as far as
          # Ruby is concerned. This allows us to correctly parse out each line
          # without changing the encoding of `data`, and
          # also--importantly--without having to duplicate many (potentially
          # large) strings.
          encoded_newlines = ["\r\n", "\r", "\n"].
            map { |nl| nl.encode(encoding).force_encoding(data.encoding) }
          data.split(Regexp.union(encoded_newlines), -1)
        else
          []
        end
--- a/samples/Text/utf16le-windows.txt
+++ b/samples/Text/utf16le-windows.txt
--- a/samples/Text/utf16le.txt
+++ b/samples/Text/utf16le.txt
--- a/test/test_blob.rb
+++ b/test/test_blob.rb
@@ -11,6 +11,17 @@ class TestBlob < Test::Unit::TestCase
  Lexer = Pygments::Lexer
  # Remembers the current default external encoding and switches it to
  # ASCII-8BIT for the duration of a test.
  def setup
    # Blob contents from git may be in any encoding, which is not known ahead
    # of time, so they are normally loaded as raw ASCII-8BIT bytes.
    @original_external = Encoding.default_external
    Encoding.default_external = Encoding::ASCII_8BIT
  end
  # Restores the default external encoding to the value held in
  # @original_external (assigned by `setup`), so the encoding override does
  # not persist after the test.
  def teardown
    Encoding.default_external = @original_external
  end
  # Absolute path of the `samples` directory, which sits next to the directory
  # containing this file.
  def samples_path
    this_dir = File.dirname(File.expand_path(__FILE__))
    File.join(File.dirname(this_dir), "samples")
  end
@@ -67,6 +78,14 @@ class TestBlob < Test::Unit::TestCase
    assert_equal 475, blob("Emacs Lisp/ess-julia.el").lines.length
  end
  def test_lines_maintains_original_encoding
    # The detected encoding of the file may be something like UTF-16LE, but
    # earlier versions of the gem implicitly guaranteed that every entry in
    # `lines` keeps the encoding the file was originally read with (in
    # practice, UTF-8 or ASCII-8BIT).
    first_line = blob("Text/utf16le.txt").lines.first
    assert_equal Encoding.default_external, first_line.encoding
  end
  # The Ruby/foo.rb sample is expected to report a size of 15.
  def test_size
    foo_blob = blob("Ruby/foo.rb")
    assert_equal 15, foo_blob.size
  end
@@ -77,12 +96,15 @@ class TestBlob < Test::Unit::TestCase
  # Checks the expected `sloc` value for each listed sample, including the
  # UTF-16LE sample with a "windows" filename.
  def test_sloc
    expected_sloc = [
      ["Ruby/foo.rb", 2],
      ["Text/utf16le-windows.txt", 3]
    ]
    expected_sloc.each do |path, count|
      assert_equal count, blob(path).sloc
    end
  end
  # Checks the encoding name reported for each text sample, and that the
  # Binary/dog.o sample reports no encoding at all.
  def test_encoding
    expected_encodings = [
      ["Text/README", "ISO-8859-2"],
      ["Text/dump.sql", "ISO-8859-1"],
      ["Text/foo.txt", "UTF-8"],
      ["Text/utf16le.txt", "UTF-16LE"],
      ["Text/utf16le-windows.txt", "UTF-16LE"]
    ]
    expected_encodings.each do |path, encoding_name|
      assert_equal encoding_name, blob(path).encoding
    end

    assert_nil blob("Binary/dog.o").encoding
  end