diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 9ab8d2c5..76eab7ea 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -241,8 +241,25 @@ module Linguist def lines @lines ||= if viewable? && data - newlines = Regexp.new("\r\n|\r|\n".encode(encoding)) - data.force_encoding(encoding).split(newlines, -1) + # `data` is usually encoded as ASCII-8BIT even when the content has + # been detected as a different encoding. However, we are not allowed + # to change the encoding of `data` because we've made the implicit + # guarantee that each entry in `lines` is encoded the same way as + # `data`. + # + # Instead, we re-encode each possible newline sequence as the + # detected encoding, then force them back to the encoding of `data` + # (usually a binary encoding like ASCII-8BIT). This means that the + # byte sequence will match how newlines are likely encoded in the + # file, but we don't have to change the encoding of `data` as far as + # Ruby is concerned. This allows us to correctly parse out each line + # without changing the encoding of `data`, and + # also--importantly--without having to duplicate many (potentially + # large) strings. + encoded_newlines = ["\r\n", "\r", "\n"]. + map { |nl| nl.encode(encoding).force_encoding(data.encoding) } + + data.split(Regexp.union(encoded_newlines), -1) else [] end @@ -263,7 +280,7 @@ module Linguist # # Returns Integer def sloc - lines.grep(Regexp.new('\S'.encode(encoding || 'ASCII-8BIT'))).size + lines.grep(/\S/).size end # Public: Is the blob a generated file? diff --git a/test/test_blob.rb b/test/test_blob.rb index 890242b6..da13b96e 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -11,6 +11,17 @@ class TestBlob < Test::Unit::TestCase Lexer = Pygments::Lexer + def setup + # git blobs are normally loaded as ASCII-8BIT since they may contain data + # with arbitrary encoding not known ahead of time + @original_external = Encoding.default_external + Encoding.default_external = Encoding.find("ASCII-8BIT") + end + + def teardown + Encoding.default_external = @original_external + end + def samples_path File.expand_path("../../samples", __FILE__) end @@ -67,6 +78,14 @@ class TestBlob < Test::Unit::TestCase assert_equal 475, blob("Emacs Lisp/ess-julia.el").lines.length end + def test_lines_maintains_original_encoding + # Even if the file's encoding is detected as something like UTF-16LE, + # earlier versions of the gem made implicit guarantees that the encoding of + # each `line` is in the same encoding as the file was originally read (in + # practice, UTF-8 or ASCII-8BIT) + assert_equal Encoding.default_external, blob("Text/utf16le.txt").lines.first.encoding + end + def test_size assert_equal 15, blob("Ruby/foo.rb").size end