diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 37793a36..76eab7ea 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -241,7 +241,25 @@ module Linguist def lines @lines ||= if viewable? && data - data.split(/\r\n|\r|\n/, -1) + # `data` is usually encoded as ASCII-8BIT even when the content has + # been detected as a different encoding. However, we are not allowed + # to change the encoding of `data` because we've made the implicit + # guarantee that each entry in `lines` is encoded the same way as + # `data`. + # + # Instead, we re-encode each possible newline sequence as the + # detected encoding, then force them back to the encoding of `data` + # (usually a binary encoding like ASCII-8BIT). This means that the + # byte sequence will match how newlines are likely encoded in the + # file, but we don't have to change the encoding of `data` as far as + # Ruby is concerned. This allows us to correctly parse out each line + # without changing the encoding of `data`, and also--importantly--without + # duplicating many (potentially large) strings. + # NOTE(review): confirm `encoding` is always a name `encode` can convert to. + encoded_newlines = ["\r\n", "\r", "\n"]. 
+ map { |nl| nl.encode(encoding).force_encoding(data.encoding) } + + data.split(Regexp.union(encoded_newlines), -1) else [] end diff --git a/samples/Text/utf16le-windows.txt b/samples/Text/utf16le-windows.txt new file mode 100644 index 00000000..590ae2a6 Binary files /dev/null and b/samples/Text/utf16le-windows.txt differ diff --git a/samples/Text/utf16le.txt b/samples/Text/utf16le.txt new file mode 100644 index 00000000..1829ef70 Binary files /dev/null and b/samples/Text/utf16le.txt differ diff --git a/test/test_blob.rb b/test/test_blob.rb index ec1443ad..da13b96e 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -11,6 +11,17 @@ class TestBlob < Test::Unit::TestCase Lexer = Pygments::Lexer + def setup + # git blobs are normally loaded as ASCII-8BIT since they may contain data + # with arbitrary encoding not known ahead of time + @original_external = Encoding.default_external + Encoding.default_external = Encoding.find("ASCII-8BIT") + end + + def teardown + Encoding.default_external = @original_external + end + def samples_path File.expand_path("../../samples", __FILE__) end @@ -67,6 +78,14 @@ class TestBlob < Test::Unit::TestCase assert_equal 475, blob("Emacs Lisp/ess-julia.el").lines.length end + def test_lines_maintains_original_encoding + # Even if the file's encoding is detected as something like UTF-16LE, + # earlier versions of the gem implicitly guaranteed that each entry in + # `lines` keeps the encoding the file was originally read with (in + # practice, UTF-8 or ASCII-8BIT) + assert_equal Encoding.default_external, blob("Text/utf16le.txt").lines.first.encoding + end + def test_size assert_equal 15, blob("Ruby/foo.rb").size end @@ -77,12 +96,15 @@ class TestBlob < Test::Unit::TestCase def test_sloc assert_equal 2, blob("Ruby/foo.rb").sloc + assert_equal 3, blob("Text/utf16le-windows.txt").sloc end def test_encoding assert_equal "ISO-8859-2", blob("Text/README").encoding assert_equal "ISO-8859-1", blob("Text/dump.sql").encoding 
assert_equal "UTF-8", blob("Text/foo.txt").encoding + assert_equal "UTF-16LE", blob("Text/utf16le.txt").encoding + assert_equal "UTF-16LE", blob("Text/utf16le-windows.txt").encoding assert_nil blob("Binary/dog.o").encoding end