From 85efbde3f769d3fb8da98b294afec81e07d5814d Mon Sep 17 00:00:00 2001 From: Andy Lindeman Date: Wed, 21 May 2014 11:44:18 -0400 Subject: [PATCH 1/3] Counts the number of lines correctly for files with certain multibyte encodings --- lib/linguist/blob_helper.rb | 5 +++-- samples/Text/utf16le-windows.txt | Bin 0 -> 40 bytes samples/Text/utf16le.txt | Bin 0 -> 36 bytes test/test_blob.rb | 3 +++ 4 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 samples/Text/utf16le-windows.txt create mode 100644 samples/Text/utf16le.txt diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 37793a36..8aa96ed4 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -241,7 +241,8 @@ module Linguist def lines @lines ||= if viewable? && data - data.split(/\r\n|\r|\n/, -1) + newlines = Regexp.new("\r\n|\r|\n".encode(encoding)) + data.force_encoding(encoding).split(newlines, -1) else [] end @@ -262,7 +263,7 @@ module Linguist # # Returns Integer def sloc - lines.grep(/\S/).size + lines.grep(Regexp.new('\S'.encode(encoding))).size end # Public: Is the blob a generated file? diff --git a/samples/Text/utf16le-windows.txt b/samples/Text/utf16le-windows.txt new file mode 100644 index 0000000000000000000000000000000000000000..590ae2a63f89c45e1118a554a8f6c0b27c5a6873 GIT binary patch literal 40 ecmezWFNYzMA&()I!H|KMfr|mgGeY4RGXMa@90$4p literal 0 HcmV?d00001 diff --git a/samples/Text/utf16le.txt b/samples/Text/utf16le.txt new file mode 100644 index 0000000000000000000000000000000000000000..1829ef7099c936f6ebfe92fa033a517a133ae6e4 GIT binary patch literal 36 ccmezWFNYzMA&()I!H|KA0me2$VjD960It>tpa1{> literal 0 HcmV?d00001 diff --git a/test/test_blob.rb b/test/test_blob.rb index ec1443ad..890242b6 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -77,12 +77,15 @@ class TestBlob < Test::Unit::TestCase def test_sloc assert_equal 2, blob("Ruby/foo.rb").sloc + assert_equal 3, blob("Text/utf16le-windows.txt").sloc end def test_encoding assert_equal "ISO-8859-2", blob("Text/README").encoding assert_equal "ISO-8859-1", blob("Text/dump.sql").encoding assert_equal "UTF-8", blob("Text/foo.txt").encoding + assert_equal "UTF-16LE", blob("Text/utf16le.txt").encoding + assert_equal "UTF-16LE", blob("Text/utf16le-windows.txt").encoding assert_nil blob("Binary/dog.o").encoding end From 185db0e8d5553067e394a230fc6453728f55e795 Mon Sep 17 00:00:00 2001 From: Andy Lindeman Date: Wed, 21 May 2014 11:59:48 -0400 Subject: [PATCH 2/3] Makes sure we do not fail if encoding == nil It looks like it's valid to call this method even if `binary?` is true. Encoding as 'ASCII-8BIT' should always succeed. --- lib/linguist/blob_helper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 8aa96ed4..9ab8d2c5 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -263,7 +263,7 @@ module Linguist # # Returns Integer def sloc - lines.grep(Regexp.new('\S'.encode(encoding))).size + lines.grep(Regexp.new('\S'.encode(encoding || 'ASCII-8BIT'))).size end # Public: Is the blob a generated file? From 09a33f8daad0b3d0584466324999b00414feb732 Mon Sep 17 00:00:00 2001 From: Andy Lindeman Date: Wed, 21 May 2014 15:11:06 -0400 Subject: [PATCH 3/3] Takes a different approach --- lib/linguist/blob_helper.rb | 23 ++++++++++++++++++++--- test/test_blob.rb | 19 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 9ab8d2c5..76eab7ea 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -241,8 +241,25 @@ module Linguist def lines @lines ||= if viewable? && data - newlines = Regexp.new("\r\n|\r|\n".encode(encoding)) - data.force_encoding(encoding).split(newlines, -1) + # `data` is usually encoded as ASCII-8BIT even when the content has + # been detected as a different encoding. However, we are not allowed + # to change the encoding of `data` because we've made the implicit + # guarantee that each entry in `lines` is encoded the same way as + # `data`. + # + # Instead, we re-encode each possible newline sequence as the + # detected encoding, then force them back to the encoding of `data` + # (usually a binary encoding like ASCII-8BIT). This means that the + # byte sequence will match how newlines are likely encoded in the + # file, but we don't have to change the encoding of `data` as far as + # Ruby is concerned. This allows us to correctly parse out each line + # without changing the encoding of `data`, and + # also--importantly--without having to duplicate many (potentially + # large) strings. + encoded_newlines = ["\r\n", "\r", "\n"]. + map { |nl| nl.encode(encoding).force_encoding(data.encoding) } + + data.split(Regexp.union(encoded_newlines), -1) else [] end @@ -263,7 +280,7 @@ module Linguist # # Returns Integer def sloc - lines.grep(Regexp.new('\S'.encode(encoding || 'ASCII-8BIT'))).size + lines.grep(/\S/).size end # Public: Is the blob a generated file? diff --git a/test/test_blob.rb b/test/test_blob.rb index 890242b6..da13b96e 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -11,6 +11,17 @@ class TestBlob < Test::Unit::TestCase Lexer = Pygments::Lexer + def setup + # git blobs are normally loaded as ASCII-8BIT since they may contain data + # with arbitrary encoding not known ahead of time + @original_external = Encoding.default_external + Encoding.default_external = Encoding.find("ASCII-8BIT") + end + + def teardown + Encoding.default_external = @original_external + end + def samples_path File.expand_path("../../samples", __FILE__) end @@ -67,6 +78,14 @@ class TestBlob < Test::Unit::TestCase assert_equal 475, blob("Emacs Lisp/ess-julia.el").lines.length end + def test_lines_maintains_original_encoding + # Even if the file's encoding is detected as something like UTF-16LE, + # earlier versions of the gem made implicit guarantees that the encoding of + # each `line` is in the same encoding as the file was originally read (in + # practice, UTF-8 or ASCII-8BIT) + assert_equal Encoding.default_external, blob("Text/utf16le.txt").lines.first.encoding + end + def test_size assert_equal 15, blob("Ruby/foo.rb").size end