From 85efbde3f769d3fb8da98b294afec81e07d5814d Mon Sep 17 00:00:00 2001
From: Andy Lindeman <andy@lindeman.io>
Date: Wed, 21 May 2014 11:44:18 -0400
Subject: [PATCH 1/3] Counts the number of lines correctly for files with
 certain multibyte encodings

---
 lib/linguist/blob_helper.rb      |   5 +++--
 samples/Text/utf16le-windows.txt | Bin 0 -> 40 bytes
 samples/Text/utf16le.txt         | Bin 0 -> 36 bytes
 test/test_blob.rb                |   3 +++
 4 files changed, 6 insertions(+), 2 deletions(-)
 create mode 100644 samples/Text/utf16le-windows.txt
 create mode 100644 samples/Text/utf16le.txt

diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb
index 37793a36..8aa96ed4 100644
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -241,7 +241,8 @@ module Linguist
     def lines
       @lines ||=
         if viewable? && data
-          data.split(/\r\n|\r|\n/, -1)
+          newlines = Regexp.new("\r\n|\r|\n".encode(encoding))
+          data.force_encoding(encoding).split(newlines, -1)
         else
           []
         end
@@ -262,7 +263,7 @@ module Linguist
     #
     # Returns Integer
     def sloc
-      lines.grep(/\S/).size
+      lines.grep(Regexp.new('\S'.encode(encoding))).size
     end
 
     # Public: Is the blob a generated file?
diff --git a/samples/Text/utf16le-windows.txt b/samples/Text/utf16le-windows.txt
new file mode 100644
index 0000000000000000000000000000000000000000..590ae2a63f89c45e1118a554a8f6c0b27c5a6873
GIT binary patch
literal 40
ecmezWFNYzMA&()I!H|KMfr|mgGeY4RGXMa@90$4p

literal 0
HcmV?d00001

diff --git a/samples/Text/utf16le.txt b/samples/Text/utf16le.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1829ef7099c936f6ebfe92fa033a517a133ae6e4
GIT binary patch
literal 36
ccmezWFNYzMA&()I!H|KA0me2$VjD960It>tpa1{>

literal 0
HcmV?d00001

diff --git a/test/test_blob.rb b/test/test_blob.rb
index ec1443ad..890242b6 100644
--- a/test/test_blob.rb
+++ b/test/test_blob.rb
@@ -77,12 +77,15 @@ class TestBlob < Test::Unit::TestCase
 
   def test_sloc
     assert_equal 2, blob("Ruby/foo.rb").sloc
+    assert_equal 3, blob("Text/utf16le-windows.txt").sloc
   end
 
   def test_encoding
     assert_equal "ISO-8859-2", blob("Text/README").encoding
     assert_equal "ISO-8859-1", blob("Text/dump.sql").encoding
     assert_equal "UTF-8", blob("Text/foo.txt").encoding
+    assert_equal "UTF-16LE", blob("Text/utf16le.txt").encoding
+    assert_equal "UTF-16LE", blob("Text/utf16le-windows.txt").encoding
     assert_nil blob("Binary/dog.o").encoding
   end
 

From 185db0e8d5553067e394a230fc6453728f55e795 Mon Sep 17 00:00:00 2001
From: Andy Lindeman <andy@lindeman.io>
Date: Wed, 21 May 2014 11:59:48 -0400
Subject: [PATCH 2/3] Makes sure we do not fail if encoding == nil

It looks like it's valid to call `sloc` even if `binary?` is true, where
`encoding` may be nil. Falling back to 'ASCII-8BIT' in that case should
always succeed.
---
 lib/linguist/blob_helper.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb
index 8aa96ed4..9ab8d2c5 100644
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -263,7 +263,7 @@ module Linguist
     #
     # Returns Integer
     def sloc
-      lines.grep(Regexp.new('\S'.encode(encoding))).size
+      lines.grep(Regexp.new('\S'.encode(encoding || 'ASCII-8BIT'))).size
     end
 
     # Public: Is the blob a generated file?

From 09a33f8daad0b3d0584466324999b00414feb732 Mon Sep 17 00:00:00 2001
From: Andy Lindeman <andy@lindeman.io>
Date: Wed, 21 May 2014 15:11:06 -0400
Subject: [PATCH 3/3] Splits on encoded newlines without re-encoding data

---
 lib/linguist/blob_helper.rb | 23 ++++++++++++++++++++---
 test/test_blob.rb           | 19 +++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb
index 9ab8d2c5..76eab7ea 100644
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -241,8 +241,25 @@ module Linguist
     def lines
       @lines ||=
         if viewable? && data
-          newlines = Regexp.new("\r\n|\r|\n".encode(encoding))
-          data.force_encoding(encoding).split(newlines, -1)
+          # `data` is usually encoded as ASCII-8BIT even when the content has
+          # been detected as a different encoding. However, we are not allowed
+          # to change the encoding of `data` because we've made the implicit
+          # guarantee that each entry in `lines` is encoded the same way as
+          # `data`.
+          #
+          # Instead, we re-encode each possible newline sequence as the
+          # detected encoding, then force them back to the encoding of `data`
+          # (usually a binary encoding like ASCII-8BIT). This means that the
+          # byte sequence will match how newlines are likely encoded in the
+          # file, but we don't have to change the encoding of `data` as far as
+          # Ruby is concerned. This allows us to correctly parse out each line
+          # without changing the encoding of `data`, and
+          # also--importantly--without having to duplicate many (potentially
+          # large) strings.
+          encoded_newlines = ["\r\n", "\r", "\n"].
+            map { |nl| nl.encode(encoding).force_encoding(data.encoding) }
+
+          data.split(Regexp.union(encoded_newlines), -1)
         else
           []
         end
@@ -263,7 +280,7 @@ module Linguist
     #
     # Returns Integer
     def sloc
-      lines.grep(Regexp.new('\S'.encode(encoding || 'ASCII-8BIT'))).size
+      lines.grep(/\S/).size
     end
 
     # Public: Is the blob a generated file?
diff --git a/test/test_blob.rb b/test/test_blob.rb
index 890242b6..da13b96e 100644
--- a/test/test_blob.rb
+++ b/test/test_blob.rb
@@ -11,6 +11,17 @@ class TestBlob < Test::Unit::TestCase
 
   Lexer = Pygments::Lexer
 
+  def setup
+    # git blobs are normally loaded as ASCII-8BIT since they may contain data
+    # with arbitrary encoding not known ahead of time
+    @original_external = Encoding.default_external
+    Encoding.default_external = Encoding.find("ASCII-8BIT")
+  end
+
+  def teardown
+    Encoding.default_external = @original_external
+  end
+
   def samples_path
     File.expand_path("../../samples", __FILE__)
   end
@@ -67,6 +78,14 @@ class TestBlob < Test::Unit::TestCase
     assert_equal 475, blob("Emacs Lisp/ess-julia.el").lines.length
   end
 
+  def test_lines_maintains_original_encoding
+    # Even if the file's encoding is detected as something like UTF-16LE,
+    # earlier versions of the gem implicitly guaranteed that each entry in
+    # `lines` keeps the encoding in which the file was originally read (in
+    # practice, UTF-8 or ASCII-8BIT).
+    assert_equal Encoding.default_external, blob("Text/utf16le.txt").lines.first.encoding
+  end
+
   def test_size
     assert_equal 15, blob("Ruby/foo.rb").size
   end