Merge pull request #1245 from alindeman/binarylike_data

Handle case where newline chars don't transcode to detected encoding
2026-01-30 13:16:33 +00:00 · 2014-06-03 12:55:33 -04:00
parent a5b6331ab5 aa5a94cc3e
commit 83c5f6a004
3 changed files with 11 additions and 3 deletions
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -256,10 +256,16 @@ module Linguist
          # without changing the encoding of `data`, and
          # also--importantly--without having to duplicate many (potentially
          # large) strings.
-          encoded_newlines = ["\r\n", "\r", "\n"].
-            map { |nl| nl.encode(encoding).force_encoding(data.encoding) }
+          begin
+            encoded_newlines = ["\r\n", "\r", "\n"].
+              map { |nl| nl.encode(encoding, "ASCII-8BIT").force_encoding(data.encoding) }

-          data.split(Regexp.union(encoded_newlines), -1)
+            data.split(Regexp.union(encoded_newlines), -1)
+          rescue Encoding::ConverterNotFoundError
+            # No converter to the detected encoding exists, so the newlines can't
+            # be transcoded for splitting.  Treat the data as one big line.
+            [data]
+          end
        else
          []
        end
--- a/samples/Text/iso8859-8-i.txt
+++ b/samples/Text/iso8859-8-i.txt
@@ -0,0 +1 @@
+%<25>בי
--- a/test/test_blob.rb
+++ b/test/test_blob.rb
@@ -97,6 +97,7 @@ class TestBlob < Test::Unit::TestCase
  def test_sloc
    assert_equal 2, blob("Ruby/foo.rb").sloc
    assert_equal 3, blob("Text/utf16le-windows.txt").sloc
+    assert_equal 1, blob("Text/iso8859-8-i.txt").sloc
  end

  def test_encoding