From aa5a94cc3ee144d6a896e7c61183c9fe13f686e2 Mon Sep 17 00:00:00 2001 From: Andy Lindeman Date: Tue, 3 Jun 2014 12:21:07 -0400 Subject: [PATCH] Handle case where newline chars don't transcode to detected encoding We've seen cases where binary files are detected as encodings such as ISO-8859-8-I. This usually happens when the binary files are short, so while the detector is mistaken, there is also not very much data for use in the detection algorithm in the first place so it's understandable that the detector was wrong. In these cases, the code to convert ASCII newline characters to encodings such as ISO-8859-8-I fails because there is no conversion between them. We now simply assume that the data is all one line in those cases. In reality the data is binary, but this is obviously difficult to detect reliably. --- lib/linguist/blob_helper.rb | 12 +++++++++--- samples/Text/iso8859-8-i.txt | 1 + test/test_blob.rb | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 samples/Text/iso8859-8-i.txt diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 76eab7ea..67f6eef3 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -256,10 +256,16 @@ module Linguist # without changing the encoding of `data`, and # also--importantly--without having to duplicate many (potentially # large) strings. - encoded_newlines = ["\r\n", "\r", "\n"]. - map { |nl| nl.encode(encoding).force_encoding(data.encoding) } + begin + encoded_newlines = ["\r\n", "\r", "\n"]. + map { |nl| nl.encode(encoding, "ASCII-8BIT").force_encoding(data.encoding) } - data.split(Regexp.union(encoded_newlines), -1) + data.split(Regexp.union(encoded_newlines), -1) + rescue Encoding::ConverterNotFoundError + # The data is not splittable in the detected encoding. Assume it's + # one big line. 
+ [data] + end else [] end diff --git a/samples/Text/iso8859-8-i.txt b/samples/Text/iso8859-8-i.txt new file mode 100644 index 00000000..ed2bf6c4 --- /dev/null +++ b/samples/Text/iso8859-8-i.txt @@ -0,0 +1 @@ +%¿áé \ No newline at end of file diff --git a/test/test_blob.rb b/test/test_blob.rb index da13b96e..2109e9b4 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -97,6 +97,7 @@ class TestBlob < Test::Unit::TestCase def test_sloc assert_equal 2, blob("Ruby/foo.rb").sloc assert_equal 3, blob("Text/utf16le-windows.txt").sloc + assert_equal 1, blob("Text/iso8859-8-i.txt").sloc end def test_encoding