From 85efbde3f769d3fb8da98b294afec81e07d5814d Mon Sep 17 00:00:00 2001 From: Andy Lindeman Date: Wed, 21 May 2014 11:44:18 -0400 Subject: [PATCH] Counts the number of lines correctly for files with certain multibyte encodings --- lib/linguist/blob_helper.rb | 5 +++-- samples/Text/utf16le-windows.txt | Bin 0 -> 40 bytes samples/Text/utf16le.txt | Bin 0 -> 36 bytes test/test_blob.rb | 3 +++ 4 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 samples/Text/utf16le-windows.txt create mode 100644 samples/Text/utf16le.txt diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 37793a36..8aa96ed4 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -241,7 +241,8 @@ module Linguist def lines @lines ||= if viewable? && data - data.split(/\r\n|\r|\n/, -1) + newlines = Regexp.new("\r\n|\r|\n".encode(encoding)) + data.force_encoding(encoding).split(newlines, -1) else [] end @@ -262,7 +263,7 @@ module Linguist # # Returns Integer def sloc - lines.grep(/\S/).size + lines.grep(Regexp.new('\S'.encode(encoding))).size end # Public: Is the blob a generated file? diff --git a/samples/Text/utf16le-windows.txt b/samples/Text/utf16le-windows.txt new file mode 100644 index 0000000000000000000000000000000000000000..590ae2a63f89c45e1118a554a8f6c0b27c5a6873 GIT binary patch literal 40 ecmezWFNYzMA&()I!H|KMfr|mgGeY4RGXMa@90$4p literal 0 HcmV?d00001 diff --git a/samples/Text/utf16le.txt b/samples/Text/utf16le.txt new file mode 100644 index 0000000000000000000000000000000000000000..1829ef7099c936f6ebfe92fa033a517a133ae6e4 GIT binary patch literal 36 ccmezWFNYzMA&()I!H|KA0me2$VjD960It>tpa1{> literal 0 HcmV?d00001 diff --git a/test/test_blob.rb b/test/test_blob.rb index ec1443ad..890242b6 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -77,12 +77,15 @@ class TestBlob < Test::Unit::TestCase def test_sloc assert_equal 2, blob("Ruby/foo.rb").sloc + assert_equal 3, blob("Text/utf16le-windows.txt").sloc end def test_encoding assert_equal "ISO-8859-2", blob("Text/README").encoding assert_equal "ISO-8859-1", blob("Text/dump.sql").encoding assert_equal "UTF-8", blob("Text/foo.txt").encoding + assert_equal "UTF-16LE", blob("Text/utf16le.txt").encoding + assert_equal "UTF-16LE", blob("Text/utf16le-windows.txt").encoding assert_nil blob("Binary/dog.o").encoding end