Compare commits

...

6 Commits

Author SHA1 Message Date
Andy Lindeman
9a3c9a8c19 Bumps to 2.11.1 2014-05-22 11:33:44 -04:00
Andy Lindeman
6a192dae63 Merge pull request #1211 from alindeman/multibyte_line_count
Counts the number of lines correctly for files with certain multibyte encodings
2014-05-22 11:27:35 -04:00
Andy Lindeman
09a33f8daa Takes a different approach 2014-05-21 15:11:06 -04:00
Andy Lindeman
185db0e8d5 Makes sure we do not fail if encoding == nil
It looks like it's valid to call this method even if `binary?` is true.
Encoding as 'ASCII-8BIT' should always succeed.
2014-05-21 13:36:39 -04:00
Andy Lindeman
85efbde3f7 Counts the number of lines correctly for files with certain multibyte encodings 2014-05-21 13:36:39 -04:00
Andy Lindeman
93d7aa3d07 Merge pull request #1194 from github/linguist-version-2.11.0
Bumping to 2.11.0
2014-05-21 13:34:00 -04:00
5 changed files with 42 additions and 2 deletions

View File

@@ -241,7 +241,25 @@ module Linguist
def lines
@lines ||=
if viewable? && data
data.split(/\r\n|\r|\n/, -1)
# `data` is usually encoded as ASCII-8BIT even when the content has
# been detected as a different encoding. However, we are not allowed
# to change the encoding of `data` because we've made the implicit
# guarantee that each entry in `lines` is encoded the same way as
# `data`.
#
# Instead, we re-encode each possible newline sequence as the
# detected encoding, then force them back to the encoding of `data`
# (usually a binary encoding like ASCII-8BIT). This means that the
# byte sequence will match how newlines are likely encoded in the
# file, but we don't have to change the encoding of `data` as far as
# Ruby is concerned. This allows us to correctly parse out each line
# without changing the encoding of `data`, and
# also--importantly--without having to duplicate many (potentially
# large) strings.
encoded_newlines = ["\r\n", "\r", "\n"].
map { |nl| nl.encode(encoding).force_encoding(data.encoding) }
data.split(Regexp.union(encoded_newlines), -1)
else
[]
end

View File

@@ -1,3 +1,3 @@
module Linguist
VERSION = "2.11.0"
VERSION = "2.11.1"
end

Binary file not shown.

BIN
samples/Text/utf16le.txt Normal file

Binary file not shown.

View File

@@ -11,6 +11,17 @@ class TestBlob < Test::Unit::TestCase
Lexer = Pygments::Lexer
def setup
# git blobs are normally loaded as ASCII-8BIT since they may contain data
# with arbitrary encoding not known ahead of time
@original_external = Encoding.default_external
Encoding.default_external = Encoding.find("ASCII-8BIT")
end
def teardown
Encoding.default_external = @original_external
end
def samples_path
File.expand_path("../../samples", __FILE__)
end
@@ -67,6 +78,14 @@ class TestBlob < Test::Unit::TestCase
assert_equal 475, blob("Emacs Lisp/ess-julia.el").lines.length
end
def test_lines_maintains_original_encoding
# Even if the file's encoding is detected as something like UTF-16LE,
# earlier versions of the gem made an implicit guarantee that each `line`
# keeps the encoding in which the file was originally read (in practice,
# UTF-8 or ASCII-8BIT).
assert_equal Encoding.default_external, blob("Text/utf16le.txt").lines.first.encoding
end
def test_size
assert_equal 15, blob("Ruby/foo.rb").size
end
@@ -77,12 +96,15 @@ class TestBlob < Test::Unit::TestCase
def test_sloc
assert_equal 2, blob("Ruby/foo.rb").sloc
assert_equal 3, blob("Text/utf16le-windows.txt").sloc
end
def test_encoding
assert_equal "ISO-8859-2", blob("Text/README").encoding
assert_equal "ISO-8859-1", blob("Text/dump.sql").encoding
assert_equal "UTF-8", blob("Text/foo.txt").encoding
assert_equal "UTF-16LE", blob("Text/utf16le.txt").encoding
assert_equal "UTF-16LE", blob("Text/utf16le-windows.txt").encoding
assert_nil blob("Binary/dog.o").encoding
end