mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 17:50:22 +00:00
Merge pull request #1211 from alindeman/multibyte_line_count
Counts the number of lines correctly for files with certain multibyte encodings
This commit is contained in:
@@ -241,7 +241,25 @@ module Linguist
|
|||||||
def lines
|
def lines
|
||||||
@lines ||=
|
@lines ||=
|
||||||
if viewable? && data
|
if viewable? && data
|
||||||
data.split(/\r\n|\r|\n/, -1)
|
# `data` is usually encoded as ASCII-8BIT even when the content has
|
||||||
|
# been detected as a different encoding. However, we are not allowed
|
||||||
|
# to change the encoding of `data` because we've made the implicit
|
||||||
|
# guarantee that each entry in `lines` is encoded the same way as
|
||||||
|
# `data`.
|
||||||
|
#
|
||||||
|
# Instead, we re-encode each possible newline sequence as the
|
||||||
|
# detected encoding, then force them back to the encoding of `data`
|
||||||
|
# (usually a binary encoding like ASCII-8BIT). This means that the
|
||||||
|
# byte sequence will match how newlines are likely encoded in the
|
||||||
|
# file, but we don't have to change the encoding of `data` as far as
|
||||||
|
# Ruby is concerned. This allows us to correctly parse out each line
|
||||||
|
# without changing the encoding of `data`, and
|
||||||
|
# also--importantly--without having to duplicate many (potentially
|
||||||
|
# large) strings.
|
||||||
|
encoded_newlines = ["\r\n", "\r", "\n"].
|
||||||
|
map { |nl| nl.encode(encoding).force_encoding(data.encoding) }
|
||||||
|
|
||||||
|
data.split(Regexp.union(encoded_newlines), -1)
|
||||||
else
|
else
|
||||||
[]
|
[]
|
||||||
end
|
end
|
||||||
|
|||||||
BIN
samples/Text/utf16le-windows.txt
Normal file
BIN
samples/Text/utf16le-windows.txt
Normal file
Binary file not shown.
BIN
samples/Text/utf16le.txt
Normal file
BIN
samples/Text/utf16le.txt
Normal file
Binary file not shown.
@@ -11,6 +11,17 @@ class TestBlob < Test::Unit::TestCase
|
|||||||
|
|
||||||
Lexer = Pygments::Lexer
|
Lexer = Pygments::Lexer
|
||||||
|
|
||||||
|
def setup
|
||||||
|
# git blobs are normally loaded as ASCII-8BIT since they may contain data
|
||||||
|
# in an arbitrary encoding that is not known ahead of time
|
||||||
|
@original_external = Encoding.default_external
|
||||||
|
Encoding.default_external = Encoding.find("ASCII-8BIT")
|
||||||
|
end
|
||||||
|
|
||||||
|
def teardown
|
||||||
|
Encoding.default_external = @original_external
|
||||||
|
end
|
||||||
|
|
||||||
def samples_path
|
def samples_path
|
||||||
File.expand_path("../../samples", __FILE__)
|
File.expand_path("../../samples", __FILE__)
|
||||||
end
|
end
|
||||||
@@ -67,6 +78,14 @@ class TestBlob < Test::Unit::TestCase
|
|||||||
assert_equal 475, blob("Emacs Lisp/ess-julia.el").lines.length
|
assert_equal 475, blob("Emacs Lisp/ess-julia.el").lines.length
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_lines_maintains_original_encoding
|
||||||
|
# Even if the file's encoding is detected as something like UTF-16LE,
|
||||||
|
# earlier versions of the gem made an implicit guarantee that each `line`
|
||||||
|
# keeps the same encoding in which the file was originally read (in
|
||||||
|
# practice, UTF-8 or ASCII-8BIT)
|
||||||
|
assert_equal Encoding.default_external, blob("Text/utf16le.txt").lines.first.encoding
|
||||||
|
end
|
||||||
|
|
||||||
def test_size
|
def test_size
|
||||||
assert_equal 15, blob("Ruby/foo.rb").size
|
assert_equal 15, blob("Ruby/foo.rb").size
|
||||||
end
|
end
|
||||||
@@ -77,12 +96,15 @@ class TestBlob < Test::Unit::TestCase
|
|||||||
|
|
||||||
def test_sloc
|
def test_sloc
|
||||||
assert_equal 2, blob("Ruby/foo.rb").sloc
|
assert_equal 2, blob("Ruby/foo.rb").sloc
|
||||||
|
assert_equal 3, blob("Text/utf16le-windows.txt").sloc
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_encoding
|
def test_encoding
|
||||||
assert_equal "ISO-8859-2", blob("Text/README").encoding
|
assert_equal "ISO-8859-2", blob("Text/README").encoding
|
||||||
assert_equal "ISO-8859-1", blob("Text/dump.sql").encoding
|
assert_equal "ISO-8859-1", blob("Text/dump.sql").encoding
|
||||||
assert_equal "UTF-8", blob("Text/foo.txt").encoding
|
assert_equal "UTF-8", blob("Text/foo.txt").encoding
|
||||||
|
assert_equal "UTF-16LE", blob("Text/utf16le.txt").encoding
|
||||||
|
assert_equal "UTF-16LE", blob("Text/utf16le-windows.txt").encoding
|
||||||
assert_nil blob("Binary/dog.o").encoding
|
assert_nil blob("Binary/dog.o").encoding
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user