Takes a different approach

2026-01-08 18:35:32 +00:00 · 2014-05-21 15:11:06 -04:00
parent 185db0e8d5
commit 09a33f8daa
2 changed files with 39 additions and 3 deletions
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -241,8 +241,25 @@ module Linguist
    def lines
      @lines ||=
        if viewable? && data
-          newlines = Regexp.new("\r\n|\r|\n".encode(encoding))
-          data.force_encoding(encoding).split(newlines, -1)
+          # `data` is usually encoded as ASCII-8BIT even when the content has
+          # been detected as a different encoding. However, we are not allowed
+          # to change the encoding of `data` because we've made the implicit
+          # guarantee that each entry in `lines` is encoded the same way as
+          # `data`.
+          #
+          # Instead, we re-encode each possible newline sequence as the
+          # detected encoding, then force them back to the encoding of `data`
+          # (usually a binary encoding like ASCII-8BIT). This means that the
+          # byte sequence will match how newlines are likely encoded in the
+          # file, but we don't have to change the encoding of `data` as far as
+          # Ruby is concerned. This allows us to correctly parse out each line
+          # without changing the encoding of `data`, and
+          # also--importantly--without having to duplicate many (potentially
+          # large) strings.
+          encoded_newlines = ["\r\n", "\r", "\n"].
+            map { |nl| nl.encode(encoding).force_encoding(data.encoding) }
+
+          data.split(Regexp.union(encoded_newlines), -1)
        else
          []
        end
@@ -263,7 +280,7 @@ module Linguist
    #
    # Returns Integer
    def sloc
-      lines.grep(Regexp.new('\S'.encode(encoding || 'ASCII-8BIT'))).size
+      lines.grep(/\S/).size
    end

    # Public: Is the blob a generated file?