diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index a942b205..08e88581 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -2,6 +2,7 @@ require 'linguist/language' require 'linguist/mime' require 'linguist/pathname' +require 'charlock_holmes' require 'escape_utils' require 'pygments' require 'yaml' @@ -52,7 +53,7 @@ module Linguist # # Returns a content type String. def content_type - pathname.content_type + @content_type ||= binary? ? mime_type : "text/plain; charset=#{encoding.downcase}" end # Public: Get the Content-Disposition header value @@ -71,11 +72,30 @@ module Linguist end end + def encoding + if hash = detect_encoding + hash[:encoding] + end + end + + # Try to guess the encoding + # + # Returns: a Hash, with :encoding, :confidence, :type + # this will return nil if an error occurred during detection or + # no valid encoding could be found + def detect_encoding + @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data + end + # Public: Is the blob binary? # # Return true or false def binary? - pathname.binary? + if mime_type = Mime.lookup_mime_type_for(pathname.extname) + mime_type.binary? + else + detect_encoding.nil? || detect_encoding[:type] == :binary + end end # Public: Is the blob text? @@ -529,6 +549,8 @@ module Linguist # Returns html String def colorize(options = {}) return if !text? || large? + options[:options] ||= {} + options[:options][:encoding] ||= encoding lexer.highlight(data, options) end @@ -540,7 +562,7 @@ module Linguist # Returns html String def colorize_without_wrapper(options = {}) return if !text? || large? - if text = lexer.highlight(data, options) + if text = colorize(options) text[%r{
<div class="highlight"><pre>(.*?)</pre>\s*</div>
}m, 1] else '' diff --git a/test/test_blob.rb b/test/test_blob.rb index 3602a9d6..bc8a00be 100644 --- a/test/test_blob.rb +++ b/test/test_blob.rb @@ -38,14 +38,11 @@ class TestBlob < Test::Unit::TestCase assert_equal "application/octet-stream", blob("dog.o").content_type assert_equal "application/pdf", blob("foo.pdf").content_type assert_equal "image/png", blob("foo.png").content_type - assert_equal "text/plain; charset=utf8", blob("README").content_type - assert_equal "text/plain; charset=utf8", blob("foo.html").content_type - assert_equal "text/plain; charset=utf8", blob("foo.pl").content_type - assert_equal "text/plain; charset=utf8", blob("foo.py").content_type - assert_equal "text/plain; charset=utf8", blob("foo.rb").content_type - assert_equal "text/plain; charset=utf8", blob("foo.sh").content_type - assert_equal "text/plain; charset=utf8", blob("foo.xhtml").content_type - assert_equal "text/plain; charset=utf8", blob("foo.xml").content_type + assert_equal "text/plain; charset=iso-8859-2", blob("README").content_type + assert_equal "text/plain; charset=iso-8859-1", blob("script.pl").content_type + assert_equal "text/plain; charset=iso-8859-1", blob("script.py").content_type + assert_equal "text/plain; charset=iso-8859-1", blob("script.rb").content_type + assert_equal "text/plain; charset=iso-8859-1", blob("script.sh").content_type end def test_disposition @@ -79,6 +76,13 @@ class TestBlob < Test::Unit::TestCase assert_equal 2, blob("foo.rb").sloc end + def test_encoding + assert_equal "ISO-8859-2", blob("README").encoding + assert_equal "ISO-8859-1", blob("dump.sql").encoding + assert_equal "UTF-8", blob("file.txt").encoding + assert_nil blob("dog.o").encoding + end + def test_binary assert blob("git.deb").binary? assert blob("git.exe").binary? @@ -86,6 +90,7 @@ class TestBlob < Test::Unit::TestCase assert blob("linguist.gem").binary? assert blob("octocat.ai").binary? assert blob("octocat.png").binary? + assert blob("zip").binary? 
assert !blob("README").binary? assert !blob("file.txt").binary? assert !blob("foo.rb").binary? @@ -330,7 +335,6 @@ class TestBlob < Test::Unit::TestCase assert_equal Language['Parrot Assembly'], blob("hello.pasm").language # http://gosu-lang.org - assert_equal Language['Gosu'], blob("Hello.gs").language assert_equal Language['Gosu'], blob("Hello.gsx").language assert_equal Language['Gosu'], blob("hello.gsp").language assert_equal Language['Gosu'], blob("Hello.gst").language