mirror of
				https://github.com/KevinMidboe/linguist.git
				synced 2025-10-29 17:50:22 +00:00 
			
		
		
		
	Use charlock to detect binary blobs
This commit is contained in:
		| @@ -2,6 +2,7 @@ require 'linguist/language' | ||||
| require 'linguist/mime' | ||||
| require 'linguist/pathname' | ||||
|  | ||||
| require 'charlock_holmes' | ||||
| require 'escape_utils' | ||||
| require 'pygments' | ||||
| require 'yaml' | ||||
| @@ -52,7 +53,7 @@ module Linguist | ||||
|     # | ||||
|     # Returns a content type String. | ||||
|     def content_type | ||||
|       pathname.content_type | ||||
|       @content_type ||= binary? ? mime_type : "text/plain; charset=#{encoding.downcase}" | ||||
|     end | ||||
|  | ||||
|     # Public: Get the Content-Disposition header value | ||||
| @@ -71,11 +72,30 @@ module Linguist | ||||
|       end | ||||
|     end | ||||
|  | ||||
|     def encoding | ||||
|       if hash = detect_encoding | ||||
|         hash[:encoding] | ||||
|       end | ||||
|     end | ||||
|  | ||||
|     # Try to guess the encoding | ||||
|     # | ||||
|     # Returns: a Hash, with :encoding, :confidence, :type | ||||
|     #          this will return nil if an error occurred during detection or | ||||
|     #          no valid encoding could be found | ||||
|     def detect_encoding | ||||
|       @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data | ||||
|     end | ||||
|  | ||||
|     # Public: Is the blob binary? | ||||
|     # | ||||
|     # Return true or false | ||||
|     def binary? | ||||
|       pathname.binary? | ||||
|       if mime_type = Mime.lookup_mime_type_for(pathname.extname) | ||||
|         mime_type.binary? | ||||
|       else | ||||
|         detect_encoding.nil? || detect_encoding[:type] == :binary | ||||
|       end | ||||
|     end | ||||
|  | ||||
|     # Public: Is the blob text? | ||||
| @@ -529,6 +549,8 @@ module Linguist | ||||
|     # Returns html String | ||||
|     def colorize(options = {}) | ||||
|       return if !text? || large? | ||||
|       options[:options] ||= {} | ||||
|       options[:options][:encoding] ||= encoding | ||||
|       lexer.highlight(data, options) | ||||
|     end | ||||
|  | ||||
| @@ -540,7 +562,7 @@ module Linguist | ||||
|     # Returns html String | ||||
|     def colorize_without_wrapper(options = {}) | ||||
|       return if !text? || large? | ||||
|       if text = lexer.highlight(data, options) | ||||
|       if text = colorize(options) | ||||
|         text[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1] | ||||
|       else | ||||
|         '' | ||||
|   | ||||
| @@ -38,14 +38,11 @@ class TestBlob < Test::Unit::TestCase | ||||
|     assert_equal "application/octet-stream", blob("dog.o").content_type | ||||
|     assert_equal "application/pdf", blob("foo.pdf").content_type | ||||
|     assert_equal "image/png", blob("foo.png").content_type | ||||
|     assert_equal "text/plain; charset=utf8", blob("README").content_type | ||||
|     assert_equal "text/plain; charset=utf8", blob("foo.html").content_type | ||||
|     assert_equal "text/plain; charset=utf8", blob("foo.pl").content_type | ||||
|     assert_equal "text/plain; charset=utf8", blob("foo.py").content_type | ||||
|     assert_equal "text/plain; charset=utf8", blob("foo.rb").content_type | ||||
|     assert_equal "text/plain; charset=utf8", blob("foo.sh").content_type | ||||
|     assert_equal "text/plain; charset=utf8", blob("foo.xhtml").content_type | ||||
|     assert_equal "text/plain; charset=utf8", blob("foo.xml").content_type | ||||
|     assert_equal "text/plain; charset=iso-8859-2", blob("README").content_type | ||||
|     assert_equal "text/plain; charset=iso-8859-1", blob("script.pl").content_type | ||||
|     assert_equal "text/plain; charset=iso-8859-1", blob("script.py").content_type | ||||
|     assert_equal "text/plain; charset=iso-8859-1", blob("script.rb").content_type | ||||
|     assert_equal "text/plain; charset=iso-8859-1", blob("script.sh").content_type | ||||
|   end | ||||
|  | ||||
|   def test_disposition | ||||
| @@ -79,6 +76,13 @@ class TestBlob < Test::Unit::TestCase | ||||
|     assert_equal 2, blob("foo.rb").sloc | ||||
|   end | ||||
|  | ||||
|   def test_encoding | ||||
|     assert_equal "ISO-8859-2", blob("README").encoding | ||||
|     assert_equal "ISO-8859-1", blob("dump.sql").encoding | ||||
|     assert_equal "UTF-8", blob("file.txt").encoding | ||||
|     assert_nil blob("dog.o").encoding | ||||
|   end | ||||
|  | ||||
|   def test_binary | ||||
|     assert blob("git.deb").binary? | ||||
|     assert blob("git.exe").binary? | ||||
| @@ -86,6 +90,7 @@ class TestBlob < Test::Unit::TestCase | ||||
|     assert blob("linguist.gem").binary? | ||||
|     assert blob("octocat.ai").binary? | ||||
|     assert blob("octocat.png").binary? | ||||
|     assert blob("zip").binary? | ||||
|     assert !blob("README").binary? | ||||
|     assert !blob("file.txt").binary? | ||||
|     assert !blob("foo.rb").binary? | ||||
| @@ -330,7 +335,6 @@ class TestBlob < Test::Unit::TestCase | ||||
|     assert_equal Language['Parrot Assembly'], blob("hello.pasm").language | ||||
|  | ||||
|     # http://gosu-lang.org | ||||
|     assert_equal Language['Gosu'], blob("Hello.gs").language | ||||
|     assert_equal Language['Gosu'], blob("Hello.gsx").language | ||||
|     assert_equal Language['Gosu'], blob("hello.gsp").language | ||||
|     assert_equal Language['Gosu'], blob("Hello.gst").language | ||||
|   | ||||
		Reference in New Issue
	
	Block a user