mirror of
				https://github.com/KevinMidboe/linguist.git
				synced 2025-10-29 17:50:22 +00:00 
			
		
		
		
	Both JSON and Yajl were listed as dependencies. Pygments.rb already requires yajl, so let's just use that instead of using both.
		
			
				
	
	
		
			152 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			152 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
| begin
 | |
|   require 'yajl'
 | |
| rescue LoadError
 | |
|   require 'yaml'
 | |
| end
 | |
| 
 | |
| require 'linguist/md5'
 | |
| require 'linguist/classifier'
 | |
| 
 | |
| module Linguist
 | |
|   # Model for accessing classifier training data.
 | |
|   module Samples
 | |
|     # Path to samples root directory
 | |
|     ROOT = File.expand_path("../../../samples", __FILE__)
 | |
| 
 | |
|     # Path for serialized samples db
 | |
|     PATH = File.expand_path('../samples.json', __FILE__)
 | |
| 
 | |
|     # Hash of serialized samples object
 | |
|     def self.cache
 | |
|       @cache ||= begin
 | |
|         serializer = defined?(Yajl) ? Yajl : YAML
 | |
|         serializer.load(File.read(PATH))
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Public: Iterate over each sample.
 | |
|     #
 | |
|     # &block - Yields Sample to block
 | |
|     #
 | |
|     # Returns nothing.
 | |
|     def self.each(&block)
 | |
|       Dir.entries(ROOT).sort!.each do |category|
 | |
|         next if category == '.' || category == '..'
 | |
| 
 | |
|         # Skip text and binary for now
 | |
|         # Possibly reconsider this later
 | |
|         next if category == 'Text' || category == 'Binary'
 | |
| 
 | |
|         dirname = File.join(ROOT, category)
 | |
|         Dir.entries(dirname).each do |filename|
 | |
|           next if filename == '.' || filename == '..'
 | |
| 
 | |
|           if filename == 'filenames'
 | |
|             Dir.entries(File.join(dirname, filename)).each do |subfilename|
 | |
|               next if subfilename == '.' || subfilename == '..'
 | |
| 
 | |
|               yield({
 | |
|                 :path    => File.join(dirname, filename, subfilename),
 | |
|                 :language => category,
 | |
|                 :filename => subfilename
 | |
|               })
 | |
|             end
 | |
|           else
 | |
|             if File.extname(filename) == ""
 | |
|               raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
 | |
|             end
 | |
| 
 | |
|             yield({
 | |
|               :path     => File.join(dirname, filename),
 | |
|               :language => category,
 | |
|               :interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
 | |
|               :extname  => File.extname(filename)
 | |
|             })
 | |
|           end
 | |
|         end
 | |
|       end
 | |
| 
 | |
|       nil
 | |
|     end
 | |
| 
 | |
|     # Public: Build Classifier from all samples.
 | |
|     #
 | |
|     # Returns trained Classifier.
 | |
|     def self.data
 | |
|       db = {}
 | |
|       db['extnames'] = {}
 | |
|       db['interpreters'] = {}
 | |
|       db['filenames'] = {}
 | |
| 
 | |
|       each do |sample|
 | |
|         language_name = sample[:language]
 | |
| 
 | |
|         if sample[:extname]
 | |
|           db['extnames'][language_name] ||= []
 | |
|           if !db['extnames'][language_name].include?(sample[:extname])
 | |
|             db['extnames'][language_name] << sample[:extname]
 | |
|             db['extnames'][language_name].sort!
 | |
|           end
 | |
|         end
 | |
| 
 | |
|         if sample[:interpreter]
 | |
|           db['interpreters'][language_name] ||= []
 | |
|           if !db['interpreters'][language_name].include?(sample[:interpreter])
 | |
|             db['interpreters'][language_name] << sample[:interpreter]
 | |
|             db['interpreters'][language_name].sort!
 | |
|           end
 | |
|         end
 | |
| 
 | |
|         if sample[:filename]
 | |
|           db['filenames'][language_name] ||= []
 | |
|           db['filenames'][language_name] << sample[:filename]
 | |
|           db['filenames'][language_name].sort!
 | |
|         end
 | |
| 
 | |
|         data = File.read(sample[:path])
 | |
|         Classifier.train!(db, language_name, data)
 | |
|       end
 | |
| 
 | |
|       db['md5'] = Linguist::MD5.hexdigest(db)
 | |
| 
 | |
|       db
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   # Used to retrieve the interpreter from the shebang line of a file's
 | |
|   # data.
 | |
|   def self.interpreter_from_shebang(data)
 | |
|     lines = data.lines.to_a
 | |
| 
 | |
|     if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
 | |
|       bang.sub!(/^#! /, '#!')
 | |
|       tokens = bang.split(' ')
 | |
|       pieces = tokens.first.split('/')
 | |
| 
 | |
|       if pieces.size > 1
 | |
|         script = pieces.last
 | |
|       else
 | |
|         script = pieces.first.sub('#!', '')
 | |
|       end
 | |
| 
 | |
|       script = script == 'env' ? tokens[1] : script
 | |
| 
 | |
|       # "python2.6" -> "python"
 | |
|       if script =~ /((?:\d+\.?)+)/
 | |
|         script.sub! $1, ''
 | |
|       end
 | |
| 
 | |
|       # Check for multiline shebang hacks that call `exec`
 | |
|       if script == 'sh' &&
 | |
|         lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
 | |
|         script = $1
 | |
|       end
 | |
| 
 | |
|       script
 | |
|     else
 | |
|       nil
 | |
|     end
 | |
|   end
 | |
| 
 | |
| end
 |