Mirror of https://github.com/KevinMidboe/linguist.git, synced 2025-10-29 17:50:22 +00:00

Compare commits (60 commits)
| SHA1 |
|---|
| 3d7364877d |
| 2a324c6289 |
| 77a6a41fc3 |
| 076bf7d0c8 |
| 540f2a0941 |
| 497da86262 |
| 4b9b8a5058 |
| 5568489123 |
| 5cdd5e206c |
| c353d3a050 |
| 6252f12175 |
| 0067f28246 |
| ac23d64d26 |
| 9c9607e42c |
| 516a220d9f |
| 7bcf90c527 |
| 8c83cbe244 |
| 26f95507ef |
| 4324971cea |
| 2672089154 |
| 48ecae0c95 |
| 5daaee88b4 |
| db9475f240 |
| f68e94f181 |
| cb70572163 |
| e9eae4e008 |
| e33d8f3685 |
| 645a87d02b |
| 4484011f08 |
| c114d710f8 |
| 9810c693c3 |
| c804d04072 |
| fdd81ce0be |
| 176f6483d0 |
| ee6650f83f |
| 3fee3ac549 |
| 9d555862c3 |
| ddf3ec4a5b |
| e2b0f6bb50 |
| 4d5c9b951b |
| d566b35020 |
| 8f85a447de |
| d0691988a9 |
| d9ecbf0c24 |
| d5fa8cbcb7 |
| 555573071e |
| ecb2397e59 |
| 12cfab6d50 |
| 8a75d4d208 |
| fd8b70ffa4 |
| 62498cf0e9 |
| 543922c68a |
| 6f6dd8bc38 |
| 8351d55c56 |
| 9ecab364d1 |
| 0172623061 |
| e5ae9c328b |
| e0c777d995 |
| f747b49347 |
| e0cbe815a3 |

**Rakefile** (61 changed lines)
							| @@ -1,3 +1,4 @@ | ||||
| +require 'rake/clean' | ||||
| require 'rake/testtask' | ||||
|  | ||||
| task :default => :test | ||||
| @@ -5,3 +6,63 @@ task :default => :test | ||||
| Rake::TestTask.new do |t| | ||||
|   t.warning = true | ||||
| end | ||||
| + | ||||
| + | ||||
| +file 'lib/linguist/classifier.yml' => Dir['test/fixtures/**/*'] do |f| | ||||
| +  require 'linguist/sample' | ||||
| +  classifier = Linguist::Sample.classifier | ||||
| +  File.open(f.name, 'w') { |io| YAML.dump(classifier, io) } | ||||
| +end | ||||
| + | ||||
| +CLOBBER.include 'lib/linguist/classifier.yml' | ||||
| + | ||||
| +task :classifier => ['lib/linguist/classifier.yml'] | ||||
| + | ||||
| +namespace :classifier do | ||||
| +  LIMIT = 1_000 | ||||
| + | ||||
| +  desc "Run classifier against #{LIMIT} public gists" | ||||
| +  task :test do | ||||
| +    require 'linguist/classifier' | ||||
| + | ||||
| +    total, correct, incorrect = 0, 0, 0 | ||||
| +    $stdout.sync = true | ||||
| + | ||||
| +    each_public_gist do |gist_url, file_url, file_language| | ||||
| +      next if file_language.nil? || file_language == 'Text' | ||||
| +      begin | ||||
| +        data = open(file_url).read | ||||
| +        guessed_language, score = Linguist::Classifier.instance.classify(data).first | ||||
| + | ||||
| +        total += 1 | ||||
| +        guessed_language.name == file_language ? correct += 1 : incorrect += 1 | ||||
| + | ||||
| +        print "\r\e[0K%d:%d  %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100] | ||||
| +        $stdout.flush | ||||
| +      rescue URI::InvalidURIError | ||||
| +      else | ||||
| +        break if total >= LIMIT | ||||
| +      end | ||||
| +    end | ||||
| +    puts "" | ||||
| +  end | ||||
| + | ||||
| +  def each_public_gist | ||||
| +    require 'open-uri' | ||||
| +    require 'json' | ||||
| + | ||||
| +    url = "https://api.github.com/gists/public" | ||||
| + | ||||
| +    loop do | ||||
| +      resp = open(url) | ||||
| +      url = resp.meta['link'][/<([^>]+)>; rel="next"/, 1] | ||||
| +      gists = JSON.parse(resp.read) | ||||
| + | ||||
| +      for gist in gists | ||||
| +        for filename, attrs in gist['files'] | ||||
| +          yield gist['url'], attrs['raw_url'], attrs['language'] | ||||
| +        end | ||||
| +      end | ||||
| +    end | ||||
| +  end | ||||
| +end | ||||
|   | ||||
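The new `classifier:test` task pages through the gists API by pulling the `rel="next"` URL out of each response's `Link` header. A minimal sketch of that extraction, using a hypothetical header value shaped like GitHub's:

```ruby
# Hypothetical Link header in GitHub's pagination format.
link = '<https://api.github.com/gists/public?page=2>; rel="next", ' \
       '<https://api.github.com/gists/public?page=100>; rel="last"'

# The same regex capture each_public_gist uses to find the next page.
next_url = link[/<([^>]+)>; rel="next"/, 1]
puts next_url  # => https://api.github.com/gists/public?page=2
```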
| @@ -1,6 +1,6 @@ | ||||
| Gem::Specification.new do |s| | ||||
|   s.name    = 'github-linguist' | ||||
| -  s.version = '1.0.0' | ||||
| +  s.version = '2.0.0' | ||||
|   s.summary = "GitHub Language detection" | ||||
|  | ||||
|   s.authors = "GitHub" | ||||
| @@ -12,5 +12,6 @@ Gem::Specification.new do |s| | ||||
|   s.add_dependency 'escape_utils',    '~> 0.2.3' | ||||
|   s.add_dependency 'mime-types',      '~> 1.18' | ||||
|   s.add_dependency 'pygments.rb',     '~> 0.2.13' | ||||
| +  s.add_development_dependency 'json' | ||||
|   s.add_development_dependency 'rake' | ||||
| end | ||||
|   | ||||
| @@ -1,3 +1,4 @@ | ||||
| +require 'linguist/classifier' | ||||
| require 'linguist/language' | ||||
| require 'linguist/mime' | ||||
| require 'linguist/pathname' | ||||
| @@ -453,8 +454,15 @@ module Linguist | ||||
|     # Returns a Language or nil. | ||||
|     def disambiguate_extension_language | ||||
|       if Language.ambiguous?(extname) | ||||
|         name = "guess_#{extname.sub(/^\./, '')}_language" | ||||
|         send(name) if respond_to?(name) | ||||
|         # name = "guess_#{extname.sub(/^\./, '')}_language" | ||||
|         # send(name) if respond_to?(name) | ||||
|  | ||||
|         possible_languages = Language.all.select { |l| l.extensions.include?(extname) } | ||||
|         if possible_languages.any? | ||||
|           if result = Classifier.instance.classify(data, possible_languages).first | ||||
|             result[0] | ||||
|           end | ||||
|         end | ||||
|       end | ||||
|     end | ||||
|  | ||||
|   | ||||
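The new branch drops the hand-written `guess_*_language` helpers in favor of a classifier pass over every language that claims the extension. A hedged sketch of that flow for the ambiguous `.m` extension, assuming the generated `classifier.yml` is on disk and using the Matlab fixture added below:

```ruby
require 'linguist/classifier'
require 'linguist/language'

# Candidate languages that claim ".m" (Objective-C and Matlab, among others).
candidates = Linguist::Language.all.select { |l| l.extensions.include?('.m') }

# Restrict classification to those candidates and take the best match.
data = File.read('test/fixtures/matlab/average.m')
best_language, score = Linguist::Classifier.instance.classify(data, candidates).first
puts best_language.name  # expected: Matlab
```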
							
								
								
									
**lib/linguist/classifier.rb** (183 lines, new file)
							| @@ -0,0 +1,183 @@ | ||||
| require 'linguist/language' | ||||
| require 'linguist/tokenizer' | ||||
|  | ||||
| module Linguist | ||||
|   # Bayesian language classifier. | ||||
|   class Classifier | ||||
|     # Internal: Path to persisted classifier db. | ||||
|     PATH = File.expand_path('../classifier.yml', __FILE__) | ||||
|  | ||||
|     # Public: Check if persisted db exists on disk. | ||||
|     # | ||||
|     # Returns Boolean. | ||||
|     def self.exist? | ||||
|       File.exist?(PATH) | ||||
|     end | ||||
|  | ||||
|     # Public: Get persisted Classifier instance. | ||||
|     # | ||||
|     # Returns Classifier. | ||||
|     def self.instance | ||||
|       @instance ||= YAML.load_file(PATH) | ||||
|     end | ||||
|  | ||||
|     # Public: Initialize a Classifier. | ||||
|     def initialize | ||||
|       @tokens_total    = 0 | ||||
|       @languages_total = 0 | ||||
|       @tokens          = Hash.new { |h, k| h[k] = Hash.new(0) } | ||||
|       @language_tokens = Hash.new(0) | ||||
|       @languages       = Hash.new(0) | ||||
|     end | ||||
|  | ||||
|     # Public: Compare Classifier objects. | ||||
|     # | ||||
|     # other - Classifier object to compare to. | ||||
|     # | ||||
|     # Returns Boolean. | ||||
|     def eql?(other) | ||||
|       # Lazy fast check: compare counts only | ||||
|       other.is_a?(self.class) && | ||||
|         @tokens_total == other.instance_variable_get(:@tokens_total) && | ||||
|         @languages_total == other.instance_variable_get(:@languages_total) | ||||
|     end | ||||
|     alias_method :==, :eql? | ||||
|  | ||||
|     # Public: Train classifier that data is a certain language. | ||||
|     # | ||||
|     # language - Language of data | ||||
|     # data     - String contents of file | ||||
|     # | ||||
|     # Examples | ||||
|     # | ||||
|     #   train(Language['Ruby'], "def hello; end") | ||||
|     # | ||||
|     # Returns nothing. | ||||
|     def train(language, data) | ||||
|       language = language.name | ||||
|       tokens   = Tokenizer.new(data).tokens | ||||
|  | ||||
|       tokens.each do |token| | ||||
|         @tokens[language][token] += 1 | ||||
|         @language_tokens[language] += 1 | ||||
|         @tokens_total += 1 | ||||
|       end | ||||
|       @languages[language] += 1 | ||||
|       @languages_total += 1 | ||||
|  | ||||
|       nil | ||||
|     end | ||||
|  | ||||
|     # Public: Verify internal counts are consistent. | ||||
|     # | ||||
|     # Returns Boolean. | ||||
|     def verify | ||||
|       @languages.inject(0) { |n, (l, c)| n += c } == @languages_total && | ||||
|         @language_tokens.inject(0) { |n, (l, c)| n += c } == @tokens_total && | ||||
|         @tokens.inject(0) { |n, (l, ts)| n += ts.inject(0) { |m, (t, c)| m += c } } == @tokens_total | ||||
|     end | ||||
|  | ||||
|     # Public: Prune infrequent tokens (currently a no-op). | ||||
|     # | ||||
|     # Returns receiver Classifier instance. | ||||
|     def gc | ||||
|       self | ||||
|     end | ||||
|  | ||||
|     # Public: Guess language of data. | ||||
|     # | ||||
|     # data      - Array of tokens or String data to analyze. | ||||
|     # languages - Array of Languages to restrict to. | ||||
|     # | ||||
|     # Examples | ||||
|     # | ||||
|     #   classify("def hello; end") | ||||
|     #   # => [ [Language['Ruby'], 0.90], [Language['Python'], 0.2], ... ] | ||||
|     # | ||||
|     # Returns sorted Array of result pairs. Each pair contains the | ||||
|     # Language and a Float score. | ||||
|     def classify(tokens, languages = @languages.keys) | ||||
|       tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String) | ||||
|  | ||||
|       scores = {} | ||||
|       languages.each do |language| | ||||
|         language_name = language.is_a?(Language) ? language.name : language | ||||
|         scores[language_name] = tokens_probability(tokens, language_name) + | ||||
|                                    language_probability(language_name) | ||||
|       end | ||||
|  | ||||
|       scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] } | ||||
|     end | ||||
|  | ||||
|     # Internal: Probability of a set of tokens occurring in a language - P(D | C) | ||||
|     # | ||||
|     # tokens   - Array of String tokens. | ||||
|     # language - Language to check. | ||||
|     # | ||||
|     # Returns Float between 0.0 and 1.0. | ||||
|     def tokens_probability(tokens, language) | ||||
|       tokens.inject(0.0) do |sum, token| | ||||
|         sum += Math.log(token_probability(token, language)) | ||||
|       end | ||||
|     end | ||||
|  | ||||
|     # Internal: Probability of a token occurring in a language - P(F | C) | ||||
|     # | ||||
|     # token    - String token. | ||||
|     # language - Language to check. | ||||
|     # | ||||
|     # Returns Float between 0.0 and 1.0. | ||||
|     def token_probability(token, language) | ||||
|       if @tokens[language][token].to_f == 0.0 | ||||
|         1 / @tokens_total.to_f | ||||
|       else | ||||
|         @tokens[language][token].to_f / @language_tokens[language].to_f | ||||
|       end | ||||
|     end | ||||
|  | ||||
|     # Internal: Probability of a language occurring - P(C) | ||||
|     # | ||||
|     # language - Language to check. | ||||
|     # | ||||
|     # Returns Float between 0.0 and 1.0. | ||||
|     def language_probability(language) | ||||
|       Math.log(@languages[language].to_f / @languages_total.to_f) | ||||
|     end | ||||
|  | ||||
|     # Public: Serialize classifier to YAML. | ||||
|     # | ||||
|     # opts - Hash of YAML options. | ||||
|     # | ||||
|     # Returns nothing. | ||||
|     def to_yaml(io) | ||||
|       data = "--- !ruby/object:Linguist::Classifier\n" | ||||
|  | ||||
|       data << "languages_total: #{@languages_total}\n" | ||||
|       data << "tokens_total: #{@tokens_total}\n" | ||||
|  | ||||
|       data << "languages:\n" | ||||
|       @languages.sort.each do |language, count| | ||||
|         data << "  #{{language => count}.to_yaml.lines.to_a[1]}" | ||||
|       end | ||||
|  | ||||
|       data << "language_tokens:\n" | ||||
|       @language_tokens.sort.each do |language, count| | ||||
|         data << "  #{{language => count}.to_yaml.lines.to_a[1]}" | ||||
|       end | ||||
|  | ||||
|       data << "tokens:\n" | ||||
|       @tokens.sort.each do |language, tokens| | ||||
|         data << "  #{{language => true}.to_yaml.lines.to_a[1].sub(/ true/, "")}" | ||||
|         tokens.sort.each do |token, count| | ||||
|           data << "    #{{token => count}.to_yaml.lines.to_a[1]}" | ||||
|         end | ||||
|       end | ||||
|  | ||||
|       io.write data | ||||
|       nil | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   # Eager load instance | ||||
|   Classifier.instance if Classifier.exist? | ||||
| end | ||||
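Note that `classify` sums log-probabilities (`log P(C)` plus the sum of `log P(token | C)`), so scores are negative and only meaningful relative to one another; the `0.90` in the doc comment is illustrative. A minimal train-and-classify sketch using inline strings rather than the project fixtures:

```ruby
require 'linguist/classifier'
require 'linguist/language'

classifier = Linguist::Classifier.new
classifier.train Linguist::Language['Ruby'], "def hello\n  puts 'hi'\nend"
classifier.train Linguist::Language['C'],    "int main() { return 0; }"

# Scores are summed log-probabilities: higher (closer to zero) wins.
classifier.classify("def goodbye\nend").each do |language, score|
  puts "%s %0.2f" % [language.name, score]
end
```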
							
								
								
									
**lib/linguist/classifier.yml** (19013 lines, new file; diff suppressed because it is too large)
							| @@ -26,7 +26,7 @@ module Linguist | ||||
|       @overrides.include?(extension) | ||||
|     end | ||||
|  | ||||
| -    # Include?: Return overridden extensions. | ||||
| +    # Internal: Return overridden extensions. | ||||
|     # | ||||
|     # Returns extensions Array. | ||||
|     def self.overridden_extensions | ||||
|   | ||||
							
								
								
									
**lib/linguist/sample.rb** (74 lines, new file)
							| @@ -0,0 +1,74 @@ | ||||
| require 'linguist/classifier' | ||||
| require 'linguist/language' | ||||
|  | ||||
| module Linguist | ||||
|   # Model for accessing classifier training data. | ||||
|   class Sample | ||||
|     # Samples live in test/ for now; we'll eventually move them out | ||||
|     PATH = File.expand_path("../../../test/fixtures", __FILE__) | ||||
|  | ||||
|     # Public: Iterate over each Sample. | ||||
|     # | ||||
|     # &block - Yields Sample to block | ||||
|     # | ||||
|     # Returns nothing. | ||||
|     def self.each(&block) | ||||
|       Dir.entries(PATH).each do |category| | ||||
|         next if category == '.' || category == '..' | ||||
|  | ||||
|         # Skip text and binary for now | ||||
|         # Possibly reconsider this later | ||||
|         next if category == 'text' || category == 'binary' | ||||
|  | ||||
|         # Map directory name to a Language alias | ||||
|         language = Linguist::Language.find_by_alias(category) | ||||
|         raise "No language for #{category.inspect}" unless language | ||||
|  | ||||
|         dirname = File.join(PATH, category) | ||||
|         Dir.entries(dirname).each do |filename| | ||||
|           next if filename == '.' || filename == '..' | ||||
|           yield new(File.join(dirname, filename), language) | ||||
|         end | ||||
|       end | ||||
|  | ||||
|       nil | ||||
|     end | ||||
|  | ||||
|     # Public: Build Classifier from all samples. | ||||
|     # | ||||
|     # Returns trained Classifier. | ||||
|     def self.classifier | ||||
|       classifier = Classifier.new | ||||
|       each { |sample| classifier.train(sample.language, sample.data) } | ||||
|       classifier.gc | ||||
|     end | ||||
|  | ||||
|     # Internal: Initialize Sample. | ||||
|     # | ||||
|     # Samples should be initialized by Sample.each. | ||||
|     # | ||||
|     # path     - String full path to file. | ||||
|     # language - Language of sample. | ||||
|     def initialize(path, language) | ||||
|       @path     = path | ||||
|       @language = language | ||||
|     end | ||||
|  | ||||
|     # Public: Get full path to file. | ||||
|     # | ||||
|     # Returns String. | ||||
|     attr_reader :path | ||||
|  | ||||
|     # Public: Get sample language. | ||||
|     # | ||||
|     # Returns Language. | ||||
|     attr_reader :language | ||||
|  | ||||
|     # Public: Read file contents. | ||||
|     # | ||||
|     # Returns String. | ||||
|     def data | ||||
|       File.read(path) | ||||
|     end | ||||
|   end | ||||
| end | ||||
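Regenerating the persisted database is then just training over every sample and dumping YAML, which is exactly what the new Rakefile `file` task does. A sketch, assuming a checkout with the `test/fixtures` tree:

```ruby
require 'linguist/sample'
require 'yaml'

# Train a fresh classifier on every fixture, then persist it where
# Classifier.instance expects to find it.
classifier = Linguist::Sample.classifier
File.open('lib/linguist/classifier.yml', 'w') do |io|
  YAML.dump(classifier, io)
end
```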
							
								
								
									
**lib/linguist/tokenizer.rb** (157 lines, new file)
							| @@ -0,0 +1,157 @@ | ||||
| module Linguist | ||||
|   # Generic programming language tokenizer. | ||||
|   # | ||||
|   # Tokens are designed for use in the language Bayes classifier. | ||||
|   # The tokenizer strips string literals and comments and preserves | ||||
|   # significant language symbols. | ||||
|   class Tokenizer | ||||
|     # Public: Initialize a Tokenizer. | ||||
|     # | ||||
|     # data - String data to scan. | ||||
|     def initialize(data) | ||||
|       @data = data | ||||
|     end | ||||
|  | ||||
|     # Public: Get source data. | ||||
|     # | ||||
|     # Returns String. | ||||
|     attr_reader :data | ||||
|  | ||||
|     # Public: Extract tokens from data. | ||||
|     # | ||||
|     # Returns Array of token Strings. | ||||
|     def tokens | ||||
|       extract_tokens(data) | ||||
|     end | ||||
|  | ||||
|     # Internal: Extract generic tokens from data. | ||||
|     # | ||||
|     # data - String to scan. | ||||
|     # | ||||
|     # Examples | ||||
|     # | ||||
|     #   extract_tokens("printf('Hello')") | ||||
|     #   # => ['printf', '(', ')'] | ||||
|     # | ||||
|     # Returns Array of token Strings. | ||||
|     def extract_tokens(data) | ||||
|       s = StringScanner.new(data) | ||||
|  | ||||
|       tokens = [] | ||||
|       until s.eos? | ||||
|         # Ruby single line comment | ||||
|         if token = s.scan(/# /) | ||||
|           tokens << "#" | ||||
|           s.skip_until(/\n|\Z/) | ||||
|  | ||||
|         # C style single line comment | ||||
|         elsif token = s.scan(/\/\/ /) | ||||
|           tokens << "//" | ||||
|           s.skip_until(/\n|\Z/) | ||||
|  | ||||
|         # Leading Tex or Matlab comments | ||||
|         elsif token = s.scan(/\n%/) | ||||
|           tokens << "%" | ||||
|           s.skip_until(/\n|\Z/) | ||||
|  | ||||
|         # C multiline comments | ||||
|         elsif token = s.scan(/\/\*/) | ||||
|           tokens << "/*" | ||||
|           s.skip_until(/\*\//) | ||||
|           tokens << "*/" | ||||
|  | ||||
|         # Haskell multiline comments | ||||
|         elsif token = s.scan(/\{-/) | ||||
|           tokens << "{-" | ||||
|           s.skip_until(/-\}/) | ||||
|           tokens << "-}" | ||||
|  | ||||
|         # XML multiline comments | ||||
|         elsif token = s.scan(/<!--/) | ||||
|           tokens << "<!--" | ||||
|           s.skip_until(/-->/) | ||||
|           tokens << "-->" | ||||
|  | ||||
|         # Skip single or double quoted strings | ||||
|         elsif s.scan(/"/) | ||||
|           s.skip_until(/[^\\]"/) | ||||
|         elsif s.scan(/'/) | ||||
|           s.skip_until(/[^\\]'/) | ||||
|  | ||||
|         # Skip number literals | ||||
|         elsif s.scan(/(0x)?\d+/) | ||||
|  | ||||
|         # SGML style brackets | ||||
|         elsif token = s.scan(/<[^\s<>][^<>]*>/) | ||||
|           extract_sgml_tokens(token).each { |t| tokens << t } | ||||
|  | ||||
|         # Common programming punctuation | ||||
|         elsif token = s.scan(/;|\{|\}|\(|\)/) | ||||
|           tokens << token | ||||
|  | ||||
|         # Regular token | ||||
|         elsif token = s.scan(/[\w\.@#\/\*]+/) | ||||
|           tokens << token | ||||
|  | ||||
|         # Common operators | ||||
|         elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/) | ||||
|           tokens << token | ||||
|  | ||||
|         else | ||||
|           s.getch | ||||
|         end | ||||
|       end | ||||
|  | ||||
|       tokens | ||||
|     end | ||||
|  | ||||
|     # Internal: Extract tokens from inside SGML tag. | ||||
|     # | ||||
|     # data - SGML tag String. | ||||
|     # | ||||
|     # Examples | ||||
|     # | ||||
|     #   extract_sgml_tokens("<a href='' class=foo>") | ||||
|     #   # => ["<a>", "href="] | ||||
|     # | ||||
|     # Returns Array of token Strings. | ||||
|     def extract_sgml_tokens(data) | ||||
|       s = StringScanner.new(data) | ||||
|  | ||||
|       tokens = [] | ||||
|  | ||||
|       until s.eos? | ||||
|         # Emit start token | ||||
|         if token = s.scan(/<\/?[^\s>]+/) | ||||
|           tokens << "#{token}>" | ||||
|  | ||||
|         # Emit attributes with trailing = | ||||
|         elsif token = s.scan(/\w+=/) | ||||
|           tokens << token | ||||
|  | ||||
|           # Then skip over attribute value | ||||
|           if s.scan(/"/) | ||||
|             s.skip_until(/[^\\]"/) | ||||
|           elsif s.scan(/'/) | ||||
|             s.skip_until(/[^\\]'/) | ||||
|           else | ||||
|             s.skip_until(/\w+/) | ||||
|           end | ||||
|  | ||||
|         # Emit lone attributes | ||||
|         elsif token = s.scan(/\w+/) | ||||
|           tokens << token | ||||
|  | ||||
|         # Stop at the end of the tag | ||||
|         elsif s.scan(/>/) | ||||
|           s.terminate | ||||
|  | ||||
|         else | ||||
|           s.getch | ||||
|         end | ||||
|       end | ||||
|  | ||||
|       tokens | ||||
|     end | ||||
|   end | ||||
| end | ||||
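On a small Ruby snippet the tokenizer keeps identifiers, punctuation, and comment markers while dropping string and number literals; the expected output below is inferred from the rules above:

```ruby
require 'linguist/tokenizer'

snippet = <<-RUBY
# greet the user
def hello(name)
  puts "hi"
  1 + 1
end
RUBY

p Linguist::Tokenizer.new(snippet).tokens
# => ["#", "def", "hello", "(", "name", ")", "puts", "+", "end"]
```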
							
								
								
									
**test/fixtures/matlab/average.m** (9 lines, new file, vendored)
							| @@ -0,0 +1,9 @@ | ||||
| function y = average(x) | ||||
| % AVERAGE Mean of vector elements. | ||||
| % AVERAGE(X), where X is a vector, is the mean of vector | ||||
| % elements. Nonvector input results in an error. | ||||
| [m,n] = size(x); | ||||
| if (~((m == 1) | (n == 1)) | (m == 1 & n == 1)) | ||||
|     error('Input must be a vector') | ||||
| end | ||||
| y = sum(x)/length(x); | ||||
							
								
								
									
**test/fixtures/matlab/make_filter.m** (38 lines, new file, vendored)
							| @@ -0,0 +1,38 @@ | ||||
| function [filtfcn, statefcn] = makeFilter(b, a) | ||||
| %   FILTFCN = MAKEFILTER(B, A) creates an IIR filtering | ||||
| %   function and returns it in the form of a function handle, | ||||
| %   FILTFCN. Each time you call FILTFCN with a new filter  | ||||
| %   input value, it computes the corresponding new filter  | ||||
| %   output value, updating its internal state vector at the | ||||
| %   same time. | ||||
| % | ||||
| %   [FILTFCN, STATEFCN] = MAKEFILTER(B, A) also returns a  | ||||
| %   function (in the form of a function handle, STATEFCN)  | ||||
| %   that can return the filter's internal state.  The internal | ||||
| %   state vector is in the form of a transposed direct form  | ||||
| %   II delay line. | ||||
|  | ||||
| %   Initialize state vector. To keep this example a bit  | ||||
| %   simpler, assume that a and b have the same length.   | ||||
| %   Also assume that a(1) is 1. | ||||
|  | ||||
| v = zeros(size(a)); | ||||
|  | ||||
| filtfcn =  @iirFilter; | ||||
| statefcn = @getState; | ||||
|  | ||||
|    function yn = iirFilter(xn) | ||||
|       % Update the state vector | ||||
|       v(1) = v(2) + b(1) * xn; | ||||
|       v(2:end-1) = v(3:end) + b(2:end-1) * xn - ... | ||||
|          a(2:end-1) * v(1); | ||||
|       v(end) = b(end) * xn - a(end) * v(1); | ||||
|        | ||||
|       % Output is the first element of the state vector. | ||||
|       yn = v(1); | ||||
|    end | ||||
|  | ||||
|    function vOut = getState | ||||
|       vOut = v; | ||||
|    end | ||||
| end | ||||
							
								
								
									
**test/fixtures/matlab/matlab_function2.m** (33 lines, deleted file, vendored)
							| @@ -1,33 +0,0 @@ | ||||
|    function ret = matlab_function2(A,B) | ||||
| % Simple function that combines two values using function handles and displays | ||||
| % the return value | ||||
|  | ||||
| % create function handles | ||||
| fun1=@interface; | ||||
| fun2=@implementation; | ||||
| fun3=@property; | ||||
| fun4=@synthesize; | ||||
|  | ||||
| % use function handles | ||||
| ret = fun1(A)+fun2(A)+fun3(B)+fun4(B); | ||||
|  | ||||
| % Display the return value | ||||
| disp('Return value in function'); | ||||
| disp(ret); | ||||
|  | ||||
|  | ||||
| function A=interface(A) | ||||
| % simple sub-function with same name Objective-C @keyword | ||||
| A=2*A; | ||||
|  | ||||
| function A=implementation(A) | ||||
| % simple sub-function with same name Objective-C @keyword | ||||
| A=A^2; | ||||
|  | ||||
| function B=property(B) | ||||
| % simple sub-function with same name Objective-C @keyword | ||||
| B=2*B; | ||||
|  | ||||
| function B=synthesize(B) | ||||
| % simple sub-function with same name Objective-C @keyword | ||||
| B=B^2; | ||||
| @@ -1,4 +1,5 @@ | ||||
| require 'linguist/file_blob' | ||||
| +require 'linguist/sample' | ||||
|  | ||||
| require 'test/unit' | ||||
| require 'mime/types' | ||||
| @@ -24,23 +25,6 @@ class TestBlob < Test::Unit::TestCase | ||||
|     blob | ||||
|   end | ||||
|  | ||||
| -  def each_language_fixture | ||||
| -    Dir["#{fixtures_path}/*"].each do |path| | ||||
| -      name = File.basename(path) | ||||
| - | ||||
| -      if name == 'text' || name == 'binary' | ||||
| -        next | ||||
| -      else | ||||
| -        assert language = Language.find_by_alias(name), "No language alias for #{name.inspect}" | ||||
| -      end | ||||
| - | ||||
| -      Dir.entries(path).each do |filename| | ||||
| -        next if filename == '.' || filename == '..' | ||||
| -        yield language, blob(File.join(path, filename)) | ||||
| -      end | ||||
| -    end | ||||
| -  end | ||||
| - | ||||
|   def test_name | ||||
|     assert_equal "foo.rb", blob("foo.rb").name | ||||
|   end | ||||
| @@ -291,9 +275,9 @@ class TestBlob < Test::Unit::TestCase | ||||
|   end | ||||
|  | ||||
|   def test_language | ||||
| -    # Drop any files under test/fixtures/LANGUAGE | ||||
| -    each_language_fixture do |language, blob| | ||||
| -      assert_equal language, blob.language, blob.name | ||||
| +    Sample.each do |sample| | ||||
| +      blob = blob(sample.path) | ||||
| +      assert_equal sample.language, blob.language, blob.name | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   | ||||
							
								
								
									
**test/test_classifier.rb** (82 lines, new file)
							| @@ -0,0 +1,82 @@ | ||||
| require 'linguist/classifier' | ||||
| require 'linguist/language' | ||||
| require 'linguist/sample' | ||||
| require 'linguist/tokenizer' | ||||
|  | ||||
| require 'test/unit' | ||||
|  | ||||
| class TestClassifier < Test::Unit::TestCase | ||||
|   include Linguist | ||||
|  | ||||
|   def fixtures_path | ||||
|     File.expand_path("../fixtures", __FILE__) | ||||
|   end | ||||
|  | ||||
|   def fixture(name) | ||||
|     File.read(File.join(fixtures_path, name)) | ||||
|   end | ||||
|  | ||||
|   def test_instance_freshness | ||||
|     # Just warn; it shouldn't scare people off by breaking the build. | ||||
|     unless Classifier.instance.eql?(Linguist::Sample.classifier) | ||||
|       warn "Classifier database is out of date. Run `bundle exec rake classifier`." | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   def test_classify | ||||
|     classifier = Classifier.new | ||||
|     classifier.train Language["Ruby"], fixture("ruby/foo.rb") | ||||
|     classifier.train Language["Objective-C"], fixture("objective-c/Foo.h") | ||||
|     classifier.train Language["Objective-C"], fixture("objective-c/Foo.m") | ||||
|  | ||||
|     results = classifier.classify(fixture("objective-c/hello.m")) | ||||
|     assert_equal Language["Objective-C"], results.first[0] | ||||
|  | ||||
|     tokens  = Tokenizer.new(fixture("objective-c/hello.m")).tokens | ||||
|     results = classifier.classify(tokens) | ||||
|     assert_equal Language["Objective-C"], results.first[0] | ||||
|   end | ||||
|  | ||||
|   def test_restricted_classify | ||||
|     classifier = Classifier.new | ||||
|     classifier.train Language["Ruby"], fixture("ruby/foo.rb") | ||||
|     classifier.train Language["Objective-C"], fixture("objective-c/Foo.h") | ||||
|     classifier.train Language["Objective-C"], fixture("objective-c/Foo.m") | ||||
|  | ||||
|     results = classifier.classify(fixture("objective-c/hello.m"), [Language["Objective-C"]]) | ||||
|     assert_equal Language["Objective-C"], results.first[0] | ||||
|  | ||||
|     results = classifier.classify(fixture("objective-c/hello.m"), [Language["Ruby"]]) | ||||
|     assert_equal Language["Ruby"], results.first[0] | ||||
|   end | ||||
|  | ||||
|   def test_instance_classify_empty | ||||
|     results = Classifier.instance.classify("") | ||||
|     assert results.first[1] < 0.5, results.first.inspect | ||||
|   end | ||||
|  | ||||
|   def test_verify | ||||
|     assert Classifier.instance.verify | ||||
|   end | ||||
|  | ||||
|   def test_gc | ||||
|     Classifier.instance.gc | ||||
|   end | ||||
|  | ||||
|   def test_classify_ambiguous_languages | ||||
|     Sample.each do |sample| | ||||
|       # TODO: These tests are pending | ||||
|       next if sample.path =~ /hello.h/ | ||||
|       next if sample.path =~ /MainMenuViewController.h/ | ||||
|  | ||||
|       next unless sample.language.overrides.any? | ||||
|  | ||||
|       extname   = File.extname(sample.path) | ||||
|       languages = Language.all.select { |l| l.extensions.include?(extname) } | ||||
|       next unless languages.length > 1 | ||||
|  | ||||
|       results = Classifier.instance.classify(sample.data, languages) | ||||
|       assert_equal sample.language, results.first[0], "#{sample.path}\n#{results.inspect}" | ||||
|     end | ||||
|   end | ||||
| end | ||||
							
								
								
									
**test/test_tokenizer.rb** (91 lines, new file)
							| @@ -0,0 +1,91 @@ | ||||
| require 'linguist/tokenizer' | ||||
|  | ||||
| require 'test/unit' | ||||
|  | ||||
| class TestTokenizer < Test::Unit::TestCase | ||||
|   include Linguist | ||||
|  | ||||
|   def fixtures_path | ||||
|     File.expand_path("../fixtures", __FILE__) | ||||
|   end | ||||
|  | ||||
|   def tokenize(data) | ||||
|     data = File.read(File.join(fixtures_path, data.to_s)) if data.is_a?(Symbol) | ||||
|     Tokenizer.new(data).tokens | ||||
|   end | ||||
|  | ||||
|   def test_skip_string_literals | ||||
|     assert_equal %w(print), tokenize('print ""') | ||||
|     assert_equal %w(print), tokenize('print "Josh"') | ||||
|     assert_equal %w(print), tokenize("print 'Josh'") | ||||
|     assert_equal %w(print), tokenize('print "Hello \"Josh\""') | ||||
|     assert_equal %w(print), tokenize("print 'Hello \\'Josh\\''") | ||||
|   end | ||||
|  | ||||
|   def test_skip_number_literals | ||||
|     assert_equal %w(+), tokenize('1 + 1') | ||||
|     assert_equal %w(add \( \)), tokenize('add(123, 456)') | ||||
|     assert_equal %w(|), tokenize('0x01 | 0x10') | ||||
|   end | ||||
|  | ||||
|   def test_skip_comments | ||||
|     assert_equal %w(foo #), tokenize("foo # Comment") | ||||
|     assert_equal %w(foo # bar), tokenize("foo # Comment\nbar") | ||||
|     assert_equal %w(foo //), tokenize("foo // Comment") | ||||
|     assert_equal %w(foo /* */), tokenize("foo /* Comment */") | ||||
|     assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */") | ||||
|     assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->") | ||||
|     assert_equal %w(foo {- -}), tokenize("foo {- Comment -}") | ||||
|     assert_equal %w(% %), tokenize("2 % 10\n% Comment") | ||||
|   end | ||||
|  | ||||
|   def test_sgml_tags | ||||
|     assert_equal %w(<html> </html>), tokenize("<html></html>") | ||||
|     assert_equal %w(<div> id </div>), tokenize("<div id></div>") | ||||
|     assert_equal %w(<div> id= </div>), tokenize("<div id=foo></div>") | ||||
|     assert_equal %w(<div> id class </div>), tokenize("<div id class></div>") | ||||
|     assert_equal %w(<div> id= </div>), tokenize("<div id=\"foo bar\"></div>") | ||||
|     assert_equal %w(<div> id= </div>), tokenize("<div id='foo bar'></div>") | ||||
|     assert_equal %w(<?xml> version=), tokenize("<?xml version=\"1.0\"?>") | ||||
|   end | ||||
|  | ||||
|   def test_operators | ||||
|     assert_equal %w(+), tokenize("1 + 1") | ||||
|     assert_equal %w(-), tokenize("1 - 1") | ||||
|     assert_equal %w(*), tokenize("1 * 1") | ||||
|     assert_equal %w(/), tokenize("1 / 1") | ||||
|     assert_equal %w(%), tokenize("2 % 5") | ||||
|     assert_equal %w(&), tokenize("1 & 1") | ||||
|     assert_equal %w(&&), tokenize("1 && 1") | ||||
|     assert_equal %w(|), tokenize("1 | 1") | ||||
|     assert_equal %w(||), tokenize("1 || 1") | ||||
|     assert_equal %w(<), tokenize("1 < 0x01") | ||||
|     assert_equal %w(<<), tokenize("1 << 0x01") | ||||
|   end | ||||
|  | ||||
|   def test_c_tokens | ||||
|     assert_equal %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif), tokenize(:"c/hello.h") | ||||
|     assert_equal %w(#include <stdio.h> int main \( \) { printf \( \) ; return ; }), tokenize(:"c/hello.c") | ||||
|   end | ||||
|  | ||||
|   def test_cpp_tokens | ||||
|     assert_equal %w(class Bar { protected char *name ; public void hello \( \) ; }), tokenize(:"cpp/bar.h") | ||||
|     assert_equal %w(#include <iostream> using namespace std ; int main \( \) { cout << << endl ; }), tokenize(:"cpp/hello.cpp") | ||||
|   end | ||||
|  | ||||
|   def test_objective_c_tokens | ||||
|     assert_equal %w(#import <Foundation/Foundation.h> @interface Foo NSObject { } @end), tokenize(:"objective-c/Foo.h") | ||||
|     assert_equal %w(#import @implementation Foo @end), tokenize(:"objective-c/Foo.m") | ||||
|     assert_equal %w(#import <Cocoa/Cocoa.h> int main \( int argc char *argv \) { NSLog \( @ \) ; return ; }), tokenize(:"objective-c/hello.m") | ||||
|   end | ||||
|  | ||||
|   def test_javascript_tokens | ||||
|     assert_equal %w( \( function \( \) { console.log \( \) ; } \) .call \( this \) ;), tokenize(:"javascript/hello.js") | ||||
|   end | ||||
|  | ||||
|   def test_ruby_tokens | ||||
|     assert_equal %w(module Foo end), tokenize(:"ruby/foo.rb") | ||||
|     assert_equal %w(# /usr/bin/env ruby puts), tokenize(:"ruby/script.rb") | ||||
|     assert_equal %w(task default do puts end), tokenize(:"ruby/Rakefile") | ||||
|   end | ||||
| end | ||||