Linguist 2.0.1

Guard against classify nil data
Linguist 2.0.0
2025-10-29 17:50:22 +00:00 · 2012-06-21 11:48:08 -05:00 · 2012-06-21 11:47:32 -05:00 · 2012-06-21 11:28:24 -05:00 · 2012-06-21 09:26:54 -07:00 · 2012-06-21 11:25:34 -05:00
14 changed files with 19730 additions and 57 deletions
--- a/61
+++ b/61
@@ -1,3 +1,4 @@
 require 'rake/clean'
 require 'rake/testtask'
 task :default => :test
@@ -5,3 +6,63 @@ task :default => :test
 Rake::TestTask.new do |t|
  t.warning = true
 end
 # Build the persisted classifier db. Every path under test/fixtures is listed
 # as a prerequisite of the generated file.
 file 'lib/linguist/classifier.yml' => Dir['test/fixtures/**/*'] do |f|
  # Required inside the task body rather than at the top of the Rakefile.
  require 'linguist/sample'
  # Train a new classifier from the sample files.
  classifier = Linguist::Sample.classifier
  # Write the trained classifier to the task's target path as YAML.
  File.open(f.name, 'w') { |io| YAML.dump(classifier, io) }
 end
 CLOBBER.include 'lib/linguist/classifier.yml'
 task :classifier => ['lib/linguist/classifier.yml']
 namespace :classifier do
  LIMIT = 1_000
  desc "Run classifier against #{LIMIT} public gists"
  # Classify files from public gists and print a running
  # correct:incorrect tally with the accuracy percentage. Gist files with
  # no language, or tagged 'Text', are not counted. Stops once LIMIT files
  # have been classified.
  task :test do
    require 'linguist/classifier'
    total = 0
    correct = 0
    incorrect = 0
    $stdout.sync = true
    each_public_gist do |gist_url, file_url, file_language|
      next if [nil, 'Text'].include?(file_language)
      begin
        data = open(file_url).read
        guessed_language, _score = Linguist::Classifier.instance.classify(data).first
        total += 1
        if guessed_language.name == file_language
          correct += 1
        else
          incorrect += 1
        end
        accuracy = (correct.to_f / total.to_f) * 100
        # \r\e[0K returns to column 0 and clears the line so the tally
        # is redrawn in place.
        print format("\r\e[0K%d:%d  %g%%", correct, incorrect, accuracy)
        $stdout.flush
      rescue URI::InvalidURIError
        # Files whose raw URL cannot be parsed are skipped.
      else
        break if total >= LIMIT
      end
    end
    puts ""
  end
  # Internal: Iterate over the files of every public gist, following the
  # API's pagination until there are no more pages.
  #
  # Yields the gist URL String, the file's raw URL String and the file's
  # language (String or nil) for each file of each gist.
  #
  # Returns nothing.
  def each_public_gist
    require 'open-uri'
    require 'json'
    url = "https://api.github.com/gists/public"
    while url
      resp = open(url)
      # The next page is advertised in the Link header. The header is
      # missing on a single-page result and has no rel="next" entry on the
      # last page; in both cases url becomes nil and the loop ends instead
      # of calling open(nil).
      link = resp.meta['link']
      url  = link && link[/<([^>]+)>; rel="next"/, 1]
      gists = JSON.parse(resp.read)
      gists.each do |gist|
        gist['files'].each do |_filename, attrs|
          yield gist['url'], attrs['raw_url'], attrs['language']
        end
      end
    end
    nil
  end
 end
--- a/github-linguist.gemspec
+++ b/github-linguist.gemspec
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
  s.name    = 'github-linguist'
-  s.version = '1.0.0'
+  s.version = '2.0.1'
  s.summary = "GitHub Language detection"
  s.authors = "GitHub"
@@ -12,5 +12,6 @@ Gem::Specification.new do |s|
  s.add_dependency 'escape_utils',    '~> 0.2.3'
  s.add_dependency 'mime-types',      '~> 1.18'
  s.add_dependency 'pygments.rb',     '~> 0.2.13'
  s.add_development_dependency 'json'
  s.add_development_dependency 'rake'
 end
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -1,3 +1,4 @@
 require 'linguist/classifier'
 require 'linguist/language'
 require 'linguist/mime'
 require 'linguist/pathname'
@@ -453,8 +454,15 @@ module Linguist
    # Returns a Language or nil.
    def disambiguate_extension_language
      if Language.ambiguous?(extname)
-        name = "guess_#{extname.sub(/^\./, '')}_language"
+        # name = "guess_#{extname.sub(/^\./, '')}_language"
-        send(name) if respond_to?(name)
+        # send(name) if respond_to?(name)
        possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
        if possible_languages.any?
          if result = Classifier.instance.classify(data, possible_languages).first
            result[0]
          end
        end
      end
    end
--- a/lib/linguist/classifier.rb
+++ b/lib/linguist/classifier.rb
@@ -0,0 +1,184 @@
 require 'linguist/language'
 require 'linguist/tokenizer'
 module Linguist
  # Language bayesian classifier.
  class Classifier
    # Internal: Path to persisted classifier db.
    PATH = File.expand_path('../classifier.yml', __FILE__)
    # Public: Check if persisted db exists on disk.
    #
    # Returns Boolean.
    def self.exist?
      File.exist?(PATH)
    end
    # Public: Get persisted Classifier instance.
    #
    # The db is parsed once and memoized. An instance revived from YAML
    # holds plain Hashes (no default values or default procs), so the
    # methods below never rely on Hash defaults when reading counts.
    #
    # Returns Classifier.
    def self.instance
      @instance ||= YAML.load_file(PATH)
    end
    # Public: Initialize an empty, untrained Classifier.
    def initialize
      @tokens_total    = 0
      @languages_total = 0
      @tokens          = Hash.new { |h, k| h[k] = Hash.new(0) }
      @language_tokens = Hash.new(0)
      @languages       = Hash.new(0)
    end
    # Public: Compare Classifier objects.
    #
    # other - Classifier object to compare to.
    #
    # Returns Boolean.
    def eql?(other)
      # Lazy fast check counts only
      other.is_a?(self.class) &&
        @tokens_total == other.instance_variable_get(:@tokens_total) &&
        @languages_total == other.instance_variable_get(:@languages_total)
    end
    alias_method :==, :eql?
    # Public: Train classifier that data is a certain language.
    #
    # language - Language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   train(Language['Ruby'], "def hello; end")
    #
    # Returns nothing.
    def train(language, data)
      language = language.name
      tokens   = Tokenizer.new(data).tokens
      tokens.each do |token|
        # Explicit fetch defaults keep training working on an instance
        # revived from YAML, whose Hashes have lost their defaults.
        counts = (@tokens[language] ||= Hash.new(0))
        counts[token] = counts.fetch(token, 0) + 1
        @language_tokens[language] = @language_tokens.fetch(language, 0) + 1
        @tokens_total += 1
      end
      @languages[language] = @languages.fetch(language, 0) + 1
      @languages_total += 1
      nil
    end
    # Public: Verify internal counts are consistent.
    #
    # Checks that the per-language sample counts add up to the sample total
    # and that both token tables add up to the token total.
    #
    # Returns Boolean.
    def verify
      samples         = @languages.values.inject(0) { |n, c| n + c }
      language_tokens = @language_tokens.values.inject(0) { |n, c| n + c }
      tokens          = @tokens.values.inject(0) do |n, counts|
        n + counts.values.inject(0) { |m, c| m + c }
      end
      samples == @languages_total &&
        language_tokens == @tokens_total &&
        tokens == @tokens_total
    end
    # Public: Prune infrequent tokens.
    #
    # Currently a no-op.
    #
    # Returns receiver Classifier instance.
    def gc
      self
    end
    # Public: Guess language of data.
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of Languages (or language name Strings) to restrict
    #             to. Defaults to every trained language.
    #
    # Examples
    #
    #   classify("def hello; end")
    #   # => [ [Language['Ruby'], -12.3], [Language['Python'], -20.1], ... ]
    #
    # Returns Array of result pairs sorted by descending score. Each pair
    # contains the Language and a Float log score. Returns an empty Array
    # when tokens is nil.
    def classify(tokens, languages = @languages.keys)
      return [] if tokens.nil?
      tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
      scores = {}
      languages.each do |language|
        language_name = language.is_a?(Language) ? language.name : language
        scores[language_name] = tokens_probability(tokens, language_name) +
                                   language_probability(language_name)
      end
      scores.sort { |a, b| b[1] <=> a[1] }.map { |name, score| [Language[name], score] }
    end
    # Internal: Log probability of a set of tokens occurring in a
    # language - log P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - String language name to check.
    #
    # Returns Float sum of the natural logs of each token's probability
    # (0.0 for an empty token list).
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end
    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - String language name to check.
    #
    # Returns Float. Tokens never seen in the language (including every
    # token of a language with no training data) get 1 / total token count.
    def token_probability(token, language)
      # fetch instead of @tokens[language][token]: reading must not insert
      # an empty bucket through the default proc, and must not raise when
      # the Hash has no defaults and the language is unknown.
      counts = @tokens.fetch(language, nil)
      count  = counts ? counts[token].to_f : 0.0
      if count == 0.0
        1 / @tokens_total.to_f
      else
        count / @language_tokens[language].to_f
      end
    end
    # Internal: Log probability of a language occurring - log P(C)
    #
    # language - String language name to check.
    #
    # Returns Float natural log of the language's share of the training
    # samples.
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end
    # Public: Serialize classifier to YAML.
    #
    # Languages and tokens are written in sorted order.
    #
    # io - IO object to write the YAML document to.
    #
    # Returns nothing.
    def to_yaml(io)
      data = "--- !ruby/object:Linguist::Classifier\n"
      data << "languages_total: #{@languages_total}\n"
      data << "tokens_total: #{@tokens_total}\n"
      data << "languages:\n"
      @languages.sort.each do |language, count|
        data << "  #{yaml_entry(language, count)}"
      end
      data << "language_tokens:\n"
      @language_tokens.sort.each do |language, count|
        data << "  #{yaml_entry(language, count)}"
      end
      data << "tokens:\n"
      @tokens.sort.each do |language, tokens|
        # Emit only the "language:" key; its token counts follow nested.
        data << "  #{yaml_entry(language, true).sub(/ true/, "")}"
        tokens.sort.each do |token, count|
          data << "    #{yaml_entry(token, count)}"
        end
      end
      io.write data
      nil
    end
    private
    # Internal: Render one "key: value\n" mapping line, letting the YAML
    # library handle any quoting of the key.
    def yaml_entry(key, value)
      # Line 0 of the dumped document is the "---" header.
      {key => value}.to_yaml.lines.to_a[1]
    end
  end
  # Eager load instance
  Classifier.instance if Classifier.exist?
 end
--- a/lib/linguist/classifier.yml
+++ b/lib/linguist/classifier.yml
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -26,7 +26,7 @@ module Linguist
      @overrides.include?(extension)
    end
-    # Include?: Return overridden extensions.
+    # Internal: Return overridden extensions.
    #
    # Returns extensions Array.
    def self.overridden_extensions
--- a/lib/linguist/sample.rb
+++ b/lib/linguist/sample.rb
@@ -0,0 +1,74 @@
 require 'linguist/classifier'
 require 'linguist/language'
 module Linguist
  # Model for accessing classifier training data.
  class Sample
    # Samples live in test/ for now, we'll eventually move them out
    PATH = File.expand_path("../../../test/fixtures", __FILE__)
    # Public: Walk every training sample on disk.
    #
    # Each directory directly under PATH is a category named after a
    # Language alias; each entry inside it is one sample of that language.
    #
    # &block - Yields Sample to block
    #
    # Raises RuntimeError if a category does not map to a Language.
    # Returns nothing.
    def self.each(&block)
      Dir.entries(PATH).each do |category|
        # Dot entries are not categories. text and binary are skipped for
        # now; possibly reconsider this later.
        next if %w(. .. text binary).include?(category)
        # Map directory name to a Language alias
        language = Linguist::Language.find_by_alias(category)
        raise "No language for #{category.inspect}" unless language
        dirname = File.join(PATH, category)
        (Dir.entries(dirname) - %w(. ..)).each do |filename|
          yield new(File.join(dirname, filename), language)
        end
      end
      nil
    end
    # Public: Build Classifier from all samples.
    #
    # Returns trained Classifier.
    def self.classifier
      trained = Classifier.new
      each do |sample|
        trained.train(sample.language, sample.data)
      end
      trained.gc
    end
    # Public: Get full path to file (String) and the sample's Language.
    attr_reader :path, :language
    # Internal: Initialize Sample.
    #
    # Samples should be initialized by Sample.each.
    #
    # path     - String full path to file.
    # language - Language of sample.
    def initialize(path, language)
      @path     = path
      @language = language
    end
    # Public: Read file contents.
    #
    # The file is read on every call; nothing is cached.
    #
    # Returns String.
    def data
      File.open(path) { |file| file.read }
    end
  end
 end
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -0,0 +1,157 @@
 module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Initialize a Tokenizer.
    #
    # data - String data to scan.
    def initialize(data)
      @data = data
    end
    # Public: Get source data.
    #
    # Returns String.
    attr_reader :data
    # Public: Extract tokens from data.
    #
    # Returns Array of token Strings.
    def tokens
      extract_tokens(data)
    end
    # Internal: Extract generic tokens from data.
    #
    # Comment bodies, string literals and number literals are dropped;
    # comment delimiters, SGML tags, punctuation, words and operators are
    # kept. Branch order matters: earlier patterns win.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)
      tokens = []
      until s.eos?
        # Ruby single line comment
        if s.scan(/# /)
          tokens << "#"
          s.skip_until(/\n|\Z/)
        # C style single line comment
        elsif s.scan(/\/\/ /)
          tokens << "//"
          s.skip_until(/\n|\Z/)
        # Leading Tex or Matlab comments
        elsif s.scan(/\n%/)
          tokens << "%"
          s.skip_until(/\n|\Z/)
        # C multiline comments
        elsif s.scan(/\/\*/)
          tokens << "/*"
          s.skip_until(/\*\//)
          tokens << "*/"
        # Haskell multiline comments
        elsif s.scan(/\{-/)
          tokens << "{-"
          s.skip_until(/-\}/)
          tokens << "-}"
        # XML multiline comments
        elsif s.scan(/<!--/)
          tokens << "<!--"
          s.skip_until(/-->/)
          tokens << "-->"
        # Skip single or double quoted strings
        elsif s.scan(/"/)
          skip_quoted(s, '"')
        elsif s.scan(/'/)
          skip_quoted(s, "'")
        # Skip number literals
        elsif s.scan(/(0x)?\d+/)
        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }
        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)/)
          tokens << token
        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token
        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token
        else
          s.getch
        end
      end
      tokens
    end
    # Internal: Extract tokens from inside SGML tag.
    #
    # Emits the tag name (closed with ">") and attribute names; attribute
    # values are skipped.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)
      tokens = []
      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"
        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token
          # Then skip over attribute value
          if s.scan(/"/)
            skip_quoted(s, '"')
          elsif s.scan(/'/)
            skip_quoted(s, "'")
          else
            s.skip_until(/\w+/)
          end
        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token
        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate
        else
          s.getch
        end
      end
      tokens
    end
    private
    # Internal: Skip the rest of a quoted literal whose opening quote has
    # just been consumed.
    #
    # The closing-quote pattern needs one non-backslash character before
    # the quote, so it can never match the closing quote of an empty
    # literal ("" or ''). Without the explicit check the scanner would run
    # on to the next quote character anywhere in the input.
    #
    # scanner - StringScanner positioned right after the opening quote.
    # quote   - The quote character String, either '"' or "'".
    #
    # Returns nothing meaningful.
    def skip_quoted(scanner, quote)
      if scanner.peek(1) == quote
        scanner.getch
      else
        scanner.skip_until(quote == '"' ? /[^\\]"/ : /[^\\]'/)
      end
    end
  end
 end
--- a/test/fixtures/matlab/average.m
+++ b/test/fixtures/matlab/average.m
@@ -0,0 +1,9 @@
 function y = average(x)
 % AVERAGE Mean of vector elements.
 % AVERAGE(X), where X is a vector, is the mean of vector
 % elements. Nonvector input results in an error.
 [m,n] = size(x);
 if (~((m == 1) | (n == 1)) | (m == 1 & n == 1))
    error('Input must be a vector')
 end
 y = sum(x)/length(x);
--- a/test/fixtures/matlab/make_filter.m
+++ b/test/fixtures/matlab/make_filter.m
@@ -0,0 +1,38 @@
 function [filtfcn, statefcn] = makeFilter(b, a)
 %   FILTFCN = MAKEFILTER(B, A) creates an IIR filtering
 %   function and returns it in the form of a function handle,
 %   FILTFCN. Each time you call FILTFCN with a new filter 
 %   input value, it computes the corresponding new filter 
 %   output value, updating its internal state vector at the
 %   same time.
 %
 %   [FILTFCN, STATEFCN] = MAKEFILTER(B, A) also returns a 
 %   function (in the form of a function handle, STATEFCN) 
 %   that can return the filter's internal state.  The internal
 %   state vector is in the form of a transposed direct form 
 %   II delay line.
 %   Initialize state vector. To keep this example a bit 
 %   simpler, assume that a and b have the same length.  
 %   Also assume that a(1) is 1.
 v = zeros(size(a));
 filtfcn =  @iirFilter;
 statefcn = @getState;
   function yn = iirFilter(xn)
      % Update the state vector
      v(1) = v(2) + b(1) * xn;
      v(2:end-1) = v(3:end) + b(2:end-1) * xn - ...
         a(2:end-1) * v(1);
      v(end) = b(end) * xn - a(end) * v(1);
      % Output is the first element of the state vector.
      yn = v(1);
   end
   function vOut = getState
      vOut = v;
   end
 end
--- a/test/fixtures/matlab/matlab_function2.m
+++ b/test/fixtures/matlab/matlab_function2.m
@@ -1,33 +0,0 @@
   function ret = matlab_function2(A,B)
 % Simple function that combines two values using function handles and displays
 % the return value
 % create function handles
 fun1=@interface;
 fun2=@implementation;
 fun3=@property;
 fun4=@synthesize;
 % use function handles
 ret = fun1(A)+fun2(A)+fun3(B)+fun4(B);
 % Display the return value
 disp('Return value in function');
 disp(ret);
 function A=interface(A)
 % simple sub-function with same name Objective-C @keyword
 A=2*A;
 function A=implementation(A)
 % simple sub-function with same name Objective-C @keyword
 A=A^2;
 function B=property(B)
 % simple sub-function with same name Objective-C @keyword
 B=2*B;
 function B=synthesize(B)
 % simple sub-function with same name Objective-C @keyword
 B=B^2;
--- a/test/test_blob.rb
+++ b/test/test_blob.rb
@@ -1,4 +1,5 @@
 require 'linguist/file_blob'
 require 'linguist/sample'
 require 'test/unit'
 require 'mime/types'
@@ -24,23 +25,6 @@ class TestBlob < Test::Unit::TestCase
    blob
  end
  def each_language_fixture
    Dir["#{fixtures_path}/*"].each do |path|
      name = File.basename(path)
      if name == 'text' || name == 'binary'
        next
      else
        assert language = Language.find_by_alias(name), "No language alias for #{name.inspect}"
      end
      Dir.entries(path).each do |filename|
        next if filename == '.' || filename == '..'
        yield language, blob(File.join(path, filename))
      end
    end
  end
  def test_name
    assert_equal "foo.rb", blob("foo.rb").name
  end
@@ -291,9 +275,9 @@ class TestBlob < Test::Unit::TestCase
  end
  def test_language
-    # Drop any files under test/fixtures/LANGUAGE
+    Sample.each do |sample|
-    each_language_fixture do |language, blob|
+      blob = blob(sample.path)
-      assert_equal language, blob.language, blob.name
+      assert_equal sample.language, blob.language, blob.name
    end
  end
--- a/test/test_classifier.rb
+++ b/test/test_classifier.rb
@@ -0,0 +1,86 @@
 require 'linguist/classifier'
 require 'linguist/language'
 require 'linguist/sample'
 require 'linguist/tokenizer'
 require 'test/unit'
 # Unit tests for Linguist::Classifier. Some tests train a fresh classifier
 # from fixture files; the test_instance_* tests, test_verify, test_gc and
 # test_classify_ambiguous_languages use the persisted Classifier.instance.
 class TestClassifier < Test::Unit::TestCase
  include Linguist
  # Absolute path of the fixtures directory next to this test file.
  def fixtures_path
    File.expand_path("../fixtures", __FILE__)
  end
  # Read a fixture file.
  #
  # name - String path relative to fixtures_path.
  #
  # Returns the file contents String.
  def fixture(name)
    File.read(File.join(fixtures_path, name))
  end
  # Compares the persisted instance with one trained from the current
  # samples. A mismatch only prints a warning; it never fails the test.
  def test_instance_freshness
    # Just warn, it shouldn't scare people off by breaking the build.
    unless Classifier.instance.eql?(Linguist::Sample.classifier)
      warn "Classifier database is out of date. Run `bundle exec rake classifier`."
    end
  end
  # Trains on one Ruby and two Objective-C files, then checks that an
  # Objective-C file is classified correctly whether it is passed as a
  # String or as an Array of tokens.
  def test_classify
    classifier = Classifier.new
    classifier.train Language["Ruby"], fixture("ruby/foo.rb")
    classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
    classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
    results = classifier.classify(fixture("objective-c/hello.m"))
    assert_equal Language["Objective-C"], results.first[0]
    tokens  = Tokenizer.new(fixture("objective-c/hello.m")).tokens
    results = classifier.classify(tokens)
    assert_equal Language["Objective-C"], results.first[0]
  end
  # With the candidate list restricted to one language, that language is
  # the first result, even when the data is an Objective-C file and the
  # only candidate is Ruby.
  def test_restricted_classify
    classifier = Classifier.new
    classifier.train Language["Ruby"], fixture("ruby/foo.rb")
    classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
    classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
    results = classifier.classify(fixture("objective-c/hello.m"), [Language["Objective-C"]])
    assert_equal Language["Objective-C"], results.first[0]
    results = classifier.classify(fixture("objective-c/hello.m"), [Language["Ruby"]])
    assert_equal Language["Ruby"], results.first[0]
  end
  # Empty input still produces results; the top score must be below 0.5.
  def test_instance_classify_empty
    results = Classifier.instance.classify("")
    assert results.first[1] < 0.5, results.first.inspect
  end
  # nil data yields an empty result list rather than an error.
  def test_instance_classify_nil
    assert_equal [], Classifier.instance.classify(nil)
  end
  # The persisted instance's internal counts must be consistent.
  def test_verify
    assert Classifier.instance.verify
  end
  # Smoke test: only checks that gc can be called without raising.
  def test_gc
    Classifier.instance.gc
  end
  # For every sample whose language has overrides and whose file extension
  # is claimed by more than one language, the persisted classifier
  # restricted to those languages must rank the sample's language first.
  def test_classify_ambiguous_languages
    Sample.each do |sample|
      # TODO: These tests are pending
      next if sample.path =~ /hello.h/
      next if sample.path =~ /MainMenuViewController.h/
      next unless sample.language.overrides.any?
      extname   = File.extname(sample.path)
      languages = Language.all.select { |l| l.extensions.include?(extname) }
      next unless languages.length > 1
      results = Classifier.instance.classify(sample.data, languages)
      assert_equal sample.language, results.first[0], "#{sample.path}\n#{results.inspect}"
    end
  end
 end
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -0,0 +1,91 @@
 require 'linguist/tokenizer'
 require 'test/unit'
 # Unit tests for Linguist::Tokenizer. Each assertion compares the token
 # Array for an inline snippet or a fixture file against a literal list.
 class TestTokenizer < Test::Unit::TestCase
  include Linguist
  # Absolute path of the fixtures directory next to this test file.
  def fixtures_path
    File.expand_path("../fixtures", __FILE__)
  end
  # Tokenize a snippet or a fixture.
  #
  # data - String of source to tokenize directly, or a Symbol naming a
  #        fixture file relative to fixtures_path whose contents are read
  #        first.
  #
  # Returns Array of token Strings.
  def tokenize(data)
    data = File.read(File.join(fixtures_path, data.to_s)) if data.is_a?(Symbol)
    Tokenizer.new(data).tokens
  end
  # String literals, including empty ones and ones containing escaped
  # quotes, produce no tokens.
  def test_skip_string_literals
    assert_equal %w(print), tokenize('print ""')
    assert_equal %w(print), tokenize('print "Josh"')
    assert_equal %w(print), tokenize("print 'Josh'")
    assert_equal %w(print), tokenize('print "Hello \"Josh\""')
    assert_equal %w(print), tokenize("print 'Hello \\'Josh\\''")
  end
  # Decimal and 0x-prefixed number literals produce no tokens.
  def test_skip_number_literals
    assert_equal %w(+), tokenize('1 + 1')
    assert_equal %w(add \( \)), tokenize('add(123, 456)')
    assert_equal %w(|), tokenize('0x01 | 0x10')
  end
  # Comment text is dropped but the comment delimiters are kept as tokens.
  # The last case checks that a % at the start of a line opens a comment
  # while a % inside an expression is an operator.
  def test_skip_comments
    assert_equal %w(foo #), tokenize("foo # Comment")
    assert_equal %w(foo # bar), tokenize("foo # Comment\nbar")
    assert_equal %w(foo //), tokenize("foo // Comment")
    assert_equal %w(foo /* */), tokenize("foo /* Comment */")
    assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
    assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")
    assert_equal %w(foo {- -}), tokenize("foo {- Comment -}")
    assert_equal %w(% %), tokenize("2 % 10\n% Comment")
  end
  # Tags become "<name>" tokens; attributes are kept by name (with a
  # trailing = when they carry a value) and their values are dropped.
  def test_sgml_tags
    assert_equal %w(<html> </html>), tokenize("<html></html>")
    assert_equal %w(<div> id </div>), tokenize("<div id></div>")
    assert_equal %w(<div> id= </div>), tokenize("<div id=foo></div>")
    assert_equal %w(<div> id class </div>), tokenize("<div id class></div>")
    assert_equal %w(<div> id= </div>), tokenize("<div id=\"foo bar\"></div>")
    assert_equal %w(<div> id= </div>), tokenize("<div id='foo bar'></div>")
    assert_equal %w(<?xml> version=), tokenize("<?xml version=\"1.0\"?>")
  end
  # Each operator is emitted as a single token, including the
  # two-character forms &&, || and <<.
  def test_operators
    assert_equal %w(+), tokenize("1 + 1")
    assert_equal %w(-), tokenize("1 - 1")
    assert_equal %w(*), tokenize("1 * 1")
    assert_equal %w(/), tokenize("1 / 1")
    assert_equal %w(%), tokenize("2 % 5")
    assert_equal %w(&), tokenize("1 & 1")
    assert_equal %w(&&), tokenize("1 && 1")
    assert_equal %w(|), tokenize("1 | 1")
    assert_equal %w(||), tokenize("1 || 1")
    assert_equal %w(<), tokenize("1 < 0x01")
    assert_equal %w(<<), tokenize("1 << 0x01")
  end
  # Expected token streams for the C fixture files.
  def test_c_tokens
    assert_equal %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif), tokenize(:"c/hello.h")
    assert_equal %w(#include <stdio.h> int main \( \) { printf \( \) ; return ; }), tokenize(:"c/hello.c")
  end
  # Expected token streams for the C++ fixture files.
  def test_cpp_tokens
    assert_equal %w(class Bar { protected char *name ; public void hello \( \) ; }), tokenize(:"cpp/bar.h")
    assert_equal %w(#include <iostream> using namespace std ; int main \( \) { cout << << endl ; }), tokenize(:"cpp/hello.cpp")
  end
  # Expected token streams for the Objective-C fixture files.
  def test_objective_c_tokens
    assert_equal %w(#import <Foundation/Foundation.h> @interface Foo NSObject { } @end), tokenize(:"objective-c/Foo.h")
    assert_equal %w(#import @implementation Foo @end), tokenize(:"objective-c/Foo.m")
    assert_equal %w(#import <Cocoa/Cocoa.h> int main \( int argc char *argv \) { NSLog \( @ \) ; return ; }), tokenize(:"objective-c/hello.m")
  end
  # Expected token stream for the JavaScript fixture file.
  def test_javascript_tokens
    assert_equal %w( \( function \( \) { console.log \( \) ; } \) .call \( this \) ;), tokenize(:"javascript/hello.js")
  end
  # Expected token streams for the Ruby fixture files.
  def test_ruby_tokens
    assert_equal %w(module Foo end), tokenize(:"ruby/foo.rb")
    assert_equal %w(# /usr/bin/env ruby puts), tokenize(:"ruby/script.rb")
    assert_equal %w(task default do puts end), tokenize(:"ruby/Rakefile")
  end
 end
Author	SHA1	Message	Date
Joshua Peek	cf624e44ff	Linguist 2.0.1	2012-06-21 11:48:08 -05:00
Joshua Peek	2b712dc790	Guard against classify nil data	2012-06-21 11:47:32 -05:00
Joshua Peek	3d7364877d	Linguist 2.0.0	2012-06-21 11:28:24 -05:00
Joshua Peek	2a324c6289	Merge pull request #172 from github/bayesian Bayesian Classifier	2012-06-21 09:26:54 -07:00
Joshua Peek	77a6a41fc3	Merge branch 'master' into bayesian	2012-06-21 11:25:34 -05:00
Joshua Peek	076bf7d0c8	Use classifier as primary method for disambiguation	2012-06-21 10:55:26 -05:00
Joshua Peek	540f2a0941	More matlab samples	2012-06-21 10:44:31 -05:00
Joshua Peek	497da86262	Strip tex and matlab leading inline comments	2012-06-21 10:38:28 -05:00
Joshua Peek	4b9b8a5058	Remove matlab file with bogus keywords	2012-06-21 10:25:30 -05:00
Joshua Peek	5568489123	Rebuild classifier db	2012-06-20 17:17:09 -05:00
Joshua Peek	5cdd5e206c	Improve operator tokenizing	2012-06-20 17:16:53 -05:00
Joshua Peek	c353d3a050	Fix indent	2012-06-20 16:58:32 -05:00
Joshua Peek	6252f12175	Rebuild classifier db	2012-06-20 16:54:40 -05:00
Joshua Peek	0067f28246	YAML sucks	2012-06-20 16:54:29 -05:00
Joshua Peek	ac23d64d26	Merge branch 'master' into bayesian	2012-06-20 16:24:39 -05:00
Joshua Peek	9c9607e42c	Log regexp and classifier guess mismatches	2012-06-20 16:20:59 -05:00
Joshua Peek	516a220d9f	Verify classifer counts	2012-06-20 15:48:46 -05:00
Joshua Peek	7bcf90c527	Skip gc step for now	2012-06-20 15:13:06 -05:00
Joshua Peek	8c83cbe244	Merge branch 'master' into bayesian Conflicts: linguist.gemspec	2012-06-20 14:56:15 -05:00
Joshua Peek	26f95507ef	Test against real gist data	2012-06-20 14:55:13 -05:00
Joshua Peek	4324971cea	Remove debug line	2012-06-20 14:11:23 -05:00
Joshua Peek	2672089154	Ensure language is loaded	2012-06-20 14:10:34 -05:00
Joshua Peek	48ecae0c95	Rebuild classifier db	2012-06-20 12:50:42 -05:00
Joshua Peek	5daaee88b4	Sort classifier yaml output	2012-06-20 12:50:05 -05:00
Joshua Peek	db9475f240	Rebuild classifier data	2012-06-20 11:27:18 -05:00
Joshua Peek	f68e94f181	Skip number literals	2012-06-20 11:26:14 -05:00
Joshua Peek	cb70572163	Rebuild classifier db	2012-06-20 11:19:36 -05:00
Joshua Peek	e9eae4e008	Skip pending tests	2012-06-20 11:19:02 -05:00
Joshua Peek	e33d8f3685	Merge branch 'master' into bayesian	2012-06-20 11:18:47 -05:00
Joshua Peek	645a87d02b	Remove dead fixture test	2012-06-19 16:34:13 -05:00
Joshua Peek	4484011f08	Switch to log probabilities to avoid float underflows	2012-06-19 16:33:29 -05:00
Joshua Peek	c114d710f8	Test classifier on ambiguous languages	2012-06-19 16:32:56 -05:00
Joshua Peek	9810c693c3	Rebuild classifier db	2012-06-19 16:30:46 -05:00
Joshua Peek	c804d04072	Merge branch 'master' into bayesian	2012-06-19 16:29:01 -05:00
Joshua Peek	fdd81ce0be	Merge branch 'master' into bayesian	2012-06-19 16:26:43 -05:00
Joshua Peek	176f6483d0	Ensure token probability is less than 1.0	2012-06-19 15:26:56 -05:00
Joshua Peek	ee6650f83f	Fix doc typo	2012-06-19 15:23:23 -05:00
Joshua Peek	3fee3ac549	Rebuild classifier db	2012-06-19 15:23:06 -05:00
Joshua Peek	9d555862c3	Merge branch 'master' into bayesian	2012-06-19 15:02:02 -05:00
Joshua Peek	ddf3ec4a5b	Warn if classifier instance is out of date	2012-06-19 14:32:04 -05:00
Joshua Peek	e2b0f6bb50	Depend classifier db on fixtures	2012-06-19 14:23:12 -05:00
Joshua Peek	4d5c9b951b	Rebuild classifier	2012-06-19 14:22:42 -05:00
Joshua Peek	d566b35020	Allow classifer languages to be scoped	2012-06-19 14:21:42 -05:00
Joshua Peek	8f85a447de	Allow tokens to be passed directly to classify	2012-06-19 14:17:27 -05:00
Joshua Peek	d0691988a9	More classifier docs	2012-06-19 14:15:10 -05:00
Joshua Peek	d9ecbf0c24	Doc sample class	2012-06-19 13:30:28 -05:00
Joshua Peek	d5fa8cbcb7	Refactor tokenizer test helper	2012-06-19 13:12:17 -05:00
Joshua Peek	555573071e	More tokenizer docs	2012-06-19 13:09:23 -05:00
Joshua Peek	ecb2397e59	Merge branch 'master' into bayesian	2012-06-19 11:43:48 -05:00
Joshua Peek	12cfab6d50	Rebuild classifier data	2012-06-08 16:04:52 -05:00
Joshua Peek	8a75d4d208	GC classifier db	2012-06-08 16:04:43 -05:00
Joshua Peek	fd8b70ffa4	Rebuild classifier data	2012-06-08 15:49:35 -05:00
Joshua Peek	62498cf0e9	Merge branch 'master' into bayesian	2012-06-08 15:46:48 -05:00
Joshua Peek	543922c68a	Rebuild classifier data	2012-06-08 14:48:04 -05:00
Joshua Peek	6f6dd8bc38	Improve tokenizing sgml tags	2012-06-08 14:46:16 -05:00
Joshua Peek	8351d55c56	Don't crash if classifier data is missing	2012-06-08 14:46:06 -05:00
Joshua Peek	9ecab364d1	Dump classifier results	2012-06-08 14:13:26 -05:00
Joshua Peek	0172623061	Add sample gathering class	2012-06-08 13:51:49 -05:00
Joshua Peek	e5ae9c328b	Use language name as hash key	2012-06-08 13:43:57 -05:00
Joshua Peek	e0c777d995	Fix test name	2012-06-08 13:43:37 -05:00
Joshua Peek	f747b49347	Add simple classifier	2012-06-07 17:10:28 -05:00
Joshua Peek	e0cbe815a3	Add basic Tokenizer	2012-06-07 14:55:11 -05:00