Compare commits

..

60 Commits

Author SHA1 Message Date
Joshua Peek
3d7364877d Linguist 2.0.0 2012-06-21 11:28:24 -05:00
Joshua Peek
2a324c6289 Merge pull request #172 from github/bayesian
Bayesian Classifier
2012-06-21 09:26:54 -07:00
Joshua Peek
77a6a41fc3 Merge branch 'master' into bayesian 2012-06-21 11:25:34 -05:00
Joshua Peek
076bf7d0c8 Use classifier as primary method for disambiguation 2012-06-21 10:55:26 -05:00
Joshua Peek
540f2a0941 More matlab samples 2012-06-21 10:44:31 -05:00
Joshua Peek
497da86262 Strip tex and matlab leading inline comments 2012-06-21 10:38:28 -05:00
Joshua Peek
4b9b8a5058 Remove matlab file with bogus keywords 2012-06-21 10:25:30 -05:00
Joshua Peek
5568489123 Rebuild classifier db 2012-06-20 17:17:09 -05:00
Joshua Peek
5cdd5e206c Improve operator tokenizing 2012-06-20 17:16:53 -05:00
Joshua Peek
c353d3a050 Fix indent 2012-06-20 16:58:32 -05:00
Joshua Peek
6252f12175 Rebuild classifier db 2012-06-20 16:54:40 -05:00
Joshua Peek
0067f28246 YAML sucks 2012-06-20 16:54:29 -05:00
Joshua Peek
ac23d64d26 Merge branch 'master' into bayesian 2012-06-20 16:24:39 -05:00
Joshua Peek
9c9607e42c Log regexp and classifier guess mismatches 2012-06-20 16:20:59 -05:00
Joshua Peek
516a220d9f Verify classifer counts 2012-06-20 15:48:46 -05:00
Joshua Peek
7bcf90c527 Skip gc step for now 2012-06-20 15:13:06 -05:00
Joshua Peek
8c83cbe244 Merge branch 'master' into bayesian
Conflicts:
	linguist.gemspec
2012-06-20 14:56:15 -05:00
Joshua Peek
26f95507ef Test against real gist data 2012-06-20 14:55:13 -05:00
Joshua Peek
4324971cea Remove debug line 2012-06-20 14:11:23 -05:00
Joshua Peek
2672089154 Ensure language is loaded 2012-06-20 14:10:34 -05:00
Joshua Peek
48ecae0c95 Rebuild classifier db 2012-06-20 12:50:42 -05:00
Joshua Peek
5daaee88b4 Sort classifier yaml output 2012-06-20 12:50:05 -05:00
Joshua Peek
db9475f240 Rebuild classifier data 2012-06-20 11:27:18 -05:00
Joshua Peek
f68e94f181 Skip number literals 2012-06-20 11:26:14 -05:00
Joshua Peek
cb70572163 Rebuild classifier db 2012-06-20 11:19:36 -05:00
Joshua Peek
e9eae4e008 Skip pending tests 2012-06-20 11:19:02 -05:00
Joshua Peek
e33d8f3685 Merge branch 'master' into bayesian 2012-06-20 11:18:47 -05:00
Joshua Peek
645a87d02b Remove dead fixture test 2012-06-19 16:34:13 -05:00
Joshua Peek
4484011f08 Switch to log probabilities to avoid float underflows 2012-06-19 16:33:29 -05:00
Joshua Peek
c114d710f8 Test classifier on ambiguous languages 2012-06-19 16:32:56 -05:00
Joshua Peek
9810c693c3 Rebuild classifier db 2012-06-19 16:30:46 -05:00
Joshua Peek
c804d04072 Merge branch 'master' into bayesian 2012-06-19 16:29:01 -05:00
Joshua Peek
fdd81ce0be Merge branch 'master' into bayesian 2012-06-19 16:26:43 -05:00
Joshua Peek
176f6483d0 Ensure token probability is less than 1.0 2012-06-19 15:26:56 -05:00
Joshua Peek
ee6650f83f Fix doc typo 2012-06-19 15:23:23 -05:00
Joshua Peek
3fee3ac549 Rebuild classifier db 2012-06-19 15:23:06 -05:00
Joshua Peek
9d555862c3 Merge branch 'master' into bayesian 2012-06-19 15:02:02 -05:00
Joshua Peek
ddf3ec4a5b Warn if classifier instance is out of date 2012-06-19 14:32:04 -05:00
Joshua Peek
e2b0f6bb50 Depend classifier db on fixtures 2012-06-19 14:23:12 -05:00
Joshua Peek
4d5c9b951b Rebuild classifier 2012-06-19 14:22:42 -05:00
Joshua Peek
d566b35020 Allow classifer languages to be scoped 2012-06-19 14:21:42 -05:00
Joshua Peek
8f85a447de Allow tokens to be passed directly to classify 2012-06-19 14:17:27 -05:00
Joshua Peek
d0691988a9 More classifier docs 2012-06-19 14:15:10 -05:00
Joshua Peek
d9ecbf0c24 Doc sample class 2012-06-19 13:30:28 -05:00
Joshua Peek
d5fa8cbcb7 Refactor tokenizer test helper 2012-06-19 13:12:17 -05:00
Joshua Peek
555573071e More tokenizer docs 2012-06-19 13:09:23 -05:00
Joshua Peek
ecb2397e59 Merge branch 'master' into bayesian 2012-06-19 11:43:48 -05:00
Joshua Peek
12cfab6d50 Rebuild classifier data 2012-06-08 16:04:52 -05:00
Joshua Peek
8a75d4d208 GC classifier db 2012-06-08 16:04:43 -05:00
Joshua Peek
fd8b70ffa4 Rebuild classifier data 2012-06-08 15:49:35 -05:00
Joshua Peek
62498cf0e9 Merge branch 'master' into bayesian 2012-06-08 15:46:48 -05:00
Joshua Peek
543922c68a Rebuild classifier data 2012-06-08 14:48:04 -05:00
Joshua Peek
6f6dd8bc38 Improve tokenizing sgml tags 2012-06-08 14:46:16 -05:00
Joshua Peek
8351d55c56 Don't crash if classifier data is missing 2012-06-08 14:46:06 -05:00
Joshua Peek
9ecab364d1 Dump classifier results 2012-06-08 14:13:26 -05:00
Joshua Peek
0172623061 Add sample gathering class 2012-06-08 13:51:49 -05:00
Joshua Peek
e5ae9c328b Use language name as hash key 2012-06-08 13:43:57 -05:00
Joshua Peek
e0c777d995 Fix test name 2012-06-08 13:43:37 -05:00
Joshua Peek
f747b49347 Add simple classifier 2012-06-07 17:10:28 -05:00
Joshua Peek
e0cbe815a3 Add basic Tokenizer 2012-06-07 14:55:11 -05:00
14 changed files with 19725 additions and 57 deletions

View File

@@ -1,3 +1,4 @@
require 'rake/clean'
require 'rake/testtask'
task :default => :test
@@ -5,3 +6,63 @@ task :default => :test
# Standard unit-test task (`rake test`); warnings are enabled so the
# suite surfaces Ruby verbose-mode issues.
Rake::TestTask.new do |t|
t.warning = true
end
# Regenerate the persisted classifier db whenever any test fixture
# changes — the fixtures double as the classifier's training corpus.
file 'lib/linguist/classifier.yml' => Dir['test/fixtures/**/*'] do |f|
require 'linguist/sample'
# Train a fresh classifier from every sample fixture.
classifier = Linguist::Sample.classifier
# Serialize to the task's target path via YAML.
File.open(f.name, 'w') { |io| YAML.dump(classifier, io) }
end
# `rake clobber` removes the generated db.
CLOBBER.include 'lib/linguist/classifier.yml'
# `rake classifier` rebuilds the db if any fixture is newer than it.
task :classifier => ['lib/linguist/classifier.yml']
namespace :classifier do
# Number of successfully classified gist files to sample before stopping.
LIMIT = 1_000
desc "Run classifier against #{LIMIT} public gists"
task :test do
require 'linguist/classifier'
total, correct, incorrect = 0, 0, 0
# Unbuffered stdout so the in-place progress line updates immediately.
$stdout.sync = true
each_public_gist do |gist_url, file_url, file_language|
# Skip unlabeled files; 'Text' is too generic to score fairly.
next if file_language.nil? || file_language == 'Text'
begin
data = open(file_url).read
# classify returns results sorted best-first; take the top guess.
guessed_language, score = Linguist::Classifier.instance.classify(data).first
total += 1
guessed_language.name == file_language ? correct += 1 : incorrect += 1
# "\r\e[0K" rewinds and clears the line: a single-line progress meter.
print "\r\e[0K%d:%d %g%%" % [correct, incorrect, (correct.to_f/total.to_f)*100]
$stdout.flush
# Deliberately empty rescue: files with bad raw URLs are skipped.
rescue URI::InvalidURIError
# The else clause runs only when no exception was raised, so
# failed fetches never trigger the LIMIT cutoff.
else
break if total >= LIMIT
end
end
puts ""
end
# Internal: Yield (gist url, raw file url, language name) for every file
# of every public gist, following the API's Link-header pagination.
# Loops indefinitely; the caller is expected to break.
def each_public_gist
require 'open-uri'
require 'json'
url = "https://api.github.com/gists/public"
loop do
resp = open(url)
# Pull the rel="next" page URL out of the Link response header.
url = resp.meta['link'][/<([^>]+)>; rel="next"/, 1]
gists = JSON.parse(resp.read)
for gist in gists
for filename, attrs in gist['files']
yield gist['url'], attrs['raw_url'], attrs['language']
end
end
end
end
end

View File

@@ -1,6 +1,6 @@
Gem::Specification.new do |s|
s.name = 'github-linguist'
s.version = '1.0.0'
s.version = '2.0.0'
s.summary = "GitHub Language detection"
s.authors = "GitHub"
@@ -12,5 +12,6 @@ Gem::Specification.new do |s|
s.add_dependency 'escape_utils', '~> 0.2.3'
s.add_dependency 'mime-types', '~> 1.18'
s.add_dependency 'pygments.rb', '~> 0.2.13'
s.add_development_dependency 'json'
s.add_development_dependency 'rake'
end

View File

@@ -1,3 +1,4 @@
require 'linguist/classifier'
require 'linguist/language'
require 'linguist/mime'
require 'linguist/pathname'
@@ -453,8 +454,15 @@ module Linguist
# Returns a Language or nil.
def disambiguate_extension_language
if Language.ambiguous?(extname)
name = "guess_#{extname.sub(/^\./, '')}_language"
send(name) if respond_to?(name)
# name = "guess_#{extname.sub(/^\./, '')}_language"
# send(name) if respond_to?(name)
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
if possible_languages.any?
if result = Classifier.instance.classify(data, possible_languages).first
result[0]
end
end
end
end

183
lib/linguist/classifier.rb Normal file
View File

@@ -0,0 +1,183 @@
require 'linguist/language'
require 'linguist/tokenizer'
module Linguist
# Language bayesian classifier.
#
# Counts token occurrences per language from training samples, then
# scores unknown data with naive Bayes. Scores are sums of log
# probabilities (see #classify), which avoids float underflow on
# long token streams.
class Classifier
# Internal: Path to persisted classifier db.
PATH = File.expand_path('../classifier.yml', __FILE__)
# Public: Check if persisted db exists on disk.
#
# Returns Boolean.
def self.exist?
File.exist?(PATH)
end
# Public: Get persisted Classifier instance.
#
# Memoized; loads the YAML db from PATH on first access.
#
# Returns Classifier.
def self.instance
@instance ||= YAML.load_file(PATH)
end
# Public: Initialize a Classifier.
def initialize
# Total tokens seen across all languages.
@tokens_total = 0
# Total training samples seen across all languages.
@languages_total = 0
# language name => { token => count }
@tokens = Hash.new { |h, k| h[k] = Hash.new(0) }
# language name => total token count for that language
@language_tokens = Hash.new(0)
# language name => number of training samples
@languages = Hash.new(0)
end
# Public: Compare Classifier objects.
#
# other - Classifier object to compare to.
#
# NOTE: only the aggregate totals are compared (cheap staleness
# check), so two classifiers with equal totals but different token
# tables still compare equal.
#
# Returns Boolean.
def eql?(other)
# Lazy fast check counts only
other.is_a?(self.class) &&
@tokens_total == other.instance_variable_get(:@tokens_total) &&
@languages_total == other.instance_variable_get(:@languages_total)
end
alias_method :==, :eql?
# Public: Train classifier that data is a certain language.
#
# language - Language of data
# data - String contents of file
#
# Examples
#
# train(Language['Ruby'], "def hello; end")
#
# Returns nothing.
def train(language, data)
language = language.name
tokens = Tokenizer.new(data).tokens
tokens.each do |token|
@tokens[language][token] += 1
@language_tokens[language] += 1
@tokens_total += 1
end
@languages[language] += 1
@languages_total += 1
nil
end
# Public: Verify internal counts are consistent.
#
# The per-language sample counts, per-language token counts and the
# full token table must all agree with the cached totals.
#
# Returns Boolean.
def verify
@languages.inject(0) { |n, (l, c)| n += c } == @languages_total &&
@language_tokens.inject(0) { |n, (l, c)| n += c } == @tokens_total &&
@tokens.inject(0) { |n, (l, ts)| n += ts.inject(0) { |m, (t, c)| m += c } } == @tokens_total
end
# Public: Prune infrequent tokens.
#
# Currently a no-op placeholder: no pruning is performed.
#
# Returns receiver Classifier instance.
def gc
self
end
# Public: Guess language of data.
#
# data - Array of tokens or String data to analyze.
# languages - Array of Languages to restrict to.
#
# Examples
#
# classify("def hello; end")
# # => [ [Language['Ruby'], -43.2], [Language['Python'], -86.4], ... ]
#
# Returns sorted Array of result pairs. Each pair contains the
# Language and a Float score. The score is a sum of log
# probabilities, so it is typically negative; higher (closer to
# zero) ranks first.
def classify(tokens, languages = @languages.keys)
tokens = Tokenizer.new(tokens).tokens if tokens.is_a?(String)
scores = {}
languages.each do |language|
language_name = language.is_a?(Language) ? language.name : language
scores[language_name] = tokens_probability(tokens, language_name) +
language_probability(language_name)
end
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
end
# Internal: Probability of set of tokens in a language occurring - P(D | C)
#
# tokens - Array of String tokens.
# language - Language to check.
#
# Returns Float sum of log probabilities (<= 0.0), not a plain
# probability in [0, 1].
def tokens_probability(tokens, language)
tokens.inject(0.0) do |sum, token|
sum += Math.log(token_probability(token, language))
end
end
# Internal: Probability of token in language occurring - P(F | C)
#
# token - String token.
# language - Language to check.
#
# Tokens never seen in the language fall back to 1 / total token
# count — a smoothing floor so Math.log never receives zero.
#
# Returns Float between 0.0 and 1.0.
def token_probability(token, language)
if @tokens[language][token].to_f == 0.0
1 / @tokens_total.to_f
else
@tokens[language][token].to_f / @language_tokens[language].to_f
end
end
# Internal: Probability of a language occurring - P(C)
#
# language - Language to check.
#
# Returns Float log of the language's prior (<= 0.0).
def language_probability(language)
Math.log(@languages[language].to_f / @languages_total.to_f)
end
# Public: Serialize classifier to YAML.
#
# io - IO object to write the YAML document to.
#
# Hand-rolled emitter: every section is emitted with sorted keys so
# the persisted db is deterministic and diffs cleanly under version
# control.
#
# Returns nothing.
def to_yaml(io)
data = "--- !ruby/object:Linguist::Classifier\n"
data << "languages_total: #{@languages_total}\n"
data << "tokens_total: #{@tokens_total}\n"
data << "languages:\n"
@languages.sort.each do |language, count|
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
end
data << "language_tokens:\n"
@language_tokens.sort.each do |language, count|
data << " #{{language => count}.to_yaml.lines.to_a[1]}"
end
data << "tokens:\n"
@tokens.sort.each do |language, tokens|
data << " #{{language => true}.to_yaml.lines.to_a[1].sub(/ true/, "")}"
tokens.sort.each do |token, count|
data << " #{{token => count}.to_yaml.lines.to_a[1]}"
end
end
io.write data
nil
end
end
# Eager load instance
Classifier.instance if Classifier.exist?
end

19013
lib/linguist/classifier.yml Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -26,7 +26,7 @@ module Linguist
@overrides.include?(extension)
end
# Include?: Return overridden extensions.
# Internal: Return overridden extensions.
#
# Returns extensions Array.
def self.overridden_extensions

74
lib/linguist/sample.rb Normal file
View File

@@ -0,0 +1,74 @@
require 'linguist/classifier'
require 'linguist/language'
module Linguist
# Model for accessing classifier training data.
class Sample
# Samples live in test/ for now, we'll eventually move them out
PATH = File.expand_path("../../../test/fixtures", __FILE__)
# Public: Iterate over each Sample.
#
# Walks every fixture directory (one per language alias) under PATH
# and yields a Sample for each file inside.
#
# &block - Yields Sample to block
#
# Returns nothing.
# Raises RuntimeError if a fixture directory name has no matching
# Language alias.
def self.each(&block)
Dir.entries(PATH).each do |category|
next if category == '.' || category == '..'
# Skip text and binary for now
# Possibly reconsider this later
next if category == 'text' || category == 'binary'
# Map directory name to a Language alias
language = Linguist::Language.find_by_alias(category)
raise "No language for #{category.inspect}" unless language
dirname = File.join(PATH, category)
Dir.entries(dirname).each do |filename|
next if filename == '.' || filename == '..'
yield new(File.join(dirname, filename), language)
end
end
nil
end
# Public: Build Classifier from all samples.
#
# Trains a fresh Classifier on every sample, then calls gc
# (currently a no-op) before returning it.
#
# Returns trained Classifier.
def self.classifier
classifier = Classifier.new
each { |sample| classifier.train(sample.language, sample.data) }
classifier.gc
end
# Internal: Initialize Sample.
#
# Samples should be initialized by Sample.each.
#
# path - String full path to file.
# language - Language of sample.
def initialize(path, language)
@path = path
@language = language
end
# Public: Get full path to file.
#
# Returns String.
attr_reader :path
# Public: Get sample language.
#
# Returns Language.
attr_reader :language
# Public: Read file contents.
#
# Reads from disk on every call; nothing is cached.
#
# Returns String.
def data
File.read(path)
end
end
end

157
lib/linguist/tokenizer.rb Normal file
View File

@@ -0,0 +1,157 @@
# FIX: this file uses StringScanner but never loaded it, so requiring
# 'linguist/tokenizer' on its own raised NameError unless some other file
# had already pulled in strscan.
require 'strscan'

module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Initialize a Tokenizer.
    #
    # data - String data to scan.
    def initialize(data)
      @data = data
    end

    # Public: Get source data.
    #
    # Returns String.
    attr_reader :data

    # Public: Extract tokens from data.
    #
    # Returns Array of token Strings.
    def tokens
      extract_tokens(data)
    end

    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)
      tokens = []
      until s.eos?
        # Ruby single line comment — keep the marker, drop the text.
        if token = s.scan(/# /)
          tokens << "#"
          s.skip_until(/\n|\Z/)
        # C style single line comment
        elsif token = s.scan(/\/\/ /)
          tokens << "//"
          s.skip_until(/\n|\Z/)
        # Leading Tex or Matlab comments ("%" at start of a line)
        elsif token = s.scan(/\n%/)
          tokens << "%"
          s.skip_until(/\n|\Z/)
        # C multiline comments — keep both delimiters as tokens.
        elsif token = s.scan(/\/\*/)
          tokens << "/*"
          s.skip_until(/\*\//)
          tokens << "*/"
        # Haskell multiline comments
        elsif token = s.scan(/\{-/)
          tokens << "{-"
          s.skip_until(/-\}/)
          tokens << "-}"
        # XML multiline comments
        elsif token = s.scan(/<!--/)
          tokens << "<!--"
          s.skip_until(/-->/)
          tokens << "-->"
        # Skip single or double quoted strings; [^\\] lets the scan jump
        # over backslash-escaped quotes inside the literal.
        elsif s.scan(/"/)
          s.skip_until(/[^\\]"/)
        elsif s.scan(/'/)
          s.skip_until(/[^\\]'/)
        # Skip number literals (decimal and 0x hex)
        elsif s.scan(/(0x)?\d+/)
        # SGML style brackets — tokenized further by extract_sgml_tokens.
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }
        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)/)
          tokens << token
        # Regular token (identifiers, plus # @ . / * so things like
        # #include, @interface and paths survive as single tokens)
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token
        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token
        else
          # Anything unrecognized is consumed one char at a time.
          s.getch
        end
      end
      tokens
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)
      tokens = []
      until s.eos?
        # Emit start token — tag name normalized as "<name>" / "</name>".
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"
        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token
          # Then skip over attribute value (quoted or bare)
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end
        # Emit lone attributes (no value)
        elsif token = s.scan(/\w+/)
          tokens << token
        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate
        else
          s.getch
        end
      end
      tokens
    end
  end
end

9
test/fixtures/matlab/average.m vendored Normal file
View File

@@ -0,0 +1,9 @@
function y = average(x)
% AVERAGE Mean of vector elements.
% AVERAGE(X), where X is a vector, is the mean of vector
% elements. Nonvector input results in an error.
[m,n] = size(x);
if (~((m == 1) | (n == 1)) | (m == 1 & n == 1))
error('Input must be a vector')
end
y = sum(x)/length(x);

38
test/fixtures/matlab/make_filter.m vendored Normal file
View File

@@ -0,0 +1,38 @@
function [filtfcn, statefcn] = makeFilter(b, a)
% FILTFCN = MAKEFILTER(B, A) creates an IIR filtering
% function and returns it in the form of a function handle,
% FILTFCN. Each time you call FILTFCN with a new filter
% input value, it computes the corresponding new filter
% output value, updating its internal state vector at the
% same time.
%
% [FILTFCN, STATEFCN] = MAKEFILTER(B, A) also returns a
% function (in the form of a function handle, STATEFCN)
% that can return the filter's internal state. The internal
% state vector is in the form of a transposed direct form
% II delay line.
% Initialize state vector. To keep this example a bit
% simpler, assume that a and b have the same length.
% Also assume that a(1) is 1.
v = zeros(size(a));
filtfcn = @iirFilter;
statefcn = @getState;
function yn = iirFilter(xn)
% Update the state vector
v(1) = v(2) + b(1) * xn;
v(2:end-1) = v(3:end) + b(2:end-1) * xn - ...
a(2:end-1) * v(1);
v(end) = b(end) * xn - a(end) * v(1);
% Output is the first element of the state vector.
yn = v(1);
end
function vOut = getState
vOut = v;
end
end

View File

@@ -1,33 +0,0 @@
function ret = matlab_function2(A,B)
% Simple function that combines two values using function handles and displays
% the return value
% create function handles
fun1=@interface;
fun2=@implementation;
fun3=@property;
fun4=@synthesize;
% use function handles
ret = fun1(A)+fun2(A)+fun3(B)+fun4(B);
% Display the return value
disp('Return value in function');
disp(ret);
function A=interface(A)
% simple sub-function with same name Objective-C @keyword
A=2*A;
function A=implementation(A)
% simple sub-function with same name Objective-C @keyword
A=A^2;
function B=property(B)
% simple sub-function with same name Objective-C @keyword
B=2*B;
function B=synthesize(B)
% simple sub-function with same name Objective-C @keyword
B=B^2;

View File

@@ -1,4 +1,5 @@
require 'linguist/file_blob'
require 'linguist/sample'
require 'test/unit'
require 'mime/types'
@@ -24,23 +25,6 @@ class TestBlob < Test::Unit::TestCase
blob
end
def each_language_fixture
Dir["#{fixtures_path}/*"].each do |path|
name = File.basename(path)
if name == 'text' || name == 'binary'
next
else
assert language = Language.find_by_alias(name), "No language alias for #{name.inspect}"
end
Dir.entries(path).each do |filename|
next if filename == '.' || filename == '..'
yield language, blob(File.join(path, filename))
end
end
end
def test_name
assert_equal "foo.rb", blob("foo.rb").name
end
@@ -291,9 +275,9 @@ class TestBlob < Test::Unit::TestCase
end
def test_language
# Drop any files under test/fixtures/LANGUAGE
each_language_fixture do |language, blob|
assert_equal language, blob.language, blob.name
Sample.each do |sample|
blob = blob(sample.path)
assert_equal sample.language, blob.language, blob.name
end
end

82
test/test_classifier.rb Normal file
View File

@@ -0,0 +1,82 @@
require 'linguist/classifier'
require 'linguist/language'
require 'linguist/sample'
require 'linguist/tokenizer'
require 'test/unit'
# Unit tests for Linguist::Classifier: training, classification,
# restricted candidate sets, and consistency of the persisted db.
class TestClassifier < Test::Unit::TestCase
include Linguist
# Internal: Absolute path to test/fixtures.
def fixtures_path
File.expand_path("../fixtures", __FILE__)
end
# Internal: Read a fixture file relative to fixtures_path.
def fixture(name)
File.read(File.join(fixtures_path, name))
end
# A stale persisted db only warns; compare uses Classifier#eql?,
# which checks aggregate counts only.
def test_instance_freshness
# Just warn, it shouldn't scare people off by breaking the build.
unless Classifier.instance.eql?(Linguist::Sample.classifier)
warn "Classifier database is out of date. Run `bundle exec rake classifier`."
end
end
# Train on Ruby and Objective-C samples; an unseen Objective-C file
# should rank Objective-C first whether passed as a String or as a
# pre-tokenized Array.
def test_classify
classifier = Classifier.new
classifier.train Language["Ruby"], fixture("ruby/foo.rb")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
results = classifier.classify(fixture("objective-c/hello.m"))
assert_equal Language["Objective-C"], results.first[0]
tokens = Tokenizer.new(fixture("objective-c/hello.m")).tokens
results = classifier.classify(tokens)
assert_equal Language["Objective-C"], results.first[0]
end
# Restricting the candidate languages forces the winner to come from
# that set, even when it is the wrong language.
def test_restricted_classify
classifier = Classifier.new
classifier.train Language["Ruby"], fixture("ruby/foo.rb")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Objective-C"]])
assert_equal Language["Objective-C"], results.first[0]
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Ruby"]])
assert_equal Language["Ruby"], results.first[0]
end
def test_instance_classify_empty
results = Classifier.instance.classify("")
assert results.first[1] < 0.5, results.first.inspect
end
# The persisted db's internal counts must be self-consistent.
def test_verify
assert Classifier.instance.verify
end
# gc should run without raising (currently a no-op).
def test_gc
Classifier.instance.gc
end
# For every sample whose extension is shared by several languages,
# the persisted classifier should pick the sample's own language.
def test_classify_ambiguous_languages
Sample.each do |sample|
# TODO: These tests are pending
next if sample.path =~ /hello.h/
next if sample.path =~ /MainMenuViewController.h/
next unless sample.language.overrides.any?
extname = File.extname(sample.path)
languages = Language.all.select { |l| l.extensions.include?(extname) }
next unless languages.length > 1
results = Classifier.instance.classify(sample.data, languages)
assert_equal sample.language, results.first[0], "#{sample.path}\n#{results.inspect}"
end
end
end

91
test/test_tokenizer.rb Normal file
View File

@@ -0,0 +1,91 @@
require 'linguist/tokenizer'
require 'test/unit'
# Unit tests for Linguist::Tokenizer: literal/comment stripping, SGML
# tag handling, operator tokens, and whole-fixture token runs.
class TestTokenizer < Test::Unit::TestCase
include Linguist
# Internal: Absolute path to test/fixtures.
def fixtures_path
File.expand_path("../fixtures", __FILE__)
end
# Internal: Tokenize a raw String, or a fixture file when given a
# Symbol path such as :"c/hello.c".
def tokenize(data)
data = File.read(File.join(fixtures_path, data.to_s)) if data.is_a?(Symbol)
Tokenizer.new(data).tokens
end
# String literals (including escaped quotes) are dropped entirely.
def test_skip_string_literals
assert_equal %w(print), tokenize('print ""')
assert_equal %w(print), tokenize('print "Josh"')
assert_equal %w(print), tokenize("print 'Josh'")
assert_equal %w(print), tokenize('print "Hello \"Josh\""')
assert_equal %w(print), tokenize("print 'Hello \\'Josh\\''")
end
# Decimal and hex number literals are dropped.
def test_skip_number_literals
assert_equal %w(+), tokenize('1 + 1')
assert_equal %w(add \( \)), tokenize('add(123, 456)')
assert_equal %w(|), tokenize('0x01 | 0x10')
end
# Comment bodies are dropped but their delimiters remain as tokens.
def test_skip_comments
assert_equal %w(foo #), tokenize("foo # Comment")
assert_equal %w(foo # bar), tokenize("foo # Comment\nbar")
assert_equal %w(foo //), tokenize("foo // Comment")
assert_equal %w(foo /* */), tokenize("foo /* Comment */")
assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")
assert_equal %w(foo {- -}), tokenize("foo {- Comment -}")
assert_equal %w(% %), tokenize("2 % 10\n% Comment")
end
# Tags normalize to "<name>"; attributes keep a trailing "=" with
# their values dropped.
def test_sgml_tags
assert_equal %w(<html> </html>), tokenize("<html></html>")
assert_equal %w(<div> id </div>), tokenize("<div id></div>")
assert_equal %w(<div> id= </div>), tokenize("<div id=foo></div>")
assert_equal %w(<div> id class </div>), tokenize("<div id class></div>")
assert_equal %w(<div> id= </div>), tokenize("<div id=\"foo bar\"></div>")
assert_equal %w(<div> id= </div>), tokenize("<div id='foo bar'></div>")
assert_equal %w(<?xml> version=), tokenize("<?xml version=\"1.0\"?>")
end
# Operators survive as tokens while their numeric operands are dropped.
def test_operators
assert_equal %w(+), tokenize("1 + 1")
assert_equal %w(-), tokenize("1 - 1")
assert_equal %w(*), tokenize("1 * 1")
assert_equal %w(/), tokenize("1 / 1")
assert_equal %w(%), tokenize("2 % 5")
assert_equal %w(&), tokenize("1 & 1")
assert_equal %w(&&), tokenize("1 && 1")
assert_equal %w(|), tokenize("1 | 1")
assert_equal %w(||), tokenize("1 || 1")
assert_equal %w(<), tokenize("1 < 0x01")
assert_equal %w(<<), tokenize("1 << 0x01")
end
def test_c_tokens
assert_equal %w(#ifndef HELLO_H #define HELLO_H void hello \( \) ; #endif), tokenize(:"c/hello.h")
assert_equal %w(#include <stdio.h> int main \( \) { printf \( \) ; return ; }), tokenize(:"c/hello.c")
end
def test_cpp_tokens
assert_equal %w(class Bar { protected char *name ; public void hello \( \) ; }), tokenize(:"cpp/bar.h")
assert_equal %w(#include <iostream> using namespace std ; int main \( \) { cout << << endl ; }), tokenize(:"cpp/hello.cpp")
end
def test_objective_c_tokens
assert_equal %w(#import <Foundation/Foundation.h> @interface Foo NSObject { } @end), tokenize(:"objective-c/Foo.h")
assert_equal %w(#import @implementation Foo @end), tokenize(:"objective-c/Foo.m")
assert_equal %w(#import <Cocoa/Cocoa.h> int main \( int argc char *argv \) { NSLog \( @ \) ; return ; }), tokenize(:"objective-c/hello.m")
end
def test_javascript_tokens
assert_equal %w( \( function \( \) { console.log \( \) ; } \) .call \( this \) ;), tokenize(:"javascript/hello.js")
end
def test_ruby_tokens
assert_equal %w(module Foo end), tokenize(:"ruby/foo.rb")
assert_equal %w(# /usr/bin/env ruby puts), tokenize(:"ruby/script.rb")
assert_equal %w(task default do puts end), tokenize(:"ruby/Rakefile")
end
end