From 7292bdc18078784849526a5f8620ddb80ec4d8f1 Mon Sep 17 00:00:00 2001 From: Joshua Peek Date: Fri, 20 Jul 2012 15:52:27 -0500 Subject: [PATCH] Change Classifier to accept language name Strings --- lib/linguist/blob_helper.rb | 4 ++-- lib/linguist/classifier.rb | 21 +++++++++------------ test/test_classifier.rb | 28 ++++++++++++++-------------- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index cb5cddfc..1ea5b44d 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -439,10 +439,10 @@ module Linguist # Returns a Language or nil. def disambiguate_extension_language if Language.ambiguous?(extname) - possible_languages = Language.all.select { |l| l.extensions.include?(extname) } + possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name) if possible_languages.any? if result = Classifier.instance.classify(data, possible_languages).first - result[0] + Language[result[0]] end end end diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb index f09753f6..03958301 100644 --- a/lib/linguist/classifier.rb +++ b/lib/linguist/classifier.rb @@ -1,4 +1,3 @@ -require 'linguist/language' require 'linguist/tokenizer' module Linguist @@ -45,17 +44,16 @@ module Linguist # Public: Train classifier that data is a certain language. # - # language - Language of data + # language - String language of data # data - String contents of file # # Examples # - # train(Language['Ruby'], "def hello; end") + # train('Ruby', "def hello; end") # # Returns nothing. def train(language, data) - language = language.name - tokens = Tokenizer.tokenize(data) + tokens = Tokenizer.tokenize(data) tokens.each do |token| @tokens[language][token] += 1 @@ -87,27 +85,26 @@ module Linguist # Public: Guess language of data. # # data - Array of tokens or String data to analyze. - # languages - Array of Languages to restrict to. 
+ # languages - Array of language name Strings to restrict to. # # Examples # # classify("def hello; end") - # # => [ [Language['Ruby'], 0.90], [Language['Python'], 0.2], ... ] + # # => [ ['Ruby', 0.90], ['Python', 0.2], ... ] # # Returns sorted Array of result pairs. Each pair contains the - # Language and a Float score. + # String language name and a Float score. def classify(tokens, languages = @languages.keys) return [] if tokens.nil? tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String) scores = {} languages.each do |language| - language_name = language.is_a?(Language) ? language.name : language - scores[language_name] = tokens_probability(tokens, language_name) + - language_probability(language_name) + scores[language] = tokens_probability(tokens, language) + + language_probability(language) end - scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] } + scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] } end # Internal: Probably of set of tokens in a language occuring - P(D | C) diff --git a/test/test_classifier.rb b/test/test_classifier.rb index 9a72c9b2..33de385a 100644 --- a/test/test_classifier.rb +++ b/test/test_classifier.rb @@ -25,29 +25,29 @@ class TestClassifier < Test::Unit::TestCase def test_classify classifier = Classifier.new - classifier.train Language["Ruby"], fixture("ruby/foo.rb") - classifier.train Language["Objective-C"], fixture("objective-c/Foo.h") - classifier.train Language["Objective-C"], fixture("objective-c/Foo.m") + classifier.train "Ruby", fixture("ruby/foo.rb") + classifier.train "Objective-C", fixture("objective-c/Foo.h") + classifier.train "Objective-C", fixture("objective-c/Foo.m") results = classifier.classify(fixture("objective-c/hello.m")) - assert_equal Language["Objective-C"], results.first[0] + assert_equal "Objective-C", results.first[0] tokens = Tokenizer.tokenize(fixture("objective-c/hello.m")) results = classifier.classify(tokens) - assert_equal 
Language["Objective-C"], results.first[0] + assert_equal "Objective-C", results.first[0] end def test_restricted_classify classifier = Classifier.new - classifier.train Language["Ruby"], fixture("ruby/foo.rb") - classifier.train Language["Objective-C"], fixture("objective-c/Foo.h") - classifier.train Language["Objective-C"], fixture("objective-c/Foo.m") + classifier.train "Ruby", fixture("ruby/foo.rb") + classifier.train "Objective-C", fixture("objective-c/Foo.h") + classifier.train "Objective-C", fixture("objective-c/Foo.m") - results = classifier.classify(fixture("objective-c/hello.m"), [Language["Objective-C"]]) - assert_equal Language["Objective-C"], results.first[0] + results = classifier.classify(fixture("objective-c/hello.m"), ["Objective-C"]) + assert_equal "Objective-C", results.first[0] - results = classifier.classify(fixture("objective-c/hello.m"), [Language["Ruby"]]) - assert_equal Language["Ruby"], results.first[0] + results = classifier.classify(fixture("objective-c/hello.m"), ["Ruby"]) + assert_equal "Ruby", results.first[0] end def test_instance_classify_empty @@ -72,11 +72,11 @@ class TestClassifier < Test::Unit::TestCase next unless sample.language.overrides.any? extname = File.extname(sample.path) - languages = Language.all.select { |l| l.extensions.include?(extname) } + languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name) next unless languages.length > 1 results = Classifier.instance.classify(sample.data, languages) - assert_equal sample.language, results.first[0], "#{sample.path}\n#{results.inspect}" + assert_equal sample.language.name, results.first[0], "#{sample.path}\n#{results.inspect}" end end end