Change Classifier to accept language name Strings

This commit is contained in:
Joshua Peek
2012-07-20 15:52:27 -05:00
parent bc84a98b54
commit 7292bdc180
3 changed files with 25 additions and 28 deletions

View File

@@ -439,10 +439,10 @@ module Linguist
# Returns a Language or nil. # Returns a Language or nil.
def disambiguate_extension_language def disambiguate_extension_language
if Language.ambiguous?(extname) if Language.ambiguous?(extname)
possible_languages = Language.all.select { |l| l.extensions.include?(extname) } possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
if possible_languages.any? if possible_languages.any?
if result = Classifier.instance.classify(data, possible_languages).first if result = Classifier.instance.classify(data, possible_languages).first
result[0] Language[result[0]]
end end
end end
end end

View File

@@ -1,4 +1,3 @@
require 'linguist/language'
require 'linguist/tokenizer' require 'linguist/tokenizer'
module Linguist module Linguist
@@ -45,17 +44,16 @@ module Linguist
# Public: Train classifier that data is a certain language. # Public: Train classifier that data is a certain language.
# #
# language - Language of data # language - String language of data
# data - String contents of file # data - String contents of file
# #
# Examples # Examples
# #
# train(Language['Ruby'], "def hello; end") # train('Ruby', "def hello; end")
# #
# Returns nothing. # Returns nothing.
def train(language, data) def train(language, data)
language = language.name tokens = Tokenizer.tokenize(data)
tokens = Tokenizer.tokenize(data)
tokens.each do |token| tokens.each do |token|
@tokens[language][token] += 1 @tokens[language][token] += 1
@@ -87,27 +85,26 @@ module Linguist
# Public: Guess language of data. # Public: Guess language of data.
# #
# data - Array of tokens or String data to analyze. # data - Array of tokens or String data to analyze.
# languages - Array of Languages to restrict to. # languages - Array of language name Strings to restrict to.
# #
# Examples # Examples
# #
# classify("def hello; end") # classify("def hello; end")
# # => [ [Language['Ruby'], 0.90], [Language['Python'], 0.2], ... ] # # => [ ['Ruby', 0.90], ['Python', 0.2], ... ]
# #
# Returns sorted Array of result pairs. Each pair contains the # Returns sorted Array of result pairs. Each pair contains the
# Language and a Float score. # String language name and a Float score.
def classify(tokens, languages = @languages.keys) def classify(tokens, languages = @languages.keys)
return [] if tokens.nil? return [] if tokens.nil?
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String) tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
scores = {} scores = {}
languages.each do |language| languages.each do |language|
language_name = language.is_a?(Language) ? language.name : language scores[language] = tokens_probability(tokens, language) +
scores[language_name] = tokens_probability(tokens, language_name) + language_probability(language)
language_probability(language_name)
end end
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] } scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
end end
# Internal: Probability of a set of tokens in a language occurring - P(D | C) # Internal: Probability of a set of tokens in a language occurring - P(D | C)

View File

@@ -25,29 +25,29 @@ class TestClassifier < Test::Unit::TestCase
def test_classify def test_classify
classifier = Classifier.new classifier = Classifier.new
classifier.train Language["Ruby"], fixture("ruby/foo.rb") classifier.train "Ruby", fixture("ruby/foo.rb")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h") classifier.train "Objective-C", fixture("objective-c/Foo.h")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m") classifier.train "Objective-C", fixture("objective-c/Foo.m")
results = classifier.classify(fixture("objective-c/hello.m")) results = classifier.classify(fixture("objective-c/hello.m"))
assert_equal Language["Objective-C"], results.first[0] assert_equal "Objective-C", results.first[0]
tokens = Tokenizer.tokenize(fixture("objective-c/hello.m")) tokens = Tokenizer.tokenize(fixture("objective-c/hello.m"))
results = classifier.classify(tokens) results = classifier.classify(tokens)
assert_equal Language["Objective-C"], results.first[0] assert_equal "Objective-C", results.first[0]
end end
def test_restricted_classify def test_restricted_classify
classifier = Classifier.new classifier = Classifier.new
classifier.train Language["Ruby"], fixture("ruby/foo.rb") classifier.train "Ruby", fixture("ruby/foo.rb")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h") classifier.train "Objective-C", fixture("objective-c/Foo.h")
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m") classifier.train "Objective-C", fixture("objective-c/Foo.m")
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Objective-C"]]) results = classifier.classify(fixture("objective-c/hello.m"), ["Objective-C"])
assert_equal Language["Objective-C"], results.first[0] assert_equal "Objective-C", results.first[0]
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Ruby"]]) results = classifier.classify(fixture("objective-c/hello.m"), ["Ruby"])
assert_equal Language["Ruby"], results.first[0] assert_equal "Ruby", results.first[0]
end end
def test_instance_classify_empty def test_instance_classify_empty
@@ -72,11 +72,11 @@ class TestClassifier < Test::Unit::TestCase
next unless sample.language.overrides.any? next unless sample.language.overrides.any?
extname = File.extname(sample.path) extname = File.extname(sample.path)
languages = Language.all.select { |l| l.extensions.include?(extname) } languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
next unless languages.length > 1 next unless languages.length > 1
results = Classifier.instance.classify(sample.data, languages) results = Classifier.instance.classify(sample.data, languages)
assert_equal sample.language, results.first[0], "#{sample.path}\n#{results.inspect}" assert_equal sample.language.name, results.first[0], "#{sample.path}\n#{results.inspect}"
end end
end end
end end