mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 17:50:22 +00:00
Change Classifier to accept language name Strings
This commit is contained in:
@@ -439,10 +439,10 @@ module Linguist
|
|||||||
# Returns a Language or nil.
|
# Returns a Language or nil.
|
||||||
def disambiguate_extension_language
|
def disambiguate_extension_language
|
||||||
if Language.ambiguous?(extname)
|
if Language.ambiguous?(extname)
|
||||||
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }
|
possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
|
||||||
if possible_languages.any?
|
if possible_languages.any?
|
||||||
if result = Classifier.instance.classify(data, possible_languages).first
|
if result = Classifier.instance.classify(data, possible_languages).first
|
||||||
result[0]
|
Language[result[0]]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
require 'linguist/language'
|
|
||||||
require 'linguist/tokenizer'
|
require 'linguist/tokenizer'
|
||||||
|
|
||||||
module Linguist
|
module Linguist
|
||||||
@@ -45,17 +44,16 @@ module Linguist
|
|||||||
|
|
||||||
# Public: Train classifier that data is a certain language.
|
# Public: Train classifier that data is a certain language.
|
||||||
#
|
#
|
||||||
# language - Language of data
|
# language - String language of data
|
||||||
# data - String contents of file
|
# data - String contents of file
|
||||||
#
|
#
|
||||||
# Examples
|
# Examples
|
||||||
#
|
#
|
||||||
# train(Language['Ruby'], "def hello; end")
|
# train('Ruby', "def hello; end")
|
||||||
#
|
#
|
||||||
# Returns nothing.
|
# Returns nothing.
|
||||||
def train(language, data)
|
def train(language, data)
|
||||||
language = language.name
|
tokens = Tokenizer.tokenize(data)
|
||||||
tokens = Tokenizer.tokenize(data)
|
|
||||||
|
|
||||||
tokens.each do |token|
|
tokens.each do |token|
|
||||||
@tokens[language][token] += 1
|
@tokens[language][token] += 1
|
||||||
@@ -87,27 +85,26 @@ module Linguist
|
|||||||
# Public: Guess language of data.
|
# Public: Guess language of data.
|
||||||
#
|
#
|
||||||
# data - Array of tokens or String data to analyze.
|
# data - Array of tokens or String data to analyze.
|
||||||
# languages - Array of Languages to restrict to.
|
# languages - Array of language name Strings to restrict to.
|
||||||
#
|
#
|
||||||
# Examples
|
# Examples
|
||||||
#
|
#
|
||||||
# classify("def hello; end")
|
# classify("def hello; end")
|
||||||
# # => [ [Language['Ruby'], 0.90], [Language['Python'], 0.2], ... ]
|
# # => [ ['Ruby', 0.90], ['Python', 0.2], ... ]
|
||||||
#
|
#
|
||||||
# Returns sorted Array of result pairs. Each pair contains the
|
# Returns sorted Array of result pairs. Each pair contains the
|
||||||
# Language and a Float score.
|
# String language name and a Float score.
|
||||||
def classify(tokens, languages = @languages.keys)
|
def classify(tokens, languages = @languages.keys)
|
||||||
return [] if tokens.nil?
|
return [] if tokens.nil?
|
||||||
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
|
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
|
||||||
|
|
||||||
scores = {}
|
scores = {}
|
||||||
languages.each do |language|
|
languages.each do |language|
|
||||||
language_name = language.is_a?(Language) ? language.name : language
|
scores[language] = tokens_probability(tokens, language) +
|
||||||
scores[language_name] = tokens_probability(tokens, language_name) +
|
language_probability(language)
|
||||||
language_probability(language_name)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
|
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
|
||||||
end
|
end
|
||||||
|
|
||||||
# Internal: Probability of set of tokens in a language occurring - P(D | C)
|
# Internal: Probability of set of tokens in a language occurring - P(D | C)
|
||||||
|
|||||||
@@ -25,29 +25,29 @@ class TestClassifier < Test::Unit::TestCase
|
|||||||
|
|
||||||
def test_classify
|
def test_classify
|
||||||
classifier = Classifier.new
|
classifier = Classifier.new
|
||||||
classifier.train Language["Ruby"], fixture("ruby/foo.rb")
|
classifier.train "Ruby", fixture("ruby/foo.rb")
|
||||||
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
|
classifier.train "Objective-C", fixture("objective-c/Foo.h")
|
||||||
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
|
classifier.train "Objective-C", fixture("objective-c/Foo.m")
|
||||||
|
|
||||||
results = classifier.classify(fixture("objective-c/hello.m"))
|
results = classifier.classify(fixture("objective-c/hello.m"))
|
||||||
assert_equal Language["Objective-C"], results.first[0]
|
assert_equal "Objective-C", results.first[0]
|
||||||
|
|
||||||
tokens = Tokenizer.tokenize(fixture("objective-c/hello.m"))
|
tokens = Tokenizer.tokenize(fixture("objective-c/hello.m"))
|
||||||
results = classifier.classify(tokens)
|
results = classifier.classify(tokens)
|
||||||
assert_equal Language["Objective-C"], results.first[0]
|
assert_equal "Objective-C", results.first[0]
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_restricted_classify
|
def test_restricted_classify
|
||||||
classifier = Classifier.new
|
classifier = Classifier.new
|
||||||
classifier.train Language["Ruby"], fixture("ruby/foo.rb")
|
classifier.train "Ruby", fixture("ruby/foo.rb")
|
||||||
classifier.train Language["Objective-C"], fixture("objective-c/Foo.h")
|
classifier.train "Objective-C", fixture("objective-c/Foo.h")
|
||||||
classifier.train Language["Objective-C"], fixture("objective-c/Foo.m")
|
classifier.train "Objective-C", fixture("objective-c/Foo.m")
|
||||||
|
|
||||||
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Objective-C"]])
|
results = classifier.classify(fixture("objective-c/hello.m"), ["Objective-C"])
|
||||||
assert_equal Language["Objective-C"], results.first[0]
|
assert_equal "Objective-C", results.first[0]
|
||||||
|
|
||||||
results = classifier.classify(fixture("objective-c/hello.m"), [Language["Ruby"]])
|
results = classifier.classify(fixture("objective-c/hello.m"), ["Ruby"])
|
||||||
assert_equal Language["Ruby"], results.first[0]
|
assert_equal "Ruby", results.first[0]
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_instance_classify_empty
|
def test_instance_classify_empty
|
||||||
@@ -72,11 +72,11 @@ class TestClassifier < Test::Unit::TestCase
|
|||||||
next unless sample.language.overrides.any?
|
next unless sample.language.overrides.any?
|
||||||
|
|
||||||
extname = File.extname(sample.path)
|
extname = File.extname(sample.path)
|
||||||
languages = Language.all.select { |l| l.extensions.include?(extname) }
|
languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
|
||||||
next unless languages.length > 1
|
next unless languages.length > 1
|
||||||
|
|
||||||
results = Classifier.instance.classify(sample.data, languages)
|
results = Classifier.instance.classify(sample.data, languages)
|
||||||
assert_equal sample.language, results.first[0], "#{sample.path}\n#{results.inspect}"
|
assert_equal sample.language.name, results.first[0], "#{sample.path}\n#{results.inspect}"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
Reference in New Issue
Block a user