mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-12-29 21:31:01 +00:00
Use language name as hash key
This commit is contained in:
@@ -4,14 +4,15 @@ module Linguist
|
|||||||
# Language bayesian classifier.
|
# Language bayesian classifier.
|
||||||
class Classifier
|
class Classifier
|
||||||
def initialize
|
def initialize
|
||||||
|
@tokens_total = 0
|
||||||
|
@languages_total = 0
|
||||||
@tokens = Hash.new { |h, k| h[k] = Hash.new(0) }
|
@tokens = Hash.new { |h, k| h[k] = Hash.new(0) }
|
||||||
@language_tokens = Hash.new(0)
|
@language_tokens = Hash.new(0)
|
||||||
@languages = Hash.new(0)
|
@languages = Hash.new(0)
|
||||||
@languages_total = 0
|
|
||||||
@tokens_total = 0
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def train(language, data)
|
def train(language, data)
|
||||||
|
language = language.name
|
||||||
tokens = Tokenizer.new(data).tokens
|
tokens = Tokenizer.new(data).tokens
|
||||||
|
|
||||||
tokens.each do |token|
|
tokens.each do |token|
|
||||||
@@ -31,7 +32,7 @@ module Linguist
|
|||||||
scores[language] = tokens_probability(tokens, language) * language_probability(language)
|
scores[language] = tokens_probability(tokens, language) * language_probability(language)
|
||||||
end
|
end
|
||||||
|
|
||||||
scores.sort { |a, b| b[1] <=> a[1] }
|
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
|
||||||
end
|
end
|
||||||
|
|
||||||
def tokens_probability(tokens, language)
|
def tokens_probability(tokens, language)
|
||||||
|
|||||||
Reference in New Issue
Block a user