Switch to log probabilities to avoid float underflows

This commit is contained in:
Joshua Peek
2012-06-19 16:33:29 -05:00
parent c114d710f8
commit 4484011f08

View File

@@ -103,8 +103,8 @@ module Linguist
     scores = {}
     languages.each do |language|
       language_name = language.is_a?(Language) ? language.name : language
-      scores[language_name] = tokens_probability(tokens, language_name) *
+      scores[language_name] = tokens_probability(tokens, language_name) +
                                    language_probability(language_name)
     end
     scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
@@ -117,8 +117,8 @@ module Linguist
   #
   # Returns Float between 0.0 and 1.0.
   def tokens_probability(tokens, language)
-    tokens.inject(1.0) do |sum, token|
-      sum *= token_probability(token, language)
+    tokens.inject(0.0) do |sum, token|
+      sum += Math.log(token_probability(token, language))
     end
   end
@@ -142,7 +142,7 @@ module Linguist
   #
   # Returns Float between 0.0 and 1.0.
   def language_probability(language)
-    @languages[language].to_f / @languages_total.to_f
+    Math.log(@languages[language].to_f / @languages_total.to_f)
   end
 end