mirror of
https://github.com/KevinMidboe/linguist.git
synced 2026-02-15 12:49:30 +00:00
Switch to log probabilities to avoid float underflows
This commit is contained in:
@@ -103,8 +103,8 @@ module Linguist
|
|||||||
scores = {}
|
scores = {}
|
||||||
languages.each do |language|
|
languages.each do |language|
|
||||||
language_name = language.is_a?(Language) ? language.name : language
|
language_name = language.is_a?(Language) ? language.name : language
|
||||||
scores[language_name] = tokens_probability(tokens, language_name) *
|
scores[language_name] = tokens_probability(tokens, language_name) +
|
||||||
language_probability(language_name)
|
language_probability(language_name)
|
||||||
end
|
end
|
||||||
|
|
||||||
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
|
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [Language[score[0]], score[1]] }
|
||||||
@@ -117,8 +117,8 @@ module Linguist
|
|||||||
#
|
#
|
||||||
# Returns Float between 0.0 and 1.0.
|
# Returns the Float sum of the natural logs of each token's probability.
|
||||||
def tokens_probability(tokens, language)
|
def tokens_probability(tokens, language)
|
||||||
tokens.inject(1.0) do |sum, token|
|
tokens.inject(0.0) do |sum, token|
|
||||||
sum *= token_probability(token, language)
|
sum += Math.log(token_probability(token, language))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -142,7 +142,7 @@ module Linguist
|
|||||||
#
|
#
|
||||||
# Returns Float between 0.0 and 1.0.
|
# Returns the Float natural log of the language's share of the languages total.
|
||||||
def language_probability(language)
|
def language_probability(language)
|
||||||
@languages[language].to_f / @languages_total.to_f
|
Math.log(@languages[language].to_f / @languages_total.to_f)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user