diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb index f9ba6e8a..a9707b28 100644 --- a/lib/linguist/classifier.rb +++ b/lib/linguist/classifier.rb @@ -147,16 +147,20 @@ module Linguist # indicate the least-likely language (and zero points) for each token. def dump_all_tokens(tokens, languages) maxlen = tokens.map { |tok| tok.size }.max + printf "%#{maxlen}s", "" puts " #" + languages.map { |lang| sprintf("%10s", lang) }.join + tokmap = Hash.new(0) tokens.each { |tok| tokmap[tok] += 1 } + tokmap.sort.each { |tok, count| arr = languages.map { |lang| [lang, token_probability(tok, lang)] } min = arr.map { |a,b| b }.min minlog = Math.log(min) if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] } printf "%#{maxlen}s%5d", tok, count + puts arr.map { |ent| ent[1] == min ? " -" : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog)) }.join