From 1831390429649489c06cfb8f24072e918fe18653 Mon Sep 17 00:00:00 2001 From: Arfon Smith Date: Thu, 6 Nov 2014 14:09:19 -0600 Subject: [PATCH] Use heuristics earlier to inform the rest of the classification process --- lib/linguist/language.rb | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 49c4a6be..3ad318fb 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -125,6 +125,11 @@ module Linguist if possible_languages.length > 1 data = blob.data possible_language_names = possible_languages.map(&:name) + heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names) + + if heuristic_languages.size > 1 + possible_language_names = heuristic_languages.map(&:name) + end # Don't bother with binary contents or an empty file if data.nil? || data == "" @@ -133,8 +138,8 @@ module Linguist elsif (result = find_by_shebang(data)) && !result.empty? result.first # No shebang. Still more work to do. Try to find it with our heuristics. - elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty? - determined.first + elsif heuristic_languages.size == 1 + heuristic_languages.first # Lastly, fall back to the probabilistic classifier. elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first # Return the actual Language object based of the string language name (i.e., first element of `#classify`)