From fd7633518f1aff15eee5338fd6715fa2717341fe Mon Sep 17 00:00:00 2001 From: Charlie Somerville Date: Wed, 25 Feb 2015 12:34:07 +1100 Subject: [PATCH 1/9] add instrumentation to detection and classification --- lib/linguist.rb | 12 ++++++++++++ lib/linguist/classifier.rb | 8 +++++--- lib/linguist/language.rb | 28 +++++++++++++++------------- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/lib/linguist.rb b/lib/linguist.rb index ff9fc3a2..4419ff5b 100644 --- a/lib/linguist.rb +++ b/lib/linguist.rb @@ -6,3 +6,15 @@ require 'linguist/repository' require 'linguist/samples' require 'linguist/shebang' require 'linguist/version' + +class << Linguist + attr_accessor :instrumenter + + def instrument(*args, &bk) + if instrumenter + instrumenter.instrument(*args, &bk) + else + yield + end + end +end diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb index 89a0df2f..208467e4 100644 --- a/lib/linguist/classifier.rb +++ b/lib/linguist/classifier.rb @@ -16,9 +16,11 @@ module Linguist # # Returns an Array of Language objects, most probable first. def self.call(blob, possible_languages) - language_names = possible_languages.map(&:name) - classify(Samples.cache, blob.data, language_names).map do |name, _| - Language[name] # Return the actual Language objects + Linguist.instrument("linguist.bayesian_classification") do + language_names = possible_languages.map(&:name) + classify(Samples.cache, blob.data, language_names).map do |name, _| + Language[name] # Return the actual Language objects + end end end diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 2490a9f6..68b4c4fc 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -105,19 +105,21 @@ module Linguist # Bail early if the blob is binary or empty. return nil if blob.likely_binary? || blob.binary? || blob.empty? - # Call each strategy until one candidate is returned. - STRATEGIES.reduce([]) do |languages, strategy| - candidates = strategy.call(blob, languages) - if candidates.size == 1 - return candidates.first - elsif candidates.size > 1 - # More than one candidate was found, pass them to the next strategy. - candidates - else - # No candiates were found, pass on languages from the previous strategy. - languages - end - end.first + Linguist.instrument("linguist.detection") do + # Call each strategy until one candidate is returned. + STRATEGIES.reduce([]) do |languages, strategy| + candidates = strategy.call(blob, languages) + if candidates.size == 1 + return candidates.first + elsif candidates.size > 1 + # More than one candidate was found, pass them to the next strategy. + candidates + else + # No candiates were found, pass on languages from the previous strategy. + languages + end + end.first + end end # Public: Get all Languages From 9a86b9ad753c6073825dff543cdf38496243a59b Mon Sep 17 00:00:00 2001 From: Arfon Smith Date: Thu, 26 Feb 2015 15:27:33 -0600 Subject: [PATCH 2/9] Instrument all calls and pass the blob, strategy and language candidates in the payload. --- lib/linguist/classifier.rb | 8 +++----- lib/linguist/language.rb | 23 ++++++++++++++++------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb index 208467e4..89a0df2f 100644 --- a/lib/linguist/classifier.rb +++ b/lib/linguist/classifier.rb @@ -16,11 +16,9 @@ module Linguist # # Returns an Array of Language objects, most probable first. def self.call(blob, possible_languages) - Linguist.instrument("linguist.bayesian_classification") do - language_names = possible_languages.map(&:name) - classify(Samples.cache, blob.data, language_names).map do |name, _| - Language[name] # Return the actual Language objects - end + language_names = possible_languages.map(&:name) + classify(Samples.cache, blob.data, language_names).map do |name, _| + Language[name] # Return the actual Language objects end end diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 68b4c4fc..bf0dfc33 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -107,18 +107,27 @@ module Linguist Linguist.instrument("linguist.detection") do # Call each strategy until one candidate is returned. - STRATEGIES.reduce([]) do |languages, strategy| - candidates = strategy.call(blob, languages) + languages = [] + strategy = nil + + STRATEGIES.each do |strategy| + candidates = Linguist.instrument("linguist.strategy", :blob => blob, :strategy => strategy, :candidates => languages) do + strategy.call(blob, languages) + end if candidates.size == 1 - return candidates.first + languages = candidates + break elsif candidates.size > 1 # More than one candidate was found, pass them to the next strategy. - candidates + languages = candidates else - # No candiates were found, pass on languages from the previous strategy. - languages + # No candidates, try the next strategy end - end.first + end + + Linguist.instrument("linguist.detected", :blob => blob, :strategy => strategy, :language => languages.first) do + languages.first + end end end From c22a6563d953b04e3ae591ad027038aa486c045d Mon Sep 17 00:00:00 2001 From: Arfon Smith Date: Sun, 1 Mar 2015 22:13:26 -0600 Subject: [PATCH 3/9] Writing some (failing) tests for instrumentation --- test/test_instrumentation.rb | 43 ++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 test/test_instrumentation.rb diff --git a/test/test_instrumentation.rb b/test/test_instrumentation.rb new file mode 100644 index 00000000..9078680c --- /dev/null +++ b/test/test_instrumentation.rb @@ -0,0 +1,43 @@ +require_relative "./helper" + +class TestInstrumentation < Minitest::Test + include Linguist + + class LocalInstrumenter + Event = Struct.new(:name, :args) + + attr_reader :events + + def initialize + @events = [] + end + + def instrument(name, *args) + @events << Event.new(name, args) + end + end + + def setup + Linguist.instrumenter = LocalInstrumenter.new + end + + def teardown + Linguist.instrumenter = nil + end + + def test_detection_instrumentation_with_binary_blob + binary_blob = fixture_blob("Binary/octocat.ai") + Language.detect(binary_blob) + + # Shouldn't instrument this (as it's binary) + assert_equal 0, Linguist.instrumenter.events.size + end + + def test_modeline_instrumentation + blob = fixture_blob("Data/Modelines/ruby") + Language.detect(blob) + + assert_equal 3, Linguist.instrumenter.events.size + assert_equal "linguist.detected", Linguist.instrumenter.events.last.name + end +end From 2d5476f6c844fcae21858c39c8a120e2d3fdf632 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 5 Mar 2015 10:01:28 -0800 Subject: [PATCH 4/9] Yield the block in LocalInstrumenter --- test/test_instrumentation.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_instrumentation.rb b/test/test_instrumentation.rb index 9078680c..b9e5c3e9 100644 --- a/test/test_instrumentation.rb +++ b/test/test_instrumentation.rb @@ -14,6 +14,7 @@ class TestInstrumentation < Minitest::Test def instrument(name, *args) @events << Event.new(name, args) + yield end end From e8326529b5121f80c80b58216c554f5de4982cd6 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 5 Mar 2015 10:03:01 -0800 Subject: [PATCH 5/9] Pass blob to instrumentation --- lib/linguist/language.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index bf0dfc33..e0e35602 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -105,7 +105,7 @@ module Linguist # Bail early if the blob is binary or empty. return nil if blob.likely_binary? || blob.binary? || blob.empty? - Linguist.instrument("linguist.detection") do + Linguist.instrument("linguist.detection", :blob => blob) do # Call each strategy until one candidate is returned. languages = [] strategy = nil From 3dcdc11c1b6cf28c2b78d2dba83de4cfce58d132 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 5 Mar 2015 10:03:51 -0800 Subject: [PATCH 6/9] Avoid passing block to detected instrumenter --- lib/linguist/language.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index e0e35602..6c551bf7 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -125,9 +125,9 @@ module Linguist end end - Linguist.instrument("linguist.detected", :blob => blob, :strategy => strategy, :language => languages.first) do - languages.first - end + Linguist.instrument("linguist.detected", :blob => blob, :strategy => strategy, :language => languages.first) + + languages.first end end From 7fdead01009570298d4d6777192880b2d1422a90 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 5 Mar 2015 10:11:08 -0800 Subject: [PATCH 7/9] Only yield if block given --- test/test_instrumentation.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_instrumentation.rb b/test/test_instrumentation.rb index b9e5c3e9..9fd5dc66 100644 --- a/test/test_instrumentation.rb +++ b/test/test_instrumentation.rb @@ -14,7 +14,7 @@ class TestInstrumentation < Minitest::Test def instrument(name, *args) @events << Event.new(name, args) - yield + yield if block_given? end end From 1bc1803628e5b74d813d2db46ffd0756d6c582f3 Mon Sep 17 00:00:00 2001 From: Arfon Smith Date: Thu, 5 Mar 2015 12:50:12 -0600 Subject: [PATCH 8/9] Check for block here too --- lib/linguist.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist.rb b/lib/linguist.rb index 4419ff5b..3929efb9 100644 --- a/lib/linguist.rb +++ b/lib/linguist.rb @@ -14,7 +14,7 @@ class << Linguist if instrumenter instrumenter.instrument(*args, &bk) else - yield + yield if block_given? end end end From a1010b8cf810f09a066937011928d2d4ebc00415 Mon Sep 17 00:00:00 2001 From: Arfon Smith Date: Thu, 5 Mar 2015 13:21:07 -0600 Subject: [PATCH 9/9] Actually return the strategy --- lib/linguist/language.rb | 5 +++-- test/test_instrumentation.rb | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 6c551bf7..a1ff3318 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -108,9 +108,10 @@ module Linguist Linguist.instrument("linguist.detection", :blob => blob) do # Call each strategy until one candidate is returned. languages = [] - strategy = nil + returning_strategy = nil STRATEGIES.each do |strategy| + returning_strategy = strategy candidates = Linguist.instrument("linguist.strategy", :blob => blob, :strategy => strategy, :candidates => languages) do strategy.call(blob, languages) end @@ -125,7 +126,7 @@ module Linguist end end - Linguist.instrument("linguist.detected", :blob => blob, :strategy => strategy, :language => languages.first) + Linguist.instrument("linguist.detected", :blob => blob, :strategy => returning_strategy, :language => languages.first) languages.first end diff --git a/test/test_instrumentation.rb b/test/test_instrumentation.rb index 9fd5dc66..ab0615e5 100644 --- a/test/test_instrumentation.rb +++ b/test/test_instrumentation.rb @@ -38,7 +38,13 @@ class TestInstrumentation < Minitest::Test blob = fixture_blob("Data/Modelines/ruby") Language.detect(blob) + detect_event = Linguist.instrumenter.events.last + detect_event_payload = detect_event[:args].first + assert_equal 3, Linguist.instrumenter.events.size - assert_equal "linguist.detected", Linguist.instrumenter.events.last.name + assert_equal "linguist.detected", detect_event.name + assert_equal Language['Ruby'], detect_event_payload[:language] + assert_equal blob, detect_event_payload[:blob] + assert_equal Linguist::Strategy::Modeline, detect_event_payload[:strategy] end end