async router fetch, configurable timeouts

This commit is contained in:
Arseniy Kuznetsov
2023-01-01 10:30:09 +01:00
parent 660dd33369
commit 0159c47fb3
3 changed files with 80 additions and 18 deletions

View File

@@ -14,8 +14,9 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from timeit import default_timer
from datetime import datetime
from threading import Event, Timer
from mktxp.cli.config.config import config_handler
from mktxp.cli.config.config import MKTXPConfigKeys
class CollectorHandler:
''' MKTXP Collectors Handler
@@ -26,7 +27,8 @@ class CollectorHandler:
self.collector_registry = collector_registry
self.last_collect_timestamp = 0
def collect_synchronous(self):
def collect_sync(self):
"""
Collect the metrics of all router entries defined in the current users configuration synchronously.
This function iterates over each router entry one-by-one.
@@ -43,53 +45,103 @@ class CollectorHandler:
yield from collect_func(router_entry)
router_entry.time_spent[collector_ID] += default_timer() - start
def collect_router_entry_async(self, router_entry, scrape_timeout_event, total_scrape_timeout_event):
    ''' Runs every registered collector against a single router entry.

        Checks both timeout events between collector runs and bails out
        (with a console notice) as soon as either one has fired, so a
        scrape never starts a new collector past its time budget.
        Returns the accumulated list of collected metric records.
    '''
    collected = []
    for collector_id, collect_func in self.collector_registry.registered_collectors.items():
        # per-router scrape window elapsed?
        if scrape_timeout_event.is_set():
            print(f'Hit timeout while scraping router entry: {router_entry.router_id[MKTXPConfigKeys.ROUTERBOARD_NAME]}')
            break
        # overall collection window elapsed?
        if total_scrape_timeout_event.is_set():
            print(f'Hit overall timeout while scraping router entry: {router_entry.router_id[MKTXPConfigKeys.ROUTERBOARD_NAME]}')
            break
        started_at = default_timer()
        collected.extend(collect_func(router_entry))
        # keep per-collector timing stats on the router entry
        router_entry.time_spent[collector_id] += default_timer() - started_at
    return collected
def collect_async(self, max_worker_threads=5):
    """
    Collect the metrics of all configured router entries in parallel.

    Each router entry is scraped in its own worker thread (up to
    max_worker_threads at a time), so total runtime scales roughly with
    number_of_routers / max_worker_threads.

    Two timeout layers are enforced via threading.Timer + Event:
      - a per-router scrape budget (system max_scrape_duration)
      - an overall collection budget (system total_max_scrape_duration)
    NOTE(review): collectors only poll these events between individual
    collector runs, so a single blocking API call can still overrun the
    window — confirm against the router API client's own timeouts.

    Yields metric records from all completed router scrapes.
    """
    def timeout(timeout_event):
        # Timer callback: just flag the event; scrape loops poll it
        timeout_event.set()

    # overall scrape duration guard
    total_scrape_timeout_event = Event()
    total_scrape_timer = Timer(config_handler.system_entry().total_max_scrape_duration, timeout, args=(total_scrape_timeout_event,))
    total_scrape_timer.start()

    # maps each submitted future to its per-scrape timer so the timer can
    # be canceled once that scrape finishes within its budget
    futures = {}
    try:
        with ThreadPoolExecutor(max_workers=max_worker_threads) as executor:
            for router_entry in self.entries_handler.router_entries:
                if total_scrape_timeout_event.is_set():
                    print(f'Hit overall timeout while scraping router entry: {router_entry.router_id[MKTXPConfigKeys.ROUTERBOARD_NAME]}')
                    break
                if not router_entry.api_connection.is_connected():
                    # let's pick up on things in the next run
                    router_entry.api_connection.connect()
                    continue
                # duration guard for this individual scrape
                scrape_timeout_event = Event()
                scrape_timer = Timer(config_handler.system_entry().max_scrape_duration, timeout, args=(scrape_timeout_event,))
                scrape_timer.start()
                futures[executor.submit(self.collect_router_entry_async, router_entry, scrape_timeout_event, total_scrape_timeout_event)] = scrape_timer
            for future in as_completed(futures):
                # cancel unused timers for scrapes finished regularly (within set duration)
                futures[future].cancel()
                yield from future.result()
    finally:
        # always release the timers, even when a collector raised or the
        # consumer abandoned this generator; otherwise live Timer threads
        # would linger until their intervals expire
        for scrape_timer in futures.values():
            scrape_timer.cancel()
        total_scrape_timer.cancel()
def collect(self):
    """
    Entry point for a single metrics collection pass.

    Defers entirely (yields nothing) when invoked again within the
    configured minimal collection interval. Otherwise yields bandwidth
    metrics first, then metrics from all other collectors — scraping
    routers either in parallel or one-by-one depending on the mktxp
    system configuration.
    """
    if not self._valid_collect_interval():
        return

    # bandwidth collector
    yield from self.collector_registry.bandwidthCollector.collect()

    # all other collectors; fetch the system config entry once rather
    # than re-resolving it for each setting
    system_entry = config_handler.system_entry()
    if system_entry.fetch_routers_in_parallel:
        yield from self.collect_async(max_worker_threads=system_entry.max_worker_threads)
    else:
        yield from self.collect_sync()
def _valid_collect_interval(self):
    """
    Rate-limit guard for collect().

    Returns False (leaving the last-collect timestamp untouched) when
    called again within the configured minimal collection interval,
    optionally logging the deferral in verbose mode; otherwise records
    the current timestamp and returns True.
    """
    now = datetime.now().timestamp()
    diff = now - self.last_collect_timestamp
    minimal_interval = config_handler.system_entry().minimal_collect_interval
    if diff < minimal_interval:
        if config_handler.system_entry().verbose_mode:
            # message typo fixed: 'attemp' -> 'attempt'
            print(f'An attempt to collect metrics within minimal metrics collection interval: {diff} < {minimal_interval}')
            print('deferring..')
        return False
    self.last_collect_timestamp = now
    return True