From 0159c47fb3f8701047265a6044065fa25bd9ca79 Mon Sep 17 00:00:00 2001
From: Arseniy Kuznetsov
Date: Sun, 1 Jan 2023 10:30:09 +0100
Subject: [PATCH] async router fetch, configurable timeouts

---
 mktxp/cli/config/_mktxp.conf    |  3 ++
 mktxp/cli/config/config.py      | 11 ++++-
 mktxp/flow/collector_handler.py | 84 ++++++++++++++++++++++++++-------
 3 files changed, 80 insertions(+), 18 deletions(-)

diff --git a/mktxp/cli/config/_mktxp.conf b/mktxp/cli/config/_mktxp.conf
index 7131b05..add84b0 100644
--- a/mktxp/cli/config/_mktxp.conf
+++ b/mktxp/cli/config/_mktxp.conf
@@ -27,3 +27,6 @@
     fetch_routers_in_parallel = False       # Set to True if you want to fetch multiple routers parallel
     max_worker_threads = 5                  # Max number of worker threads that can fetch routers. Meaningless if fetch_routers_in_parallel is set to False
 
+
+    max_scrape_duration = 10                # Max duration of individual routers' metrics collection
+    total_max_scrape_duration = 30          # Max overall duration of all metrics collection
\ No newline at end of file
diff --git a/mktxp/cli/config/config.py b/mktxp/cli/config/config.py
index 1b4e677..05549ea 100755
--- a/mktxp/cli/config/config.py
+++ b/mktxp/cli/config/config.py
@@ -73,6 +73,8 @@ class MKTXPConfigKeys:
     MKTXP_MIN_COLLECT_INTERVAL = 'minimal_collect_interval'
     MKTXP_FETCH_IN_PARALLEL = 'fetch_routers_in_parallel'
     MKTXP_MAX_WORKER_THREADS = 'max_worker_threads'
+    MKTXP_MAX_SCRAPE_DURATION = 'max_scrape_duration'
+    MKTXP_TOTAL_MAX_SCRAPE_DURATION = 'total_max_scrape_duration'
 
     # UnRegistered entries placeholder
     NO_ENTRIES_REGISTERED = 'NoEntriesRegistered'
@@ -95,6 +97,8 @@ class MKTXPConfigKeys:
     DEFAULT_MKTXP_MIN_COLLECT_INTERVAL = 5
     DEFAULT_MKTXP_FETCH_IN_PARALLEL = False
     DEFAULT_MKTXP_MAX_WORKER_THREADS = 5
+    DEFAULT_MKTXP_MAX_SCRAPE_DURATION = 10
+    DEFAULT_MKTXP_TOTAL_MAX_SCRAPE_DURATION = 30
 
     BOOLEAN_KEYS_NO = {ENABLED_KEY, SSL_KEY, NO_SSL_CERTIFICATE,
@@ -112,7 +116,7 @@ class MKTXPConfigKeys:
     STR_KEYS = (HOST_KEY, USER_KEY, PASSWD_KEY)
     MKTXP_INT_KEYS = (PORT_KEY, MKTXP_SOCKET_TIMEOUT, MKTXP_INITIAL_DELAY, MKTXP_MAX_DELAY,
                       MKTXP_INC_DIV, MKTXP_BANDWIDTH_TEST_INTERVAL, MKTXP_MIN_COLLECT_INTERVAL,
-                      MKTXP_MAX_WORKER_THREADS,)
+                      MKTXP_MAX_WORKER_THREADS, MKTXP_MAX_SCRAPE_DURATION, MKTXP_TOTAL_MAX_SCRAPE_DURATION)
 
     # MKTXP config entry nane
     MKTXP_CONFIG_ENTRY_NAME = 'MKTXP'
@@ -133,7 +137,8 @@ class ConfigEntry:
                       MKTXPConfigKeys.MKTXP_INC_DIV, MKTXPConfigKeys.MKTXP_BANDWIDTH_KEY, MKTXPConfigKeys.MKTXP_VERBOSE_MODE,
                       MKTXPConfigKeys.MKTXP_BANDWIDTH_TEST_INTERVAL, MKTXPConfigKeys.MKTXP_MIN_COLLECT_INTERVAL, MKTXPConfigKeys.MKTXP_FETCH_IN_PARALLEL,
-                      MKTXPConfigKeys.MKTXP_MAX_WORKER_THREADS])
+                      MKTXPConfigKeys.MKTXP_MAX_WORKER_THREADS, MKTXPConfigKeys.MKTXP_MAX_SCRAPE_DURATION,
+                      MKTXPConfigKeys.MKTXP_TOTAL_MAX_SCRAPE_DURATION])
 
 
 class OSConfig(metaclass=ABCMeta):
@@ -326,6 +331,8 @@ class MKTXPConfigHandler:
             MKTXPConfigKeys.MKTXP_MIN_COLLECT_INTERVAL: lambda value: MKTXPConfigKeys.DEFAULT_MKTXP_MIN_COLLECT_INTERVAL,
             MKTXPConfigKeys.MKTXP_FETCH_IN_PARALLEL: lambda _: MKTXPConfigKeys.DEFAULT_MKTXP_FETCH_IN_PARALLEL,
             MKTXPConfigKeys.MKTXP_MAX_WORKER_THREADS: lambda _: MKTXPConfigKeys.DEFAULT_MKTXP_MAX_WORKER_THREADS,
+            MKTXPConfigKeys.MKTXP_MAX_SCRAPE_DURATION: lambda _: MKTXPConfigKeys.DEFAULT_MKTXP_MAX_SCRAPE_DURATION,
+            MKTXPConfigKeys.MKTXP_TOTAL_MAX_SCRAPE_DURATION: lambda _: MKTXPConfigKeys.DEFAULT_MKTXP_TOTAL_MAX_SCRAPE_DURATION,
         }[key](value)

diff --git a/mktxp/flow/collector_handler.py b/mktxp/flow/collector_handler.py
index c97a08f..b56688a 100644
--- a/mktxp/flow/collector_handler.py
+++ b/mktxp/flow/collector_handler.py
@@ -14,8 +14,9 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from timeit import default_timer
 from datetime import datetime
+from threading import Event, Timer
 from mktxp.cli.config.config import config_handler
-
+from mktxp.cli.config.config import MKTXPConfigKeys
 
 class CollectorHandler:
     ''' MKTXP Collectors Handler
@@ -26,7 +27,8 @@ class CollectorHandler:
         self.collector_registry = collector_registry
         self.last_collect_timestamp = 0
 
-    def collect_synchronous(self):
+
+    def collect_sync(self):
         """
         Collect the metrics of all router entries defined in the current users configuration synchronously.
         This function iterates over each router entry one-by-one.
@@ -43,53 +45,103 @@
                 yield from collect_func(router_entry)
                 router_entry.time_spent[collector_ID] += default_timer() - start
 
-    def collect_single(self, router_entry):
+
+    def collect_router_entry_async(self, router_entry, scrape_timeout_event, total_scrape_timeout_event):
         results = []
         for collector_ID, collect_func in self.collector_registry.registered_collectors.items():
+            if scrape_timeout_event.is_set():
+                print(f'Hit timeout while scraping router entry: {router_entry.router_id[MKTXPConfigKeys.ROUTERBOARD_NAME]}')
+                break
+
+            if total_scrape_timeout_event.is_set():
+                print(f'Hit overall timeout while scraping router entry: {router_entry.router_id[MKTXPConfigKeys.ROUTERBOARD_NAME]}')
+                break
+
             start = default_timer()
             result = list(collect_func(router_entry))
             results += result
             router_entry.time_spent[collector_ID] += default_timer() - start
+
         return results
 
-    def collect_parallel(self, max_worker_threads=5):
+
+    def collect_async(self, max_worker_threads=5):
         """
         Collect the metrics of all router entries defined in the current users configuration in parallel.
         This function iterates over multiple routers in parallel (depending on the value of max_worker_threads).
         Thus, the total runtime scales sub linearly (number_of_routers / max_worker_threads).
""" + + def timeout(timeout_event): + timeout_event.set() + + # overall scrape duration + total_scrape_timeout_event = Event() + total_scrape_timer = Timer(config_handler.system_entry().total_max_scrape_duration, timeout, args=(total_scrape_timeout_event,)) + total_scrape_timer.start() + with ThreadPoolExecutor(max_workers=max_worker_threads) as executor: - futures = [] + futures = {} for router_entry in self.entries_handler.router_entries: + if total_scrape_timeout_event.is_set(): + print(f'Hit overall timeout while scraping router entry: {router_entry.router_id[MKTXPConfigKeys.ROUTERBOARD_NAME]}') + break + if not router_entry.api_connection.is_connected(): # let's pick up on things in the next run router_entry.api_connection.connect() continue - # Publish the collection function as a future - futures.append(executor.submit(self.collect_single, router_entry)) + # Duration of individual scrapes + scrape_timeout_event = Event() + scrape_timer = Timer(config_handler.system_entry().max_scrape_duration, timeout, args=(scrape_timeout_event,)) + scrape_timer.start() + + futures[executor.submit(self.collect_router_entry_async, router_entry, scrape_timeout_event, total_scrape_timeout_event)] = scrape_timer for future in as_completed(futures): + # cancel unused timers for scrapes finished regularly (within set duration) + futures[future].cancel() yield from future.result() + + # in case collection finished without timeouts, cancel the overall scrape duration timer + total_scrape_timer.cancel() def collect(self): - now = datetime.now().timestamp() - diff = now - self.last_collect_timestamp - if diff < config_handler.system_entry().minimal_collect_interval: - if config_handler.system_entry().verbose_mode: - print(f'An attemp to collect metrics within minimal collection interval: {diff} < {config_handler.system_entry().minimal_collect_interval}') - print('deferring..') + if not self._valid_collect_interval(): return - self.last_collect_timestamp = now + # bandwidth collector yield from self.collector_registry.bandwidthCollector.collect() + # all other collectors # Check whether to run in parallel by looking at the mktxp system configuration parallel = config_handler.system_entry().fetch_routers_in_parallel max_worker_threads = config_handler.system_entry().max_worker_threads if parallel: - yield from self.collect_parallel(max_worker_threads=max_worker_threads) + yield from self.collect_async(max_worker_threads=max_worker_threads) else: - yield from self.collect_synchronous() + yield from self.collect_sync() + + + def _valid_collect_interval(self): + now = datetime.now().timestamp() + diff = now - self.last_collect_timestamp + if diff < config_handler.system_entry().minimal_collect_interval: + if config_handler.system_entry().verbose_mode: + print(f'An attemp to collect metrics within minimal metrics collection interval: {diff} < {config_handler.system_entry().minimal_collect_interval}') + print('deferring..') + return False + + self.last_collect_timestamp = now + return True + + + + + + + +