Browse Source

logger: add watchdog that boosts its priority when the writer is blocked for >1s

This is to guard against busy-looping tasks, so that we at least have a
log. In case the watchdog is triggered, it also logs CPU load of all
tasks.
sbg
Beat Küng 7 years ago committed by Lorenz Meier
parent
commit
dd30012b27
  1. 1
      src/modules/logger/CMakeLists.txt
  2. 7
      src/modules/logger/log_writer.h
  3. 2
      src/modules/logger/log_writer_file.h
  4. 25
      src/modules/logger/logger.cpp
  5. 168
      src/modules/logger/watchdog.cpp
  6. 78
      src/modules/logger/watchdog.h
  7. 4
      src/modules/systemlib/cpuload.c

1
src/modules/logger/CMakeLists.txt

@ -43,6 +43,7 @@ px4_add_module( @@ -43,6 +43,7 @@ px4_add_module(
log_writer.cpp
log_writer_file.cpp
log_writer_mavlink.cpp
watchdog.cpp
DEPENDS
version
)

7
src/modules/logger/log_writer.h

@ -137,6 +137,13 @@ public: @@ -137,6 +137,13 @@ public:
return 0;
}
pthread_t thread_id_file() const
{
if (_log_writer_file) { return _log_writer_file->thread_id(); }
return (pthread_t)0;
}
/**
* Indicate to the underlying backend whether future write_message() calls need a reliable

2
src/modules/logger/log_writer_file.h

@ -113,6 +113,8 @@ public: @@ -113,6 +113,8 @@ public:
return _need_reliable_transfer;
}
pthread_t thread_id() const { return _thread; }
private:
static void *run_helper(void *);

25
src/modules/logger/logger.cpp

@ -34,6 +34,7 @@ @@ -34,6 +34,7 @@
#include <px4_config.h>
#include "logger.h"
#include "messages.h"
#include "watchdog.h"
#include <dirent.h>
#include <sys/stat.h>
@ -92,6 +93,8 @@ using namespace px4::logger; @@ -92,6 +93,8 @@ using namespace px4::logger;
struct timer_callback_data_s {
px4_sem_t semaphore;
watchdog_data_t watchdog_data;
volatile bool watchdog_triggered = false;
};
/* This is used to schedule work for the logger (periodic scan for updated topics) */
@ -101,6 +104,9 @@ static void timer_callback(void *arg) @@ -101,6 +104,9 @@ static void timer_callback(void *arg)
timer_callback_data_s *data = (timer_callback_data_s *)arg;
if (watchdog_update(data->watchdog_data)) {
data->watchdog_triggered = true;
}
/* check the value of the semaphore: if the logger cannot keep up with running it's main loop as fast
* as the timer_callback here increases the semaphore count, the counter would increase unbounded,
@ -951,6 +957,17 @@ void Logger::run() @@ -951,6 +957,17 @@ void Logger::run()
} else {
if (_writer.backend() & LogWriter::BackendFile) {
const pid_t pid_self = getpid();
// The pthread_t ID is equal to the PID on NuttX
const pid_t pid_writer = _writer.thread_id_file();
// sched_note_start is already called from pthread_create and task_create,
// which means we can expect to find the tasks in system_load.tasks, as required in watchdog_initialize
watchdog_initialize(pid_self, pid_writer, timer_callback_data.watchdog_data);
}
hrt_call_every(&timer_call, _log_interval, _log_interval, timer_callback, &timer_callback_data);
}
@ -1027,6 +1044,11 @@ void Logger::run() @@ -1027,6 +1044,11 @@ void Logger::run()
}
if (timer_callback_data.watchdog_triggered) {
timer_callback_data.watchdog_triggered = false;
initialize_load_output(PrintLoadReason::Watchdog);
}
const hrt_abstime loop_time = hrt_absolute_time();
@ -1624,6 +1646,9 @@ void Logger::initialize_load_output(PrintLoadReason reason) @@ -1624,6 +1646,9 @@ void Logger::initialize_load_output(PrintLoadReason reason)
void Logger::write_load_output()
{
if (_print_load_reason == PrintLoadReason::Watchdog) {
PX4_ERR("Writing watchdog data"); // this is just that we see it easily in the log
}
perf_callback_data_t callback_data = {};
char buffer[140];
callback_data.logger = this;

168
src/modules/logger/watchdog.cpp

@ -0,0 +1,168 @@ @@ -0,0 +1,168 @@
/****************************************************************************
*
* Copyright (c) 2016 PX4 Development Team. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name PX4 nor the names of its contributors may be
* used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
****************************************************************************/
#include "watchdog.h"
#include <px4_log.h>
using namespace time_literals;
namespace px4
{
namespace logger
{
bool watchdog_update(watchdog_data_t &watchdog_data)
{
#ifdef __PX4_NUTTX
if (system_load.initialized && watchdog_data.logger_main_task_index >= 0
&& watchdog_data.logger_writer_task_index >= 0) {
const hrt_abstime now = hrt_absolute_time();
const system_load_taskinfo_s &log_writer_task = system_load.tasks[watchdog_data.logger_writer_task_index];
if (log_writer_task.valid) {
// Trigger the watchdog if the log writer task has been ready to run for a
// minimum duration and it has not been scheduled during that time.
// When the writer is waiting for an SD transfer, it is not in ready state, thus a long dropout
// will not trigger it. The longest period in ready state I measured was around 70ms,
// after a param change.
// We only check the log writer because it runs at lower priority than the main thread.
// No need to lock the tcb access, since we are in IRQ context
// update the timestamp if it has been scheduled recently
if (log_writer_task.curr_start_time > watchdog_data.ready_to_run_timestamp) {
watchdog_data.ready_to_run_timestamp = log_writer_task.curr_start_time;
}
// update the timestamp if not ready to run or if transitioned into ready to run
uint8_t current_state = log_writer_task.tcb->task_state;
if (current_state != TSTATE_TASK_READYTORUN
|| (watchdog_data.last_state != TSTATE_TASK_READYTORUN && current_state == TSTATE_TASK_READYTORUN)) {
watchdog_data.ready_to_run_timestamp = now;
}
watchdog_data.last_state = current_state;
#if 0 // for debugging
// test code that prints the maximum time in ready state
static uint64_t max_time = 0;
if (now - watchdog_data.ready_to_run_timestamp > max_time) {
max_time = now - watchdog_data.ready_to_run_timestamp;
}
static int counter = 0;
if (++counter > 300) {
PX4_ERR("max time in ready: %i ms", (int)max_time / 1000);
counter = 0;
max_time = 0;
}
#endif
if (now - watchdog_data.ready_to_run_timestamp > 1_s) {
PX4_ERR("watchdog triggered!"); // this will most likely not be logged due to dropouts
// boost the priority to make sure the logger continues to write to the log
sched_param param{};
param.sched_priority = SCHED_PRIORITY_MAX;
int ret;
if (system_load.tasks[watchdog_data.logger_main_task_index].valid) {
ret = sched_setparam(system_load.tasks[watchdog_data.logger_main_task_index].tcb->pid, &param);
if (ret < 0) {
PX4_ERR("sched_reprioritize failed (%i)", ret);
}
}
ret = sched_setparam(log_writer_task.tcb->pid, &param);
if (ret < 0) {
PX4_ERR("sched_reprioritize failed (%i)", ret);
}
// make sure we won't trigger again
watchdog_data.logger_main_task_index = -1;
return true;
}
} else {
// should never happen
watchdog_data.logger_main_task_index = -1;
}
}
#endif /* __PX4_NUTTX */
return false;
}
void watchdog_initialize(const pid_t pid_logger_main, const pid_t pid_logger_writer, watchdog_data_t &watchdog_data)
{
#ifdef __PX4_NUTTX
sched_lock(); // need to lock the tcb access
for (int i = 0; i < CONFIG_MAX_TASKS; i++) {
if (system_load.tasks[i].valid) {
if (system_load.tasks[i].tcb->pid == pid_logger_writer) {
watchdog_data.logger_writer_task_index = i;
}
if (system_load.tasks[i].tcb->pid == pid_logger_main) {
watchdog_data.logger_main_task_index = i;
}
}
}
sched_unlock();
if (watchdog_data.logger_writer_task_index == -1 ||
watchdog_data.logger_main_task_index == -1) {
// If we land here it means the NuttX implementation changed
// and one of our assumptions is not valid anymore
PX4_ERR("watchdog init failed");
}
#endif /* __PX4_NUTTX */
}
} // namespace logger
} // namespace px4

78
src/modules/logger/watchdog.h

@ -0,0 +1,78 @@ @@ -0,0 +1,78 @@
/****************************************************************************
*
* Copyright (c) 2018 PX4 Development Team. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name PX4 nor the names of its contributors may be
* used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
****************************************************************************/
#pragma once
#include <drivers/drv_hrt.h>
#ifdef __PX4_NUTTX
#include <sched.h>
#include <systemlib/cpuload.h>
#endif /* __PX4_NUTTX */
namespace px4
{
namespace logger
{
struct watchdog_data_t {
#ifdef __PX4_NUTTX
int logger_main_task_index = -1;
int logger_writer_task_index = -1;
hrt_abstime ready_to_run_timestamp = hrt_absolute_time();
uint8_t last_state = TSTATE_TASK_INVALID;
#endif /* __PX4_NUTTX */
};
/**
* Initialize the watchdog, fill in watchdog_data.
*/
void watchdog_initialize(const pid_t pid_logger_main, const pid_t pid_logger_writer, watchdog_data_t &watchdog_data);
/**
* Update the watchdog and trigger it if necessary. It is triggered when the log writer task is in
* ready state for a certain period of time, but did not get scheduled. It means that most likely
* some other higher-prio task runs busy.
* When the watchdog triggers, it boosts the priority of the logger's main & writer tasks to maximum, so
* that they get scheduled again.
*
* Expected to be called from IRQ context.
*
* @param watchdog_data
* @return true if watchdog is triggered, false otherwise
*/
bool watchdog_update(watchdog_data_t &watchdog_data);
} //namespace logger
} //namespace px4

4
src/modules/systemlib/cpuload.c

@ -167,7 +167,11 @@ void sched_note_resume(FAR struct tcb_s *tcb) @@ -167,7 +167,11 @@ void sched_note_resume(FAR struct tcb_s *tcb)
for (int i = 0; i < CONFIG_MAX_TASKS; i++) {
if (system_load.tasks[i].valid && system_load.tasks[i].tcb->pid == tcb->pid) {
// curr_start_time is accessed from an IRQ handler (in logger), so we need
// to make the update atomic
irqstate_t irq_state = px4_enter_critical_section();
system_load.tasks[i].curr_start_time = new_time;
px4_leave_critical_section(irq_state);
break;
}
}

Loading…
Cancel
Save