在多线程环境中捕获信号答案

【问题标题】：trapping signals in a multithreaded environment在多线程环境中捕获信号
【发布时间】：2015-05-29 22:42:57
【问题描述】：

我有一个大型程序，需要尽可能健壮（有弹性），并且有大量线程。我需要捕获所有 SIGBUS、SIGSEGV 信号，并在必要时重新初始化出问题的线程，或者禁用该线程，以降级的功能继续运行。

我的第一个想法是先做一次 setjmp，然后设置可以记录问题的信号处理程序，再由它 longjmp 回到线程中的恢复点。但有一个问题：信号处理程序需要确定信号来自哪个线程，才能使用相应的跳转缓冲区，因为跳回错误的线程将毫无用处。

有人知道如何在信号处理程序中确定有问题的线程吗？

【问题讨论】：

您需要使用 sigsetjmp()/siglongjmp() 而不是 setjmp()/longjmp()，这样您就不必重置信号处理程序。
信号处理程序访问静态或线程存储的对象，并调用标准库函数？这听起来像 UB。

标签： c multithreading signals setjmp

【解决方案1】：

我假设您已经考虑过这一点，并且有充分的理由相信您的程序将通过尝试在 SIGSEGV 之后重试而具有更多弹性 - 请记住段错误突出显示悬空指针和其他滥用问题，这些问题也可能会破坏进程地址空间中不可预测的位置，而不会出现段错误。

由于您已经非常仔细地考虑过这一点，并且您已经确定（以某种方式）您的应用程序段错误的特定方式不可能掩盖用于取消和重新启动线程的会计数据的损坏，并且您可以完美取消这些线程的逻辑（也非常罕见），让我们继续解决问题。

Linux 上的 SIGSEGV 处理程序在执行出错指令的线程中运行（man 7 signal）。我们不能调用 pthread_self() 因为它不是异步信号安全的，但是互联网上似乎普遍认为 syscall (man 2 syscall) 是安全的，所以我们可以通过 syscall SYS_gettid 获取线程 ID。因此，我们将维护 pthread_t (pthread_self) 到 pid (gettid()) 的映射。由于 write() 也是安全的，我们可以捕获 SEGV，将当前线程 ID 写入管道，然后暂停直到 pthread_cancel 终止我们。

我们还需要一个监控线程来监视事情何时出了岔子。监控线程监视管道的读取端，以获取有关已终止线程的信息，并可能重新启动它。

因为我认为假装处理 SIGSEGV 是愚蠢的，所以我将在这里调用执行此操作的结构 daft_thread_t 等。someone_please_fix_me 代表您损坏的代码。监控线程是 main()。当一个线程出现段错误时，它被信号处理程序捕获，将其 ID 写入管道；监视器读取管道，使用 pthread_cancel 和 pthread_join 取消线程，然后重新启动它。

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Number of slots in the fixed-size daft_threads table; the value itself is arbitrary. */
#define MAX_DAFT_THREADS (1024) // arbitrary

/*
 * Evaluate `call` once; if it yields -1, print
 * "<file>@<line> failed: <errno text>" to stderr and then run `onfail`.
 *
 * Only meaningful for calls that signal failure by returning -1 and setting
 * errno. pthread_* functions return an error number instead, so passing one
 * directly never triggers the failure branch.
 *
 * Wrapped in do { } while (0) so that `CHECK_OSCALL(...);` is a single
 * statement and can sit in an unbraced if/else arm.
 */
#define CHECK_OSCALL(call, onfail) do { \
    if ((call) == -1) { \
        /* Pre-filled so the message is well defined even when strerror_r */ \
        /* fails or (GNU variant) does not write into the buffer. */ \
        char check_oscall_msg_[512] = "unknown error"; \
        (void) strerror_r(errno, check_oscall_msg_, sizeof(check_oscall_msg_)); \
        fprintf(stderr, "%s@%d failed: %s\n", __FILE__, __LINE__, check_oscall_msg_); \
        onfail; \
    } \
} while (0)

/*********************** daft thread accounting *****************/
/* Signature of a thread entry point, identical to what pthread_create() expects. */
typedef void* (*threadproc_t)(void* arg);

/* Bookkeeping for one supervised thread; lives in the daft_threads table. */
struct daft_thread_t {
    threadproc_t start_routine;  /* user entry point invoked by daft_thread_wrapper() */
    void* start_routine_arg;     /* argument handed to start_routine */
    pthread_t pthread;           /* handle filled in by pthread_create(); used to cancel/join */
    pid_t tid;                   /* kernel thread ID stored by the wrapper; reset to 0 on restart */
};

/* Process-wide state shared by the worker threads, the signal handler and the monitor. */
struct daft_thread_accounting_info_t {
    int monitor_pipe[2];          /* [1] is written by the signal handler, [0] is read by main() */
    pthread_mutex_t info_lock;    /* held while a slot is allocated and while the table is scanned */
    size_t daft_thread_count;     /* number of slots of daft_threads handed out so far */
    struct daft_thread_t daft_threads[MAX_DAFT_THREADS];  /* slots are never released, only reused on restart */
};

/* The single accounting instance; initialised by daft_thread_accounting_info_init() from main(). */
static struct daft_thread_accounting_info_t g_thread_accounting;

/**
 * Zero an accounting structure, then create its mutex and its monitor pipe.
 *
 * @param inf  structure to initialise; any previous contents are discarded.
 *
 * Aborts the process (after printing a diagnostic) if the mutex or the pipe
 * cannot be created.
 */
void daft_thread_accounting_info_init(struct daft_thread_accounting_info_t* inf)
{
    int rc;

    memset(inf, 0, sizeof(*inf));

    /* pthread_mutex_init() reports failure through its return value and does
     * not set errno, so copy the code into errno before reporting it. */
    rc = pthread_mutex_init(&inf->info_lock, NULL);
    if (rc != 0) {
        errno = rc;
        CHECK_OSCALL(-1, abort());
    }

    CHECK_OSCALL(pipe(inf->monitor_pipe), abort());
}

/* Holds a pointer to a thread's bookkeeping slot.
 * NOTE(review): nothing in the visible source uses this type; the thread
 * wrapper receives a struct daft_thread_t* directly. Looks like a leftover,
 * confirm before removing. */
struct daft_thread_wrapper_data_t {
    struct daft_thread_t* thread_info;
};

static void* daft_thread_wrapper(void* arg)
{
    struct daft_thread_t* wrapper = arg;
    wrapper->tid = gettid();
    return (*wrapper->start_routine)(wrapper->start_routine_arg);
}

/**
 * Allocate the next free bookkeeping slot and start a supervised thread in it.
 *
 * @param proc  routine the new thread will run.
 * @param arg   argument passed to proc.
 *
 * Aborts the process if the table is full or the thread cannot be created.
 */
static void start_daft_thread(threadproc_t proc, void* arg)
{
    struct daft_thread_t* info;
    int rc;

    pthread_mutex_lock(&g_thread_accounting.info_lock);
    /* Explicit check rather than assert(): the bound must still be enforced
     * when the program is built with NDEBUG. */
    if (g_thread_accounting.daft_thread_count >= MAX_DAFT_THREADS) {
        pthread_mutex_unlock(&g_thread_accounting.info_lock);
        fprintf(stderr, "%s@%d failed: thread table full (%d slots)\n",
            __FILE__, __LINE__, MAX_DAFT_THREADS);
        abort();
    }
    info = &g_thread_accounting.daft_threads[g_thread_accounting.daft_thread_count++];
    info->start_routine = proc;
    info->start_routine_arg = arg;
    pthread_mutex_unlock(&g_thread_accounting.info_lock);

    /* pthread_create() returns an error number (never -1) and leaves errno
     * alone, so translate its result before handing it to CHECK_OSCALL. */
    rc = pthread_create(&info->pthread, NULL, daft_thread_wrapper, info);
    if (rc != 0) {
        errno = rc;
        CHECK_OSCALL(-1, abort());
    }
}

/**
 * Look up the bookkeeping slot whose recorded kernel thread ID equals thread_id.
 *
 * @param thread_id  kernel thread ID to search for.
 * @return pointer to the first matching slot, or NULL when none matches.
 *
 * The table is scanned with info_lock held; the returned pointer refers to
 * the global table and is used by the caller after the lock is released.
 */
static struct daft_thread_t* find_thread_by_tid(pid_t thread_id)
{
    struct daft_thread_t* match = NULL;
    size_t slot;  /* same type as daft_thread_count, so no signed/unsigned comparison */

    pthread_mutex_lock(&g_thread_accounting.info_lock);
    for (slot = 0; slot < g_thread_accounting.daft_thread_count; ++slot) {
        if (g_thread_accounting.daft_threads[slot].tid == thread_id) {
            match = &g_thread_accounting.daft_threads[slot];
            break;
        }
    }
    pthread_mutex_unlock(&g_thread_accounting.info_lock);

    return match;
}

static void restart_daft_thread(struct daft_thread_t* info)
{
    void* unused;
    CHECK_OSCALL(pthread_cancel(info->pthread), abort());
    CHECK_OSCALL(pthread_join(info->pthread, &unused), abort());
    info->tid = 0;
    CHECK_OSCALL(pthread_create(&info->pthread, NULL, daft_thread_wrapper, info), abort());
}

/************* signal handling stuff **************/
/* Fixed-size record that the signal handler writes to the monitor pipe and main() reads back. */
struct sigdeath_notify_info {
    int signum;  /* signal number passed to the handler */
    pid_t tid;   /* kernel thread ID of the thread that ran the handler */
};

/* gettid() is defined further down in this file; declare it so the call
 * below is checked against a prototype instead of being implicitly declared. */
pid_t gettid(void);

/**
 * SA_SIGINFO handler for fatal synchronous signals.
 *
 * Writes one struct sigdeath_notify_info (signal number + calling thread's
 * kernel thread ID) to the write end of the monitor pipe, then blocks forever
 * in pause() so the thread can be cancelled from outside.
 *
 * Only async-signal-safe calls are used: write(), pause() and abort().
 * assert() is avoided because its failure path goes through stdio.
 *
 * @param signum  signal being handled.
 * @param info    unused.
 * @param ctx     unused.
 */
static void sigdeath_handler(int signum, siginfo_t* info, void* ctx)
{
    struct sigdeath_notify_info inf = {
        .signum = signum,
        .tid = gettid()
    };
    ssize_t z;

    (void) info;
    (void) ctx;

    /* Retry if another signal interrupts the write before anything was sent. */
    do {
        z = write(g_thread_accounting.monitor_pipe[1], &inf, sizeof(inf));
    } while (z == -1 && errno == EINTR);

    /* The monitor can never learn about this thread; give up loudly. */
    if (z != (ssize_t) sizeof(inf)) {
        abort();
    }

    /* Returning would re-run the faulting instruction. pause() also returns
     * whenever some other handler runs on this thread, hence the loop. */
    for (;;) {
        pause();
    }
}

/**
 * Install sigdeath_handler as the SA_SIGINFO handler for SIGSEGV and SIGBUS.
 * Aborts the process if either registration fails.
 */
static void register_signal_handlers(void)
{
    struct sigaction sa;

    /* `= {}` is not valid before C23; clear the struct explicitly and set the
     * mask through the API meant for sigset_t. */
    memset(&sa, 0, sizeof(sa));
    sigemptyset(&sa.sa_mask);
    sa.sa_sigaction = sigdeath_handler;
    sa.sa_flags = SA_SIGINFO;

    CHECK_OSCALL(sigaction(SIGSEGV, &sa, NULL), abort());
    CHECK_OSCALL(sigaction(SIGBUS, &sa, NULL), abort());
}

/**
 * Return the kernel thread ID of the calling thread using the raw
 * SYS_gettid system call (Linux-specific).
 */
pid_t gettid(void)
{
    return (pid_t) syscall(SYS_gettid);
}

/**
 * Stand-in for the broken code: sleeps for 1 to 200 seconds and then writes
 * through the invalid address 42. The bad store is deliberate; it is what
 * the rest of the program exists to catch.
 *
 * @param arg  not used by this routine.
 * @return NULL, in the case that the store does not fault.
 */
static void* someone_please_fix_me(void* arg)
{
    char* i_think_this_address_looks_nice = (char*) 42;
    sleep(1 + rand() % 200);
    i_think_this_address_looks_nice[0] = 'q'; // deliberate store through a bogus pointer
    return NULL;
}

/**
 * Program entry point; doubles as the monitor thread.
 *
 * Sets up the accounting state and signal handlers, starts 200 supervised
 * threads, then reads death notifications from the monitor pipe and restarts
 * each thread that reported one.
 *
 * @return 0 if the read loop ever ends, which is not expected.
 */
int main(void)
{
    int k;
    struct sigdeath_notify_info death;

    daft_thread_accounting_info_init(&g_thread_accounting);
    register_signal_handlers();

    for (k = 0; k < 200; ++k) {
        /* Go through intptr_t: int and void* may differ in width. */
        start_daft_thread(someone_please_fix_me, (void*) (intptr_t) k);
    }

    for (;;) {
        struct daft_thread_t* info;
        ssize_t got = read(g_thread_accounting.monitor_pipe[0], &death, sizeof(death));

        if (got == -1 && errno == EINTR) {
            continue;  /* interrupted before any data arrived; try again */
        }
        if (got != (ssize_t) sizeof(death)) {
            break;
        }

        info = find_thread_by_tid(death.tid);
        if (info == NULL) {
            /* pid_t is signed, so print it with %d rather than %u. */
            fprintf(stderr, "*** thread_id %d not found\n", (int) death.tid);
            continue;
        }
        fprintf(stderr, "Thread %d (%d) died of %d, restarting.\n",
            (int) death.tid, (int) (intptr_t) info->start_routine_arg, death.signum);
        restart_daft_thread(info);
    }

    fprintf(stderr, "Shouldn't get here.\n");
    return 0;
}

如果您没有考虑过：尝试从 SIGSEGV 中恢复是非常危险的 - 我强烈建议您不要这样做。线程共享一个地址空间。发生段错误的线程也可能损坏了其他线程数据或全局记帐数据，例如 malloc() 的记帐。一个更安全的方法 - 假设失败的代码被不可挽回地破坏但必须使用 - 是将失败的代码隔离在进程边界后面，例如在调用破坏的代码之前通过 fork()ing。然后，您必须捕获 SIGCLD 并处理进程正常崩溃或终止，以及许多其他陷阱，但至少您不必担心随机损坏。当然，最好的选择是修复该死的代码，这样您就不会观察到段错误。

【讨论】：

以上所有内容中最重要的一行是最后一行。线程不只是“中断”——你必须编写糟糕的代码。如果您添加某种“监控”线程，这只是您必须测试、调试、维护的另一件事，当然还有一个可能会中断的额外线程：(
应始终修复损坏的代码，并记录此类问题 - 尽管在磁盘空间不足时写入内存映射文件是一个可恢复的段错误。错误也可能发生在封闭的库中。
@camelccc 都是真的。请注意，尽管我提出了警告，但我已经为您的问题提供了一个可行的解决方案 :) 一个封闭的库完全能够破坏您的进程的地址空间，但无法读取其源代码并不能保护您。因此，我维持该案例的“分叉”建议。回复：mmap：我认为 mmap 失败实际上给了 SIGBUS，但可能是错的。
@MartinJames 非常正确。尝试给我的回应是“这是一个糟糕的主意，这就是你如何去做”的氛围 - 在原始问题的限制内工作。正如他的评论所暗示的那样，我怀疑 OP 的第三方库损坏了。坚持认为，如果您处于这种情况，最好在单独的进程中运行有问题的代码。

【解决方案2】：

在我的 Linux 机器上使用 syscall(SYS_gettid) 对我有用：gcc pt.c -lpthread -Wall -Wextra

//pt.c
#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <setjmp.h>
#include <signal.h>
#include <string.h>
#include <ucontext.h>
#include <stdlib.h>

/* Saved jump contexts, one slot per thread, indexed directly by kernel thread
 * ID in handler() and threadfunc(). Indexing is only valid for thread IDs
 * below 65536. */
static sigjmp_buf jmpbuf[65536];

/* Write the decimal text of `value` at `dst` and return the number of bytes
 * stored. Makes no library calls, so it is usable inside a signal handler.
 * `dst` needs room for up to 21 bytes. */
static size_t handler_put_int(char *dst, long value)
{
    char rev[24];
    size_t used = 0, out = 0;
    unsigned long mag = (value < 0) ? 0UL - (unsigned long) value
                                    : (unsigned long) value;

    do {
        rev[used++] = (char) ('0' + (int) (mag % 10));
        mag /= 10;
    } while (mag != 0);

    if (value < 0)
        dst[out++] = '-';
    while (used > 0)
        dst[out++] = rev[--used];
    return out;
}

/**
 * SIGSEGV handler: reports the faulting thread and jumps back to the recovery
 * point that thread saved in jmpbuf[tid].
 *
 * The report is assembled by hand and emitted with write() because printf()
 * is not async-signal-safe. If the thread ID does not fit the jmpbuf table
 * there is no recovery point to jump to, so the default action is restored
 * and the handler returns; the fault then recurs and terminates the process.
 *
 * @param sig      signal number being handled.
 * @param siginfo  unused.
 * @param context  unused.
 */
static void handler(int sig, siginfo_t *siginfo, void *context)
{
    static const char head[] = "Thread ";
    static const char mid[] = " in handler, signal ";
    char msg[96];
    size_t len = 0;
    pid_t tid = syscall(SYS_gettid);

    (void) siginfo;
    (void) context;

    if (tid < 0 || (size_t) tid >= sizeof jmpbuf / sizeof jmpbuf[0]) {
        signal(sig, SIG_DFL);
        return;
    }

    memcpy(msg + len, head, sizeof head - 1);
    len += sizeof head - 1;
    len += handler_put_int(msg + len, (long) tid);
    memcpy(msg + len, mid, sizeof mid - 1);
    len += sizeof mid - 1;
    len += handler_put_int(msg + len, (long) sig);
    msg[len++] = '\n';

    if (write(STDOUT_FILENO, msg, len) < 0) {
        /* Nothing useful can be done about a failed diagnostic here. */
    }

    siglongjmp(jmpbuf[tid], 1);
}

/**
 * Worker: runs 500 iterations, deliberately faulting on every iteration whose
 * index modulo 5 equals the value pointed to by `data`, and resuming with the
 * next iteration after each fault.
 *
 * @param data  pointer to an int in 0..4 selecting the faulting iterations.
 * @return NULL.
 */
static void *threadfunc(void *data)
{
    int index, segvindex = *(int *)data;
    pid_t tid = syscall(SYS_gettid);

    /* jmpbuf is indexed by thread ID; refuse to run rather than save a jump
     * context outside the table. */
    if (tid < 0 || (size_t) tid >= sizeof jmpbuf / sizeof jmpbuf[0]) {
        fprintf(stderr, "Thread %d: thread ID does not fit the jump-buffer table\n",
                (int) tid);
        return NULL;
    }

    for (index = 0; index < 500; index++) {
        /* A non-zero result means we arrived here via siglongjmp(). */
        if (sigsetjmp(jmpbuf[tid], 1) != 0) {
            printf("Recovery of thread %d\n", tid);
            continue;
        }
        printf("Thread %d, index %d\n", tid, index);
        if (index % 5 == segvindex) {
            printf("%zu\n", strlen((char *)2)); // SIGSEGV
        }
        /* pthread_yield() is deprecated; sched_yield() is the standard call
         * and is made visible by <pthread.h>. */
        sched_yield();
    }
    return NULL;
}

int main(void)
{
    pthread_t thread1, thread2, thread3;
    int segvindex1 = rand() % 5;
    int segvindex2 = rand() % 5;
    int segvindex3 = rand() % 5;
    struct sigaction sact;

    memset(&sact, 0, sizeof sact);
    sact.sa_sigaction = handler;
    sact.sa_flags = SA_SIGINFO;
    if (sigaction(SIGSEGV, &sact, NULL) < 0) {
        perror("sigaction");
        return 1;
    }
    pthread_create(&thread1, NULL, &threadfunc, (void *) &segvindex1);
    pthread_create(&thread2, NULL, &threadfunc, (void *) &segvindex2);
    pthread_create(&thread3, NULL, &threadfunc, (void *) &segvindex3);
    pthread_join(thread1, NULL);
    pthread_join(thread2, NULL);
    pthread_join(thread3, NULL);
    return 0;
}

为了更便携可以使用pthread_self。它是异步信号安全的。

但是获得SIGSEGV 的线程应该通过异步信号安全方式启动一个新线程，并且不应该执行siglongjmp，因为它可能导致调用非异步信号安全函数。

【讨论】：

在信号处理程序中使用系统调用安全吗？
@user252127 Linux 特定的syscall(SYS_gettid) 是异步信号安全的。它满足异步信号安全的所有规则。由于它是特定于 Linux 的，它不在异步信号安全函数的 POSIX-list 上。几乎相同的getpid()在列表中。

【解决方案3】：

根据我的经验，当线程程序接收到同步信号时 - 即由程序所做的某事生成的信号，例如取消引用错误指针 - 导致问题的线程接收到信号。

我使用了一个明确保证这种行为的系统，但我不知道它是否通用。当然，如果有问题的线程阻塞了信号，就像在一个线程处理所有信号的范例中一样，它可能会转到信号处理线程。

【讨论】：

这必须是正确的才能工作，但接下来是什么 - 你需要跳出处理程序，并且需要确定去哪里。我看不到任何使用堆栈框架来提供帮助的方法
Linux signal(7) 手册页 (linux.die.net/man/7/signal) 似乎暗示导致 SIGSEGV 等的线程将是处理信号的线程：“信号可以针对整个进程生成（并因此处于未决状态）（例如，当使用 kill(2) 发送时），也可以针对特定线程生成（例如，某些信号，如 SIGSEGV 和 SIGFPE，是由于执行特定的机器语言指令而生成的，因此是面向线程的；使用 pthread_kill(3) 针对特定线程发送的信号也是如此）。”