TombStone文件如何生成

69 阅读 0 评论 46 点赞

我是靠谱客的博主深情电灯胆，最近开发中收集的这篇文章主要介绍TombStone文件如何生成，觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

开发者在分析系统稳定性的时候通常需要知道进程发生异常时的调用栈来分析问题，确认发生异常时进程正在做什么。这样才能根据进程所处场景或者调用栈、寄存器信息分析异常发生的原因。而在Android中，当Native进程发生崩溃时，会在/data/tombstones/目录下生成名为tombstone_xx的文件，里面记录了某一个进程在发生崩溃时候的所有信息，包括调用栈、寄存器信息等等。这极大地有助于开发者定位问题，本篇文章来分析下TombStone文件到底是如何生成的。

首先我们要知道，Native进程发生崩溃的时候，是因为接收到了Kernel发送的异常信号。这些信号是如何发送给当前进程并被处理的呢？在Android平台上Native进程通过fork创建，然后由exec族的函数将elf文件替换原本的程序段，加载对应的程序指令，接着通过linker32或者linker64去加载动态共享库，例如libc.so等等。而在Android平台中，linker会向kernel注册异常信号的处理函数，然后通过回调函数来处理异常信号，在回调的过程中就生成了TombStone文件，接下来看一下注册和回调的过程。

首先用objdump对linker64可执行程序进行反汇编，通过readelf找到elf文件执行的入口地址0x4cbb0，然后在反汇编中找到对应的地址就找到了对应的入口函数(类似于main函数)，linker中__dl__start就是入口函数。

aarch64-linux-android-readelf -h  linker64
ELF Header:
  ...
  Machine:                           AArch64
  Version:                           0x1
  Entry point address:               0x4cbb0
  Start of program headers:          64 (bytes into file)
  Start of section headers:          1440120 (bytes into file)
  ...

000000000004cbb0 <__dl__start>:
   4cbb0:   910003e0    mov x0, sp
   4cbb4:   9400cc07    bl  7fbd0 <__dl__linker_init>
   4cbb8:   d61f0000    br  x0
   4cbbc:   00000000    .inst   0x00000000 ; undefined

由于Android.bp中通过prefix_symbols: "__dl_"为linker中的符号统一添加了前缀"__dl_"，所以反汇编中的__dl__start对应源码中的_start，它就是入口函数。此处是用汇编写的，主要就是跳转到__linker_init去执行。

android/bionic/linker/arch/arm64/begin.S
#include <private/bionic_asm.h>

// Entry stub of the linker: passes the initial stack pointer to __linker_init
// and then branches to the address that call returns in x0.
ENTRY(_start)
  // Force unwinds to end in this function.
  .cfi_undefined x30

  // x0 carries the single argument of __linker_init: the current stack pointer.
  mov x0, sp
  bl __linker_init

  /* linker init returns the _entry address in the main image */
  br x0
END(_start)

// C entry point called from the _start assembly stub with the initial stack
// pointer as raw_args. Most of the body is elided in this excerpt; the visible
// tail delegates to __linker_init_post_relocation and returns its result, which
// the stub then branches to.
extern "C" ElfW(Addr) __linker_init(void* raw_args) {
     ...
     return __linker_init_post_relocation(args, tmp_linker_so);
}

// Second stage of linker start-up (the beginning of the body is elided in this
// excerpt). Registers the linker's own soinfo, runs linker_main, and returns
// the address that the assembly stub should jump to.
//
// args:          kernel argument block, forwarded to linker_main.
// tmp_linker_so: temporary soinfo for the linker itself, passed to get_libdl_info.
static ElfW(Addr) __attribute__((noinline))
__linker_init_post_relocation(KernelArgumentBlock& args, soinfo& tmp_linker_so) {

  ...
  // The soinfo returned for the linker becomes solinker, solist and sonext at
  // once, and is added to the default namespace.
  sonext = solist = solinker = get_libdl_info(kLinkerPath, tmp_linker_so);
  g_default_namespace.add_soinfo(solinker);
  init_link_map_head(*solinker, kLinkerPath);

  // linker_main yields the start address of the program to run.
  // NOTE(review): exe_to_load is defined in the elided part of this function.
  ElfW(Addr) start_address = linker_main(args, exe_to_load);

  INFO("[ Jumping to _start (%p)... ]", reinterpret_cast<void*>(start_address));

  // Return the address that the calling assembly stub should jump to.
  return start_address;
}

从代码逻辑来看，__linker_init_post_relocation最后返回的是起始地址，而这个地址是由linker_main计算得到的。接下来看linker_main，函数中初始化了系统属性，并且调用debuggerd_init，同时传入了一组callback，但是此处并不是处理异常信号的回调。异常信号的注册是在debuggerd_init中完成的，其中设置了接收异常信号的回调函数debuggerd_signal_handler以及要接收哪些异常信号。

// Main body of the linker (heavily elided in this excerpt). The visible part
// initializes system properties and, on Android builds, registers the
// debuggerd signal handlers through debuggerd_init.
static ElfW(Addr) linker_main(KernelArgumentBlock& args, const char* exe_to_load) {
  ...
  // Initialize system properties
  __system_properties_init(); // may use 'environ'

  // Register the debuggerd signal handler.
#ifdef __ANDROID__
  // These callbacks are auxiliary hooks handed to debuggerd_init; they are not
  // the signal handler itself:
  //   get_abort_message - returns abort_msg from the libc shared globals.
  //   post_dump         - set to notify_gdb_of_libraries.
  debuggerd_callbacks_t callbacks = {
    .get_abort_message = []() {
      return __libc_shared_globals()->abort_msg;
    },
    .post_dump = &notify_gdb_of_libraries,
  };
  debuggerd_init(&callbacks);
#endif
  ...
}  

// Stores the optional callbacks and installs debuggerd_signal_handler for the
// signals listed in debuggerd_register_handlers. Part of the body is elided in
// this excerpt.
//
// callbacks: may be null; when non-null its contents are copied into g_callbacks.
void debuggerd_init(debuggerd_callbacks_t* callbacks) {
  if (callbacks) {
    g_callbacks = *callbacks;    // Keep a copy of the caller's callbacks.
  }
  ...
  struct sigaction action;
  memset(&action, 0, sizeof(action));
  // A full sa_mask blocks all other signals while the handler runs.
  sigfillset(&action.sa_mask);
  action.sa_sigaction = debuggerd_signal_handler;  // Handler invoked for the registered signals.
  // SA_SIGINFO selects the three-argument sa_sigaction form of the handler.
  action.sa_flags = SA_RESTART | SA_SIGINFO;

  // Use the alternate signal stack if available so we can catch stack overflows.
  action.sa_flags |= SA_ONSTACK;   // Add SA_ONSTACK to the flags set above.
  debuggerd_register_handlers(&action);
}

// Installs `action` as the disposition of every signal handled here: the fatal
// signals below plus DEBUGGER_SIGNAL. SIGSTKFLT is included only on platforms
// that define it. Previous dispositions are not saved.
static void __attribute__((__unused__)) debuggerd_register_handlers(struct sigaction* action) {
  const int handled_signals[] = {
      SIGABRT,
      SIGBUS,
      SIGFPE,
      SIGILL,
      SIGSEGV,
#if defined(SIGSTKFLT)
      SIGSTKFLT,
#endif
      SIGSYS,
      SIGTRAP,
      DEBUGGER_SIGNAL,
  };

  // Registration order matches the order of the table above.
  for (const int signo : handled_signals) {
    sigaction(signo, action, nullptr);
  }
}

当kernel中有signal产生时，会判断用户进程是否注册了处理该信号的函数，如果没有则执行kernel默认的处理动作。此处linker中注册了debuggerd_signal_handler来处理kernel产生的异常信号。

// Signal handler installed by debuggerd_init.
// It hands the crash dump off to a separate process: a helper thread is cloned
// here and, per the notes below, that thread launches crash_dump, which
// performs the actual dump. Several parts of the body are elided in this excerpt.
//
// signal_number: number of the delivered signal.
// info:          siginfo for the signal; if null, a placeholder is built below.
// context:       opaque context pointer, forwarded as thread_info.ucontext.
static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* context) {
  ...
  struct siginfo dummy_info = {};
  if (!info) {  // No siginfo supplied: build a placeholder that marks the signal as user-sent (SI_USER) from this pid/uid.
    memset(&dummy_info, 0, sizeof(dummy_info));
    dummy_info.si_signo = signal_number;
    dummy_info.si_code = SI_USER;
    dummy_info.si_pid = __getpid();
    dummy_info.si_uid = getuid();
    info = &dummy_info;
  } 
  ...
  // Take a mutex so that multiple threads do not handle a signal at the same time.
  int ret = pthread_mutex_lock(&crash_mutex);
  if (ret != 0) {
    // Locking failed: log the error and leave the handler without dumping.
    async_safe_format_log(ANDROID_LOG_INFO, "libc", "pthread_mutex_lock failed: %s", strerror(ret));
    return;
  }
  // Log a summary first, so that some information is recorded even if debuggerd cannot handle the crash.
  log_signal_summary(info); 

  // Everything the helper thread needs to know about the crashing thread.
  // pseudothread_tid starts at -1; its address is handed to clone() below.
  debugger_thread_info thread_info = {
      .pseudothread_tid = -1,
      .crashing_tid = __gettid(),
      .siginfo = info,
      .ucontext = context,
      .abort_msg = reinterpret_cast<uintptr_t>(abort_message),
      .fdsan_table = reinterpret_cast<uintptr_t>(android_fdsan_get_fd_table()),
  };

  // Set PR_SET_DUMPABLE to 1, so that crash_dump can ptrace us.
  int orig_dumpable = prctl(PR_GET_DUMPABLE);
  if (prctl(PR_SET_DUMPABLE, 1) != 0) {
    fatal_errno("failed to set dumpable");
  }
  ...
  // The debuggerd_dispatch_pseudothread thread forks the crash_dump process that does the work.
  // A different crash_dump binary is used depending on the instruction set:
  //#if defined(__LP64__)
  //#define CRASH_DUMP_NAME "crash_dump64"
  //#else
  //#define CRASH_DUMP_NAME "crash_dump32"
  //#endif
  // CLONE_THREAD | CLONE_SIGHAND | CLONE_VM: the new task shares this process's
  // thread group, signal handlers and address space, and runs on pseudothread_stack.
  pid_t child_pid =
    clone(debuggerd_dispatch_pseudothread, pseudothread_stack,
          CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
          &thread_info, nullptr, nullptr, &thread_info.pseudothread_tid);
  ...
}

接下来看下crash_dump怎么存储crash信息；首先获取crash进程中每个线程的信息；

    // Collect a ThreadInfo for every thread of the target process and store it
    // in thread_info, keyed by tid.
    for (pid_t thread : threads) {
      // The pseudothread is skipped and never recorded.
      if (thread == pseudothread_tid) {
        continue;
      }

      // Failing to seize the crashing thread is logged as FATAL; for any other
      // thread only a warning is logged and processing of that thread goes on.
      if (!ptrace_seize_thread(target_proc_fd, thread, &error)) {
        bool fatal = thread == g_target_thread;
        LOG(fatal ? FATAL : WARNING) << error;
      }

      ThreadInfo info;
      info.pid = target_process;
      info.tid = thread;
      info.uid = getuid();
      info.process_name = process_name;
      info.thread_name = get_thread_name(thread);

      // If the thread cannot be interrupted, detach from it and leave it out.
      if (!ptrace_interrupt(thread, &info.signo)) {
        PLOG(WARNING) << "failed to ptrace interrupt thread " << thread;
        ptrace(PTRACE_DETACH, thread, 0, 0);
        continue;
      }

      if (thread == g_target_thread) {
        // Read the thread's registers along with the rest of the crash info out of the pipe.
        ReadCrashInfo(input_pipe, &siginfo, &info.registers, &abort_msg_address,
                      &fdsan_table_address);
        info.siginfo = &siginfo;
        // The signal number from the received siginfo replaces the one set by ptrace_interrupt.
        info.signo = info.siginfo->si_signo;
      } else {
        // Other threads: fetch registers remotely; on failure detach and skip the thread.
        info.registers.reset(unwindstack::Regs::RemoteGet(thread));
        if (!info.registers) {
          PLOG(WARNING) << "failed to fetch registers for thread " << thread;
          ptrace(PTRACE_DETACH, thread, 0, 0);
          continue;
        }
      }

      thread_info[thread] = std::move(info);
    }

下面连接到tombstoned守护进程，从它那里获取输出文件的fd，然后向该fd中dump backtrace或者dump tombstone，具体做哪一种处理由后面的si_val变量来判断，其中unwinder会获取进程的maps映射信息，列出调用库在进程内所处的地址范围。

  // Connect to tombstoned to obtain the output fd for this dump. The result of
  // the connection attempt is kept in g_tombstoned_connected; the socket and
  // output fd are returned through g_tombstoned_socket and g_output_fd.
  {
    ATRACE_NAME("tombstoned_connect");
    LOG(INFO) << "obtaining output fd from tombstoned, type: " << dump_type;
    g_tombstoned_connected =tombstoned_connect(g_target_thread,
             &g_tombstoned_socket, &g_output_fd, dump_type);
  }

  // For a non-fatal signal the integer carried in si_value selects the dump
  // kind: 0 clears `backtrace`, 1 sets it. Any other value only logs a warning
  // and leaves `backtrace` unchanged.
  if (!fatal_signal) {
    int si_val = siginfo.si_value.sival_int;
    if (si_val == 0) {
      backtrace = false;
    } else if (si_val == 1) {
      backtrace = true;
    } else {
      LOG(WARNING) << "unknown si_value value " << si_val;
    }
  }
  // Write the requested output to g_output_fd: either a backtrace only, or a
  // tombstone. Ownership of g_output_fd is moved into whichever dump call runs.
  if (backtrace) {
    ATRACE_NAME("dump_backtrace");
    dump_backtrace(std::move(g_output_fd), &unwinder, thread_info, g_target_thread);
  } else {
    // Tombstone path: fill open_files from the fdsan table in the target's
    // memory first, because engrave_tombstone receives open_files.
    {
      ATRACE_NAME("fdsan table dump");
      populate_fdsan_table(&open_files, unwinder.GetProcessMemory(), fdsan_table_address);
    }

    {
      ATRACE_NAME("engrave_tombstone");
      engrave_tombstone(std::move(g_output_fd), &unwinder, thread_info, g_target_thread,
                        abort_msg_address, &open_files, &amfd_data);
    }
  }

上面的流程结束后就会在/data/tombstones/下生成对应的tombstone文件，当然文件的数量是有上限的，达到上限后新产生的会覆盖最老的文件。以上就是Native crash产生的整个原理以及流程，其中部分函数并没有深入展开，需要读者自行阅读/android/system/core/debuggerd/目录下的源码。