初始化
Watchdog作为一个独立的线程在SystemServer进程中被初始化:
/**
 * Bootstrap-service start-up; only the Watchdog step is shown in this excerpt.
 *
 * @param t trace helper used to bracket the "StartWatchdog" section
 */
private void startBootstrapServices(@NonNull TimingsTraceAndSlog t) {
    // Bring the watchdog up as early as possible, so that a deadlock during
    // early boot can crash the system server.
    t.traceBegin("StartWatchdog");
    Watchdog.getInstance().start();
    t.traceEnd();
}
Watchdog类在定义时没有实现Runnable接口,但它实现了run()方法;成员变量 private final Thread mThread; 在构造器中通过方法引用 this::run 初始化,调用watchdog.start();后开始执行此线程。
构造器中添加了"foreground thread"、"main thread"、"ui thread"、"i/o thread"、"display thread"、"animation thread"、"surface animation thread"等HandlerChecker,并通过addMonitor()注册了BinderThreadMonitor。
private Watchdog() {
mThread = new Thread(this::run, "watchdog");
// Initialize handler checkers for each common thread we want to check. Note
// that we are not currently checking the background thread, since it can
// potentially hold longer running operations with no guarantees about the timeliness
// of operations there.
// The shared foreground thread is the main checker. It is where we
// will also dispatch monitor checks and do other work.
mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
"foreground thread", DEFAULT_TIMEOUT);
mHandlerCheckers.add(mMonitorChecker);
// Add checker for main thread. We only do a quick check since there
// can be UI running on the thread.
mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
"main thread", DEFAULT_TIMEOUT));
// Add checker for shared UI thread.
mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
"ui thread", DEFAULT_TIMEOUT));
// And also check IO thread.
mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
"i/o thread", DEFAULT_TIMEOUT));
// And the display thread.
mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
"display thread", DEFAULT_TIMEOUT));
// And the animation thread.
mHandlerCheckers.add(new HandlerChecker(AnimationThread.getHandler(),
"animation thread", DEFAULT_TIMEOUT));
// And the surface animation thread.
mHandlerCheckers.add(new HandlerChecker(SurfaceAnimationThread.getHandler(),
"surface animation thread", DEFAULT_TIMEOUT));
// Initialize monitor for Binder threads.
addMonitor(new BinderThreadMonitor());
mInterestingJavaPids.add(Process.myPid());
// See the notes on DEFAULT_TIMEOUT.
assert DB ||
DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
mTraceErrorLogger = new TraceErrorLogger();
}
AMS, PKMS, WMS等会在初始化时(构造器或onInitReady()方法中)通过addMonitor()/addThread()将自己注册到Watchdog中:
public ActivityManagerService() {
Watchdog.getInstance().addMonitor(this);
Watchdog.getInstance().addThread(mHandler);
}
public PackageManagerService() {
Watchdog.getInstance().addThread(mHandler, WATCHDOG_TIMEOUT);
}
public WindowManagerService{
public void onInitReady() {
// Add ourself to the Watchdog monitors.
Watchdog.getInstance().addMonitor(this);
}
}
工作流程
Watchdog作为单独执行的线程,在run()方法中循环监测所有HandlerChecker的状态,导出异常进程的运行日志,必要时给当前进程(system_server)发送signal 9,杀掉此进程。
public void run{
while (true) {
synchronized (mLock) {
//1. 遍历所有HandlerChecker
for (int i=0; i<mHandlerCheckers.size(); i++) {
HandlerChecker hc = mHandlerCheckers.get(i);
hc.scheduleCheckLocked();
}
//2. mLock.wait(timeout);使当前线程处于等待状态,等待时间为timeout = CHECK_INTERVAL: 30s
// NOTE: We use uptimeMillis() here because we do not want to increment the time we
// wait while asleep. If the device is asleep then the thing that we are waiting
// to timeout on is asleep as well and won't have a chance to run, causing a false
// positive on when to kill things.
long start = SystemClock.uptimeMillis();
while (timeout > 0) {
Log.d(TAG, "run: timeout = " + timeout);
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
mLock.wait(timeout);
// Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
}
//3. 监测HandlerChecker的完成状态
final int waitState = evaluateCheckerCompletionLocked();
if (waitState == COMPLETED) {
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
continue;
} else if (waitState == WAITED_HALF) {
if (!waitedHalf) {
waitedHalf = true;
pids = new ArrayList<>(mInterestingJavaPids);
doWaitedHalfDump = true;
} else {
continue;
}
} else {
// 存在超时的 HandlerChecker !!!
// something is overdue!
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
pids = new ArrayList<>(mInterestingJavaPids);
}
} // END synchronized (mLock)
// 4. 导出异常日志 ANR:/data/anr
final File finalStack = ActivityManagerService.dumpStackTraces(
pids, processCpuTracker, new SparseArray<>(), nativePids,
tracesFileException, subject);
// 5. 导出异常日志 dropbox:/data/system/dropbox/
Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
public void run() {
if (mActivity != null) {
mActivity.addErrorToDropBox(
"watchdog", null, "system_server", null, null, null,
null, report.toString(), finalStack, null, null, null,
errorId);
}
}
};
dropboxThread.start();
try {
dropboxThread.join(2000); // wait up to 2 seconds for it to return.
} catch (InterruptedException ignored) {
}
// 6. 导出异常日志到 kernel log后关机(trigger kernel panic), 通过/proc/sysrq-trigger触发
if (crashOnWatchdog) {
// Trigger the kernel to dump all blocked threads, and backtraces
// on all CPUs to the kernel log
Slog.e(TAG, "Triggering SysRq for system_server watchdog");
doSysRq('w');
doSysRq('l');
// wait until the above blocked threads be dumped into kernel log
SystemClock.sleep(3000);
doSysRq('c');
}
// 7. 向ActivityController汇报当前状态
IActivityController controller;
if (controller != null) {
Slog.i(TAG, "Reporting stuck state to activity controller");
try {
Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
// 1 = keep waiting, -1 = kill system
int res = controller.systemNotResponding(subject);
if (res >= 0) {
Slog.i(TAG, "Activity controller requested to coninue to wait");
waitedHalf = false;
continue;
}
} catch (RemoteException e) {
}
}
// 8. 判断是否需要杀掉当前进程(system_server进程) Process.killProcess(Process.myPid())
// Only kill the process if the debugger is not attached.
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
Slog.w(TAG, "*** GOODBYE!");
if(SmartTraceUtils.isPerfettoDumpEnabled() && dueTime > SystemClock.uptimeMillis()){
long timeDelta = dueTime - SystemClock.uptimeMillis();
// wait until perfetto log to be dumped completely
Slog.i(TAG,"Sleep "+ timeDelta
+" ms to make sure perfetto log to be dumped completely");
SystemClock.sleep(timeDelta);
}
if (!Build.IS_USER && isCrashLoopFound()
&& !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
breakCrashLoop();
}
Process.killProcess(Process.myPid());
System.exit(10);
}
waitedHalf = false;
}
}
检测机制
Watchdog在初始化时将一些重要线程添加到HandlerChecker列表中,通过HandlerChecker对各个监测对象进行监测。
HandlerChecker大致可以分为两类:
- Monitor Checker,用于检查Monitor对象可能发生的死锁,它运行在FgThread上,AMS, WMS等核心的系统服务都是Monitor对象。
- Looper Checker,用于检查线程的消息队列是否长时间处于工作状态。Watchdog自身的消息队列,Ui, Io, Display这些全局的消息队列都是被检查的对象。此外,一些重要的线程的消息队列,也会加入到Looper Checker中,譬如AMS, PKMS,这些是在对应的对象初始化时加入的。
/**
 * Registers a monitor with {@code mMonitorChecker}.
 *
 * @param monitor the monitor to add
 */
public void addMonitor(Monitor monitor) {
    // The checker's "Locked" method is called with mLock held.
    synchronized (mLock) {
        mMonitorChecker.addMonitorLocked(monitor);
    }
}
/**
 * Registers a handler thread to be watched, using {@code DEFAULT_TIMEOUT}.
 *
 * @param thread handler of the thread to watch
 */
public void addThread(Handler thread) {
    // Delegate to the two-argument overload with the default timeout.
    addThread(thread, DEFAULT_TIMEOUT);
}
HandlerChecker是Watchdog的内部类,实现了Runnable接口。
从上面Watchdog的工作流程中可以看到,Watchdog主要通过HandlerChecker的scheduleCheckLocked()方法监测线程状态。
scheduleCheckLocked()方法一开始(在上一轮检查已完成,即mCompleted为true时)会把mMonitorQueue中新注册的Monitor合并到成员变量mMonitors中,mMonitors变量包含了该HandlerChecker上所有的Monitor对象,如上文说的AMS, WMS等。
下面主要关注scheduleCheckLocked()方法中的两行代码:
- 通过*mHandler.getLooper().getQueue().isPolling()*方法判断Looper对象是否依然活跃而不是卡住。对于Looper Checker而言,会判断线程的消息队列是否处于空闲状态。如果被监测的消息队列一直闲不下来,则说明可能已经阻塞等待了很长时间。
- mHandler.postAtFrontOfQueue(this); 将HandlerChecker对象自身插入到消息队列的最前面,优先运行。mHandler.postAtFrontOfQueue(Runnable r)的参数为Runnable对象,因此HandlerChecker类中实现的run()方法会在监测对象mHandler所在的线程中执行,并依次调用各Monitor实现的monitor()方法。monitor()的实现一般很简单,就是获取当前类的对象锁;如果该对象锁已经被其他线程持有,则monitor()会一直阻塞,直到超时,这种情况下,很可能是线程发生了死锁。
/**
 * Checker bound to one handler: it posts itself to the front of that handler's queue
 * and, when executed there, calls every registered monitor in turn.
 * Fields (mHandler, mMonitors, mMonitorQueue, mCompleted, ...) are not shown in this
 * excerpt.
 */
public final class HandlerChecker implements Runnable {
    /**
     * Starts a new check on the watched handler, unless the check can be skipped
     * (paused, or no monitors and the looper is polling) or one is still in flight.
     */
    public void scheduleCheckLocked() {
        if (mCompleted) {
            // No check is executing on the handler right now, so it is safe to move
            // the queued monitors into the active list.
            mMonitors.addAll(mMonitorQueue);
            mMonitorQueue.clear();
        }

        // Skip condition 1: checking is paused - do not schedule until after resume.
        final boolean paused = mPauseCount > 0;
        // Skip condition 2: there are no monitors to run and the target looper has
        // recently been polling. That is as good as proof it is not deadlocked, and
        // skipping avoids a context switch just to check the thread. With monitors
        // present this shortcut does not apply, because they would need to execute.
        final boolean idleWithoutMonitors =
                mMonitors.isEmpty() && mHandler.getLooper().getQueue().isPolling();
        if (idleWithoutMonitors || paused) {
            mCompleted = true;
            return;
        }

        if (!mCompleted) {
            // A previously posted check has not finished yet; nothing to do.
            return;
        }

        // Arm a fresh check and jump the handler's queue with it.
        mCompleted = false;
        mCurrentMonitor = null;
        mStartTime = SystemClock.uptimeMillis();
        mHandler.postAtFrontOfQueue(this);
    }

    /**
     * Calls each monitor once and then marks the check as completed.
     *
     * <p>mMonitors does not change while this method executes: addMonitorLocked first
     * puts new monitors into mMonitorQueue, and they are only moved to mMonitors on
     * the next schedule when mCompleted is true, i.e. after this method has finished.
     */
    @Override
    public void run() {
        final int monitorCount = mMonitors.size();
        for (int idx = 0; idx < monitorCount; idx++) {
            // Publish the monitor about to be called while holding mLock ...
            synchronized (mLock) {
                mCurrentMonitor = mMonitors.get(idx);
            }
            // ... but invoke it outside mLock.
            mCurrentMonitor.monitor();
        }

        synchronized (mLock) {
            mCompleted = true;
            mCurrentMonitor = null;
        }
    }
}
Monitor
Monitor是Watchdog的内部接口:
/** Excerpt of Watchdog showing only its nested Monitor interface. */
public class Watchdog {
    /** Callback implemented by objects that want to be checked by the watchdog. */
    public interface Monitor {
        /** Invoked by a HandlerChecker when it executes on its handler's thread. */
        void monitor();
    }
}
AMS的monitor()实现:
/** Excerpt of ActivityManagerService showing only its Watchdog.Monitor implementation. */
public class ActivityManagerService extends IActivityManager.Stub
        implements Watchdog.Monitor, BatteryStatsImpl.BatteryCallback, ActivityManagerGlobalLock {
    /**
     * Watchdog probe: acquires and immediately releases this object's lock, to make
     * sure that we have not deadlocked.
     */
    public void monitor() {
        // Intentionally empty: being able to enter the block is the whole check.
        synchronized (this) {
        }
    }
}