Monitor算法是温控的常见算法之一,在main函数中调用了thermal_monitor函数。
1. thermal_monitor函数
thermal_monitor函数先是从dev_list中获取了各个device_info放入device_info_arr中,然后过滤setting放入tm_states的setting中,然后执行了sensors_setup,并创建了一个thread执行函数sensor_monitor来监控是否触发monitor算法。
void thermal_monitor(struct thermal_setting_t *settings) { struct setting_info *cfg_setting; union device_request req; /* Build Device Info List */ if (devices_manager_get_list(NULL, &device_info_arr_len)) {//先获取dev_list的长度 msg("Failed to get device list length\n"); return; } device_info_arr = (struct device_info *)//根据device的长度 malloc malloc(sizeof(struct device_info)*device_info_arr_len); if (device_info_arr == NULL) { msg("Failed to alloc device_info_arr\n"); return; } if (devices_manager_get_list(device_info_arr,//得到device_info &device_info_arr_len)) { msg("Failed to get device list\n"); free(device_info_arr); return; } cfg_setting = settings->list; while (cfg_setting && (tm_cnt < MAX_TM_INSTANCES_SUPPORTED)) { if ((cfg_setting->algo_type != MONITOR_ALGO_TYPE) ||//不是Monitor continue (cfg_setting->err_disable != 0)) {//坏的setting continue cfg_setting = cfg_setting->next; continue; } dbgmsg("%s: Import %s", __func__, cfg_setting->desc); tm_states[tm_cnt].setting = cfg_setting;//放入tm_states数组的setting tm_states[tm_cnt].disable = cfg_setting->disable; tm_cnt++; if (!cfg_setting->disable)//没有disable 打印 print_setting(cfg_setting); /* KEEP at end of while block */ cfg_setting = cfg_setting->next; } if (!sensors_setup()) {// msg("Failed to setup at least one sensor for monitoring\n"); return; } /* Vote to keep kernel mitigation enabled until thermal monitor has processed initial thresholds. */ kernel_dev = devices_manager_reg_clnt("kernel"); if (kernel_dev == NULL) { msg("%s Failed to create kernel device handle\n", __func__); return; } req.value = 1; device_clnt_request(kernel_dev, &req);//继续keep KTM if (pthread_create(&tm_thread, NULL, (void *)&sensor_monitor,//sensor_monitor监控 (void *)NULL) != 0) { msg("Error initializing thermal monitor\n"); device_clnt_cancel_request(kernel_dev);//失败就取消之前keep KTM的命令 } }
我们来看下devices_manager_get_list 函数,配合上面thermal_monitor函数,第一次传进来的info_arr为NULL,因此获取了device的长度,第二次再从dev_list中获取dev_info放入dev_info_arr.
int devices_manager_get_list(struct device_info *info_arr, uint32_t *info_arr_len) { uint32_t dev_idx; struct devices_manager_dev *curr = dev_list; if ((info_arr == NULL) && (info_arr_len == NULL)) { msg("%s: Invalid args.\n", __func__); return -(EINVAL); } if (info_arr == NULL) {//第一次获取长度 /* Interpret as request for number of dev's present. */ *info_arr_len = dev_cnt; return 0; } /* Don't exceed end of info_array */ *info_arr_len = MIN(*info_arr_len, dev_cnt); for (dev_idx = 0; (dev_idx < *info_arr_len) && (curr != NULL); dev_idx++) {//第二次遍历dev_list获取dev_info memcpy(&(info_arr[dev_idx]), &(curr->dev_info), sizeof(struct device_info)); curr = curr->next_dev; } return 0; }
2. sensors_setup
下面我们再来看sensors_setup函数,这个函数我们需要结合monitor算法实际的配置来看。
static int sensors_setup(void) { uint32_t i = 0; int sensor_count = 0; if (!tm_cnt) return 0; /* Set up tm instances */ dbgmsg("%s: tm_cnt %d", __func__, tm_cnt); for (i = 0; i < tm_cnt; i++) {//遍历所有的tm_states struct tm_instance_info *tm_instance_info; struct setting_info *setting; struct tm_setting *tm_setting_info; tm_instance_info = &tm_states[i]; setting = tm_instance_info->setting; tm_setting_info = &(setting->data.tm); dbgmsg("%s: TM Id %s Sensor %s num_thresholds %d", __func__, setting->desc, tm_setting_info->sensor, tm_setting_info->num_thresholds); if (tm_setting_info->num_thresholds > 0) {//结合下面num_thresholds为3 /* Create sensor client */ tm_instance_info->ts_clnt = sensors_manager_reg_clnt(tm_setting_info->sensor);//结合下面看我们的sensor是tsens_tz_sensor1 if (tm_instance_info->ts_clnt == NULL) {//上面是创建一个sensor的client msg("%s: Can't create client for %s.\n", __func__, tm_setting_info->sensor); tm_instance_info->disable = 1; continue; } /* Create necessary device clients */ if (create_device_clnts(tm_instance_info) == 0)//这里处理每一个action sensor_count++; else tm_instance_info->disable = 1; } } return sensor_count; }
下面是一个8909的Monitor算法的一部分:
{ .desc = "CAMERA_CAMCORDER_MONITOR", .algo_type = MONITOR_ALGO_TYPE, .data.tm = { .sensor = "tsens_tz_sensor1", .sampling_period_ms = 250, .num_thresholds = 3,//t的个数 ._n_thresholds = 3, ._n_to_clear = 3, ._n_actions = 3, ._n_action_info = 3, .t[0] = { .lvl_trig = 80000, .lvl_clr = 75000, .num_actions = 2,//action的个数 .actions[0] = { .device = "camera", .info = 1, }, .actions[1] = { .device = "camcorder", .info = 1, }, }, .t[1] = { .lvl_trig = 85000, .lvl_clr = 80000, .num_actions = 2, .actions[0] = { .device = "camera", .info = 2, }, .actions[1] = { .device = "camcorder", .info = 2, }, }, .t[2] = { .lvl_trig = 88000, .lvl_clr = 85000, .num_actions = 2, .actions[0] = { .device = "camera", .info = 10, }, .actions[1] = { .device = "camcorder", .info = 10, }, } }, },
sensors_manager_reg_clnt函数就是为sensor创建一个client
sensor_clnt_handle sensors_manager_reg_clnt(const char *sensor_name) { struct sensor_client_type *client = NULL; struct sensors_mgr_sensor_info *sensor_mgr = NULL; if (sensor_name == NULL) { msg("%s: Invalid args.\n", __func__); return client; } sensor_mgr = find_sensor(sensor_name);//从sensor_list中找到sensor if (sensor_mgr == NULL) { msg("%s: Invalid sensor %s.\n", __func__, sensor_name); return client; } client = malloc(sizeof(struct sensor_client_type)); if (client == NULL) { msg("%s: Alloc. failed for %s.\n", __func__, sensor_name); return client; } memset(client, 0x0, sizeof(struct sensor_client_type)); THERM_MUTEX_LOCK(&ts_clnt_mtx); /* Insert the client */ client->sensor_mgr = sensor_mgr; client->next_clnt = sensor_mgr->client_list; sensor_mgr->client_list = client; THERM_MUTEX_UNLOCK(&ts_clnt_mtx); return client; }
我们再来看下create_device_clnts函数
static int create_device_clnts(struct tm_instance_info *tm_instance_info) { int ret_val = 0; uint32_t t_idx, a_idx; struct tm_devices_list list;//创建一个list struct tm_setting *tm_setting_info = &tm_instance_info->setting->data.tm; memset(&list, 0x0, sizeof(struct tm_devices_list)); /* Create list of unique actions */ for (t_idx = 0; t_idx < tm_setting_info->num_thresholds; t_idx++) {//先遍历num_therosholds就是t的个数 for (a_idx = 0; a_idx < tm_setting_info->t[t_idx].num_actions;//再遍历每个t下面的num_actions就是action的个数 a_idx++) { /* Index used by tm to make requests on correct device client */ tm_setting_info->t[t_idx].actions[a_idx].device_idx = add_device_to_list(tm_instance_info, &list, tm_setting_info->t[t_idx].actions[a_idx].device); if (tm_setting_info->t[t_idx].actions[a_idx].device_idx < 0) { msg("%s: Error adding device %s\n", __func__, tm_setting_info->t[t_idx].actions[a_idx].device); ret_val = -(EFAULT); goto error_handler; } } } error_handler: return ret_val; }我们再来看add_device_to_list函数就是为每个action的device得到deviceinfo放到tm_instance_info的dev_info_list,并且创建一个client放在dev_cln_list,并且把索引保存在每个action的device_idx中。
static int add_device_to_list(struct tm_instance_info *tm_instance_info, struct tm_devices_list *list, const char *device) { uint32_t i; /* Search for match or first available slot. */ for (i = 0; i < MAX_ACTIONS_PER_TM_INSTANCE; i++) { /* Add to first empty entry, if no previous match. */ if (list->device[i] == NULL) break; if (strncasecmp(list->device[i], device, DEVICES_MAX_NAME_LEN) == 0) break; } if (i >= MAX_ACTIONS_PER_TM_INSTANCE) { msg("%s: No room for device %s", __func__, device); return -1; } /* Check if we need to create the device client */ if (list->device[i] == NULL) { tm_instance_info->dev_info_list[i] = get_device_info(device);//每一个tm_instace_info找到每个action的deviceinfo if (tm_instance_info->dev_info_list[i] == NULL) return -1; tm_instance_info->dev_clnt_list[i] =//为该device创建一个client devices_manager_reg_clnt(device); if (tm_instance_info->dev_clnt_list[i] == NULL) return -1; list->device[i] = (char*)device; list->cnt++; } return (int)i;//记录该action下的device在tm_instace_info的dev_info_list的索引 }
3. Monitor监控
我们在thermal_monitor中创建了一个thread执行sensor_monitor来监控,这里我们来看下这个函数,这里主要就是循环调用handle_thresh_sig来监控温度。
static void *sensor_monitor(void *data) { uint32_t idx; for (idx = 0; idx < tm_cnt; idx++) THRESH_MASK_SET(idx); /* Set initial thresholds */ handle_thresh_sig();//第一个调用 /* Vote okay to disable kernel mitigation */ device_clnt_cancel_request(kernel_dev); thermal_server_register_client_req_handler("override", override_notify, NULL);//之前分析为client注册回调 thermal_server_register_client_req_handler(CONFIG_QUERY_CLIENT, config_query_notify, NULL); thermal_server_register_client_req_handler(CONFIG_SET_CLIENT, config_set_notify, NULL); while (exit_daemon != 1) { dbgmsg("%s: Wait for EV", __func__); pthread_mutex_lock(&wait_mutex); if (!THRESH_MASK_ANY_SET) { pthread_cond_wait(&wait_cond, &wait_mutex); } pthread_mutex_unlock(&wait_mutex); dbgmsg("%s: Thresh EVT", __func__); handle_thresh_sig();//循环调用handle_thresh_sig来监控sensor温度是否满足monitor算法 } free(device_info_arr); return NULL; }我们来看下handle_thresh_sig函数,先遍历所有的tm_instance_info,然后过滤器setting下面的data.tm num_thresholds小于1,以及disable的tm_instance_info,然后获取温度
static void handle_thresh_sig(void) { ...... struct tm_instance_info *sensor; struct setting_info *info; struct tm_setting *tm_info; union device_request req; /* Get temp and handle */ for (idx = 0; idx < tm_cnt; idx++) { if (THRESH_MASK_IS_SET(idx) == 0) { continue; } sensor = &tm_states[idx]; info = sensor->setting; tm_info = &(info->data.tm); if ((tm_info->num_thresholds < 1) || (sensor->disable)) {//过滤disable以及num_thresholds /* Unmask TM instance as handled */ THRESH_MASK_CLR(idx); continue; } max_thr = (int)tm_info->num_thresholds; sensor_temp = sensor_get_temperature(sensor);//获取温度 dbgmsg("%s: TM Id %s Sensor %s Temp %d\n", __func__, info->desc, tm_info->sensor, sensor_temp); lvl_max = -1; lvl_min = INT_MAX; for (i = max_thr - 1; i >= 0; i--) {//遍历每一个setting下的t数组 /* Scan for new alarm conditions */ if (sensor_threshold_trigger(sensor_temp, sensor, i)) {//是否触发阈值了 if (sensor->lvl_alarm[i] == 0) {//之前没有记录触发 thermalmsg(LOG_LVL_DBG, (LOG_LOGCAT | LOG_TRACE), "TM Id '%s' Sensor '%s' - alarm " " raised %d at %d.%d degC\n", info->desc, tm_info->sensor, i + 1, RCONV(sensor_temp), (sensor_temp%1000)/100); sensor->lvl_alarm[i] = 1;//记录 alarm_raised = 1;//需要触发 } if (i > lvl_max) lvl_max = i;//lvl_max就是记录触发的t数组的最后一个index } /* Scan for alarm clearing conditions */ if (sensor_threshold_clear(sensor_temp, sensor, i)) {//是否低于小的阈值了 if (sensor->lvl_alarm[i] == 1) {//之前记录触发了 thermalmsg(LOG_LVL_DBG, (LOG_LOGCAT | LOG_TRACE), "TM Id '%s' Sensor '%s' - alarm " "cleared %d at %d.%d degC\n", info->desc, tm_info->sensor, i + 1, RCONV(sensor_temp), (sensor_temp%1000)/100); sensor->lvl_alarm[i] = 0;//触发的记录清除 alarm_cleared = 1;//清除 } if (i < lvl_min) lvl_min = i;//清除的t数组最小的index } } /* Update temperature thresholds */ if (alarm_raised) { threshold_type = THRESHOLD_CROSS; threshold_level = lvl_max + 1; } else if (alarm_cleared) { threshold_type = THRESHOLD_CLEAR; threshold_level = lvl_min; } else { threshold_type = THRESHOLD_NOCHANGE; threshold_level = sensor->last_lvl; } sensor->last_lvl 
= threshold_level; pthread_mutex_lock(&wait_mutex); /* Unmask TM instance as handled */ THRESH_MASK_CLR(idx); pthread_mutex_unlock(&wait_mutex); sensor_update_thresholds(sensor, threshold_type, threshold_level, idx); if (!alarm_raised && !alarm_cleared) {//没有新的触发或者清除之前continue continue; } /* Perform actions on highest level alarm */ for (i = max_thr - 1; i >= 0; i--) {//从settings最大的开始 if (sensor->lvl_alarm[i] == 0)//满足触发条件 continue; for (j = 0; j < tm_info->t[i].num_actions; j++) { action_idx = tm_info->t[i].actions[j].device_idx;//获取每个setting的action的device index dev_info = sensor->dev_info_list[action_idx]; if (dev_info == NULL) continue; action_info = tm_info->t[i].actions[j].info;//调整的值 sensor->action_mask |= (1U << action_idx); req.value = action_info; switch(dev_info->dev_type) {//根据类型区分 ...... case DEVICE_OP_VALUE_TYPE: case DEVICE_GENERIC_TYPE: device_clnt_request(sensor->dev_clnt_list[action_idx], &req);//每一个device的client申请调整 break; case DEVICE_NONE_TYPE: break; default: msg("Unknown action %s\n", dev_info->name); } } break; }我们来看下获取温度的函数sensor_get_temperature函数,是通过sensors_manager_read_trip_temp函数,传进去的参数是每个tm_instance_info的sensor的client,每一个tm_instance_info都为其sensor创建一个client。
/*
 * Read the current temperature for one monitor instance through its sensor
 * client.
 *
 * Returns the value reported by sensors_manager_read_trip_temp(), or -EFAULT
 * when setting or its sensor client is NULL.
 */
static int sensor_get_temperature(struct tm_instance_info *setting)
{
    int reading;

    if (setting == NULL || setting->ts_clnt == NULL)
        return -EFAULT;

    reading = sensors_manager_read_trip_temp(setting->ts_clnt);
    dbgmsg("TM Id %s Sensor %s Reading %d\n", setting->setting->desc,
           setting->setting->data.tm.sensor, reading);

    return reading;
}
sensors_manager_read_trip_temp函数还是通过client的sensor来获取温度,sensors_manager_read函数还是通过sensor的get_temperature函数。
/*
 * Read the trip temperature of the sensor behind a client handle.
 *
 * If the sensor has no get_trip_temperature hook, the read is delegated to
 * sensors_manager_read().
 *
 * Returns the temperature, or INT32_MIN when clnt is NULL or fails
 * validate_clnt().
 */
int sensors_manager_read_trip_temp(sensor_clnt_handle clnt)
{
    struct sensor_client_type *client = clnt;
    struct sensors_mgr_sensor_info *sensor_mgr;
    int trip_temp;

    if (client == NULL) {
        msg("%s: Invalid args.\n", __func__);
        return INT32_MIN;
    }

    if (validate_clnt(client) != 0)
        return INT32_MIN;

    sensor_mgr = client->sensor_mgr;

    /* No dedicated trip-temperature hook: use the plain read path. */
    if (!sensor_mgr->get_trip_temperature)
        return sensors_manager_read(clnt);

    trip_temp = sensor_mgr->get_trip_temperature(sensor_mgr);
    thermalmsg(LOG_LVL_DBG, (LOG_LOGCAT | LOG_LOCAL_SOCKET | LOG_TRACE),
               "%s:%s:%d mC\n", SENSORS, sensor_mgr->name, trip_temp);

    return trip_temp;
}
我们再来看sensor_threshold_trigger函数,正常流程是走else的而且是没有override_mode的,因此只要温度大于等于lvl_trig就是触发了。
/*
 * Tell whether `value` trips threshold `level` of a monitor instance.
 *
 * With descending_thresh set, the threshold trips when value <= lvl_trig.
 * Otherwise it trips when value >= lvl_trig, with tm_info->override added to
 * the trigger while override_mode is set.
 *
 * Returns 1 when tripped, 0 otherwise.
 */
static int sensor_threshold_trigger(int value, struct tm_instance_info *sensor,
                                    int level)
{
    struct tm_setting *tm_info = &(sensor->setting->data.tm);
    int active_trig;

    if (tm_info->descending_thresh)
        return (value <= tm_info->t[level].lvl_trig) ? 1 : 0;

    active_trig = tm_info->t[level].lvl_trig;
    if (override_mode)
        active_trig += tm_info->override;

    return (value >= active_trig) ? 1 : 0;
}
还是分析handle_thresh_sig函数,触发之后最后还是调用了device_clnt_request,每一个setting下面action满足条件的device的client都会申请一个request,device_clnt_request这个函数我们在分析device初始化的时候分析过了。像device的type是DEVICE_OP_VALUE_TYPE类型的值我们会取所有client的最小值(比如cpu),像device的type是DEVICE_GENERIC_TYPE我们会取所有client的最大值,最终会到device的action函数来控制。
/*
 * Submit a client's request to its device.
 *
 * The request is dispatched on the device's dev_type to the matching
 * devices_manager_set_* helper; other types are only logged.
 *
 * Returns -(EINVAL) for NULL arguments, the validate_clnt() error if the
 * client is not valid, the helper's result for handled types, and 0 for an
 * unhandled dev_type.
 */
int device_clnt_request(device_clnt_handle clnt, union device_request *req)
{
    struct device_clnt *client = clnt;
    struct devices_manager_dev *dev_mgr;
    int status;

    if ((client == NULL) || (req == NULL)) {
        msg("%s: Invalid args.\n", __func__);
        return -(EINVAL);
    }

    status = validate_clnt(client);
    if (status != 0)
        return status;

    dev_mgr = client->dev_mgr;

    switch (dev_mgr->dev_info.dev_type) {
    case DEVICE_GENERIC_TYPE:
        status = devices_manager_set_lvl(dev_mgr, client, req->value);
        break;
    case DEVICE_OP_VALUE_TYPE:
        status = devices_manager_set_op_value(dev_mgr, client, req->value);
        break;
    case DEVICE_DIRECT_ACTION_TYPE:
        status = devices_manager_set_direct_action(dev_mgr, client, req);
        break;
    default:
        /* status is still 0 from validate_clnt() here. */
        dbgmsg("%s: Unhandled dev_type %d", __func__,
               dev_mgr->dev_info.dev_type);
        break;
    }

    return status;
}
比如我们拿gpu举例,会调用devices_manager_set_op_value函数
static int devices_manager_set_op_value(struct devices_manager_dev *dev_mgr, struct device_clnt *client, int dev_op_value) { uint32_t lvl_idx = 0; if (dev_op_value < 0) { msg("%s: Invalid args.\n", __func__); return -(EINVAL); } dev_mgr = client->dev_mgr; if (dev_mgr->dev_info.max_dev_op_value_valid == 0) { msg("%s: dev_op invalid.\n", __func__); return -(EFAULT); } dev_op_value = MIN(dev_op_value, dev_mgr->dev_info.max_dev_op_value);//去client和device的max_dev_op_value的最小值 if (dev_mgr->lvl_info && (dev_mgr->dev_info.num_of_levels > 0)) { /* Translate to dev_op_value to supported mitigation value */ for (lvl_idx = 0; lvl_idx < dev_mgr->dev_info.num_of_levels; lvl_idx++) { if (dev_mgr->lvl_info[lvl_idx].lvl.value <= dev_op_value) break; } if (lvl_idx >= dev_mgr->dev_info.num_of_levels) { /* Apply highest lvl of mitigation possible */ lvl_idx = dev_mgr->dev_info.num_of_levels - 1U; } dev_op_value = dev_mgr->lvl_info[lvl_idx].lvl.value;//找到gpu合适的一个档位(就是刚比client的值小) } pthread_mutex_lock(&clnt_mtx); client->request_active = 1; client->request.value = dev_op_value; pthread_mutex_unlock(&clnt_mtx); dbgmsg("%s: DEV %s, op_value %d\n", __func__, dev_mgr->dev_info.name, dev_op_value); update_dev_state(dev_mgr); return dev_mgr->active_req.value; }
update_dev_state函数
static int update_dev_state(struct devices_manager_dev *dev_mgr) { union device_request req; struct device_clnt *client = dev_mgr->client_list; if ((dev_mgr->dev_info.dev_type != DEVICE_GENERIC_TYPE) && (dev_mgr->dev_info.dev_type != DEVICE_OP_VALUE_TYPE)) return -(EFAULT); pthread_mutex_lock(&clnt_mtx); if (dev_mgr->dev_info.dev_type == DEVICE_GENERIC_TYPE) { /* Start from min level to find the highest existing client request */ req.value = dev_mgr->dev_info.min_lvl; /* Walk client list to find highest mitigation level */ while (client != NULL) { if (client->request_active)//这个类型找最大的 req.value = MAX(req.value, client->request.value); client = client->next_clnt; } } else if (dev_mgr->dev_info.dev_type == DEVICE_OP_VALUE_TYPE) { /* Start from max allowable value find lowest request */ req.value = dev_mgr->dev_info.max_dev_op_value; /* Walk client list to find highest mitigation level */ while (client != NULL) { if (client->request_active) req.value = MIN(req.value, client->request.value);//gpu这种找所有client的最小的 client = client->next_clnt; } } if (dev_mgr->active_req.value != req.value) { dev_mgr->active_req.value = req.value; if (dev_mgr->action) dev_mgr->action(dev_mgr);//执行device的action函数。 /* Notify clients */ client = dev_mgr->client_list; while (client != NULL) { if (client->cb_func != NULL) client->cb_func(client, &req,//调用client的回调 client->cb_usr_data); client = client->next_clnt; } } pthread_mutex_unlock(&clnt_mtx); return 0; }
我们继续分析handle_thresh_sig函数,当alarm_cleared而且当lvl_min为0,这个代表一个setting下面的t数组所有的触发都清除了,这个时候我们再调用clear_all_alarms函数。
if (alarm_cleared) { /* Handle alarm clearing cases */ if (lvl_min == 0) { dbgmsg("Clearing all alarms\n"); clear_all_alarms(sensor); }
clear_all_alarms是把该setting下所有device的client之前向device申请的request都取消。
static void clear_all_alarms(struct tm_instance_info *sensor) { uint32_t i; for (i = 0; i < MAX_ACTIONS_PER_TM_INSTANCE; i++) { if (sensor->dev_clnt_list[i] == NULL) continue; /* check if action may have been set */ if ((sensor->action_mask & (1U << i)) == 0) continue; switch(sensor->dev_info_list[i]->dev_type) { case DEVICE_GENERIC_TYPE: case DEVICE_OP_VALUE_TYPE: device_clnt_cancel_request(sensor->dev_clnt_list[i]); break; default: dbgmsg("%s: No clearing of action %s\n", __func__, sensor->dev_info_list[i]->name); } } sensor->action_mask = 0; }这样monitor基本分析完了。不过在分析handle_thresh_sig函数的时候我们还有一个重要的函数sensor_update_thresholds还没有分析,这里和sensor相关后续继续分析。