# 知识来源于网络,仅供交流使用,如有侵权请及时联系予以删除
# Basic logistic regression predicting NCAA tournament results from seed
# differences.
#
# Inputs (read from ../input/):
#   NCAATourneySeeds.csv          - Season, Seed, TeamID
#   NCAATourneyCompactResults.csv - Season, DayNum, WTeamID, WScore, LTeamID,
#                                   LScore, WLoc, NumOT
#   SampleSubmissionStage1.csv    - ID, Pred
# Output:
#   test.csv - one predicted probability per requested matchup.

# Load packages ----
library(tidyverse)
library(magrittr)
library(caret)

# Helpers ----

#' Attach both teams' seeds and the seed-based model features to a game table.
#'
#' @param games Data frame with columns `Season`, `Team1`, `Team2`.
#' @param team1_seeds Seed lookup with columns `Season`, `T1Seed`, `Team1ID`.
#' @param team2_seeds Seed lookup with columns `Season`, `T2Seed`, `Team2ID`.
#' @return `games` with `T1Seed`, `T2Seed`, `team1_seed_str`,
#'   `team2_seed_str` and `seed_diff` appended (in that order).
add_seed_features <- function(games, team1_seeds, team2_seeds) {
  games %>%
    left_join(team1_seeds, by = c("Season", "Team1" = "Team1ID")) %>%
    left_join(team2_seeds, by = c("Season", "Team2" = "Team2ID")) %>%
    mutate(
      # 1 when the team is seeded in the top half of its region (seed 1-8).
      team1_seed_str = if_else(T1Seed < 9, 1, 0),
      team2_seed_str = if_else(T2Seed < 9, 1, 0),
      seed_diff = T1Seed - T2Seed
    )
}

# Load data ----
seeds <- read_csv("../input/NCAATourneySeeds.csv")
results <- read_csv("../input/NCAATourneyCompactResults.csv")

# Inspect the structure of the data. Results are in winner/loser format and
# need to be rearranged to match the submission file.
head(seeds)
head(results)

# Clean seeds ----
# Strip the region letter (and any play-in suffix) so only the numeric seed
# remains. str_extract() returns a character vector, so as.integer() is not
# asked to coerce a list as it was with str_extract_all().
seeds$Seed <- as.integer(str_extract(seeds$Seed, "[0-9]+"))
head(seeds) # check that the regional characters are removed

# Recode results ----
# Keep only the needed columns, then order each matchup so that Team1 is the
# lower team id (the submission-file convention); `result` is 1 when Team1 won.
results %<>%
  select(Season, WTeamID, LTeamID) %>%
  mutate(
    Team1 = pmin(WTeamID, LTeamID),
    Team2 = pmax(WTeamID, LTeamID),
    result = if_else(WTeamID == Team1, 1, 0)
  ) %>%
  select(Season, Team1, Team2, result)

# The original script only printed the offending rows; fail loudly instead.
stopifnot(all(results$Team1 < results$Team2))

# Split into training and test sets ----
# Seasons before 2014 form the training set. A plain pipe is used so that
# `results` is not overwritten as a side effect of creating `train`.
train <- results %>% filter(Season < 2014)
test <- read_csv("../input/SampleSubmissionStage1.csv")

# Build the training set ----
# Seed lookups keyed for the Team1 and Team2 joins, selected by name rather
# than by column position.
team1_seeds <- seeds %>% select(Season, T1Seed = Seed, Team1ID = TeamID)
team2_seeds <- seeds %>% select(Season, T2Seed = Seed, Team2ID = TeamID)

train <- add_seed_features(train, team1_seeds, team2_seeds)
head(train)

# Fit model ----
# `caret::` is spelled out because the data frame above is also named `train`.
# `result` stays numeric, so predict() returns fitted probabilities rather
# than class labels.
fit1 <- caret::train(
  result ~ seed_diff + team1_seed_str + team2_seed_str,
  data = train,
  method = "glm",
  family = "binomial"
)
summary(fit1)

# Training-set accuracy ----
# The original call passed an undefined object `ss`; the training frame is the
# data that the confusion table below is compared against.
p <- predict(fit1, train, type = "raw")
p <- ifelse(p > 0.5, 1, 0)
mm <- table(train$result, p)
mm
mean(p == train$result) # the original author reported 72.09%

# Prepare submission file ----
test %<>%
  select(ID) %>%
  separate(
    ID,
    sep = "_",
    into = c("Season", "Team1", "Team2"),
    convert = TRUE
  )
test <- add_seed_features(test, team1_seeds, team2_seeds)
head(test)

test %<>% mutate(pred = predict(fit1, .))
head(test)

# NOTE(review): the sample submission header is `ID`, `Pred`, while this file
# is written with lowercase `id`, `pred` as in the original script - confirm
# which header the consumer of test.csv expects.
test %<>%
  mutate(id = paste(Season, Team1, Team2, sep = "_")) %>%
  select(id, pred)
write_csv(test, "test.csv")
# Key points:
# 1. Create a predictor variable (here: the seed difference between teams).
# 2. Identify which independent variables actually drive the outcome.
[转] R 代码 00001 18.06.16
猜你喜欢
转载自blog.csdn.net/scpcmoon/article/details/80711110
今日推荐
周排行