[转] R 代码 00001 18.06.16

# 知识来源于网络，仅供交流使用，如有侵权请及时联系予以删除
# Code for running a basic logistic regression predicting results off of seed differences in R
# Load packages
library(tidyverse)
library(magrittr)

# Load the tournament seeds and the compact tournament results.
# Only these two input files are read by this script.
seeds <- read_csv("../input/NCAATourneySeeds.csv")
results <- read_csv("../input/NCAATourneyCompactResults.csv")

# Inspect the structure of the data.
# Expected seeds columns: Season, Seed, TeamID
head(seeds)
# Results are in winner/loser format and need to be rearranged to match the
# submission file.
# Expected results columns: Season, DayNum, WTeamID, WScore, LTeamID, LScore,
# WLoc, NumOT
head(results)

# Strip the non-digit characters from the seed so only the integer remains.
# str_extract() returns a character vector with one match per element (NA when
# there is no digit run), so the integer conversion cannot fail the way
# coercing the list returned by str_extract_all() can.
seeds$Seed <- as.integer(str_extract(seeds$Seed, "[0-9]+"))
head(seeds) # check that the non-digit characters are removed

# Recode each game so it matches the submission layout: Team1 is the team with
# the lower ID, Team2 the team with the higher ID, and `result` is 1 when
# Team1 won and 0 otherwise. Only Season, Team1, Team2 and result are kept;
# the winner/loser ID columns are dropped.
results <- results %>%
  transmute(
    Season,
    Team1 = case_when(WTeamID < LTeamID ~ WTeamID,
                      WTeamID > LTeamID ~ LTeamID),
    Team2 = case_when(WTeamID > LTeamID ~ WTeamID,
                      WTeamID < LTeamID ~ LTeamID),
    result = if_else(WTeamID == Team1, 1, 0)
  )

# Sanity check only: shows any rows where Team1 has the larger ID. The output
# is not assigned, so `results` is not modified here.
results %>% filter(Team1 > Team2)
# head(results): Season Team1 Team2 result

# Split results into training and test sets.
# Games from seasons before 2014 form the training set. A plain pipe is used
# (not the assignment pipe %<>%) so that `results` itself is left unfiltered.
train <- results %>% filter(Season < 2014)
# The sample submission lists the matchups to predict.
# head(test): ID Pred
test <- read_csv("../input/SampleSubmissionStage1.csv")

# Create training set.
# Build one seed lookup table per side of the matchup. Columns are renamed by
# name rather than by position, so the result does not depend on the column
# order of `seeds`.
# head(team1_seeds): Season T1Seed Team1ID
team1_seeds <- seeds %>% rename(T1Seed = Seed, Team1ID = TeamID)
# head(team2_seeds): Season T2Seed Team2ID
team2_seeds <- seeds %>% rename(T2Seed = Seed, Team2ID = TeamID)

# Join the seeds onto the training set for Team1 and Team2.
train <- train %>%
  left_join(team1_seeds, by = c("Season", "Team1" = "Team1ID")) %>%
  left_join(team2_seeds, by = c("Season", "Team2" = "Team2ID"))
# head(train): Season Team1 Team2 result T1Seed T2Seed

# Seed-strength indicators: 1 when the team's seed is below 9, 0 otherwise.
train <- train %>%
  mutate(team1_seed_str = as.numeric(T1Seed < 9),
         team2_seed_str = as.numeric(T2Seed < 9))
# Season Team1 Team2 result T1Seed T2Seed team1_seed_str team2_seed_str
head(train)

# Seed difference between the two teams (Team1 seed minus Team2 seed).
train$seed_diff <- train$T1Seed - train$T2Seed
# Season Team1 Team2 result T1Seed T2Seed team1_seed_str team2_seed_str
# seed_diff
head(train)

# Fit model
library(caret)
# Logistic regression of the game result on the seed difference and the two
# seed-strength indicators. The function is called as caret::train() because
# the training data frame is also named `train`.
fit1 <- caret::train(result ~ seed_diff + team1_seed_str + team2_seed_str,
                     data = train,
                     method = "glm",
                     family = "binomial")

summary(fit1)

# In-sample check: predict on the training data, threshold at 0.5, and compare
# against the observed results.
p <- predict(fit1, train, type = "raw")
p <- ifelse(p > 0.5, 1, 0)
mm <- table(train$result, p)
mm
# Share of training games predicted correctly (the original note reported
# 72.09%).
mean(p == train$result)
# Prepare submission file.
# Split the submission ID into Season / Team1 / Team2 and attach both seeds.
test <- test %>%
  select(ID) %>%
  separate(ID, sep = "_", into = c("Season", "Team1", "Team2"),
           convert = TRUE) %>%
  left_join(team1_seeds, by = c("Season", "Team1" = "Team1ID")) %>%
  left_join(team2_seeds, by = c("Season", "Team2" = "Team2ID"))
head(test)

# Same predictors as the training set: seed-strength indicators (1 when the
# seed is below 9) and the seed difference.
test <- test %>%
  mutate(team1_seed_str = as.numeric(T1Seed < 9),
         team2_seed_str = as.numeric(T2Seed < 9),
         seed_diff = T1Seed - T2Seed)

# Score every matchup with the fitted model.
test <- test %>% mutate(pred = predict(fit1, .))
head(test)

# Rebuild the matchup id and keep only the id and the prediction.
# NOTE(review): the sample submission header is shown as "ID Pred" where it is
# read in; the file written here uses lowercase "id" / "pred" - confirm that
# is accepted.
test <- test %>%
  transmute(id = paste(Season, Team1, Team2, sep = "_"), pred)

write_csv(test, "test.csv")
Key points: 1. Create a predictor variable. 2. Find out the real independent variable.
[转] R 代码 00001 18.06.16

猜你喜欢