R Formula Tutorial

最后发布时间:2022-08-21 20:24:36 浏览量:
  • - for removing terms;
  • : for interaction;
  • * for crossing;
  • %in% for nesting; and
  • ^ for limiting crossing to the specified degree.
# Fix the RNG state so the simulated values are reproducible
set.seed(123)

# Simulate three standard-normal vectors of length 5
x <- rnorm(5)
x2 <- rnorm(5)
y <- rnorm(5)

# `x * x2` crosses the two predictors; the model frame only lists the
# variables the formula refers to.
model.frame(y ~ x * x2, data = data.frame(x, y, x2))
           y           x         x2
1  1.2240818 -0.56047565  1.7150650
2  0.3598138 -0.23017749  0.4609162
3  0.4007715  1.55870831 -1.2650612
4  0.1106827  0.07050839 -0.6868529
5 -0.5558411  0.12928774 -0.4456620
# Same formula with the crossing written out: main effects plus `x:x2`
model.frame(y ~ x + x2 + x:x2, data = data.frame(x, y, x2))
           y           x         x2
1  1.2240818 -0.56047565  1.7150650
2  0.3598138 -0.23017749  0.4609162
3  0.4007715  1.55870831 -1.2650612
4  0.1106827  0.07050839 -0.6868529
5 -0.5558411  0.12928774 -0.4456620

# Inside a formula `^` is a crossing operator, not arithmetic, so `x^2`
# reduces to `x` and no squared column appears in the model frame.
sim_plain <- data.frame(x = rnorm(5), y = rnorm(5))
model.frame(y ~ x + x^2, data = sim_plain)

# Wrapping the power in I() makes R evaluate it arithmetically, which adds
# an `I(x^2)` column to the model frame.
sim_squared <- data.frame(x = rnorm(5), y = rnorm(5))
model.frame(y ~ x + I(x^2), data = sim_squared)
           y          x       I(x^2)
1 -1.0678237  1.7869131 3.193058....
2 -0.2179749  0.4978505 0.247855....
3 -1.0260044 -1.9666172 3.867583....
4 -0.7288912  0.7013559 0.491900....
5 -0.6250393 -0.4727914 0.223531....

连续变量*连续变量

# Continuous-by-continuous interaction: `hours * effort` expands to both
# main effects plus the hours:effort term.
fit1 <- lm(formula = loss ~ hours * effort, data = dat)
summary(fit1)

等价于

# Same model with the crossing spelled out term by term
fit1 <- lm(
  formula = loss ~ hours + effort + hours:effort,
  data = dat
)
summary(fit1)

等价于

# Build the product term by hand so it enters the model as a plain column
dat$hours_effort <- with(dat, hours * effort)

fit1 <- lm(formula = loss ~ hours + effort + hours_effort, data = dat)
summary(fit1)

连续变量*分类变量

连续变量*分类变量(*2)

# Make "female" the reference level of gender. as.factor() guards against a
# character column (the default for data.frame()/read.csv() since R 4.0),
# which relevel() rejects; an existing factor passes through unchanged.
dat$gender <- relevel(as.factor(dat$gender), ref = "female")

# Continuous-by-categorical interaction via the formula shorthand
fit2 <- lm(loss ~ hours * gender, data = dat)
summary(fit2)

# Hand-built interaction column: equals hours on rows where gender is
# "male" and 0 on all other rows.
dat$hours_gender_male <- dat$hours * (dat$gender == "male")

fit2 <- lm(loss ~ hours + gender + hours_gender_male, data = dat)
summary(fit2)

连续变量*分类变量(*3)

# Make "read" the reference level of prog. as.factor() guards against a
# character column (the default for data.frame()/read.csv() since R 4.0),
# which relevel() rejects; an existing factor passes through unchanged.
dat$prog <- relevel(as.factor(dat$prog), ref = "read")

# Continuous-by-categorical interaction via the formula shorthand
fit3 <- lm(loss ~ hours * prog, data = dat)
summary(fit3)

# Hand-built interaction columns, one per non-reference level named here:
# each equals hours on rows of that level and 0 elsewhere.
dat$hours_prog_jog <- dat$hours * (dat$prog == "jog")
dat$hours_prog_swim <- dat$hours * (dat$prog == "swim")

fit3 <- lm(
  loss ~ hours + prog + hours_prog_jog + hours_prog_swim,
  data = dat
)
summary(fit3)

分类变量*分类变量

分类变量*分类变量(2*3)

# Set the reference levels. as.factor() guards against character columns
# (the default for data.frame()/read.csv() since R 4.0), which relevel()
# rejects; existing factors pass through unchanged.
dat$gender <- relevel(as.factor(dat$gender), ref = "female")
dat$prog <- relevel(as.factor(dat$prog), ref = "read")

# Categorical-by-categorical interaction via the formula shorthand
fit4 <- lm(loss ~ gender * prog, data = dat)
summary(fit4)

# Hand-built interaction indicators: the product of two logical vectors is
# 1 only on rows that match both levels, 0 otherwise.
dat$gender_male_prog_jog <- (dat$gender == "male") * (dat$prog == "jog")
dat$gender_male_prog_swim <- (dat$gender == "male") * (dat$prog == "swim")

fit4 <- lm(
  loss ~ gender + prog + gender_male_prog_jog + gender_male_prog_swim,
  data = dat
)
summary(fit4)

分类变量*分类变量(3*3)

# Bin effort into three ordered-by-position categories using the intervals
# (0, 25], (25, 35] and (35, Inf]; cut() returns a factor.
dat$effort_cat <- cut(
  dat$effort,
  breaks = c(0, 25, 35, Inf),
  labels = c("low", "medium", "high")
)

# Set the reference levels. effort_cat is already a factor from cut();
# prog is passed through as.factor() because relevel() rejects a character
# column (the default for data.frame()/read.csv() since R 4.0).
dat$effort_cat <- relevel(dat$effort_cat, ref = "low")
dat$prog <- relevel(as.factor(dat$prog), ref = "read")

# Categorical-by-categorical interaction via the formula shorthand
fit5 <- lm(loss ~ effort_cat * prog, data = dat)
summary(fit5)

# Hand-built interaction indicators: the product of two logical vectors is
# 1 only on rows that match both levels, 0 otherwise.
dat$effort_cat_medium_prog_jog <-
  (dat$effort_cat == "medium") * (dat$prog == "jog")
dat$effort_cat_high_prog_jog <-
  (dat$effort_cat == "high") * (dat$prog == "jog")
dat$effort_cat_medium_prog_swim <-
  (dat$effort_cat == "medium") * (dat$prog == "swim")
dat$effort_cat_high_prog_swim <-
  (dat$effort_cat == "high") * (dat$prog == "swim")

fit5 <- lm(
  loss ~ effort_cat + prog +
    effort_cat_medium_prog_jog + effort_cat_high_prog_jog +
    effort_cat_medium_prog_swim + effort_cat_high_prog_swim,
  data = dat
)
summary(fit5)

参考

https://www.datacamp.com/tutorial/r-formula-tutorial
https://f1000research.com/articles/9-1444
https://zhuanlan.zhihu.com/p/460060330