R语言实战之如何对数据进行缺失值处理
以下是对于数据中含有部分缺失值的处理方式,代码十分详尽基础:
下面展示一些基础代码。
# Example script: recoding, renaming and removing missing values (NA) in a
# small data frame of manager survey responses.

# Build the example data ----
# Each vector becomes one column of the data frame; q4 and q5 already contain
# missing values.
manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)

# stringsAsFactors = FALSE keeps character columns as character vectors
# instead of converting them to factors. It is the default since R 4.0.0 and
# is spelled out here so that older R versions behave the same way.
leadership <- data.frame(
  manager, date, country, gender, age,
  q1, q2, q3, q4, q5,
  stringsAsFactors = FALSE
)
leadership

# Recode values to missing ----
# An age of 99 is treated as missing and replaced by NA.
leadership$age[leadership$age == 99] <- NA

# Derive an age category from age. Rows whose age is NA match none of the
# conditions below and therefore keep NA as their category.
leadership <- within(leadership, {
  agecat <- rep(NA_character_, length(age))
  agecat[age > 75] <- "elder"
  agecat[age >= 55 & age <= 75] <- "Middle Aged"
  agecat[age < 55] <- "Young"
})
leadership

# Rename columns ----
# fix(leadership) would open the interactive data editor instead; the columns
# can also be renamed directly in code, as shown below.
# Renaming by matching the current name needs no add-on package;
# plyr::rename(leadership, c(manager = "Manager ID", date = "Testdate"))
# produces the same column names.
names(leadership)[names(leadership) == "manager"] <- "Manager ID"
names(leadership)[names(leadership) == "date"] <- "Testdate"
# Rename the five question columns (positions 6 to 10) in one step.
names(leadership)[6:10] <- c("item1", "item2", "item3", "item4", "item5")
leadership

# Detect missing values ----
# is.na() on a data frame returns a logical matrix that is TRUE wherever a
# cell is missing.
is.na(leadership)

# Exclude missing values from a computation ----
# na.rm = TRUE removes the NA before summing; it must be the logical TRUE,
# not the string "true".
x <- c(1, 2, NA, 3)
y <- sum(x, na.rm = TRUE)
y

# Drop incomplete rows ----
# na.omit() removes every row that contains at least one NA.
leadership
newdata <- na.omit(leadership)
newdata
> manager <- c(1,2,3,4,5)
> date <- c("10/24/08","10/28/08","10/1/08","10/12/08","5/1/09")
> country <- c("US","US","UK","UK","UK")
> gender <- c("M","F","F","M","F")
> age <- c(32,45,25,39,99)
> q1 <- c(5,3,3,3,2)
> q2 <- c(4,5,5,3,2)
> q3 <- c(5,2,5,4,1)
> q4 <- c(5,5,5,NA,2)
> q5 <- c(5,5,2,NA,1)
> leadership <- data.frame(manager, date, country, gender, age,
+ q1,q2,q3,q4,q5,stringsAsFactors = FALSE)
> #stringsAsFactors = FALSE遇到字符型数据时,不将其转换成因子(factor)
> leadership
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 45 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
4 4 10/12/08 UK M 39 3 3 4 NA NA
5 5 5/1/09 UK F 99 2 2 1 2 1
>
> leadership$age[leadership$age == 99] <- NA
> #将99岁的数据列为缺失值
> leadership <- within(leadership,{
+ agecat <- NA
+ agecat[age > 75] <- "elder"
+ agecat[age >= 55 & age <=75] <- "Middle Aged"
+ agecat[age < 55] <- "Young"})
>
> leadership
manager date country gender age q1 q2 q3 q4 q5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
>
> library("plyr")
> #fix(leadership)
> #打开数据编辑器进行修改,当然也可直接修改,示例如下:
> leadership <- rename(leadership, c(manager = "Manager ID", date = "Testdate"))
> names(leadership)[6:10] <- c("item1","item2","item3","item4","item5")
> leadership
Manager ID Testdate country gender age item1 item2 item3 item4 item5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
> is.na(leadership)
Manager ID Testdate country gender age item1 item2 item3 item4 item5 agecat
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
[5,] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
> #检测是否有缺失值
>
> x <- c(1,2,NA,3)
> y <- sum(x,na.rm = "true")
> y
[1] 6
> #na.rm移除缺失的值为真
> #na.omit忽略缺失值
> leadership
Manager ID Testdate country gender age item1 item2 item3 item4 item5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
> newdata <- na.omit(leadership)
> newdata
Manager ID Testdate country gender age item1 item2 item3 item4 item5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
以上,共勉。