데이터 R지?

11차시_대면수업

jsBae 2022. 5. 11. 15:41

수업 준비

1. RStudio에서 프로젝트 확인

2. 경로 설정 : Ctrl + Shift + H

> setwd("C:/Users/USER/Dropbox/R/Mybook")

3. 기존 data 삭제

rm(list=ls())

실습1. 데이터 프레임을 생성한 후 .csv 파일로 저장해 보자. (korean 열은 sample()로 무작위 생성되므로 실행할 때마다 값이 달라진다.)

df <- data.frame(class=c(1:4),
                 korean=sample(70:90, 4),
                 english = c(82, 86, 83, 87),
                 math = c(78, 79, 81, 80))
df
> df
  class korean english math
1     1     76      82   78
2     2     80      86   79
3     3     86      83   81
4     4     73      87   80

# save the data frame as a CSV file

write.csv(df, "class_mean.csv")

실습2. .csv 파일을 불러와 보자. (별도의 패키지 없이 내장 함수 사용)

> read.csv("class_mean.csv")
  X class korean english math
1 1     1     75      82   78
2 2     2     81      86   79
3 3     3     90      83   81
4 4     4     77      87   80
> data <- read.csv("class_mean.csv")
> str(data)
'data.frame':	4 obs. of  5 variables:
 $ X      : int  1 2 3 4
 $ class  : int  1 2 3 4
 $ korean : int  75 81 90 77
 $ english: int  82 86 83 87
 $ math   : int  78 79 81 80
> dim(data)
[1] 4 5
> nrow(data)
[1] 4
> ncol(data)
[1] 5
> names(data)
[1] "X"       "class"   "korean"  "english" "math"   
> class(data)
[1] "data.frame"
> View(data)
> head(data)
  X class korean english math
1 1     1     75      82   78
2 2     2     81      86   79
3 3     3     90      83   81
4 4     4     77      87   80
> tail(data)
  X class korean english math
1 1     1     75      82   78
2 2     2     81      86   79
3 3     3     90      83   81
4 4     4     77      87   80
> summary(data)
       X            class          korean         english           math      
 Min.   :1.00   Min.   :1.00   Min.   :75.00   Min.   :82.00   Min.   :78.00  
 1st Qu.:1.75   1st Qu.:1.75   1st Qu.:76.50   1st Qu.:82.75   1st Qu.:78.75  
 Median :2.50   Median :2.50   Median :79.00   Median :84.50   Median :79.50  
 Mean   :2.50   Mean   :2.50   Mean   :80.75   Mean   :84.50   Mean   :79.50  
 3rd Qu.:3.25   3rd Qu.:3.25   3rd Qu.:83.25   3rd Qu.:86.25   3rd Qu.:80.25  
 Max.   :4.00   Max.   :4.00   Max.   :90.00   Max.   :87.00   Max.   :81.00  
> mean(data$korean)
[1] 80.75
> data <- data[-1]
> data
  class korean english math
1     1     75      82   78
2     2     81      86   79
3     3     90      83   81
4     4     77      87   80
> summary(data)
     class          korean         english           math      
 Min.   :1.00   Min.   :75.00   Min.   :82.00   Min.   :78.00  
 1st Qu.:1.75   1st Qu.:76.50   1st Qu.:82.75   1st Qu.:78.75  
 Median :2.50   Median :79.00   Median :84.50   Median :79.50  
 Mean   :2.50   Mean   :80.75   Mean   :84.50   Mean   :79.50  
 3rd Qu.:3.25   3rd Qu.:83.25   3rd Qu.:86.25   3rd Qu.:80.25  
 Max.   :4.00   Max.   :90.00   Max.   :87.00   Max.   :81.00

실습3. 내장 데이터셋(iris)을 활용한 데이터 추출

1.데이터 셋 불러오기

> data()
> data(iris)
> colnames(iris)
[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     
> dim(iris)
[1] 150   5

2.데이터 인덱싱 & 추출

> iris[1,2]
[1] 3.5
iris["Species"]=="setosa"

> data <- iris[iris["Species"]=="setosa",]
> dim(data)
[1] 50  5

> summary(data)
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width          Species  
 Min.   :4.300   Min.   :2.300   Min.   :1.000   Min.   :0.100   setosa    :50  
 1st Qu.:4.800   1st Qu.:3.200   1st Qu.:1.400   1st Qu.:0.200   versicolor: 0  
 Median :5.000   Median :3.400   Median :1.500   Median :0.200   virginica : 0  
 Mean   :5.006   Mean   :3.428   Mean   :1.462   Mean   :0.246                  
 3rd Qu.:5.200   3rd Qu.:3.675   3rd Qu.:1.575   3rd Qu.:0.300                  
 Max.   :5.800   Max.   :4.400   Max.   :1.900   Max.   :0.600

문제 : Petal 길이가 평균값(3.76) 이하인 붓꽃 데이터 추출

> summary(iris)
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width          Species  
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100   setosa    :50  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300   versicolor:50  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300   virginica :50  
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199                  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800                  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
> iris$Petal.Length<=3.76
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [17]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [33]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
 [65]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
 [81] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
 [97] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[113] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[129] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE
> data_1 <- iris[iris$Petal.Length<=3.76,]
> dim(data_1)
[1] 57  5
> summary(data_1)
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width           Species  
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.1000   setosa    :50  
 1st Qu.:4.800   1st Qu.:3.000   1st Qu.:1.400   1st Qu.:0.2000   versicolor: 7  
 Median :5.000   Median :3.400   Median :1.500   Median :0.2000   virginica : 0  
 Mean   :5.037   Mean   :3.307   Mean   :1.702   Mean   :0.3456                  
 3rd Qu.:5.200   3rd Qu.:3.600   3rd Qu.:1.600   3rd Qu.:0.4000                  
 Max.   :5.800   Max.   :4.400   Max.   :3.700   Max.   :1.3000

실습4. subset() 함수를 활용한 데이터 추출

subset(데이터, 조건식, 선택열)

subset(iris, Petal.Length<=3.76)
subset(iris, Petal.Length<=3.76, c("Species"))

실습5. ifelse()함수를 활용한 데이터 추출

ifelse(조건식, TRUE값, FALSE값)

> x <- c(1:10)
> x
 [1]  1  2  3  4  5  6  7  8  9 10
> ifelse(x%%2==1, T, F)
 [1]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
> x[ifelse(x%%2==1, T, F)]
[1] 1 3 5 7 9

문제 : iris 데이터에 꽃잎 길이(Petal.Length)가 평균값(3.76) 이상이면 above, 미만이면 below로 표시하는 열(Petal.Mean)을 추가

iris$"Petal.Mean" <- ifelse(iris$Petal.Length>=3.76, "above", "below")
View(iris)

> head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species Petal.Mean
1          5.1         3.5          1.4         0.2  setosa      below
2          4.9         3.0          1.4         0.2  setosa      below
3          4.7         3.2          1.3         0.2  setosa      below
4          4.6         3.1          1.5         0.2  setosa      below
5          5.0         3.6          1.4         0.2  setosa      below
6          5.4         3.9          1.7         0.4  setosa      below

실습6. 데이터 일괄처리

apply(x, 1(행) 또는 2(열), 함수)

exam <- data.frame(id=c(1:10),
                   ex1=sample(60:100,10),
                   ex2=sample(50:90,10),
                   ex3=sample(75:99,10),
                   ex4=sample(70:85,10))
dim(exam)
View(exam)
str(exam)
'data.frame':	10 obs. of  5 variables:
 $ id : int  1 2 3 4 5 6 7 8 9 10
 $ ex1: int  72 99 84 98 82 79 88 100 81 94
 $ ex2: int  77 82 70 80 66 58 72 68 75 65
 $ ex3: int  85 84 81 93 76 98 75 99 89 94
 $ ex4: int  82 77 79 74 81 76 84 85 73 75
exam$id <- apply(exam[1],1,as.factor)

> str(exam)
'data.frame':	10 obs. of  6 variables:
 $ id   : Factor w/ 10 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10
 $ ex1  : int  72 99 84 98 82 79 88 100 81 94
 $ ex2  : int  77 82 70 80 66 58 72 68 75 65
 $ ex3  : int  85 84 81 93 76 98 75 99 89 94
 $ ex4  : int  82 77 79 74 81 76 84 85 73 75
 $ total: int  316 342 314 345 305 311 319 352 318 328
> apply(exam[2:5], 2, max)
ex1 ex2 ex3 ex4 
100  82  99  85 
> apply(exam[2:5], 2, mean)
 ex1  ex2  ex3  ex4 
87.7 71.3 87.4 78.6

문제 : 전 과목 합산 열(total)을 추가하여 1등을 추출해 보자. (위 str(exam) 출력에 보이는 total 열은 아래 코드를 실행한 뒤의 결과이다.)

exam$"total" <- apply(exam[2:5], 1, sum)

누가 1등인가?

> summary(exam)
       id         ex1              ex2            ex3             ex4            total      
 1      :1   Min.   : 72.00   Min.   :58.0   Min.   :75.00   Min.   :73.00   Min.   :305.0  
 2      :1   1st Qu.: 81.25   1st Qu.:66.5   1st Qu.:81.75   1st Qu.:75.25   1st Qu.:314.5  
 3      :1   Median : 86.00   Median :71.0   Median :87.00   Median :78.00   Median :318.5  
 4      :1   Mean   : 87.70   Mean   :71.3   Mean   :87.40   Mean   :78.60   Mean   :325.0  
 5      :1   3rd Qu.: 97.00   3rd Qu.:76.5   3rd Qu.:93.75   3rd Qu.:81.75   3rd Qu.:338.5  
 6      :1   Max.   :100.00   Max.   :82.0   Max.   :99.00   Max.   :85.00   Max.   :352.0  
 (Other):4                                                                                  
> exam[exam$total==352,]
  id ex1 ex2 ex3 ex4 total
8  8 100  68  99  85   352

상위 6명의 명단은?

> rank <- exam[order(-exam$total),]
> head(rank)
   id ex1 ex2 ex3 ex4 total
8   8 100  68  99  85   352
4   4  98  80  93  74   345
2   2  99  82  84  77   342
10 10  94  65  94  75   328
7   7  88  72  75  84   319
9   9  81  75  89  73   318
728x90
반응형