目录

数据操作-apply函数族

R 作为一种向量化的编程语言,一大特征便是以向量计算替代了循环计算,使效率大大提升。
apply函数族正是为解决数据循环处理问题而生的 —— 面向不同数据类型,生成不同返回值的包含8个相关函数的函数族。

为何要用apply?

在使用 R 时,要尽量用 array 的方式思考,避免 for 循环。写过多的 for 循环代码,最后把 R 代码写得跟 C 似的,说明你还没有进入 R 的思考方式,是一种费力不讨好的行为。那么不用循环怎么实现迭代呢?apply函数族是一把利器,它不是一个函数,而是一族功能类似的函数。



语法详解

apply

1
apply(X, MARGIN, FUN, ...)

参数列表: X:数组、矩阵、数据框 MARGIN:按维度运算,1表示按行,2表示按列,c(1,3)表示第1、3维 FUN:要使用的函数

{% label info@举例阐释 %}

1
2
3
4
5
6
7
8
9
> mat <- matrix(1:12, 3, 4)
> mat
     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12

> apply(mat, 2, sum)
[1]  6 15 24 33
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
> ary <- array(1:12, c(2,3,2))
> ary
, , 1
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6
, , 2
     [,1] [,2] [,3]
[1,]    7    9   11
[2,]    8   10   12

> apply(ary, c(1,3), sum)
     [,1] [,2]
[1,]    9   27
[2,]   12   30
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
> data <- data.frame(x1=1:5, x2=6:10)
> data
  x1 x2
1  1  6
2  2  7
3  3  8
4  4  9
5  5 10

> apply(data, 2, mean)
x1 x2 
 3  8 

tapply

1
tapply(X, INDEX, FUN = NULL, ..., simplify = TRUE)

参数列表: X:向量、数组 INDEX:用于分组的索引 FUN:要使用的函数 simplify:是否简化结果,当值为TRUE且FUN每次返回单个值时,结果以数组形式按分组输出,否则返回list

{% label info@举例阐释 %}

1
2
3
4
> x <- 1:6
> INDEX <- c('a','a','b','c','c','c')
> tapply(x, INDEX)
[1] 1 1 2 3 3 3
1
2
3
> tapply(x, INDEX, sum)
 a  b  c 
 3  3 15 
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
> mat <- matrix(1:10, 2)
> mat
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    3    5    7    9
[2,]    2    4    6    8   10
> INDEX <- matrix(c(rep(1,5), rep(2,5)), nrow=2)
> INDEX
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    1    1    2    2
[2,]    1    1    2    2    2
> tapply(mat, INDEX)
 [1] 1 1 1 1 1 2 2 2 2 2
> tapply(mat, INDEX, mean)
1 2 
3 8 

lapply

1
lapply(X, FUN, ...)

参数列表: X:列表、向量、数据框 FUN:要使用的函数

{% label info@举例阐释 %}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
> lst <- list(a=1:10, b=seq(0,7,2), c=c(2,5,8))
> lst
$a
 [1]  1  2  3  4  5  6  7  8  9 10
$b
[1] 0 2 4 6
$c
[1] 2 5 8

> lapply(lst, mean)
$a
[1] 5.5
$b
[1] 3
$c
[1] 5
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
> data <- data.frame(x1=1:5, x2=6:10)
> data
  x1 x2
1  1  6
2  2  7
3  3  8
4  4  9
5  5 10
> lapply(data, sum)
$x1
[1] 15
$x2
[1] 40
1
2
3
4
5
6
7
> isLeapYear <- function(a){
+   if( (a%%4==0 & a%%100!=0) | a%%400==0 )
+     a
+ }
> a <- 1900:1910
> unlist(lapply(a, isLeapYear))
[1] 1904 1908

rapply

1
rapply(list, f, classes = "ANY", deflt = NULL,how = c("unlist", "replace", "list"), ...)

参数列表: list:列表 f:要使用的函数 classes: 匹配类型, ANY为所有类型 deflt: 非匹配类型的默认值 how: 3种操作方式,

  • replace:则用调用f后的结果替换原list中原来的元素;
  • list:新建一个list,类型匹配调用f函数,不匹配赋值为deflt;
  • unlist:执行一次unlist(recursive = TRUE)操作

{% label info@举例阐释 %}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
> lst <- list(a=list(aa=c(1:5), ab=c(6:10)), b=list(ba=c(1:10)))
> lst
$a
$a$aa
[1] 1 2 3 4 5
$a$ab
[1]  6  7  8  9 10
$b
$b$ba
 [1]  1  2  3  4  5  6  7  8  9 10

> rapply(lst, sum, how="replace")  # 输出结果为list
$a
$a$aa
[1] 15
$a$ab
[1] 40
$b
$b$ba
[1] 55

> rapply(lst, sum, how="unlist")   # 输出结果为vector
a.aa a.ab b.ba 
  15   40   55 

sapply

1
sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)

参数列表: X:列表、向量、数据框 FUN:要使用的函数 simplify: 若FALSE,等价于lapply。否则,将lapply输出的list简化为vector或matrix USE.NAMES: 如果X为字符串,TRUE设置字符串为数据名,FALSE不设置

{% label info@举例阐释 %}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
> lst <- list(a=c(1:5), b=c(6:10))
> sapply(lst, sum, simplify = F)    # 输出list
$a
[1] 15
$b
[1] 40

> sapply(lst, sum)                  # 输出vector
 a  b 
15 40 

> sapply(lst, fivenum)              # 输出matrix
     a  b
[1,] 1  6
[2,] 2  7
[3,] 3  8
[4,] 4  9
[5,] 5 10
1
2
3
4
5
6
7
8
> val <- head(letters)
> val
[1] "a" "b" "c" "d" "e" "f"
> sapply(val, paste)
  a   b   c   d   e   f 
"a" "b" "c" "d" "e" "f" 
> sapply(val, paste, USE.NAMES = F)
[1] "a" "b" "c" "d" "e" "f"

vapply

1
vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)

参数列表: X:列表、向量、数据框 FUN:要使用的函数 FUN.VALUE:FUN返回值的模板,用于指定每次调用返回值的类型和长度(模板的名称会成为结果的行名),不匹配时报错 USE.NAMES: 如果X为字符串,TRUE设置字符串为数据名,FALSE不设置

{% label info@举例阐释 %}

1
2
3
4
5
6
> lst <- list(a=c(1:5), b=c(6:10))
> res <- vapply(lst, function(x) c(min(x), max(x)), c(min.=0, max.=0))
> res
     a  b
min. 1  6
max. 5 10
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
> data <- data.frame(cbind(x1=3, x2=c(2:1,4:5)))
> data
  x1 x2
1  3  2
2  3  1
3  3  4
4  3  5

> vapply(data, cumsum, FUN.VALUE=c('a'=0,'b'=0,'c'=0,'d'=0))
  x1 x2
a  3  2
b  6  3
c  9  7
d 12 12

mapply

1
mapply(FUN, ..., MoreArgs=NULL, SIMPLIFY=TRUE, USE.NAMES=TRUE)

参数列表: FUN:要使用的函数 …: 接收多个数据(list、vector),FUN依次作用于它们对应位置的元素 MoreArgs: 传给FUN的其他(不参与逐元素迭代的)参数组成的列表 SIMPLIFY: 若FALSE,输出list。否则,将输出的list简化为vector或matrix USE.NAMES: 如果第一个数据参数有名称则使用其名称;若其为字符串,TRUE设置字符串为数据名,FALSE不设置

{% label info@举例阐释 %}

1
2
3
> mapply(sum, list(a=1,b=2,c=3), list(a=10,b=20,d=30))
 a  b  c 
11 22 33 
1
2
3
4
5
6
7
8
> a <- 1:10
> b <- 5:-4
> a
 [1]  1  2  3  4  5  6  7  8  9 10
> b
 [1]  5  4  3  2  1  0 -1 -2 -3 -4
> mapply(max, a, b)
 [1]  5  4  3  4  5  6  7  8  9 10
1
2
3
4
5
> mapply(function(x,y) c(x+y, x^y, x-y), c(1:5), c(1:5))
     [,1] [,2] [,3] [,4] [,5]
[1,]    2    4    6    8   10
[2,]    1    4   27  256 3125
[3,]    0    0    0    0    0

eapply

1
eapply(env, FUN, ..., all.names = FALSE, USE.NAMES = TRUE)

参数列表: env: 环境空间 FUN:要使用的函数 all.names: 逻辑值,TRUE表示对环境中所有变量(包括以"."开头的隐藏变量)调用FUN,FALSE则忽略隐藏变量 USE.NAMES: 逻辑值,TRUE表示返回的list以变量名命名,FALSE不设置

{% label info@举例阐释 %}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
> # 定义一个环境空间
> env <- new.env()
> # 向这个环境空间中存入3个变量
> env$a <- 1:10
> env$b <- exp(-3:3)
> env$logic <- c(TRUE, FALSE, FALSE, TRUE)

> ls(env)      # 查看env空间中的变量
[1] "a"     "b"     "logic"
> ls.str(env)  # 查看env空间中的变量字符串结构
a :  int [1:10] 1 2 3 4 5 6 7 8 9 10
b :  num [1:7] 0.0498 0.1353 0.3679 1 2.7183 ...
logic :  logi [1:4] TRUE FALSE FALSE TRUE

> eapply(env, mean)   # 计算env环境空间中所有变量的均值
$a
[1] 5.5
$b
[1] 4.535125
$logic
[1] 0.5


应用及拓展

应用展示

原始数据为按年份year、地区loc和商品类别type进行统计的销售量。我们要制作两个销售总量的crosstable,一个以年份为行、地区为列,一个以年份为行,类别为列。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
> df <- data.frame(year=kronecker(2001:2003, rep(1,4)), 
                 loc=c('beijing','beijing','shanghai','shanghai'), 
                 type=rep(c('A','B'),6), sale=rep(1:12))
> df
   year      loc type sale
1  2001  beijing    A    1
2  2001  beijing    B    2
3  2001 shanghai    A    3
4  2001 shanghai    B    4
5  2002  beijing    A    5
6  2002  beijing    B    6
7  2002 shanghai    A    7
8  2002 shanghai    B    8
9  2003  beijing    A    9
10 2003  beijing    B   10
11 2003 shanghai    A   11
12 2003 shanghai    B   12

> tapply(df$sale, df[,c('year','loc')], sum)
      loc
year   beijing shanghai
  2001       3        7
  2002      11       15
  2003      19       23

> tapply(df$sale, df[,c('year','type')], sum)
      type
year    A  B
  2001  4  6
  2002 12 14
  2003 20 22
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
> list1 <- list(a=1:5, b=list(c=1:4, d=5:9))
> list1
$a
[1] 1 2 3 4 5
$b
$b$c
[1] 1 2 3 4
$b$d
[1] 5 6 7 8 9

> list2 <- list(a=1:5, b=list(c=5:8, d=1:5))
> list2
$a
[1] 1 2 3 4 5
$b
$b$c
[1] 5 6 7 8
$b$d
[1] 1 2 3 4 5

> "%+%" <- function(x,y) mapply("+", x, y)
> mapply("%+%", list1, list2)
$a
[1]  2  4  6  8 10
$b
$b$c
[1]  6  8 10 12
$b$d
[1]  6  8 10 12 14

相关函数

by

1
by(data, INDICES, FUN, ..., simplify = TRUE)

参数列表: data: 数据框 INDICES:与数据框行数等长的用于分组的索引 FUN:要使用的函数

{% label info@举例阐释 %}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
> data <- data.frame(a=c(1:5), b=c(6:10))
> data
  a  b
1 1  6
2 2  7
3 3  8
4 4  9
5 5 10
> INDICES <- c(1,1,2,2,2)

> by(data, INDICES, colMeans)
INDICES: 1
  a   b 
1.5 6.5 
-------------------------------------------------------------------------------
INDICES: 2
a b 
4 9 
> by(data, INDICES, rowMeans)
INDICES: 1
  1   2 
3.5 4.5 
-------------------------------------------------------------------------------
INDICES: 2
  3   4   5 
5.5 6.5 7.5 

outer

1
outer(X, Y, FUN = "*", ...)

参数列表: X、Y: 向量、数组 FUN:缺省为"*",即普通的外积运算;指定其他函数时,用FUN代替乘法对X、Y的所有元素组合进行类似外积的运算操作

{% label info@举例阐释 %}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
> x <- 1:4; y <- 2:4
> x; y
[1] 1 2 3 4
[1] 2 3 4
> outer(x, y)
     [,1] [,2] [,3]
[1,]    2    3    4
[2,]    4    6    8
[3,]    6    9   12
[4,]    8   12   16

> month.abb
 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
> outer(month.abb, 1999:2003, FUN = "paste")
      [,1]       [,2]       [,3]       [,4]       [,5]      
 [1,] "Jan 1999" "Jan 2000" "Jan 2001" "Jan 2002" "Jan 2003"
 [2,] "Feb 1999" "Feb 2000" "Feb 2001" "Feb 2002" "Feb 2003"
 [3,] "Mar 1999" "Mar 2000" "Mar 2001" "Mar 2002" "Mar 2003"
 [4,] "Apr 1999" "Apr 2000" "Apr 2001" "Apr 2002" "Apr 2003"
 [5,] "May 1999" "May 2000" "May 2001" "May 2002" "May 2003"
 [6,] "Jun 1999" "Jun 2000" "Jun 2001" "Jun 2002" "Jun 2003"
 [7,] "Jul 1999" "Jul 2000" "Jul 2001" "Jul 2002" "Jul 2003"
 [8,] "Aug 1999" "Aug 2000" "Aug 2001" "Aug 2002" "Aug 2003"
 [9,] "Sep 1999" "Sep 2000" "Sep 2001" "Sep 2002" "Sep 2003"
[10,] "Oct 1999" "Oct 2000" "Oct 2001" "Oct 2002" "Oct 2003"
[11,] "Nov 1999" "Nov 2000" "Nov 2001" "Nov 2002" "Nov 2003"
[12,] "Dec 1999" "Dec 2000" "Dec 2001" "Dec 2002" "Dec 2003"

sweep

1
sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...)

参数列表: x: 数组、矩阵 MARGIN:运算维度,1表示行,2表示列,3即第三维度,以此类推 STATS:运算参数,类似于减法中的减数,除法中的除数 FUN:要使用的函数

{% label info@举例阐释 %}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
> mat <- matrix(1:9, 3)
> mat
     [,1] [,2] [,3]
[1,]    1    4    7
[2,]    2    5    8
[3,]    3    6    9

> sweep(mat, 1, c(1,4,7), "+")  # 第一行都加1,第二行都加4,第三行都加7
     [,1] [,2] [,3]
[1,]    2    5    8
[2,]    6    9   12
[3,]   10   13   16
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
> A <- array(1:24, dim = 4:2)
> median <- apply(A, 1:2, median)
> A
, , 1
     [,1] [,2] [,3]
[1,]    1    5    9
[2,]    2    6   10
[3,]    3    7   11
[4,]    4    8   12
, , 2
     [,1] [,2] [,3]
[1,]   13   17   21
[2,]   14   18   22
[3,]   15   19   23
[4,]   16   20   24
> median
     [,1] [,2] [,3]
[1,]    7   11   15
[2,]    8   12   16
[3,]    9   13   17
[4,]   10   14   18

> sweep(A, 1:2, median)
, , 1
     [,1] [,2] [,3]
[1,]   -6   -6   -6
[2,]   -6   -6   -6
[3,]   -6   -6   -6
[4,]   -6   -6   -6
, , 2
     [,1] [,2] [,3]
[1,]    6    6    6
[2,]    6    6    6
[3,]    6    6    6
[4,]    6    6    6

replicate

1
replicate(n, expr, simplify = "array")

参数列表: n: 调用的次数 expr:调用的表达式

{% label info@举例阐释 %}

1
2
3
4
5
6
> game <- function() {
+   n <- sample(1:6,2,replace=T)
+   return(sum(n))
+ }
> replicate(n=10, game())
 [1]  6  6  6  7  7  7 11  8  7  9

aggregate

1
aggregate(x, by, FUN, ...)

参数列表: x: 一种R数据结构,通常为数据框 by:分组索引,必须为list格式 FUN:要使用的函数

{% label info@举例阐释 %}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
> data <- data.frame(name=c("张三","李四","王五","赵六"),
+              sex=c("M","M","F","F"), age=c(20,40,22,30),
+              height=c(166,170,150,155))
> data
  name sex age height
1 张三   M  20    166
2 李四   M  40    170
3 王五   F  22    150
4 赵六   F  30    155

> aggregate(data[,3:4], by=list(data$sex), mean)
  Group.1 age height
1       F  26  152.5
2       M  30  168.0


致谢

参考文章