[关闭]
@fanxy 2020-03-15T16:52:43.000000Z 字数 8850 阅读 6042

第三讲 数据可视化与编程基础

樊潇彦 复旦大学经济学院 金融数据


0. 准备工作

下载 Ch03.rar,解压缩后存于工作目录下。

  # Set the working directory to the unpacked Ch03 folder (replace "..." with
  # your own path). Path separators must be written as "/" or as an escaped "\\".
  setwd("D:\\...\\Ch03")
  # Remove every object from the current workspace.
  rm(list = ls())

  # Attach packages that were installed in earlier lectures.
  library(tidyverse)
  library(readstata13)
  library(haven)
  library(readxl)

  # Install the packages introduced in this lecture only when they are missing,
  # so that re-running the script does not download them again every time.
  lecture_pkgs <- c("ggplot2", "ggvis", "shiny", "dygraphs")
  missing_pkgs <- setdiff(lecture_pkgs, rownames(installed.packages()))
  if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs)
  }

  # Attach the packages used in this lecture.
  library(ggplot2)
  library(ggvis)
  library(shiny)
  library(dygraphs)

1. 数据可视化

1.1 基础作图

  # Base graphics with plot(): points, lines, scatter plots, histogram-like
  # displays, and so on.
  data(iris)
  # One variable, drawn as points against its index.
  plot(iris$Sepal.Length)
  # One variable, drawn as a line.
  plot(iris$Sepal.Length, type = "l")
  # Two variables: scatter plot.
  plot(x = iris$Sepal.Length, y = iris$Petal.Length)
  # Two variables: histogram-like vertical lines.
  plot(x = iris$Sepal.Length, y = iris$Petal.Length, type = "h")
  # Continuous variable grouped by a discrete one: x is a factor, so plot()
  # draws one box per group.
  plot(x = iris$Species, y = iris$Petal.Length, type = "h")

  # Bar charts with barplot(): period-over-period returns of a price series.
  msft <- c(26.85, 27.41, 28.21, 32.64, 34.66, 34.30, 31.62, 33.40)
  msft.returns <- msft[-1] / msft[-length(msft)] - 1
  names(msft.returns) <- month.abb[seq_along(msft.returns)]
  barplot(msft.returns, col = "blue")
  # Same chart with full month names; las = 2 turns the labels perpendicular
  # to the axis.
  barplot(
    msft.returns,
    names.arg = month.name[seq_along(msft.returns)],
    col = "blue",
    las = 2
  )

  # Pie chart with pie(), framed by box().
  x <- 1:5
  pie(x, col = rainbow(5))
  box()

  # Sunflower plot of columns 3 and 4 of iris.
  sunflowerplot(iris[, 3:4])

  # Pairwise scatter plots of a matrix or data frame with pairs(); the point
  # background colour is chosen by species.
  data(iris)
  pairs(
    iris[1:4],
    main = "Anderson's Iris Data -- 3 species",
    pch = 21,
    bg = c("red", "green3", "blue")[unclass(iris$Species)]
  )

  # Several series in one plot with matplot(): three simulated random walks.
  set.seed(1)
  x <- cumsum(rnorm(50))
  y <- cumsum(rnorm(50))
  z <- cumsum(rnorm(50))
  matplot(cbind(x, y, z), col = 2:4, type = "l", lty = 1, xlab = "", ylab = "")
  legend(
    "bottom",
    legend = c("x", "y", "z"),
    lty = 1,
    col = 2:4,
    bty = "n"
  )

  # Draw a function over a given range with curve().
  curve(sin, -2 * pi, 2 * pi, xname = "t")

1.2 修饰和保存

  # Decorating a plot, using graphical parameters and legend() as examples:
  # grouped bars for two return series (uses msft.returns computed earlier).
  intc <- c(20.42, 20.48, 21.43, 23.50, 24.04, 24.00, 23.11, 21.98)
  intc.returns <- intc[-1] / intc[-length(intc)] - 1
  barplot(
    rbind(msft.returns, intc.returns),
    beside = TRUE,
    col = c(2, 4)
  )
  legend(
    x = "topleft",
    legend = c("MSFT", "INTC"),
    pch = 15,
    col = c(2, 4),
    bty = "n"
  )

  # Option 1: open a plotting window, draw in it, then save it with savePlot().
  windows()
  plot(1:10)
  rect(1, 5, 3, 7, col = "blue")
  savePlot("test01", type = "jpg", device = dev.cur(), restoreConsole = TRUE)

  # Option 2: draw directly on a jpeg device; dev.off() closes the device and
  # writes the file.
  jpeg(file = "myplot.jpeg")
  plot(1:10)
  rect(1, 5, 3, 7, col = "blue")
  dev.off()

1.3 ggplot2

Basic Components of a ggplot2 Plot

ggplot_command.jpg-141.5kB

  library(ggplot2)
  # Load the diamonds data set.
  data(diamonds)
  # Fix the random seed, then draw a sample of 1000 rows.
  set.seed(42)
  small <- diamonds[sample(nrow(diamonds), 1000), ]
  head(small)

  # Create an empty plot and inspect the parts of a ggplot object.
  p <- ggplot()
  class(p)
  names(p)

  # Data and aesthetic mappings.
  # Example: how carat, cut and clarity relate to the diamond price.
  ggplot(
    data = small,
    mapping = aes(
      x = carat, y = price,          # horizontal and vertical axes
      color = cut, shape = clarity
    )
  ) +
    geom_point()

  # Geoms and statistics: price density, one curve per cut.
  ggplot(small) +
    geom_density(aes(x = price, colour = cut))

  # Facets and labels.
  ggplot(small, aes(x = carat, y = price, color = color)) +
    geom_point() +
    facet_wrap(~cut) +                                        # one panel per cut
    labs(title = "Diamonds", x = "Carat", y = "Price") +      # title and axis labels
    theme_bw()

1.4 ggvis

1.主要命令

  library(ggvis)
  library(dplyr)
  data(mtcars)

  # Points and a fitted curve: layer_points(), layer_smooths().
  # Besides fill, points also accept stroke, size, shape and opacity.
  mtcars %>%
    ggvis(~wt, ~mpg) %>%
    layer_points(fill = ~factor(cyl)) %>%
    layer_lines(stroke := "gray") %>%
    layer_smooths()

  # Lines and bars: layer_lines(), layer_bars().
  data(pressure)
  pressure %>%
    ggvis(~temperature, ~pressure) %>%
    layer_lines(stroke := "red") %>%
    layer_bars(width = 20, fill := NA)

2.统计作图

  # Box plots of mpg for each number of cylinders.
  mtcars %>%
    ggvis(~factor(cyl), ~mpg) %>%
    layer_boxplots()

  # Histogram of potency with labelled axes.
  cocaine %>%
    ggvis(~potency) %>%
    layer_histograms(width = 10, center = 0, fill := "pink") %>%
    add_axis("x", title = "potency") %>%
    add_axis("y", title = "histograms")

  # Density of potency: red outline, no fill.
  cocaine %>%
    ggvis(~potency) %>%
    layer_densities(fill := NA, stroke := "red") %>%
    add_axis("x", title = "potency") %>%
    add_axis("y", title = "densities")

3.回归预测

  # Scatter plot with a smooth curve and, in red, linear-model predictions
  # with a standard-error band.
  mtcars %>%
    ggvis(~wt, ~mpg) %>%
    layer_points() %>%
    layer_smooths() %>%
    layer_model_predictions(stroke := "red", model = "lm", se = TRUE)

  # One linear fit per cylinder group: group_by() before the model layer.
  mtcars %>%
    ggvis(~wt, ~mpg, fill = ~factor(cyl)) %>%
    layer_points() %>%
    group_by(cyl) %>%
    layer_model_predictions(model = "lm")

4.交互式作图

  # 1) Continuous adjustment with sliders: one slider controls the point size,
  #    the other the span of the smoother.
  mtcars %>%
    ggvis(~wt, ~mpg) %>%
    layer_points(size := input_slider(10, 100, value = 30)) %>%
    layer_smooths(span = input_slider(0.5, 1, value = 1))

  # 2) Input widgets: a slider for the bandwidth adjustment and a drop-down
  #    select box for the kernel.
  mtcars %>%
    ggvis(x = ~wt) %>%
    layer_densities(
      adjust = input_slider(
        .1, 2,
        value = 1, step = .1, label = "Bandwidth adjustment"
      ),
      kernel = input_select(
        c(
          "Gaussian" = "gaussian",
          "Epanechnikov" = "epanechnikov",
          "Rectangular" = "rectangular",
          "Triangular" = "triangular",
          "Biweight" = "biweight",
          "Cosine" = "cosine",
          "Optcosine" = "optcosine"
        ),
        label = "Kernel"
      )
    )

  # 3) Animated plot: each time the reactive re-runs, it drops the oldest
  #    observation from `dat` and appends a new random one.
  dat <- data.frame(time = 1:10, value = runif(10))
  ddat <- reactive({
    # Schedule the next re-evaluation (argument 2000 is in milliseconds).
    invalidateLater(2000, NULL)
    # `<<-` updates the `dat` defined outside the reactive expression.
    dat$time <<- c(dat$time[-1], dat$time[length(dat$time)] + 1)
    dat$value <<- c(dat$value[-1], runif(1))
    dat
  })
  ddat %>%
    ggvis(x = ~time, y = ~value, key := ~time) %>%
    layer_points() %>%
    layer_paths()

1.5 dygraphs

  library(dygraphs)

  # dyAnnotation(): attach annotations to points of the series.
  dygraph(presidents, main = "Presidential Approval") %>%
    dyAxis("y", valueRange = c(0, 100)) %>%
    dyAnnotation("1950-7-1", text = "A", tooltip = "Korea") %>%
    dyAnnotation("1965-1-1", text = "B", tooltip = "Vietnam")

  # dyAxis(): axis label and range.
  dygraph(nhtemp, main = "New Haven Temperatures") %>%
    dyAxis("y", label = "Temp (F)", valueRange = c(40, 60)) %>%
    dyOptions(axisLineWidth = 1.5, fillGraph = TRUE, drawGrid = FALSE)

  # dyEvent(): mark events on the time axis.
  dygraph(presidents, main = "Presidential Approval") %>%
    dyAxis("y", valueRange = c(0, 100)) %>%
    dyEvent("1950-6-30", "Korea", labelLoc = "bottom") %>%
    dyEvent("1965-2-09", "Vietnam", labelLoc = "bottom")

  # dyHighlight(): highlight the series under the mouse.
  lungDeaths <- cbind(ldeaths, mdeaths, fdeaths)
  dygraph(lungDeaths, main = "Deaths from Lung Disease (UK)") %>%
    dyHighlight(
      highlightCircleSize = 5,
      highlightSeriesBackgroundAlpha = 0.2,
      hideOnMouseOut = FALSE
    )

  # dyLegend(): legend display.
  dygraph(nhtemp, main = "New Haven Temperatures") %>%
    dySeries("V1", label = "Temperature (F)") %>%
    dyLegend(show = "always", hideOnMouseOut = FALSE)

  # dyLimit(): horizontal limit line, here at the series maximum.
  dygraph(presidents, main = "Presidential Approval") %>%
    dyAxis("y", valueRange = c(0, 100)) %>%
    dyLimit(
      max(presidents, na.rm = TRUE), "Max",
      strokePattern = "solid", color = "blue"
    )

  # dyOptions(): global options.
  dygraph(lungDeaths) %>%
    dyRangeSelector()
  dygraph(lungDeaths) %>%
    dySeries("mdeaths", label = "Male") %>%
    dySeries("fdeaths", label = "Female") %>%
    dyOptions(stackedGraph = TRUE) %>%
    dyRangeSelector(height = 20)
  # Holt-Winters forecast, 72 periods ahead, drawn with its prediction interval.
  hw <- HoltWinters(ldeaths)
  predicted <- predict(hw, n.ahead = 72, prediction.interval = TRUE)
  dygraph(predicted, main = "Predicted Lung Deaths (UK)") %>%
    dyAxis("x", drawGrid = FALSE) %>%
    dySeries(c("lwr", "fit", "upr"), label = "Deaths") %>%
    dyOptions(colors = RColorBrewer::brewer.pal(3, "Set1"))

  # dyRangeSelector(): time-window selector.
  dygraph(nhtemp, main = "New Haven Temperatures") %>%
    dyRangeSelector()
  dygraph(nhtemp, main = "New Haven Temperatures") %>%
    dyRangeSelector(dateWindow = c("1920-01-01", "1960-01-01"))
  dygraph(nhtemp, main = "New Haven Temperatures") %>%
    dyRangeSelector(height = 20, strokeColor = "")

  # dyRoller(): rolling average.
  # Y values are averaged over the specified number of time scale units.
  dygraph(discoveries, main = "Important Discoveries") %>%
    dyRoller(rollPeriod = 5)

  # dyShading(): shaded regions, first along the x axis, then along the y axis.
  dygraph(nhtemp, main = "New Haven Temperatures") %>%
    dyShading(from = "1920-1-1", to = "1930-1-1") %>%
    dyShading(from = "1940-1-1", to = "1950-1-1")
  dygraph(nhtemp, main = "New Haven Temperatures") %>%
    dyShading(from = "48", to = "52", axis = "y") %>%
    dyShading(from = "50", to = "50.1", axis = "y", color = "black")

2. 编程基础

介绍变量赋值、分支结构、循环结构、函数使用、获取帮助等知识。

2.1 通过赋值生成一个新变量

  # The usual assignment operator is `<-`.
  x <- 1.5
  # cat() writes to the console; it is also handy for debugging a program.
  cat("x = ", x, "\n", sep = "")
  # Numeric, character and logical vectors.
  y1 <- c(1.5, 2.3, 8.6, 7.4, 9.2)
  y2 <- c("MSFT", "GOOG", "AAPL")
  # Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
  y3 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE)
  # Rightward assignment (value on the left, name on the right); rarely used.
  3.1415926 -> z
  # assign() creates a variable from its name given as a string.
  assign("t", 1.414)

2.2 分支结构:if, if-else

  # if without else.
  a <- 1
  if (a == 1) {
    print("a == 1")
  }

  # if / else.
  a <- 2
  if (a > 1) {
    print("a > 1")
  } else {
    print("a <= 1")
  }

  # Multi-statement branches must be wrapped in {}, and `else` has to follow
  # the closing `}` on the same line.
  a <- 3
  if (a == 1) {
    a  # a bare name inside the block does not display the value of a
    print("I am a boy!")
  } else {
    print(a)  # print() does display the value of a
    print("I am a girl!")
  }

2.3 多重分支结构:if, ifelse, switch

  # 1) if - else if chain; every `else` must stay attached to the `}` before it.
  a <- 4
  if (a == 1) {
    print("a == 1")
  } else if (a == 2) {
    print("a == 2")
  } else {
    print("Not 1 & 2")
  }

  # 2) ifelse() evaluates the logical expression given first; where it is TRUE
  #    the second argument is returned, otherwise the third.
  a <- 2
  ifelse(a > 1, 3.1416, 1.414)

  # 3) switch() for multi-way branching: a numeric first argument selects the
  #    alternative at that position.
  switch(
    a,
    print("选项1"),
    print("选项2"),
    print("选项3"),
    print("选项4"),
    print("选项5")
  )

2.4 循环结构: for, while, repeat

  # 1) for: `in` walks through every element of a vector.
  iTotal <- 0
  for (i in 1:100) {
    iTotal <- iTotal + i
  }
  cat("1-100的累加和为:", iTotal, "\n", sep = "")

  # Character vectors can be iterated over in the same way.
  szSymbols <- c("MSFT", "GOOG", "AAPL", "INTL", "ORCL", "SYMC")
  for (SymbolName in szSymbols) {
    cat(SymbolName, "\n", sep = "")
  }

  # 2) while: loop as long as the condition holds.
  i <- 1
  iTotal <- 0
  while (i <= 100) {
    iTotal <- iTotal + i
    i <- i + 1
  }
  cat("1-100的累加和为:", iTotal, "\n", sep = "")  # show the result on screen

  # 3) repeat: an unconditional loop that has to be left from the inside.
  i <- 1
  iTotal <- 0
  repeat {
    iTotal <- iTotal + i
    i <- i + 1
    # `next` starts the next iteration, `break` leaves the loop.
    if (i <= 100) {
      next
    } else {
      break
    }
  }
  cat("1-100的累加和为:", iTotal, "\n", sep = "")

2.5 自定义函数 function

  # A small function can be defined in the script and called right away.
  # Example: raising a square matrix to a non-negative integer power.
  #
  # A: a square matrix.
  # n: a single non-negative whole number, the exponent.
  # Returns A multiplied by itself n times; the identity matrix when n is 0.
  mat_power <- function(A, n) {
    stopifnot(
      is.matrix(A), nrow(A) == ncol(A),
      is.numeric(n), length(n) == 1, !is.na(n), n >= 0, n == round(n)
    )
    if (n == 0) {
      return(diag(nrow(A)))
    }
    Apower <- A
    # seq_len(n - 1) is empty for n = 1, whereas 2:n would count down (2, 1)
    # and multiply twice too often.
    for (i in seq_len(n - 1)) {
      Apower <- Apower %*% A
    }
    Apower
  }
  A <- matrix(c(1:4), 2)
  mat_power(A, 3)
  # Cross-check against the explicit product.
  A %*% A %*% A
  # Larger functions are better saved in a separate .r file and loaded from it.
  # Drop the in-session definition first, then load the one stored in myfun.r.
  rm(mat_power)
  source("myfun.r")
  mat_power(A, 3)

2.6 获取帮助信息

  # Open the help page for a function (shown in the Help pane of RStudio).
  ?print
  # Run the examples from a help page.
  example(print)
  # Open the overview help page of an add-on package; `?` searches attached
  # packages, so quantmod must be installed and loaded first.
  ?quantmod
  # List objects on the search path whose names match a regular expression.
  # The pattern is a regex, not a glob: "print*" would mean "prin" followed by
  # any number of "t" and would also match names such as princomp.
  apropos("print")
  # Run the demonstration of R's graphics capabilities.
  demo(graphics)
  # If you do not know the exact name of a package or command, search for it
  # on Google or at http://rseek.org/.
参考资料

  1. R.I. Kabacoff著:《R语言实战(第2版)》,王小宁、刘撷芯、黄俊文译,人民邮电出版社,2016
  2. Rstudio: ggplot2 Cheat Sheet
  3. Roger D. Peng: Exploratory Data Analysis, Lecture Notes
  4. The DataCamp Team: Questions All R Users Have About Plots
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注