@fanxy
2021-09-15T12:17:21.000000Z
字数 9132
阅读 9057
樊潇彦
复旦大学经济学院
金融数据
数据下载:Ch02.rar
setwd("D:\\...\\Ch02")
## tidyverse 包是一个综合包,内含数据读取与处理(readr,tidyr,dplyr)、作图(ggplot2)、特定数据类型处理(tibble,stringr,forcats),以及函数化编程(purrr)等常用包。
install.packages("tidyverse")
install.packages(c("readstata13","haven","readxl")) # 读写 stata、excel 等其他格式的数据
install.packages(c("babynames","hflights")) # 含姓名和航班数据样本的包
## 调用
library(tidyverse)
library(readstata13)
library(haven)
library(readxl)
library(babynames)
library(hflights)
Wiki: Data are values of qualitative or quantitative variables, belonging to a set of items.
.Rdata
、.csv
、.txt
、.xlsx
、.dta
等)。
# 存放和读取.Rdata
library(quantmod)
getSymbols("^GSPC",from="2015-1-1")
tail(GSPC)
save(GSPC,file="GSPC.Rdata") # 将保存历史记录的数据框直接写入本地Rdata文件
rm(GSPC) # 删除变量GSPC之后,内存中无内容,无法访问
load("GSPC.RDATA") # 装载save()函数保存的数据
tail(GSPC)
# .csv文件
class(GSPC) # 原数据为时序类型
GSPC_df=data.frame(Date=as.Date(index(GSPC)), # 日期
coredata(GSPC)) # 数据
class(GSPC_df) # 类型变为 data.frame
str(GSPC_df) # 查看数据结构
write.csv(GSPC,file="GSPC_csv.csv", row.names=F) # 将数据写入.csv文件,row.names=F不会保留行号
GSPC <- read.csv("GSPC_csv.csv",header=T,skip=0) # read.csv为系统自带命令;含表头,即第一行为变量名
class(GSPC) # data.frame类型,但没有日期
# .txt文件
write.table(GSPC_df, "GSPC_txt.txt") # 将.csv数据写为.txt格式
GSPC <- read.table("GSPC_txt.txt",header = T, # read.table为系统自带命令
stringsAsFactors =F) # 字符不要转成因子,第一列 Date 作为字符处理
str(GSPC)
# .dta文件
library(readstata13)
save.dta13(GSPC_df,"GSPC.dta",version=14)
GSPC_dta=read.dta13("GSPC.dta")
str(GSPC_dta)
library(haven)
GSPC=GSPC_df
colnames(GSPC)=gsub("GSPC.","",colnames(GSPC)) # 原变量名中含有"GSPC.",去掉后保存,否则报错
write_dta(GSPC, "GSPC.dta", version=14)
GSPC_dta=read_dta("GSPC.dta")
str(GSPC_dta)
# .xls 或 .xlsx文件
library(readxl)
loan=read_excel("loan.xls")
str(loan) # 发现部分变量名有误,在练习 loan 中介绍如何处理
readr
包读写和处理1. read_*, write_*
:读写不同格式数据文件
library(readr)
library(readxl)
# 以 ',' 为分割符的 .csv 文件
read_csv(system.file("extdata/mtcars.csv", package = "readr"))
read_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))
read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr"))
read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
read_csv("x,y\n1,2\n3,4")
# 其他常用分隔符
data=c("a;b\tc d|e\nf;g\th|i j")
read_csv2(data, col_names=F) # ;
read_tsv(data, col_names=F) # \t
read_table(data, col_names=F) # " "
read_lines(data) # \n
read_delim(data, delim = "|", col_names=F) # 通用,如"|"
# 读入列为固定长度的文件
read_fwf(data, fwf_empty(data)) # 空格分割
read_fwf(data, fwf_widths(c(2, 7))) # 规定每一列字符数
read_fwf(data, fwf_positions(c(1,3), c(2,8))) # 规定起止字符位置
# 作为字符读入
read_file(data)
# 读入log文件
read_log(system.file("extdata/example.log", package = "readr"))
# 将系统自带的 iris 数据存为.csv, .txt等
write_csv(iris, "iris.csv") # ,
write_tsv(iris, "iris.txt") # \t
write_delim(iris, "iris.csv", delim=",") # delim 可以设不同分割符
2. cols
: 设置列向量的类型
?
= guess-_/-
to skip the column.
# 读入数据,最后一列为 character
iris=read_csv("iris.csv")
str(iris)
# 改为 factor
iris=read_csv("iris.csv", col_types = cols(
Species = col_factor(c("setosa", "versicolor", "virginica"))
))
str(iris)
# 只读入一列 cols_only
iris=read_csv("iris.csv", col_types = cols_only(
Species = col_factor(c("setosa", "versicolor", "virginica"))
))
head(iris)
# type_convert:直接转换为最有可能的格式
df <- data.frame(
x = as.character(runif(10)),
y = as.character(sample(10)),
stringsAsFactors = FALSE
)
str(df)
str(type_convert(df))
df <- data.frame(x = c("NA", 10), stringsAsFactors = FALSE)
str(type_convert(df))
3. parse_*
: 字符向量转换
parse_integer(c("1", "2", "3"))
parse_double(c("1", "2", "3.123"))
parse_factor(c("a", "b"), letters)
parse_number("$1,123,456.00")
# 空值处理
x <- c("1", "2", "3", "-")
parse_double(x, na = "-")
4. 其他
# 设置中国时间格式、数字分隔符和时区
myloc <- locale(date_names ="zh", decimal_mark =",", tz="PRC" )
parse_datetime(x=Sys.time(), locale=myloc)
# 报错
y <- read_csv("x\n1\n2\nb", col_types = list(col_double()))
y
problems(y)
rm(list=ls())
# 查看数据情况
data(iris)
dim(iris) # 几行几列
nrow(iris); ncol(iris)
colnames(iris) # 指标名称
attributes(iris) # 数据性质
str(iris) # 数据结构
summary(iris)
# 剔除缺失值和异常值样本
set.seed(1)
a=rnorm(100); b=a+rnorm(100) # 生成模拟数据
a[c(2, 3)]="NA" # a有4个缺失值 NA 和 NULL
a[c(4, 5)]="NULL"
b[50]=10*max(b) # b有异常值
simdata=data.frame(id=seq(1,100),a=a,b=b)
head(simdata)
str(simdata) # 发现a是因子变量
simdata$a=as.numeric(as.character(a)) # 把a转换成数值变量
na.fail(simdata) # 检测向量、矩阵、数据框中是否包含NA数值,如果包含返回错误信息
simdata=na.omit(simdata) # 去掉有缺失值的样本
varplot=c("a","b") # 需要作图的指标
par(mfrow=c(1,length(varplot))) # 设置窗口
for (i in varplot) boxplot(simdata[,i], main=i) # 做箱线图,发现b有异常值
outlier=abs(simdata$b-mean(simdata$b))>3*sd(simdata$b) # 异常值所在行的逻辑向量
simdata=simdata[-which(outlier),] # 去除异常值
summary(simdata)
tidyr
和 dplyr
包处理数据1. 为什么用 tidyr
和 dplyr
?
- Classroom data are like teddy bears and real data are like a grizzley bear with salmon blood dripping out its mouth.(Jenny Bryan)
- Up to 80% of data analysis is spent on the process of cleaning and preparing data. (Wickham, 2014 and Dasu and Johnson, 2003)
- `tidyr` & `dplyr` provide fundamental functions for cleaning, processing, & manipulating data, making 95% of your data wrangling tasks much easier!(Brad Boehmke, 2015)
2. 基本思想:化整为零、逐步处理
library(babynames)
library(tidyr)
library(dplyr, warn.conflicts = FALSE)
a=babynames %>%
filter(name == "Hadley") %>%
group_by(year) %>%
summarise(total = sum(n)) %>%
arrange(desc(year))
b0 <- babynames
b1 <- filter(b0, name == "Hadley")
b2 <- group_by(b1, year)
b3 <- summarise(b2, total = sum(n))
b4 <- arrange(b3, desc(year))
# 该命令还可以用于函数,如计算x1和x2之间的欧氏距离
x1 <- 1:5; x2 <- 2:6; sqrt(sum((x1-x2)^2))
(x1-x2)^2 %>% sum() %>% sqrt()
3. tidyr
tb <- read_csv("tb.csv")
class(tb)
head(tb)
## 把 m04 (男-4岁)等列指标转换为 demographic = 指标名, n = 指标值的行样本
tb2 <- tb %>%
gather(demographic, n, m04:fu, na.rm = TRUE)
tb2
## 把 demographic 分为 sex 和 age 两个指标
tb3 <- tb2 %>%
separate(demographic, c("sex", "age"), 1)
tb3
## 重命名、排序
tb4 <- tb3 %>%
rename(country = iso2) %>%
arrange(country, year, sex, age)
tb4
4. dplyr
library(dplyr)
library(hflights)
data(hflights) # 调用航班数据
flights <- tbl_df(hflights) # 生成本地数据
glimpse(flights) # 查看数据情况
filter(flights, UniqueCarrier %in% c("AA", "UA"))
filter(flights, Month==1, DayofMonth==1)
filter(flights, Month==1 | DayofMonth==1)
select(flights, DepTime, ArrTime, FlightNum)
select(flights, Year:DayofMonth, contains("Taxi"), contains("Delay"))
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay)) # DepDelay指标的降序,升序为arrange(DepDelay)
flights <- flights %>% mutate(Speed = Distance/AirTime*60)
# 按Dest分组,计算平均ArrDelay
flights %>%
group_by(Dest) %>%
summarise(avg_delay = mean(ArrDelay, na.rm=TRUE))
# 计算组内多个指标的情况,n()为组内样本量,n_distinct(TailNum)为counts the number of unique items in TailNum
flights %>%
group_by(Dest) %>%
summarise(flight_count = n(), plane_count = n_distinct(TailNum))
# 分组对多个变量Cancelled, Diverted,计算统计指标mean
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(mean), Cancelled, Diverted)
# 分组对一个变量Delay计算多个指标min、max
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("Delay"))
# 分组观察一个变量的统计表(number of cancelled and not cancelled flights)
flights %>%
group_by(Dest) %>%
select(Cancelled) %>%
table()
mean
)的n-input, 1-output
不同,Window function
是n-input, n-output
的统计函数,如min_rank
、lead
、lag
、cummean
(累积加总)等。
# 每架飞机在1年中的哪两天延误时间最长 top_n(2) 等同于filter(min_rank(desc(DepDelay)) <= 2)
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
top_n(2) %>%
arrange(UniqueCarrier, desc(DepDelay))
# 计算每个月飞行的航班数和变动情况
flights %>%
group_by(Month) %>%
summarise(flight_count = n()) %>%
mutate(change = flight_count - lag(flight_count))
flights %>% sample_n(5) # 抽5个样本
flights %>% sample_frac(0.25, replace=TRUE) # 抽25%样本
rm(list=ls())
# 读取银行贷款数据
loan=read_excel("loan.xls")
str(loan) # 发现变量名有误:(1)先打开数据,选中要复制的表头;(2)运行以下命令将表头复制到timevec
timevec=read.table("clipboard", sep = "\t", header = T) # 从内存剪贴板中读取数据
colnames(loan)=c("var","freq",colnames(timevec)) # 更改变量名
# 用 `tidyr` 和 `dplyr` 包的命令整理
loan_year= loan %>%
filter(freq == "年") %>% select(-freq) %>%
separate(var,c("bank","var"))%>% # 将原来的 "银行名:贷款总额" 拆为两个新变量
select(-var)%>% # 用 table(loan_year$var) 查看后删除 var
gather(time, loan_vol, -bank, na.rm =T) %>%
mutate(time=gsub("X","",time))%>% # 将原来的 "X2000.12" 中的 X 去掉
separate(time,c("year","month"))%>% # 将 "2000.12" 拆为年和月
mutate(year=as.numeric(year))%>% # 字符改为数值
select(-month)%>% # 用 table(loan_year$month) 查看后删除 month
arrange(bank,year) # 按银行名和年份排序
str(loan_year)
loan_quar=loan %>%
filter(freq == "季") %>% select(-freq) %>%
separate(var,c("bank","var"))%>%select(-var)%>%
gather(time, loan_vol, -bank, na.rm =T) %>%
mutate(time=gsub("X","",time))%>%
separate(time,c("year","month")) %>%
mutate(year=as.numeric(year)) %>%
mutate(quar=as.numeric(month)/3) %>% # 用 table(loan_year$quar) 查看是否正确
select(bank,year,quar,loan_vol)%>% # 按顺序选择变量
arrange(bank,year,quar) # 按银行名、年份和季度排序
str(loan_quar)