@fanxy 2020-10-23T07:03:19.000000Z 字数 6359 阅读 9133

数量分析软件应用：R语言

樊潇彦 复旦大学经济学院 数量软件

数量分析软件应用：R语言
考试.zip
准备工作
第一讲 R语言基础
第二讲数据处理
第三讲数据可视化与编程基础
课堂练习
- 2020年10月9日
- 2020年10月16日：检验巴萨效应
第四讲概率统计与统计学习基础（选读）
- 4.1 概率统计基础
- 4.2 统计学习基础
第五讲计量回归基础与应用（选读）
- 5.1 计量回归基础
- 5.2 广义线性回归与面板数据分析

考试.zip

bop_iip.csv472.5kB
Exam20201023.R1.2kB

准备工作

资料下载：
- Kabacoff, R.I. 著：《R语言实战（第2版）》，王小宁等译，人民邮电出版社，2016
  RiA2_Code.zip-491.4kB
- Rstudio: Cheatsheets
  R_Cheetsheet.rar-2272.5kB
软件安装：
- 下载安装 R 和 Rstudio；
- 在 RStudio 中依次选定 File --> New File --> R Script，新建一个R语言脚本（看起来像个.txt文件），把code直接拷到脚本里；
- 改一下工作目录 setwd("D:\\...")，选中某段程序，点右上角的 run 就能运行了。
工作界面
- 左上：程序和变量窗口，右侧有运行Run等命令键
- 右上：显示内存中的变量名(Environment)和历史命令(History)
- 左下：控制台(Console)窗口，可以用 Ctrl+L 清空
- 右下：工作目录(Files)、作图(Plots)、管理包(Packages)、帮助(Help)等。

第一讲 R语言基础

第二讲数据处理

第三讲数据可视化与编程基础

课堂练习

2020年10月9日

下载数据并解压到工作目录：
20201009.rar

#---------- 1. Setup ------------------------------------
# "D:\\..." is a placeholder: replace it with the folder that holds the
# unpacked data files before running the script.
setwd("D:\\...")                  # set the working directory; the data files live in the same folder
library(tidyverse)
# The three packages below belong to the core tidyverse and are attached by
# library(tidyverse); they are kept here, commented out, for reference only.
# library(dplyr)
# library(tidyr)
# library(ggplot2)
library(readxl)
#---------- 2. GDP and growth rate ---------------------------------
# Raw table read from gdp.csv. `data` is also used by the industry-structure
# section that follows, so the name must not change.
data <- read.csv("gdp.csv")
str(data)

# Build a long table (year, var, value) where var is either "GDP" (the level)
# or "gr" (growth relative to the previous row once the rows are sorted by year).
gdp <- data %>%
  rename(year = Sgnyea, GDP = Gdp0101) %>%
  arrange(year) %>%
  mutate(gr = GDP / dplyr::lag(GDP) - 1) %>%
  select(year, GDP, gr) %>%
  gather(key = var, value = value, -year)

# One panel per series, each with its own axis ranges; dotted vertical lines
# mark the years 1978, 2001 and 2008.
ggplot(gdp, aes(x = year, y = value)) +
  geom_line(size = 1) +                                  # line chart, width 1
  geom_vline(
    xintercept = c(1978, 2001, 2008),
    colour = "black",
    linetype = "dotted"
  ) +
  facet_wrap(~var, scales = "free") +
  scale_x_continuous(breaks = seq(1952, 2017, by = 13)) +
  labs(title = "", x = "", y = "") +                     # blank title and axis titles
  theme_bw() +                                           # black-and-white theme
  theme(
    legend.position = "bottom",
    # Facet strip label size; strip.text.x / strip.text.y would address the
    # horizontal and vertical strips separately.
    strip.text = element_text(size = 12),
    axis.text.x = element_text(size = 11),               # x-axis tick label size
    axis.text.y = element_text(size = 11)                # y-axis tick label size
  )
#---------- 3. Industry structure ---------------------------------
# Lookup table that maps the share codes sh1..sh3 to their Chinese legend
# labels. var_cn is an ordered factor so that legend order and the manual
# colour order below follow the order given in `levels`.
share <- data.frame(
  var = paste0("sh", 1:3),                               # variable code
  var_cn = factor(
    c("第一产业", "第二产业", "第三产业"),               # Chinese display name
    levels = c("第一产业", "第二产业", "第三产业"),
    ordered = TRUE
  ),
  stringsAsFactors = FALSE
)

# Shares of GDP by industry, reshaped to long format and labelled via `share`.
# NOTE(review): Gdp0102 / Gdp0103 / Gdp0106 are presumably the value added of
# the primary / secondary / tertiary industry -- confirm against the data
# dictionary of gdp.csv.
gdp_sh <- data %>%
  rename(year = Sgnyea, gdp = Gdp0101) %>%
  mutate(sh1 = Gdp0102 / gdp, sh2 = Gdp0103 / gdp, sh3 = Gdp0106 / gdp) %>%
  select(year, sh1:sh3) %>%
  gather(var, share, -year) %>%
  left_join(share, by = "var") %>%
  arrange(year, var_cn)

ggplot(gdp_sh, aes(year, share, color = var_cn)) +
  geom_line(size = 1) +
  labs(title = "历年GDP产业结构", x = "", y = "") +
  scale_colour_manual(values = c("green", "red", "blue")) +   # line colours, in factor-level order
  scale_x_continuous(breaks = seq(1952, 2017, by = 5)) +
  # Percent labels on the y axis. Observations outside `limits` are dropped
  # from the plot (ggplot2 warns about removed rows).
  scale_y_continuous(
    limits = c(0.05, 0.55),
    breaks = seq(0.05, 0.55, by = 0.1),
    labels = scales::percent
  ) +
  guides(color = guide_legend(title = NULL)) +               # no title on the colour legend
  theme_bw() +
  theme(
    legend.position = "bottom",                              # legend below the plot
    legend.text = element_text(size = 12),                   # legend label size
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11)
  )
#---------- 4. Urban and rural household income ---------------------------------
ruub <- read_excel("CME_Consmp3.xls")  # read the data
# The columns are renamed by position, so fail loudly if the sheet layout is
# not the expected seven columns instead of silently mislabelling them.
stopifnot(ncol(ruub) == 7L)
colnames(ruub) <- c(
  "year", "inc_ru", "inc_ub", "incid_ru", "incid_ub", "engel_ru", "engel_ub"
)
ruub <- ruub %>%
  select(-incid_ru, -incid_ub) %>%            # drop the two incid_* indicators
  gather(var, value, -year, na.rm = TRUE) %>% # long format: year, var, value
  # Split names such as "inc_ru" into var = "inc" and region = "ru".
  # region must be derived first, because the second assignment overwrites var.
  mutate(
    region = sub(".*_", "", var),
    var = sub("_.*", "", var)
  )

# Urban vs rural per-capita income in one chart
ggplot(ruub %>% filter(var == "inc"), aes(year, value, color = region)) +
  geom_line()
# Urban vs rural Engel series in one chart
ggplot(ruub %>% filter(var == "engel"), aes(year, value, color = region)) +
  geom_line()
# Facet by var to compare income and the Engel series side by side
ggplot(ruub, aes(year, value, color = region)) +
  geom_line() +
  facet_wrap(~var, scales = "free")

2020年10月16日：检验巴萨效应

下载数据：20201016.rar

#---------- 0. Setup ---------------------------------------
# "D:\\..." is a placeholder: replace it with the folder holding the data files.
setwd("D:\\...")
# ggrepel provides geom_text_repel(), which places text labels so they do not
# overlap. Install it only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("ggrepel", quietly = TRUE)) {
  install.packages("ggrepel")
}
library(tidyverse)
library(readxl)
library(ggrepel)
#---------- 1. Country information ---------------------------------
# The six economies highlighted in the line charts further down.
country6 <- c(
  "China", "Japan", "United Kingdom",
  "Germany", "France", "United States"
)
# Country lookup table; the later sections join on its iso2 / iso3 columns and
# use country_cn and country_UN for labels.
countryinfo <- read_excel(path = "countryinfo.xlsx")
#---------- 2. BIS real exchange rate index ---------------------------------
# BIS "Effective exchange rate indices (monthly)"
#    https://www.bis.org/statistics/full_data_sets.htm
re_ori <- read.csv("BISWEB_EERDATAFLOW_csv_col.csv")
# colnames(re_ori)[689]  # last month in this download: 2020.08
str(re_ori[, 1:8])
table(re_ori[, 3])

# Annual average index per country, joined with the country lookup table.
# NOTE(review): EER_TYPE == "R" / EER_BASKET == "B" presumably select the real
# index on the broad basket -- confirm against the BIS code list.
# The 2020 average only covers the months up to X2020.08.
re <- re_ori %>%
  filter(EER_TYPE == "R", EER_BASKET == "B") %>%
  rename(iso2 = REF_AREA, country = Reference.area) %>%
  select(iso2, country, X1964.01:X2020.08) %>%
  gather(time, re, -iso2, -country, na.rm = TRUE) %>%
  mutate(year = as.numeric(substring(time, 2, 5))) %>%   # "X1964.01" -> 1964
  group_by(iso2, country, year) %>%
  summarise(re = mean(re)) %>%
  ungroup() %>%                                          # drop the grouping left by summarise()
  left_join(countryinfo, by = "iso2")

# Six-country subset; only the 2020 observation carries a text label so that
# each line is labelled once, at its right-hand end.
g_re <- re %>%
  filter(country %in% country6) %>%
  mutate(label = ifelse(year == 2020, country_cn, ""))
ggplot(g_re, aes(year, re, color = country_cn)) +
  geom_line(size = 1) +
  geom_text_repel(aes(label = label)) +
  labs(title = "", x = "", y = "") +
  scale_x_continuous(breaks = seq(1994, 2020, 2)) +      # continuous x axis, tick every 2 years
  theme_bw() +
  theme(
    legend.position = "none",                 # lines are labelled directly, so hide the legend
    # Facet strip label size; strip.text.x / strip.text.y would address the
    # horizontal and vertical strips separately.
    strip.text = element_text(size = 11),
    axis.text.x = element_text(size = 11),    # x-axis tick label size
    axis.text.y = element_text(size = 11)     # y-axis tick label size
  )
summary(re)

# Regression input: cumulative growth of the index between 1994 and 2017
# (the PWT data used in the next section end in 2017).
data_re <- re %>%
  spread(year, re) %>%
  mutate(grre = `2017` / `1994` - 1) %>%
  select(iso3, grre)
#---------- 3. PWT real GDP per capita ---------------------------------
# Penn World Table version 9.1
#    https://www.rug.nl/ggdc/productivity/pwt/
pwt <- read_excel("pwt91.xlsx", sheet = "Data")
str(pwt)

# Real GDP per capita (rgdpe / pop), rows without a value removed, joined with
# the country lookup table on the ISO3 code.
pwt <- pwt %>%
  mutate(rgdppa = rgdpe / pop) %>%
  filter(!is.na(rgdppa)) %>%
  rename(iso3 = countrycode) %>%
  select(year, iso3, country, rgdppa) %>%
  left_join(countryinfo, by = "iso3")

# Six-country subset; only the 1992 observation carries a text label so that
# each line is labelled once.
g_pwt <- pwt %>%
  filter(country %in% country6) %>%
  mutate(label = ifelse(year == 1992, country_cn, ""))
ggplot(g_pwt, aes(year, rgdppa, color = country_cn)) +
  geom_line(size = 1) +
  geom_text_repel(aes(label = label)) +
  labs(title = "", x = "", y = "") +
  # Continuous x axis; the China series starts in 1952.
  scale_x_continuous(breaks = seq(1952, 2017, 5)) +
  theme_bw() +
  theme(
    legend.position = "none",                 # lines are labelled directly, so hide the legend
    # Facet strip label size; strip.text.x / strip.text.y would address the
    # horizontal and vertical strips separately.
    strip.text = element_text(size = 11),
    axis.text.x = element_text(size = 11),    # x-axis tick label size
    axis.text.y = element_text(size = 11)     # y-axis tick label size
  )

# Regression input: cumulative growth of real GDP per capita, 1994-2017.
# 1994 is the base year because the exchange-rate series for China starts then.
data_pwt <- pwt %>%
  filter(year >= 1994) %>%
  spread(year, rgdppa) %>%
  mutate(grgdppa = `2017` / `1994` - 1) %>%
  select(iso3, country_UN, country_cn, grgdppa)
#---------- 4. Merge the data and test the Balassa-Samuelson effect ---------------------------------
# Full outer merge on iso3, then keep only the countries for which both growth
# rates are available.
data <- merge(data_re, data_pwt, by = "iso3", all = TRUE) %>%
  filter(!is.na(grre) & !is.na(grgdppa)) %>%
  arrange(country_cn) %>%
  select(iso3, country_cn, country_UN, grre, grgdppa)

# Scatter plot with a fitted OLS line; the formula is spelled out so that
# geom_smooth() does not have to report the one it picked.
ggplot(data, aes(grgdppa, grre)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_text_repel(aes(label = country_cn)) +
  labs(
    title = "检验巴拉萨-萨缪尔森效应（1994-2017）",
    x = "实际人均GDP增长",
    y = "实际汇率指数增长"
  ) +
  theme_bw() +
  theme(
    # Facet strip label size; strip.text.x / strip.text.y would address the
    # horizontal and vertical strips separately.
    strip.text = element_text(size = 11),
    axis.text.x = element_text(size = 11),    # x-axis tick label size
    axis.text.y = element_text(size = 11)     # y-axis tick label size
  )

# OLS of exchange-rate index growth on real GDP per capita growth.
regbs <- lm(grre ~ grgdppa, data = data)
summary(regbs)

数量分析软件应用：R语言

准备工作

课堂练习

2020年10月9日

2020年10月16日：检验巴萨效应

第四讲 概率统计与统计学习基础（选读）

第五讲 计量回归基础与应用（选读）

内容目录

第四讲概率统计与统计学习基础（选读）

第五讲计量回归基础与应用（选读）