@Channelchan
2018-04-19T09:31:16.000000Z
字数 7532
阅读 53303
Alphalens是一个Python第三方库,专门用于选股因子alpha(α)的绩效分析。
下载方式: pip install alphalens
官方网站: http://quantopian.github.io/alphalens/index.html
factor: MultiIndex(用stack()方法来转换)
prices: DataFrame
索引要求均为日期-datetime日期格式
将以上数据通过alphalens处理成标准格式-factor_data 用于进一步分析
下面以沪深300成分股为例,构造选股因子(factor)并
from jaqs_fxdayu.data import DataView # 可以视为一个轻量级的数据库,数据格式基于pandas,方便数据的调用和处理
from jaqs_fxdayu.data import RemoteDataService # 数据服务,用于下载数据
import os
import warnings
warnings.filterwarnings("ignore")
dataview_folder = './Factor'
if not (os.path.isdir(dataview_folder)):
os.makedirs(dataview_folder)
# 数据下载
def save_dataview():
data_config = {
"remote.data.address": "tcp://data.tushare.org:8910",
"remote.data.username": "18566262672",
"remote.data.password": "eyJhbGciOiJIUzI1NiJ9.eyJjcmVhdGVfdGltZSI6IjE1MTI3MDI3NTAyMTIiLCJpc3MiOiJhdXRoMCIsImlkIjoiMTg1NjYyNjI2NzIifQ.O_-yR0zYagrLRvPbggnru1Rapk4kiyAzcwYt2a3vlpM"
}
ds = RemoteDataService()
ds.init_from_config(data_config)
dv = DataView()
props = {'start_date': 20140101, 'end_date': 20140201, 'universe': '000300.SH',
'fields': "pb,pe,ps,float_mv,sw1",
'freq': 1}
dv.init_from_config(props, ds)
dv.prepare_data()
dv.save_dataview(dataview_folder) # 保存数据文件到指定路径,方便下次直接加载
# save_dataview()
# 加载数据
dv = DataView()
dv.load_dataview(dataview_folder)
Dataview loaded successfully.
dv.fields
['pb',
'pe',
'ps',
'float_mv',
'sw1',
'open_adj',
'high_adj',
'low_adj',
'close_adj',
'open',
'high',
'low',
'close',
'vwap',
'vwap_adj',
'adjust_factor',
'index_member',
'index_weight',
'sw1',
'volume',
'_daily_adjust_factor',
'_limit']
import pandas as pd
from datetime import datetime
factor = dv.get_ts("pb")
factor.index = pd.Index(map(lambda x: datetime.strptime(str(x),"%Y%m%d") , factor.index)) #索引调整为datetime日期格式
factor = factor.stack()#处理成MultiIndex格式(alphalens分析因子必须的格式)
factor.head()
symbol
2014-01-02 000001.SZ 1.0563
000002.SZ 1.2891
000008.SZ 4.8981
000009.SZ 3.5794
000012.SZ 2.3725
dtype: float64
def change_index(df):
df.index = pd.Index(map(lambda x: datetime.strptime(str(x),"%Y%m%d") , df.index)) #索引调整为datetime日期格式
return df
prices = dv.get_ts("close_adj") #获取价格
prices = change_index(prices)
prices.head()
symbol | 000001.SZ | 000002.SZ | 000008.SZ | 000009.SZ | 000012.SZ | 000024.SZ | 000027.SZ | 000039.SZ | 000046.SZ | 000059.SZ | ... | 601992.SH | 601997.SH | 601998.SH | 603000.SH | 603160.SH | 603288.SH | 603699.SH | 603858.SH | 603885.SH | 603993.SH |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2014-01-02 | 714.073976 | 933.813512 | 45.579438 | 42.049039 | 129.513427 | 105.238518 | 43.384556 | 349.062344 | 130.893481 | 7.614983 | ... | 6.813780 | NaN | 4.611372 | 79.319265 | NaN | NaN | NaN | NaN | NaN | 6.541190 |
2014-01-03 | 696.557852 | 916.282595 | 45.326219 | 40.930713 | 127.447568 | 103.445262 | 42.353478 | 354.474163 | 128.550869 | 7.339742 | ... | 6.422065 | NaN | 4.539693 | 78.055898 | NaN | NaN | NaN | NaN | NaN | 6.438341 |
2014-01-06 | 681.377212 | 874.208394 | 43.097891 | 40.572849 | 120.296520 | 97.911786 | 41.401715 | 342.748554 | 126.501083 | 6.988046 | ... | 6.133433 | NaN | 4.468013 | 77.762796 | NaN | NaN | NaN | NaN | NaN | 6.191504 |
2014-01-07 | 679.041729 | 868.364755 | 42.490165 | 40.483383 | 119.660872 | 95.759878 | 41.243088 | 353.121208 | 124.451297 | 6.865717 | ... | 6.143741 | NaN | 4.456067 | 79.743757 | NaN | NaN | NaN | NaN | NaN | 6.191504 |
2014-01-08 | 686.632049 | 867.196028 | 41.679864 | 39.767655 | 119.343047 | 93.146848 | 41.718969 | 351.542761 | 126.208256 | 6.835135 | ... | 5.947883 | NaN | 4.468013 | 81.866214 | NaN | NaN | NaN | NaN | NaN | 6.098940 |
5 rows × 472 columns
import alphalens
#计算目标股票池每只股票的持有期收益,和对应因子值的quantile分类
factor_data = alphalens.utils.get_clean_factor_and_forward_returns(factor, prices, quantiles=5, periods=(1,5,10))
factor_data.head()
1 | 5 | 10 | factor | factor_quantile | ||
---|---|---|---|---|---|---|
date | asset | |||||
2014-01-02 | 000001.SZ | -0.024530 | -0.033524 | -0.044154 | 1.0563 | 1 |
000002.SZ | -0.018773 | -0.066333 | -0.100125 | 1.2891 | 2 | |
000008.SZ | -0.005556 | -0.102222 | -0.054444 | 4.8981 | 4 | |
000009.SZ | -0.026596 | -0.075532 | -0.057447 | 3.5794 | 4 | |
000012.SZ | -0.015951 | -0.093252 | -0.039264 | 2.3725 | 3 |
import matplotlib.pyplot as plt
mean_return_by_q, std_err_by_q = alphalens.performance.mean_return_by_quantile(factor_data, by_date=True, demeaned=True)
alphalens.plotting.plot_cumulative_returns_by_quantile(mean_return_by_q, 1)
alphalens.plotting.plot_cumulative_returns_by_quantile(mean_return_by_q, 5)
alphalens.plotting.plot_cumulative_returns_by_quantile(mean_return_by_q, 10)
plt.show()
度量变量的预测值与实际值之间的相关关系。信息系数是用来评估金融分析师预测技能的一种表现方法。
系数在-1到1之间,越大表示正相关程度强。
这里,我们可以用信息系数来评价一个选股因子效力的好坏。也即,用因子值大小与下一期股票收益大小的相关程度来评估因子。
经验来看,通常|mean(IC)|>0.02可以作为判定一个选股因子有收益预测效力的标准
其中d为秩次差。
A = [1,3,5,7,9]
B = [3,2,4,5,1]
A的排序是1,2,3,4,5
B的排序是3,2,4,5,1
d为排序相减
接下来我们要计算的因子IC值,即为当期因子值与下期股票收益值的spearman相关系数。
ic = alphalens.performance.factor_information_coefficient(factor_data)
ic.head()
1 | 5 | 10 | |
---|---|---|---|
date | |||
2014-01-02 | 0.325391 | 0.367524 | 0.338448 |
2014-01-03 | 0.086336 | 0.231268 | 0.277532 |
2014-01-06 | 0.216727 | 0.159411 | 0.203103 |
2014-01-07 | 0.376174 | 0.151324 | 0.135532 |
2014-01-08 | -0.041003 | 0.161179 | 0.002910 |
alphalens.plotting.plot_ic_hist(ic) #因子IC分布
alphalens.plotting.plot_ic_ts(ic) #因子IC时间序列曲线
plt.show()
# 月均IC热度图
mean_monthly_ic = alphalens.performance.mean_information_coefficient(factor_data, by_time='M')
alphalens.plotting.plot_monthly_ic_heatmap(mean_monthly_ic)#因子月均IC表现
plt.show()
import numpy as np
excel_data = factor_data[factor_data['factor_quantile']==1]["factor_quantile"].unstack().replace(np.nan, 0)
excel_data.to_excel('./pb_quantile_1_by_alphalens.xlsx')
excel_data.head()
asset | 000001.SZ | 000002.SZ | 000024.SZ | 000027.SZ | 000039.SZ | 000059.SZ | 000063.SZ | 000069.SZ | 000157.SZ | 000338.SZ | ... | 601898.SH | 601901.SH | 601918.SH | 601939.SH | 601985.SH | 601988.SH | 601991.SH | 601992.SH | 601997.SH | 601998.SH |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | |||||||||||||||||||||
2014-01-02 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 |
2014-01-03 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 |
2014-01-06 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 |
2014-01-07 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 |
2014-01-08 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 |
5 rows × 192 columns
sectors = dv.get_ts("sw1")
sectors = change_index(sectors)
sectors.head()
symbol | 000001.SZ | 000002.SZ | 000008.SZ | 000009.SZ | 000012.SZ | 000024.SZ | 000027.SZ | 000039.SZ | 000046.SZ | 000059.SZ | ... | 601992.SH | 601997.SH | 601998.SH | 603000.SH | 603160.SH | 603288.SH | 603699.SH | 603858.SH | 603885.SH | 603993.SH |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2014-01-02 | 480000 | 430000 | 460000 | 510000 | 610000 | 430000 | 410000 | 640000 | 430000 | 220000 | ... | 610000 | 480000 | 480000 | 720000 | 270000 | 340000 | 640000 | 370000 | 420000 | 240000 |
2014-01-03 | 480000 | 430000 | 460000 | 510000 | 610000 | 430000 | 410000 | 640000 | 430000 | 220000 | ... | 610000 | 480000 | 480000 | 720000 | 270000 | 340000 | 640000 | 370000 | 420000 | 240000 |
2014-01-06 | 480000 | 430000 | 460000 | 510000 | 610000 | 430000 | 410000 | 640000 | 430000 | 220000 | ... | 610000 | 480000 | 480000 | 720000 | 270000 | 340000 | 640000 | 370000 | 420000 | 240000 |
2014-01-07 | 480000 | 430000 | 460000 | 510000 | 610000 | 430000 | 410000 | 640000 | 430000 | 220000 | ... | 610000 | 480000 | 480000 | 720000 | 270000 | 340000 | 640000 | 370000 | 420000 | 240000 |
2014-01-08 | 480000 | 430000 | 460000 | 510000 | 610000 | 430000 | 410000 | 640000 | 430000 | 220000 | ... | 610000 | 480000 | 480000 | 720000 | 270000 | 340000 | 640000 | 370000 | 420000 | 240000 |
5 rows × 472 columns
factor_data = alphalens.utils.get_clean_factor_and_forward_returns(factor,
prices,
groupby=sectors.stack(),
quantiles=5,
periods=(1, 5, 10))
factor_data.head()
1 | 5 | 10 | factor | group | factor_quantile | ||
---|---|---|---|---|---|---|---|
date | asset | ||||||
2014-01-02 | 000001.SZ | -0.024530 | -0.033524 | -0.044154 | 1.0563 | 480000 | 1 |
000002.SZ | -0.018773 | -0.066333 | -0.100125 | 1.2891 | 430000 | 2 | |
000008.SZ | -0.005556 | -0.102222 | -0.054444 | 4.8981 | 460000 | 4 | |
000009.SZ | -0.026596 | -0.075532 | -0.057447 | 3.5794 | 510000 | 4 | |
000012.SZ | -0.015951 | -0.093252 | -0.039264 | 2.3725 | 610000 | 3 |
ic_by_sector = alphalens.performance.mean_information_coefficient(factor_data, by_group=True)
ic_by_sector.head()
1 | 5 | 10 | |
---|---|---|---|
group | |||
110000 | -0.025498 | -0.038887 | -0.052598 |
210000 | -0.054027 | -0.094159 | -0.118142 |
220000 | -0.023304 | -0.032256 | -0.036310 |
230000 | -0.053635 | -0.073254 | -0.094852 |
240000 | -0.035358 | -0.049735 | -0.065802 |
from matplotlib import pyplot as plt
alphalens.plotting.plot_ic_by_group(ic_by_sector)
plt.show()