[关闭]
@Channelchan 2018-01-30T18:56:13.000000Z 字数 7851 阅读 24064

多因子组合

  1. 读取本地数据
  2. 数据预处理
  3. 多因子组合方法
  4. 查看因子组合绩效
  5. 交集与并集选股的方法

1_读取本地数据

# --- 1. Load the locally cached JAQS DataView ----------------------------
from jaqs.data.dataapi import DataApi
from jaqs.data import DataView
import numpy as np
from datetime import datetime
import pandas as pd
import warnings
import alphalens
warnings.filterwarnings("ignore")  # silence pandas/statsmodels FutureWarnings
dataview_folder = 'JAQS_Data/hs300'  # folder created by a prior dv.save_dataview
dv = DataView()
dv.load_dataview(dataview_folder)  # prints "Dataview loaded successfully." on success
D:\Anaconda3\lib\site-packages\statsmodels\compat\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools


Dataview loaded successfully.
  1. def mask_index_member():
  2. df_index_member = dv.get_ts('index_member')
  3. mask_index_member = df_index_member ==0 #定义信号过滤条件-非指数成分
  4. return mask_index_member
  5. def limit_up_down():
  6. # 定义可买卖条件——未停牌、未涨跌停
  7. trade_status = dv.get_ts('trade_status')
  8. mask_sus = trade_status == u'停牌'
  9. # 涨停
  10. dv.add_formula('up_limit', '(close - Delay(close, 1)) / Delay(close, 1) > 0.095', is_quarterly=False)
  11. # 跌停
  12. dv.add_formula('down_limit', '(close - Delay(close, 1)) / Delay(close, 1) < -0.095', is_quarterly=False)
  13. can_enter = np.logical_and(dv.get_ts('up_limit') < 1, ~mask_sus) # 未涨停未停牌
  14. can_exit = np.logical_and(dv.get_ts('down_limit') < 1, ~mask_sus) # 未跌停未停牌
  15. return can_enter,can_exit
# Build the filters and price inputs used by every backtest below.
mask = mask_index_member()             # True -> not an index constituent
can_enter, can_exit = limit_up_down()  # tradable / not limit-up / not limit-down
price = dv.get_ts('close_adj')         # adjusted close prices
price_bench = dv.data_benchmark        # benchmark (index) price series
Add formula failed: name [up_limit] exist. Try another name.
Add formula failed: name [down_limit] exist. Try another name.
  1. all_factors = ['pb', 'roe', 'price_div_dps', 'ps_ttm', 'pe_ttm', 'roa']

2_数据预处理

  1. 去极值,标准化,加干扰项
  2. 修改名称索引
  3. 计算IC的相关系数,并剔除高相关性的因子
  1. origin_factors = {f: dv.get_ts(f) for f in all_factors}

1. 去极值,标准化,加干扰项

from jaqs.research.signaldigger import process
# Winsorize, z-score standardize, then add a small disturbance term
# (breaks exact ties so later quantile bucketing is well defined).
factor_dict = {name: process.get_disturbed_factor(process.standardize(process.winsorize(frame)))
               for name, frame in origin_factors.items()}

2. 修改名称索引

  1. def change_columns_index(signal):
  2. new_names = {}
  3. for c in signal.columns:
  4. if c.endswith('SZ'):
  5. new_names[c] = c.replace('SZ', 'XSHE')
  6. elif c.endswith('SH'):
  7. new_names[c] = c.replace('SH', 'XSHG')
  8. signal = signal.rename_axis(new_names, axis=1)
  9. signal.index = pd.Index(map(lambda x: datetime.strptime(str(x),"%Y%m%d") , signal.index))
  10. return signal
  1. price_time = change_columns_index(price)

3. 计算IC的相关系数,并剔除高相关性的因子

def cal_daily_ic(factor_df):
    """Daily mean information coefficient of one factor vs. forward returns."""
    factor_data = alphalens.utils.get_clean_factor_and_forward_returns(factor_df.stack(), price_time, quantiles=5)
    return alphalens.performance.mean_information_coefficient(factor_data, by_time='D')
# One IC frame per factor, keyed by factor name.
daily_ic = {key: cal_daily_ic(change_columns_index(value)) for key, value in factor_dict.items()}
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 1.0;
# this (and the .minor_xs calls below) requires an old pandas release.
daily_panel = pd.Panel(daily_ic)
# Cross-factor correlation matrix of the 5-day-horizon daily ICs.
factor_corr = daily_panel.minor_xs(5).corr()
print(factor_corr)
                     pb    pe_ttm  price_div_dps    ps_ttm       roa       roe
pb             1.000000  0.858428       0.764783  0.913597  0.689696  0.229572
pe_ttm         0.858428  1.000000       0.823935  0.807155  0.376659 -0.169861
price_div_dps  0.764783  0.823935       1.000000  0.699770  0.320101 -0.109434
ps_ttm         0.913597  0.807155       0.699770  1.000000  0.615621  0.244025
roa            0.689696  0.376659       0.320101  0.615621  1.000000  0.789137
roe            0.229572 -0.169861      -0.109434  0.244025  0.789137  1.000000
  1. ic = daily_panel.minor_xs(5).mean()
  1. def compare(corr, targets):
  2. index = list(reversed(targets.index))
  3. length = len(index)
  4. for i in range(length):
  5. name = index[i]
  6. if available(corr, name, index[i+1:]):
  7. yield name
  1. def available(corr, target, compares):
  2. for c in compares:
  3. if corr.loc[target, c] > 0.9 or (corr.loc[target, c] < -0.9 ):
  4. return False
  5. return True
# Show which of the two highest-IC factors survive the correlation filter.
for i in compare(factor_corr, ic.nlargest(2)):
    print(i)
roe
roa
  1. big_dict = {i: factor_dict[i] for i in compare(factor_corr, ic.nlargest(2))}

3_多因子组合方法

  1. combine_factors的equal_weight(等权重加权)
  2. combine_factors的max_IR_props(动态加权_最大化IR)

1. combine_factors的equal_weight(等权重加权)

from jaqs.research import multi_factor
# Equal-weight combination: z-score each factor and average them.
Equal_Portfolio = multi_factor.combine_factors(big_dict,
                                               standardize_type="z_score",
                                               winsorization=False,
                                               weighted_method="equal_weight",
                                               max_IR_props=None)
print(Equal_Portfolio.tail(1))
symbol      000001.SZ  000002.SZ  000008.SZ  000009.SZ  000012.SZ  000021.SZ  \
trade_date                                                                     
20171222    -0.346562  -0.171129  -0.823929  -0.656348   0.081882  -0.025089   

symbol      000024.SZ  000027.SZ  000031.SZ  000039.SZ    ...      601998.SH  \
trade_date                                                ...                  
20171222    -0.411117   -0.70641   -0.22589  -0.646009    ...       -0.36197   

symbol      603000.SH  603160.SH  603288.SH  603699.SH  603799.SH  603833.SH  \
trade_date                                                                     
20171222    -1.052117   3.039816   2.988756  -0.184505   1.917662   2.297176   

symbol      603858.SH  603885.SH  603993.SH  
trade_date                                   
20171222      0.39488   0.967938  -0.389438  

[1 rows x 539 columns]

2. combine_factors的max_IR_props(动态加权_最大化IR)

# Dynamic weighting that maximizes the information ratio (max_IR).
price_bench = dv.data_benchmark
max_IR_props = {
    'price': price,
    'benchmark_price': price_bench,  # None -> absolute return; set -> excess return
    'period': 5,
    'mask': mask,
    'can_enter': can_enter,
    'can_exit': can_exit,
    'forward': True,
    'commission': 0.0008,
    "covariance_type": "simple",  # covariance estimator; "shrink" also supported
    "rollback_period": 30}  # how many past periods of IC feed the weights
Factor_Portfolio = multi_factor.combine_factors(big_dict,
                                                standardize_type="rank",
                                                winsorization=False,
                                                weighted_method="max_IR",
                                                max_IR_props=max_IR_props)
Nan Data Count (should be zero) : 0;  Percentage of effective data: 53%
Nan Data Count (should be zero) : 0;  Percentage of effective data: 53%
  1. print(Factor_Portfolio.tail(1))
symbol      000001.SZ  000002.SZ  000008.SZ  000009.SZ  000012.SZ  000021.SZ  \
trade_date                                                                     
20171222     0.664193   0.671614   0.120594   0.263451     0.6141   0.593692   

symbol      000024.SZ  000027.SZ  000031.SZ  000039.SZ    ...      601998.SH  \
trade_date                                                ...                  
20171222     0.497217   0.243043   0.617811   0.304267    ...       0.649351   

symbol      603000.SH  603160.SH  603288.SH  603699.SH  603799.SH  603833.SH  \
trade_date                                                                     
20171222     0.059369    0.96846    0.96475   0.441558   0.959184   0.942486   

symbol      603858.SH  603885.SH  603993.SH  
trade_date                                   
20171222      0.61039   0.892393   0.419295  

[1 rows x 539 columns]

4_查看因子组合绩效

  1. JAQS绩效
  2. alphalens绩效

1. JAQS绩效

import matplotlib.pyplot as plt
from jaqs.research import SignalDigger
def cal_obj(signal, name, period, quantile):
    """Run SignalDigger on `signal`; writes a PDF report under hs300/<name>."""
    price = dv.get_ts('close_adj')
    obj = SignalDigger(output_folder="hs300/%s" % name,
                       output_format='pdf')
    obj.process_signal_before_analysis(signal,
                                       price=price,
                                       n_quantiles=quantile, period=period,
                                       # benchmark_price=price_bench,
                                       can_enter=can_enter,
                                       can_exit=can_exit,
                                       mask=mask
                                       )
    obj.create_full_report()
    return obj
def plot_pfm(signal, name, period=5, quantile=5):
    """Compute the report and display its matplotlib figures."""
    obj = cal_obj(signal, name, period, quantile)
    obj.fig_objs  # NOTE(review): bare attribute access, no effect; plt.show() does the display
    plt.show()
def signal_data(signal, name, period=5, quantile=5):
    """Return the processed signal DataFrame produced by SignalDigger."""
    obj = cal_obj(signal, name, period, quantile)
    return obj.signal_data
  1. plot_pfm(Factor_Portfolio, 'roa_roe', 5, 5)
Nan Data Count (should be zero) : 0;  Percentage of effective data: 51%


Value of signals of Different Quantiles Statistics
               min       max      mean       std  count    count %
quantile                                                          
1         0.001855  0.818182  0.124605  0.074315  81165  20.142948
2         0.128015  0.935065  0.332423  0.073199  80584  19.998759
3         0.317254  0.942486  0.522951  0.069826  80584  19.998759
4         0.484230  0.953618  0.706823  0.065806  80584  19.998759
5         0.664193  1.000000  0.881344  0.057896  80028  19.860775
Figure saved: C:\Users\small\OneDrive\notebook\Internet_Course\4_Selection\JAQS_Plus\hs300\roa_roe\returns_report.pdf
Information Analysis
                ic
IC Mean      0.010
IC Std.      0.182
t-stat(IC)   2.008
p-value(IC)  0.045
IC Skew     -0.048
IC Kurtosis -0.267
Ann. IR      0.053
Figure saved: C:\Users\small\OneDrive\notebook\Internet_Course\4_Selection\JAQS_Plus\hs300\roa_roe\information_report.pdf



<matplotlib.figure.Figure at 0x24c7449e828>

output_34_2.png-449.6kB

output_34_3.png-191.4kB

2. Alphalens绩效

# Alphalens analysis of the combined factor from 2015-09-01 onward.
factor = change_columns_index(Factor_Portfolio.loc['2015-09-01':]).stack()
factor_data = alphalens.utils.get_clean_factor_and_forward_returns(factor, price_time, quantiles=5)
mean_return_by_q, std_err_by_q = alphalens.performance.mean_return_by_quantile(factor_data, by_date=True)
import matplotlib.pyplot as plt
# Cumulative return per quantile at the 5-day horizon.
alphalens.plotting.plot_cumulative_returns_by_quantile(mean_return_by_q, 5)
plt.show()

output_38_0.png-92.8kB

ic = alphalens.performance.factor_information_coefficient(factor_data)
alphalens.plotting.plot_ic_hist(ic)  # distribution of daily ICs
# Monthly mean IC shown as a heatmap.
mean_monthly_ic = alphalens.performance.mean_information_coefficient(factor_data, by_time='M')
alphalens.plotting.plot_monthly_ic_heatmap(mean_monthly_ic)
plt.show()

output_39_0.png-39.5kB

output_39_1.png-38.9kB

5_交集与并集选股的方法

# Raw ROE and PB frames for the intersection/union stock picks below.
roe_df = dv.get_ts('roe')
pb_df = dv.get_ts('pb')
  1. def largest(row, n=30):
  2. return pd.Series(1, row.nlargest(n).index)
  3. def smallest(row, n=30):
  4. return pd.Series(1, row.nlargest(n).index)
from functools import partial
# Per-date marks: big_roe flags the 10 highest-ROE stocks; small_pb is
# intended to flag the 10 lowest-PB stocks (see `smallest`).
big_roe = roe_df.agg(partial(largest, n=10), axis=1)
small_pb = pb_df.agg(partial(smallest, n=10), axis=1)

PB最低的N只与ROE最高的N只的交集

# Intersection: stocks carrying BOTH marks sum to 2; remap the 2s to 1.
Intersection_data = big_roe + small_pb
Intersection = Intersection_data[Intersection_data == 2].replace(2, 1)

PB最低的N只与ROE最高的N只的并集

# Union: treat missing marks as 0 before summing, then any positive
# total (1 or 2) means the stock is in at least one set.
Union_data = big_roe.replace(np.nan, 0) + small_pb.replace(np.nan, 0)
Union = Union_data.replace(0, np.nan)
Union[Union > 0] = 1
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注