实战之单因子 IC 分析
概述
在此案例中, 我们将结合之前学到的知识, 使用 alphalens 进行单个因子的 IC 分析. 首先, 我们在投资研究中新建一个云端 notebook.
代码实现
导包
# 使用alphalens计算因子的IC分析
# 分析的区间2020-01-01到2021-01-01
import numpy as np
import pandas as pd
import datetime
from alphalens import performance
from alphalens import plotting
from alphalens import tears
from alphalens import utils
1. 准备因子数据
获取因子数据:
# 1.1 Fetch the factor data
# Record the start time so the total run time can be reported later.
starttime = datetime.datetime.now()
# Trading dates inside the analysis window.
date_data = get_trading_dates(start_date="2020-01-01", end_date="2021-01-01")
# The query does not depend on the date, so build it once outside the loop.
q = query(
    fundamentals.income_statement.basic_earnings_per_share
)
# Collect one cross-sectional frame per trading day and concatenate once at
# the end; calling pd.concat inside the loop re-copies all earlier rows on
# every iteration.
daily_frames = []
for date in date_data:
    # Cross-section for this day. The three-axis .iloc implies the platform
    # returns a 3-D container (presumably a pandas Panel) -- TODO confirm.
    fund = get_fundamentals(q, entry_date=date).iloc[:, 0, :]
    # Tag every row with its trading date.
    fund["date"] = date
    daily_frames.append(fund)
all_data = pd.concat(daily_frames) if daily_frames else pd.DataFrame()
# Two-level index (date, original row index), i.e. a MultiIndex DataFrame.
multiindex_df = all_data.set_index(["date", all_data.index])
处理因子数据:
# 1.2 处理因子数据 (去极值, 标准化)
def mad(factor):
    """Clip extreme factor values with the 3x median-absolute-deviation rule.

    Values above ``median + 3 * 1.4826 * MAD`` are replaced by that upper
    bound and values below ``median - 3 * 1.4826 * MAD`` by the lower bound.
    The medians ignore NaN entries, so missing factor values no longer turn
    both bounds into NaN (which silently disabled the clipping); NaN entries
    themselves are passed through unchanged.

    Args:
        factor: numeric array-like supporting element-wise arithmetic
            (e.g. a numpy array or a pandas Series).

    Returns:
        The clipped values as returned by ``np.where`` (a numpy array).
    """
    # Median of the factor values, ignoring NaN.
    center = np.nanmedian(factor)
    # Median of the absolute deviations from that median (the MAD).
    deviation = np.nanmedian(abs(factor - center))
    # Half-width of the accepted band: 3 * 1.4826 * MAD.
    spread = 3 * 1.4826 * deviation
    high = center + spread
    low = center - spread
    # Replace values outside [low, high] with the nearest bound.
    return np.where(factor > high, high, np.where(factor < low, low, factor))
def stand(factor):
    """Standardise a factor to a z-score: ``(factor - mean) / std``.

    Args:
        factor: object exposing ``mean()`` and ``std()`` and supporting
            element-wise arithmetic (e.g. a pandas Series).

    Returns:
        The factor with its mean subtracted and divided by its standard
        deviation. There is no guard for a zero standard deviation; the
        division is performed as-is.
    """
    mean = factor.mean()
    std = factor.std()
    return (factor - mean) / std
# Column holding the raw factor values.
factor_column = "basic_earnings_per_share"
# Clip extremes first and write the result back into the frame, so that the
# standardisation step reads the column from the frame again.
multiindex_df[factor_column] = mad(multiindex_df[factor_column])
multiindex_df[factor_column] = stand(multiindex_df[factor_column])
# Debug output: processed frame and its type.
print(multiindex_df.head())
print(type(multiindex_df))
# Pull the single factor column out of the frame.
singlefactor_series = multiindex_df[factor_column]
# Debug output: the extracted column and its type.
print(singlefactor_series.head())
print(type(singlefactor_series))
输出结果:
basic_earnings_per_share
date
2020-01-02 000004.XSHE -1.15744
000010.XSHE -0.483551
000007.XSHE -0.798318
000005.XSHE -0.527509
000014.XSHE -0.911352
<class 'pandas.core.frame.DataFrame'>
date
2020-01-02 000004.XSHE -1.15744
000010.XSHE -0.483551
000007.XSHE -0.798318
000005.XSHE -0.527509
000014.XSHE -0.911352
Name: basic_earnings_per_share, dtype: object
<class 'pandas.core.series.Series'>
2. 准备价格数据
# 获取所有股票基础信息
# Basic information for all stocks.
stocks = all_instruments("CS")
# Contract codes (order book ids).
stocks_list = stocks["order_book_id"]
# Closing prices over the same window as the factor data.
price = get_price(
    stocks_list,
    start_date="2020-01-01",
    end_date="2021-01-01",
    fields="close",
)
# Debug output: first rows and the container type.
print(price.head())
print(type(price))
输出结果:
300264.XSHE 002218.XSHE 600231.XSHG 603311.XSHG 002405.XSHE \
date
2020-01-02 5.30 3.0405 2.7089 11.1171 16.5243
2020-01-03 5.83 3.0306 2.7089 11.0480 16.5243
2020-01-06 6.41 3.0505 2.6991 11.0875 17.2437
2020-01-07 7.05 3.3585 2.7382 11.1664 17.3336
2020-01-08 6.69 3.4877 2.6795 10.7324 17.4734
600477.XSHG 300682.XSHE 600996.XSHG 300616.XSHE 600918.XSHG \
date
2020-01-02 2.7966 15.3663 8.1212 75.8828 NaN
2020-01-03 2.8356 15.6519 8.1607 75.9522 NaN
2020-01-06 2.8356 15.5656 8.2396 75.6946 NaN
2020-01-07 2.8843 16.3296 8.6738 75.9324 NaN
2020-01-08 2.8258 15.7450 8.3580 74.4169 NaN
... 002214.XSHE 000671.XSHE 300651.XSHE 603466.XSHG \
date ...
2020-01-02 ... 10.9446 8.3247 27.7488 15.00
2020-01-03 ... 11.1140 8.0821 28.1575 15.40
2020-01-06 ... 11.2835 7.8492 27.8784 15.79
2020-01-07 ... 11.2037 7.9851 28.0179 15.78
2020-01-08 ... 10.9645 7.7037 27.3900 15.78
605177.XSHG 603959.XSHG 603055.XSHG 600805.XSHG 002513.XSHE \
date
2020-01-02 NaN 10.32 6.9786 5.11 7.28
2020-01-03 NaN 10.25 6.8697 5.09 7.54
2020-01-06 NaN 10.23 6.8697 5.09 7.40
2020-01-07 NaN 10.24 6.8994 5.15 7.35
2020-01-08 NaN 9.87 6.7608 5.12 7.10
002048.XSHE
date
2020-01-02 15.5660
2020-01-03 15.4868
2020-01-06 16.6460
2020-01-07 17.2207
2020-01-08 17.7459
[5 rows x 4156 columns]
<class 'pandas.core.frame.DataFrame'>
3. 生成通用 Alphalens 结构
# Cast both inputs to float before handing them to alphalens.
factor_values = singlefactor_series.astype(float)
close_prices = price.astype(float)
factor_return = utils.get_clean_factor_and_forward_returns(factor_values, close_prices)
# Debug output
print(factor_return)
输出结果:
Dropped 7.3% entries from factor data: 7.3% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
1D 5D 10D factor \
date asset
2020-01-02 000004.XSHE 0.000896 -0.000448 0.005822 -1.157436
000010.XSHE 0.000000 0.082111 0.090909 -0.483551
000007.XSHE -0.004188 -0.011518 -0.024084 -0.798318
000005.XSHE 0.003185 0.022293 -0.003185 -0.527509
000014.XSHE 0.000000 0.028664 0.005314 -0.911352
000009.XSHE -0.039047 0.054040 -0.036037 -0.387394
000006.XSHE 0.014825 0.012975 -0.001851 0.620882
000008.XSHE 0.010790 0.016172 -0.026962 -0.479626
000001.XSHE 0.018376 -0.004745 -0.032013 1.949897
000021.XSHE 0.100150 0.116459 0.137423 0.040407
000023.XSHE 0.012888 0.006061 -0.016680 0.841847
000027.XSHE 0.004801 -0.006434 -0.022490 0.720179
000002.XSHE -0.015664 -0.011056 -0.062345 1.949897
000031.XSHE 0.001380 -0.019334 -0.026249 1.662126
000032.XSHE 0.007481 0.076190 0.076190 0.137742
000034.XSHE 0.000485 0.048085 0.134045 1.949897
000035.XSHE -0.008183 -0.044190 -0.024550 -0.031416
000036.XSHE -0.006947 -0.020842 -0.030097 0.916418
000038.XSHE 0.007432 -0.009083 -0.018993 -0.670763
000017.XSHE 0.004367 0.010917 -0.034934 -0.691172
000012.XSHE -0.007928 0.003984 -0.035716 0.013719
000025.XSHE -0.007045 0.007981 -0.007045 -0.109519
000040.XSHE 0.021176 0.037647 -0.030588 -0.496503
000011.XSHE -0.002102 -0.011559 -0.030484 0.483122
000016.XSHE 0.000000 0.011215 0.051583 0.041585
000019.XSHE 0.029183 0.035661 0.003239 0.428960
001872.XSHE 0.004669 0.003500 -0.021002 1.949897
000026.XSHE -0.010609 -0.029177 0.063663 0.892477
000042.XSHE -0.012852 -0.027544 -0.049580 1.949897
000048.XSHE 0.003200 0.013256 0.069469 1.949897
... ... ... ... ...
2020-12-17 605118.XSHG -0.013126 -0.107995 -0.096062 0.508633
605123.XSHG -0.004298 0.114470 0.151289 1.949897
605128.XSHG 0.001724 -0.028621 -0.000345 1.949897
605151.XSHG -0.099964 -0.155461 -0.251156 1.949897
605136.XSHG -0.044045 -0.153846 -0.123139 1.187228
605168.XSHG -0.053092 -0.081649 -0.088235 1.949897
605158.XSHG -0.009259 -0.078283 -0.060606 1.819118
605166.XSHG 0.000917 -0.093492 -0.085243 0.249206
605169.XSHG 0.052446 0.048924 -0.011350 1.949897
605177.XSHG -0.032110 -0.116852 -0.142926 1.949897
605178.XSHG -0.005158 -0.075350 -0.088062 1.949897
605183.XSHG -0.037875 -0.124447 -0.165765 0.837923
605188.XSHG -0.040553 -0.096774 -0.081106 0.052967
605198.XSHG -0.029389 -0.131477 -0.124130 0.606361
605218.XSHG -0.017476 -0.099029 -0.073786 0.704088
605199.XSHG -0.024012 -0.053834 -0.065840 0.092214
605255.XSHG -0.016744 -0.073903 -0.070439 0.680931
605266.XSHG -0.048725 -0.159584 -0.185815 1.949897
605288.XSHG -0.011104 -0.033922 -0.154310 1.949897
605299.XSHG 0.100237 0.028414 -0.001579 0.131462
605318.XSHG -0.017789 -0.073698 -0.067344 0.445445
605333.XSHG -0.013514 -0.077396 -0.082310 -0.261016
605336.XSHG -0.018796 -0.091112 -0.096528 1.949897
605338.XSHG 0.005014 -0.118049 -0.139471 1.465887
605358.XSHG -0.019246 -0.186269 -0.112160 0.720179
605366.XSHG -0.003091 -0.060278 -0.057187 0.798675
605376.XSHG 0.100030 0.259530 0.433364 1.622879
605369.XSHG -0.019853 -0.107353 -0.150735 1.949897
605388.XSHG -0.008299 -0.112448 -0.071369 1.426640
605399.XSHG 0.011446 -0.077523 -0.077523 1.387392
factor_quantile
date asset
2020-01-02 000004.XSHE 1
000010.XSHE 2
000007.XSHE 1
000005.XSHE 2
000014.XSHE 1
000009.XSHE 2
000006.XSHE 4
000008.XSHE 2
000001.XSHE 5
000021.XSHE 3
000023.XSHE 4
000027.XSHE 4
000002.XSHE 5
000031.XSHE 5
000032.XSHE 3
000034.XSHE 5
000035.XSHE 3
000036.XSHE 4
000038.XSHE 1
000017.XSHE 1
000012.XSHE 3
000025.XSHE 3
000040.XSHE 2
000011.XSHE 4
000016.XSHE 3
000019.XSHE 3
001872.XSHE 5
000026.XSHE 4
000042.XSHE 5
000048.XSHE 5
... ...
2020-12-17 605118.XSHG 4
605123.XSHG 5
605128.XSHG 5
605151.XSHG 5
605136.XSHG 4
605168.XSHG 5
605158.XSHG 5
605166.XSHG 3
605169.XSHG 5
605177.XSHG 5
605178.XSHG 5
605183.XSHG 4
605188.XSHG 3
605198.XSHG 4
605218.XSHG 4
605199.XSHG 3
605255.XSHG 4
605266.XSHG 5
605288.XSHG 5
605299.XSHG 3
605318.XSHG 3
605333.XSHG 2
605336.XSHG 5
605338.XSHG 4
605358.XSHG 4
605366.XSHG 4
605376.XSHG 5
605369.XSHG 5
605388.XSHG 4
605399.XSHG 4
[882428 rows x 5 columns]
4. 计算因子 IC
# Information coefficient (IC) of the factor; by default one row per day.
IC = performance.factor_information_coefficient(factor_return)
# Debug output
print(IC.head())
# Elapsed time since starttime was recorded. The stray quote that turned the
# expression into an unterminated string has been removed.
endtime = datetime.datetime.now()
print("耗时:", endtime - starttime)
输出结果:
1D 5D 10D
date
2020-01-02 -0.169993 -0.120039 0.123029
2020-01-03 -0.138328 0.060024 0.232300
2020-01-06 0.002180 0.220717 0.279139
2020-01-07 0.014441 0.234961 0.248973
2020-01-08 0.057382 0.263628 0.284560
耗时: 0:02:50.477596
因子 IC 结果分析
时间序列图和移动平均线图
# IC time series plot. Only `plotting` is imported from alphalens, so the
# bare name `alphalens` is not bound; call through `plotting` directly.
plotting.plot_ic_ts(IC)
输出结果:
因子直方图
# IC histogram. Only `plotting` is imported from alphalens, so the bare name
# `alphalens` is not bound; call through `plotting` directly.
plotting.plot_ic_hist(IC)
输出结果:
因子 Q-Q 图
# IC Q-Q plot. Only `plotting` is imported from alphalens, so the bare name
# `alphalens` is not bound; call through `plotting` directly.
plotting.plot_ic_qq(IC)
输出结果:
因子有效性分析表格填充
在筛选因子的时候, 通常会考察某段时间内 IC 的平均值大小, 或者 IC 超过某个阈值的天数占比. 这个阈值可以根据筛选的严格程度自定义: >0.06 意味着筛选严格, >0.02 意味着放松筛选 (可以用在初期对大量因子的海选中).
代码:
# Share of days whose IC exceeds 0.02, using the first column of IC.
a = IC.iloc[:, 0]
above_threshold = a.loc[a > 0.02]
len(above_threshold) / len(a)
输出结果:
0.5236051502145923