Skip to content

Commit 183c5dc

Browse files
authored
Add files via upload
1 parent 5627893 commit 183c5dc

File tree

4 files changed

+91
-232
lines changed

4 files changed

+91
-232
lines changed

scutquant/alpha.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,7 @@ def market_neutralize(x: pd.Series, long_only: bool = False) -> pd.Series:
3030

3131
def calc_factor_turnover(x: pd.Series) -> pd.Series:
    """
    Turnover of the market-neutralized factor, aggregated over index level 0.

    The factor is neutralized with ``market_neutralize(x, long_only=False)``; each value is then
    compared with its one-period-delayed value from ``ts_delay`` (a missing delayed value is
    treated as 0) and the absolute changes are summed per level-0 key.

    :param x: factor values (presumably indexed by (datetime, instrument) -- confirm with caller)
    :return: pd.Series with one summed absolute change per level-0 key
    """
    neutral = market_neutralize(x, long_only=False)
    previous = ts_delay(neutral, 1).fillna(0)
    change = (neutral - previous).abs()
    return change.groupby(level=0).sum()
3635

3736

@@ -97,7 +96,7 @@ def get_factor_metrics(factor: pd.Series, label: pd.Series, metrics=None, handle
9796
if "ir" in metrics: # information ratio
9897
result["ir"] = result["excess_return"].mean() / result["return"].std()
9998
if "fitness" in metrics:
100-
result["fitness"] = calc_fitness(result["sharpe"], result["return"].values[-1] - 1, result["turnover"].mean())
99+
result["fitness"] = calc_fitness(result["sharpe"], result["return"].mean() - 1, result["turnover"].mean())
101100
return result
102101

103102

@@ -1075,10 +1074,10 @@ def call(self):
10751074
volume_rank = cs_rank(self.data["volume"])
10761075
rank_ratio = volume_rank / c_rank
10771076
if isinstance(self.periods, int):
1078-
self.result = ts_decay_linear(-ts_rank(self.data["close"], self.periods) * rank_ratio, 15)
1077+
self.result = ts_decay(-ts_rank(self.data["close"], self.periods) * rank_ratio, 15)
10791078
else:
10801079
for d in self.periods:
1081-
self.result["wq1_" + str(d)] = ts_decay_linear(-ts_rank(self.data["close"], d) * rank_ratio, 15)
1080+
self.result["wq1_" + str(d)] = ts_decay(-ts_rank(self.data["close"], d) * rank_ratio, 15)
10821081

10831082
def normalize(self):
10841083
if self.norm_method == "zscore":
@@ -1115,7 +1114,7 @@ def __init__(self, data: pd.DataFrame, periods: list[int] | int, normalize: str
11151114

11161115
def call(self):
11171116
self.data["returns"] = ts_returns(self.data["close"], 1)
1118-
self.data["cs_mean"] = cs_mean(self.data["returns"]) * ts_decay_linear(self.data["close"], 15)
1117+
self.data["cs_mean"] = cs_mean(self.data["returns"]) * ts_decay(self.data["close"], 15)
11191118
if isinstance(self.periods, int):
11201119
self.result = cs_rank(ts_corr(self.data["returns"], self.data["cs_mean"], self.periods))
11211120
else:

scutquant/data.py

Lines changed: 78 additions & 206 deletions
Original file line numberDiff line numberDiff line change
@@ -1,216 +1,88 @@
1-
import akshare as ak
21
import pandas as pd
3-
import datetime
2+
import tushare as ts
3+
import os
44

5-
# from joblib import Parallel, delayed
65

7-
"""
8-
akshare的数据并非100%准确!如果有更好的数据源请使用自己的数据
9-
不知为何sh000001和sh000002有问题
10-
"""
11-
12-
13-
def get_stock_data(instruments: list, freq: str = "daily", start: str = "19700101", end: str = "20230731",
14-
adjust: str = "") -> pd.DataFrame:
15-
stock_data = pd.DataFrame()
16-
for i in instruments:
17-
single_stock = ak.stock_zh_a_hist(symbol=i[0: 6], period=freq, start_date=start, end_date=end, adjust=adjust)
18-
single_stock["instrument"] = i
19-
stock_data = pd.concat([stock_data, single_stock], axis=0)
20-
stock_data.columns = ["datetime", "open", "close", "high", "low", "volume", "amount", "amplitude",
21-
"pct_chg", "price_chg", "turnover", "instrument"]
22-
stock_data["datetime"] = pd.to_datetime(stock_data["datetime"])
23-
stock_data.set_index(["datetime", "instrument"], inplace=True)
24-
return stock_data.sort_index()
25-
26-
27-
def get_index_stock_cons(index_code='000300', freq="daily", start="20230330", end="20230331", adjust=""):
6+
def get_adj_hfq(price: pd.Series, pre_close: pd.Series) -> pd.Series:
    """
    Compute the backward (hfq) price-adjustment factor.

    For every group of index level 1 the ratio ``price / pre_close`` is cumulated with a running
    product and then rescaled by the group's first cumulated value, so each group's first factor
    is 1.

    :param price: price series with (at least) a two-level index; groups are taken on level 1
    :param pre_close: previous close, indexed like ``price``
    :return: pd.Series of adjustment factors, indexed like ``price``
    """
    ratio = price / pre_close
    cumulative = ratio.groupby(level=1).cumprod()
    # .iloc[0] is explicitly positional; a bare x[0] on a non-integer index relies on a
    # deprecated positional fallback in pandas.
    return cumulative.groupby(level=1).transform(lambda x: x / x.iloc[0])
5013

5114

52-
def upgrade_index_stock_cons(index_code='000300', today=None, adjust=""):
53-
"""
54-
此函数设计的目的是自动更新数据
55-
:param index_code: str, 指数代码
56-
:param today: str, 今天的日期, %y%m%d格式
57-
:param adjust: ""为不复权, “qfq”为前复权, “hfq”为后复权
58-
:return: pd.DataFrame
59-
example:
60-
data = upgrade_index_stock_cons(today="20230330")
61-
"""
62-
if today is None:
63-
today = datetime.date.today()
64-
today = today.strftime("%Y%m%d")
65-
df = get_index_stock_cons(index_code=index_code, freq="daily", start=today, end=today, adjust=adjust)
66-
return df
67-
15+
def tus_init(tus_token: str = ""):
    """
    Create a tushare pro API client.

    :param tus_token: tushare API token. When empty, no token is registered and ``ts.pro_api()``
        is called as-is (presumably falling back to a previously saved token -- confirm against
        the tushare documentation).
    :return: the object returned by ``ts.pro_api()``
    """
    # Only register a token that was actually supplied; passing the empty default to
    # set_token would register an empty token.
    if tus_token:
        ts.set_token(tus_token)
    return ts.pro_api()
6820

69-
def get_daily_data(index_code, adjust=""):
70-
"""
71-
获取指数成分股的历史数据(动态股票池, 日频), 支持各种复权
72-
一次性获取所有日期的数据
73-
74-
:param index_code: 指数代码, like "sh000300"
75-
:param adjust: ""为不复权, “qfq”为前复权, “hfq”为后复权
76-
:return: pd.DataFrame
77-
"""
78-
all_stocks = ak.index_stock_hist(symbol=index_code)
79-
all_stocks["in_date"] = pd.to_datetime(all_stocks["in_date"]).dt.strftime('%Y%m%d')
80-
all_stocks["out_date"] = pd.to_datetime(all_stocks["out_date"]).dt.strftime('%Y%m%d')
8121

22+
def get_index_cons(pro, index_code: str = "000905.SH", start: str = "20100101", end: str = "20101231",
                   output_folder: str = ""):
    """
    Download index constituent weights and save them as ``index_weight.csv``.

    :param pro: client object exposing ``index_weight(index_code=..., start_date=..., end_date=...)``
        (e.g. the object returned by ``tus_init``)
    :param index_code: index code passed to ``pro.index_weight``
    :param start: start date passed as ``start_date``
    :param end: end date passed as ``end_date``
    :param output_folder: folder the CSV is written to; "" writes to the current directory
    :return: None, the result is only written to disk
    """
    df = pd.DataFrame(pro.index_weight(index_code=index_code, start_date=start, end_date=end))  # constituent list
    df.set_index(['trade_date'], inplace=True)
    df.index.names = ['datetime']
    df = df.sort_index()
    # os.path.join works whether or not output_folder ends with a path separator.
    df.to_csv(os.path.join(output_folder, 'index_weight.csv'))
32+
33+
34+
def process_index_cons(folder_path):
    """
    Merge the index-weight CSV files of a folder into one daily constituent list.

    Every entry of ``folder_path`` is read as a CSV that must contain the columns ``datetime``
    and ``con_code``. For each file the constituent codes of a date are joined into one
    comma-separated string, the dates are expanded to a daily calendar between the file's first
    and last date, and gaps are forward-filled. The result is written to ``instrument_list.csv``
    in the current working directory with the columns ``days``, ``datetime`` (YYYYMMDD as int)
    and ``ts_code``.

    :param folder_path: folder holding the index-weight CSV files
    :raises ValueError: if the folder contains no files
    """
    frames = []
    for file in sorted(os.listdir(folder_path)):
        # os.path.join works whether or not folder_path ends with a path separator.
        sub_df = pd.read_csv(os.path.join(folder_path, file))
        sub_df.set_index("datetime", inplace=True)
        codes = sub_df["con_code"].groupby(level=0).apply(lambda x: ','.join(x.astype(str)))
        code_list = pd.DataFrame({"ts_code": codes})
        # Let pandas infer the date format so both YYYYMMDD and YYYY-MM-DD inputs parse.
        code_list["days"] = pd.to_datetime(code_list.index.astype(str))
        code_list.reset_index(inplace=True)
        code_list.set_index("days", inplace=True)
        new_index = pd.date_range(start=code_list.index.min(), end=code_list.index.max(), freq='D')
        frames.append(code_list.reindex(new_index))

    if not frames:
        raise ValueError(f"no index constituent files found in {folder_path!r}")

    idx_cons = pd.concat(frames, axis=0).sort_index()
    idx_cons.index.name = "days"
    idx_cons["datetime"] = idx_cons.index.strftime("%Y%m%d").astype(int)
    idx_cons = idx_cons.ffill()
    idx_cons.to_csv("instrument_list.csv")
59+
60+
61+
def get_stock_data(pro, file_path='instrument_list.csv', adjust_price: bool = False) -> pd.DataFrame:
    """
    Download daily bars for the instruments listed in a constituent file.

    The CSV at ``file_path`` must contain the columns ``datetime`` and ``ts_code``. For each
    date, ``pro.daily`` is called once with that row's ``ts_code`` string and the date as both
    ``start_date`` and ``end_date``. Missing cells are forward-filled first; if a date occurs
    more than once only its first row is used.

    :param pro: client object exposing ``daily(ts_code=..., start_date=..., end_date=...)``
    :param file_path: path of the constituent CSV (as written by ``process_index_cons``)
    :param adjust_price: if True, multiply open/close/high/low by the factor from ``get_adj_hfq``
    :return: pd.DataFrame indexed by (datetime, instrument), sorted by index
    """
    listing = pd.read_csv(file_path).ffill()
    # Take date and codes from the same row; indexing ts_code by the position of the unique
    # dates misaligns them as soon as a date is repeated.
    listing = listing.drop_duplicates(subset="datetime", keep="first")

    frames = []
    for day, codes in zip(listing["datetime"].astype(str), listing["ts_code"].astype(str)):
        daily = pd.DataFrame(pro.daily(ts_code=codes, start_date=day, end_date=day))  # market data
        daily["trade_date"] = pd.to_datetime(daily["trade_date"])
        frames.append(daily)

    if not frames:
        empty_index = pd.MultiIndex.from_arrays([[], []], names=["datetime", "instrument"])
        return pd.DataFrame(index=empty_index)

    # Concatenate and sort once instead of on every iteration.
    instrument_data = pd.concat(frames, axis=0)
    instrument_data.set_index(["trade_date", "ts_code"], inplace=True)
    instrument_data.index.names = ["datetime", "instrument"]
    instrument_data = instrument_data.sort_index()

    if adjust_price and not instrument_data.empty:
        adj = get_adj_hfq(instrument_data["close"], instrument_data["pre_close"])
        # TODO: also adjust volume
        for p in ("open", "close", "high", "low"):
            instrument_data[p] *= adj
    return instrument_data

scutquant/operators.py

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -370,29 +370,20 @@ def ts_neg_count(data: pd.Series, n_period: int) -> pd.Series:
370370
return data_copy.groupby(level=1).transform(lambda x: x.rolling(n_period).sum())
371371

372372

373-
def linear_decay(x: pd.Series, window: int) -> pd.Series:
374-
"""
375-
Applies linear decay to a time series.
376-
377-
:param x: The time series to apply linear decay to.
378-
:type x: pd.Series
379-
:param window: The window size for the linear decay.
380-
:type window: int
381-
:return: The time series with linear decay applied.
382-
:rtype: pd.Series
383-
"""
384-
weights = [np.exp(-1 / window * (window - t)) for t in range(window)]
385-
return x.rolling(window).apply(lambda y: sum(y * weights) / sum(weights), raw=True)
373+
def decay_n(x: pd.Series, n: int) -> pd.Series:
    """
    Linearly weighted rolling average over the last ``n`` observations.

    The weights are ``1, 2, ..., n`` divided by their sum and are applied in window order, so
    the last (most recent) value of each window gets the largest weight. The first ``n - 1``
    results are NaN because the rolling window is not yet full.

    :param x: input series
    :param n: window length, must be >= 1
    :return: pd.Series indexed like ``x``
    :raises ValueError: if ``n`` is smaller than 1
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")
    arr = np.arange(1, n + 1)
    weights = arr / arr.sum()
    return x.rolling(n).apply(lambda y: np.dot(y, weights), raw=True)
386377

387378

388-
def ts_decay_linear(data: pd.Series | pd.core.groupby.SeriesGroupBy, n_period: int) -> pd.Series:
379+
def ts_decay(data: pd.Series | pd.core.groupby.SeriesGroupBy, n_period: int) -> pd.Series:
    """
    Apply ``decay_n`` with window ``n_period`` to every group of ``data``.

    A plain Series is grouped on index level 1 first; any other input is treated as an already
    grouped object, transformed directly, and the result's index levels are renamed to
    ("datetime", "instrument").
    """
    def _decay(group):
        return decay_n(group, n_period)

    if not isinstance(data, pd.Series):
        res: pd.Series = data.transform(_decay)
        res.index.names = ["datetime", "instrument"]
        return res
    return data.groupby(level=1).transform(_decay)
398389

@@ -599,9 +590,6 @@ def inf_mask(data: pd.Series) -> pd.Series:
599590

600591

601592
def get_resid(x: pd.Series, y: pd.Series) -> pd.Series:
602-
"""
603-
经过百万级的数据的上千次实验, 发现此方法比调用sklearn.linear_model的LinearRegression平均快一倍
604-
"""
605593
cov = x.cov(y)
606594
var = x.var()
607595
beta = cov / var

scutquant/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
akshare>=1.9.59
1+
tushare>=1.2.8
22
pandas>=1.5.3
33
joblib>=1.2.0
44
scipy>=1.10.0

0 commit comments

Comments
 (0)