|
1 |
| -import akshare as ak |
2 | 1 | import pandas as pd
|
3 |
| -import datetime |
| 2 | +import tushare as ts |
| 3 | +import os |
4 | 4 |
|
5 |
| -# from joblib import Parallel, delayed |
6 | 5 |
|
7 |
| -""" |
8 |
| -akshare的数据并非100%准确!如果有更好的数据源请使用自己的数据 |
9 |
| -不知为何sh000001和sh000002有问题 |
10 |
| -""" |
11 |
| - |
12 |
| - |
13 |
| -def get_stock_data(instruments: list, freq: str = "daily", start: str = "19700101", end: str = "20230731", |
14 |
| - adjust: str = "") -> pd.DataFrame: |
15 |
| - stock_data = pd.DataFrame() |
16 |
| - for i in instruments: |
17 |
| - single_stock = ak.stock_zh_a_hist(symbol=i[0: 6], period=freq, start_date=start, end_date=end, adjust=adjust) |
18 |
| - single_stock["instrument"] = i |
19 |
| - stock_data = pd.concat([stock_data, single_stock], axis=0) |
20 |
| - stock_data.columns = ["datetime", "open", "close", "high", "low", "volume", "amount", "amplitude", |
21 |
| - "pct_chg", "price_chg", "turnover", "instrument"] |
22 |
| - stock_data["datetime"] = pd.to_datetime(stock_data["datetime"]) |
23 |
| - stock_data.set_index(["datetime", "instrument"], inplace=True) |
24 |
| - return stock_data.sort_index() |
25 |
| - |
26 |
| - |
27 |
| -def get_index_stock_cons(index_code='000300', freq="daily", start="20230330", end="20230331", adjust=""): |
def get_adj_hfq(price: pd.Series, pre_close: pd.Series) -> pd.Series:
    """
    Compute the backward-adjustment (hfq) factor for each instrument.

    The element-wise ratio ``price / pre_close`` is accumulated with a
    cumulative product inside every group of index level 1, and each group is
    then divided by its own first value, so every group's factor starts at 1.

    :param price: pd.Series of prices; index level 1 is the grouping key
    :param pre_close: pd.Series of previous closes, aligned with ``price``
    :return: pd.Series of adjustment factors with the same index as the input
    """
    ratio = price / pre_close
    cumulative = ratio.groupby(level=1).cumprod()
    # Use .iloc[0]: a plain x[0] on a non-integer index relies on the
    # deprecated positional fallback of Series.__getitem__.
    return cumulative.groupby(level=1).transform(lambda x: x / x.iloc[0])
50 | 13 |
|
51 | 14 |
|
52 |
| -def upgrade_index_stock_cons(index_code='000300', today=None, adjust=""): |
53 |
| - """ |
54 |
| - 此函数设计的目的是自动更新数据 |
55 |
| - :param index_code: str, 指数代码 |
56 |
| - :param today: str, 今天的日期, %y%m%d格式 |
57 |
| - :param adjust: ""为不复权, “qfq”为前复权, “hfq”为后复权 |
58 |
| - :return: pd.DataFrame |
59 |
| - example: |
60 |
| - data = upgrade_index_stock_cons(today="20230330") |
61 |
| - """ |
62 |
| - if today is None: |
63 |
| - today = datetime.date.today() |
64 |
| - today = today.strftime("%Y%m%d") |
65 |
| - df = get_index_stock_cons(index_code=index_code, freq="daily", start=today, end=today, adjust=adjust) |
66 |
| - return df |
67 |
| - |
def tus_init(tus_token: str = ""):
    """
    Register the given token with tushare and return a pro API client.

    :param tus_token: str, token passed to ``ts.set_token``
    :return: the object returned by ``ts.pro_api()``
    """
    ts.set_token(tus_token)
    return ts.pro_api()
68 | 20 |
|
69 |
| -def get_daily_data(index_code, adjust=""): |
70 |
| - """ |
71 |
| - 获取指数成分股的历史数据(动态股票池, 日频), 支持各种复权 |
72 |
| - 一次性获取所有日期的数据 |
73 |
| -
|
74 |
| - :param index_code: 指数代码, like "sh000300" |
75 |
| - :param adjust: ""为不复权, “qfq”为前复权, “hfq”为后复权 |
76 |
| - :return: pd.DataFrame |
77 |
| - """ |
78 |
| - all_stocks = ak.index_stock_hist(symbol=index_code) |
79 |
| - all_stocks["in_date"] = pd.to_datetime(all_stocks["in_date"]).dt.strftime('%Y%m%d') |
80 |
| - all_stocks["out_date"] = pd.to_datetime(all_stocks["out_date"]).dt.strftime('%Y%m%d') |
81 | 21 |
|
def get_index_cons(pro, index_code: str = "000905.SH", start: str = "20100101", end: str = "20101231",
                   output_folder: str = ""):
    """
    Fetch index constituent weights and write them to ``index_weight.csv``.

    Calls ``pro.index_weight`` for the requested index and date range, indexes
    the result by its ``trade_date`` column (renamed to ``datetime``), sorts it
    by that index and saves it as CSV. Nothing is returned.

    :param pro: object exposing ``index_weight(index_code=, start_date=, end_date=)``
    :param index_code: str, index code forwarded to ``pro.index_weight``
    :param start: str, start date forwarded as ``start_date``
    :param end: str, end date forwarded as ``end_date``
    :param output_folder: str, prefix prepended verbatim to the output file name
    """
    # Fetch the constituent list.
    weights = pd.DataFrame(pro.index_weight(index_code=index_code, start_date=start, end_date=end))
    weights = weights.set_index("trade_date").sort_index()
    weights.index.name = "datetime"
    # NOTE: output_folder is concatenated as-is, so a directory must be passed
    # with its trailing path separator (e.g. "data/").
    weights.to_csv(output_folder + 'index_weight.csv')
| 32 | + |
| 33 | + |
def process_index_cons(folder_path):
    """
    Build a daily constituent list from the CSV files in ``folder_path``.

    Every entry of the folder is read with ``pd.read_csv``. For each file the
    ``con_code`` values sharing the same ``datetime`` are joined into one
    comma-separated string, and the result is reindexed onto a daily calendar
    spanning that file's first and last date. All files are then concatenated,
    sorted by day, forward-filled, and written to ``instrument_list.csv`` in
    the current working directory with the columns ``days`` (index),
    ``datetime`` (integer ``YYYYMMDD``) and ``ts_code``.

    :param folder_path: path of the folder holding the constituent CSV files
    :raises ValueError: if the folder contains no files
    """
    frames = []
    for file in sorted(os.listdir(folder_path)):
        # os.path.join works whether or not folder_path ends with a separator.
        sub_df = pd.read_csv(os.path.join(folder_path, file))
        codes = sub_df.groupby("datetime")["con_code"].apply(lambda x: ','.join(x.astype(str)))
        # Let pandas infer the date layout so both "20100104" and "2010-01-04"
        # style values are accepted.
        days = pd.to_datetime(codes.index.astype(str))
        code_list = pd.DataFrame({"ts_code": codes.to_numpy()}, index=days)
        # Expand to every calendar day; days without data become NaN here and
        # are forward-filled after all files are combined.
        calendar = pd.date_range(start=days.min(), end=days.max(), freq='D')
        frames.append(code_list.reindex(calendar))

    if not frames:
        raise ValueError("no files found in folder: {}".format(folder_path))

    idx_cons = pd.concat(frames, axis=0).sort_index()
    idx_cons.index.name = "days"
    idx_cons.insert(0, "datetime", idx_cons.index.strftime("%Y%m%d").astype(int))
    idx_cons = idx_cons.ffill()
    idx_cons.to_csv("instrument_list.csv")
| 59 | + |
| 60 | + |
def get_stock_data(pro, file_path='instrument_list.csv', adjust_price: bool = False) -> pd.DataFrame:
    """
    Download daily quotes for the instruments listed in ``file_path``.

    The CSV is read and forward-filled, then for each distinct ``datetime``
    value ``pro.daily`` is called once with that row's ``ts_code`` string and
    the date as both ``start_date`` and ``end_date``. The responses are
    combined into one frame indexed by ``(datetime, instrument)``.

    :param pro: object exposing ``daily(ts_code=, start_date=, end_date=)``
    :param file_path: CSV with at least the columns ``datetime`` and ``ts_code``
    :param adjust_price: bool, when True multiply open/close/high/low by the
        factor returned by ``get_adj_hfq(close, pre_close)``
    :return: pd.DataFrame indexed by ``(datetime, instrument)``; empty when no
        quotes were returned
    """
    schedule = pd.read_csv(file_path).ffill()
    # Keep one row per date so each date stays paired with its own ts_code,
    # even when the list contains repeated dates.
    schedule = schedule.drop_duplicates(subset="datetime", keep="first")

    frames = []
    empty_columns = []
    # After reading the code list, fetch the quotes day by day.
    for trade_day, codes in zip(schedule["datetime"], schedule["ts_code"]):
        day = str(trade_day)
        quotes = pd.DataFrame(pro.daily(ts_code=str(codes), start_date=day, end_date=day))  # market data
        if quotes.empty:
            # Empty responses add no rows; remember their columns only.
            empty_columns = list(quotes.columns)
            continue
        quotes["trade_date"] = pd.to_datetime(quotes["trade_date"])
        frames.append(quotes)

    if not frames:
        empty_index = pd.MultiIndex.from_arrays(
            [pd.DatetimeIndex([]), pd.Index([], dtype=object)],
            names=["datetime", "instrument"],
        )
        kept = [c for c in empty_columns if c not in ("trade_date", "ts_code")]
        return pd.DataFrame(index=empty_index, columns=kept)

    # Concatenate and sort once instead of on every iteration.
    instrument_data = pd.concat(frames, axis=0).set_index("trade_date").sort_index(kind="stable")
    instrument_data = instrument_data.set_index("ts_code", append=True)
    instrument_data.index.names = ["datetime", "instrument"]
    if adjust_price:
        adj = get_adj_hfq(instrument_data["close"], instrument_data["pre_close"])
        # FIXME: add volume adjustment
        for column in ("open", "close", "high", "low"):
            instrument_data[column] *= adj
    return instrument_data
0 commit comments