-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhigh_low_analyzer_v2.py
More file actions
672 lines (535 loc) · 24.5 KB
/
high_low_analyzer_v2.py
File metadata and controls
672 lines (535 loc) · 24.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
新高新低分析器 V2 - 核心算法模块
采用滚动窗口算法和增量更新策略,大幅提升计算性能:
1. 批量数据获取和处理
2. 并行计算支持
3. 智能增量更新
4. 断点续传机制
5. 数据验证和修复
核心优化:
- 避免重复API调用
- 维护滚动最高最低价缓存
- 支持多线程并行计算
- 智能错误处理和重试
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import tushare as ts
from typing import List, Dict, Optional, Tuple
import threading
from collections import defaultdict
# 导入新的数据库工具
from db_utils_v2 import (
create_tables_v2, batch_insert_stock_data, get_stock_price_range,
calculate_rolling_high_low, save_rolling_stats, get_market_summary,
save_market_summary, get_latest_trade_date_in_price_data,
get_active_stocks, update_system_status, get_system_status,
log_processing, get_historical_summary, cleanup_old_data
)
# 导入原有的tushare工具
# Module-level logger. Bound before the fallback helper below so that the
# helper never references a name that has not been created yet.
logger = logging.getLogger('high_low_analyzer_v2')

# Prefer the project's shared retry helper; fall back to a minimal local
# implementation when ``tushare_utils`` cannot be imported.
try:
    from tushare_utils import get_data_with_retry
except ImportError:
    def get_data_with_retry(func, max_tries=3, **kwargs):
        """Call ``func(**kwargs)`` and retry when it raises.

        Args:
            func: Callable to invoke; it receives ``kwargs`` unchanged.
            max_tries: Maximum number of attempts.
            **kwargs: Keyword arguments forwarded to ``func``.

        Returns:
            Whatever ``func`` returns on the first successful attempt, or an
            empty ``pandas.DataFrame`` when every attempt raised (or when
            ``max_tries`` is not positive).
        """
        for attempt in range(max_tries):
            try:
                return func(**kwargs)
            except Exception as e:
                if attempt < max_tries - 1:
                    # Linearly growing pause before the next attempt: 2s, 4s, ...
                    time.sleep((attempt + 1) * 2)
                    continue
                logger.error(f"API调用失败: {e}")
        # All attempts failed (or none were made): hand back an empty frame so
        # callers can uniformly test ``.empty``.
        return pd.DataFrame()
class HighLowAnalyzerV2:
    """New-high / new-low analyzer (V2).

    Fetches daily prices through the Tushare ``pro`` client, stores them via
    the ``db_utils_v2`` helpers and produces '52w' and '26w' rolling
    statistics and market summaries.
    """

    def __init__(self, token: str):
        """Create an analyzer.

        Args:
            token: Tushare API token.
        """
        self.token = token
        self.pro = ts.pro_api(token)
        self.lock = threading.Lock()
        # Performance counters, exposed through get_performance_stats()
        self.stats = {
            'api_calls': 0,
            'cache_hits': 0,
            'processing_time': 0,
            'errors': 0
        }
        # Make sure the database tables exist
        create_tables_v2()

    @staticmethod
    def _to_float(value) -> float:
        """Convert ``value`` to ``float``; missing values (None/NaN) become 0.0."""
        return float(value) if pd.notna(value) else 0.0

    @classmethod
    def _price_records(cls, df: pd.DataFrame) -> List[Dict]:
        """Turn a daily-price frame into the list of dicts passed to
        ``batch_insert_stock_data``.

        ``vol`` and ``amount`` are optional columns and default to 0.
        """
        return [
            {
                'ts_code': row['ts_code'],
                'trade_date': str(row['trade_date']),
                'open': cls._to_float(row['open']),
                'high': cls._to_float(row['high']),
                'low': cls._to_float(row['low']),
                'close': cls._to_float(row['close']),
                'vol': cls._to_float(row.get('vol', 0)),
                'amount': cls._to_float(row.get('amount', 0))
            }
            for row in df.to_dict('records')
        ]

    @staticmethod
    def _preload_window_days(trade_dates: List[str], padding: int = 5) -> int:
        """Return the number of calendar days needed to cover ``trade_dates``.

        ``_preload_price_data`` subtracts *calendar* days from the end date,
        so the window must be the inclusive calendar span between the first
        and last trading day (plus ``padding``), not the count of trading
        days, which is smaller whenever weekends or holidays fall in between.

        Args:
            trade_dates: Non-empty, ascending list of 'YYYYMMDD' strings.
            padding: Extra days added as a safety margin.
        """
        first = datetime.strptime(trade_dates[0], '%Y%m%d')
        last = datetime.strptime(trade_dates[-1], '%Y%m%d')
        return (last - first).days + 1 + padding

    def get_trade_calendar(self, start_date: str, end_date: str) -> List[str]:
        """Fetch the open trading days of the SSE calendar.

        Args:
            start_date: Start date, 'YYYYMMDD'.
            end_date: End date, 'YYYYMMDD'.

        Returns:
            Sorted list of ``cal_date`` values; empty on failure or when the
            API returned nothing.
        """
        try:
            self.stats['api_calls'] += 1
            df = get_data_with_retry(
                self.pro.trade_cal,
                exchange='SSE',
                start_date=start_date,
                end_date=end_date,
                is_open='1'
            )
            if df.empty:
                return []
            return sorted(df['cal_date'].tolist())
        except Exception as e:
            logger.error(f"获取交易日历失败: {e}")
            return []

    def get_all_stocks(self) -> pd.DataFrame:
        """Fetch the list of currently listed stocks.

        Returns:
            DataFrame with ts_code, name, industry, market and list_date,
            restricted to codes ending in '.SH' or '.SZ'; an empty DataFrame
            on failure.
        """
        try:
            self.stats['api_calls'] += 1
            stocks = get_data_with_retry(
                self.pro.stock_basic,
                exchange='',
                list_status='L',
                fields='ts_code,name,industry,market,list_date'
            )
            if not stocks.empty:
                # Keep only codes with a .SH or .SZ suffix; every other
                # suffix is dropped.
                stocks = stocks[
                    (stocks['ts_code'].str.endswith('.SH')) |
                    (stocks['ts_code'].str.endswith('.SZ'))
                ]
            logger.info(f"获取到 {len(stocks)} 只股票信息")
            return stocks
        except Exception as e:
            logger.error(f"获取股票列表失败: {e}")
            self.stats['errors'] += 1
            return pd.DataFrame()

    def batch_fetch_daily_data(self, stock_list: List[str], trade_date: str,
                               batch_size: int = 100) -> Dict[str, pd.DataFrame]:
        """Fetch one trading day's bars and split them per stock.

        The request is keyed only by ``trade_date`` and does not depend on
        the batch, so it is issued once and the result is then filtered
        batch by batch (previously the identical request was repeated for
        every batch).

        Args:
            stock_list: Stock codes to keep.
            trade_date: Trading date, 'YYYYMMDD'.
            batch_size: Number of codes filtered (and logged) per step.

        Returns:
            Mapping of ts_code to that stock's rows; stocks without rows are
            absent. Empty on failure.
        """
        stock_data: Dict[str, pd.DataFrame] = {}
        if not stock_list:
            return stock_data
        total = len(stock_list)
        batch_no = 1
        try:
            self.stats['api_calls'] += 1
            df_day = get_data_with_retry(
                self.pro.daily,
                trade_date=trade_date
            )
            if df_day.empty:
                return stock_data
            for i in range(0, total, batch_size):
                batch_no = i // batch_size + 1
                batch_stocks = stock_list[i:i+batch_size]
                # Keep only the rows belonging to the current batch
                df_filtered = df_day[df_day['ts_code'].isin(batch_stocks)]
                for ts_code, group in df_filtered.groupby('ts_code'):
                    stock_data[ts_code] = group
                logger.info(f"批量获取完成: {i+1}-{min(i+batch_size, len(stock_list))}/{len(stock_list)}")
        except Exception as e:
            logger.error(f"批量获取数据失败 (批次 {batch_no}): {e}")
            self.stats['errors'] += 1
        return stock_data

    def parallel_calculate_rolling_stats(self, stock_list: List[str],
                                         trade_date: str,
                                         max_workers: int = 4) -> List[Dict]:
        """Compute rolling high/low statistics for many stocks in a thread pool.

        Args:
            stock_list: Stock codes to process.
            trade_date: Trading date, 'YYYYMMDD'.
            max_workers: Size of the thread pool.

        Returns:
            Flat list of the truthy results returned by
            ``calculate_rolling_high_low`` for the '52w' and '26w' periods.
        """
        all_results = []

        def calculate_stock_stats(ts_code: str) -> List[Dict]:
            """Compute both periods for one stock; failures are logged at debug level."""
            results = []
            try:
                for period_type in ('52w', '26w'):
                    period_stats = calculate_rolling_high_low(ts_code, trade_date, period_type)
                    if period_stats:
                        results.append(period_stats)
            except Exception as e:
                logger.debug(f"计算股票 {ts_code} 统计数据失败: {e}")
            return results

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_stock = {
                executor.submit(calculate_stock_stats, ts_code): ts_code
                for ts_code in stock_list
            }
            completed = 0
            for future in as_completed(future_to_stock):
                try:
                    results = future.result()
                    all_results.extend(results)
                    completed += 1
                    if completed % 100 == 0:
                        logger.info(f"并行计算进度: {completed}/{len(stock_list)}")
                except Exception as e:
                    ts_code = future_to_stock[future]
                    logger.error(f"并行计算股票 {ts_code} 失败: {e}")
                    self.stats['errors'] += 1
        return all_results

    def initial_data_load(self, end_date: str, days: int = 90,
                          rebuild: bool = False) -> bool:
        """Run the initial (full) load.

        Args:
            end_date: Last date to load, 'YYYYMMDD'.
            days: Number of most recent trading days to process.
            rebuild: Accepted for interface compatibility; not read by this
                method.

        Returns:
            True on success, False when the load aborted with an error.
            A failure on an individual trading day is logged and skipped.
        """
        start_time = time.time()
        operation = 'initial_load'
        logger.info(f"开始初始数据加载: {days} 天数据,结束日期 {end_date}")
        log_processing(end_date, operation, 'started', f'加载 {days} 天数据')
        try:
            # Stock universe
            all_stocks = self.get_all_stocks()
            if all_stocks.empty:
                raise Exception("无法获取股票列表")
            stock_codes = all_stocks['ts_code'].tolist()
            logger.info(f"共需处理 {len(stock_codes)} 只股票")

            # Trading calendar; the range is widened by 30 calendar days
            date_obj = datetime.strptime(end_date, '%Y%m%d')
            start_date = (date_obj - timedelta(days=days+30)).strftime('%Y%m%d')
            trade_dates = self.get_trade_calendar(start_date, end_date)
            if not trade_dates:
                raise Exception("无法获取交易日历")

            # Only the most recent `days` trading days are processed
            recent_dates = trade_dates[-days:] if len(trade_dates) > days else trade_dates
            logger.info(f"需要处理 {len(recent_dates)} 个交易日")

            # Preload price history into the database.
            # NOTE(review): the window is days*2 calendar days; whether that is
            # enough history for the '52w' statistic depends on
            # calculate_rolling_high_low -- confirm.
            self._preload_price_data(stock_codes, recent_dates[-1], days*2)

            # Process the trading days one by one
            total_dates = len(recent_dates)
            for i, trade_date in enumerate(recent_dates, 1):
                logger.info(f"处理交易日 {trade_date} ({i}/{total_dates})")
                try:
                    self._process_single_date(trade_date, stock_codes)
                    # Progress / ETA
                    elapsed = time.time() - start_time
                    avg_time = elapsed / i
                    remaining_time = avg_time * (total_dates - i)
                    logger.info(f"进度: {i}/{total_dates} ({i/total_dates*100:.1f}%), "
                                f"已用时: {elapsed/60:.1f}分钟, "
                                f"预计剩余: {remaining_time/60:.1f}分钟")
                except Exception as e:
                    logger.error(f"处理交易日 {trade_date} 失败: {e}")
                    continue

            # Record completion
            update_system_status('last_full_update', end_date)
            update_system_status('last_update', end_date)
            elapsed_time = time.time() - start_time
            self.stats['processing_time'] += elapsed_time
            message = f'成功加载 {len(recent_dates)} 个交易日数据,耗时 {elapsed_time:.1f} 秒'
            log_processing(end_date, operation, 'completed', message, elapsed_time)
            logger.info(f"初始数据加载完成: {message}")
            return True
        except Exception as e:
            elapsed_time = time.time() - start_time
            error_msg = f"初始数据加载失败: {e}"
            log_processing(end_date, operation, 'failed', error_msg, elapsed_time)
            logger.error(error_msg)
            return False

    def _preload_price_data(self, stock_codes: List[str], end_date: str, days: int):
        """Download price history and insert it into the database.

        Args:
            stock_codes: Stock codes to load.
            end_date: Last date to load, 'YYYYMMDD'.
            days: Number of calendar days before ``end_date`` to include.
        """
        logger.info("开始预加载价格数据...")
        # Make sure a stock-info row exists for every code
        logger.info("首先确保股票基本信息存在...")
        stock_info_list = [
            {
                'ts_code': ts_code,
                'name': ts_code,  # placeholder: the code doubles as the name
                'industry': 'Unknown',
                'market': 'Unknown',
                'list_date': '19900101'
            }
            for ts_code in stock_codes
        ]
        from db_utils_v2 import batch_insert_stock_info
        batch_insert_stock_info(stock_info_list)

        date_obj = datetime.strptime(end_date, '%Y%m%d')
        start_date = (date_obj - timedelta(days=days)).strftime('%Y%m%d')
        logger.info(f"预加载日期范围: {start_date} 到 {end_date}")

        batch_size = 50
        total_batches = (len(stock_codes) + batch_size - 1) // batch_size
        total_inserted = 0
        for i in range(0, len(stock_codes), batch_size):
            batch_codes = stock_codes[i:i+batch_size]
            batch_num = i // batch_size + 1
            logger.info(f"预加载批次 {batch_num}/{total_batches} ({len(batch_codes)} 只股票)")

            # One API request per stock; rows are accumulated per batch
            stock_data_list = []
            successful_stocks = 0
            for ts_code in batch_codes:
                try:
                    self.stats['api_calls'] += 1
                    df = get_data_with_retry(
                        self.pro.daily,
                        ts_code=ts_code,
                        start_date=start_date,
                        end_date=end_date
                    )
                    if not df.empty:
                        successful_stocks += 1
                        stock_data_list.extend(self._price_records(df))
                    else:
                        logger.debug(f"股票 {ts_code} 无数据")
                    # Throttle API usage
                    time.sleep(0.05)
                except Exception as e:
                    logger.warning(f"获取股票 {ts_code} 数据失败: {e}")
                    self.stats['errors'] += 1
                    continue

            # One bulk insert per batch
            if stock_data_list:
                inserted_count = batch_insert_stock_data(stock_data_list)
                total_inserted += inserted_count
                logger.info(f"批次 {batch_num} 完成: 获取 {successful_stocks} 只股票,插入 {inserted_count} 条记录")
            else:
                logger.warning(f"批次 {batch_num} 没有获取到任何数据")
        logger.info(f"预加载完成: 总共插入 {total_inserted} 条价格记录")

    def _process_single_date(self, trade_date: str, stock_codes: List[str]):
        """Compute and persist rolling stats and market summaries for one day.

        Args:
            trade_date: Trading date, 'YYYYMMDD'.
            stock_codes: Stock codes to process.
        """
        date_start_time = time.time()
        # Rolling statistics, computed in parallel
        rolling_stats = self.parallel_calculate_rolling_stats(stock_codes, trade_date)
        if rolling_stats:
            save_rolling_stats(rolling_stats)
        # Market summary per period
        for period_type in ['52w', '26w']:
            summary = get_market_summary(trade_date, period_type)
            if summary:
                calculation_time = time.time() - date_start_time
                save_market_summary(summary, calculation_time)
                logger.info(f"{trade_date} {period_type}: "
                            f"新高 {summary['new_high_count']} 只, "
                            f"新低 {summary['new_low_count']} 只, "
                            f"净新高 {summary['net_high_count']} 只")

    def incremental_update(self, end_date: str) -> bool:
        """Process every trading day after the stored 'last_update' date.

        Args:
            end_date: Last date to update, 'YYYYMMDD'.

        Returns:
            True when the data is up to date afterwards (including when there
            was nothing to do); False when no previous update is recorded or
            the update aborted with an error.
        """
        start_time = time.time()
        operation = 'incremental_update'
        logger.info(f"开始增量更新至 {end_date}")
        log_processing(end_date, operation, 'started')
        try:
            last_update = get_system_status('last_update')
            if not last_update:
                logger.warning("没有找到上次更新记录,建议执行初始加载")
                return False

            # Range to update starts the day after the last update
            last_date_obj = datetime.strptime(last_update, '%Y%m%d')
            start_date = (last_date_obj + timedelta(days=1)).strftime('%Y%m%d')
            # 'YYYYMMDD' strings order the same way as the dates they encode
            if start_date > end_date:
                logger.info("数据已是最新,无需更新")
                return True

            trade_dates = self.get_trade_calendar(start_date, end_date)
            if not trade_dates:
                logger.info("没有需要更新的交易日")
                return True
            logger.info(f"需要更新 {len(trade_dates)} 个交易日: {trade_dates[0]} 到 {trade_dates[-1]}")

            # Active stocks from the database, else the full list from the API
            stock_codes = get_active_stocks()
            if not stock_codes:
                all_stocks = self.get_all_stocks()
                if all_stocks.empty:
                    raise Exception("无法获取股票列表")
                stock_codes = all_stocks['ts_code'].tolist()
            logger.info(f"需要更新 {len(stock_codes)} 只股票")

            # Preload prices for the whole calendar span of the new dates
            self._preload_price_data(
                stock_codes,
                trade_dates[-1],
                self._preload_window_days(trade_dates)
            )

            for i, trade_date in enumerate(trade_dates, 1):
                logger.info(f"更新交易日 {trade_date} ({i}/{len(trade_dates)})")
                try:
                    self._process_single_date(trade_date, stock_codes)
                except Exception as e:
                    logger.error(f"更新交易日 {trade_date} 失败: {e}")
                    continue

            update_system_status('last_update', end_date)
            elapsed_time = time.time() - start_time
            message = f'成功更新 {len(trade_dates)} 个交易日,耗时 {elapsed_time:.1f} 秒'
            log_processing(end_date, operation, 'completed', message, elapsed_time)
            logger.info(f"增量更新完成: {message}")
            return True
        except Exception as e:
            elapsed_time = time.time() - start_time
            error_msg = f"增量更新失败: {e}"
            log_processing(end_date, operation, 'failed', error_msg, elapsed_time)
            logger.error(error_msg)
            return False

    def get_analysis_data(self, end_date: str, days: int = 30) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Read stored market summaries and split them by period.

        Args:
            end_date: End date passed to ``get_historical_summary``.
            days: Number of days passed to ``get_historical_summary``.

        Returns:
            ``(df_52w, df_26w)``. In non-empty frames the count columns are
            renamed to high_count / low_count / net_high. Two empty frames are
            returned when nothing is stored or an error occurs.
        """
        try:
            df_all = get_historical_summary(end_date=end_date, days=days)
            if df_all.empty:
                logger.warning(f"没有找到 {days} 天的历史数据")
                return pd.DataFrame(), pd.DataFrame()

            # .copy() so the in-place rename below does not touch df_all
            df_52w = df_all[df_all['period_type'] == '52w'].copy()
            df_26w = df_all[df_all['period_type'] == '26w'].copy()

            # Rename the count columns for compatibility
            for df in [df_52w, df_26w]:
                if not df.empty:
                    df.rename(columns={
                        'new_high_count': 'high_count',
                        'new_low_count': 'low_count',
                        'net_high_count': 'net_high'
                    }, inplace=True)
            logger.info(f"获取分析数据成功: 52周 {len(df_52w)} 条, 26周 {len(df_26w)} 条")
            return df_52w, df_26w
        except Exception as e:
            logger.error(f"获取分析数据失败: {e}")
            return pd.DataFrame(), pd.DataFrame()

    def get_performance_stats(self) -> Dict:
        """Return a shallow copy of the performance counters."""
        return self.stats.copy()

    def cleanup_old_data(self, days_to_keep: int = 90):
        """Delete old data, keeping the last ``days_to_keep`` days.

        Errors are logged, not raised.
        """
        try:
            # Inside a method body this bare name resolves to the module-level
            # function imported from db_utils_v2, not to this method.
            cleanup_old_data(days_to_keep)
            logger.info(f"清理 {days_to_keep} 天前的老旧数据完成")
        except Exception as e:
            logger.error(f"清理老旧数据失败: {e}")
# 便利函数
def analyze_high_low_v2(token: str, end_date: Optional[str] = None, days: int = 30,
                        force_update: bool = False, rebuild: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Main entry point of the new-high / new-low analysis (V2).

    Brings the stored data up to ``end_date`` when needed, then returns the
    stored summaries.

    Args:
        token: Tushare API token.
        end_date: End date 'YYYYMMDD'; defaults to today's local date.
        days: Number of days to analyse.
        force_update: Run the incremental update even when the stored
            'last_update' is not older than ``end_date``.
        rebuild: Run the initial load even when a previous update exists.

    Returns:
        ``(df_52w, df_26w)``; two empty DataFrames when loading, updating or
        reading fails.
    """
    if end_date is None:
        end_date = datetime.now().strftime('%Y%m%d')
    analyzer = HighLowAnalyzerV2(token)
    try:
        # Decide whether the stored data must be refreshed first
        last_update = get_system_status('last_update')
        if rebuild or not last_update:
            logger.info("执行初始数据加载...")
            # The initial load always covers at least 90 days
            success = analyzer.initial_data_load(end_date, days=max(days, 90), rebuild=rebuild)
            if not success:
                logger.error("初始数据加载失败")
                return pd.DataFrame(), pd.DataFrame()
        # 'YYYYMMDD' strings order the same way as the dates they encode
        elif force_update or last_update < end_date:
            logger.info("执行增量更新...")
            success = analyzer.incremental_update(end_date)
            if not success:
                logger.error("增量更新失败")
                return pd.DataFrame(), pd.DataFrame()
        else:
            logger.info(f"数据已是最新({last_update}),直接获取分析结果")

        # Read the results
        df_52w, df_26w = analyzer.get_analysis_data(end_date, days)

        # Report the performance counters
        stats = analyzer.get_performance_stats()
        logger.info(f"性能统计: API调用 {stats['api_calls']} 次, "
                    f"缓存命中 {stats['cache_hits']} 次, "
                    f"处理时间 {stats['processing_time']:.1f} 秒, "
                    f"错误 {stats['errors']} 次")
        return df_52w, df_26w
    except Exception as e:
        logger.error(f"分析过程失败: {e}")
        return pd.DataFrame(), pd.DataFrame()
if __name__ == "__main__":
    # Manual smoke test.
    import os
    import sys

    # Without a handler the INFO messages below would not be shown.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(name)s %(levelname)s %(message)s'
    )

    # The API token is a credential and must not live in source control;
    # it is read from the environment instead.
    TOKEN = os.environ.get('TUSHARE_TOKEN')
    if not TOKEN:
        logger.error("TUSHARE_TOKEN environment variable is not set")
        sys.exit(1)

    logger.info("开始测试新高新低分析器 V2")
    # Analyse up to today's local date
    end_date = datetime.now().strftime('%Y%m%d')
    # analyze_high_low_v2 builds its own analyzer instance
    df_52w, df_26w = analyze_high_low_v2(TOKEN, end_date, days=30)
    if not df_52w.empty and not df_26w.empty:
        logger.info("测试成功完成")
        print(f"52周数据: {len(df_52w)} 条记录")
        print(f"26周数据: {len(df_26w)} 条记录")
    else:
        logger.error("测试失败")