-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscript.py
86 lines (71 loc) · 2.34 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Imports
import requests
from datetime import datetime
from _variables import amc_data
from dateutil.relativedelta import relativedelta
from pymongo import MongoClient, ASCENDING, DESCENDING, errors
# Constants
# Endpoint the scraper sends GET requests to; each request passes the
# 'mf', 'frmdt' and 'todt' query parameters.
url = 'http://portal.amfiindia.com/DownloadNAVHistoryReport_Po.aspx'
# Date the yearly date series is anchored on; parsed with '%d-%b-%Y'.
start_date_str = '01-Jan-2005'
# Host and port handed to MongoClient when connecting.
mongodb_host = 'localhost'
mongodb_port = 27017
# To generate dates
# Build the list of year-spaced boundaries used as (from, to) request windows.
# The list is seeded with the start date itself so the first window begins at
# start_date_str; previously the first entry was start + 1 year and the first
# year was never requested. The final boundary is the first one after today,
# so the last window covers the current, partial year.
start_date = datetime.strptime(start_date_str, '%d-%b-%Y')
today = datetime.today()
dates = [start_date.strftime('%d-%b-%Y')]
boundary = start_date
while boundary <= today:
    boundary = boundary + relativedelta(years=1)
    dates.append(boundary.strftime('%d-%b-%Y'))
# Connect to MongoDB
client = MongoClient(mongodb_host, mongodb_port)
db = client['amfi']
collection = db['nav_history']
# Unique compound index on (scheme_code, timestamp): re-inserting a NAV point
# that is already stored fails with a duplicate-key error instead of creating
# a second copy.
collection.create_index(
    [('scheme_code', DESCENDING), ('timestamp', ASCENDING)],
    unique=True,
)
# Iterate through AMC List
# Seconds to wait for the portal to connect / send data before giving up, so a
# stalled connection raises instead of hanging the scrape indefinitely.
request_timeout = 60


def _parse_nav_line(line, amc):
    """Turn one ';'-separated report line into a document, or None to skip it.

    A line is skipped when it does not have exactly 8 fields, or when its
    scheme code, NAV or date cannot be parsed (ValueError).
    """
    fields = line.split(';')
    # Process the line only if it has relevant fields
    if len(fields) != 8:
        return None
    try:
        return {
            'amc_code': amc['code'],
            'amc_name': amc['name'],
            'scheme_code': int(fields[0]),
            'scheme_name': fields[1],
            'isin': fields[2],
            'nav': float(fields[4]),
            'timestamp': datetime.strptime(fields[7], '%d-%b-%Y'),
        }
    except ValueError:
        return None


for amc in amc_data:
    print('\n--------------------')
    print('Scraping', amc['name'])
    # Walk consecutive (previous, current) date pairs
    for from_date, to_date in zip(dates, dates[1:]):
        print('From', from_date, 'to', to_date)
        params = {
            'mf': amc['code'],
            'frmdt': from_date,
            'todt': to_date,
        }
        res = requests.get(url, params=params, timeout=request_timeout)
        data = res.text
        # If it does not contain 'Scheme Code' as the first entry, then data does not exist
        if not data.startswith('Scheme'):
            continue
        # Drop the header line, then keep only the lines that parse cleanly
        arr_points = []
        for line in data.splitlines()[1:]:
            point = _parse_nav_line(line, amc)
            if point is not None:
                arr_points.append(point)
        # insert_many rejects an empty list, so skip windows with no usable rows
        if not arr_points:
            continue
        try:
            # Insert all documents of request at once
            collection.insert_many(arr_points, ordered=False)
        except errors.BulkWriteError as e:
            # Code 11000 is a duplicate key: the point is already stored.
            # Materialise the remaining errors as a list so they can be both
            # tested and printed (a filter object is exhausted after one pass).
            panic = [
                err for err in e.details['writeErrors']
                if err['code'] != 11000
            ]
            if panic:
                print(panic)