Skip to content

Commit a82f157

Browse files
authored
Create gstin_scraper.py
Hello owners, this is just a small and beautiful script (pun intended) that demonstrates the use of BeautifulSoup to extract practical data. GSTIN, short for Goods and Services Tax Identification Number, is a unique 15-digit identification number assigned to every taxpayer (primarily a dealer, supplier, or other business entity) registered under the GST regime in India. I created this script back in 2021, when one of my brothers needed it for his startup business, and today, when I saw this repo and its inclusiveness for all sorts of crazy Python scripts, I wanted to include this crazy adventure of mine. The original version was a bit messier, but I have refactored it according to current best practices. Hoping for a positive response from the community. Thank you so much, Purshotam Bohra
1 parent cbc6235 commit a82f157

File tree

1 file changed

+76
-0
lines changed

1 file changed

+76
-0
lines changed

Diff for: gstin_scraper.py

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from bs4 import BeautifulSoup
2+
import requests
3+
import time
4+
5+
# Script Name : gstin_scraper.py
6+
# Author : Purshotam
7+
# Created : Sep 6, 2021 7:59 PM
8+
# Last Modified : Oct 3, 2023 6:28 PM
9+
# Version : 1.0
10+
# Modifications :
11+
""" Description :
12+
GSTIN, short for Goods and Services Tax Identification Number,
13+
is a unique 15 digit identification number assigned to every taxpayer
14+
(primarily dealer or supplier or any business entity) registered under the GST regime.
15+
This script is able to fetch GSTIN numbers for any company registered in the
16+
Mumbai / Bengaluru region.
17+
"""
18+
19+
20+
# Using a demo list in case of testing the script.
21+
# This list will be used in case user skips "company input" dialogue by pressing enter.
22+
demo_companies = [
    "Bank of Baroda",
    "Trident Limited",
    "Reliance Limited",
    "The Yummy Treat",
    "Yes Bank",
    "Mumbai Mineral Trading Corporation",
]
23+
24+
def get_company_list():
    """Interactively collect company names from standard input.

    Prompts repeatedly until the user submits an empty (or whitespace-only)
    line, or until standard input is exhausted.

    Returns:
        A list of the names entered, in the order given, with surrounding
        whitespace removed. The list is empty when nothing was entered.
    """
    company_list = []

    while True:
        try:
            company = input("Enter a company name (or press Enter to finish): ").strip()
        except EOFError:
            # Closed or piped stdin: treat end of input like a blank line
            # instead of crashing with a traceback.
            break
        if not company:
            break
        company_list.append(company)

    return company_list
34+
35+
def fetch_gstins(company_name, csrf_token):
    """Search knowyourgst.com by company name and scrape the listed GSTINs.

    Args:
        company_name: Name to submit in the site's ``gstnum`` search field.
        csrf_token: Value to submit as the ``csrfmiddlewaretoken`` form field.

    Returns:
        A list of GSTIN strings, one per usable ``id="searchresult"`` element
        on the response page. Result elements that do not have the expected
        layout are skipped, so the list may be empty.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    third_party_gstin_site = "https://www.knowyourgst.com/gst-number-search/by-name-pan/"
    payload = {'gstnum': company_name, 'csrfmiddlewaretoken': csrf_token}

    # NOTE(review): the form field name suggests Django-style CSRF protection,
    # which normally also expects the CSRF cookie issued with the token. This
    # request sends no cookies -- confirm the site accepts it.
    # A timeout keeps an unresponsive site from hanging the script forever.
    response = requests.post(third_party_gstin_site, data=payload, timeout=30)
    # Fail loudly on 4xx/5xx rather than parsing an error page as "no results".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    gstins = []
    for result in soup.find_all(id="searchresult"):
        # The GSTIN is read from the second sibling after the first <strong>
        # tag of each result; guard every hop so a layout change skips the
        # entry instead of raising AttributeError.
        label = result.strong
        first_sibling = label.next_sibling if label is not None else None
        value_node = first_sibling.next_sibling if first_sibling is not None else None
        text = getattr(value_node, "string", None)
        if text and text.strip():
            gstins.append(text.strip())

    return gstins
48+
49+
def main():
    """Prompt for company names, look up their GSTINs, and print a report.

    Falls back to ``demo_companies`` when the user enters no names. One line
    of the form ``<company> = <gstin>, <gstin>, ...`` is printed per company,
    keeping only GSTINs whose state-code prefix is 27 or 29.

    Raises:
        requests.RequestException: If the search page cannot be fetched.
        RuntimeError: If the search page contains no CSRF token field.
    """
    entered = get_company_list()
    companies = entered if entered else demo_companies

    third_party_gstin_site = "https://www.knowyourgst.com/gst-number-search/by-name-pan/"

    # The first two characters of a GSTIN are the state code:
    # 27 = Maharashtra (includes Mumbai), 29 = Karnataka (includes Bengaluru).
    wanted_state_codes = ('27', '29')

    # Fetch the search page once to read the CSRF token that later form
    # submissions must carry.
    page_with_csrf = requests.get(third_party_gstin_site, timeout=30)
    page_with_csrf.raise_for_status()
    soup = BeautifulSoup(page_with_csrf.text, 'html.parser')
    csrf_input = soup.find('input', {"name": "csrfmiddlewaretoken"})
    if csrf_input is None or not csrf_input.get('value'):
        raise RuntimeError("Could not find the CSRF token on the GSTIN search page.")
    csrf_token = csrf_input['value']

    report_entries = []
    for index, company in enumerate(companies):
        if index:
            # Pause between consecutive lookups so the third-party site does
            # not mistake the script for a DDoS attempt.
            time.sleep(0.5)

        gstins = fetch_gstins(company, csrf_token)

        # Drop missing values before testing the prefix; a None entry would
        # otherwise crash on .startswith().
        selected = [
            g.strip() for g in gstins
            if g and g.strip().startswith(wanted_state_codes)
        ]
        comma_separated_gstins = ', '.join(selected)

        report_entries.append(f"{company} = {comma_separated_gstins}\n\n")

    # Printing the data
    print("".join(report_entries))
74+
75+
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)