|  | 
|  | 1 | +from bs4 import BeautifulSoup | 
|  | 2 | +import requests | 
|  | 3 | +import time | 
|  | 4 | + | 
|  | 5 | +# Script Name		: gstin_scraper.py | 
|  | 6 | +# Author				: Purshotam | 
|  | 7 | +# Created				: Sep 6, 2021 7:59 PM | 
|  | 8 | +# Last Modified		: Oct 3, 2023 6:28 PM | 
|  | 9 | +# Version				: 1.0 | 
|  | 10 | +# Modifications		: | 
""" Description	:
GSTIN, short for Goods and Services Tax Identification Number,
is a unique 15-character alphanumeric identifier assigned to every taxpayer
(primarily a dealer, a supplier, or any other business entity) registered under the GST regime.
This script fetches the GSTINs of companies by name and keeps those registered in the
Mumbai / Bengaluru region (GSTINs beginning with 27 or 29).
"""
|  | 18 | + | 
|  | 19 | + | 
# Fallback company names, handy for trying the script out.
# They are used whenever the user supplies no names at the "company input"
# prompt, i.e. presses Enter straight away.
demo_companies = [
    "Bank of Baroda",
    "Trident Limited",
    "Reliance Limited",
    "The Yummy Treat",
    "Yes Bank",
    "Mumbai Mineral Trading Corporation",
]
|  | 23 | + | 
def get_company_list():
    """Interactively collect company names from standard input.

    Prompts repeatedly until the user submits a blank line (or the input
    stream ends) and returns the names gathered up to that point.

    Returns:
        list[str]: The entered names in input order, with surrounding
        whitespace removed. Empty when the user enters nothing.
    """
    company_list = []

    while True:
        try:
            company = input("Enter a company name (or press Enter to finish): ")
        except EOFError:
            # stdin was closed or a piped input ran out of lines: treat it
            # like a blank entry instead of crashing with a traceback.
            break

        company = company.strip()
        if not company:
            # A whitespace-only line carries no name, so it also ends input.
            break
        company_list.append(company)

    return company_list
|  | 34 | + | 
def fetch_gstins(company_name, csrf_token):
    """Search the third-party GST lookup site for a company's GSTINs.

    Args:
        company_name: Text submitted in the search form's ``gstnum`` field.
        csrf_token: Value submitted in the form's ``csrfmiddlewaretoken``
            field.

    Returns:
        list[str]: One GSTIN string per search result that has the expected
        markup, with surrounding whitespace removed. Results that lack the
        expected structure are skipped. Empty when nothing matched.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status from the site.
    """
    third_party_gstin_site = "https://www.knowyourgst.com/gst-number-search/by-name-pan/"
    payload = {'gstnum': company_name, 'csrfmiddlewaretoken': csrf_token}

    # requests has no default timeout; without one a stalled server would
    # block the script forever.
    response = requests.post(third_party_gstin_site, data=payload, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as "no results".
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    site_results = soup.find_all(id="searchresult")

    gstins = []
    for result in site_results:
        # The GSTIN is read from the second sibling after the result's first
        # <strong> element. Any missing link in that chain shows up as an
        # AttributeError on None, so such results are skipped.
        try:
            value = result.strong.next_sibling.next_sibling.string
        except AttributeError:
            continue

        # .string is None when the element does not hold exactly one string.
        if value is None:
            continue

        # Convert to a plain str so the result does not keep the parse tree
        # alive, and drop whitespace that would break prefix checks.
        gstin = str(value).strip()
        if gstin:
            gstins.append(gstin)

    return gstins
|  | 48 | + | 
def main():
    """Look up GSTINs for the chosen companies and print a report.

    Company names come from the interactive prompt; when none are entered the
    built-in demo list is used. For each company the matching GSTINs are
    fetched and only those starting with ``27`` or ``29`` are kept. The
    report has one ``"<company> = <comma separated GSTINs>"`` entry per
    company, each followed by a blank line.

    Raises:
        requests.RequestException: If the search page cannot be fetched.
        RuntimeError: If the search page contains no CSRF token field.
    """
    entered_companies = get_company_list()
    companies = entered_companies if entered_companies else demo_companies

    third_party_gstin_site = "https://www.knowyourgst.com/gst-number-search/by-name-pan/"

    # Getting the CSRF value that every subsequent search POST must carry.
    # NOTE(review): CSRF form tokens are commonly validated against a cookie
    # issued with this page; the searches are sent without a shared session,
    # so confirm whether a requests.Session is needed here.
    page_with_csrf = requests.get(third_party_gstin_site, timeout=30)
    page_with_csrf.raise_for_status()
    soup = BeautifulSoup(page_with_csrf.text, 'html.parser')

    token_field = soup.find('input', {"name": "csrfmiddlewaretoken"})
    if token_field is None or not token_field.get('value'):
        # Without this check the failure would be an opaque
        # "'NoneType' object is not subscriptable".
        raise RuntimeError(
            f"No csrfmiddlewaretoken field found on {third_party_gstin_site}; "
            "the page layout may have changed."
        )
    csrf_token = token_field['value']

    report_entries = []
    for company in companies:
        gstins = fetch_gstins(company, csrf_token)

        # Keep only GSTINs with prefix 27 or 29 (the Mumbai / Bengaluru
        # filter). The truthiness test skips None or empty entries that would
        # otherwise break startswith().
        matching_gstins = [g for g in gstins if g and g.startswith(('27', '29'))]
        comma_separated_gstins = ', '.join(matching_gstins)

        report_entries.append(f"{company} = {comma_separated_gstins}\n\n")

        # Pause between requests so the third-party site does not mistake the
        # burst of searches for a DDoS attempt.
        time.sleep(0.5)

    # Printing the data
    print("".join(report_entries))
|  | 74 | + | 
# Run the scraper only when this file is executed as a script, so that
# importing the module does not prompt for input or hit the network.
if __name__ == "__main__":
    main()
0 commit comments