Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
383 changes: 383 additions & 0 deletions chatgpt.py

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions output_sds.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"SafetyDataSheet": {
"Identification": {
"ProductName": "Acetone",
"Cat No.": "A9-4",
"CASNo": "67-64-1",
"RecommendedUse": "Laboratory chemicals.",
"Supplier": {
"Name": "2. Hazard(s) identification",
"Address": "Classification Company ",
"Telephone": "Fisher Scientific Company"
}
},
"HazardIdentification": {
"SignalWord": "Danger",
"HazardStatements": [
"Highly flammable liquid and vapor"
],
"PrecautionaryStatements": [
"Prevention"
]
},
"Composition/Information on Ingredients": {},
"First-aid measures": {},
"Fire-fighting measures": {},
"Accidental release measures": {},
"Handling and storage": {},
"Exposure controls/personal protection": {},
"Physical and chemical properties": {},
"Stability and reactivity": {},
"Toxicological information": {},
"Ecological information": {},
"Disposal considerations": {},
"Transport information": {},
"Regulatory information": {},
"Other information": {}
}
}
278 changes: 278 additions & 0 deletions rag.ipynb

Large diffs are not rendered by default.

Binary file added requirements.txt
Binary file not shown.
173 changes: 173 additions & 0 deletions test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: PyPDF2 in d:\\data\\data_preprocessor\\.venv\\lib\\site-packages (3.0.1)Note: you may need to restart the kernel to use updated packages.\n",
"\n"
]
}
],
"source": [
"pip install PyPDF2"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PDF text extraction successful!\n",
"Identification section parsed successfully!\n",
"Hazard Identification section parsed successfully!\n",
"SDS data successfully saved to output_sds.json\n"
]
}
],
"source": [
"import PyPDF2\n",
"import json\n",
"import re\n",
"\n",
"class SafetyDataSheetParser:\n",
" def __init__(self, pdf_path, output_json_path):\n",
" self.pdf_path = pdf_path\n",
" self.output_json_path = output_json_path\n",
" self.sds_data = {\n",
" \"SafetyDataSheet\": {\n",
" \"Identification\": {},\n",
" \"HazardIdentification\": {},\n",
" \"Composition/Information on Ingredients\":{},\n",
" \"First-aid measures\":{},\n",
" \"Fire-fighting measures\":{},\n",
" \"Accidental release measures\":{},\n",
" \"Handling and storage\":{},\n",
" \"Exposure controls/personal protection\":{},\n",
" \"Physical and chemical properties\":{},\n",
" \"Stability and reactivity\":{},\n",
" \"Toxicological information\":{},\n",
" \"Ecological information\":{},\n",
" \"Disposal considerations\":{},\n",
" \"Transport information\":{},\n",
" \"Regulatory information\":{},\n",
" \"Other information\":{}\n",
" }\n",
" }\n",
"\n",
" def extract_pdf_text(self):\n",
" try:\n",
" with open(self.pdf_path, 'rb') as file:\n",
" reader = PyPDF2.PdfReader(file)\n",
" text = \"\"\n",
" for page in reader.pages:\n",
" text += page.extract_text() or \"\" # Avoid NoneType errors\n",
" if not text:\n",
" raise ValueError(\"PDF text extraction failed or PDF is empty.\")\n",
" print(\"PDF text extraction successful!\")\n",
" return text\n",
" except Exception as e:\n",
" print(f\"Error during PDF extraction: {e}\")\n",
" return \"\"\n",
"\n",
" def parse_identification_section(self, text):\n",
" \"\"\"Parses the Identification section.\"\"\"\n",
" try:\n",
" product_name_match = re.search(r'Product Name\\s+(.+)', text)\n",
" cat_no_match = re.search(r'Cat No. :\\s+[A-Z]+\\d+[A-Z]*-\\d+[A-Z\\d\\-]*',text)\n",
" cas_no_match = re.search(r'CAS No\\s+(\\d+-\\d+-\\d+)', text)\n",
" recommended_use_match = re.search(r'Recommended Use\\s+(.+)', text)\n",
" supplier_match = re.search(r'Details of the supplier.+\\n(.+)\\n(.+)\\n(.+)', text)\n",
"\n",
" self.sds_data[\"SafetyDataSheet\"][\"Identification\"] = {\n",
" \"ProductName\": product_name_match.group(1) if product_name_match else \"N/A\",\n",
" \"Cat No.\": cat_no_match.group(0) if cat_no_match else \"N/A\",\n",
" \"CASNo\": cas_no_match.group(1) if cas_no_match else \"N/A\",\n",
" \"RecommendedUse\": recommended_use_match.group(1) if recommended_use_match else \"N/A\",\n",
" \"Supplier\": {\n",
" \"Name\": supplier_match.group(1) if supplier_match else \"N/A\",\n",
" \"Address\": supplier_match.group(2) if supplier_match else \"N/A\",\n",
" \"Telephone\": supplier_match.group(3) if supplier_match else \"N/A\"\n",
" }\n",
" }\n",
" print(\"Identification section parsed successfully!\")\n",
" except Exception as e:\n",
" print(f\"Error during Identification section parsing: {e}\")\n",
"\n",
" def parse_hazard_identification_section(self, text):\n",
" \"\"\"Parses the Hazard Identification section.\"\"\"\n",
" try:\n",
" signal_word_match = re.search(r'Signal Word\\s+(.+)', text)\n",
" hazard_statements = re.findall(r'Hazard Statements\\s+(.+)', text)\n",
" precautionary_statements = re.findall(r'Precautionary Statements\\s+(.+)', text)\n",
"\n",
" self.sds_data[\"SafetyDataSheet\"][\"HazardIdentification\"] = {\n",
" \"SignalWord\": signal_word_match.group(1) if signal_word_match else \"N/A\",\n",
" \"HazardStatements\": hazard_statements if hazard_statements else [\"N/A\"],\n",
" \"PrecautionaryStatements\": precautionary_statements if precautionary_statements else [\"N/A\"]\n",
" }\n",
" print(\"Hazard Identification section parsed successfully!\")\n",
" except Exception as e:\n",
" print(f\"Error during Hazard Identification section parsing: {e}\")\n",
"\n",
" def process_sds(self):\n",
" \"\"\"Processes the SDS and saves it as a JSON.\"\"\"\n",
" try:\n",
" text = self.extract_pdf_text()\n",
"\n",
" if text:\n",
" # Parse relevant sections\n",
" self.parse_identification_section(text)\n",
" self.parse_hazard_identification_section(text)\n",
"\n",
" # Save the structured data as JSON\n",
" with open(self.output_json_path, 'w') as json_file:\n",
" json.dump(self.sds_data, json_file, indent=4)\n",
"\n",
" print(f\"SDS data successfully saved to {self.output_json_path}\")\n",
" else:\n",
" print(\"No text extracted from the PDF.\")\n",
" except Exception as e:\n",
" print(f\"Error during SDS processing: {e}\")\n",
"\n",
"# Example Usage:\n",
"if __name__ == \"__main__\":\n",
" pdf_file = \"data/acetone-acs-l.pdf\" # Path to your PDF file\n",
" output_json = \"output_sds.json\" # Path to save the output JSON\n",
"\n",
" parser = SafetyDataSheetParser(pdf_file, output_json)\n",
" parser.process_sds()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
44 changes: 44 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import PyPDF2
import json

# Read a PDF with PyPDF2, group its lines under numbered sections and bullet
# subsections, and save the result as JSON.
pdf_file_path = 'data/acetone-acs-l.pdf'

# Parsed result, keyed by section number (as a string). Each section maps
# either a bullet line to the lines that follow it, or 'content' to the lines
# seen before the first bullet of that section.
structured_data = {}

current_section = None
current_subsection = None

# The reader is only used while the underlying file handle is still open.
with open(pdf_file_path, 'rb') as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)

    # Loop through each page of the PDF
    for page in reader.pages:
        # Treat a page that yields no text as empty instead of crashing on
        # None.split().
        text = page.extract_text() or ''

        # Process each line of text
        for raw_line in text.split('\n'):
            line = raw_line.strip()

            # Detect main sections: a line holding only a number from 1 to 19.
            # isdecimal() (unlike isdigit()) rejects characters such as
            # superscript digits that int() cannot parse.
            if line.isdecimal() and int(line) in range(1, 20):
                current_section = line
                structured_data[current_section] = {}
                # A new section starts without a subsection. Keeping the old
                # one would make the next text line index a key that does not
                # exist in the fresh section dict (KeyError).
                current_subsection = None
            elif line.startswith('•'):  # Detect subsections by bullet points
                current_subsection = line
                if current_section:
                    structured_data[current_section][current_subsection] = []
            elif current_section and current_subsection:
                structured_data[current_section][current_subsection].append(line)
            elif current_section:
                structured_data[current_section].setdefault('content', []).append(line)
            # Lines seen before the first section header are dropped.

# Convert to JSON format
json_data = json.dumps(structured_data, indent=4)

# Save the JSON data to a file
output_json_path = 'structured_data.json'
with open(output_json_path, 'w') as json_file:
    json_file.write(json_data)

print(f"JSON data has been saved to {output_json_path}")
Empty file added test2.ipynb
Empty file.
54 changes: 54 additions & 0 deletions vinol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#file location: data_preprocessor/data/acetone-acs-l (1).pdf

import PyPDF2
import os

# Data extraction function
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF into a ``.txt`` file.

    The output file takes the PDF's base name with a ``.txt`` extension and is
    created relative to the current working directory (not next to the PDF).

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The path of the text file that was written.
    """
    # "dir/report.pdf" -> "report.txt"
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_txt = f"{base_name}.txt"

    # Open the PDF in binary mode and pull the text out while it is still open.
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        extracted = (page.extract_text() for page in reader.pages)
        # Skip pages that yielded no text; end each kept page with a newline
        # so pages stay separated in the output.
        text = "".join(f"{page_text}\n" for page_text in extracted if page_text)

    # Write the extracted text to the output file
    with open(output_txt, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

    print(f"PDF converted to text successfully! Text saved as: {output_txt}")
    return output_txt

# Usage example. This is a module-level call, so it runs every time the file
# is executed or imported; the path is relative to the current working directory.
pdf_to_text('data/acetone-acs-l (1).pdf')

# Alternative path, presumably for running from the directory that contains
# data_preprocessor/ (confirm before re-enabling):
#pdf_to_text('data_preprocessor/data/acetone-acs-l (1).pdf')

#cleaning up
def exclude_lines_with_keyword(input_file_path):
    """Copy a text file, dropping lines that contain ``Page`` or ``____``.

    The match is a plain substring test, so any line containing the text
    "Page" or four consecutive underscores is removed. The filtered copy is
    written next to the input file as ``<name>_modified<ext>``.

    Args:
        input_file_path: Path to the text file to filter.

    Returns:
        The path of the filtered file that was written.
    """
    # "dir/report.txt" -> "dir/report_modified.txt"
    directory = os.path.dirname(input_file_path)
    file_name, file_extension = os.path.splitext(os.path.basename(input_file_path))
    output_file_path = os.path.join(directory, f"{file_name}_modified{file_extension}")

    # pdf_to_text in this file writes its output as UTF-8, so read and write
    # with the same explicit encoding instead of the platform default.
    with open(input_file_path, "r", encoding="utf-8") as input_file, \
            open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.writelines(
            line
            for line in input_file
            if "Page" not in line and "____" not in line
        )

    print(f"Lines excluding the keywords have been written to '{output_file_path}'.")
    return output_file_path

# Module-level call: filters the text file that the pdf_to_text call above
# writes into the current working directory.
exclude_lines_with_keyword("acetone-acs-l (1).txt")
76 changes: 76 additions & 0 deletions vinol2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#file location: data_preprocessor/data/acetone-acs-l (1).pdf

import pdfplumber
import json

# Extract the text of the first page only (kept small for testing).
with pdfplumber.open("data_preprocessor/data/acetone-acs-l (1).pdf") as pdf:
    first_page = pdf.pages[0]
    # Fall back to an empty string so the tokenizer below never receives None.
    text = first_page.extract_text() or ""
print(text)


from nltk.tokenize import RegexpTokenizer

# Keep runs of word characters only; punctuation and whitespace are dropped.
word_tokenizer = RegexpTokenizer(r'\w+')
textArr = word_tokenizer.tokenize(text)
print(textArr)
#sample o/p ['SAFETY', 'DATA', 'SHEET', 'Creation', 'Date', '28', 'Apr', '2009', 'Revision', 'Date', '13', 'Oct', '2023', 'Revision', 'Number']

text2 = ' '.join(textArr)

from transformers import BertTokenizer, BertForTokenClassification
import torch

# Load tokenizer and model for token classification
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
model = BertForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english', num_labels=9)  # Change num_labels based on your use case

# Words to classify (one entry per whitespace-separated word)
tokens = text2.split()
input_tokens = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)

# Get predictions
with torch.no_grad():
    outputs = model(**input_tokens)
logits = outputs.logits
predictions = torch.argmax(logits, dim=2)

# Define a mapping of label IDs to label names
label_map = {
    0: 'O',       # Outside
    1: 'B-MISC',  # Begin Miscellaneous
    2: 'I-MISC',  # Inside Miscellaneous
    3: 'B-PER',   # Begin Person
    4: 'I-PER',   # Inside Person
    5: 'B-ORG',   # Begin Organization
    6: 'I-ORG',   # Inside Organization
    7: 'B-LOC',   # Begin Location
    8: 'I-LOC',   # Inside Location
}

# The model returns one prediction per word piece, and the encoded sequence
# starts with [CLS] and ends with [SEP]. Zipping the words directly against
# the predictions therefore shifts every label. Instead, walk the sequence and
# take the prediction of each word's first piece.
# NOTE(review): this assumes BertTokenizer encodes each pre-split word
# independently, so tokenizer.tokenize(word) gives that word's piece count.
piece_label_ids = predictions[0].tolist()
sep_position = len(piece_label_ids) - 1  # trailing [SEP]
labelled_tokens = []
piece_index = 1  # position 0 is [CLS]
for token in tokens:
    piece_count = len(tokenizer.tokenize(token))
    if piece_count == 0:
        # The word produced no word pieces, so it has no prediction.
        continue
    if piece_index >= sep_position:
        # truncation=True cut the sequence; remaining words were not classified.
        break
    label = label_map[piece_label_ids[piece_index]]
    labelled_tokens.append({"token": token, "label": label})
    # Print the tokens and their predicted labels
    print(f"Token: {token} -> Label: {label}")
    piece_index += piece_count

# Convert to JSON. Each record carries the word and its predicted label, so
# the saved file actually contains the classification result.
json_output = json.dumps(labelled_tokens, indent=4)
# Print JSON output
print(json_output)
# Save to a JSON file
with open('token_classification_output.json', 'w') as json_file:
    json_file.write(json_output)