sceptix-club · PlatypusPus · Oct 4, 2024 · Oct 4, 2024 · Oct 8, 2024 · Oct 8, 2024
diff --git a/chatgpt.py b/chatgpt.py
diff --git a/output_sds.json b/output_sds.json
@@ -0,0 +1,38 @@
+{
+    "SafetyDataSheet": {
+        "Identification": {
+            "ProductName": "Acetone",
+            "Cat No.": "A9-4",
+            "CASNo": "67-64-1",
+            "RecommendedUse": "Laboratory chemicals.",
+            "Supplier": {
+                "Name": "2. Hazard(s) identification",
+                "Address": "Classification Company ",
+                "Telephone": "Fisher Scientific Company"
+            }
+        },
+        "HazardIdentification": {
+            "SignalWord": "Danger",
+            "HazardStatements": [
+                "Highly flammable liquid and vapor"
+            ],
+            "PrecautionaryStatements": [
+                "Prevention"
+            ]
+        },
+        "Composition/Information on Ingredients": {},
+        "First-aid measures": {},
+        "Fire-fighting measures": {},
+        "Accidental release measures": {},
+        "Handling and storage": {},
+        "Exposure controls/personal protection": {},
+        "Physical and chemical properties": {},
+        "Stability and reactivity": {},
+        "Toxicological information": {},
+        "Ecological information": {},
+        "Disposal considerations": {},
+        "Transport information": {},
+        "Regulatory information": {},
+        "Other information": {}
+    }
+}
diff --git a/rag.ipynb b/rag.ipynb
diff --git a/requirements.txt b/requirements.txt
diff --git a/test.ipynb b/test.ipynb
@@ -0,0 +1,173 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: PyPDF2 in d:\\data\\data_preprocessor\\.venv\\lib\\site-packages (3.0.1)Note: you may need to restart the kernel to use updated packages.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "pip install PyPDF2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PDF text extraction successful!\n",
+      "Identification section parsed successfully!\n",
+      "Hazard Identification section parsed successfully!\n",
+      "SDS data successfully saved to output_sds.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "import PyPDF2\n",
+    "import json\n",
+    "import re\n",
+    "\n",
+    "class SafetyDataSheetParser:\n",
+    "    def __init__(self, pdf_path, output_json_path):\n",
+    "        self.pdf_path = pdf_path\n",
+    "        self.output_json_path = output_json_path\n",
+    "        self.sds_data = {\n",
+    "            \"SafetyDataSheet\": {\n",
+    "                \"Identification\": {},\n",
+    "                \"HazardIdentification\": {},\n",
+    "                \"Composition/Information on Ingredients\":{},\n",
+    "                \"First-aid measures\":{},\n",
+    "                \"Fire-fighting measures\":{},\n",
+    "                \"Accidental release measures\":{},\n",
+    "                \"Handling and storage\":{},\n",
+    "                \"Exposure controls/personal protection\":{},\n",
+    "                \"Physical and chemical properties\":{},\n",
+    "                \"Stability and reactivity\":{},\n",
+    "                \"Toxicological information\":{},\n",
+    "                \"Ecological information\":{},\n",
+    "                \"Disposal considerations\":{},\n",
+    "                \"Transport information\":{},\n",
+    "                \"Regulatory information\":{},\n",
+    "                \"Other information\":{}\n",
+    "            }\n",
+    "        }\n",
+    "\n",
+    "    def extract_pdf_text(self):\n",
+    "        try:\n",
+    "            with open(self.pdf_path, 'rb') as file:\n",
+    "                reader = PyPDF2.PdfReader(file)\n",
+    "                text = \"\"\n",
+    "                for page in reader.pages:\n",
+    "                    text += page.extract_text() or \"\"  # Avoid NoneType errors\n",
+    "            if not text:\n",
+    "                raise ValueError(\"PDF text extraction failed or PDF is empty.\")\n",
+    "            print(\"PDF text extraction successful!\")\n",
+    "            return text\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error during PDF extraction: {e}\")\n",
+    "            return \"\"\n",
+    "\n",
+    "    def parse_identification_section(self, text):\n",
+    "        \"\"\"Parses the Identification section.\"\"\"\n",
+    "        try:\n",
+    "            product_name_match = re.search(r'Product Name\\s+(.+)', text)\n",
+    "            cat_no_match = re.search(r'Cat No. :\\s+[A-Z]+\\d+[A-Z]*-\\d+[A-Z\\d\\-]*',text)\n",
+    "            cas_no_match = re.search(r'CAS No\\s+(\\d+-\\d+-\\d+)', text)\n",
+    "            recommended_use_match = re.search(r'Recommended Use\\s+(.+)', text)\n",
+    "            supplier_match = re.search(r'Details of the supplier.+\\n(.+)\\n(.+)\\n(.+)', text)\n",
+    "\n",
+    "            self.sds_data[\"SafetyDataSheet\"][\"Identification\"] = {\n",
+    "                \"ProductName\": product_name_match.group(1) if product_name_match else \"N/A\",\n",
+    "                \"Cat No.\": cat_no_match.group(0) if cat_no_match else \"N/A\",\n",
+    "                \"CASNo\": cas_no_match.group(1) if cas_no_match else \"N/A\",\n",
+    "                \"RecommendedUse\": recommended_use_match.group(1) if recommended_use_match else \"N/A\",\n",
+    "                \"Supplier\": {\n",
+    "                    \"Name\": supplier_match.group(1) if supplier_match else \"N/A\",\n",
+    "                    \"Address\": supplier_match.group(2) if supplier_match else \"N/A\",\n",
+    "                    \"Telephone\": supplier_match.group(3) if supplier_match else \"N/A\"\n",
+    "                }\n",
+    "            }\n",
+    "            print(\"Identification section parsed successfully!\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error during Identification section parsing: {e}\")\n",
+    "\n",
+    "    def parse_hazard_identification_section(self, text):\n",
+    "        \"\"\"Parses the Hazard Identification section.\"\"\"\n",
+    "        try:\n",
+    "            signal_word_match = re.search(r'Signal Word\\s+(.+)', text)\n",
+    "            hazard_statements = re.findall(r'Hazard Statements\\s+(.+)', text)\n",
+    "            precautionary_statements = re.findall(r'Precautionary Statements\\s+(.+)', text)\n",
+    "\n",
+    "            self.sds_data[\"SafetyDataSheet\"][\"HazardIdentification\"] = {\n",
+    "                \"SignalWord\": signal_word_match.group(1) if signal_word_match else \"N/A\",\n",
+    "                \"HazardStatements\": hazard_statements if hazard_statements else [\"N/A\"],\n",
+    "                \"PrecautionaryStatements\": precautionary_statements if precautionary_statements else [\"N/A\"]\n",
+    "            }\n",
+    "            print(\"Hazard Identification section parsed successfully!\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error during Hazard Identification section parsing: {e}\")\n",
+    "\n",
+    "    def process_sds(self):\n",
+    "        \"\"\"Processes the SDS and saves it as a JSON.\"\"\"\n",
+    "        try:\n",
+    "            text = self.extract_pdf_text()\n",
+    "\n",
+    "            if text:\n",
+    "                # Parse relevant sections\n",
+    "                self.parse_identification_section(text)\n",
+    "                self.parse_hazard_identification_section(text)\n",
+    "\n",
+    "                # Save the structured data as JSON\n",
+    "                with open(self.output_json_path, 'w') as json_file:\n",
+    "                    json.dump(self.sds_data, json_file, indent=4)\n",
+    "\n",
+    "                print(f\"SDS data successfully saved to {self.output_json_path}\")\n",
+    "            else:\n",
+    "                print(\"No text extracted from the PDF.\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error during SDS processing: {e}\")\n",
+    "\n",
+    "# Example Usage:\n",
+    "if __name__ == \"__main__\":\n",
+    "    pdf_file = \"data/acetone-acs-l.pdf\"  # Path to your PDF file\n",
+    "    output_json = \"output_sds.json\"  # Path to save the output JSON\n",
+    "\n",
+    "    parser = SafetyDataSheetParser(pdf_file, output_json)\n",
+    "    parser.process_sds()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/test.py b/test.py
@@ -0,0 +1,44 @@
+import PyPDF2
+import json
+
+# Open and read the PDF file
+pdf_file_path = 'data/acetone-acs-l.pdf'
+with open(pdf_file_path, 'rb') as pdf_file:
+    reader = PyPDF2.PdfReader(pdf_file)
+
+    # Parsed result, shaped as
+    # {section number: {bullet line: [following lines], 'content': [lines]}}
+    structured_data = {}
+
+    current_section = None
+    current_subsection = None
+
+    # Loop through each page of the PDF
+    for page in reader.pages:
+        # Fall back to an empty string so a page without extractable text
+        # cannot break the split() below.
+        text = page.extract_text() or ''
+
+        # Process each line of text
+        for line in text.split('\n'):
+            stripped = line.strip()
+
+            # Detect main sections: a line holding only a number from 1 to 19.
+            # isdecimal() is used instead of isdigit() because isdigit() also
+            # accepts characters such as superscripts that int() rejects.
+            if stripped.isdecimal() and 1 <= int(stripped) < 20:
+                current_section = stripped
+                # setdefault keeps lines already collected if the same number
+                # shows up again later in the document.
+                structured_data.setdefault(current_section, {})
+                # A subsection belongs to the section it was found in; without
+                # this reset the next plain line would be appended under a key
+                # the new section does not have and raise KeyError.
+                current_subsection = None
+            elif stripped.startswith('•'):  # Detect subsections by bullet points
+                current_subsection = stripped
+                if current_section:
+                    structured_data[current_section][current_subsection] = []
+            elif current_section and current_subsection:
+                structured_data[current_section][current_subsection].append(stripped)
+            elif current_section:
+                # Lines that precede the first bullet of a section
+                structured_data[current_section].setdefault('content', []).append(stripped)
+
+# Convert to JSON format
+json_data = json.dumps(structured_data, indent=4)
+
+# Save the JSON data to a file
+output_json_path = 'structured_data.json'
+with open(output_json_path, 'w') as json_file:
+    json_file.write(json_data)
+
+print(f"JSON data has been saved to {output_json_path}")
diff --git a/test2.ipynb b/test2.ipynb
diff --git a/vinol.py b/vinol.py
@@ -0,0 +1,54 @@
+#file location: data_preprocessor/data/acetone-acs-l (1).pdf
+
+import PyPDF2
+import os
+
+# Data extraction function
+def pdf_to_text(pdf_path):
+    """Extract the text of every page of a PDF into a ``.txt`` file.
+
+    The output file is named after the PDF's base name with a ``.txt``
+    extension. Because only the bare file name is used, it is created in the
+    current working directory, not next to the PDF. Pages that yield no text
+    are skipped; every other page's text is followed by a newline.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        The name of the text file that was written.
+    """
+    # Extract the base name of the PDF file (without extension)
+    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
+    output_txt = f"{base_name}.txt"
+
+    # Open the PDF file in read-binary mode and pull the text of each page
+    with open(pdf_path, 'rb') as pdf_file:
+        reader = PyPDF2.PdfReader(pdf_file)
+        page_texts = [page.extract_text() for page in reader.pages]
+
+    # Join once instead of growing a string page by page; the trailing
+    # newline separates consecutive pages.
+    text = ''.join(f"{page_text}\n" for page_text in page_texts if page_text)
+
+    # Write the extracted text to a text file (output_txt)
+    with open(output_txt, 'w', encoding='utf-8') as txt_file:
+        txt_file.write(text)
+
+    print(f"PDF converted to text successfully! Text saved as: {output_txt}")
+    return output_txt
+
+# Usage example
+# NOTE: this call is at module level, so it runs whenever the file is executed
+# or imported. The path is relative to the current working directory.
+pdf_to_text('data/acetone-acs-l (1).pdf')
+
+# Same file addressed with a 'data_preprocessor/' prefix (kept disabled):
+#pdf_to_text('data_preprocessor/data/acetone-acs-l (1).pdf') 
+
+#cleaning up
+def exclude_lines_with_keyword(input_file_path):
+    """Copy a text file, dropping every line that contains "Page" or "____".
+
+    The copy is written next to the input file as
+    ``<name>_modified<extension>``. Both files are handled as UTF-8, which is
+    the encoding ``pdf_to_text`` uses when it writes its output; relying on
+    the platform default instead can garble or reject non-ASCII characters.
+
+    Args:
+        input_file_path: Path of the text file to filter.
+
+    Returns:
+        The path of the filtered file that was written.
+    """
+    # Create the output file name by appending '_modified' to the original file name
+    directory, base_name = os.path.split(input_file_path)
+    file_name, file_extension = os.path.splitext(base_name)
+    output_file_path = os.path.join(directory, f"{file_name}_modified{file_extension}")
+
+    with open(input_file_path, "r", encoding="utf-8") as input_file, \
+            open(output_file_path, "w", encoding="utf-8") as output_file:
+        # Keep only the lines that contain neither keyword
+        output_file.writelines(
+            line for line in input_file
+            if "Page" not in line and "____" not in line
+        )
+
+    print(f"Lines excluding the keywords have been written to '{output_file_path}'.")
+    return output_file_path
+
+# Filter the text file produced by the pdf_to_text() call above. That call
+# writes "<pdf base name>.txt" into the current working directory, hence the
+# bare file name here.
+exclude_lines_with_keyword("acetone-acs-l (1).txt")
diff --git a/vinol2.py b/vinol2.py
@@ -0,0 +1,76 @@
+#file location: data_preprocessor/data/acetone-acs-l (1).pdf
+
+import pdfplumber
+import json
+
+from nltk.tokenize import RegexpTokenizer
+from transformers import BertTokenizer, BertTokenizerFast, BertForTokenClassification
+import torch
+
+# Checkpoint used for both the tokenizer and the token-classification model
+MODEL_NAME = 'dbmdz/bert-large-cased-finetuned-conll03-english'
+
+text=""
+with pdfplumber.open("data_preprocessor/data/acetone-acs-l (1).pdf") as pdf:
+    first_page = pdf.pages[0] #reading only first pages for testing
+    text = first_page.extract_text()
+    print(text)
+
+
+# Split the page text into runs of word characters (punctuation is dropped)
+tokenizer = RegexpTokenizer(r'\w+')
+textArr = tokenizer.tokenize(text)
+print(textArr)
+#sample o/p ['SAFETY', 'DATA', 'SHEET', 'Creation', 'Date', '28', 'Apr', '2009', 'Revision', 'Date', '13', 'Oct', '2023', 'Revision', 'Number']
+
+# Your extracted text from the PDF
+text2 = ' '.join(textArr)
+
+# Load tokenizer and model for token classification.
+# The fast tokenizer is required because only fast tokenizers expose
+# word_ids(), which is what maps model outputs back to the input words.
+# NOTE: from here on `tokenizer` refers to the BERT tokenizer, not the regexp one.
+tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
+model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=9)  # Change num_labels based on your use case
+
+# Tokenize the text
+tokens = text2.split()
+input_tokens = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)
+
+# Get predictions
+with torch.no_grad():
+    outputs = model(**input_tokens)
+    logits = outputs.logits
+    predictions = torch.argmax(logits, dim=2)
+
+# Define a mapping of label IDs to label names
+# NOTE(review): presumably mirrors the checkpoint's own id2label table —
+# confirm against model.config.id2label.
+label_map = {
+    0: 'O',      # Outside
+    1: 'B-MISC', # Begin Miscellaneous
+    2: 'I-MISC', # Inside Miscellaneous
+    3: 'B-PER',  # Begin Person
+    4: 'I-PER',  # Inside Person
+    5: 'B-ORG',  # Begin Organization
+    6: 'I-ORG',  # Inside Organization
+    7: 'B-LOC',  # Begin Location
+    8: 'I-LOC',  # Inside Location
+}
+
+# Print the tokens and their predicted labels.
+# The model emits one prediction per encoded position, and the encoded
+# sequence contains special tokens plus possibly several word pieces per
+# word, so positions cannot be zipped against `tokens` directly. word_ids()
+# gives, for each position, the index of the word it came from (None for
+# special tokens); the label of a word is taken from its first word piece.
+word_ids = input_tokens.word_ids(batch_index=0)
+predicted_ids = predictions[0].tolist()
+labelled_words = set()
+for position, word_index in enumerate(word_ids):
+    if word_index is None or word_index in labelled_words:
+        continue
+    labelled_words.add(word_index)
+    print(f"Token: {tokens[word_index]} -> Label: {label_map[predicted_ids[position]]}")
+
+# Sample output recorded with the earlier position-by-position pairing
+# (labels there may be shifted relative to the words):
+#Token: SAFETY -> Label: O
+#Token: DATA -> Label: I-ORG
+#Token: SHEET -> Label: I-ORG
+#Token: Creation -> Label: I-ORG
+#Token: Date -> Label: I-ORG
+#Token: 28 -> Label: I-ORG
+
+# Convert to JSON
+# NOTE(review): only the word list is serialized here, not the predicted
+# labels — confirm whether the labels were meant to be saved as well.
+json_output = json.dumps(tokens, indent=4)
+# Print JSON output
+print(json_output)
+# Optionally, save to a JSON file
+with open('token_classification_output.json', 'w') as json_file:
+    json.dump(tokens, json_file, indent=4)
+
+
+
+
+