diff --git a/chatgpt.py b/chatgpt.py
new file mode 100644
index 00000000..31dd3847
--- /dev/null
+++ b/chatgpt.py
@@ -0,0 +1,369 @@
+"""Parse a Safety Data Sheet (SDS) PDF into a structured JSON document.
+
+Text comes from PyPDF2, with EasyOCR as a fallback for PDFs that carry no
+text layer; tables come from Camelot; fields are located with regular
+expressions.
+"""
+import json
+import re
+
+import camelot
+import easyocr
+import numpy as np
+import PyPDF2
+from pdf2image import convert_from_path  # Convert PDF pages to images
+
+# Placeholder stored for every field whose pattern is not found in the text.
+NOT_AVAILABLE = "N/A"
+
+# Section keys of the output document, in the order they are written.
+SECTION_NAMES = (
+    "Identification",
+    "HazardIdentification",
+    "Composition/Information on Ingredients",
+    "First-aid measures",
+    "Fire-fighting measures",
+    "Accidental release measures",
+    "Handling and storage",
+    "Exposure controls/personal protection",
+    "Physical and chemical properties",
+    "Stability and reactivity",
+    "Toxicological information",
+    "Ecological information",
+    "Disposal considerations",
+    "Transport information",
+    "Regulatory information",
+    "Other information",
+)
+
+
+class SafetyDataSheetParser:
+    """Extract the standard SDS sections from a PDF and save them as JSON.
+
+    Attributes:
+        pdf_path: Path of the PDF to read.
+        output_json_path: Path the JSON result is written to.
+        reader: EasyOCR reader; None until OCR is first needed.
+        sds_data: Result document, one dict per section plus a "Tables" list.
+    """
+
+    def __init__(self, pdf_path, output_json_path):
+        self.pdf_path = pdf_path
+        self.output_json_path = output_json_path
+        # Created on first OCR use so that PDFs with a text layer never pay
+        # for constructing the OCR reader.
+        self.reader = None
+        sections = {name: {} for name in SECTION_NAMES}
+        sections["Tables"] = []
+        self.sds_data = {"SafetyDataSheet": sections}
+
+    def extract_pdf_text(self):
+        """Return the PDF's text, or "" when nothing could be extracted.
+
+        PyPDF2 is tried first. If it raises or yields only whitespace, the
+        pages are passed to extract_text_with_ocr() instead.
+        """
+        try:
+            with open(self.pdf_path, 'rb') as file:
+                pages = PyPDF2.PdfReader(file).pages
+                # A newline between pages keeps the last line of one page
+                # from fusing with the first line of the next.
+                text = "\n".join(page.extract_text() or "" for page in pages)
+        except Exception as e:
+            print(f"Error during PDF extraction with PyPDF2: {e}")
+            text = ""
+        if text.strip():
+            print("PDF text extraction successful with PyPDF2!")
+            return text
+        print("PyPDF2 extraction failed. Attempting OCR...")
+        return self.extract_text_with_ocr()
+
+    def extract_text_with_ocr(self):
+        """Return the PDF's text as recognised by EasyOCR, or "" on failure."""
+        try:
+            print("Extracting text using EasyOCR...")
+            if self.reader is None:
+                self.reader = easyocr.Reader(['en'], gpu=False)
+            lines = []
+            # Render each page to an image, then OCR it.
+            for img in convert_from_path(self.pdf_path):
+                # readtext() documents path, bytes and numpy-array inputs,
+                # so hand it an array rather than the PIL image itself.
+                lines.extend(self.reader.readtext(np.array(img), detail=0))
+            text = "\n".join(lines)
+            if not text.strip():
+                raise ValueError("OCR text extraction failed.")
+            print("Text extraction with EasyOCR successful!")
+            return text
+        except Exception as e:
+            print(f"Error during OCR extraction: {e}")
+            return ""
+
+    def extract_tables(self):
+        """Store every table Camelot finds in the PDF under "Tables".
+
+        Each table is kept as the to_dict() form of its DataFrame. A failure
+        is reported and leaves the current "Tables" value unchanged.
+        """
+        try:
+            print("Extracting tables using Camelot...")
+            tables = camelot.read_pdf(self.pdf_path, pages='all', strip_text='\n')
+            self.sds_data["SafetyDataSheet"]["Tables"] = [
+                table.df.to_dict() for table in tables
+            ]
+            print(f"{len(tables)} table(s) extracted successfully!")
+        except Exception as e:
+            print(f"Error during table extraction: {e}")
+
+    @staticmethod
+    def _search_groups(pattern, text, count=1):
+        """Return the first ``count`` capture groups of the first match.
+
+        Values are stripped of surrounding whitespace. A group that is empty
+        after stripping, or a pattern that does not match, gives "N/A".
+        """
+        match = re.search(pattern, text)
+        if match is None:
+            return [NOT_AVAILABLE] * count
+        return [
+            match.group(i).strip() or NOT_AVAILABLE for i in range(1, count + 1)
+        ]
+
+    @classmethod
+    def _first(cls, pattern, text):
+        """Return capture group 1 of the first match, or "N/A"."""
+        return cls._search_groups(pattern, text)[0]
+
+    @staticmethod
+    def _find_all(pattern, text):
+        """Return every stripped match of ``pattern``, or ["N/A"] if none."""
+        return [found.strip() for found in re.findall(pattern, text)] or [NOT_AVAILABLE]
+
+    def _store_section(self, section, build, label=None):
+        """Overwrite one section of sds_data and print the outcome.
+
+        Args:
+            section: Key under "SafetyDataSheet" to replace.
+            build: Zero-argument callable returning the section's dict.
+            label: Name used in the printed messages; defaults to section.
+        """
+        label = label or section
+        try:
+            self.sds_data["SafetyDataSheet"][section] = build()
+            print(f"{label} section parsed successfully!")
+        except Exception as e:
+            print(f"Error during {label} section parsing: {e}")
+
+    def _parse_fields(self, section, patterns, text, label=None):
+        """Store the first match of each {field: pattern} entry in section."""
+        self._store_section(
+            section,
+            lambda: {
+                field: self._first(pattern, text)
+                for field, pattern in patterns.items()
+            },
+            label,
+        )
+
+    def parse_identification_section(self, text):
+        """Parse product name, catalogue/CAS numbers, use and supplier."""
+        def build():
+            # NOTE(review): assumes the three lines after the "Details of the
+            # supplier" line are name, address and telephone - confirm
+            # against the text layout of real SDS files.
+            name, address, telephone = self._search_groups(
+                r'Details of the supplier.+\n(.+)\n(.+)\n(.+)', text, 3)
+            return {
+                "ProductName": self._first(r'Product Name\s+(.+)', text),
+                # The dot is escaped so it only matches a literal "Cat No.".
+                "Cat No.": self._first(r'Cat No\. :\s+(.+)', text),
+                "CASNo": self._first(r'CAS No\s+(\d+-\d+-\d+)', text),
+                "RecommendedUse": self._first(r'Recommended Use\s+(.+)', text),
+                "Supplier": {
+                    "Name": name,
+                    "Address": address,
+                    "Telephone": telephone,
+                },
+            }
+
+        self._store_section("Identification", build)
+
+    def parse_hazard_identification_section(self, text):
+        """Parse the signal word and the hazard/precautionary statements."""
+        def build():
+            return {
+                "SignalWord": self._first(r'Signal Word\s+(.+)', text),
+                "HazardStatements": self._find_all(
+                    r'Hazard Statements\s+(.+)', text),
+                "PrecautionaryStatements": self._find_all(
+                    r'Precautionary Statements\s+(.+)', text),
+            }
+
+        self._store_section("HazardIdentification", build, "Hazard Identification")
+
+    def parse_composition_section(self, text):
+        """Parse the component / CAS number / weight-percent triple."""
+        def build():
+            component, cas_no, weight = self._search_groups(
+                r'Component\s+(.+)\nCAS No\s+(.+)\nWeight %\s+(.+)', text, 3)
+            return {
+                "Component": component,
+                "CASNo": cas_no,
+                "WeightPercentage": weight,
+            }
+
+        self._store_section(
+            "Composition/Information on Ingredients", build, "Composition")
+
+    def parse_first_aid_measures(self, text):
+        """Parse the first-aid advice for each exposure route."""
+        self._parse_fields("First-aid measures", {
+            "GeneralAdvice": r'General Advice\s+(.+)',
+            "EyeContact": r'Eye Contact\s+(.+)',
+            "SkinContact": r'Skin Contact\s+(.+)',
+            "Inhalation": r'Inhalation\s+(.+)',
+            "Ingestion": r'Ingestion\s+(.+)',
+        }, text)
+
+    def parse_fire_fighting_measures(self, text):
+        """Parse extinguishing media, specific hazards and firefighter gear."""
+        self._parse_fields("Fire-fighting measures", {
+            "ExtinguishingMedia": r'Suitable Extinguishing Media\s+(.+)',
+            "SpecificHazards": r'Specific Hazards Arising from the Chemical\s+(.+)',
+            "ProtectiveEquipment":
+                r'Protective Equipment and Precautions for Firefighters\s+(.+)',
+        }, text)
+
+    def parse_accidental_release_measures(self, text):
+        """Parse personal/environmental precautions and clean-up methods."""
+        self._parse_fields("Accidental release measures", {
+            "PersonalPrecautions": r'Personal Precautions\s+(.+)',
+            "EnvironmentalPrecautions": r'Environmental Precautions\s+(.+)',
+            "MethodsForContainmentAndCleanUp":
+                r'Methods for Containment and Clean Up\s+(.+)',
+        }, text)
+
+    def parse_handling_and_storage(self, text):
+        """Parse the handling and storage instructions."""
+        self._parse_fields("Handling and storage", {
+            "Handling": r'Handling\s+(.+)',
+            "Storage": r'Storage\s+(.+)',
+        }, text)
+
+    def parse_exposure_controls(self, text):
+        """Parse exposure guidelines, engineering measures and PPE."""
+        self._parse_fields("Exposure controls/personal protection", {
+            "ExposureGuidelines": r'Exposure Guidelines\s+(.+)',
+            "EngineeringControls": r'Engineering Measures\s+(.+)',
+            "PersonalProtection": r'Personal Protective Equipment\s+(.+)',
+        }, text)
+
+    def parse_physical_and_chemical_properties(self, text):
+        """Parse physical state, appearance, odor, pH and melting point."""
+        self._parse_fields("Physical and chemical properties", {
+            "PhysicalState": r'Physical State\s+(.+)',
+            "Appearance": r'Appearance\s+(.+)',
+            "Odor": r'Odor\s+(.+)',
+            "pH": r'pH\s+(.+)',
+            "MeltingPoint": r'Melting Point/Range\s+(.+)',
+        }, text)
+
+    def parse_stability_and_reactivity(self, text):
+        """Parse stability, conditions to avoid and incompatible materials."""
+        self._parse_fields("Stability and reactivity", {
+            "Stability": r'Stability\s+(.+)',
+            "ConditionsToAvoid": r'Conditions to Avoid\s+(.+)',
+            "IncompatibleMaterials": r'Incompatible Materials\s+(.+)',
+        }, text)
+
+    def parse_toxicological_information(self, text):
+        """Parse acute toxicity and symptoms."""
+        self._parse_fields("Toxicological information", {
+            "AcuteToxicity": r'Acute Toxicity\s+(.+)',
+            "Symptoms": r'Symptoms\s+(.+)',
+        }, text)
+
+    def parse_ecological_information(self, text):
+        """Parse ecotoxicity, bioaccumulation and mobility."""
+        self._parse_fields("Ecological information", {
+            "Ecotoxicity": r'Ecotoxicity\s+(.+)',
+            "Bioaccumulation": r'Bioaccumulation\s+(.+)',
+            "Mobility": r'Mobility\s+(.+)',
+        }, text)
+
+    def parse_disposal_considerations(self, text):
+        """Parse the waste disposal methods."""
+        self._parse_fields("Disposal considerations", {
+            "WasteDisposal": r'Waste Disposal Methods\s+(.+)',
+        }, text)
+
+    def parse_transport_information(self, text):
+        """Parse UN number, proper shipping name and hazard class."""
+        self._parse_fields("Transport information", {
+            "UNNumber": r'UN-No\s+(.+)',
+            "ProperShippingName": r'Proper Shipping Name\s+(.+)',
+            "HazardClass": r'Hazard Class\s+(.+)',
+        }, text)
+
+    def parse_regulatory_information(self, text):
+        """Parse the U.S. federal regulations entry."""
+        self._parse_fields("Regulatory information", {
+            "USFederalRegulations": r'U\.S\. Federal Regulations\s+(.+)',
+        }, text)
+
+    def parse_other_information(self, text):
+        """Parse the creation and revision dates."""
+        self._parse_fields("Other information", {
+            "CreationDate": r'Creation Date\s+(.+)',
+            "RevisionDate": r'Revision Date\s+(.+)',
+        }, text)
+
+    def process_sds(self):
+        """Extract the text, parse every section and tables, then save JSON.
+
+        Nothing is written when no text could be extracted.
+        """
+        try:
+            text = self.extract_pdf_text()
+            if not text:
+                print("No text extracted from the PDF.")
+                return
+
+            section_parsers = (
+                self.parse_identification_section,
+                self.parse_hazard_identification_section,
+                self.parse_composition_section,
+                self.parse_first_aid_measures,
+                self.parse_fire_fighting_measures,
+                self.parse_accidental_release_measures,
+                self.parse_handling_and_storage,
+                self.parse_exposure_controls,
+                self.parse_physical_and_chemical_properties,
+                self.parse_stability_and_reactivity,
+                self.parse_toxicological_information,
+                self.parse_ecological_information,
+                self.parse_disposal_considerations,
+                self.parse_transport_information,
+                self.parse_regulatory_information,
+                self.parse_other_information,
+            )
+            for parse_section in section_parsers:
+                parse_section(text)
+            self.extract_tables()
+
+            # Save the structured data as JSON
+            with open(self.output_json_path, 'w', encoding='utf-8') as json_file:
+                json.dump(self.sds_data, json_file, indent=4)
+
+            print(f"SDS data successfully saved to {self.output_json_path}")
+        except Exception as e:
+            print(f"Error during SDS processing: {e}")
+
+
+# Example Usage:
+if __name__ == "__main__":
+    pdf_file = "data/acetone-acs-l.pdf"  # Path to your PDF file
+    output_json = "output_sds.json"  # Path to save the output JSON
+
+    parser = SafetyDataSheetParser(pdf_file, output_json)
+    parser.process_sds()
diff --git a/output_sds.json b/output_sds.json
new file mode 100644
index 00000000..f49b84ae
--- /dev/null
+++ b/output_sds.json
@@ -0,0 +1,38 @@
+{
+    "SafetyDataSheet": {
+        "Identification": {
+            "ProductName": "Acetone",
+            "Cat No.": "A9-4",
+            "CASNo": "67-64-1",
+            "RecommendedUse": "Laboratory chemicals.",
+            "Supplier": {
+                "Name": "2. Hazard(s) identification",
+                "Address": "Classification Company ",
+                "Telephone": "Fisher Scientific Company"
+            }
+        },
+        "HazardIdentification": {
+            "SignalWord": "Danger",
+            "HazardStatements": [
+                "Highly flammable liquid and vapor"
+            ],
+            "PrecautionaryStatements": [
+                "Prevention"
+            ]
+        },
+        "Composition/Information on Ingredients": {},
+        "First-aid measures": {},
+        "Fire-fighting measures": {},
+        "Accidental release measures": {},
+        "Handling and storage": {},
+        "Exposure controls/personal protection": {},
+        "Physical and chemical properties": {},
+        "Stability and reactivity": {},
+        "Toxicological information": {},
+        "Ecological information": {},
+        "Disposal considerations": {},
+        "Transport information": {},
+        "Regulatory information": {},
+        "Other information": {}
+    }
+}
\ No newline at end of file
diff --git a/rag.ipynb b/rag.ipynb
new file mode 100644
index 00000000..ee506afd
--- /dev/null
+++ b/rag.ipynb
@@ -0,0 +1,278 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_huggingface import HuggingFaceEmbeddings\n",
+    "from langchain_chroma import Chroma\n",
+    "from langchain_community.embeddings.sentence_transformer import (\n",
+    "    SentenceTransformerEmbeddings,\n",
+    ")\n",
+    "from langchain_text_splitters import CharacterTextSplitter\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
+    "from langchain.chains.combine_documents import create_stuff_documents_chain\n",
+    "from langchain.chains import create_retrieval_chain\n",
+    "from langchain_community.chat_models import ChatOllama\n",
+    "from langchain_community.document_loaders import PyPDFLoader\n",
+    "from PIL import Image\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#trying different llm\n",
+    "llme = ChatOllama(model=\"mistral\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import easyocr\n",
+    "\n",
+    "# Initialize the reader for the desired languages\n",
+    "reader = easyocr.Reader(['en', 'es'] , gpu=True)  # English and Spanish\n",
+    "\n",
+    "# Read text from an image\n",
+    "text = reader.readtext('ooga/acetone-acs-l-2.png', detail=0)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "File path OOgaBoo.pdf is not a valid file or url",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[4], line 2\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;66;03m#text splitter and stuff\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m loader \u001b[38;5;241m=\u001b[39m \u001b[43mPyPDFLoader\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mOOgaBoo.pdf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m      3\u001b[0m pages \u001b[38;5;241m=\u001b[39m loader\u001b[38;5;241m.\u001b[39mload_and_split()\n\u001b[0;32m      4\u001b[0m documents \u001b[38;5;241m=\u001b[39m loader\u001b[38;5;241m.\u001b[39mload()\n",
+      "File \u001b[1;32md:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\langchain_community\\document_loaders\\pdf.py:241\u001b[0m, in \u001b[0;36mPyPDFLoader.__init__\u001b[1;34m(self, file_path, password, headers, extract_images, extraction_mode, extraction_kwargs)\u001b[0m\n\u001b[0;32m    237\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m    238\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[0;32m    239\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpypdf package not found, please install it with `pip install pypdf`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    240\u001b[0m     )\n\u001b[1;32m--> 241\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    242\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparser \u001b[38;5;241m=\u001b[39m PyPDFParser(\n\u001b[0;32m    243\u001b[0m     password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m    244\u001b[0m     extract_images\u001b[38;5;241m=\u001b[39mextract_images,\n\u001b[0;32m    245\u001b[0m     extraction_mode\u001b[38;5;241m=\u001b[39mextraction_mode,\n\u001b[0;32m    246\u001b[0m     extraction_kwargs\u001b[38;5;241m=\u001b[39mextraction_kwargs,\n\u001b[0;32m    247\u001b[0m )\n",
+      "File \u001b[1;32md:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\langchain_community\\document_loaders\\pdf.py:117\u001b[0m, in \u001b[0;36mBasePDFLoader.__init__\u001b[1;34m(self, file_path, headers)\u001b[0m\n\u001b[0;32m    115\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(temp_pdf)\n\u001b[0;32m    116\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path):\n\u001b[1;32m--> 117\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile path \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m is not a valid file or url\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path)\n",
+      "\u001b[1;31mValueError\u001b[0m: File path OOgaBoo.pdf is not a valid file or url"
+     ]
+    }
+   ],
+   "source": [
+    "#text splitter and stuff\n",
+    "loader = PyPDFLoader(\"data/acetone-acs-l.pdf\")\n",
+    "pages = loader.load_and_split()\n",
+    "documents = loader.load()\n",
+    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
+    "docs = text_splitter.split_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "d:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from tqdm.autonotebook import tqdm, trange\n",
+      "d:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "ename": "AttributeError",
+     "evalue": "'str' object has no attribute 'page_content'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[6], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;66;03m#embed and store it in a vec db \u001b[39;00m\n\u001b[0;32m      2\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m HuggingFaceEmbeddings(model_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall-MiniLM-L6-v2\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 3\u001b[0m db \u001b[38;5;241m=\u001b[39m \u001b[43mChroma\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_documents\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43membeddings\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m      4\u001b[0m retriever \u001b[38;5;241m=\u001b[39m db\u001b[38;5;241m.\u001b[39mas_retriever()\n",
+      "File \u001b[1;32md:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\langchain_chroma\\vectorstores.py:1126\u001b[0m, in \u001b[0;36mChroma.from_documents\u001b[1;34m(cls, documents, embedding, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)\u001b[0m\n\u001b[0;32m   1092\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[0;32m   1093\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_documents\u001b[39m(\n\u001b[0;32m   1094\u001b[0m     \u001b[38;5;28mcls\u001b[39m: Type[Chroma],\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1103\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m   1104\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Chroma:\n\u001b[0;32m   1105\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Create a Chroma vectorstore from a list of documents.\u001b[39;00m\n\u001b[0;32m   1106\u001b[0m \n\u001b[0;32m   1107\u001b[0m \u001b[38;5;124;03m    If a persist_directory is specified, the collection will be persisted there.\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1124\u001b[0m \u001b[38;5;124;03m        Chroma: Chroma vectorstore.\u001b[39;00m\n\u001b[0;32m   1125\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 1126\u001b[0m     texts \u001b[38;5;241m=\u001b[39m [\u001b[43mdoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpage_content\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[0;32m   1127\u001b[0m     metadatas \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[0;32m   1128\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_texts(\n\u001b[0;32m   1129\u001b[0m         texts\u001b[38;5;241m=\u001b[39mtexts,\n\u001b[0;32m   
1130\u001b[0m         embedding\u001b[38;5;241m=\u001b[39membedding,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1138\u001b[0m         \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m   1139\u001b[0m     )\n",
+      "\u001b[1;31mAttributeError\u001b[0m: 'str' object has no attribute 'page_content'"
+     ]
+    }
+   ],
+   "source": [
+    "#embed and store it in a vec db \n",
+    "embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
+    "db = Chroma.from_documents(docs, embeddings)\n",
+    "retriever = db.as_retriever()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#make a template for the output\n",
+    "prompt = ChatPromptTemplate.from_template(\"\"\"Answer the following question based only on the provided context:\n",
+    "\n",
+    "<context>\n",
+    "{context}\n",
+    "</context>\n",
+    "\n",
+    "Question: {input}\"\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#retrieval \n",
+    "combine_docs_chain = create_stuff_documents_chain(\n",
+    "    llme, prompt\n",
+    ")\n",
+    "retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " {\n",
+      "      \"Name\": \"Acetone\",\n",
+      "      \"Revision Date\": \"13-Oct-2023\",\n",
+      "      \"Method\": \"CC (closed cup)\",\n",
+      "      \"Evaporation Rate\": \"5.6 (Butyl Acetate = 1.0)\",\n",
+      "      \"Flammability\": {\n",
+      "          \"Not applicable\": true\n",
+      "      },\n",
+      "      \"Flammability or explosive limits\": {\n",
+      "          \"Upper\": \"12.8 vol %\",\n",
+      "          \"Lower\": \"2.5 vol %\"\n",
+      "      },\n",
+      "      \"Vapor Pressure\": \"247 mbar  @  20 °C\",\n",
+      "      \"Vapor Density\": \"2.0\",\n",
+      "      \"Specific Gravity\": \"0.790\",\n",
+      "      \"Solubility\": \"Soluble in water\",\n",
+      "      \"Partition coefficient; n-octanol/water\": \"No data available\",\n",
+      "      \"Autoignition Temperature\": {\n",
+      "          \"°C\": \"465\",\n",
+      "          \"°F\": \"869\"\n",
+      "      },\n",
+      "      \"Decomposition Temperature\": \"> 4°C\",\n",
+      "      \"Viscosity\": \"0.32 mPa.s @ 20 °C\",\n",
+      "      \"Molecular Formula\": \"C3 H6 O\",\n",
+      "      \"Molecular Weight\": \"58.08\",\n",
+      "      \"VOC Content(%)\": \"100\",\n",
+      "      \"Refractive index\": \"1.358 - 1.359\",\n",
+      "      \"Stability and reactivity\": {\n",
+      "          \"Reactive Hazard\": \"None known, based on information available\",\n",
+      "          \"Stability\": \"Stable under normal conditions.\",\n",
+      "          \"Conditions to Avoid\": [\n",
+      "              \"Heat\",\n",
+      "              \"flames and sparks.\",\n",
+      "              \"Incompatible products.\"\n",
+      "          ],\n",
+      "          \"Incompatible Materials\": [\n",
+      "              \"Strong oxidizing agents\",\n",
+      "              \"Strong reducing agents\",\n",
+      "              \"Strong bases\",\n",
+      "              \"Peroxides\",\n",
+      "              \"Halogenated compounds\",\n",
+      "              \"Alkali metals\",\n",
+      "              \"Amines\"\n",
+      "          ]\n",
+      "      },\n",
+      "      \"Hazardous Decomposition Products\": [\n",
+      "          \"Carbon monoxide (CO)\",\n",
+      "          \"Carbon dioxide (CO2)\",\n",
+      "          \"Formaldehyde\",\n",
+      "          \"Methanol\"\n",
+      "      ],\n",
+      "      \"Hazardous Polymerization\": \"Hazardous polymerization does not occur.\",\n",
+      "      \"Hazardous Reactions\": \"None under normal processing.\",\n",
+      "      \"Toxicological information\": {\n",
+      "          \"Acute Toxicity\": {\n",
+      "              \"Product Information\": true,\n",
+      "              \"Component Information\": true,\n",
+      "              \"Component LD50 Oral\": \"Not available\",\n",
+      "              \"Dermal LC50\": \"Not available\",\n",
+      "              \"Inhalation\": \"EN 149. Use a NIOSH/MSHA or European Standard EN 149 approved respirator if exposure limits are exceeded or if irritation or other symptoms are experienced.\"\n",
+      "          },\n",
+      "          \"Recommended Filter type\": \"low boiling organic solvent. Type AX. Brown. conforming to EN371.\",\n",
+      "          \"Hygiene Measures\": \"Handle in accordance with good industrial hygiene and safety practice.\",\n",
+      "          \"_Other International Regulations_\": {\n",
+      "              \"Seveso III Directive (2012/18/EC) - Qualifying Quantities for Major Accident Notification\": \"Not applicable\",\n",
+      "              \"Seveso III Directive (2012/18/EC) - Qualifying Quantities for Safety Report Requirements\": \"Not applicable\",\n",
+      "              \"Rotterdam Convention (PIC)\": \"Not applicable\",\n",
+      "              \"Basel Convention (Hazardous Waste)\": \"Not applicable\",\n",
+      "              \"Annex I - Y42\": \"Acetone 67-64-1\"\n",
+      "          }\n",
+      "      },\n",
+      "      \"Other information\": {\n",
+      "          \"Prepared By\": \"Regulatory Affairs\",\n",
+      "          \"Email\": \"EMSDS.RA@thermofisher.com\",\n",
+      "          \"Creation Date\": \"28-Apr-2009\",\n",
+      "          \"Revision Date\": \"13-Oct-2023\",\n",
+      "          \"Print Date\": \"13-Oct-2023\",\n",
+      "          \"Revision Summary\": \"This document has been updated to comply with the US OSHA regulations.\",\n",
+      "          \"_Notes_\": [\n",
+      "              \"The values are rounded off or approximate due to the nature of the data.\"\n",
+      "          ]\n",
+      "      }\n",
+      "  }\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = retrieval_chain.invoke({\"input\": \"Here is some text extracted from a PDF. Analyze it and generate structured JSON where main sections are parent keys and subsections are children. clean the one which \"})\n",
+    "print(response[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..dce0b75d
Binary files /dev/null and b/requirements.txt differ
diff --git a/test.ipynb b/test.ipynb
new file mode 100644
index 00000000..0f8cc853
--- /dev/null
+++ b/test.ipynb
@@ -0,0 +1,173 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: PyPDF2 in d:\\data\\data_preprocessor\\.venv\\lib\\site-packages (3.0.1)Note: you may need to restart the kernel to use updated packages.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "pip install PyPDF2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PDF text extraction successful!\n",
+      "Identification section parsed successfully!\n",
+      "Hazard Identification section parsed successfully!\n",
+      "SDS data successfully saved to output_sds.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "import PyPDF2\n",
+    "import json\n",
+    "import re\n",
+    "\n",
+    "class SafetyDataSheetParser:\n",
+    "    def __init__(self, pdf_path, output_json_path):\n",
+    "        self.pdf_path = pdf_path\n",
+    "        self.output_json_path = output_json_path\n",
+    "        self.sds_data = {\n",
+    "            \"SafetyDataSheet\": {\n",
+    "                \"Identification\": {},\n",
+    "                \"HazardIdentification\": {},\n",
+    "                \"Composition/Information on Ingredients\":{},\n",
+    "                \"First-aid measures\":{},\n",
+    "                \"Fire-fighting measures\":{},\n",
+    "                \"Accidental release measures\":{},\n",
+    "                \"Handling and storage\":{},\n",
+    "                \"Exposure controls/personal protection\":{},\n",
+    "                \"Physical and chemical properties\":{},\n",
+    "                \"Stability and reactivity\":{},\n",
+    "                \"Toxicological information\":{},\n",
+    "                \"Ecological information\":{},\n",
+    "                \"Disposal considerations\":{},\n",
+    "                \"Transport information\":{},\n",
+    "                \"Regulatory information\":{},\n",
+    "                \"Other information\":{}\n",
+    "            }\n",
+    "        }\n",
+    "\n",
+    "    def extract_pdf_text(self):\n",
+    "        try:\n",
+    "            with open(self.pdf_path, 'rb') as file:\n",
+    "                reader = PyPDF2.PdfReader(file)\n",
+    "                text = \"\"\n",
+    "                for page in reader.pages:\n",
+    "                    text += page.extract_text() or \"\"  # Avoid NoneType errors\n",
+    "            if not text:\n",
+    "                raise ValueError(\"PDF text extraction failed or PDF is empty.\")\n",
+    "            print(\"PDF text extraction successful!\")\n",
+    "            return text\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error during PDF extraction: {e}\")\n",
+    "            return \"\"\n",
+    "\n",
+    "    def parse_identification_section(self, text):\n",
+    "        \"\"\"Parses the Identification section.\"\"\"\n",
+    "        try:\n",
+    "            product_name_match = re.search(r'Product Name\\s+(.+)', text)\n",
+    "            cat_no_match = re.search(r'Cat No. :\\s+[A-Z]+\\d+[A-Z]*-\\d+[A-Z\\d\\-]*',text)\n",
+    "            cas_no_match = re.search(r'CAS No\\s+(\\d+-\\d+-\\d+)', text)\n",
+    "            recommended_use_match = re.search(r'Recommended Use\\s+(.+)', text)\n",
+    "            supplier_match = re.search(r'Details of the supplier.+\\n(.+)\\n(.+)\\n(.+)', text)\n",
+    "\n",
+    "            self.sds_data[\"SafetyDataSheet\"][\"Identification\"] = {\n",
+    "                \"ProductName\": product_name_match.group(1) if product_name_match else \"N/A\",\n",
+    "                \"Cat No.\": cat_no_match.group(0) if cat_no_match else \"N/A\",\n",
+    "                \"CASNo\": cas_no_match.group(1) if cas_no_match else \"N/A\",\n",
+    "                \"RecommendedUse\": recommended_use_match.group(1) if recommended_use_match else \"N/A\",\n",
+    "                \"Supplier\": {\n",
+    "                    \"Name\": supplier_match.group(1) if supplier_match else \"N/A\",\n",
+    "                    \"Address\": supplier_match.group(2) if supplier_match else \"N/A\",\n",
+    "                    \"Telephone\": supplier_match.group(3) if supplier_match else \"N/A\"\n",
+    "                }\n",
+    "            }\n",
+    "            print(\"Identification section parsed successfully!\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error during Identification section parsing: {e}\")\n",
+    "\n",
+    "    def parse_hazard_identification_section(self, text):\n",
+    "        \"\"\"Parses the Hazard Identification section.\"\"\"\n",
+    "        try:\n",
+    "            signal_word_match = re.search(r'Signal Word\\s+(.+)', text)\n",
+    "            hazard_statements = re.findall(r'Hazard Statements\\s+(.+)', text)\n",
+    "            precautionary_statements = re.findall(r'Precautionary Statements\\s+(.+)', text)\n",
+    "\n",
+    "            self.sds_data[\"SafetyDataSheet\"][\"HazardIdentification\"] = {\n",
+    "                \"SignalWord\": signal_word_match.group(1) if signal_word_match else \"N/A\",\n",
+    "                \"HazardStatements\": hazard_statements if hazard_statements else [\"N/A\"],\n",
+    "                \"PrecautionaryStatements\": precautionary_statements if precautionary_statements else [\"N/A\"]\n",
+    "            }\n",
+    "            print(\"Hazard Identification section parsed successfully!\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error during Hazard Identification section parsing: {e}\")\n",
+    "\n",
+    "    def process_sds(self):\n",
+    "        \"\"\"Processes the SDS and saves it as a JSON.\"\"\"\n",
+    "        try:\n",
+    "            text = self.extract_pdf_text()\n",
+    "\n",
+    "            if text:\n",
+    "                # Parse relevant sections\n",
+    "                self.parse_identification_section(text)\n",
+    "                self.parse_hazard_identification_section(text)\n",
+    "\n",
+    "                # Save the structured data as JSON\n",
+    "                with open(self.output_json_path, 'w') as json_file:\n",
+    "                    json.dump(self.sds_data, json_file, indent=4)\n",
+    "\n",
+    "                print(f\"SDS data successfully saved to {self.output_json_path}\")\n",
+    "            else:\n",
+    "                print(\"No text extracted from the PDF.\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error during SDS processing: {e}\")\n",
+    "\n",
+    "# Example Usage:\n",
+    "if __name__ == \"__main__\":\n",
+    "    pdf_file = \"data/acetone-acs-l.pdf\"  # Path to your PDF file\n",
+    "    output_json = \"output_sds.json\"  # Path to save the output JSON\n",
+    "\n",
+    "    parser = SafetyDataSheetParser(pdf_file, output_json)\n",
+    "    parser.process_sds()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/test.py b/test.py
new file mode 100644
index 00000000..d4968d62
--- /dev/null
+++ b/test.py
@@ -0,0 +1,44 @@
+import PyPDF2
+import json
+
+# Open and read the PDF file
+pdf_file_path = 'data/acetone-acs-l.pdf'
+with open(pdf_file_path, 'rb') as pdf_file:
+    reader = PyPDF2.PdfReader(pdf_file)
+
+    # Initialize the structure
+    structured_data = {}
+
+    current_section = None
+    current_subsection = None
+
+    # Loop through each page of the PDF
+    for page in reader.pages:
+        text = page.extract_text() or ""  # Avoid NoneType errors
+
+        # Process each line of text
+        for line in text.split('\n'):
+            stripped = line.strip()
+            # Detect main sections
+            if stripped.isdecimal() and int(stripped) in range(1, 20):  # Assuming sections are numbered
+                current_section = stripped
+                structured_data.setdefault(current_section, {})  # keep earlier content if the number repeats
+                current_subsection = None  # a bullet from the previous section must not leak into this one
+            elif stripped.startswith('•'):  # Detect subsections by bullet points or specific patterns
+                current_subsection = stripped
+                if current_section:
+                    structured_data[current_section][current_subsection] = []
+            elif current_section and current_subsection:
+                structured_data[current_section][current_subsection].append(stripped)
+            elif current_section:
+                structured_data[current_section].setdefault('content', []).append(stripped)
+
+# Convert to JSON format
+json_data = json.dumps(structured_data, indent=4)
+
+# Save the JSON data to a file
+output_json_path = 'structured_data.json'
+with open(output_json_path, 'w') as json_file:
+    json_file.write(json_data)
+
+print(f"JSON data has been saved to {output_json_path}")
diff --git a/test2.ipynb b/test2.ipynb
new file mode 100644
index 00000000..e69de29b
diff --git a/vinol.py b/vinol.py
new file mode 100644
index 00000000..f145135d
--- /dev/null
+++ b/vinol.py
@@ -0,0 +1,54 @@
+#file location: data_preprocessor/data/acetone-acs-l (1).pdf
+
+import PyPDF2
+import os
+
+# Data extraction function
+def pdf_to_text(pdf_path):
+    """Extract the text of every page of a PDF and save it as a .txt file.
+
+    The output file takes the PDF's base name with a ".txt" extension and is
+    opened by that bare name, i.e. relative to the current working directory.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+    """
+    stem, _ext = os.path.splitext(os.path.basename(pdf_path))
+    output_txt = f"{stem}.txt"
+
+    chunks = []
+    with open(pdf_path, 'rb') as pdf_file:
+        for page in PyPDF2.PdfReader(pdf_file).pages:
+            page_text = page.extract_text()
+            # Pages that yield no text are skipped
+            if not page_text:
+                continue
+            chunks.append(page_text + "\n")  # newline separates pages
+
+    # Write the collected text in one go
+    with open(output_txt, 'w', encoding='utf-8') as txt_file:
+        txt_file.write(''.join(chunks))
+
+    print(f"PDF converted to text successfully! Text saved as: {output_txt}")
+
+# Usage example: runs at import time and writes "acetone-acs-l (1).txt" to the working directory
+pdf_to_text('data/acetone-acs-l (1).pdf')
+
+#pdf_to_text('data_preprocessor/data/acetone-acs-l (1).pdf') 
+
+#cleaning up
+def exclude_lines_with_keyword(input_file_path):
+    """Write a '<name>_modified<ext>' copy next to the input, skipping lines that contain "Page" or "____"."""
+    base_name = os.path.basename(input_file_path)
+    file_name, file_extension = os.path.splitext(base_name)
+    output_file_path = os.path.join(os.path.dirname(input_file_path), f"{file_name}_modified{file_extension}")
+
+    # pdf_to_text saves its text as UTF-8, so use that encoding here rather than the platform default
+    with open(input_file_path, "r", encoding="utf-8") as input_file, open(output_file_path, "w", encoding="utf-8") as output_file:
+        for line in input_file:
+            if "Page" not in line and "____" not in line:  # keep lines with neither keyword
+                output_file.write(line)  # Write the line to the output file
+
+    print(f"Lines excluding the keywords have been written to '{output_file_path}'.")
+
+exclude_lines_with_keyword("acetone-acs-l (1).txt")
\ No newline at end of file
diff --git a/vinol2.py b/vinol2.py
new file mode 100644
index 00000000..20f658bd
--- /dev/null
+++ b/vinol2.py
@@ -0,0 +1,76 @@
+#file location: data_preprocessor/data/acetone-acs-l (1).pdf
+
+import pdfplumber
+import json
+
+text=""
+with pdfplumber.open("data_preprocessor/data/acetone-acs-l (1).pdf") as pdf:
+    first_page = pdf.pages[0] # read only the first page for testing
+    text = first_page.extract_text()
+    print(text)
+
+# Split the page text into word tokens; the \w+ pattern keeps runs of word characters only
+from nltk.tokenize import RegexpTokenizer
+tokenizer = RegexpTokenizer(r'\w+')
+textArr = tokenizer.tokenize(text)
+print(textArr)
+# sample output: ['SAFETY', 'DATA', 'SHEET', 'Creation', 'Date', '28', 'Apr', '2009', 'Revision', 'Date', '13', 'Oct', '2023', 'Revision', 'Number']
+
+text2 = ' '.join(textArr)
+
+from transformers import BertTokenizer, BertTokenizerFast, BertForTokenClassification
+import torch
+
+# Load tokenizer and model. The fast tokenizer is required: only it offers word_ids() (used below).
+tokenizer = BertTokenizerFast.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
+model = BertForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english', num_labels=9)  # Change num_labels based on your use case
+
+# Your extracted text from the PDF
+text2 = ' '.join(textArr)
+
+# Tokenize the text
+tokens = text2.split()
+input_tokens = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)
+
+# Get predictions
+with torch.no_grad():
+    outputs = model(**input_tokens)
+    logits = outputs.logits
+    predictions = torch.argmax(logits, dim=2)
+
+# Define a mapping of label IDs to label names
+label_map = {
+    0: 'O',      # Outside
+    1: 'B-MISC', # Begin Miscellaneous
+    2: 'I-MISC', # Inside Miscellaneous
+    3: 'B-PER',  # Begin Person
+    4: 'I-PER',  # Inside Person
+    5: 'B-ORG',  # Begin Organization
+    6: 'I-ORG',  # Inside Organization
+    7: 'B-LOC',  # Begin Location
+    8: 'I-LOC',  # Inside Location
+}
+
+# The model predicts one label per word piece (plus [CLS]/[SEP]), not per word, so zipping
+# tokens with predictions pairs words with the wrong labels. Label each word with the
+# prediction of its first word piece; words cut off by truncation get no entry.
+word_labels = {}
+for position, word_index in enumerate(input_tokens.word_ids(batch_index=0)):
+    if word_index is not None and word_index not in word_labels:
+        word_labels[word_index] = label_map[int(predictions[0][position])]
+# Print the tokens and their predicted labels
+for word_index, label in word_labels.items():
+    print(f"Token: {tokens[word_index]} -> Label: {label}")
+
+# Convert to JSON. NOTE(review): only the tokens are serialized, not the predicted labels - confirm this is intended
+json_output = json.dumps(tokens, indent=4)
+# Print JSON output
+print(json_output)
+# Optionally, save the same token list to a JSON file
+with open('token_classification_output.json', 'w') as json_file:
+    json.dump(tokens, json_file, indent=4)
+
+
+
+
+