"""Parse a Safety Data Sheet (SDS) PDF into a structured JSON document.

Text is extracted with PyPDF2 first; if that yields nothing usable the pages
are rasterised and read with EasyOCR.  Each SDS section is then filled in by
label-based regular expressions, tables are pulled out with Camelot, and the
result is written to a JSON file.
"""

import json
import re

# PyPDF2, pdf2image, easyocr, numpy and camelot are imported inside the
# methods that use them.  Every one of those call sites already sits in a
# best-effort ``try`` block, so a missing optional dependency is reported the
# same way as any other extraction failure instead of breaking the import of
# this module.

_ROOT = "SafetyDataSheet"
_NOT_FOUND = "N/A"


def _first_group(pattern, text, group=1):
    """Return *group* of the first match of *pattern* in *text*, or "N/A"."""
    match = re.search(pattern, text)
    return match.group(group) if match else _NOT_FOUND


class SafetyDataSheetParser:
    """Extract the sections of one SDS PDF into ``self.sds_data``.

    The class was previously defined twice in this module; the second
    definition replaced the first, which silently discarded the OCR fallback.
    Both definitions are merged here.
    """

    def __init__(self, pdf_path, output_json_path):
        """Remember the input/output paths and create the empty result tree.

        Args:
            pdf_path: Path of the SDS PDF to read.
            output_json_path: Path the JSON result is written to.
        """
        self.pdf_path = pdf_path
        self.output_json_path = output_json_path
        # EasyOCR reader; created on first use because OCR is only a fallback.
        self.reader = None

        self.sds_data = {
            _ROOT: {
                "Identification": {},
                "HazardIdentification": {},
                "Composition/Information on Ingredients": {},
                "First-aid measures": {},
                "Fire-fighting measures": {},
                "Accidental release measures": {},
                "Handling and storage": {},
                "Exposure controls/personal protection": {},
                "Physical and chemical properties": {},
                "Stability and reactivity": {},
                "Toxicological information": {},
                "Ecological information": {},
                "Disposal considerations": {},
                "Transport information": {},
                "Regulatory information": {},
                "Other information": {},
                "Tables": [],
            }
        }

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------

    def extract_pdf_text(self):
        """Return the text of the PDF, falling back to OCR when needed.

        Returns:
            The extracted text, or "" when both PyPDF2 and OCR fail.
        """
        try:
            import PyPDF2

            with open(self.pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # extract_text() may return None for image-only pages.
                text = "".join(page.extract_text() or "" for page in reader.pages)
            if text.strip():
                print("PDF text extraction successful with PyPDF2!")
                return text
            # Whitespace-only output counts as a failure so OCR gets a chance.
            raise ValueError("PyPDF2 extraction failed. Attempting OCR...")
        except Exception as e:
            print(f"Error during PDF extraction with PyPDF2: {e}")
            return self.extract_text_with_ocr()

    def _get_ocr_reader(self):
        """Create the EasyOCR reader on first use and return it."""
        if self.reader is None:
            import easyocr

            self.reader = easyocr.Reader(['en'], gpu=False)
        return self.reader

    def extract_text_with_ocr(self):
        """Rasterise every PDF page and return the text EasyOCR reads from it.

        Returns:
            The recognised text with one line per detected fragment, or ""
            when conversion or recognition fails.
        """
        try:
            print("Extracting text using EasyOCR...")
            from pdf2image import convert_from_path
            import numpy as np

            # Convert first: a missing/unreadable PDF then fails before the
            # (slow) OCR model is loaded.
            images = convert_from_path(self.pdf_path)
            reader = self._get_ocr_reader()
            # readtext() is documented to take a path, bytes or a numpy array,
            # so the PIL page images are converted explicitly.
            pages = [
                "\n".join(reader.readtext(np.asarray(img), detail=0))
                for img in images
            ]
            # Separate pages with a newline so the last line of one page is
            # not fused with the first line of the next.
            text = "\n".join(pages)
            if text.strip():
                print("Text extraction with EasyOCR successful!")
                return text
            raise ValueError("OCR text extraction failed.")
        except Exception as e:
            print(f"Error during OCR extraction: {e}")
            return ""

    def extract_tables(self):
        """Store every table Camelot finds in the PDF under "Tables".

        On failure the error is printed and "Tables" is left unchanged.
        """
        try:
            print("Extracting tables using Camelot...")
            import camelot

            tables = camelot.read_pdf(self.pdf_path, pages='all', strip_text='\n')
            self.sds_data[_ROOT]["Tables"] = [table.df.to_dict() for table in tables]
            print(f"{len(tables)} table(s) extracted successfully!")
        except Exception as e:
            print(f"Error during table extraction: {e}")

    # ------------------------------------------------------------------
    # Section parsing helpers
    # ------------------------------------------------------------------

    def _store_section(self, key, label, build):
        """Run *build* and store its result under *key*, reporting the outcome.

        Parsing is best-effort: on any error the message is printed and the
        section keeps its previous value.
        """
        try:
            self.sds_data[_ROOT][key] = build()
            print(f"{label} section parsed successfully!")
        except Exception as e:
            print(f"Error during {label} section parsing: {e}")

    def _parse_fields(self, key, label, patterns, text):
        """Fill section *key* with the first capture of each pattern in *patterns*."""
        self._store_section(
            key,
            label,
            lambda: {field: _first_group(pattern, text) for field, pattern in patterns.items()},
        )

    # NOTE(review): all patterns below match a label anywhere in the text and
    # capture the rest of that line, so a label that is also part of a heading
    # (e.g. "Handling" in "Handling and storage") can capture the heading's
    # remainder instead of the field value.

    # ------------------------------------------------------------------
    # Section parsers (one per SDS section; each takes the full text)
    # ------------------------------------------------------------------

    def parse_identification_section(self, text):
        """Parse product name, catalogue/CAS numbers, use and supplier."""
        def build():
            # NOTE(review): takes the three lines after the "Details of the
            # supplier" line as name/address/telephone; this depends on the
            # line order PyPDF2 produces - verify against real output.
            supplier = re.search(r'Details of the supplier.+\n(.+)\n(.+)\n(.+)', text)
            name, address, telephone = supplier.groups() if supplier else (_NOT_FOUND,) * 3
            return {
                "ProductName": _first_group(r'Product Name\s+(.+)', text),
                # The dot is escaped so only the literal "Cat No." matches.
                "Cat No.": _first_group(r'Cat No\. :\s+(.+)', text),
                "CASNo": _first_group(r'CAS No\s+(\d+-\d+-\d+)', text),
                "RecommendedUse": _first_group(r'Recommended Use\s+(.+)', text),
                "Supplier": {
                    "Name": name,
                    "Address": address,
                    "Telephone": telephone,
                },
            }

        self._store_section("Identification", "Identification", build)

    def parse_hazard_identification_section(self, text):
        """Parse the signal word and the hazard/precautionary statement lines."""
        def build():
            hazards = re.findall(r'Hazard Statements\s+(.+)', text)
            precautions = re.findall(r'Precautionary Statements\s+(.+)', text)
            return {
                "SignalWord": _first_group(r'Signal Word\s+(.+)', text),
                "HazardStatements": hazards or [_NOT_FOUND],
                "PrecautionaryStatements": precautions or [_NOT_FOUND],
            }

        self._store_section("HazardIdentification", "Hazard Identification", build)

    def parse_composition_section(self, text):
        """Parse one Component / CAS No / Weight % triple on consecutive lines."""
        def build():
            match = re.search(r'Component\s+(.+)\nCAS No\s+(.+)\nWeight %\s+(.+)', text)
            component, cas_no, weight = match.groups() if match else (_NOT_FOUND,) * 3
            return {
                "Component": component,
                "CASNo": cas_no,
                "WeightPercentage": weight,
            }

        self._store_section("Composition/Information on Ingredients", "Composition", build)

    def parse_first_aid_measures(self, text):
        """Parse the first-aid advice lines."""
        self._parse_fields("First-aid measures", "First-aid measures", {
            "GeneralAdvice": r'General Advice\s+(.+)',
            "EyeContact": r'Eye Contact\s+(.+)',
            "SkinContact": r'Skin Contact\s+(.+)',
            "Inhalation": r'Inhalation\s+(.+)',
            "Ingestion": r'Ingestion\s+(.+)',
        }, text)

    def parse_fire_fighting_measures(self, text):
        """Parse extinguishing media, specific hazards and firefighter equipment."""
        self._parse_fields("Fire-fighting measures", "Fire-fighting measures", {
            "ExtinguishingMedia": r'Suitable Extinguishing Media\s+(.+)',
            "SpecificHazards": r'Specific Hazards Arising from the Chemical\s+(.+)',
            "ProtectiveEquipment": r'Protective Equipment and Precautions for Firefighters\s+(.+)',
        }, text)

    def parse_accidental_release_measures(self, text):
        """Parse personal/environmental precautions and clean-up methods."""
        self._parse_fields("Accidental release measures", "Accidental release measures", {
            "PersonalPrecautions": r'Personal Precautions\s+(.+)',
            "EnvironmentalPrecautions": r'Environmental Precautions\s+(.+)',
            "MethodsForContainmentAndCleanUp": r'Methods for Containment and Clean Up\s+(.+)',
        }, text)

    def parse_handling_and_storage(self, text):
        """Parse the handling and storage lines."""
        self._parse_fields("Handling and storage", "Handling and storage", {
            "Handling": r'Handling\s+(.+)',
            "Storage": r'Storage\s+(.+)',
        }, text)

    def parse_exposure_controls(self, text):
        """Parse exposure guidelines, engineering measures and PPE."""
        self._parse_fields(
            "Exposure controls/personal protection",
            "Exposure controls/personal protection",
            {
                "ExposureGuidelines": r'Exposure Guidelines\s+(.+)',
                "EngineeringControls": r'Engineering Measures\s+(.+)',
                "PersonalProtection": r'Personal Protective Equipment\s+(.+)',
            },
            text,
        )

    def parse_physical_and_chemical_properties(self, text):
        """Parse physical state, appearance, odor, pH and melting point."""
        self._parse_fields(
            "Physical and chemical properties",
            "Physical and chemical properties",
            {
                "PhysicalState": r'Physical State\s+(.+)',
                "Appearance": r'Appearance\s+(.+)',
                "Odor": r'Odor\s+(.+)',
                "pH": r'pH\s+(.+)',
                "MeltingPoint": r'Melting Point/Range\s+(.+)',
            },
            text,
        )

    def parse_stability_and_reactivity(self, text):
        """Parse stability, conditions to avoid and incompatible materials."""
        self._parse_fields("Stability and reactivity", "Stability and reactivity", {
            "Stability": r'Stability\s+(.+)',
            "ConditionsToAvoid": r'Conditions to Avoid\s+(.+)',
            "IncompatibleMaterials": r'Incompatible Materials\s+(.+)',
        }, text)

    def parse_toxicological_information(self, text):
        """Parse acute toxicity and symptoms."""
        self._parse_fields("Toxicological information", "Toxicological information", {
            "AcuteToxicity": r'Acute Toxicity\s+(.+)',
            "Symptoms": r'Symptoms\s+(.+)',
        }, text)

    def parse_ecological_information(self, text):
        """Parse ecotoxicity, bioaccumulation and mobility."""
        self._parse_fields("Ecological information", "Ecological information", {
            "Ecotoxicity": r'Ecotoxicity\s+(.+)',
            "Bioaccumulation": r'Bioaccumulation\s+(.+)',
            "Mobility": r'Mobility\s+(.+)',
        }, text)

    def parse_disposal_considerations(self, text):
        """Parse the waste disposal methods line."""
        self._parse_fields("Disposal considerations", "Disposal considerations", {
            "WasteDisposal": r'Waste Disposal Methods\s+(.+)',
        }, text)

    def parse_transport_information(self, text):
        """Parse UN number, proper shipping name and hazard class."""
        self._parse_fields("Transport information", "Transport information", {
            "UNNumber": r'UN-No\s+(.+)',
            "ProperShippingName": r'Proper Shipping Name\s+(.+)',
            "HazardClass": r'Hazard Class\s+(.+)',
        }, text)

    def parse_regulatory_information(self, text):
        """Parse the U.S. federal regulations line."""
        self._parse_fields("Regulatory information", "Regulatory information", {
            "USFederalRegulations": r'U\.S\. Federal Regulations\s+(.+)',
        }, text)

    def parse_other_information(self, text):
        """Parse the creation and revision dates."""
        self._parse_fields("Other information", "Other information", {
            "CreationDate": r'Creation Date\s+(.+)',
            "RevisionDate": r'Revision Date\s+(.+)',
        }, text)

    # ------------------------------------------------------------------
    # Driver
    # ------------------------------------------------------------------

    def process_sds(self):
        """Extract text, parse every section and table, and write the JSON file.

        Nothing is written when no text could be extracted.  Errors are
        printed rather than raised.
        """
        try:
            text = self.extract_pdf_text()
            if not text:
                print("No text extracted from the PDF.")
                return

            # Parse all relevant sections.
            section_parsers = (
                self.parse_identification_section,
                self.parse_hazard_identification_section,
                self.parse_composition_section,
                self.parse_first_aid_measures,
                self.parse_fire_fighting_measures,
                self.parse_accidental_release_measures,
                self.parse_handling_and_storage,
                self.parse_exposure_controls,
                self.parse_physical_and_chemical_properties,
                self.parse_stability_and_reactivity,
                self.parse_toxicological_information,
                self.parse_ecological_information,
                self.parse_disposal_considerations,
                self.parse_transport_information,
                self.parse_regulatory_information,
                self.parse_other_information,
            )
            for parse_section in section_parsers:
                parse_section(text)
            self.extract_tables()

            # Save the structured data as JSON.
            with open(self.output_json_path, 'w', encoding='utf-8') as json_file:
                json.dump(self.sds_data, json_file, indent=4)

            print(f"SDS data successfully saved to {self.output_json_path}")
        except Exception as e:
            print(f"Error during SDS processing: {e}")


def main():
    """Example usage: parse the bundled acetone SDS."""
    pdf_file = "data/acetone-acs-l.pdf"  # Path to your PDF file
    output_json = "output_sds.json"  # Path to save the output JSON

    parser = SafetyDataSheetParser(pdf_file, output_json)
    parser.process_sds()


if __name__ == "__main__":
    main()
Hazard(s) identification", + "Address": "Classification Company ", + "Telephone": "Fisher Scientific Company" + } + }, + "HazardIdentification": { + "SignalWord": "Danger", + "HazardStatements": [ + "Highly flammable liquid and vapor" + ], + "PrecautionaryStatements": [ + "Prevention" + ] + }, + "Composition/Information on Ingredients": {}, + "First-aid measures": {}, + "Fire-fighting measures": {}, + "Accidental release measures": {}, + "Handling and storage": {}, + "Exposure controls/personal protection": {}, + "Physical and chemical properties": {}, + "Stability and reactivity": {}, + "Toxicological information": {}, + "Ecological information": {}, + "Disposal considerations": {}, + "Transport information": {}, + "Regulatory information": {}, + "Other information": {} + } +} \ No newline at end of file diff --git a/rag.ipynb b/rag.ipynb new file mode 100644 index 00000000..ee506afd --- /dev/null +++ b/rag.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_huggingface import HuggingFaceEmbeddings\n", + "from langchain_chroma import Chroma\n", + "from langchain_community.embeddings.sentence_transformer import (\n", + " SentenceTransformerEmbeddings,\n", + ")\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain.chains.combine_documents import create_stuff_documents_chain\n", + "from langchain.chains import create_retrieval_chain\n", + "from langchain_community.chat_models import ChatOllama\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "from PIL import Image\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#trying different llm\n", + "llme = ChatOllama(model=\"mistral\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { 
+ "name": "stderr", + "output_type": "stream", + "text": [ + "Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n" + ] + } + ], + "source": [ + "import easyocr\n", + "\n", + "# Initialize the reader for the desired languages\n", + "reader = easyocr.Reader(['en', 'es'] , gpu=True) # English and Spanish\n", + "\n", + "# Read text from an image\n", + "text = reader.readtext('ooga/acetone-acs-l-2.png', detail=0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "File path OOgaBoo.pdf is not a valid file or url", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[4], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#text splitter and stuff\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m loader \u001b[38;5;241m=\u001b[39m \u001b[43mPyPDFLoader\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mOOgaBoo.pdf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3\u001b[0m pages \u001b[38;5;241m=\u001b[39m loader\u001b[38;5;241m.\u001b[39mload_and_split()\n\u001b[0;32m 4\u001b[0m documents \u001b[38;5;241m=\u001b[39m loader\u001b[38;5;241m.\u001b[39mload()\n", + "File \u001b[1;32md:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\langchain_community\\document_loaders\\pdf.py:241\u001b[0m, in \u001b[0;36mPyPDFLoader.__init__\u001b[1;34m(self, file_path, password, headers, extract_images, extraction_mode, extraction_kwargs)\u001b[0m\n\u001b[0;32m 237\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m 238\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[0;32m 239\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpypdf package not found, please install it with `pip install pypdf`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 240\u001b[0m )\n\u001b[1;32m--> 241\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 242\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparser \u001b[38;5;241m=\u001b[39m PyPDFParser(\n\u001b[0;32m 243\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 244\u001b[0m extract_images\u001b[38;5;241m=\u001b[39mextract_images,\n\u001b[0;32m 245\u001b[0m extraction_mode\u001b[38;5;241m=\u001b[39mextraction_mode,\n\u001b[0;32m 246\u001b[0m extraction_kwargs\u001b[38;5;241m=\u001b[39mextraction_kwargs,\n\u001b[0;32m 247\u001b[0m )\n", + "File \u001b[1;32md:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\langchain_community\\document_loaders\\pdf.py:117\u001b[0m, in \u001b[0;36mBasePDFLoader.__init__\u001b[1;34m(self, file_path, headers)\u001b[0m\n\u001b[0;32m 115\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(temp_pdf)\n\u001b[0;32m 116\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path):\n\u001b[1;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile path \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m is not a valid file or url\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path)\n", + "\u001b[1;31mValueError\u001b[0m: File path OOgaBoo.pdf is not a valid file or url" + ] + } + ], + "source": [ + "#text splitter and stuff\n", + "loader = PyPDFLoader(\"data/acetone-acs-l.pdf\")\n", + "pages = loader.load_and_split()\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from tqdm.autonotebook import tqdm, trange\n", + "d:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. 
For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", + " warnings.warn(\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'str' object has no attribute 'page_content'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#embed and store it in a vec db \u001b[39;00m\n\u001b[0;32m 2\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m HuggingFaceEmbeddings(model_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall-MiniLM-L6-v2\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 3\u001b[0m db \u001b[38;5;241m=\u001b[39m \u001b[43mChroma\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_documents\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43membeddings\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m retriever \u001b[38;5;241m=\u001b[39m db\u001b[38;5;241m.\u001b[39mas_retriever()\n", + "File \u001b[1;32md:\\Data\\data_preprocessor\\.venv\\Lib\\site-packages\\langchain_chroma\\vectorstores.py:1126\u001b[0m, in \u001b[0;36mChroma.from_documents\u001b[1;34m(cls, documents, embedding, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)\u001b[0m\n\u001b[0;32m 1092\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[0;32m 1093\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_documents\u001b[39m(\n\u001b[0;32m 1094\u001b[0m \u001b[38;5;28mcls\u001b[39m: Type[Chroma],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1103\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[0;32m 1104\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Chroma:\n\u001b[0;32m 1105\u001b[0m 
\u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Create a Chroma vectorstore from a list of documents.\u001b[39;00m\n\u001b[0;32m 1106\u001b[0m \n\u001b[0;32m 1107\u001b[0m \u001b[38;5;124;03m If a persist_directory is specified, the collection will be persisted there.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1124\u001b[0m \u001b[38;5;124;03m Chroma: Chroma vectorstore.\u001b[39;00m\n\u001b[0;32m 1125\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 1126\u001b[0m texts \u001b[38;5;241m=\u001b[39m [\u001b[43mdoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpage_content\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[0;32m 1127\u001b[0m metadatas \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[0;32m 1128\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_texts(\n\u001b[0;32m 1129\u001b[0m texts\u001b[38;5;241m=\u001b[39mtexts,\n\u001b[0;32m 1130\u001b[0m embedding\u001b[38;5;241m=\u001b[39membedding,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1138\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 1139\u001b[0m )\n", + "\u001b[1;31mAttributeError\u001b[0m: 'str' object has no attribute 'page_content'" + ] + } + ], + "source": [ + "#embed and store it in a vec db \n", + "embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + "db = Chroma.from_documents(docs, embeddings)\n", + "retriever = db.as_retriever()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#make a template for the output\n", + "prompt = ChatPromptTemplate.from_template(\"\"\"Answer the following question based only on the provided context:\n", + "\n", + "\n", + "{context}\n", + "\n", + "\n", + "Question: {input}\"\"\")" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#retrival \n", + "combine_docs_chain = create_stuff_documents_chain(\n", + " llme, prompt\n", + ")\n", + "retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " {\n", + " \"Name\": \"Acetone\",\n", + " \"Revision Date\": \"13-Oct-2023\",\n", + " \"Method\": \"CC (closed cup)\",\n", + " \"Evaporation Rate\": \"5.6 (Butyl Acetate = 1.0)\",\n", + " \"Flammability\": {\n", + " \"Not applicable\": true\n", + " },\n", + " \"Flammability or explosive limits\": {\n", + " \"Upper\": \"12.8 vol %\",\n", + " \"Lower\": \"2.5 vol %\"\n", + " },\n", + " \"Vapor Pressure\": \"247 mbar @ 20 °C\",\n", + " \"Vapor Density\": \"2.0\",\n", + " \"Specific Gravity\": \"0.790\",\n", + " \"Solubility\": \"Soluble in water\",\n", + " \"Partition coefficient; n-octanol/water\": \"No data available\",\n", + " \"Autoignition Temperature\": {\n", + " \"°C\": \"465\",\n", + " \"°F\": \"869\"\n", + " },\n", + " \"Decomposition Temperature\": \"> 4°C\",\n", + " \"Viscosity\": \"0.32 mPa.s @ 20 °C\",\n", + " \"Molecular Formula\": \"C3 H6 O\",\n", + " \"Molecular Weight\": \"58.08\",\n", + " \"VOC Content(%)\": \"100\",\n", + " \"Refractive index\": \"1.358 - 1.359\",\n", + " \"Stability and reactivity\": {\n", + " \"Reactive Hazard\": \"None known, based on information available\",\n", + " \"Stability\": \"Stable under normal conditions.\",\n", + " \"Conditions to Avoid\": [\n", + " \"Heat\",\n", + " \"flames and sparks.\",\n", + " \"Incompatible products.\"\n", + " ],\n", + " \"Incompatible Materials\": [\n", + " \"Strong oxidizing agents\",\n", + " \"Strong reducing agents\",\n", + " \"Strong bases\",\n", + " \"Peroxides\",\n", + " \"Halogenated compounds\",\n", + " \"Alkali metals\",\n", + " \"Amines\"\n", + " ]\n", 
+ " },\n", + " \"Hazardous Decomposition Products\": [\n", + " \"Carbon monoxide (CO)\",\n", + " \"Carbon dioxide (CO2)\",\n", + " \"Formaldehyde\",\n", + " \"Methanol\"\n", + " ],\n", + " \"Hazardous Polymerization\": \"Hazardous polymerization does not occur.\",\n", + " \"Hazardous Reactions\": \"None under normal processing.\",\n", + " \"Toxicological information\": {\n", + " \"Acute Toxicity\": {\n", + " \"Product Information\": true,\n", + " \"Component Information\": true,\n", + " \"Component LD50 Oral\": \"Not available\",\n", + " \"Dermal LC50\": \"Not available\",\n", + " \"Inhalation\": \"EN 149. Use a NIOSH/MSHA or European Standard EN 149 approved respirator if exposure limits are exceeded or if irritation or other symptoms are experienced.\"\n", + " },\n", + " \"Recommended Filter type\": \"low boiling organic solvent. Type AX. Brown. conforming to EN371.\",\n", + " \"Hygiene Measures\": \"Handle in accordance with good industrial hygiene and safety practice.\",\n", + " \"_Other International Regulations_\": {\n", + " \"Seveso III Directive (2012/18/EC) - Qualifying Quantities for Major Accident Notification\": \"Not applicable\",\n", + " \"Seveso III Directive (2012/18/EC) - Qualifying Quantities for Safety Report Requirements\": \"Not applicable\",\n", + " \"Rotterdam Convention (PIC)\": \"Not applicable\",\n", + " \"Basel Convention (Hazardous Waste)\": \"Not applicable\",\n", + " \"Annex I - Y42\": \"Acetone 67-64-1\"\n", + " }\n", + " },\n", + " \"Other information\": {\n", + " \"Prepared By\": \"Regulatory Affairs\",\n", + " \"Email\": \"EMSDS.RA@thermofisher.com\",\n", + " \"Creation Date\": \"28-Apr-2009\",\n", + " \"Revision Date\": \"13-Oct-2023\",\n", + " \"Print Date\": \"13-Oct-2023\",\n", + " \"Revision Summary\": \"This document has been updated to comply with the US OSHA regulations.\",\n", + " \"_Notes_\": [\n", + " \"The values are rounded off or approximate due to the nature of the data.\"\n", + " ]\n", + " }\n", + " }\n" + ] + } + 
class SafetyDataSheetParser:
    """Extract selected Safety Data Sheet (SDS) fields from a PDF into JSON.

    The parser reads the text layer of the PDF at ``pdf_path``, pulls the
    Identification and Hazard Identification fields out of it with regular
    expressions, and writes the resulting ``sds_data`` dictionary to
    ``output_json_path``. The other section keys are created empty and are
    not filled in by this class.
    """

    def __init__(self, pdf_path, output_json_path):
        """Store the input/output paths and build the empty result skeleton.

        Args:
            pdf_path: Path of the PDF file to read.
            output_json_path: Path of the JSON file to write.
        """
        self.pdf_path = pdf_path
        self.output_json_path = output_json_path
        self.sds_data = {
            "SafetyDataSheet": {
                "Identification": {},
                "HazardIdentification": {},
                "Composition/Information on Ingredients": {},
                "First-aid measures": {},
                "Fire-fighting measures": {},
                "Accidental release measures": {},
                "Handling and storage": {},
                "Exposure controls/personal protection": {},
                "Physical and chemical properties": {},
                "Stability and reactivity": {},
                "Toxicological information": {},
                "Ecological information": {},
                "Disposal considerations": {},
                "Transport information": {},
                "Regulatory information": {},
                "Other information": {},
            }
        }

    @staticmethod
    def _group_or_na(match, index=1):
        """Return the stripped capture group *index*, or "N/A" without a match."""
        return match.group(index).strip() if match else "N/A"

    def extract_pdf_text(self):
        """Return the concatenated text of every page, or "" on any failure.

        Errors (missing file, unreadable PDF, no extractable text) are
        reported on stdout and turned into an empty string so that
        ``process_sds`` can skip parsing instead of crashing.
        """
        try:
            with open(self.pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # `or ""` guards against a page yielding no text (None).
                text = "".join(page.extract_text() or "" for page in reader.pages)
            # Whitespace-only output carries no parsable content either.
            if not text.strip():
                raise ValueError("PDF text extraction failed or PDF is empty.")
            print("PDF text extraction successful!")
            return text
        except Exception as e:
            print(f"Error during PDF extraction: {e}")
            return ""

    def parse_identification_section(self, text):
        """Fill ``sds_data["SafetyDataSheet"]["Identification"]`` from *text*.

        Each field holds only the value that follows its label; a field whose
        label is not found is set to "N/A". The three lines after the
        "Details of the supplier" heading are stored as the supplier's name,
        address and telephone, in that order.

        Args:
            text: Full text extracted from the SDS.
        """
        try:
            product_name_match = re.search(r'Product Name\s+(.+)', text)
            # The dot is escaped and the number itself is captured so that the
            # stored value does not include the "Cat No. :" label.
            cat_no_match = re.search(
                r'Cat No\.\s*:\s*([A-Z]+\d+[A-Z]*-\d+[A-Z\d\-]*)', text
            )
            cas_no_match = re.search(r'CAS No\s+(\d+-\d+-\d+)', text)
            recommended_use_match = re.search(r'Recommended Use\s+(.+)', text)
            supplier_match = re.search(
                r'Details of the supplier.+\n(.+)\n(.+)\n(.+)', text
            )

            value = self._group_or_na
            self.sds_data["SafetyDataSheet"]["Identification"] = {
                "ProductName": value(product_name_match),
                "Cat No.": value(cat_no_match),
                "CASNo": value(cas_no_match),
                "RecommendedUse": value(recommended_use_match),
                "Supplier": {
                    "Name": value(supplier_match, 1),
                    "Address": value(supplier_match, 2),
                    "Telephone": value(supplier_match, 3),
                },
            }
            print("Identification section parsed successfully!")
        except Exception as e:
            print(f"Error during Identification section parsing: {e}")

    def parse_hazard_identification_section(self, text):
        """Fill ``sds_data["SafetyDataSheet"]["HazardIdentification"]`` from *text*.

        The signal word is the first match of its label; hazard and
        precautionary statements collect every match of their labels. Missing
        values become "N/A" (or ``["N/A"]`` for the two lists).

        Args:
            text: Full text extracted from the SDS.
        """
        try:
            signal_word_match = re.search(r'Signal Word\s+(.+)', text)
            hazard_statements = [
                statement.strip()
                for statement in re.findall(r'Hazard Statements\s+(.+)', text)
            ]
            precautionary_statements = [
                statement.strip()
                for statement in re.findall(r'Precautionary Statements\s+(.+)', text)
            ]

            self.sds_data["SafetyDataSheet"]["HazardIdentification"] = {
                "SignalWord": self._group_or_na(signal_word_match),
                "HazardStatements": hazard_statements or ["N/A"],
                "PrecautionaryStatements": precautionary_statements or ["N/A"],
            }
            print("Hazard Identification section parsed successfully!")
        except Exception as e:
            print(f"Error during Hazard Identification section parsing: {e}")

    def process_sds(self):
        """Extract the PDF text, parse the supported sections and save the JSON.

        Nothing is written when no text could be extracted. Any unexpected
        error is reported on stdout rather than raised.
        """
        try:
            text = self.extract_pdf_text()
            if not text:
                print("No text extracted from the PDF.")
                return

            # Parse relevant sections
            self.parse_identification_section(text)
            self.parse_hazard_identification_section(text)

            # Save the structured data as JSON
            with open(self.output_json_path, 'w', encoding='utf-8') as json_file:
                json.dump(self.sds_data, json_file, indent=4)

            print(f"SDS data successfully saved to {self.output_json_path}")
        except Exception as e:
            print(f"Error during SDS processing: {e}")
# Data extraction function
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF into a ``.txt`` file.

    The output file is named after the PDF (extension replaced by ``.txt``)
    and is created in the current working directory, not next to the PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The name of the text file that was written.
    """
    # Base name of the PDF without directory or extension.
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_txt = f"{base_name}.txt"

    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extraction has to happen while the PDF file is still open.
        page_texts = [page.extract_text() for page in reader.pages]

    # Pages without extractable text are skipped; each remaining page is
    # terminated by a newline so pages stay separated.
    text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    with open(output_txt, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

    print(f"PDF converted to text successfully! Text saved as: {output_txt}")
    return output_txt


#cleaning up
def exclude_lines_with_keyword(input_file_path):
    """Copy a text file, dropping lines that contain "Page" or "____".

    The copy is written next to the input as ``<name>_modified<ext>``. Both
    files are handled as UTF-8, matching what ``pdf_to_text`` writes, so the
    result does not depend on the platform's default encoding.

    Args:
        input_file_path: Path of the text file to filter.

    Returns:
        The path of the filtered file that was written.
    """
    base_name = os.path.basename(input_file_path)
    file_name, file_extension = os.path.splitext(base_name)
    output_file_path = os.path.join(
        os.path.dirname(input_file_path), f"{file_name}_modified{file_extension}"
    )

    with open(input_file_path, "r", encoding="utf-8") as input_file, \
            open(output_file_path, "w", encoding="utf-8") as output_file:
        # Keep only the lines that contain neither keyword.
        output_file.writelines(
            line for line in input_file
            if "Page" not in line and "____" not in line
        )

    print(f"Lines excluding the keywords have been written to '{output_file_path}'.")
    return output_file_path


if __name__ == "__main__":
    # Usage example: convert the sample sheet, then filter the text file that
    # the conversion produced ("acetone-acs-l (1).txt").
    text_path = pdf_to_text('data/acetone-acs-l (1).pdf')
    exclude_lines_with_keyword(text_path)
'Revision', 'Number'] + +text2 = ' '.join(textArr) + +from transformers import BertTokenizer, BertForTokenClassification +import torch + +# Load tokenizer and model for token classification +tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english') +model = BertForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english', num_labels=9) # Change num_labels based on your use case + +# Your extracted text from the PDF +text2 = ' '.join(textArr) + +# Tokenize the text +tokens = text2.split() +input_tokens = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True) + +# Get predictions +with torch.no_grad(): + outputs = model(**input_tokens) + logits = outputs.logits + predictions = torch.argmax(logits, dim=2) + +# Define a mapping of label IDs to label names +label_map = { + 0: 'O', # Outside + 1: 'B-MISC', # Begin Miscellaneous + 2: 'I-MISC', # Inside Miscellaneous + 3: 'B-PER', # Begin Person + 4: 'I-PER', # Inside Person + 5: 'B-ORG', # Begin Organization + 6: 'I-ORG', # Inside Organization + 7: 'B-LOC', # Begin Location + 8: 'I-LOC', # Inside Location +} + +# Print the tokens and their predicted labels +for token, pred in zip(tokens, predictions[0].numpy()): + print(f"Token: {token} -> Label: {label_map[pred]}") + +#Token: SAFETY -> Label: O +#Token: DATA -> Label: I-ORG +#Token: SHEET -> Label: I-ORG +#Token: Creation -> Label: I-ORG +#Token: Date -> Label: I-ORG +#Token: 28 -> Label: I-ORG + +# Convert to JSON +json_output = json.dumps(tokens, indent=4) +# Print JSON output +print(json_output) +# Optionally, save to a JSON file +with open('token_classification_output.json', 'w') as json_file: + json.dump(tokens, json_file, indent=4) + + + + +