2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.env
venv/
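This pull request also commits compiled `__pycache__` artifacts (see below); if those are unintentional, the ignore list could be extended, for example:

```
# Possible additions (not part of this PR)
__pycache__/
*.pyc
```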
18 changes: 18 additions & 0 deletions Dockerfile
@@ -0,0 +1,18 @@
# 1. Start with Playwright's ready-to-use image
FROM mcr.microsoft.com/playwright/python:v1.42.0-jammy

# 2. Copy the project files into the image
WORKDIR /app
COPY . .

# 3. Install Python dependencies
RUN pip install -r requirements.txt

# 4. Install Chromium for Playwright (the base image already bundles browsers, so this is mainly a safeguard)
RUN playwright install chromium

# 5. Make Gradio accessible
ENV GRADIO_SERVER_NAME=0.0.0.0

# 6. Launch command (same as running locally)
CMD ["python", "app.py"]
93 changes: 92 additions & 1 deletion README.md
@@ -1 +1,92 @@
# openthanos
# **OpenThanos: AI-Powered Browser Automation**

## **1. Introduction**
OpenThanos is an AI-powered browser automation tool that converts plain English instructions into Playwright commands using OpenAI. It automates web tasks while simulating human-like behavior—including natural typing delays and occasional "mistakes"—to make interactions appear more organic and bypass bot detection. The tool provides step-by-step visual feedback through screenshots and works seamlessly in Docker for easy deployment.
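The human-like behaviour is implemented in `automation/core.py`, which this diff does not include. Purely as an illustration of the idea, typing with per-keystroke delays and an occasional corrected "mistake" might look roughly like the following Playwright sketch (hypothetical, not the project's actual code):

```python
import random
from playwright.sync_api import sync_playwright

def humanlike_type(page, text: str) -> None:
    """Type text with random per-key delays and an occasional corrected typo."""
    for ch in text:
        if random.random() < 0.03:  # rare deliberate "mistake", corrected immediately
            page.keyboard.type("x", delay=random.randint(80, 200))
            page.keyboard.press("Backspace")
        page.keyboard.type(ch, delay=random.randint(50, 180))

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("data:text/html,<input id='q'>")  # throwaway page with a single input
    page.click("#q")
    humanlike_type(page, "cricket")
    print(page.input_value("#q"))  # "cricket"
    browser.close()
```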

## **2. Features**
- **Natural Language Automation**: Execute browser tasks by providing simple text instructions.
- **Step-by-Step Visual Feedback**: Automatically captures screenshots for better understanding.
- **Cross-Platform Compatibility**: Works on Windows, macOS, and Linux.
- **Docker Support**: Runs in a containerized environment for easy deployment.
- **AI Agent Integration**: Compatible with OpenAI Agents SDK for advanced automation.

---

## **3. Setup Instructions**

### **3.1 Prerequisites**
To use OpenThanos, ensure you have the following:
- **[Docker Desktop](https://www.docker.com/products/docker-desktop)** – Required for running OpenThanos in a containerized environment.
- **Python 3.12+** – Required for local development (not needed if using Docker).

---

### **3.2 Quick Installation and Usage**

To install and run OpenThanos using Docker, follow these steps:
```bash
# Clone the repository
git clone <repo url>
cd openthanos

# Build and run using Docker
docker build -t thanos-automator .
docker run -p 7860:7860 -it thanos-automator

```

Once the container is running, access the interface at `http://localhost:7860`.
---

## **4. Running OpenThanos Locally (Without Docker)**

If you prefer running OpenThanos without Docker, follow these steps:
```bash
# Create and activate a virtual environment
python -m venv venv
source venv/bin/activate # Linux/Mac
venv\Scripts\activate # Windows

# Install dependencies
pip install -r requirements.txt
playwright install

# Run the application
python app.py
```
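The commands above rely on the repository's `requirements.txt`, which is referenced but not shown in this diff; judging from the imports in `app.py` and the `automation/` modules, it presumably lists at least the following:

```text
gradio
playwright
python-dotenv
openai
```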

---

## **5. API Integration**

OpenThanos can be integrated into other applications using Python:
```python
from automation.agent_tools import AutomationToolkit

toolkit = AutomationToolkit()
result = toolkit.browse_web("Search for AI news on TechCrunch")
```
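The toolkit in `automation/agent_tools.py` also exposes `get_screenshot`, so screenshots recorded during a run can be pulled back out afterwards (a small sketch based on the code in this pull request):

```python
from automation.agent_tools import AutomationToolkit

toolkit = AutomationToolkit()
log = toolkit.browse_web("Check trending repositories on GitHub")
print(log)

# Screenshots, if the automator recorded any, are retrievable by 0-based step index
img_bytes = toolkit.get_screenshot(0)
if img_bytes is not None:
    with open("step_0.png", "wb") as f:
        f.write(img_bytes)
```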

---

## **6. Example Commands**

You can test the following commands in the OpenThanos interface:
- **"Search for cricket on Google"**
- **"Open Wikipedia and find information about artificial intelligence"**
- **"Check trending repositories on GitHub"**

---

## **7. License**

OpenThanos is available under the **MIT License**. See [LICENSE](LICENSE) for more details.

---

## **8. Important Notes**

OpenThanos calls the OpenAI API to turn instructions into browser actions, so before running it, create a `.env` file in the project root and add your API key:
```ini
OPENAI_API_KEY=your_api_key_here
```
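`app.py` loads this file with `python-dotenv` at startup, so a quick sanity check that the key is actually being picked up looks like this:

```python
import os

from dotenv import load_dotenv

load_dotenv()
print("OPENAI_API_KEY found" if os.getenv("OPENAI_API_KEY") else "OPENAI_API_KEY missing; check your .env")
```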
116 changes: 116 additions & 0 deletions app.py
@@ -0,0 +1,116 @@
import os
import time
from dotenv import load_dotenv
import gradio as gr
from automation.core import BrowserAutomator
from automation.screenshot import ScreenshotManager
from automation.agent_tools import AutomationToolkit

# Load environment
load_dotenv()

# Initialize components
automator = BrowserAutomator(os.getenv("OPENAI_API_KEY"))
screenshot_manager = ScreenshotManager()
toolkit = AutomationToolkit()

def process_prompt(prompt: str) -> tuple:
"""Original workflow handler"""
screenshot_manager.reset()
try:
start_time = time.time()
actions = automator.get_ai_response(prompt)
ai_time = time.time() - start_time

start_time = time.time()
execution_log = automator.execute_actions(
actions,
screenshot_callback=screenshot_manager.capture
)
exec_time = time.time() - start_time

log_output = (
f"AI Processing Time: {ai_time:.2f}s\n"
f"Execution Time: {exec_time:.2f}s\n\n"
f"Execution Log:\n{execution_log}"
)

current_img = screenshot_manager.get_current()
status = (
f"Screenshot {screenshot_manager.current_index + 1} of {len(screenshot_manager.history)}"
if screenshot_manager.history else "No screenshots"
)
return log_output, current_img, status
except Exception as e:
return f"Error occurred: {str(e)}", None, "Error"

def run_agent(prompt: str) -> str:
"""New agent handler"""
try:
return toolkit.browse_web(prompt)
except Exception as e:
return f"Agent error: {str(e)}"

# Define examples
examples = [
["Search cricket on Google"],
["Go to example.com and click on the first link"],
["Open Wikipedia and search for artificial intelligence"],
["Search for latest iPhone models on Amazon"],
["Check trending repositories on GitHub"]
]

# Build interface
with gr.Blocks(css="""
#screenshot-nav { display: flex; justify-content: center; gap: 10px; margin-top: 10px; }
#screenshot-container { border: 1px solid #ccc; padding: 10px; border-radius: 5px; margin-top: 10px; }
#screenshot-status { text-align: center; margin-top: 5px; font-style: italic; }
""") as demo:
gr.Markdown("# 🤖 Advanced Browser Automation with Screenshots")

with gr.Tab("🔧 Manual Mode"):
with gr.Row():
with gr.Column():
prompt_input = gr.Textbox(lines=2, placeholder="Enter what you want to automate...", label="Instruction")
submit_btn = gr.Button("Execute", variant="primary")
log_output = gr.Textbox(label="Execution Log", interactive=False)

with gr.Accordion("Examples", open=False):
gr.Examples(examples=examples, inputs=prompt_input, label="Click any example to load it")

with gr.Column():
with gr.Group(elem_id="screenshot-container"):
screenshot_output = gr.Image(label="Browser Screenshot", interactive=False)
screenshot_status = gr.Textbox(elem_id="screenshot-status", interactive=False, show_label=False)
with gr.Row(elem_id="screenshot-nav"):
prev_btn = gr.Button("Previous", variant="secondary")
next_btn = gr.Button("Next", variant="secondary")

with gr.Tab("🤖 Agent Mode"):
gr.Markdown("## Describe your task in natural language")
agent_input = gr.Textbox(label="Task description", lines=3)
agent_output = gr.Textbox(label="Agent Execution Log", interactive=False)
agent_button = gr.Button("Run Agent", variant="primary")

# Event handlers
submit_btn.click(
fn=process_prompt,
inputs=prompt_input,
outputs=[log_output, screenshot_output, screenshot_status]
)
prev_btn.click(
fn=lambda: screenshot_manager.navigate("prev"),
outputs=[screenshot_output, screenshot_status]
)
next_btn.click(
fn=lambda: screenshot_manager.navigate("next"),
outputs=[screenshot_output, screenshot_status]
)
agent_button.click(
fn=run_agent,
inputs=agent_input,
outputs=agent_output
)

if __name__ == "__main__":
    demo.launch()
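One note on the launch call: the Dockerfile sets `GRADIO_SERVER_NAME=0.0.0.0`, which Gradio picks up from the environment; if you prefer not to rely on the environment variable, the equivalent can be spelled out explicitly (an alternative sketch, not what this diff does):

```python
# Explicit equivalent of relying on GRADIO_SERVER_NAME and the default port
demo.launch(server_name="0.0.0.0", server_port=7860)
```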
Empty file added automation/__init__.py
Binary file added automation/__pycache__/__init__.cpython-312.pyc
Binary file added automation/__pycache__/core.cpython-312.pyc
Binary file added automation/__pycache__/screenshot.cpython-312.pyc
47 changes: 47 additions & 0 deletions automation/agent_integration.py
@@ -0,0 +1,47 @@
import os
import base64
from typing import List, Dict
try:
    from openai.agents import tool
except ImportError:
    # The OpenAI Agents SDK may not be installed; reuse the local fallback
    # @tool shim defined in automation.agent_tools.
    from automation.agent_tools import tool
from automation.core import BrowserAutomator
from automation.screenshot import ScreenshotManager

class AutomationToolkit:
    def __init__(self):
        self.automator = BrowserAutomator(os.getenv("OPENAI_API_KEY"))
        self.screenshot_mgr = ScreenshotManager()

    @tool
    def browse_web(self, instruction: str) -> Dict:
        """Perform web automation tasks with screenshot support.

        Args:
            instruction: Natural language description of the task

        Returns:
            {
                "log": List[str],
                "screenshots": List[dict],
                "summary": str
            }
        """
        actions = self.automator.get_ai_response(instruction)
        result = self.automator.execute_actions(actions)

        # Store screenshots for later retrieval
        for i, img_bytes in enumerate(self.automator.screenshot_history):
            self.screenshot_mgr.store_screenshot(f"step_{i}.png", img_bytes)

        return {
            "log": result["log"],
            "screenshots": [
                {"step": s["step"], "action": s["action"]}
                for s in result["screenshots"]
            ],
            "summary": f"Completed {len(result['screenshots'])} steps"
        }

    @tool
    def get_screenshot(self, step: int) -> str:
        """Retrieve a specific screenshot by step number."""
        img_bytes = self.screenshot_mgr.get_screenshot(step)
        return base64.b64encode(img_bytes).decode('utf-8')
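Since this variant of `get_screenshot` returns a base64 string rather than raw bytes, callers decode it before saving; a brief usage sketch, assuming `ScreenshotManager.get_screenshot` hands back PNG bytes as the stored filenames suggest:

```python
import base64

from automation.agent_integration import AutomationToolkit

toolkit = AutomationToolkit()
toolkit.browse_web("Open Wikipedia and search for artificial intelligence")

encoded = toolkit.get_screenshot(0)  # base64-encoded screenshot of step 0
with open("step_0.png", "wb") as f:
    f.write(base64.b64decode(encoded))
```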
86 changes: 86 additions & 0 deletions automation/agent_tools.py
@@ -0,0 +1,86 @@
import os
import logging
from typing import Optional
from automation.core import BrowserAutomator

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Tool decorator with fallback
try:
    from openai.agents import tool
    logger.info("Using official OpenAI Agents SDK")
except ImportError:
    logger.info("Using local tool implementation - install openai-agents for full functionality")

    def tool(func):
        """Local @tool decorator fallback"""
        func.is_tool = True  # Mark as tool
        func.tool_name = func.__name__  # Add tool name attribute
        func.tool_description = func.__doc__ or ""  # Store description
        return func

class AutomationToolkit:
    def __init__(self):
        """Initialize with your existing BrowserAutomator"""
        self.automator = BrowserAutomator(os.getenv("OPENAI_API_KEY"))
        self._setup_tools()

    def _setup_tools(self):
        """Register all available tools"""
        self.tools = {
            "browse_web": self.browse_web,
            "get_screenshot": self.get_screenshot
        }

    @tool
    def browse_web(self, instruction: str) -> str:
        """
        Perform web automation tasks.

        Args:
            instruction: Natural language description of the task
                Example: "Search for AI news on TechCrunch"

        Returns:
            Execution log with results
        """
        try:
            actions = self.automator.get_ai_response(instruction)
            return self.automator.execute_actions(
                actions,
                headless=False  # Keep visible for Gradio
            )
        except Exception as e:
            logger.error(f"Browser automation failed: {str(e)}")
            return f"Error: {str(e)}"

    @tool
    def get_screenshot(self, step: int) -> Optional[bytes]:
        """
        Retrieve screenshot from automation history.

        Args:
            step: Index of the screenshot to retrieve (0-based)

        Returns:
            Screenshot image bytes or None if not found
        """
        if hasattr(self.automator, 'screenshot_history'):
            try:
                return self.automator.screenshot_history[step]
            except IndexError:
                logger.warning(f"Screenshot step {step} not found")
        return None

    def as_tool_list(self) -> list:
        """Get tools in OpenAI-compatible format"""
        return [
            {
                "name": tool.tool_name,
                "description": tool.tool_description,
                "function": tool
            }
            for tool in self.tools.values()
        ]
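`as_tool_list()` exposes the registered tools as plain dicts that other frameworks can consume; with the local fallback decorator in effect, a quick listing looks like this (small sketch):

```python
from automation.agent_tools import AutomationToolkit

toolkit = AutomationToolkit()
for entry in toolkit.as_tool_list():
    summary = entry["description"].strip().splitlines()[0] if entry["description"].strip() else ""
    print(f"{entry['name']}: {summary}")
# Expected output (roughly):
#   browse_web: Perform web automation tasks.
#   get_screenshot: Retrieve screenshot from automation history.
```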