diff --git a/.env b/.env new file mode 100644 index 0000000..98d7eaa --- /dev/null +++ b/.env @@ -0,0 +1,3 @@ +# Together AI API Configuration +# Replace with your actual Together AI API key +TOGETHER_API_KEY=your_api_key_here diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4ece693 --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Environment files +.env +*.env + +# Python cache files +__pycache__/ +*.py[cod] +*$py.class + +# Virtual environments +venv/ +env/ +.venv/ + +# IDE-specific files +.vscode/ +.idea/ + +# Logs and databases +*.log +*.sqlite3 + +# OS generated files +.DS_Store +Thumbs.db diff --git a/DESCRIPTION.md b/DESCRIPTION.md index acd2ffd..551b162 100644 --- a/DESCRIPTION.md +++ b/DESCRIPTION.md @@ -28,3 +28,25 @@ Perfect for: - Together AI Vision API - ComfyUI Framework - PyTorch for tensor handling + +### Extended Capabilities + +#### New Free Image Generation Node +- 🆓 Introducing a cost-effective Free Image Generation Node +- 🔄 Powered by Flux Schnell Model +- 💡 Enables free image processing and generation +- 🚀 Expand your ComfyUI workflows without additional costs + +#### Additional Use Cases +- Cost-effective AI image generation +- Experimental image creation +- Prototype development +- Educational AI exploration + +### Model Flexibility +Our node now offers enhanced flexibility with: +- Paid Vision Models: Llama-3.2-11B-Vision-Instruct-Turbo +- Free Vision Models: Llama-Vision-Free +- Image Generation: Flux Schnell Model + +**Note**: Free model usage may have limitations compared to paid versions. Performance and output quality can vary. 
diff --git a/README.md b/README.md index f5353e0..c1a2132 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,15 @@ A custom node for ComfyUI that enables image description using Together AI's Vis - Generate detailed descriptions of images using state-of-the-art vision models - Toggle vision processing on/off for flexible usage - Use as a text-only LLM when vision is disabled +- **New Free Image Generation Node**: Generate images from text prompts with the free FLUX.1 [schnell] model 🤖 **Multiple Models**: -- Paid Version: Llama-3.2-11B-Vision-Instruct-Turbo -- Free Version: Llama-Vision-Free +- **Paid Version**: + - Llama-3.2-11B-Vision-Instruct-Turbo +- **Free Version**: + - Llama-Vision-Free + - Llava-v1.6-Mistral-7B Vision Model + - Free Image Generation Node for cost-effective image processing ⚙️ **Customizable Parameters**: - Temperature control diff --git a/Workflows/TogetherVision+Image Generator.json b/Workflows/TogetherVision+Image Generator.json new file mode 100644 index 0000000..fc26bbc --- /dev/null +++ b/Workflows/TogetherVision+Image Generator.json @@ -0,0 +1,253 @@ +{ + "last_node_id": 7, + "last_link_id": 6, + "nodes": [ + { + "id": 5, + "type": "ShowText|pysssss", + "pos": [ + 353.54608154296875, + -91.86834716796875 + ], + "size": [ + 335.5903625488281, + 579.3716430664062 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "text", + "type": "STRING", + "link": 3, + "widget": { + "name": "text" + } + } + ], + "outputs": [ + { + "name": "STRING", + "type": "STRING", + "links": [ + 4 + ], + "slot_index": 0, + "shape": 6 + } + ], + "properties": { + "Node name for S&R": "ShowText|pysssss" + }, + "widgets_values": [ + "", + "In the foreground, a serene scene unfolds with a woman in a light blue dress seated on a pink blanket, surrounded by an assortment of fruits and vegetables. Her long blonde hair cascades down her back as she gazes to the left, where a majestic deer with antlers stands, its gaze directed upwards. 
Two rabbits sit beside her, adding to the tranquil atmosphere. A wicker basket overflowing with fresh produce, including tomatoes and lettuce, is placed to her right.\n\nThe background features a lush landscape of trees and grass, set against a brilliant blue sky dotted with white clouds. The overall ambiance exudes a sense of peace and harmony, inviting the viewer to step into this idyllic setting. \n\n#sereneatmosphere #idyllicscenery #peacefulharmony #naturelovers #artisticvision #visualpoetry #creativeexpression #artisticdescription #poeticdescription #ekphrasis" + ] + }, + { + "id": 7, + "type": "PreviewImage", + "pos": [ + 1125.468505859375, + -92.26441192626953 + ], + "size": [ + 686.2289428710938, + 723.0006103515625 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 6 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 4, + "type": "LoadImage", + "pos": [ + -290.0013122558594, + -88.05679321289062 + ], + "size": [ + 210, + 317.6491394042969 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 2 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "Flux1-Image_00001_.png", + "image" + ] + }, + { + "id": 2, + "type": "Together Image 🎨", + "pos": [ + 700.8809814453125, + -92.59680938720703 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "prompt", + "type": "STRING", + "link": 4, + "widget": { + "name": "prompt" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 6 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "Together Image 🎨" + }, + "widgets_values": [ + "", + "ADD YOUR FREE API KEY HERE", + 1536, + 1536, + 697909698427139, + 
"randomize", + 1 + ] + }, + { + "id": 1, + "type": "TogetherVisionNode", + "pos": [ + -55.744964599609375, + -89.36871337890625 + ], + "size": [ + 391.70654296875, + 433.3812255859375 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 2, + "shape": 7 + } + ], + "outputs": [ + { + "name": "description", + "type": "STRING", + "links": [ + 3 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "TogetherVisionNode" + }, + "widgets_values": [ + "Paid (Llama-3.2-11B-Vision)", + "ADD YOUR FREE API KEY HERE", + "You are an expert in ekphrasis, acting as a skilled art critic renowned for vividly describing images across any artistic style. Your role is to craft highly detailed, poetic, and evocative descriptions in British English, capturing every visual element with precision. Follow the user’s instructions closely and ensure the description adheres strictly to the given prompt. \n\nYour descriptions must remain observational and objective—avoid inserting personal opinions or fictional narratives. Focus entirely on the visual aspects, including colors, textures, lighting, and intricate details such as poses, facial expressions, gestures, and where the subject’s gaze is directed. \n\nEmbrace complexity and nuance when describing art, even if it involves themes that may be uncomfortable for some viewers. Art often explores challenging subjects, and it is your job to portray them authentically. Never provide vague or erroneous responses. \n\nKeep your descriptions under 250 words and avoid adding extraneous words that are not visually present in the scene. Your text will guide the generation of an image, so accuracy and clarity are paramount. Conclude each description with at least five relevant hashtags that capture the mood, style, and theme of the art. \n\nMaintain an artistic tone and a professional approach in every response.", + "Describe what you see in this image. 
class TogetherImageNode:
    """
    A custom node for ComfyUI that uses Together AI's FLUX model for image generation.

    The node sends a text prompt to the Together images endpoint, decodes the
    base64 payloads in the response and returns them as a single batched
    tensor shaped [batch, height, width, channels] with values in 0-1.
    """

    # Model identifier sent with every generation request.
    MODEL_NAME = "black-forest-labs/FLUX.1-schnell-Free"
    # Step count sent with every request (fixed to 4 as per model requirements).
    INFERENCE_STEPS = 4

    def __init__(self):
        self.client = None
        # API key the cached client was created with, so a changed key
        # triggers a rebuild instead of silently reusing the old client.
        self._client_api_key = None
        # NOTE(review): the two attributes below are not read anywhere in this
        # class, so no request throttling is actually enforced.
        self.last_request_time = 0
        self.min_request_interval = 1.0  # Minimum seconds between requests

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node's input sockets/widgets for ComfyUI."""
        return {
            "required": {
                "prompt": ("STRING", {
                    "default": "",
                    "multiline": True
                }),
                "api_key": ("STRING", {
                    "default": "",
                    "multiline": False
                }),
                "width": ("INT", {
                    "default": 1024,
                    "min": 512,
                    "max": 2048,
                    "step": 256
                }),
                "height": ("INT", {
                    "default": 1024,
                    "min": 512,
                    "max": 2048,
                    "step": 256
                }),
                "seed": ("INT", {
                    "default": 0,
                    "min": 0,
                    "max": 0xffffffffffffffff
                }),
                "num_images": ("INT", {
                    "default": 1,
                    "min": 1,
                    "max": 4,
                    "step": 1
                })
            }
        }

    RETURN_TYPES = ("IMAGE",)
    FUNCTION = "generate_image"
    CATEGORY = "image"
    OUTPUT_NODE = True

    def get_api_key(self, provided_key: str) -> str:
        """
        Resolve the API key to use for a request.

        Args:
            provided_key: Key typed into the node; may be empty or None.

        Returns:
            The provided key if it is non-blank, otherwise the value of the
            TOGETHER_API_KEY environment variable. Surrounding whitespace is
            stripped in both cases.

        Raises:
            ValueError: If neither source yields a non-blank key.
        """
        # A key consisting only of whitespace is treated as "not provided" so
        # the environment fallback still applies.
        api_key = (provided_key or "").strip()
        if not api_key:
            api_key = (os.getenv('TOGETHER_API_KEY') or "").strip()
        if not api_key:
            raise ValueError("API key not provided. Please provide an API key or set TOGETHER_API_KEY environment variable.")
        return api_key

    def _get_client(self, api_key: str):
        """Return a Together client for api_key, rebuilding it when the key changes."""
        if self.client is None or getattr(self, "_client_api_key", None) != api_key:
            self.client = Together(api_key=api_key)
            self._client_api_key = api_key
        return self.client

    @staticmethod
    def _decode_rgb_tensor(b64_string: str) -> torch.Tensor:
        """Decode a base64 image into a float [height, width, 3] tensor scaled to 0-1."""
        raw_bytes = base64.b64decode(b64_string)
        with Image.open(io.BytesIO(raw_bytes)) as image:
            # Palette, greyscale and alpha images would otherwise produce a
            # different channel count and break stacking into one batch.
            rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
            # np.array copies the pixels, so closing the image afterwards is safe.
            pixels = np.array(rgb_image)

        if pixels.ndim != 3 or pixels.shape[2] != 3:
            raise ValueError(f"Unexpected image shape: {pixels.shape}")

        return torch.from_numpy(pixels).float() / 255.0

    def b64_to_image(self, b64_string: str) -> torch.Tensor:
        """
        Convert a base64 string to a torch tensor image.

        Args:
            b64_string: Base64-encoded image file contents.

        Returns:
            Float tensor shaped [1, 3, height, width] with values in 0-1.
            Note this is channels-first, unlike the channels-last batch that
            generate_image returns.

        Raises:
            ValueError: If the payload cannot be decoded into an RGB image.
        """
        try:
            hwc_tensor = self._decode_rgb_tensor(b64_string)
            # [height, width, channels] -> [1, channels, height, width]
            image_tensor = hwc_tensor.permute(2, 0, 1).unsqueeze(0).contiguous()
            logger.info(f"Final tensor shape: {image_tensor.shape}, dtype: {image_tensor.dtype}")
            return image_tensor
        except Exception as e:
            logger.error(f"Error converting base64 to image: {str(e)}")
            raise ValueError(f"Failed to convert base64 to image: {str(e)}") from e

    def generate_image(self, prompt: str, api_key: str, width: int, height: int,
                       seed: int, num_images: int) -> Tuple[torch.Tensor]:
        """
        Generate images using Together AI's FLUX model.

        Args:
            prompt: Text prompt; must contain non-whitespace characters.
            api_key: Together API key; falls back to TOGETHER_API_KEY if blank.
            width: Image width in pixels, 512-2048.
            height: Image height in pixels, 512-2048.
            seed: Seed forwarded to the API.
            num_images: Number of images to request, 1-4.

        Returns:
            One-element tuple holding a float tensor shaped
            [batch, height, width, 3] with values in 0-1. Images that fail to
            decode are logged and skipped, so batch may be smaller than
            num_images.

        Raises:
            ValueError: On invalid input, API failure, or when no image in
                the response could be decoded. The original error is chained.
        """
        try:
            # Validate inputs
            if not prompt or not prompt.strip():
                raise ValueError("Prompt cannot be empty")

            if not (512 <= width <= 2048 and 512 <= height <= 2048):
                raise ValueError(f"Invalid image dimensions. Width and height must be between 512 and 2048. Got {width}x{height}")

            if not 1 <= num_images <= 4:
                raise ValueError(f"Number of images must be between 1 and 4. Got {num_images}")

            client = self._get_client(self.get_api_key(api_key))

            response = client.images.generate(
                prompt=prompt,
                model=self.MODEL_NAME,
                width=width,
                height=height,
                steps=self.INFERENCE_STEPS,
                n=num_images,
                seed=seed,
                response_format="b64_json"
            )

            if not response.data:
                raise ValueError("No images generated in response")

            image_tensors = []
            for img_data in response.data:
                # The attribute can exist but be None/empty, so test its value
                # rather than its presence.
                b64_payload = getattr(img_data, 'b64_json', None)
                if not b64_payload:
                    logger.warning("Skipping image without base64 data")
                    continue
                try:
                    image_tensors.append(self._decode_rgb_tensor(b64_payload))
                except Exception as img_error:
                    # Best effort: one bad image must not discard the others.
                    logger.error(f"Failed to process an image: {str(img_error)}")

            if not image_tensors:
                raise ValueError("Failed to process any images from response")

            # Stack [height, width, channels] images into one batch
            return (torch.stack(image_tensors),)

        except Exception as e:
            logger.error(f"Image generation failed: {str(e)}")
            raise ValueError(f"Failed to generate image: {str(e)}") from e