import MdxLayout from "../../components/mdx-layout";
import {Link} from "@nextui-org/react";

export const metadata = {
  title: 'Building a streaming LLM with Next.js, FastAPI & Docker: the complete stack (part 1)',
  publishDate: '2024-12-19T00:00:00Z',
  textPreview: 'Want to deploy your LLM with real-time streaming responses? Here\'s a complete guide covering backend, frontend, and deployment.',
  author: {
    name: 'Alex Arvanitidis',
    avatarUrl: 'https://d2zoqz4gyxc03g.cloudfront.net/avatars/af2a132a-5997-4b52-936b-35695452035b.jpg',
    description: <Link isExternal href="https://app.jaqpot.org/dashboard/user/alarv" size="sm">
      @alarv
    </Link>,
  }
};

![Deploy LLM](/deploy-llm.svg)

Want to build a modern LLM application with real-time streaming responses? Here's a complete guide covering backend, frontend, and deployment. We'll be using some of the most popular technologies in the modern web development ecosystem: Next.js with the Next UI component library, FastAPI for our backend, and Docker for containerization.

## Preparing your LLM weights

First, let's talk about the model weights. Llama models come in a range of sizes (1B, 3B, 8B, 70B parameters, and more) and variants (base and instruction-tuned). For this tutorial, we'll use the Llama 3.2-1B-Instruct model, which offers a good balance between performance and resource requirements.

Before we can use the model, we need to convert the weights to a format that works with the Hugging Face transformers library. This conversion is necessary because Llama models are originally distributed in their own format, but we want to use them with the transformers ecosystem.

Here's the command to convert the weights:

```bash
python convert_llama_weights_to_hf.py \
  --input_dir ~/.llama/checkpoints/Llama3.2-1B-Instruct/ \
  --model_size 1B \
  --output_dir ./llama \
  --llama_version 3.2
```

Let's break down what's happening:
- `--input_dir`: points to where your original Llama weights are stored
- `--model_size`: specifies the model size (1B in our case)
- `--output_dir`: where the converted weights will be saved
- `--llama_version`: specifies which version of Llama we're using

The conversion process creates a model that's compatible with the transformers library's LlamaForCausalLM architecture. This enables us to use all the powerful features of the transformers library, including:
- Efficient tokenization
- Model parallelism
- Gradient checkpointing
- Different generation strategies

For more details about Llama model usage and optimization tips, check out the [official documentation](https://huggingface.co/docs/transformers/main/en/model_doc/llama#usage-tips).
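
Once the conversion has finished, the `./llama` directory can be loaded like any other Hugging Face checkpoint. Below is a minimal sketch of how you might load it and produce the `model.pkl` and `tokenizer.pkl` files that the backend service in the next section expects; persisting them with joblib is an assumption on my part, and you could just as well load the `./llama` directory directly at service startup.

```python
# Minimal sketch (assumption): load the converted checkpoint with transformers
# and persist it as the model.pkl / tokenizer.pkl files the backend loads.
import joblib
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the converted weights produced by convert_llama_weights_to_hf.py
model = AutoModelForCausalLM.from_pretrained("./llama")
tokenizer = AutoTokenizer.from_pretrained("./llama")

# Save both objects so the service can restore them with joblib.load()
joblib.dump(model, "model.pkl")
joblib.dump(tokenizer, "tokenizer.pkl")
```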

## Setting up the backend

Our backend is built with FastAPI, a modern Python web framework known for its speed and ease of use. Here's our complete model service that handles predictions and streaming:

```python
import asyncio
from queue import Queue
from threading import Thread

import joblib
import pandas as pd
import torch
from fastapi.responses import StreamingResponse
from jaqpot_api_client.models.prediction_request import PredictionRequest

from src.streamer import CustomStreamer


class ModelService:
    def __init__(self):
        self.model = joblib.load('model.pkl')
        self.tokenizer = joblib.load('tokenizer.pkl')

    def infer(self, request: PredictionRequest) -> StreamingResponse:
        # Convert input list to DataFrame
        input_data = pd.DataFrame(request.dataset.input)

        last_index = input_data.index[-1]
        input_row = input_data.iloc[last_index]

        prompt = input_row['prompt']

        return StreamingResponse(self.response_generator(prompt),
                                 media_type='text/event-stream')

    def start_generation(self, query: str, streamer):
        prompt = f"""{query}"""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(device)
        inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        generation_kwargs = dict(inputs, streamer=streamer,
                                 max_new_tokens=4096, temperature=0.1)
        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()

    async def response_generator(self, query: str):
        streamer_queue = Queue()
        streamer = CustomStreamer(streamer_queue, self.tokenizer, True)

        self.start_generation(query, streamer)

        while True:
            value = streamer_queue.get()
            if value is None:
                break
            yield value
            streamer_queue.task_done()
            await asyncio.sleep(0.1)
```

Let's examine the key components of our service:

1. **Model initialization**: We load both the model and tokenizer from saved files. This happens once when the service starts, keeping them in memory for quick access.

2. **Inference endpoint**: The `infer` method handles incoming requests. It expects a prompt and returns a streaming response.

3. **Generation process**: Text generation happens in a separate thread to prevent blocking. We use GPU acceleration when available.

4. **Streaming setup**: We use FastAPI's StreamingResponse to send tokens back to the client as they're generated, providing a real-time experience.
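
One piece the service imports but this post doesn't show is `CustomStreamer` from `src.streamer`. Here's a minimal sketch of what such a streamer could look like, assuming it builds on transformers' `TextStreamer` and simply forwards each decoded chunk to the queue consumed by `response_generator`:

```python
# Hypothetical sketch of src/streamer.py; the real implementation may differ.
from queue import Queue

from transformers import TextStreamer


class CustomStreamer(TextStreamer):
    def __init__(self, queue: Queue, tokenizer, skip_prompt: bool, **decode_kwargs):
        super().__init__(tokenizer, skip_prompt=skip_prompt, **decode_kwargs)
        self.queue = queue

    def on_finalized_text(self, text: str, stream_end: bool = False):
        # Push each decoded chunk onto the queue; a None sentinel tells
        # response_generator that generation is finished.
        self.queue.put(text)
        if stream_end:
            self.queue.put(None)
```

Because `model.generate` runs in its own thread, this queue is the only bridge between the generation loop and the async generator that feeds `StreamingResponse`.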
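
The post focuses on the service class itself; how it gets exposed over HTTP isn't shown. As a rough sketch (the route path, app setup, and use of `PredictionRequest` as a request body model are assumptions, not the actual Jaqpot integration), wiring it into a FastAPI app could look like this:

```python
# Hypothetical wiring of ModelService into a FastAPI app; the actual
# Jaqpot integration may look different.
from fastapi import FastAPI
from jaqpot_api_client.models.prediction_request import PredictionRequest

from src.model_service import ModelService  # assumed module path for the class above

app = FastAPI()
service = ModelService()  # loads model.pkl and tokenizer.pkl once at startup


@app.post("/infer")
def infer(request: PredictionRequest):
    # ModelService.infer returns a StreamingResponse, so tokens are flushed
    # to the client as soon as they are generated.
    return service.infer(request)
```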

## Creating the frontend

Our frontend is built with Next.js and the Next UI component library, providing a modern and responsive interface. Next UI gives us a set of beautiful, accessible components that work great with Next.js 13+.

We'll break down our frontend into two main components:

### Navigation component
```typescript
export default function LLMNavigation({ model }: LLMTabsProps) {
  const params = useParams<{ datasetId: string }>();
  const [selectedKeys, setSelectedKeys] = useState<Set<string>>(
    params.datasetId ? new Set([params.datasetId]) : new Set<string>(),
  );

  // handle chat history and navigation
  return (
    <div className="flex flex-row gap-5">
      <div className="flex max-w-48 flex-col gap-3">
        <Button
          color="primary"
          onPress={async () => {
            router.push(`${pathnameWithoutDatasetId}/new`);
          }}
        >
          start new chat
        </Button>
        <Table
          selectedKeys={selectedKeys}
          onSelectionChange={(keys) => {
            router.push(`${pathnameWithoutDatasetId}/${[...keys][0]}`);
          }}
          selectionMode="single"
        >
          {/* table content */}
        </Table>
      </div>
      <LLMForm model={model} datasetId={params.datasetId} />
    </div>
  );
}
```

The navigation component manages chat history and session navigation. It uses Next UI's Table and Button components for a polished look.

### Chat form component
```typescript
export function LLMForm({ model, datasetId }: LLMFormProps) {
  const [isFormLoading, setIsFormLoading] = useState(false);
  const [currentResponse, setCurrentResponse] = useState<string>();
  const [chatHistory, setChatHistory] = useState<Array<ChatMessage>>([]);

  const createStreamingPrediction = async (
    modelId: string,
    datasetId: string,
    streamingPredictionRequestDto: { prompt: string },
  ) => {
    const apiResponse = await fetch(
      `/api/models/${modelId}/predict/stream/${datasetId}`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'text/event-stream' },
        body: JSON.stringify(streamingPredictionRequestDto),
      },
    );

    if (!apiResponse.body) return;

    const reader = apiResponse.body
      .pipeThrough(new TextDecoderStream())
      .getReader();
    let fullResponse = '';

    while (true) {
      const { value, done } = await reader.read();
      if (done) break;

      if (value) {
        fullResponse += value + ' ';
        setCurrentResponse(fullResponse);
      }
    }

    return fullResponse;
  };

  const handleFormSubmit = async () => {
    // handle form submission and streaming
    try {
      const response = await createStreamingPrediction(
        model.id!.toString(),
        dataset.id!.toString(),
        { prompt },
      );

      setChatHistory((prev) => [
        ...prev,
        { prompt, output: response },
      ]);
    } catch (error) {
      // error handling
    }
  };

  return (
    <Card>
      <CardBody>
        <div ref={chatContainerRef}>
          {/* chat messages */}
          <form>
            <Textarea
              onKeyDown={handleKeyPress}
              endContent={
                <Button
                  isIconOnly
                  onPress={handleFormSubmit}
                  isLoading={isFormLoading}
                >
                  <ArrowUpIcon />
                </Button>
              }
            />
          </form>
        </div>
      </CardBody>
    </Card>
  );
}
```

The chat form component is where the magic happens. It:
- Manages the chat interface with Next UI components (Card, Button, Textarea) for a clean, responsive, accessible UI
- Handles real-time streaming with the Web Streams API
- Maintains chat history and UI state, with state persistence
- Shows loading states for a smooth typing experience
- Auto-scrolls the chat container as new messages arrive
- Supports keyboard shortcuts (Enter to send)
- Handles errors and recovers gracefully

## Deployment

Once your application is ready:
1. Build your Docker image with the model service
2. Set up your Next.js frontend
3. Configure your environment variables
4. Deploy using your preferred hosting solution
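
After deployment, it's worth smoke-testing the streaming endpoint from outside the browser. Here's a small Python sketch using `requests`; the URL, model and dataset IDs, and payload shape are assumptions modelled on the frontend fetch call above:

```python
# Hypothetical smoke test for the streaming endpoint; the URL, IDs, and payload
# shape are assumptions based on the frontend fetch call shown earlier.
import json

import requests

MODEL_ID = "1983"          # placeholder model id
DATASET_ID = "my-dataset"  # placeholder dataset id
url = f"https://your-app.example.com/api/models/{MODEL_ID}/predict/stream/{DATASET_ID}"

payload = {"prompt": "Hello! What can you do?"}

with requests.post(url, data=json.dumps(payload), stream=True, timeout=300) as resp:
    resp.raise_for_status()
    # Print chunks as they arrive instead of waiting for the full response.
    for chunk in resp.iter_content(chunk_size=None):
        if chunk:
            print(chunk.decode("utf-8", errors="ignore"), end="", flush=True)
```

If tokens appear one by one rather than all at once, the streaming path is working end to end.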

In part 2, I'll explain how to deploy this setup on AWS: an EKS cluster with a GPU node group of g4dn.xlarge EC2 instances that is used only by your LLM pod.

P.S. This exact setup is currently running on the Jaqpot platform at `https://app.jaqpot.org/dashboard/models/1983/description`. Check it out to see it in action!

export default function MDXPage({children}) {
  return <MdxLayout metadata={metadata}>{children}</MdxLayout>
}