diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..db3f1971
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,31 @@
+.git/
+.gitignore
+.github/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.so
+*.egg
+*.egg-info/
+dist/
+build/
+*.egg-info/
+.eggs/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.pytest_cache/
+.env
+.venv/
+env/
+venv/
+ENV/
+.idea/
+.vscode/
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 8d77d668..ae03a5a5 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -1,5 +1,4 @@
-name: Upload Python Package to PyPI when a Release is Created
-
+name: Upload Python Package and Docker Image on Release
 on:
   release:
     types: [created]
@@ -25,6 +24,76 @@ jobs:
           pip install setuptools wheel
       - name: Build package
         run: |
-          python setup.py sdist bdist_wheel  # Could also be python -m build
+          python setup.py sdist bdist_wheel
       - name: Publish package distributions to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
+
+  docker-publish:
+    name: Publish Docker image
+    runs-on: ubuntu-latest
+    needs: pypi-publish
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v4
+      
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+      
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+          
+      # Extract metadata for proxy_only image
+      - name: Extract metadata for proxy_only Docker
+        id: meta-proxy
+        uses: docker/metadata-action@v5
+        with:
+          images: ghcr.io/${{ github.repository }}
+          flavor: |
+            suffix=-slim
+          tags: |
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            latest
+      
+      # Build and push proxy image
+      - name: Build and push proxy_only Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: Dockerfile.proxy_only
+          push: true
+          platforms: linux/amd64,linux/arm64
+          tags: ${{ steps.meta-proxy.outputs.tags }}
+          labels: ${{ steps.meta-proxy.outputs.labels }}
+          cache-from: type=gha,scope=proxy
+          cache-to: type=gha,scope=proxy,mode=max
+      
+      - name: Extract metadata for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ghcr.io/${{ github.repository }}
+          tags: |
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            latest
+      
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          platforms: linux/amd64,linux/arm64
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/.gitignore b/.gitignore
index f0f43ab1..8d73ad5f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+# Apache license 2 - modified after the fork to add VS code 
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -164,3 +166,6 @@ cython_debug/
 # Ignore Mac DS_Store files
 .DS_Store
 **/.DS_Store
+
+# VS Code
+.vscode/
diff --git a/Dockerfile b/Dockerfile
index 5b22f473..f0021fc2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,20 @@
 # Build stage
 FROM python:3.12-slim AS builder
 
+# Define build argument with default value
+ARG PORT=8000
+# Make it available as env variable at runtime
+ENV OPTILLM_PORT=$PORT
+
 # Set working directory
 WORKDIR /app
 
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
-  gcc libc6-dev \
+  build-essential \
+  python3-dev \
+  gcc \
+  g++ \
   && rm -rf /var/lib/apt/lists/*
 
 # Copy only the requirements file first to leverage Docker cache
@@ -40,8 +48,8 @@ USER appuser
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
 
-# Expose the port the app runs on
-EXPOSE 8000
+# Use the ARG in EXPOSE
+EXPOSE ${PORT}
 
 # Run the application
 ENTRYPOINT ["python", "optillm.py"]
diff --git a/Dockerfile.proxy_only b/Dockerfile.proxy_only
new file mode 100644
index 00000000..bc4cc90b
--- /dev/null
+++ b/Dockerfile.proxy_only
@@ -0,0 +1,55 @@
+# Build stage
+FROM python:3.12-slim AS builder
+
+# Define build argument with default value
+ARG PORT=8000
+# Make it available as env variable at runtime
+ENV OPTILLM_PORT=$PORT
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+  build-essential \
+  python3-dev \
+  gcc \
+  g++ \
+  && rm -rf /var/lib/apt/lists/*
+
+# Copy only the requirements file first to leverage Docker cache
+COPY requirements_proxy_only.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements_proxy_only.txt
+
+# Final stage
+FROM python:3.12-slim
+
+# Install curl for the healthcheck
+RUN apt-get update && apt-get install -y --no-install-recommends \
+  curl && \
+  apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Copy installed dependencies from builder stage
+COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+# Copy application code
+COPY . .
+
+# Create a non-root user and switch to it
+RUN useradd -m appuser
+USER appuser
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+
+# Use the ARG in EXPOSE
+EXPOSE ${PORT}
+
+# Run the application
+ENTRYPOINT ["python", "optillm.py"]
diff --git a/NOTICE.md b/NOTICE.md
new file mode 100644
index 00000000..6c0d4a0e
--- /dev/null
+++ b/NOTICE.md
@@ -0,0 +1,24 @@
+## NOTICE
+
+This project is a fork of [https://github.com/codelion/optillm/tree/main](https://github.com/codelion/optillm/tree/main)
+
+The [original project](https://github.com/codelion/optillm/tree/main) is licensed under the Apache 2 License as detailed in [the original LICENSE](https://github.com/codelion/optillm/blob/main/LICENSE). The fork was created from a December 2024 fork of the original project.
+
+
+This project, i.e., [CePO](https://github.com/CerebrasResearch/cb_optillm/tree/cepo), is licensed under the Apache License, Version 2.0 as detailed in [this LICENSE](./LICENSE)
+
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
+
+Initial change: 
+Adding Cerebras Planning and Optimization (CePO). CePO is a method to empower Llama with reasoning, via test-time compute. [Learn more in the blog](https://cerebras.ai/blog/cepo)
+
+At a high-level, in CePO, we make m attempts to generate n step-by-step plans, refine the plans, check inconsistencies between them, use the above feedback to generate the final plan and produce the answer. This process is then repeated N times in a classical best of n manner.
+
+Ongoing Apache-licensed contributions:
+* Added the implementation of CePO
+* Integrated CePO with optillm
+See updated files [here](https://github.com/codelion/optillm/compare/main...CerebrasResearch:cb_optillm:cepo)
+
+
+Last updated: 01/14/2025
diff --git a/README.md b/README.md
index 3832c182..e88f16a6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,60 @@
-# optillm
+# Cerebras Planning and Optimization (CePO)
+
+CePO is an inference-time computation method designed to enhance the accuracy of large language models (LLMs) on tasks requiring reasoning and planning, such as solving math or coding problems. It integrates several advanced techniques, including Best of N, Chain of Thought (CoT), Self-Reflection, Self-Improvement, and Prompt Engineering.
+
+If you have any questions or want to contribute, please reach out to us on [cerebras.ai/discord](https://discord.com/channels/1085960591052644463/1325883896964841582)
+
+## Methodology
+
+In CePO, the Best of N technique is applied to `bestofn_n` solution candidates. Each solution is generated through the following four steps:
+
+**Step 1**: Plan Generation
+The model generates a detailed, step-by-step plan to solve the problem, along with its confidence level for each step.
+
+**Step 2**: Initial Solution
+Using the plan from Step 1, the model produces an initial solution.
+
+Steps 1 and 2 are repeated `planning_n` times to generate multiple solution proposals.
+If the model exceeds the token budget during Step 1 or 2, the plan/solution is marked as incomplete, rejected, and regenerated. A maximum of `planning_m` attempts is made to generate `planning_n` valid proposals.
+
+**Step 3**: Plan Refinement
+The model reviews all generated solution proposals and their associated plans, identifying inconsistencies. Based on this analysis, a refined, final step-by-step plan is constructed.
+
+**Step 4**: Final Solution
+The model uses the refined plan from Step 3 to produce the final answer.
+
+## Current Status
+
+This project is a work in progress, and the provided code is in an early experimental stage. While the proposed approach works well across the benchmarks we tested, further improvements can be achieved by task-specific customizations to prompts.
+
+## Results
+
+### Comparison of CePO with default settings and base model
+
+| Method                     | Math-L5 | MMLU-Pro (Math) | GPQA | CRUX | LiveCodeBench (pass@1) | Simple QA |
+| -------------------------: | :-----: | :-------------: | :--: | :--: | :--------------------: | :-------: |
+| Llama 3.1 70B              |  41.6   |      72.9       | 41.7 | 64.2 |          24.5          |    14.7   |
+| Llama 3.3 70B              |  51.0   |      78.6       | 49.1 | 72.6 |          27.1          |    20.9   |
+| Llama 3.1 405B             |  49.8   |      79.2       | 50.7 | 73.0 |          31.8          |    13.5   |
+| CePO (using Llama 3.3 70B) |  69.6   |      84.8       | 55.5 | 80.1 |          31.9          |    22.6   |
+
+### Ablation studies
+
+We conducted ablation studies to evaluate the impact of various hyperparameters in the CePO framework. Our results indicate that the chosen hyperparameter settings strike a good balance between computational cost and accuracy.
+
+Interestingly, the self-critique and quality improvement capabilities of existing off-the-shelf models do not always scale proportionally with increased inference compute. Addressing this limitation remains a key focus, and we plan to explore custom model fine-tuning as a potential solution in the future.
+
+| bestofn_n | planning_n | planning_m | bestofn_rating_type | Math-L5 | MMLU-Pro (Math) | GPQA  | CRUX  | Comments       |
+| :-------: | :--------: | :--------: | :-----------------: | :-----: | :-------------: | :---: | :---: | :------------- |
+|     3     |      3     |      6     |       absolute      |  69.6   |      84.8       | 55.5  | 80.1  | Default config |
+|     3     |      3     |      6     |       pairwise      |  67.7   |      83.5       | 55.6  | 79.8  |                |
+|     3     |      2     |      5     |       absolute      |  67.1   |      85.1       | 55.1  | 79.0  |                |
+|     3     |      5     |      8     |       absolute      |  69.4   |      84.3       | 55.6  | 81.1  |                |
+|     5     |      3     |      6     |       absolute      |  68.7   |      85.4       | 54.8  | 79.9  |                |
+|     7     |      3     |      6     |       absolute      |  69.6   |      82.8       | 54.7  | 78.4  |                |
+|     9     |      3     |      6     |       absolute      |  68.9   |      83.4       | 55.7  | 80.6  |                |
+
+# Implemented with OptiLLM
 
 optillm is an OpenAI API compatible optimizing inference proxy which implements several state-of-the-art techniques that can improve the accuracy and performance of LLMs. The current focus is on implementing techniques that improve reasoning over coding, logical and mathematical queries. It is possible to beat the frontier models using these techniques across diverse tasks by doing additional compute at inference time.
 
@@ -18,6 +74,16 @@ optillm
 2024-10-22 07:45:06,293 - INFO - Starting server with approach: auto
 ```
 
+### Using docker
+
+```bash
+docker pull ghcr.io/codelion/optillm:latest
+docker run -p 8000:8000 ghcr.io/codelion/optillm:latest
+2024-10-22 07:45:05,612 - INFO - Loaded plugin: privacy
+2024-10-22 07:45:06,293 - INFO - Loaded plugin: memory
+2024-10-22 07:45:06,293 - INFO - Starting server with approach: auto
+```
+
 ### Install from source
 
 Clone the repository with `git` and use `pip install` to setup the dependencies.
@@ -196,28 +262,29 @@ response = client.chat.completions.create(
 
 ## Implemented techniques
 
-| Approach                | Slug               | Description                                                                                    |
-| ----------------------- | ------------------ | ---------------------------------------------------------------------------------------------- |
-| CoT with Reflection     | `cot_reflection`   | Implements chain-of-thought reasoning with \<thinking\>, \<reflection> and \<output\> sections |
-| PlanSearch              | `plansearch`       | Implements a search algorithm over candidate plans for solving a problem in natural language   |
-| ReRead                  | `re2`              | Implements rereading to improve reasoning by processing queries twice                          |
-| Self-Consistency        | `self_consistency` | Implements an advanced self-consistency method                                                 |
-| Z3 Solver               | `z3`               | Utilizes the Z3 theorem prover for logical reasoning                                           |
-| R* Algorithm            | `rstar`            | Implements the R* algorithm for problem-solving                                                |
-| LEAP                    | `leap`             | Learns task-specific principles from few shot examples                                         |
-| Round Trip Optimization | `rto`              | Optimizes responses through a round-trip process                                               |
-| Best of N Sampling      | `bon`              | Generates multiple responses and selects the best one                                          |
-| Mixture of Agents       | `moa`              | Combines responses from multiple critiques                                                     |
-| Monte Carlo Tree Search | `mcts`             | Uses MCTS for decision-making in chat responses                                                |
-| PV Game                 | `pvg`              | Applies a prover-verifier game approach at inference time                                      |
-| CoT Decoding            |  N/A for proxy     | Implements chain-of-thought decoding to elicit reasoning without explicit prompting            |
-| Entropy Decoding        |  N/A for proxy     | Implements adaptive sampling based on the uncertainty of tokens during generation              |
+| Approach                             | Slug               | Description                                                                                    |
+| ------------------------------------ | ------------------ | ---------------------------------------------------------------------------------------------- |
+| Cerebras Planning and Optimimization | `cepo`             | Combines Best of N, Chain-of-Thought, Self-Reflection, Self-Improvement, and various prompting techniques |
+| CoT with Reflection                  | `cot_reflection`   | Implements chain-of-thought reasoning with \<thinking\>, \<reflection> and \<output\> sections |
+| PlanSearch                           | `plansearch`       | Implements a search algorithm over candidate plans for solving a problem in natural language   |
+| ReRead                               | `re2`              | Implements rereading to improve reasoning by processing queries twice                          |
+| Self-Consistency                     | `self_consistency` | Implements an advanced self-consistency method                                                 |
+| Z3 Solver                            | `z3`               | Utilizes the Z3 theorem prover for logical reasoning                                           |
+| R* Algorithm                         | `rstar`            | Implements the R* algorithm for problem-solving                                                |
+| LEAP                                 | `leap`             | Learns task-specific principles from few shot examples                                         |
+| Round Trip Optimization              | `rto`              | Optimizes responses through a round-trip process                                               |
+| Best of N Sampling                   | `bon`              | Generates multiple responses and selects the best one                                          |
+| Mixture of Agents                    | `moa`              | Combines responses from multiple critiques                                                     |
+| Monte Carlo Tree Search              | `mcts`             | Uses MCTS for decision-making in chat responses                                                |
+| PV Game                              | `pvg`              | Applies a prover-verifier game approach at inference time                                      |
+| CoT Decoding                         |  N/A for proxy     | Implements chain-of-thought decoding to elicit reasoning without explicit prompting            |
+| Entropy Decoding                     |  N/A for proxy     | Implements adaptive sampling based on the uncertainty of tokens during generation              |
 
 ## Implemented plugins
 
 | Plugin                  | Slug               | Description                                                                                    |
 | ----------------------- | ------------------ | ---------------------------------------------------------------------------------------------- |
-| Router                  | `router`           | Uses the [optillm-bert-uncased](https://huggingface.co/codelion/optillm-bert-uncased) model to route requests to different approaches based on the user prompt |
+| Router                  | `router`           | Uses the [optillm-modernbert-large](https://huggingface.co/codelion/optillm-modernbert-large) model to route requests to different approaches based on the user prompt |
 | Chain-of-Code           | `coc`              | Implements a chain of code approach that combines CoT with code execution and LLM based code simulation |
 | Memory                  | `memory`           | Implements a short term memory layer, enables you to use unbounded context length with any LLM |
 | Privacy                 | `privacy`          | Anonymize PII data in request and deanonymize it back to original value in response            |
@@ -228,22 +295,37 @@ response = client.chat.completions.create(
 
 optillm supports various command-line arguments and environment variables for configuration.
 
-| Parameter                | Description                                                     | Default Value   |
-|--------------------------|-----------------------------------------------------------------|-----------------|
-| `--approach`             | Inference approach to use                                       | `"auto"`        |
-| `--simulations`          | Number of MCTS simulations                                      | 2               |
-| `--exploration`          | Exploration weight for MCTS                                     | 0.2             |
-| `--depth`                | Simulation depth for MCTS                                       | 1               |
-| `--best-of-n`            | Number of samples for best_of_n approach                        | 3               |
-| `--model`                | OpenAI model to use                                             | `"gpt-4o-mini"` |
-| `--base-url`             | Base URL for OpenAI compatible endpoint                         | `""`            |
-| `--rstar-max-depth`      | Maximum depth for rStar algorithm                               | 3               |
-| `--rstar-num-rollouts`   | Number of rollouts for rStar algorithm                          | 5               |
-| `--rstar-c`              | Exploration constant for rStar algorithm                        | 1.4             |
-| `--n`                    | Number of final responses to be returned                        | 1               |
-| `--return-full-response` | Return the full response including the CoT with <thinking> tags | `False`         |
-| `--port`                 | Specify the port to run the proxy                               | 8000            |
-| `--optillm-api-key`      | Optional API key for client authentication to optillm           | `""`            |
+| Parameter                           | Description                                                     | Default Value   |
+|-------------------------------------|-----------------------------------------------------------------|-----------------|
+| `--approach`                        | Inference approach to use                                       | `"auto"`        |
+| `--simulations`                     | Number of MCTS simulations                                      | 2               |
+| `--exploration`                     | Exploration weight for MCTS                                     | 0.2             |
+| `--depth`                           | Simulation depth for MCTS                                       | 1               |
+| `--best-of-n`                       | Number of samples for best_of_n approach                        | 3               |
+| `--model`                           | OpenAI model to use                                             | `"gpt-4o-mini"` |
+| `--base-url`                        | Base URL for OpenAI compatible endpoint                         | `""`            |
+| `--rstar-max-depth`                 | Maximum depth for rStar algorithm                               | 3               |
+| `--rstar-num-rollouts`              | Number of rollouts for rStar algorithm                          | 5               |
+| `--rstar-c`                         | Exploration constant for rStar algorithm                        | 1.4             |
+| `--n`                               | Number of final responses to be returned                        | 1               |
+| `--return-full-response`            | Return the full response including the CoT with <thinking> tags | `False`         |
+| `--port`                            | Specify the port to run the proxy                               | 8000            |
+| `--optillm-api-key`                 | Optional API key for client authentication to optillm           | `""`            |
+| `--cepo_bestofn_n`                  | Number of responses to be generated in best of n stage          | 3               |
+| `--cepo_bestofn_temperature`        | Temperature for verifier in best of n stage                     | 0.1             |
+| `--cepo_bestofn_max_tokens`         | Maximum number of tokens for verifier in best of n stage        | 4096            |
+| `--cepo_bestofn_rating_type`        | Type of rating in best of n stage ("absolute" or "pairwise")    | `"absolute"`    |
+| `--cepo_planning_n`                 | Number of plans generated in planning stage                     | 3               |
+| `--cepo_planning_m`                 | Number of attempts to generate n plans in planning stage        | 6               |
+| `--cepo_planning_temperature_step1` | Temperature for generator in step 1 of planning stage           | 0.55            |
+| `--cepo_planning_temperature_step2` | Temperature for generator in step 2 of planning stage           | 0.25            |
+| `--cepo_planning_temperature_step3` | Temperature for generator in step 3 of planning stage           | 0.1             |
+| `--cepo_planning_temperature_step4` | Temperature for generator in step 4 of planning stage           | 0               |
+| `--cepo_planning_max_tokens_step1`  | Maximum number of tokens in step 1 of planning stage            | 4096            |
+| `--cepo_planning_max_tokens_step2`  | Maximum number of tokens in step 2 of planning stage            | 4096            |
+| `--cepo_planning_max_tokens_step3`  | Maximum number of tokens in step 3 of planning stage            | 4096            |
+| `--cepo_planning_max_tokens_step4`  | Maximum number of tokens in step 4 of planning stage            | 4096            |
+| `--cepo_config_file`                | Path to CePO configuration file                                 | None            |
 
 When using Docker, these can be set as environment variables prefixed with `OPTILLM_`.
 
@@ -354,3 +436,17 @@ called patchflows. We saw huge performance gains across all the supported patchf
 - [Unsupervised Evaluation of Code LLMs with Round-Trip Correctness](https://arxiv.org/abs/2402.08699) - [Inspired the implementation of rto](https://github.com/codelion/optillm/blob/main/optillm/rto.py)
 - [Patched MOA: optimizing inference for diverse software development tasks](https://arxiv.org/abs/2407.18521) - [Implementation](https://github.com/codelion/optillm/blob/main/optillm/moa.py)
 - [Patched RTC: evaluating LLMs for diverse software development tasks](https://arxiv.org/abs/2407.16557) - [Implementation](https://github.com/codelion/optillm/blob/main/optillm/rto.py)
+
+## Citation
+
+If you use this library in your research, please cite:
+
+```bibtex
+@software{optillm,
+  title = {Optillm: Optimizing inference proxy for LLMs},
+  author = {Asankhaya Sharma},
+  year = {2024},
+  publisher = {GitHub},
+  url = {https://github.com/codelion/optillm}
+}
+```
diff --git a/configs/cepo_config.yaml b/configs/cepo_config.yaml
new file mode 100644
index 00000000..3b2f55fd
--- /dev/null
+++ b/configs/cepo_config.yaml
@@ -0,0 +1,14 @@
+bestofn_n: 3
+bestofn_temperature: 0.1
+bestofn_max_tokens: 4096
+bestofn_rating_type: "absolute"  # or "pairwise"
+planning_n: 3
+planning_m: 6
+planning_temperature_step1: 0.55
+planning_temperature_step2: 0.25
+planning_temperature_step3: 0.1
+planning_temperature_step4: 0
+planning_max_tokens_step1: 4096
+planning_max_tokens_step2: 4096
+planning_max_tokens_step3: 4096
+planning_max_tokens_step4: 4096
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 1ecc2090..dd9ef524 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -6,15 +6,18 @@ services:
       dockerfile: Dockerfile
       tags:
         - optillm:latest
+      args:
+        - PORT=${OPTILLM_PORT:-8000}
     image: optillm:latest
     container_name: *name
     hostname: *name
     ports:
-      - "8000:8000"
+      - "${OPTILLM_PORT:-8000}:${OPTILLM_PORT:-8000}"
     env_file:
       - .env
     environment:
       OPENAI_API_KEY: ${OPENAI_API_KEY:-""}
+      OPTILLM_PORT: ${OPTILLM_PORT:-8000}
       # OPTILLM_BASE_URL: ${OPENAI_BASE_URL:-"https://openrouter.ai/api/v1"} # can be set to any OpenAI API compatible endpoint
       # OPTILLM_API_KEY: ${OPTILLM_API_KEY:-} # optionally sets an API key for Optillm clients
       # Uncomment and set values for other arguments (prefixed with OPTILLM_) as needed, e.g.:
@@ -29,11 +32,10 @@ services:
       # OPTILLM_RSTAR_C: 1.4
       # OPTILLM_N: 1
       # OPTILLM_RETURN_FULL_RESPONSE: false
-      # OPTILLM_PORT: 8000
     restart: on-failure
     stop_grace_period: 2s
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://127.0.0.1:8000/health"]
+      test: ["CMD", "curl", "-f", "http://127.0.0.1:${OPTILLM_PORT:-8000}/health"]
       interval: 30s
       timeout: 5s
       retries: 3
diff --git a/optillm.py b/optillm.py
index 25b623a2..5bc2ddc6 100644
--- a/optillm.py
+++ b/optillm.py
@@ -1,8 +1,10 @@
+# Apache license 2 - modified after the fork to add the Cerebras API option and CePO as a test-time compute method
 import argparse
 import logging
 import os
 import secrets
 from flask import Flask, request, jsonify
+from cerebras.cloud.sdk import Cerebras
 from openai import AzureOpenAI, OpenAI
 from flask import Response
 import json
@@ -13,6 +15,7 @@
 from concurrent.futures import ThreadPoolExecutor
 from typing import Tuple, Optional, Union, Dict, Any, List
 from importlib.metadata import version
+from dataclasses import fields
 
 # Import approach modules
 from optillm.mcts import chat_with_mcts
@@ -27,6 +30,7 @@
 from optillm.plansearch import plansearch
 from optillm.leap import leap
 from optillm.reread import re2_approach
+from optillm.cepo import cepo, CepoConfig, init_cepo_config
 
 # Setup logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -50,7 +54,14 @@ def get_config():
         from optillm.inference import create_inference_client
         API_KEY = os.environ.get("OPTILLM_API_KEY")
         default_client = create_inference_client()
-    # OpenAI, Azure, or LiteLLM API configuration
+    # Cerebras, OpenAI, Azure, or LiteLLM API configuration
+    elif os.environ.get("CEREBRAS_API_KEY"):
+        API_KEY = os.environ.get("CEREBRAS_API_KEY")
+        base_url = server_config['base_url']
+        if base_url != "":
+            default_client = Cerebras(api_key=API_KEY, base_url=base_url)
+        else:
+            default_client = Cerebras(api_key=API_KEY)
     elif os.environ.get("OPENAI_API_KEY"):
         API_KEY = os.environ.get("OPENAI_API_KEY")
         base_url = server_config['base_url']
@@ -104,7 +115,7 @@ def get_config():
 
 # List of known approaches
 known_approaches = ["none", "mcts", "bon", "moa", "rto", "z3", "self_consistency", 
-                   "pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2"]
+                   "pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2", "cepo"]
 
 plugin_approaches = {}
 
@@ -124,7 +135,7 @@ def none_approach(
         model: Model identifier
         original_messages: Original messages from the request
         **kwargs: Additional parameters to pass through
-        
+    
     Returns:
         Dict[str, Any]: Full OpenAI API response
     """
@@ -158,7 +169,7 @@ def load_plugins():
    package_plugin_dir = os.path.join(os.path.dirname(optillm.__file__), 'plugins')
    
    # Get local project plugins directory
-   current_dir = os.getcwd()
+   current_dir = os.getcwd() if server_config.get("plugins_dir", "") == "" else server_config["plugins_dir"]
    local_plugin_dir = os.path.join(current_dir, 'optillm', 'plugins')
    
    plugin_dirs = []
@@ -246,11 +257,10 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode
             if hasattr(request, 'json'):
                 data = request.get_json()
                 messages = data.get('messages', [])
-                # Copy all parameters except 'model' and 'messages'
+                # Copy all parameters except 'stream', 'model' , 'n' and 'messages'
                 kwargs = {k: v for k, v in data.items() 
-                         if k not in ['model', 'messages', 'optillm_approach']}
+                         if k not in ['model', 'messages', 'stream', 'n', 'optillm_approach']}
             response = none_approach(original_messages=messages, client=client, model=model, **kwargs)
-            
             # For none approach, we return the response and a token count of 0
             # since the full token count is already in the response
             return response, 0
@@ -283,6 +293,9 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode
             return leap(system_prompt, initial_query, client, model)
         elif approach == 're2':
             return re2_approach(system_prompt, initial_query, client, model, n=server_config['n'])
+        elif approach == 'cepo':
+            logger.debug(f"Calling with {cepo_config}")
+            return cepo(system_prompt, initial_query, client, model, cepo_config)
     elif approach in plugin_approaches:
         return plugin_approaches[approach](system_prompt, initial_query, client, model)
     else:
@@ -369,6 +382,21 @@ def generate_streaming_response(final_response, model):
     # Yield the final message to indicate the stream has ended
     yield "data: [DONE]\n\n"
 
+def extract_contents(response_obj):
+    contents = []
+    # Handle both single response and list of responses
+    responses = response_obj if isinstance(response_obj, list) else [response_obj]
+    
+    for response in responses:
+        # Extract content from first choice if it exists
+        if (response.get('choices') and 
+            len(response['choices']) > 0 and 
+            response['choices'][0].get('message') and 
+            response['choices'][0]['message'].get('content')):
+            contents.append(response['choices'][0]['message']['content'])
+    
+    return contents
+
 def parse_conversation(messages):
     system_prompt = ""
     conversation = []
@@ -400,6 +428,54 @@ def parse_conversation(messages):
     initial_query = "\n".join(conversation)
     return system_prompt, initial_query, optillm_approach
 
+def tagged_conversation_to_messages(response_text):
+    """Convert a tagged conversation string or list of strings into a list of messages.
+    If the input doesn't contain User:/Assistant: tags, return it as is.
+    
+    Args:
+        response_text: Either a string containing "User:" and "Assistant:" tags,
+                      or a list of such strings.
+    
+    Returns:
+        If input has tags: A list of message dictionaries.
+        If input has no tags: The original input.
+    """
+    def has_conversation_tags(text):
+        return "User:" in text or "Assistant:" in text
+    
+    def process_single_response(text):
+        if not has_conversation_tags(text):
+            return text
+            
+        messages = []
+        # Split on "User:" or "Assistant:" while keeping the delimiter
+        parts = re.split(r'(?=(User:|Assistant:))', text.strip())
+        # Remove empty strings
+        parts = [p for p in parts if p.strip()]
+        
+        for part in parts:
+            part = part.strip()
+            if part.startswith('User:'):
+                messages.append({
+                    'role': 'user',
+                    'content': part[5:].strip()
+                })
+            elif part.startswith('Assistant:'):
+                messages.append({
+                    'role': 'assistant',
+                    'content': part[10:].strip()
+                })
+        return messages
+
+    if isinstance(response_text, list):
+        processed = [process_single_response(text) for text in response_text]
+        # If none of the responses had tags, return original list
+        if all(isinstance(p, str) for p in processed):
+            return response_text
+        return processed
+    else:
+        return process_single_response(response_text)
+
 def extract_optillm_approach(content):
     match = re.search(r'<optillm_approach>(.*?)</optillm_approach>', content)
     if match:
@@ -486,8 +562,13 @@ def proxy():
                 result = responses
             else:
                 result, completion_tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
+            
             logger.debug(f'Direct proxy response: {result}')
-            return jsonify(result), 200
+
+            if stream:
+                return Response(generate_streaming_response(extract_contents(result), model), content_type='text/event-stream') 
+            else :
+                return jsonify(result), 200
             
         elif operation == 'AND' or operation == 'OR':
             if contains_none:
@@ -499,7 +580,20 @@ def proxy():
     except Exception as e:
         logger.error(f"Error processing request: {str(e)}")
         return jsonify({"error": str(e)}), 500
-    
+
+    # Convert tagged conversation to messages format if needed
+    if isinstance(response, list):
+        processed_response = tagged_conversation_to_messages(response)
+        # If processed_response is a list of message lists, extract last message content
+        if processed_response != response:  # Only process if format changed
+            response = [msg[-1]['content'] if isinstance(msg, list) and msg else msg 
+                    for msg in processed_response]
+        # Otherwise keep original response
+    else:
+        messages = tagged_conversation_to_messages(response)
+        if isinstance(messages, list) and messages:  # Only process if format changed
+            response = messages[-1]['content']
+
     if stream:
         return Response(generate_streaming_response(response, model), content_type='text/event-stream')
     else:
@@ -559,12 +653,11 @@ def health():
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Run LLM inference with various approaches.")
-    
-     # Add version argument using importlib.metadata
+
     try:
-        package_version = version('optillm')
-    except Exception:
-        package_version = "unknown"  # Fallback if package is not installed
+        from optillm import __version__ as package_version
+    except ImportError:
+        package_version = "unknown"
         
     parser.add_argument('--version', action='version', 
                        version=f'%(prog)s {package_version}',
@@ -584,7 +677,9 @@ def parse_args():
         ("--n", "OPTILLM_N", int, 1, "Number of final responses to be returned"),
         ("--return-full-response", "OPTILLM_RETURN_FULL_RESPONSE", bool, False, "Return the full response including the CoT with <thinking> tags"),
         ("--port", "OPTILLM_PORT", int, 8000, "Specify the port to run the proxy"),
-        ("--log", "OPTILLM_LOG", str, "info", "Specify the logging level", list(logging_levels.keys()))
+        ("--log", "OPTILLM_LOG", str, "info", "Specify the logging level", list(logging_levels.keys())),
+        ("--launch-gui", "OPTILLM_LAUNCH_GUI", bool, False, "Launch a Gradio chat interface"),
+        ("--plugins-dir", "OPTILLM_PLUGINS_DIR", str, "", "Path to the plugins directory"),
     ]
 
     for arg, env, type_, default, help_text, *extra in args_env:
@@ -609,6 +704,12 @@ def parse_args():
     parser.add_argument("--base-url", "--base_url", dest="base_url", type=str, default=base_url_default,
                         help="Base url for OpenAI compatible endpoint")
 
+    # Special handling of all the CePO Configurations
+    for field in fields(CepoConfig):
+        parser.add_argument(f"--cepo_{field.name}", dest=f"cepo_{field.name}", type=field.type, default=None, help=f"CePO configuration for {field.name}")
+
+    parser.add_argument(f"--cepo_config_file", dest=f"cepo_config_file", type=str, default="./configs/cepo_config.yaml", help="Path to CePO configuration file")
+
     args = parser.parse_args()
 
     # Convert argument names to match server_config keys
@@ -622,13 +723,14 @@ def parse_args():
 
 def main():
     global server_config
+    global cepo_config
     # Call this function at the start of main()
-    load_plugins()
     args = parse_args()
-
     # Update server_config with all argument values
     server_config.update(vars(args))
 
+    load_plugins()
+
     port = server_config['port']
 
     # Set logging level from user request
@@ -636,11 +738,42 @@ def main():
     if logging_level in logging_levels.keys():
         logger.setLevel(logging_levels[logging_level])
     
+    # set and log the cepo configs
+    cepo_config = init_cepo_config(server_config)
+    if args.approach == 'cepo':
+        logger.info(f"CePO Config: {cepo_config}")
+    
     logger.info(f"Starting server with approach: {server_config['approach']}")
     server_config_clean = server_config.copy()
     if server_config_clean['optillm_api_key']:
         server_config_clean['optillm_api_key'] = '[REDACTED]'
     logger.info(f"Server configuration: {server_config_clean}")
+
+    # Launch GUI if requested
+    if server_config.get('launch_gui'):
+        try:
+            import gradio as gr
+            # Start server in a separate thread
+            import threading
+            server_thread = threading.Thread(target=app.run, kwargs={'host': '0.0.0.0', 'port': port})
+            server_thread.daemon = True
+            server_thread.start()
+            
+            # Configure the base URL for the Gradio interface
+            base_url = f"http://localhost:{port}/v1"
+            logger.info(f"Launching Gradio interface connected to {base_url}")
+            
+            # Launch Gradio interface
+            demo = gr.load_chat(
+                base_url,
+                model=server_config['model'],
+                token=None
+            )
+            demo.launch(server_name="0.0.0.0", share=False)
+        except ImportError:
+            logger.error("Gradio is required for GUI. Install it with: pip install gradio")
+            return
+        
     app.run(host='0.0.0.0', port=port)
 
 if __name__ == "__main__":
diff --git a/optillm/__init__.py b/optillm/__init__.py
index ed380a9e..c18d85d7 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -1,5 +1,18 @@
 from importlib import util
 import os
+import re
+
+def get_version_from_setup():
+    try:
+        setup_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'setup.py')
+        with open(setup_path, 'r') as f:
+            content = f.read()
+            version_match = re.search(r'version=["\']([^"\']+)["\']', content)
+            if version_match:
+                return version_match.group(1)
+    except Exception:
+        pass
+    return "unknown"
 
 # Get the path to the root optillm.py
 spec = util.spec_from_file_location(
@@ -34,7 +47,7 @@
 generate_streaming_response = module.generate_streaming_response
 
 # Version information
-__version__ = "0.0.8"  # Match with setup.py
+__version__ = get_version_from_setup()
 
 # List of exported symbols
 __all__ = [
diff --git a/optillm/cepo.py b/optillm/cepo.py
new file mode 100644
index 00000000..c73b901f
--- /dev/null
+++ b/optillm/cepo.py
@@ -0,0 +1,431 @@
+# Apache license 2 - added after the fork for the CePO method
+import re
+import cerebras
+import openai
+import yaml
+
+from dataclasses import dataclass
+from cerebras.cloud.sdk import BadRequestError as CerebrasBadRequestError
+from openai import BadRequestError as OpenAIBadRequestError
+from typing import Optional, Literal
+
+
+@dataclass
+class CepoConfig:
+    bestofn_n: int  # number of responses to be generated in best of n stage
+    bestofn_temperature: float  # temperature for verifier in best of n stage
+    bestofn_max_tokens: int  # maximum number of tokens for verifier in best of n stage
+    bestofn_rating_type: Literal["absolute", "pairwise"]  # type of rating in best of n stage
+    planning_n: int  # number of plans generated in planning stage
+    planning_m: int  # number of attempts to generate n plans in planning stage
+    planning_temperature_step1: float  # temperature for generator in step 1 of planning stage
+    planning_temperature_step2: float  # temperature for generator in step 2 of planning stage
+    planning_temperature_step3: float  # temperature for generator in step 3 of planning stage
+    planning_temperature_step4: float  # temperature for generator in step 4 of planning stage
+    planning_max_tokens_step1: int  # maximum number of tokens in step 1 of planning stage
+    planning_max_tokens_step2: int  # maximum number of tokens in step 2 of planning stage
+    planning_max_tokens_step3: int  # maximum number of tokens in step 3 of planning stage
+    planning_max_tokens_step4: int  # maximum number of tokens in step 4 of planning stage
+
+
+# given command line arguments which includes a yaml file path, initialize a CePO configuration
+def init_cepo_config(cmd_line_args: dict) -> CepoConfig:
+    # get the command line arguments
+    cepo_args = {
+        key.split("cepo_")[1]: value
+        for key, value in cmd_line_args.items()
+        if "cepo" in key and "cepo_config_file" != key and value is not None
+    }
+
+    # get the yaml file arguments
+    cepo_config_yaml = {}
+    if cmd_line_args.get("cepo_config_file", None):
+        with open(cmd_line_args["cepo_config_file"], "r") as yaml_file:
+            cepo_config_yaml = yaml.safe_load(yaml_file)
+
+    # merge cepo args from command line and yaml file, args from command line will overwrite the ones from yaml file
+    cepo_args = {**cepo_config_yaml, **cepo_args}
+    return CepoConfig(**cepo_args)
+
+
+# given command line arguments which includes a yaml file path, initialize a CePO configuration
+def init_cepo_config(cmd_line_args: dict) -> CepoConfig:
+    # get the command line arguments
+    cepo_args = {
+        key.split("cepo_")[1]: value
+        for key, value in cmd_line_args.items()
+        if "cepo" in key and "cepo_config_file" != key and value is not None
+    }
+
+    # get the yaml file arguments
+    cepo_config_yaml = {}
+    if "cepo_config_file" in cmd_line_args.keys():
+        with open(cmd_line_args["cepo_config_file"], "r") as yaml_file:
+            cepo_config_yaml = yaml.safe_load(yaml_file)
+
+    # check if any of the keys overlap, and if they do, error out
+    for key in cepo_config_yaml.keys():
+        if key in cepo_args.keys():
+            raise RuntimeError(f"Key {key} is found in both yaml file and command line arguments")
+
+    # if not, then we take both of them and add them to the cepo config
+    cepo_config = CepoConfig()
+    cepo_attrs = [key for key, _ in cepo_config.__dict__.items() if not key.startswith('__')]
+
+    # add command line arguments
+    for key, value in cepo_args.items():
+        # this assert should not be raised as the cli parser should catch this
+        assert key in cepo_attrs, f"Command line argument {key} is not found in CepoConfig"
+        setattr(cepo_config, key, value)
+
+    # add yaml arguments
+    for key, value in cepo_config_yaml.items():
+        assert key in cepo_attrs, f"Yaml argument {key} is not found in CepoConfig"
+        setattr(cepo_config, key, value)
+
+    return cepo_config
+
+def extract_question_only(task: str) -> str:
+    """We noticed that sometimes if the task includes specific formatting instructions, they may interfere with the reasoning flow. This
+    is a temporary workaround to extract the question only from the task. Work in progress.
+    """
+    question_only = task.replace('\n## Question: \n\n', '')
+    question_only = question_only.replace('\n\n\n## Instruction \n\nPlease answer this question by first reasoning and then providing your answer.\nPresent your reasoning and solution in the following json format. \nPlease show your final answer in the `answer` field, e.g.,`"answer": "42"`.\n\n```json\n{\n    "reasoning": "___",\n    "answer": "___"\n}\n```\n', '')
+    return question_only
+
+
+def generate_completion(system_prompt: str, task: str, client: Any, model: str, cepo_config: CepoConfig) -> str:
+    """
+    Generates a completion based on the provided system prompt and task.
+
+    Parameters:
+        system_prompt (str): The system prompt to guide the model.
+        task (str): The task or question to be addressed.
+        client (Any): The client instance for interacting with the AI model.
+        model (str): The model name to be used for generating completions.
+        cepo_config (CepoConfig): Configuration parameters for CePO flow.
+
+    Returns:
+        Tuple[str, int, dict]: The generated completion, number of tokens used, and a log dictionary.
+    """
+    completion_tokens = 0
+    question_only = extract_question_only(task)
+    cb_log = {}
+    plans = []
+
+    for i in range(cepo_config.planning_m):  # m is the maximum number of attempts to generate n plans
+        # Step 1 - Generate a plan
+        content = f"To answer this question, can you come up with a concise plan to solve it step-by-step but do not provide the "\
+                  f"final answer. Also, for each step, provide your confidence in the correctness of that step as well as your ability "\
+                  f"to execute it correctly. Here is the question:\n{question_only}\nRead the question again:\n\n{question_only}"
+
+        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": content}]
+        response = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            max_tokens=cepo_config.planning_max_tokens_step1,
+            temperature=cepo_config.planning_temperature_step1,
+            stream=False,
+        )
+        completion_tokens += response.usage.completion_tokens
+
+        if response.choices[0].finish_reason == "length":
+            # Skipping plan generation due to exceeding the token budget. Usually it means the plan is incomplete.
+            continue
+
+        # Step 2 - Execute the plan
+        content = f"Can you execute the above plan step-by-step to produce the final answer. "\
+                  f"Be extra careful when executing steps where your confidence is lower."
+        messages.extend([{"role": "assistant", "content": response.choices[0].message.content}, {"role": "user", "content": content}])
+        response = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            max_tokens=cepo_config.planning_max_tokens_step2,
+            temperature=cepo_config.planning_temperature_step2,
+            stream=False,
+        )
+        completion_tokens += response.usage.completion_tokens
+
+        if response.choices[0].finish_reason == "length":
+            messages.append({"role": "assistant", "content": response.choices[0].message.content})
+            cb_log[f"messages_planning_{i}_rejected_due_to_length"] = messages
+            continue
+
+        plans.append(response.choices[0].message.content)
+        messages.append({"role": "assistant", "content": response.choices[0].message.content})
+        cb_log[f"messages_planning_{i}"] = messages
+        
+        if len(plans) == cepo_config.planning_n:
+            break
+
+    if not plans:
+        # If no plans were generated succesfully, take the last one even if it was rejected due to length
+        plans.append(response.choices[0].message.content)
+        messages.append({"role": "assistant", "content": response.choices[0].message.content})
+        cb_log[f"messages_planning_{i}_no_plans_so_taking_the_last_one"] = messages
+
+    # Step 3 - Review and address inconsistencies
+    try:
+        plans_message = ""
+        for i, plan in enumerate(plans):
+            plans_message += f"Response {i + 1}:\n{plan}\n\n"
+        plans_message = plans_message[:-2]  # remove the last 2x newline
+        content = f"Can you review your last {len(plans)} responses and identify any inconsistency between them. After that, can you address "\
+                  f"it and present a final step-by-step solution to the problem? Here is the question:\n{question_only}"
+        messages = [{"role": "assistant", "content": plans_message}, {"role": "user", "content": content}]
+
+        response = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            max_tokens=cepo_config.planning_max_tokens_step3,
+            temperature=cepo_config.planning_temperature_step3,
+            stream=False,
+        )
+        final_solution = response.choices[0].message.content
+        completion_tokens += response.usage.completion_tokens
+    except (CerebrasBadRequestError, OpenAIBadRequestError) as e:
+        # In case of an error, take the first plan as the final solution
+        final_solution = plans[0]
+        messages = []
+
+    # Step 4 - Answer the question
+    content = f"Use your final solution from above to correctly answer the question. Here is the question:\n{task}"
+    messages = [{"role": "assistant", "content": final_solution}, {"role": "user", "content": content}]
+
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        max_tokens=cepo_config.planning_max_tokens_step4,
+        temperature=cepo_config.planning_temperature_step4,
+        stream=False,
+    )
+    completion_tokens += response.usage.completion_tokens
+
+    cb_log["messages"] = messages
+    return response.choices[0].message.content, completion_tokens, cb_log
+
+
+def generate_n_completions(system_prompt: str, initial_query: str, client: Any, model: str, cepo_config: CepoConfig) -> tuple[list[str], int, dict]:
+    """
+    Generates n completions for the Best of N step of CePO.
+
+    Parameters:
+        system_prompt (str): The system prompt to guide the model.
+        initial_query (str): The task or question to be addressed.
+        client (Any): The client instance for interacting with the AI model.
+        model (str): The model name to be used for generating completions.
+        cepo_config (CepoConfig): Configuration parameters for CePO flow.
+
+    Returns:
+        Tuple[str, int, dict]: The generated completion, number of tokens used, and a log dictionary.
+    """
+    completion_tokens = 0
+    cb_log = {}
+    completions = []
+
+    for i in range(cepo_config.bestofn_n):
+        response_i, completion_tokens_i, cb_log_i = generate_completion(system_prompt, initial_query, client, model, cepo_config)
+        completions.append(response_i)
+        completion_tokens += completion_tokens_i
+        cb_log[f"completion_{i}_response"] = response_i
+        cb_log[f"completion_{i}_log"] = cb_log_i
+        cb_log[f"completion_{i}_completion_tokens"] = completion_tokens_i    
+
+    return completions, completion_tokens, cb_log
+
+
+def rate_completions_absolute(system_prompt: str, initial_query: str, client: Any, model: str, completions: list[str], cepo_config: CepoConfig, cb_log: dict) -> tuple[str, int, dict]:
+    """
+    Rates completions for the Best of N step of CePO. Each completion is rated on a scale of 1 to 10 individually.
+
+    Parameters:
+        system_prompt (str): The system prompt to guide the model.
+        initial_query (str): The task or question to be addressed.
+        client (Any): The client instance for interacting with the AI model.
+        model (str): The model name to be used for generating completions.
+        completions (list[str]): List of completions to be rated.
+        cepo_config (CepoConfig): Configuration parameters for CePO flow.
+
+    Returns:
+        Tuple[str, int, dict]: The generated completion, number of tokens used, and a log dictionary.
+    """
+    completion_tokens = 0
+    rating_messages = [{"role": "system", "content": system_prompt},
+                       {"role": "user", "content": initial_query}]
+    content = "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to "\
+              "the user question displayed below. Your evaluation should consider correctness as a primary factor as "\
+              "well as other factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of "\
+              "detail of the response. Evaluation Criteria:\n"\
+              "- Correctness: How free is it from errors or mistakes?\n"\
+              "- Helpfulness: How effectively does the response meet the user's needs?\n"\
+              "- Relevance: How directly does the response address the original question?\n"\
+              "- Accuracy: Are the information and explanations factually correct?\n"\
+              "- Depth: Does the response provide comprehensive and meaningful insights?\n"\
+              "- Creativity: Does the response offer unique or innovative perspectives?\n"\
+              "- Clarity: Is the response well-organized, coherent, and easy to understand?\n"\
+              "Evaluation Process:\n"\
+              "1. Carefully review the user question and the AI assistant's response.\n"\
+              "2. Assess the response against each criterion.\n"\
+              "3. Provide a concise explanation of your overall evaluation.\n"\
+              "4. Rate the response on a 1-10 scale with the following guidelines:\n"\
+              "    - 1-2: Completely inadequate, fails to address the question\n"\
+              "    - 3-4: Minimal relevance, significant deficiencies\n"\
+              "    - 5-6: Partially helpful, requires substantial improvement\n"\
+              "    - 7-8: Good response with minor areas for enhancement\n"\
+              "    - 9-10: Correct, comprehensive, and highly insightful.\n"\
+              "Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your "\
+              "explanation, please rate the response on a scale of 1 to 10 by strictly following this format:  \"Rating: "\
+              "[[rating]]\", for example: \"Rating: [[5]]\""
+    rating_messages.append({"role": "system", "content": content})
+    
+    ratings = []
+    for i, completion in enumerate(completions):
+        rating_messages.append({"role": "assistant", "content": completion})
+        content = "Rate the above response beginning with a small evaluation blurb followed by a rating on a scale of 1 to 10 "\
+                  "by strictly following this format: \"Explanation: <reason for your rating>\n\nRating: [[rating]]\"."
+        rating_messages.append({"role": "system", "content": content})
+
+        rating_response = client.chat.completions.create(
+            model=model,
+            messages=rating_messages,
+            max_tokens=cepo_config.bestofn_max_tokens,
+            temperature=cepo_config.bestofn_temperature
+        )
+        completion_tokens += rating_response.usage.completion_tokens
+        
+        rating_response = rating_response.choices[0].message.content.strip()
+        cb_log[f"rating_response_{i}"] = rating_response
+
+        pattern = r"Rating: \[\[(\d+)\]\]"
+        match = re.search(pattern, rating_response)
+        rating_response = match.group(1) if match else "0"
+
+        try:
+            ratings.append(float(rating_response))
+        except ValueError:
+            ratings.append(0)
+        
+        rating_messages = rating_messages[:-2]  # clear the last two messages to start over in the next iteration
+    
+    best_index = ratings.index(max(ratings))
+    cb_log["ratings"] = ratings
+    cb_log["best_index"] = best_index
+    return completions[best_index], completion_tokens, cb_log
+
+
+def rate_completions_pairwise(system_prompt: str, initial_query: str, client: Any, model: str, completions: list[str], cepo_config: CepoConfig, cb_log: dict) -> tuple[str, int, dict]:
+    """
+    Rates completions for the Best of N step of CePO. Completions are rated pairwise against each other in both orders (A vs B and B vs A).
+
+    Parameters:
+        system_prompt (str): The system prompt to guide the model.
+        initial_query (str): The task or question to be addressed.
+        client (Any): The client instance for interacting with the AI model.
+        model (str): The model name to be used for generating completions.
+        completions (list[str]): List of completions to be rated.
+        cepo_config (CepoConfig): Configuration parameters for CePO flow.
+
+    Returns:
+        Tuple[str, int, dict]: The generated completion, number of tokens used, and a log dictionary.
+    """
+    completion_tokens = 0
+    rating_messages = [{"role": "system", "content": system_prompt},
+                       {"role": "user", "content": initial_query}]
+    content = "Please act as an impartial judge and compare the quality of the two responses provided by the AI assistant " \
+              "to the user's question displayed below. Evaluation Criteria:\n" \
+              "- Helpfulness: How effectively does the response meet the user's needs?\n" \
+              "- Relevance: How directly does the response address the original question?\n" \
+              "- Accuracy: Are the information and explanations factually correct?\n" \
+              "- Depth: Does the response provide comprehensive and meaningful insights?\n" \
+              "- Creativity: Does the response offer unique or innovative perspectives?\n" \
+              "- Clarity: Is the response well-organized, coherent, and easy to understand?\n" \
+              "Evaluation Process:\n" \
+              "1. Carefully review the user's question and the AI assistant's responses.\n" \
+              "2. Compare the responses against each other for each criterion.\n" \
+              "3. Provide a concise explanation of your overall evaluation.\n" \
+              "4. Select the response that is superior based on the above criteria.\n" \
+              "Reply with \"Better Response: [[response id]]\".\n" \
+              "If the first response is better, reply with \"Better Response: [[0]]\". " \
+              "If the second response is better, reply with \"Better Response: [[1]]\"."
+    rating_messages.append({"role": "system", "content": content})
+
+    ratings = [0] * cepo_config.bestofn_n
+    pairs = [(i, j) for i in range(cepo_config.bestofn_n) for j in range(cepo_config.bestofn_n) if i != j]
+    for pair in pairs:
+        responses_pair = f"Response 0: {completions[pair[0]]}\n\nResponse 1: {completions[pair[1]]}"
+        rating_messages.append({"role": "assistant", "content": responses_pair})
+        content =  "Reply with \"Better Response: [[response id]]\".\n" \
+                   "If the first response is better, reply with \"Better Response: [[0]]\". " \
+                   "If the second response is better, reply with \"Better Response: [[1]]\"."
+        rating_messages.append({"role": "system", "content": content})
+
+        rating_response = client.chat.completions.create(
+            model=model,
+            messages=rating_messages,
+            max_tokens=cepo_config.bestofn_max_tokens,
+            temperature=cepo_config.bestofn_temperature
+        )
+        completion_tokens += rating_response.usage.completion_tokens
+        
+        rating_response = rating_response.choices[0].message.content.strip()
+        cb_log[f"rating_response_for_pair_{pair[0]}_{pair[1]}"] = rating_response
+
+        pattern = r"Better Response: \[\[(\d+)\]\]"
+        match = re.search(pattern, rating_response)
+        if match:
+            rating_response = match.group(1)
+            try:
+                rating = int(rating_response)
+                ratings[pair[rating]] += 1
+            except ValueError:
+                ratings[pair[0]] += 1  # if parsing unsuccessful, default to the first response
+        else:
+            ratings[pair[0]] += 1  # if parsing unsuccessful, default to the first response
+
+        rating_messages = rating_messages[:-2]
+    
+    best_index = ratings.index(max(ratings))
+    cb_log["ratings"] = ratings
+    cb_log["best_index"] = best_index
+    return completions[best_index], completion_tokens, cb_log
+
+
+def cepo(system_prompt: str, initial_query: str, client: Any, model: str, cepo_config: CepoConfig) -> tuple[str, int]:
+    """
+    Applies CePO reasoning flow for the given task. First, it generates multiple completions, and then rates them to select the best one.
+    Each completion is generated as follows:
+    
+    Generate `planning_n` solution proposals:
+        Step 1: Plan Generation - The model generates a detailed, step-by-step plan to solve the problem, along with its confidence level for 
+                each step.
+        Step 2: Initial Solution - Using the plan from Step 1, the model produces an initial solution.
+    
+    Step 3: Plan Refinement - The model reviews all generated solution proposals and their associated plans, identifying inconsistencies.
+            Based on this analysis, a refined, final step-by-step plan is constructed.
+    Step 4: Final Solution - The model uses the refined plan from Step 3 to produce the final answer.
+    
+    Parameters:
+        system_prompt (str): The system prompt to guide the model.
+        initial_query (str): The task or question to be addressed.
+        client (Any): The client instance for interacting with the AI model.
+        model (str): The model name to be used for generating completions.
+        cepo_config (CepoConfig): Configuration parameters for CePO flow.
+
+    Returns:
+        Tuple[str, int, dict]: The generated completion, number of tokens used
+    """
+    
+    # Generate completions
+    completions, completion_tokens_planning, cb_log = generate_n_completions(system_prompt, initial_query, client, model, cepo_config)  # cb_log is a dictionary for debugging purposes
+    
+    # Rate the completions
+    if cepo_config.bestofn_rating_type == "absolute":
+        rate_completions_fn = rate_completions_absolute
+    elif cepo_config.bestofn_rating_type == "pairwise":
+        rate_completions_fn = rate_completions_pairwise
+    else:
+        raise ValueError("Invalid rating type in cepo_config")
+
+    best_completion, completion_tokens_rating, cb_log = rate_completions_fn(system_prompt, initial_query, client, model, completions, cepo_config, cb_log)
+    
+    return best_completion, completion_tokens_planning + completion_tokens_rating
diff --git a/optillm/plugins/coc_plugin.py b/optillm/plugins/coc_plugin.py
index 62ef5ce6..34d18776 100644
--- a/optillm/plugins/coc_plugin.py
+++ b/optillm/plugins/coc_plugin.py
@@ -104,6 +104,7 @@ def sanitize_code(code: str) -> str:
         safe_lines.append(line)
     
     safe_code = '\n'.join(safe_lines)
+    safe_code = safe_code.replace('\n', '\n    ')
     
     # Add safety wrapper
     wrapper = f"""
@@ -111,7 +112,7 @@ def sanitize_code(code: str) -> str:
 
 def safe_execute():
     import numpy as np  # Always allow numpy
-    {safe_code.replace('\n', '\n    ')}
+    {safe_code}
     return answer if 'answer' in locals() else None
 
 result = safe_execute()
diff --git a/optillm/plugins/readurls_plugin.py b/optillm/plugins/readurls_plugin.py
index 13c31c05..4b96942a 100644
--- a/optillm/plugins/readurls_plugin.py
+++ b/optillm/plugins/readurls_plugin.py
@@ -1,11 +1,28 @@
 import re
 from typing import Tuple, List
 import requests
+import os
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 
 SLUG = "readurls"
 
+def get_version():
+    try:
+        # Get path to setup.py relative to this file
+        current_dir = os.path.dirname(__file__)
+        package_root = os.path.dirname(os.path.dirname(current_dir))
+        setup_path = os.path.join(package_root, 'setup.py')
+        
+        with open(setup_path, 'r') as f:
+            content = f.read()
+            version_match = re.search(r'version=["\']([^"\']+)["\']', content)
+            if version_match:
+                return version_match.group(1)
+    except Exception:
+        pass
+    return "unknown"
+
 def extract_urls(text: str) -> List[str]:
     # Updated regex pattern to be more precise
     url_pattern = re.compile(r'https?://[^\s\'"]+')
@@ -24,8 +41,9 @@ def extract_urls(text: str) -> List[str]:
 
 def fetch_webpage_content(url: str, max_length: int = 100000) -> str:
     try:
+        version = get_version()
         headers = {
-            'User-Agent': 'optillm/0.0.1 (https://github.com/codelion/optillm)'
+            'User-Agent': f'optillm/{version} (https://github.com/codelion/optillm)'
         }
         
         response = requests.get(url, headers=headers, timeout=10)
@@ -106,4 +124,4 @@ def run(system_prompt, initial_query: str, client=None, model=None) -> Tuple[str
         domain = urlparse(url).netloc
         modified_query = modified_query.replace(url, f"{url} [Content from {domain}: {content}]")
     # print(modified_query)
-    return modified_query, 0
\ No newline at end of file
+    return modified_query, 0
diff --git a/optillm/plugins/router_plugin.py b/optillm/plugins/router_plugin.py
index de2b0805..3c9b1381 100644
--- a/optillm/plugins/router_plugin.py
+++ b/optillm/plugins/router_plugin.py
@@ -22,9 +22,10 @@
 SLUG = "router"
 
 # Constants
-MAX_LENGTH = 512
+MAX_LENGTH = 1024
 APPROACHES = ["none", "mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2"]
-MODEL_NAME = "codelion/optillm-bert-uncased"
+BASE_MODEL = "answerdotai/ModernBERT-large"
+OPTILLM_MODEL_NAME = "codelion/optillm-modernbert-large"
 
 class OptILMClassifier(nn.Module):
     def __init__(self, base_model, num_labels):
@@ -49,16 +50,16 @@ def forward(self, input_ids, attention_mask, effort):
 def load_optillm_model():
     device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
     # Load the base model
-    base_model = AutoModel.from_pretrained("google-bert/bert-large-uncased")
+    base_model = AutoModel.from_pretrained(BASE_MODEL)
     # Create the OptILMClassifier
     model = OptILMClassifier(base_model, num_labels=len(APPROACHES))  
     model.to(device)
     # Download the safetensors file
-    safetensors_path = hf_hub_download(repo_id=MODEL_NAME, filename="model.safetensors")
+    safetensors_path = hf_hub_download(repo_id=OPTILLM_MODEL_NAME, filename="model.safetensors")
     # Load the state dict from the safetensors file
     load_model(model, safetensors_path)
 
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    tokenizer = AutoTokenizer.from_pretrained(OPTILLM_MODEL_NAME)
     return model, tokenizer, device
 
 def preprocess_input(tokenizer, system_prompt, initial_query):
diff --git a/requirements.txt b/requirements.txt
index 1d710c6f..8e1309ae 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,4 +20,7 @@ nbconvert
 ipython
 ipykernel
 peft
-bitsandbytes
\ No newline at end of file
+bitsandbytes
+gradio
+protobuf==3.20.3
+cerebras_cloud_sdk
diff --git a/requirements_proxy_only.txt b/requirements_proxy_only.txt
new file mode 100644
index 00000000..84e02764
--- /dev/null
+++ b/requirements_proxy_only.txt
@@ -0,0 +1,19 @@
+numpy
+networkx
+openai
+z3-solver
+aiohttp
+flask
+azure.identity
+scikit-learn
+litellm
+requests
+beautifulsoup4
+lxml
+presidio_analyzer
+presidio_anonymizer
+nbformat
+nbconvert
+ipython
+ipykernel
+gradio
\ No newline at end of file
diff --git a/scripts/eval_aime_benchmark.py b/scripts/eval_aime_benchmark.py
index 12ec9b39..5fc72576 100644
--- a/scripts/eval_aime_benchmark.py
+++ b/scripts/eval_aime_benchmark.py
@@ -4,7 +4,7 @@
 import logging
 import re
 import time
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, Union
 from datetime import datetime
 from openai import OpenAI
 from datasets import load_dataset
@@ -15,7 +15,7 @@
 logger = logging.getLogger(__name__)
 
 # Initialize OpenAI client
-client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8888/v1")
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="https://ot7nh9nqf4l7b43s.us-east-1.aws.endpoints.huggingface.cloud/v1/")
 
 SYSTEM_PROMPT = '''You are solving AIME (American Invitational Mathematics Examination) problems.
 
@@ -89,9 +89,17 @@ def extract_answer(response: str) -> Optional[int]:
             
     return None
 
-def get_llm_response(problem: str, model: str) -> str:
+def get_llm_response(problem: str, model: str) -> Union[str, List[Dict]]:
     """
     Get response from the LLM for a given problem.
+    If multiple choices are returned, formats them as attempt dictionaries.
+    
+    Args:
+        problem (str): The problem text
+        model (str): The model identifier
+        
+    Returns:
+        Union[str, List[Dict]]: Either a string response or list of attempt dictionaries
     """
     try:
         response = client.with_options(timeout=1000.0).chat.completions.create(
@@ -101,7 +109,23 @@ def get_llm_response(problem: str, model: str) -> str:
             ],
             max_tokens=8192,
         )
+        
+        # If there's more than one choice, format as attempts
+        if len(response.choices) > 1:
+            attempts = []
+            for i, choice in enumerate(response.choices):
+                response_text = choice.message.content.strip()
+                predicted_answer = extract_answer(response_text)
+                attempts.append({
+                    "attempt_number": i + 1,
+                    "response": response_text,
+                    "predicted_answer": predicted_answer
+                })
+            return attempts
+            
+        # If single choice, return as before
         return response.choices[0].message.content.strip()
+        
     except Exception as e:
         logger.error(f"Error getting LLM response: {e}")
         return ""
@@ -119,14 +143,25 @@ def make_n_attempts(problem: str, model: str, n: int) -> List[Dict]:
         List[Dict]: List of dictionaries containing response and predicted answer for each attempt
     """
     attempts = []
-    for i in range(n):
+    remaining_attempts = n
+    
+    while remaining_attempts > 0:
         response = get_llm_response(problem, model)
-        predicted_answer = extract_answer(response)
-        attempts.append({
-            "attempt_number": i + 1,
-            "response": response,
-            "predicted_answer": predicted_answer
-        })
+        
+        # If response is already formatted as attempts
+        if isinstance(response, list):
+            attempts.extend(response)
+            remaining_attempts = n - len(attempts)
+        else:
+            # Process single response as before
+            predicted_answer = extract_answer(response)
+            attempts.append({
+                "attempt_number": len(attempts) + 1,
+                "response": response,
+                "predicted_answer": predicted_answer
+            })
+            remaining_attempts -= 1
+    
     return attempts
 
 def evaluate_pass_at_n(attempts: List[Dict], correct_answer: int) -> Tuple[bool, Optional[int]]:
@@ -206,18 +241,21 @@ def analyze_results(results: List[Dict], n: int):
             print("---")
 
 def main(model: str, n_attempts: int):
-    """Main evaluation function."""
+    """Main evaluation function that handles gaps in processed indexes."""
     os.makedirs("results", exist_ok=True)
     
-    # Include n_attempts in filename to keep separate results for different n values
     results_file = f"evaluation_results_{model.replace('/', '_')}_pass_at_{n_attempts}.json"
     
     dataset = load_2024_dataset()
     existing_results = load_existing_results(results_file)
-    last_processed_index = get_last_processed_index(existing_results)
     
-    for idx, item in enumerate(tqdm(dataset, desc="Evaluating problems")):
-        if idx <= last_processed_index:
+    # Create a set of already processed indexes for efficient lookup
+    processed_indexes = {result['index'] for result in existing_results}
+    
+    for _, item in enumerate(tqdm(dataset, desc="Evaluating problems")):
+        id = int(item['id'])
+        # Skip if this index has already been processed
+        if id in processed_indexes:
             continue
             
         problem_text = item['problem']
@@ -228,7 +266,7 @@ def main(model: str, n_attempts: int):
         is_correct, first_correct = evaluate_pass_at_n(attempts, correct_answer)
         
         result = {
-            "index": idx,
+            "index": id,
             "problem": problem_text,
             "attempts": attempts,
             "correct_answer": correct_answer,
@@ -240,6 +278,7 @@ def main(model: str, n_attempts: int):
     final_results = load_existing_results(results_file)
     analyze_results(final_results, n_attempts)
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Evaluate LLM performance on AIME 2024 problems")
     parser.add_argument("--model", type=str, required=True, help="OpenAI model to use (e.g., gpt-4, gpt-3.5-turbo)")
diff --git a/scripts/eval_arena_hard_auto_rtc.py b/scripts/eval_arena_hard_auto_rtc.py
index 890c8019..76ab4835 100644
--- a/scripts/eval_arena_hard_auto_rtc.py
+++ b/scripts/eval_arena_hard_auto_rtc.py
@@ -17,7 +17,8 @@
 logger = logging.getLogger(__name__)
 
 # Initialize OpenAI client (only used for chat completions now)
-client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+client = OpenAI(base_url="http://localhost:8000/v1", api_key=os.environ.get("OPENAI_API_KEY"))
+# client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 
 @dataclass
 class RTCConfig:
@@ -58,8 +59,7 @@ def get_llm_response(messages: List[Dict], model: str) -> Optional[str]:
             response = client.chat.completions.create(
                 model=model,
                 messages=messages,
-                temperature=0.7,
-                max_tokens=1000
+                max_tokens=4096
             )
             return response.choices[0].message.content.strip()
         except Exception as e:
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
index 6483f66e..dd662e3e 100644
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -1,3 +1,3 @@
 datasets
 accelerate
-huggingface_hub
\ No newline at end of file
+huggingface_hub
diff --git a/scripts/train_optillm_classifier.py b/scripts/train_optillm_classifier.py
index 904d5f82..a295fdfe 100644
--- a/scripts/train_optillm_classifier.py
+++ b/scripts/train_optillm_classifier.py
@@ -15,7 +15,7 @@
 
 # Constants
 APPROACHES = ["none", "mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2"]
-MAX_LENGTH = 512
+MAX_LENGTH = 1024
 
 # Device selection
 device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
@@ -233,6 +233,18 @@ def inference(model, tokenizer, prompt, effort_levels):
     return results
 
 def main(args):
+
+    if args.push_to_hub:
+        base_model = AutoModel.from_pretrained(args.model_name)
+        tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+        # best_model = OptILMClassifier(base_model, num_labels=len(APPROACHES))
+        # best_model.to(device)
+        # load_model(best_model, "best_model.safetensors")
+        # we just push the base model and then upload the safetensors file manually as OptILMClassifier class doesn't have a push_to_hub method.
+        base_model.push_to_hub(args.hub_model_id)
+        tokenizer.push_to_hub(args.hub_model_id)
+        return
+    
     tokenizer = AutoTokenizer.from_pretrained(args.model_name)
     dataset = load_and_preprocess_data(tokenizer)
 
@@ -273,15 +285,6 @@ def main(args):
 
     print(f"\nBest performing model was from fold {best_fold} with validation accuracy {best_val_accuracy:.4f}")
 
-    if args.push_to_hub:
-        base_model = AutoModel.from_pretrained(args.model_name)
-        # best_model = OptILMClassifier(base_model, num_labels=len(APPROACHES))
-        # best_model.to(device)
-        # load_model(best_model, "best_model.safetensors")
-        # we just push the base model and then upload the safetensors file manually as OptILMClassifier class doesn't have a push_to_hub method.
-        base_model.push_to_hub(args.hub_model_id)
-        tokenizer.push_to_hub(args.hub_model_id)
-
     # Load the best model for inference
     base_model = AutoModel.from_pretrained(args.model_name)
     best_model = OptILMClassifier(base_model, num_labels=len(APPROACHES))
diff --git a/setup.py b/setup.py
index 0a7743f1..610493c8 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="optillm",
-    version="0.0.16",
+    version="0.0.25",
     packages=find_packages(),
     py_modules=['optillm'],
     package_data={
@@ -33,6 +33,7 @@
         "ipykernel",
         "peft",
         "bitsandbytes",
+        "gradio"
     ],
     entry_points={
         'console_scripts': [