From 0a79b106e8fd3b9a9b9a36bc9c9a5af34afac716 Mon Sep 17 00:00:00 2001
From: Digvijay Singh Rawat <hackerearthx1@gmail.com>
Date: Tue, 9 Dec 2025 04:04:27 +0530
Subject: [PATCH 1/9] feat: Add BM25 ranking engine with PostgreSQL and RocksDB
 integration, updating Docker and app to use it.

---
 python/ranker/Dockerfile       |  28 +++++-
 python/ranker/app.py           |  39 +++++++--
 python/ranker/engine.py        | 154 +++++++++++++++++++++++++++++++++
 python/ranker/requirements.txt |   5 +-
 4 files changed, 214 insertions(+), 12 deletions(-)
 create mode 100644 python/ranker/engine.py

diff --git a/python/ranker/Dockerfile b/python/ranker/Dockerfile
index 8273e75..9b94ae3 100644
--- a/python/ranker/Dockerfile
+++ b/python/ranker/Dockerfile
@@ -1,6 +1,28 @@
-FROM python:3.9-slim
+FROM ubuntu:22.04
+
+# Install system dependencies
+# python3-dev is needed for headers
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    python3-dev \
+    git \
+    build-essential \
+    librocksdb-dev \
+    libpq-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    liblz4-dev \
+    libsnappy-dev \
+    libzstd-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create a symlink for python if needed, though python3 is standard
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
 WORKDIR /app
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip3 install --no-cache-dir "Cython<3"
+RUN pip3 install --no-cache-dir -r requirements.txt
 COPY . .
-CMD ["python", "app.py"]
\ No newline at end of file
+CMD ["python3", "app.py"]
\ No newline at end of file
diff --git a/python/ranker/app.py b/python/ranker/app.py
index dc829e2..6982c4d 100644
--- a/python/ranker/app.py
+++ b/python/ranker/app.py
@@ -1,23 +1,46 @@
 from flask import Flask, jsonify, request
+from engine import Ranker
+import time
 
 app = Flask(__name__)
 
-# Mock Data (The "Database")
-MOCK_INDEX = {
-    "computer": [{"id": 1, "title": "History of Computers"}, {"id": 2, "title": "Computer Science 101"}],
-    "cats": [{"id": 3, "title": "Funny Cats"}, {"id": 4, "title": "Cat Care"}]
-}
+# Initialize Ranker (Global Singleton)
+ranker = None
+try:
+    ranker = Ranker()
+except Exception as e:
+    print(f"Failed to initialize Ranker: {e}")
 
 @app.route('/health')
 def health():
-    return jsonify({"status": "healthy", "service": "ranker"})
+    status = "healthy" if ranker else "degraded"
+    return jsonify({"status": status, "service": "ranker"})
 
 @app.route('/search')
 def search():
+    global ranker
+    if not ranker:
+        # Fallback: the module-level Ranker() above failed at import time, so retry lazily here
+        try:
+            ranker = Ranker()
+        except Exception as e:
+            return jsonify({"error": f"Ranker not initialized: {str(e)}"}), 500
+
     query = request.args.get('q', '').lower()
     print(f"Received query: {query}")
-    results = MOCK_INDEX.get(query, [])
-    return jsonify({"query": query, "results": results})
+    
+    start_time = time.time()
+    results = ranker.search(query)
+    duration_ms = (time.time() - start_time) * 1000
+    
+    return jsonify({
+        "query": query, 
+        "results": results,
+        "meta": {
+            "count": len(results),
+            "latency_ms": round(duration_ms, 2)
+        }
+    })
 
 if __name__ == '__main__':
     # host='0.0.0.0' is CRITICAL for Docker networking
diff --git a/python/ranker/engine.py b/python/ranker/engine.py
new file mode 100644
index 0000000..01419ef
--- /dev/null
+++ b/python/ranker/engine.py
@@ -0,0 +1,154 @@
+import os
+import psycopg2
+import numpy as np
+from collections import defaultdict
+
+# Try to import rocksdb, fallback to mock if failed (e.g. build issues)
+try:
+    import rocksdb
+    ROCKSDB_AVAILABLE = True
+except ImportError:
+    ROCKSDB_AVAILABLE = False
+    print("WARNING: python-rocksdb not available. Using Mock Index.")
+
+class Ranker:
+    def __init__(self):
+        # 1. Connect to Postgres (Metadata)
+        try:
+            self.db_conn = psycopg2.connect(
+                host=os.environ.get("DB_HOST", "postgres_service"),
+                database=os.environ.get("DB_NAME", "search_engine"),
+                user=os.environ.get("DB_USER", "admin"),
+                password=os.environ.get("DB_PASS", "password123")
+            )
+            print("Connected to Postgres")
+        except Exception as e:
+            print(f"Failed to connect to Postgres: {e}")
+            self.db_conn = None
+        
+        # 2. Open RocksDB (Inverted Index) - Read Only
+        rocksdb_path = os.environ.get("ROCKSDB_PATH", "/shared_data/search_index.db")
+        self.index_db = None
+        
+        if ROCKSDB_AVAILABLE:
+            try:
+                opts = rocksdb.Options()
+                # We only need read access
+                self.index_db = rocksdb.DB(rocksdb_path, opts, read_only=True)
+                print(f"Opened RocksDB at {rocksdb_path}")
+            except Exception as e:
+                print(f"Failed to open RocksDB: {e}")
+        
+        # Mock Index for fallback
+        self.mock_index = {
+            "computer": "1,2",
+            "cats": "3,4"
+        }
+        
+        # 3. Load Global Stats (avgdl)
+        self.avgdl = self._calculate_avgdl()
+        print(f"Ranker initialized. AvgDL: {self.avgdl}")
+
+    def _calculate_avgdl(self):
+        if not self.db_conn:
+            return 100.0 # Default if DB not connected
+        try:
+            with self.db_conn.cursor() as cur:
+                cur.execute("SELECT AVG(doc_length) FROM documents")
+                avg = cur.fetchone()[0]
+                return float(avg) if avg else 0.0
+        except Exception as e:
+            print(f"Error calculating avgdl: {e}")
+            return 100.0
+
+    def search(self, query, k=10):
+        """
+        Performs BM25 search for the given query.
+        Returns top k results: [{'url': ..., 'title': ..., 'score': ...}]
+        """
+        tokens = query.lower().split() # Simple tokenization
+        if not tokens:
+            return []
+
+        # BM25 Constants
+        k1 = 1.5
+        b = 0.75
+        
+        # Accumulate scores: doc_id -> score
+        scores = defaultdict(float)
+        
+        for token in tokens:
+            # A. Get Posting List from RocksDB or Mock
+            postings_str = None
+            
+            if self.index_db:
+                try:
+                    val = self.index_db.get(token.encode('utf-8'))
+                    if val:
+                        postings_str = val.decode('utf-8')
+                except Exception as e:
+                    print(f"Error fetching token {token}: {e}")
+            elif not ROCKSDB_AVAILABLE:
+                # Fallback to mock
+                postings_str = self.mock_index.get(token)
+
+            if not postings_str:
+                continue
+                
+            # Format: "doc_id1,doc_id2,..." (Simplified for now, ideally should have TF)
+            # For this phase, we assume TF=1 for all occurrences in the simplified index
+            if isinstance(postings_str, bytes):
+                postings_str = postings_str.decode('utf-8')
+            
+            doc_ids = [int(d) for d in postings_str.split(',')]
+            
+            # Calculate IDF
+            # IDF(q_i) = log( (N - n(q_i) + 0.5) / (n(q_i) + 0.5) + 1 )
+            # For simplicity in this phase, we'll use a basic IDF or just count
+            # We need N (total docs)
+            N = 1000 # Placeholder or fetch from DB
+            n_qi = len(doc_ids)
+            idf = np.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1)
+            
+            for doc_id in doc_ids:
+                # In a real implementation, we'd fetch doc_length and TF from the index/DB
+                # Here we do a simplified calculation
+                tf = 1 # Simplified
+                doc_len = 100 # Simplified placeholder
+                
+                # BM25 Score for this term
+                numerator = idf * tf * (k1 + 1)
+                denominator = tf + k1 * (1 - b + b * (doc_len / self.avgdl))
+                scores[doc_id] += numerator / denominator
+
+        # Sort by score
+        sorted_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:k]
+        
+        # Fetch Metadata for top results
+        results = []
+        if self.db_conn:
+            try:
+                with self.db_conn.cursor() as cur:
+                    for doc_id, score in sorted_docs:
+                        cur.execute("SELECT url FROM documents WHERE id = %s", (doc_id,))
+                        row = cur.fetchone()
+                        if row:
+                            results.append({
+                                "id": doc_id,
+                                "url": row[0],
+                                "score": score,
+                                "title": row[0] # Use URL as title for now
+                            })
+            except Exception as e:
+                print(f"Error fetching metadata: {e}")
+        else:
+            # Fallback if DB is down
+            for doc_id, score in sorted_docs:
+                results.append({
+                    "id": doc_id,
+                    "url": f"http://mock-url.com/{doc_id}",
+                    "score": score,
+                    "title": f"Mock Document {doc_id}"
+                })
+                    
+        return results
diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt
index 0f800fc..1ac3e1a 100644
--- a/python/ranker/requirements.txt
+++ b/python/ranker/requirements.txt
@@ -1 +1,4 @@
-flask==3.0.0
\ No newline at end of file
+flask
+git+https://github.com/twmht/python-rocksdb.git
+psycopg2-binary
+numpy
\ No newline at end of file

From 74a9ab315a9935ccc5392337bbd2128113ff0494 Mon Sep 17 00:00:00 2001
From: Digvijay Singh Rawat <hackerearthx1@gmail.com>
Date: Tue, 9 Dec 2025 04:29:47 +0530
Subject: [PATCH 2/9] feat: update indexer Dockerfile base image, mount shared
 data volume for ranker, and set API root to search index.

---
 API/config/routes.rb   | 2 +-
 cpp/indexer/Dockerfile | 2 +-
 docker-compose.yml     | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/API/config/routes.rb b/API/config/routes.rb
index 3e6e365..4fe4af7 100644
--- a/API/config/routes.rb
+++ b/API/config/routes.rb
@@ -8,5 +8,5 @@
   get "/search", to: "search#index"
 
   # Defines the root path route ("/")
-  # root "posts#index"
+  root "search#index"
 end
diff --git a/cpp/indexer/Dockerfile b/cpp/indexer/Dockerfile
index c3b3ab4..bb61cec 100644
--- a/cpp/indexer/Dockerfile
+++ b/cpp/indexer/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:latest
+FROM ubuntu:22.04
 
 RUN apt-get update && apt-get install -y \
     build-essential \
diff --git a/docker-compose.yml b/docker-compose.yml
index 3aa0f37..e90c617 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -29,6 +29,7 @@ services:
       - "5000:5000"
     volumes:
       - ./python/ranker:/app # Hot-reloading for Python
+      - ./data/crawled_pages:/shared_data
     environment:
       - FLASK_ENV=development
     networks:

From 5ffcbbcb0e43230c16bab0949d54f6760c5a8388 Mon Sep 17 00:00:00 2001
From: Digvijay Singh Rawat <hackerearthx1@gmail.com>
Date: Tue, 9 Dec 2025 04:42:15 +0530
Subject: [PATCH 3/9] build: Add RocksDB system dependencies to CI and fix
 average document length default in engine.

---
 .github/workflows/ci.yml       | 5 +++++
 python/ranker/engine.py        | 4 ++--
 python/ranker/requirements.txt | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b7b0045..6fe57ed 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -127,6 +127,11 @@ jobs:
         python-version: '3.9'
         cache: 'pip'
         
+    - name: Install System Dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y librocksdb-dev zlib1g-dev libbz2-dev liblz4-dev libsnappy-dev libzstd-dev
+
     - name: Install Dependencies
       run: pip install -r requirements.txt
       
diff --git a/python/ranker/engine.py b/python/ranker/engine.py
index 01419ef..f918ea9 100644
--- a/python/ranker/engine.py
+++ b/python/ranker/engine.py
@@ -56,7 +56,7 @@ def _calculate_avgdl(self):
             with self.db_conn.cursor() as cur:
                 cur.execute("SELECT AVG(doc_length) FROM documents")
                 avg = cur.fetchone()[0]
-                return float(avg) if avg else 0.0
+                return float(avg) if avg else 100.0  # Default to 100 to avoid dividing by 0
         except Exception as e:
             print(f"Error calculating avgdl: {e}")
             return 100.0
@@ -88,7 +88,7 @@ def search(self, query, k=10):
                         postings_str = val.decode('utf-8')
                 except Exception as e:
                     print(f"Error fetching token {token}: {e}")
-            elif not ROCKSDB_AVAILABLE:
+            else:
                 # Fallback to mock
                 postings_str = self.mock_index.get(token)
 
diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt
index 1ac3e1a..ebb53d0 100644
--- a/python/ranker/requirements.txt
+++ b/python/ranker/requirements.txt
@@ -1,3 +1,4 @@
+Cython<3.0
 flask
 git+https://github.com/twmht/python-rocksdb.git
 psycopg2-binary

From fac0362be659d4cf46f44b1f958c3f48221f8efd Mon Sep 17 00:00:00 2001
From: Digvijay Singh Rawat <hackerearthx1@gmail.com>
Date: Tue, 9 Dec 2025 04:50:48 +0530
Subject: [PATCH 4/9] ci: Add pre-installation of build tools and
 `--no-build-isolation` for dependency installation.

---
 .github/workflows/ci.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6fe57ed..4804660 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -133,7 +133,9 @@ jobs:
         sudo apt-get install -y librocksdb-dev zlib1g-dev libbz2-dev liblz4-dev libsnappy-dev libzstd-dev
 
     - name: Install Dependencies
-      run: pip install -r requirements.txt
+      run: |
+        pip install "Cython<3.0" setuptools wheel
+        pip install --no-build-isolation -r requirements.txt
       
     - name: Syntax Check
       run: python -m compileall .

From 56b5236879d14732f49438cd1f131c58490da741 Mon Sep 17 00:00:00 2001
From: Digvijay Singh Rawat <hackerearthx1@gmail.com>
Date: Tue, 9 Dec 2025 05:03:26 +0530
Subject: [PATCH 5/9] feat: Enhance ranker's BM25 scoring with improved query
 parsing, dynamic document statistics, and robust database connection
 handling.

---
 docker-compose.yml             |   4 +
 python/ranker/app.py           |   2 +
 python/ranker/engine.py        | 130 +++++++++++++++++++++++++++++----
 python/ranker/requirements.txt |  10 +--
 4 files changed, 126 insertions(+), 20 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index e90c617..44769fb 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -32,6 +32,10 @@ services:
       - ./data/crawled_pages:/shared_data
     environment:
       - FLASK_ENV=development
+      - DB_HOST=postgres_service
+      - DB_NAME=search_engine
+      - DB_USER=admin
+      - DB_PASS=password123
     networks:
       - search_net
 
diff --git a/python/ranker/app.py b/python/ranker/app.py
index 6982c4d..3c5bf24 100644
--- a/python/ranker/app.py
+++ b/python/ranker/app.py
@@ -1,6 +1,7 @@
 from flask import Flask, jsonify, request
 from engine import Ranker
 import time
+import atexit
 
 app = Flask(__name__)
 
@@ -8,6 +9,7 @@
 ranker = None
 try:
     ranker = Ranker()
+    atexit.register(ranker.close)
 except Exception as e:
     print(f"Failed to initialize Ranker: {e}")
 
diff --git a/python/ranker/engine.py b/python/ranker/engine.py
index f918ea9..f5f89b2 100644
--- a/python/ranker/engine.py
+++ b/python/ranker/engine.py
@@ -1,4 +1,5 @@
 import os
+import re
 import psycopg2
 import numpy as np
 from collections import defaultdict
@@ -15,11 +16,19 @@ class Ranker:
     def __init__(self):
         # 1. Connect to Postgres (Metadata)
         try:
+            db_host = os.environ.get("DB_HOST", "postgres_service")
+            db_name = os.environ.get("DB_NAME", "search_engine")
+            db_user = os.environ.get("DB_USER")
+            db_pass = os.environ.get("DB_PASS")
+
+            if not db_user or not db_pass:
+                raise ValueError("DB_USER and DB_PASS environment variables must be set.")
+
             self.db_conn = psycopg2.connect(
-                host=os.environ.get("DB_HOST", "postgres_service"),
-                database=os.environ.get("DB_NAME", "search_engine"),
-                user=os.environ.get("DB_USER", "admin"),
-                password=os.environ.get("DB_PASS", "password123")
+                host=db_host,
+                database=db_name,
+                user=db_user,
+                password=db_pass
             )
             print("Connected to Postgres")
         except Exception as e:
@@ -45,9 +54,10 @@ def __init__(self):
             "cats": "3,4"
         }
         
-        # 3. Load Global Stats (avgdl)
+        # 3. Load Global Stats (avgdl, total_docs)
         self.avgdl = self._calculate_avgdl()
-        print(f"Ranker initialized. AvgDL: {self.avgdl}")
+        self.total_docs = self._get_total_docs()
+        print(f"Ranker initialized. AvgDL: {self.avgdl}, Total Docs: {self.total_docs}")
 
     def _calculate_avgdl(self):
         if not self.db_conn:
@@ -61,12 +71,60 @@ def _calculate_avgdl(self):
             print(f"Error calculating avgdl: {e}")
             return 100.0
 
+    def _get_total_docs(self):
+        if not self.db_conn:
+            return 1000 # Default
+        try:
+            with self.db_conn.cursor() as cur:
+                cur.execute("SELECT COUNT(*) FROM documents")
+                count = cur.fetchone()[0]
+                return int(count) if count else 0
+        except Exception as e:
+            print(f"Error fetching total docs: {e}")
+            return 1000
+
+    def _get_doc_lengths(self, doc_ids):
+        """
+        Fetches document lengths for a list of doc_ids.
+        Returns a dictionary: {doc_id: length}
+        """
+        if not self.db_conn or not doc_ids:
+            return {}
+        
+        lengths = {}
+        try:
+            with self.db_conn.cursor() as cur:
+                # Use tuple(doc_ids) for SQL IN clause
+                # Handle single item tuple correctly
+                if len(doc_ids) == 1:
+                    query = "SELECT id, length FROM documents WHERE id = %s"
+                    params = (doc_ids[0],)
+                else:
+                    query = "SELECT id, length FROM documents WHERE id IN %s"
+                    params = (tuple(doc_ids),)
+                
+                cur.execute(query, params)
+                rows = cur.fetchall()
+                for r in rows:
+                    lengths[r[0]] = r[1]
+        except Exception as e:
+            print(f"Error fetching doc lengths: {e}")
+        return lengths
+
     def search(self, query, k=10):
         """
         Performs BM25 search for the given query.
         Returns top k results: [{'url': ..., 'title': ..., 'score': ...}]
         """
-        tokens = query.lower().split() # Simple tokenization
+        # Preprocessing to match Indexer:
+        # 1. Lowercase
+        # 2. Remove non-alphanumeric (keep spaces)
+        # 3. Split by whitespace
+        # 4. Filter length >= 3
+        
+        query_clean = re.sub(r'[^a-z0-9\s]', '', query.lower())
+        tokens = [t for t in query_clean.split() if len(t) >= 3]
+        
         if not tokens:
             return []
 
@@ -77,6 +135,10 @@ def search(self, query, k=10):
         # Accumulate scores: doc_id -> score
         scores = defaultdict(float)
         
+        # 1. Retrieve all posting lists and candidate docs
+        token_postings = {} # token -> [doc_ids]
+        candidate_doc_ids = set()
+
         for token in tokens:
             # A. Get Posting List from RocksDB or Mock
             postings_str = None
@@ -101,20 +163,38 @@ def search(self, query, k=10):
                 postings_str = postings_str.decode('utf-8')
             
             doc_ids = [int(d) for d in postings_str.split(',')]
-            
+            token_postings[token] = doc_ids
+            candidate_doc_ids.update(doc_ids)
+
+        if not candidate_doc_ids:
+            return []
+
+        # 2. Batch fetch document lengths
+        doc_lengths = self._get_doc_lengths(list(candidate_doc_ids))
+
+        # 3. Calculate BM25 Scores
+        for token in tokens:
+            doc_ids = token_postings.get(token, [])
+            if not doc_ids:
+                continue
+
             # Calculate IDF
             # IDF(q_i) = log( (N - n(q_i) + 0.5) / (n(q_i) + 0.5) + 1 )
-            # For simplicity in this phase, we'll use a basic IDF or just count
-            # We need N (total docs)
-            N = 1000 # Placeholder or fetch from DB
+            N = self.total_docs
+            if N == 0: N = 1 # Clamp an empty documents table (COUNT(*) = 0) to N=1; the IDF denominator n_qi + 0.5 is never zero
+            
             n_qi = len(doc_ids)
             idf = np.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1)
             
             for doc_id in doc_ids:
-                # In a real implementation, we'd fetch doc_length and TF from the index/DB
-                # Here we do a simplified calculation
-                tf = 1 # Simplified
-                doc_len = 100 # Simplified placeholder
+                # TODO: Fetch real TF from index. Currently index only stores doc_ids.
+                # We assume TF=1 for now.
+                tf = 1 
+                
+                # Get doc_len, fallback to avgdl if missing (e.g. sync issue)
+                doc_len = doc_lengths.get(doc_id, self.avgdl)
+                if doc_len is None or doc_len == 0:
+                    doc_len = self.avgdl # Safety fallback
                 
                 # BM25 Score for this term
                 numerator = idf * tf * (k1 + 1)
@@ -152,3 +232,23 @@ def search(self, query, k=10):
                 })
                     
         return results
+
+    def close(self):
+        """Closes the database connection."""
+        if self.db_conn:
+            try:
+                self.db_conn.close()
+                print("Closed Postgres connection")
+            except Exception as e:
+                print(f"Error closing Postgres connection: {e}")
+            finally:
+                self.db_conn = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def __del__(self):
+        self.close()
diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt
index ebb53d0..fa07eef 100644
--- a/python/ranker/requirements.txt
+++ b/python/ranker/requirements.txt
@@ -1,5 +1,5 @@
-Cython<3.0
-flask
-git+https://github.com/twmht/python-rocksdb.git
-psycopg2-binary
-numpy
\ No newline at end of file
+Cython>=2.0.0,<3.0
+flask==3.1.2
+git+https://github.com/twmht/python-rocksdb.git@v0.7.0
+psycopg2-binary==2.9.11
+numpy==2.3.5
\ No newline at end of file

From 974e6a601a7a3589b055a2d3f06da88dd91fcc2b Mon Sep 17 00:00:00 2001
From: Digvijay Singh Rawat <hackerearthx1@gmail.com>
Date: Tue, 9 Dec 2025 05:11:13 +0530
Subject: [PATCH 6/9] build: revert ranker dependency version pins

---
 python/ranker/requirements.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt
index fa07eef..ebb53d0 100644
--- a/python/ranker/requirements.txt
+++ b/python/ranker/requirements.txt
@@ -1,5 +1,5 @@
-Cython>=2.0.0,<3.0
-flask==3.1.2
-git+https://github.com/twmht/python-rocksdb.git@v0.7.0
-psycopg2-binary==2.9.11
-numpy==2.3.5
\ No newline at end of file
+Cython<3.0
+flask
+git+https://github.com/twmht/python-rocksdb.git
+psycopg2-binary
+numpy
\ No newline at end of file

From e8a39885ecd3f82b5d68cc1a077e6632c1773072 Mon Sep 17 00:00:00 2001
From: Digvijay Singh Rawat <hackerearthx1@gmail.com>
Date: Tue, 9 Dec 2025 13:26:08 +0530
Subject: [PATCH 7/9] fix: Replace python-rocksdb with rocksdict to fix CI
 build

---
 python/ranker/engine.py        | 18 +++++++++++++-----
 python/ranker/requirements.txt |  2 +-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/python/ranker/engine.py b/python/ranker/engine.py
index f5f89b2..54ab263 100644
--- a/python/ranker/engine.py
+++ b/python/ranker/engine.py
@@ -4,13 +4,13 @@
 import numpy as np
 from collections import defaultdict
 
-# Try to import rocksdb, fallback to mock if failed (e.g. build issues)
+# Try to import rocksdict, fallback to mock if failed
 try:
-    import rocksdb
+    from rocksdict import Rdict, Options, AccessType
     ROCKSDB_AVAILABLE = True
 except ImportError:
     ROCKSDB_AVAILABLE = False
-    print("WARNING: python-rocksdb not available. Using Mock Index.")
+    print("WARNING: rocksdict not available. Using Mock Index.")
 
 class Ranker:
     def __init__(self):
@@ -41,9 +41,8 @@ def __init__(self):
         
         if ROCKSDB_AVAILABLE:
             try:
-                opts = rocksdb.Options()
                 # We only need read access
-                self.index_db = rocksdb.DB(rocksdb_path, opts, read_only=True)
+                self.index_db = Rdict(rocksdb_path, options=Options(), access_type=AccessType.read_only())
                 print(f"Opened RocksDB at {rocksdb_path}")
             except Exception as e:
                 print(f"Failed to open RocksDB: {e}")
@@ -235,6 +234,15 @@ def search(self, query, k=10):
 
     def close(self):
         """Closes the database connection."""
+        if self.index_db:
+            try:
+                self.index_db.close()
+                print("Closed RocksDB connection")
+            except Exception as e:
+                print(f"Error closing RocksDB connection: {e}")
+            finally:
+                self.index_db = None
+
         if self.db_conn:
             try:
                 self.db_conn.close()
diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt
index ebb53d0..77e6c3c 100644
--- a/python/ranker/requirements.txt
+++ b/python/ranker/requirements.txt
@@ -1,5 +1,5 @@
 Cython<3.0
 flask
-git+https://github.com/twmht/python-rocksdb.git
+rocksdict
 psycopg2-binary
 numpy
\ No newline at end of file

From 83ccbd609a5e5e3e0a5d080b37d5b10f2fc1d757 Mon Sep 17 00:00:00 2001
From: Digvijay Singh Rawat <hackerearthx1@gmail.com>
Date: Tue, 9 Dec 2025 13:30:00 +0530
Subject: [PATCH 8/9] feat: Add example environment file and update
 docker-compose to use environment variables for database configuration

---
 .env.example       |  3 +++
 README.md          | 23 +++++++++++++++++++++++
 docker-compose.yml | 18 +++++++++---------
 3 files changed, 35 insertions(+), 9 deletions(-)
 create mode 100644 .env.example

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..ca2d73a
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,3 @@
+DB_USER=admin
+DB_PASS=change_me_in_production
+DB_NAME=search_engine
diff --git a/README.md b/README.md
index 5d06fde..d6c3111 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,24 @@
 ![CodeRabbit Pull Request Reviews](https://img.shields.io/coderabbit/prs/github/Digvijay-x1/Search-Engine?utm_source=oss&utm_medium=github&utm_campaign=Digvijay-x1%2FSearch-Engine&labelColor=171717&color=FF570A&link=https%3A%2F%2Fcoderabbit.ai&label=CodeRabbit+Reviews)
+
+## Setup
+
+### Environment Variables
+
+This project uses environment variables for configuration, including database credentials.
+
+1.  Copy the example environment file:
+    ```bash
+    cp .env.example .env
+    ```
+2.  Edit `.env` and set your own secure passwords and configuration:
+    ```ini
+    DB_USER=admin
+    DB_PASS=your_secure_password
+    DB_NAME=search_engine
+    ```
+
+### Running with Docker
+
+```bash
+docker-compose up --build
+```
diff --git a/docker-compose.yml b/docker-compose.yml
index 44769fb..276c2ea 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,9 +13,9 @@ services:
   postgres_service:
     image: postgres:17.2-alpine3.21
     environment:
-      POSTGRES_USER: admin
-      POSTGRES_PASSWORD: password123
-      POSTGRES_DB: search_engine
+      POSTGRES_USER: ${DB_USER}
+      POSTGRES_PASSWORD: ${DB_PASS}
+      POSTGRES_DB: ${DB_NAME}
     volumes:
       - ./data/init.sql:/docker-entrypoint-initdb.d/init.sql # Runs on first startup
       - pg_data:/var/lib/postgresql/data # Persistence
@@ -33,9 +33,9 @@ services:
     environment:
       - FLASK_ENV=development
       - DB_HOST=postgres_service
-      - DB_NAME=search_engine
-      - DB_USER=admin
-      - DB_PASS=password123
+      - DB_NAME=${DB_NAME}
+      - DB_USER=${DB_USER}
+      - DB_PASS=${DB_PASS}
     networks:
       - search_net
 
@@ -70,9 +70,9 @@ services:
       - ./data/crawled_pages:/shared_data
     environment:
       - DB_HOST=postgres_service
-      - DB_NAME=search_engine
-      - DB_USER=admin
-      - DB_PASS=password123
+      - DB_NAME=${DB_NAME}
+      - DB_USER=${DB_USER}
+      - DB_PASS=${DB_PASS}
     depends_on:
       - redis_service
       - postgres_service

From 01f26835032ed640500851298aa9b8e9bd8f82cc Mon Sep 17 00:00:00 2001
From: Digvijay Singh Rawat <hackerearthx1@gmail.com>
Date: Tue, 9 Dec 2025 13:34:26 +0530
Subject: [PATCH 9/9] fix: Update SQL query to use 'doc_length' instead of
 'length' for document retrieval

---
 python/ranker/engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/ranker/engine.py b/python/ranker/engine.py
index 54ab263..ba275f9 100644
--- a/python/ranker/engine.py
+++ b/python/ranker/engine.py
@@ -96,10 +96,10 @@ def _get_doc_lengths(self, doc_ids):
                 # Use tuple(doc_ids) for SQL IN clause
                 # Handle single item tuple correctly
                 if len(doc_ids) == 1:
-                    query = "SELECT id, length FROM documents WHERE id = %s"
+                    query = "SELECT id, doc_length FROM documents WHERE id = %s"
                     params = (doc_ids[0],)
                 else:
-                    query = "SELECT id, length FROM documents WHERE id IN %s"
+                    query = "SELECT id, doc_length FROM documents WHERE id IN %s"
                     params = (tuple(doc_ids),)
                 
                 cur.execute(query, params)