Digvijay-x1 · Digvijay-x1 · Dec 9, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -127,6 +127,11 @@ jobs:
         python-version: '3.9'
         cache: 'pip'
 
+    - name: Install System Dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y librocksdb-dev zlib1g-dev libbz2-dev liblz4-dev libsnappy-dev libzstd-dev
+
     - name: Install Dependencies
       run: pip install -r requirements.txt
 

diff --git a/API/config/routes.rb b/API/config/routes.rb
@@ -8,5 +8,5 @@
   get "/search", to: "search#index"
 
   # Defines the root path route ("/")
-  # root "posts#index"
+  root "search#index"
 end
diff --git a/cpp/indexer/Dockerfile b/cpp/indexer/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:latest
+FROM ubuntu:22.04
 
 RUN apt-get update && apt-get install -y \
     build-essential \

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -29,6 +29,7 @@ services:
       - "5000:5000"
     volumes:
       - ./python/ranker:/app # Hot-reloading for Python
+      - ./data/crawled_pages:/shared_data
     environment:
       - FLASK_ENV=development
     networks:

diff --git a/python/ranker/Dockerfile b/python/ranker/Dockerfile
@@ -1,6 +1,28 @@
-FROM python:3.9-slim
+FROM ubuntu:22.04
+
+# Install system dependencies
+# python3-dev is needed for headers
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    python3-dev \
+    git \
+    build-essential \
+    librocksdb-dev \
+    libpq-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    liblz4-dev \
+    libsnappy-dev \
+    libzstd-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Point `python` at python3; -f keeps the build from failing with "File exists" if /usr/bin/python is already provided
+RUN ln -sf /usr/bin/python3 /usr/bin/python
+
 WORKDIR /app
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip3 install --no-cache-dir "Cython<3"
+RUN pip3 install --no-cache-dir -r requirements.txt
 COPY . .
-CMD ["python", "app.py"]
+CMD ["python3", "app.py"]
diff --git a/python/ranker/app.py b/python/ranker/app.py
@@ -1,23 +1,46 @@
 from flask import Flask, jsonify, request
+from engine import Ranker
+import time
 
 app = Flask(__name__)
 
-# Mock Data (The "Database")
-MOCK_INDEX = {
-    "computer": [{"id": 1, "title": "History of Computers"}, {"id": 2, "title": "Computer Science 101"}],
-    "cats": [{"id": 3, "title": "Funny Cats"}, {"id": 4, "title": "Cat Care"}]
-}
+# Initialize Ranker (Global Singleton)
+ranker = None
+try:
+    ranker = Ranker()
+except Exception as e:
+    print(f"Failed to initialize Ranker: {e}")
 
 @app.route('/health')
 def health():
-    return jsonify({"status": "healthy", "service": "ranker"})
+    status = "healthy" if ranker else "degraded"
+    return jsonify({"status": status, "service": "ranker"})
 
 @app.route('/search')
 def search():
+    global ranker
+    if not ranker:
+        # Fallback for dev/restart: retry if the module-level Ranker() initialization failed
+        try:
+            ranker = Ranker()
+        except Exception as e:
+            return jsonify({"error": f"Ranker not initialized: {str(e)}"}), 500
+
     query = request.args.get('q', '').lower()
     print(f"Received query: {query}")
-    results = MOCK_INDEX.get(query, [])
-    return jsonify({"query": query, "results": results})
+
+    start_time = time.time()
+    results = ranker.search(query)
+    duration_ms = (time.time() - start_time) * 1000
+
+    return jsonify({
+        "query": query, 
+        "results": results,
+        "meta": {
+            "count": len(results),
+            "latency_ms": round(duration_ms, 2)
+        }
+    })
 
 if __name__ == '__main__':
     # host='0.0.0.0' is CRITICAL for Docker networking

diff --git a/python/ranker/engine.py b/python/ranker/engine.py
@@ -0,0 +1,175 @@
+import os
+import psycopg2
+import numpy as np
+from collections import defaultdict
+
+# Try to import rocksdb, fallback to mock if failed (e.g. build issues)
+try:
+    import rocksdb
+    ROCKSDB_AVAILABLE = True
+except ImportError:
+    ROCKSDB_AVAILABLE = False
+    print("WARNING: python-rocksdb not available. Using Mock Index.")
+
+class Ranker:
+    """BM25-style ranker over a RocksDB inverted index with Postgres metadata.
+
+    Both backends are optional: a small in-memory mock index is used when
+    RocksDB cannot be opened, and mock metadata when Postgres is unreachable.
+    """
+
+    def __init__(self):
+        """Connect to Postgres, open the index read-only and cache ``avgdl``."""
+        # 1. Connect to Postgres (Metadata)
+        try:
+            self.db_conn = psycopg2.connect(
+                host=os.environ.get("DB_HOST", "postgres_service"),
+                database=os.environ.get("DB_NAME", "search_engine"),
+                user=os.environ.get("DB_USER", "admin"),
+                # NOTE(review): weak built-in default; set DB_PASS outside dev.
+                password=os.environ.get("DB_PASS", "password123")
+            )
+            # Only single read-only SELECTs run on this long-lived connection.
+            # Autocommit stops it sitting "idle in transaction" and stops one
+            # failed query from leaving every later query in an aborted
+            # transaction.
+            self.db_conn.autocommit = True
+            print("Connected to Postgres")
+        except Exception as e:
+            print(f"Failed to connect to Postgres: {e}")
+            self.db_conn = None
+
+        # 2. Open RocksDB (Inverted Index) - Read Only
+        rocksdb_path = os.environ.get("ROCKSDB_PATH", "/shared_data/search_index.db")
+        self.index_db = None
+        if ROCKSDB_AVAILABLE:
+            try:
+                opts = rocksdb.Options()
+                self.index_db = rocksdb.DB(rocksdb_path, opts, read_only=True)
+                print(f"Opened RocksDB at {rocksdb_path}")
+            except Exception as e:
+                print(f"Failed to open RocksDB: {e}")
+
+        # Mock index used when RocksDB is not open (same "id,id,..." format)
+        self.mock_index = {
+            "computer": "1,2",
+            "cats": "3,4"
+        }
+
+        # 3. Load Global Stats (avgdl)
+        self.avgdl = self._calculate_avgdl()
+        print(f"Ranker initialized. AvgDL: {self.avgdl}")
+
+    def _calculate_avgdl(self):
+        """Return the average document length, or 100.0 when it is unknown.
+
+        The fallback also covers NULL/zero averages so the length
+        normalisation in ``search`` never divides by zero.
+        """
+        if not self.db_conn:
+            return 100.0  # Default if DB not connected
+        try:
+            with self.db_conn.cursor() as cur:
+                cur.execute("SELECT AVG(doc_length) FROM documents")
+                avg = cur.fetchone()[0]
+                return float(avg) if avg else 100.0
+        except Exception as e:
+            print(f"Error calculating avgdl: {e}")
+            return 100.0
+
+    def _lookup_postings(self, token):
+        """Return the doc ids posted for ``token`` (empty list when none).
+
+        Postings are stored as "doc_id1,doc_id2,...". Blank entries (e.g. from
+        a trailing comma) and non-numeric entries are skipped, not raised.
+        """
+        if self.index_db:
+            try:
+                raw = self.index_db.get(token.encode('utf-8'))
+                postings_str = raw.decode('utf-8') if raw else ""
+            except Exception as e:
+                print(f"Error fetching token {token}: {e}")
+                return []
+        else:
+            # Fallback to mock
+            postings_str = self.mock_index.get(token) or ""
+            if isinstance(postings_str, bytes):
+                postings_str = postings_str.decode('utf-8')
+
+        doc_ids = []
+        for part in postings_str.split(','):
+            part = part.strip()
+            if not part:
+                continue
+            try:
+                doc_ids.append(int(part))
+            except ValueError:
+                print(f"Skipping malformed posting {part!r} for token {token}")
+        return doc_ids
+
+    def search(self, query, k=10):
+        """
+        Performs BM25 search for the given query.
+        Returns top k results: [{'id': ..., 'url': ..., 'score': ..., 'title': ...}]
+
+        The index stores no term frequencies or document lengths yet, so
+        TF=1, doc_len=100 and N=1000 below are placeholders.
+        """
+        tokens = query.lower().split()  # Simple tokenization
+        if not tokens:
+            return []
+
+        # BM25 constants and placeholder statistics
+        k1 = 1.5
+        b = 0.75
+        N = 1000  # Placeholder for total docs (TODO: fetch from DB)
+        tf = 1  # Simplified
+        doc_len = 100  # Simplified placeholder
+        # Loop-invariant for as long as tf and doc_len are constants.
+        denominator = tf + k1 * (1 - b + b * (doc_len / self.avgdl))
+
+        # Accumulate scores: doc_id -> score
+        scores = defaultdict(float)
+        for token in tokens:
+            doc_ids = self._lookup_postings(token)
+            if not doc_ids:
+                continue
+            # IDF(q_i) = log( (N - n(q_i) + 0.5) / (n(q_i) + 0.5) + 1 )
+            n_qi = len(doc_ids)
+            # float() keeps scores plain Python floats rather than numpy scalars
+            idf = float(np.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1))
+            term_score = idf * tf * (k1 + 1) / denominator
+            for doc_id in doc_ids:
+                scores[doc_id] += term_score
+
+        # Sort by score
+        sorted_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:k]
+
+        # Fetch Metadata for top results
+        results = []
+        if self.db_conn:
+            try:
+                with self.db_conn.cursor() as cur:
+                    for doc_id, score in sorted_docs:
+                        cur.execute("SELECT url FROM documents WHERE id = %s", (doc_id,))
+                        row = cur.fetchone()
+                        if row:
+                            results.append({
+                                "id": doc_id,
+                                "url": row[0],
+                                "score": score,
+                                "title": row[0] # Use URL as title for now
+                            })
+            except Exception as e:
+                print(f"Error fetching metadata: {e}")
+        else:
+            # Fallback if DB is down
+            for doc_id, score in sorted_docs:
+                results.append({
+                    "id": doc_id,
+                    "url": f"http://mock-url.com/{doc_id}",
+                    "score": score,
+                    "title": f"Mock Document {doc_id}"
+                })
+
+        return results
diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt
@@ -1 +1,5 @@
-flask==3.0.0
+Cython<3.0
+flask
+git+https://github.com/twmht/python-rocksdb.git
+psycopg2-binary
+numpy