diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..ca2d73a --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +DB_USER=admin +DB_PASS=change_me_in_production +DB_NAME=search_engine diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b7b0045..4804660 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -127,8 +127,15 @@ jobs: python-version: '3.9' cache: 'pip' + - name: Install System Dependencies + run: | + sudo apt-get update + sudo apt-get install -y librocksdb-dev zlib1g-dev libbz2-dev liblz4-dev libsnappy-dev libzstd-dev + - name: Install Dependencies - run: pip install -r requirements.txt + run: | + pip install "Cython<3.0" setuptools wheel + pip install --no-build-isolation -r requirements.txt - name: Syntax Check run: python -m compileall . diff --git a/API/config/routes.rb b/API/config/routes.rb index 3e6e365..4fe4af7 100644 --- a/API/config/routes.rb +++ b/API/config/routes.rb @@ -8,5 +8,5 @@ get "/search", to: "search#index" # Defines the root path route ("/") - # root "posts#index" + root "search#index" end diff --git a/README.md b/README.md index 5d06fde..d6c3111 100644 --- a/README.md +++ b/README.md @@ -1 +1,24 @@ ![CodeRabbit Pull Request Reviews](https://img.shields.io/coderabbit/prs/github/Digvijay-x1/Search-Engine?utm_source=oss&utm_medium=github&utm_campaign=Digvijay-x1%2FSearch-Engine&labelColor=171717&color=FF570A&link=https%3A%2F%2Fcoderabbit.ai&label=CodeRabbit+Reviews) + +## Setup + +### Environment Variables + +This project uses environment variables for configuration, including database credentials. + +1. Copy the example environment file: + ```bash + cp .env.example .env + ``` +2. 
Edit `.env` and set your own secure passwords and configuration: + ```ini + DB_USER=admin + DB_PASS=your_secure_password + DB_NAME=search_engine + ``` + +### Running with Docker + +```bash +docker-compose up --build +``` diff --git a/cpp/indexer/Dockerfile b/cpp/indexer/Dockerfile index c3b3ab4..bb61cec 100644 --- a/cpp/indexer/Dockerfile +++ b/cpp/indexer/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:latest +FROM ubuntu:22.04 RUN apt-get update && apt-get install -y \ build-essential \ diff --git a/docker-compose.yml b/docker-compose.yml index 3aa0f37..276c2ea 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,9 +13,9 @@ services: postgres_service: image: postgres:17.2-alpine3.21 environment: - POSTGRES_USER: admin - POSTGRES_PASSWORD: password123 - POSTGRES_DB: search_engine + POSTGRES_USER: ${DB_USER} + POSTGRES_PASSWORD: ${DB_PASS} + POSTGRES_DB: ${DB_NAME} volumes: - ./data/init.sql:/docker-entrypoint-initdb.d/init.sql # Runs on first startup - pg_data:/var/lib/postgresql/data # Persistence @@ -29,8 +29,13 @@ services: - "5000:5000" volumes: - ./python/ranker:/app # Hot-reloading for Python + - ./data/crawled_pages:/shared_data environment: - FLASK_ENV=development + - DB_HOST=postgres_service + - DB_NAME=${DB_NAME} + - DB_USER=${DB_USER} + - DB_PASS=${DB_PASS} networks: - search_net @@ -65,9 +70,9 @@ services: - ./data/crawled_pages:/shared_data environment: - DB_HOST=postgres_service - - DB_NAME=search_engine - - DB_USER=admin - - DB_PASS=password123 + - DB_NAME=${DB_NAME} + - DB_USER=${DB_USER} + - DB_PASS=${DB_PASS} depends_on: - redis_service - postgres_service diff --git a/python/ranker/Dockerfile b/python/ranker/Dockerfile index 8273e75..9b94ae3 100644 --- a/python/ranker/Dockerfile +++ b/python/ranker/Dockerfile @@ -1,6 +1,28 @@ -FROM python:3.9-slim +FROM ubuntu:22.04 + +# Install system dependencies +# python3-dev is needed for headers +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + python3-dev \ + git \ + 
from flask import Flask, jsonify, request
from engine import Ranker
import time
import atexit

app = Flask(__name__)

# Global Ranker singleton; None until successfully initialized.
ranker = None


def _init_ranker():
    """Create the global Ranker and register its cleanup exactly once.

    Raises whatever Ranker() raises so callers can decide how to degrade.
    """
    global ranker
    ranker = Ranker()
    # Register close() for every successfully-created instance so RocksDB /
    # Postgres handles are released on shutdown (the old lazy path skipped this).
    atexit.register(ranker.close)


try:
    _init_ranker()
except Exception as e:
    print(f"Failed to initialize Ranker: {e}")


@app.route('/health')
def health():
    """Liveness probe: 'healthy' when the Ranker is up, else 'degraded'."""
    status = "healthy" if ranker else "degraded"
    return jsonify({"status": status, "service": "ranker"})


@app.route('/search')
def search():
    """Run a BM25 search for ?q=... and return results plus latency metadata."""
    if not ranker:
        # Lazy retry: startup initialization failed (e.g. the database was not
        # up yet when the container started).
        try:
            _init_ranker()
        except Exception as e:
            return jsonify({"error": f"Ranker not initialized: {str(e)}"}), 500

    query = request.args.get('q', '').lower()
    print(f"Received query: {query}")

    start_time = time.time()
    results = ranker.search(query)
    duration_ms = (time.time() - start_time) * 1000

    return jsonify({
        "query": query,
        "results": results,
        "meta": {
            "count": len(results),
            "latency_ms": round(duration_ms, 2)
        }
    })


if __name__ == '__main__':
    # host='0.0.0.0' is CRITICAL for Docker networking; port 5000 matches the
    # docker-compose port mapping.
    app.run(host='0.0.0.0', port=5000, debug=True)
import os
import re
import numpy as np
from collections import defaultdict

# Optional native dependencies. Both follow the same graceful-fallback
# convention: if the library is missing, the service boots in a degraded mode
# (mock index / default metadata) instead of crashing at import time.
try:
    import psycopg2
    PSYCOPG2_AVAILABLE = True
except ImportError:
    PSYCOPG2_AVAILABLE = False
    print("WARNING: psycopg2 not available. Postgres metadata disabled.")

try:
    from rocksdict import Rdict, Options, AccessType
    ROCKSDB_AVAILABLE = True
except ImportError:
    ROCKSDB_AVAILABLE = False
    print("WARNING: rocksdict not available. Using Mock Index.")


class Ranker:
    """BM25 ranker over a RocksDB inverted index with Postgres metadata.

    Degrades gracefully: without RocksDB it serves a tiny in-memory mock
    index; without Postgres it falls back to default corpus statistics and
    mock result metadata.
    """

    def __init__(self):
        # 1. Connect to Postgres (document metadata: lengths, URLs).
        self.db_conn = None
        if PSYCOPG2_AVAILABLE:
            try:
                db_host = os.environ.get("DB_HOST", "postgres_service")
                db_name = os.environ.get("DB_NAME", "search_engine")
                db_user = os.environ.get("DB_USER")
                db_pass = os.environ.get("DB_PASS")

                if not db_user or not db_pass:
                    raise ValueError("DB_USER and DB_PASS environment variables must be set.")

                self.db_conn = psycopg2.connect(
                    host=db_host,
                    database=db_name,
                    user=db_user,
                    password=db_pass
                )
                print("Connected to Postgres")
            except Exception as e:
                print(f"Failed to connect to Postgres: {e}")
                self.db_conn = None

        # 2. Open RocksDB (inverted index) read-only.
        rocksdb_path = os.environ.get("ROCKSDB_PATH", "/shared_data/search_index.db")
        self.index_db = None

        if ROCKSDB_AVAILABLE:
            try:
                # raw_mode=True: the index is written by the C++ indexer as
                # plain bytes; rocksdict's default (pickled) encoding cannot
                # read keys/values it did not write itself.
                self.index_db = Rdict(
                    rocksdb_path,
                    options=Options(raw_mode=True),
                    access_type=AccessType.read_only()
                )
                print(f"Opened RocksDB at {rocksdb_path}")
            except Exception as e:
                print(f"Failed to open RocksDB: {e}")

        # Mock Index for fallback (token -> comma-separated doc ids).
        self.mock_index = {
            "computer": "1,2",
            "cats": "3,4"
        }

        # 3. Load Global Stats (avgdl, total_docs) required by BM25.
        self.avgdl = self._calculate_avgdl()
        self.total_docs = self._get_total_docs()
        print(f"Ranker initialized. AvgDL: {self.avgdl}, Total Docs: {self.total_docs}")

    def _calculate_avgdl(self):
        """Return the average document length (100.0 when unavailable)."""
        if not self.db_conn:
            return 100.0  # Default if DB not connected
        try:
            with self.db_conn.cursor() as cur:
                cur.execute("SELECT AVG(doc_length) FROM documents")
                avg = cur.fetchone()[0]
                # Default to 100 so BM25 never divides by zero on an empty table.
                return float(avg) if avg else 100.0
        except Exception as e:
            print(f"Error calculating avgdl: {e}")
            return 100.0

    def _get_total_docs(self):
        """Return the total number of indexed documents (1000 when unavailable)."""
        if not self.db_conn:
            return 1000  # Default
        try:
            with self.db_conn.cursor() as cur:
                cur.execute("SELECT COUNT(*) FROM documents")
                count = cur.fetchone()[0]
                return int(count) if count else 0
        except Exception as e:
            print(f"Error fetching total docs: {e}")
            return 1000

    def _get_doc_lengths(self, doc_ids):
        """Fetch document lengths for a list of doc_ids.

        Returns a dictionary: {doc_id: length}; empty when the DB is down.
        """
        if not self.db_conn or not doc_ids:
            return {}

        lengths = {}
        try:
            with self.db_conn.cursor() as cur:
                # psycopg2 adapts a Python tuple to a parenthesized SQL list,
                # including the single-element case, so no special-casing needed.
                cur.execute(
                    "SELECT id, doc_length FROM documents WHERE id IN %s",
                    (tuple(doc_ids),)
                )
                for doc_id, length in cur.fetchall():
                    lengths[doc_id] = length
        except Exception as e:
            print(f"Error fetching doc lengths: {e}")
        return lengths

    def search(self, query, k=10):
        """
        Performs BM25 search for the given query.
        Returns top k results: [{'id': ..., 'url': ..., 'title': ..., 'score': ...}]
        """
        # Preprocessing must match the Indexer:
        # lowercase -> strip non-alphanumerics -> split -> keep tokens len >= 3.
        query_clean = re.sub(r'[^a-z0-9\s]', '', query.lower())
        tokens = [t for t in query_clean.split() if len(t) >= 3]

        if not tokens:
            return []

        # BM25 Constants
        k1 = 1.5
        b = 0.75

        # Accumulate scores: doc_id -> score
        scores = defaultdict(float)

        # 1. Retrieve all posting lists and candidate docs.
        token_postings = {}  # token -> [doc_ids]
        candidate_doc_ids = set()

        for token in tokens:
            postings_str = None

            if self.index_db:
                try:
                    postings_str = self.index_db.get(token.encode('utf-8'))
                except Exception as e:
                    print(f"Error fetching token {token}: {e}")
            else:
                # Fallback to mock index.
                postings_str = self.mock_index.get(token)

            if not postings_str:
                continue
            if isinstance(postings_str, bytes):
                postings_str = postings_str.decode('utf-8')

            # Format: "doc_id1,doc_id2,..." (no TF stored yet; TF=1 assumed).
            # Skip empty segments so a stray trailing comma cannot crash parsing.
            doc_ids = [int(d) for d in postings_str.split(',') if d.strip()]
            if not doc_ids:
                continue
            token_postings[token] = doc_ids
            candidate_doc_ids.update(doc_ids)

        if not candidate_doc_ids:
            return []

        # 2. Batch fetch document lengths.
        doc_lengths = self._get_doc_lengths(list(candidate_doc_ids))

        # 3. Calculate BM25 scores. Iterating `tokens` (not the dict) keeps the
        # original behavior of weighting repeated query terms once per occurrence.
        N = self.total_docs or 1  # guard against an empty corpus
        for token in tokens:
            doc_ids = token_postings.get(token, [])
            if not doc_ids:
                continue

            # IDF(q_i) = log( (N - n(q_i) + 0.5) / (n(q_i) + 0.5) + 1 )
            n_qi = len(doc_ids)
            idf = np.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1)

            for doc_id in doc_ids:
                # TODO: fetch real TF once the index stores frequencies.
                tf = 1

                # Fall back to avgdl when the length is missing (index/DB sync
                # gap) or zero, so the denominator is never zero.
                doc_len = doc_lengths.get(doc_id) or self.avgdl

                numerator = idf * tf * (k1 + 1)
                denominator = tf + k1 * (1 - b + b * (doc_len / self.avgdl))
                scores[doc_id] += numerator / denominator

        # Sort by score, keep top k.
        sorted_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:k]

        # 4. Resolve metadata for the winners (mock data if Postgres is down).
        results = []
        if self.db_conn:
            try:
                with self.db_conn.cursor() as cur:
                    for doc_id, score in sorted_docs:
                        cur.execute("SELECT url FROM documents WHERE id = %s", (doc_id,))
                        row = cur.fetchone()
                        if row:
                            results.append({
                                "id": doc_id,
                                "url": row[0],
                                "score": score,
                                "title": row[0]  # Use URL as title for now
                            })
            except Exception as e:
                print(f"Error fetching metadata: {e}")
        else:
            # Fallback if DB is down.
            for doc_id, score in sorted_docs:
                results.append({
                    "id": doc_id,
                    "url": f"http://mock-url.com/{doc_id}",
                    "score": score,
                    "title": f"Mock Document {doc_id}"
                })

        return results

    def close(self):
        """Closes the RocksDB and Postgres handles; safe to call repeatedly."""
        if self.index_db:
            try:
                self.index_db.close()
                print("Closed RocksDB connection")
            except Exception as e:
                print(f"Error closing RocksDB connection: {e}")
            finally:
                self.index_db = None

        if self.db_conn:
            try:
                self.db_conn.close()
                print("Closed Postgres connection")
            except Exception as e:
                print(f"Error closing Postgres connection: {e}")
            finally:
                self.db_conn = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __del__(self):
        # Best-effort cleanup; close() is idempotent.
        self.close()