From 0a79b106e8fd3b9a9b9a36bc9c9a5af34afac716 Mon Sep 17 00:00:00 2001 From: Digvijay Singh Rawat Date: Tue, 9 Dec 2025 04:04:27 +0530 Subject: [PATCH 1/9] feat: Add BM25 ranking engine with PostgreSQL and RocksDB integration, updating Docker and app to use it. --- python/ranker/Dockerfile | 28 +++++- python/ranker/app.py | 39 +++++++-- python/ranker/engine.py | 154 +++++++++++++++++++++++++++++++++ python/ranker/requirements.txt | 5 +- 4 files changed, 214 insertions(+), 12 deletions(-) create mode 100644 python/ranker/engine.py diff --git a/python/ranker/Dockerfile b/python/ranker/Dockerfile index 8273e75..9b94ae3 100644 --- a/python/ranker/Dockerfile +++ b/python/ranker/Dockerfile @@ -1,6 +1,28 @@ -FROM python:3.9-slim +FROM ubuntu:22.04 + +# Install system dependencies +# python3-dev is needed for headers +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + python3-dev \ + git \ + build-essential \ + librocksdb-dev \ + libpq-dev \ + zlib1g-dev \ + libbz2-dev \ + liblz4-dev \ + libsnappy-dev \ + libzstd-dev \ + && rm -rf /var/lib/apt/lists/* + +# Create a symlink for python if needed, though python3 is standard +RUN ln -s /usr/bin/python3 /usr/bin/python + WORKDIR /app COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt +RUN pip3 install --no-cache-dir "Cython<3" +RUN pip3 install --no-cache-dir -r requirements.txt COPY . . -CMD ["python", "app.py"] \ No newline at end of file +CMD ["python3", "app.py"] \ No newline at end of file diff --git a/python/ranker/app.py b/python/ranker/app.py index dc829e2..6982c4d 100644 --- a/python/ranker/app.py +++ b/python/ranker/app.py @@ -1,23 +1,46 @@ from flask import Flask, jsonify, request +from engine import Ranker +import time app = Flask(__name__) -# Mock Data (The "Database") -MOCK_INDEX = { - "computer": [{"id": 1, "title": "History of Computers"}, {"id": 2, "title": "Computer Science 101"}], - "cats": [{"id": 3, "title": "Funny Cats"}, {"id": 4, "title": "Cat Care"}] -} +# Initialize Ranker (Global Singleton) +ranker = None +try: + ranker = Ranker() +except Exception as e: + print(f"Failed to initialize Ranker: {e}") @app.route('/health') def health(): - return jsonify({"status": "healthy", "service": "ranker"}) + status = "healthy" if ranker else "degraded" + return jsonify({"status": status, "service": "ranker"}) @app.route('/search') def search(): + global ranker + if not ranker: + # Fallback for dev/restart if before_first_request didn't fire or failed + try: + ranker = Ranker() + except Exception as e: + return jsonify({"error": f"Ranker not initialized: {str(e)}"}), 500 + query = request.args.get('q', '').lower() print(f"Received query: {query}") - results = MOCK_INDEX.get(query, []) - return jsonify({"query": query, "results": results}) + + start_time = time.time() + results = ranker.search(query) + duration_ms = (time.time() - start_time) * 1000 + + return jsonify({ + "query": query, + "results": results, + "meta": { + "count": len(results), + "latency_ms": round(duration_ms, 2) + } + }) if __name__ == '__main__': # host='0.0.0.0' is CRITICAL for Docker networking diff --git a/python/ranker/engine.py b/python/ranker/engine.py new file mode 100644 index 0000000..01419ef --- /dev/null +++ b/python/ranker/engine.py @@ -0,0 +1,154 @@ +import os +import psycopg2 +import numpy as np +from collections import defaultdict + +# Try to import rocksdb, fallback to mock if failed (e.g. build issues) +try: + import rocksdb + ROCKSDB_AVAILABLE = True +except ImportError: + ROCKSDB_AVAILABLE = False + print("WARNING: python-rocksdb not available. Using Mock Index.") + +class Ranker: + def __init__(self): + # 1. Connect to Postgres (Metadata) + try: + self.db_conn = psycopg2.connect( + host=os.environ.get("DB_HOST", "postgres_service"), + database=os.environ.get("DB_NAME", "search_engine"), + user=os.environ.get("DB_USER", "admin"), + password=os.environ.get("DB_PASS", "password123") + ) + print("Connected to Postgres") + except Exception as e: + print(f"Failed to connect to Postgres: {e}") + self.db_conn = None + + # 2. Open RocksDB (Inverted Index) - Read Only + rocksdb_path = os.environ.get("ROCKSDB_PATH", "/shared_data/search_index.db") + self.index_db = None + + if ROCKSDB_AVAILABLE: + try: + opts = rocksdb.Options() + # We only need read access + self.index_db = rocksdb.DB(rocksdb_path, opts, read_only=True) + print(f"Opened RocksDB at {rocksdb_path}") + except Exception as e: + print(f"Failed to open RocksDB: {e}") + + # Mock Index for fallback + self.mock_index = { + "computer": "1,2", + "cats": "3,4" + } + + # 3. Load Global Stats (avgdl) + self.avgdl = self._calculate_avgdl() + print(f"Ranker initialized. AvgDL: {self.avgdl}") + + def _calculate_avgdl(self): + if not self.db_conn: + return 100.0 # Default if DB not connected + try: + with self.db_conn.cursor() as cur: + cur.execute("SELECT AVG(doc_length) FROM documents") + avg = cur.fetchone()[0] + return float(avg) if avg else 0.0 + except Exception as e: + print(f"Error calculating avgdl: {e}") + return 100.0 + + def search(self, query, k=10): + """ + Performs BM25 search for the given query. + Returns top k results: [{'url': ..., 'title': ..., 'score': ...}] + """ + tokens = query.lower().split() # Simple tokenization + if not tokens: + return [] + + # BM25 Constants + k1 = 1.5 + b = 0.75 + + # Accumulate scores: doc_id -> score + scores = defaultdict(float) + + for token in tokens: + # A. Get Posting List from RocksDB or Mock + postings_str = None + + if self.index_db: + try: + val = self.index_db.get(token.encode('utf-8')) + if val: + postings_str = val.decode('utf-8') + except Exception as e: + print(f"Error fetching token {token}: {e}") + elif not ROCKSDB_AVAILABLE: + # Fallback to mock + postings_str = self.mock_index.get(token) + + if not postings_str: + continue + + # Format: "doc_id1,doc_id2,..." (Simplified for now, ideally should have TF) + # For this phase, we assume TF=1 for all occurrences in the simplified index + if isinstance(postings_str, bytes): + postings_str = postings_str.decode('utf-8') + + doc_ids = [int(d) for d in postings_str.split(',')] + + # Calculate IDF + # IDF(q_i) = log( (N - n(q_i) + 0.5) / (n(q_i) + 0.5) + 1 ) + # For simplicity in this phase, we'll use a basic IDF or just count + # We need N (total docs) + N = 1000 # Placeholder or fetch from DB + n_qi = len(doc_ids) + idf = np.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1) + + for doc_id in doc_ids: + # In a real implementation, we'd fetch doc_length and TF from the index/DB + # Here we do a simplified calculation + tf = 1 # Simplified + doc_len = 100 # Simplified placeholder + + # BM25 Score for this term + numerator = idf * tf * (k1 + 1) + denominator = tf + k1 * (1 - b + b * (doc_len / self.avgdl)) + scores[doc_id] += numerator / denominator + + # Sort by score + sorted_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:k] + + # Fetch Metadata for top results + results = [] + if self.db_conn: + try: + with self.db_conn.cursor() as cur: + for doc_id, score in sorted_docs: + cur.execute("SELECT url FROM documents WHERE id = %s", (doc_id,)) + row = cur.fetchone() + if row: + results.append({ + "id": doc_id, + "url": row[0], + "score": score, + "title": row[0] # Use URL as title for now + }) + except Exception as e: + print(f"Error fetching metadata: {e}") + else: + # Fallback if DB is down + for doc_id, score in sorted_docs: + results.append({ + "id": doc_id, + "url": f"http://mock-url.com/{doc_id}", + "score": score, + "title": f"Mock Document {doc_id}" + }) + + return results diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt index 0f800fc..1ac3e1a 100644 --- a/python/ranker/requirements.txt +++ b/python/ranker/requirements.txt @@ -1 +1,4 @@ -flask==3.0.0 \ No newline at end of file +flask +git+https://github.com/twmht/python-rocksdb.git +psycopg2-binary +numpy \ No newline at end of file From 74a9ab315a9935ccc5392337bbd2128113ff0494 Mon Sep 17 00:00:00 2001 From: Digvijay Singh Rawat Date: Tue, 9 Dec 2025 04:29:47 +0530 Subject: [PATCH 2/9] feat: update indexer Dockerfile base image, mount shared data volume for ranker, and set API root to search index. --- API/config/routes.rb | 2 +- cpp/indexer/Dockerfile | 2 +- docker-compose.yml | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/API/config/routes.rb b/API/config/routes.rb index 3e6e365..4fe4af7 100644 --- a/API/config/routes.rb +++ b/API/config/routes.rb @@ -8,5 +8,5 @@ get "/search", to: "search#index" # Defines the root path route ("/") - # root "posts#index" + root "search#index" end diff --git a/cpp/indexer/Dockerfile b/cpp/indexer/Dockerfile index c3b3ab4..bb61cec 100644 --- a/cpp/indexer/Dockerfile +++ b/cpp/indexer/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:latest +FROM ubuntu:22.04 RUN apt-get update && apt-get install -y \ build-essential \ diff --git a/docker-compose.yml b/docker-compose.yml index 3aa0f37..e90c617 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -29,6 +29,7 @@ services: - "5000:5000" volumes: - ./python/ranker:/app # Hot-reloading for Python + - ./data/crawled_pages:/shared_data environment: - FLASK_ENV=development networks: From 5ffcbbcb0e43230c16bab0949d54f6760c5a8388 Mon Sep 17 00:00:00 2001 From: Digvijay Singh Rawat Date: Tue, 9 Dec 2025 04:42:15 +0530 Subject: [PATCH 3/9] build: Add RocksDB system dependencies to CI and fix average document length default in engine. --- .github/workflows/ci.yml | 5 +++++ python/ranker/engine.py | 4 ++-- python/ranker/requirements.txt | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b7b0045..6fe57ed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -127,6 +127,11 @@ jobs: python-version: '3.9' cache: 'pip' + - name: Install System Dependencies + run: | + sudo apt-get update + sudo apt-get install -y librocksdb-dev zlib1g-dev libbz2-dev liblz4-dev libsnappy-dev libzstd-dev + - name: Install Dependencies run: pip install -r requirements.txt diff --git a/python/ranker/engine.py b/python/ranker/engine.py index 01419ef..f918ea9 100644 --- a/python/ranker/engine.py +++ b/python/ranker/engine.py @@ -56,7 +56,7 @@ def _calculate_avgdl(self): with self.db_conn.cursor() as cur: cur.execute("SELECT AVG(doc_length) FROM documents") avg = cur.fetchone()[0] - return float(avg) if avg else 0.0 + return float(avg) if avg else 100.0 #Default to 100 to avoid dividing by 0 except Exception as e: print(f"Error calculating avgdl: {e}") return 100.0 @@ -88,7 +88,7 @@ def search(self, query, k=10): postings_str = val.decode('utf-8') except Exception as e: print(f"Error fetching token {token}: {e}") - elif not ROCKSDB_AVAILABLE: + else: # Fallback to mock postings_str = self.mock_index.get(token) diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt index 1ac3e1a..ebb53d0 100644 --- a/python/ranker/requirements.txt +++ b/python/ranker/requirements.txt @@ -1,3 +1,4 @@ +Cython<3.0 flask git+https://github.com/twmht/python-rocksdb.git psycopg2-binary From fac0362be659d4cf46f44b1f958c3f48221f8efd Mon Sep 17 00:00:00 2001 From: Digvijay Singh Rawat Date: Tue, 9 Dec 2025 04:50:48 +0530 Subject: [PATCH 4/9] ci: Add pre-installation of build tools and `--no-build-isolation` for dependency installation. --- .github/workflows/ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6fe57ed..4804660 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -133,7 +133,9 @@ jobs: sudo apt-get install -y librocksdb-dev zlib1g-dev libbz2-dev liblz4-dev libsnappy-dev libzstd-dev - name: Install Dependencies - run: pip install -r requirements.txt + run: | + pip install "Cython<3.0" setuptools wheel + pip install --no-build-isolation -r requirements.txt - name: Syntax Check run: python -m compileall . From 56b5236879d14732f49438cd1f131c58490da741 Mon Sep 17 00:00:00 2001 From: Digvijay Singh Rawat Date: Tue, 9 Dec 2025 05:03:26 +0530 Subject: [PATCH 5/9] feat: Enhance ranker's BM25 scoring with improved query parsing, dynamic document statistics, and robust database connection handling. --- docker-compose.yml | 4 + python/ranker/app.py | 2 + python/ranker/engine.py | 130 +++++++++++++++++++++++++++++---- python/ranker/requirements.txt | 10 +-- 4 files changed, 126 insertions(+), 20 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index e90c617..44769fb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -32,6 +32,10 @@ services: - ./data/crawled_pages:/shared_data environment: - FLASK_ENV=development + - DB_HOST=postgres_service + - DB_NAME=search_engine + - DB_USER=admin + - DB_PASS=password123 networks: - search_net diff --git a/python/ranker/app.py b/python/ranker/app.py index 6982c4d..3c5bf24 100644 --- a/python/ranker/app.py +++ b/python/ranker/app.py @@ -1,6 +1,7 @@ from flask import Flask, jsonify, request from engine import Ranker import time +import atexit app = Flask(__name__) @@ -8,6 +9,7 @@ ranker = None try: ranker = Ranker() + atexit.register(ranker.close) except Exception as e: print(f"Failed to initialize Ranker: {e}") diff --git a/python/ranker/engine.py b/python/ranker/engine.py index f918ea9..f5f89b2 100644 --- a/python/ranker/engine.py +++ b/python/ranker/engine.py @@ -1,4 +1,5 @@ import os +import re import psycopg2 import numpy as np from collections import defaultdict @@ -15,11 +16,19 @@ class Ranker: def __init__(self): # 1. Connect to Postgres (Metadata) try: + db_host = os.environ.get("DB_HOST", "postgres_service") + db_name = os.environ.get("DB_NAME", "search_engine") + db_user = os.environ.get("DB_USER") + db_pass = os.environ.get("DB_PASS") + + if not db_user or not db_pass: + raise ValueError("DB_USER and DB_PASS environment variables must be set.") + self.db_conn = psycopg2.connect( - host=os.environ.get("DB_HOST", "postgres_service"), - database=os.environ.get("DB_NAME", "search_engine"), - user=os.environ.get("DB_USER", "admin"), - password=os.environ.get("DB_PASS", "password123") + host=db_host, + database=db_name, + user=db_user, + password=db_pass ) print("Connected to Postgres") except Exception as e: @@ -45,9 +54,10 @@ def __init__(self): "cats": "3,4" } - # 3. Load Global Stats (avgdl) + # 3. Load Global Stats (avgdl, total_docs) self.avgdl = self._calculate_avgdl() - print(f"Ranker initialized. AvgDL: {self.avgdl}") + self.total_docs = self._get_total_docs() + print(f"Ranker initialized. AvgDL: {self.avgdl}, Total Docs: {self.total_docs}") def _calculate_avgdl(self): if not self.db_conn: @@ -61,12 +71,60 @@ def _calculate_avgdl(self): print(f"Error calculating avgdl: {e}") return 100.0 + def _get_total_docs(self): + if not self.db_conn: + return 1000 # Default + try: + with self.db_conn.cursor() as cur: + cur.execute("SELECT COUNT(*) FROM documents") + count = cur.fetchone()[0] + return int(count) if count else 0 + except Exception as e: + print(f"Error fetching total docs: {e}") + return 1000 + + def _get_doc_lengths(self, doc_ids): + """ + Fetches document lengths for a list of doc_ids. + Returns a dictionary: {doc_id: length} + """ + if not self.db_conn or not doc_ids: + return {} + + lengths = {} + try: + with self.db_conn.cursor() as cur: + # Use tuple(doc_ids) for SQL IN clause + # Handle single item tuple correctly + if len(doc_ids) == 1: + query = "SELECT id, length FROM documents WHERE id = %s" + params = (doc_ids[0],) + else: + query = "SELECT id, length FROM documents WHERE id IN %s" + params = (tuple(doc_ids),) + + cur.execute(query, params) + rows = cur.fetchall() + for r in rows: + lengths[r[0]] = r[1] + except Exception as e: + print(f"Error fetching doc lengths: {e}") + return lengths + def search(self, query, k=10): """ Performs BM25 search for the given query. Returns top k results: [{'url': ..., 'title': ..., 'score': ...}] """ - tokens = query.lower().split() # Simple tokenization + # Preprocessing to match Indexer: + # 1. Lowercase + # 2. Remove non-alphanumeric (keep spaces) + # 3. Split by whitespace + # 4. Filter length >= 3 + + query_clean = re.sub(r'[^a-z0-9\s]', '', query.lower()) + tokens = [t for t in query_clean.split() if len(t) >= 3] + if not tokens: return [] @@ -77,6 +135,10 @@ def search(self, query, k=10): # Accumulate scores: doc_id -> score scores = defaultdict(float) + # 1. Retrieve all posting lists and candidate docs + token_postings = {} # token -> [doc_ids] + candidate_doc_ids = set() + for token in tokens: # A. Get Posting List from RocksDB or Mock postings_str = None @@ -101,20 +163,38 @@ def search(self, query, k=10): postings_str = postings_str.decode('utf-8') doc_ids = [int(d) for d in postings_str.split(',')] - + token_postings[token] = doc_ids + candidate_doc_ids.update(doc_ids) + + if not candidate_doc_ids: + return [] + + # 2. Batch fetch document lengths + doc_lengths = self._get_doc_lengths(list(candidate_doc_ids)) + + # 3. Calculate BM25 Scores + for token in tokens: + doc_ids = token_postings.get(token, []) + if not doc_ids: + continue + # Calculate IDF # IDF(q_i) = log( (N - n(q_i) + 0.5) / (n(q_i) + 0.5) + 1 ) - # For simplicity in this phase, we'll use a basic IDF or just count - # We need N (total docs) - N = 1000 # Placeholder or fetch from DB + N = self.total_docs + if N == 0: N = 1 # Avoid division by zero issues if DB is empty + n_qi = len(doc_ids) idf = np.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1) for doc_id in doc_ids: - # In a real implementation, we'd fetch doc_length and TF from the index/DB - # Here we do a simplified calculation - tf = 1 # Simplified - doc_len = 100 # Simplified placeholder + # TODO: Fetch real TF from index. Currently index only stores doc_ids. + # We assume TF=1 for now. + tf = 1 + + # Get doc_len, fallback to avgdl if missing (e.g. sync issue) + doc_len = doc_lengths.get(doc_id, self.avgdl) + if doc_len is None or doc_len == 0: + doc_len = self.avgdl # Safety fallback # BM25 Score for this term numerator = idf * tf * (k1 + 1) @@ -152,3 +232,23 @@ def search(self, query, k=10): }) return results + + def close(self): + """Closes the database connection.""" + if self.db_conn: + try: + self.db_conn.close() + print("Closed Postgres connection") + except Exception as e: + print(f"Error closing Postgres connection: {e}") + finally: + self.db_conn = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def __del__(self): + self.close() diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt index ebb53d0..fa07eef 100644 --- a/python/ranker/requirements.txt +++ b/python/ranker/requirements.txt @@ -1,5 +1,5 @@ -Cython<3.0 -flask -git+https://github.com/twmht/python-rocksdb.git -psycopg2-binary -numpy \ No newline at end of file +Cython>=2.0.0,<3.0 +flask==3.1.2 +git+https://github.com/twmht/python-rocksdb.git@v0.7.0 +psycopg2-binary==2.9.11 +numpy==2.3.5 \ No newline at end of file From 974e6a601a7a3589b055a2d3f06da88dd91fcc2b Mon Sep 17 00:00:00 2001 From: Digvijay Singh Rawat Date: Tue, 9 Dec 2025 05:11:13 +0530 Subject: [PATCH 6/9] build: update ranker dependencies --- python/ranker/requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt index fa07eef..ebb53d0 100644 --- a/python/ranker/requirements.txt +++ b/python/ranker/requirements.txt @@ -1,5 +1,5 @@ -Cython>=2.0.0,<3.0 -flask==3.1.2 -git+https://github.com/twmht/python-rocksdb.git@v0.7.0 -psycopg2-binary==2.9.11 -numpy==2.3.5 \ No newline at end of file +Cython<3.0 +flask +git+https://github.com/twmht/python-rocksdb.git +psycopg2-binary +numpy \ No newline at end of file From e8a39885ecd3f82b5d68cc1a077e6632c1773072 Mon Sep 17 00:00:00 2001 From: Digvijay Singh Rawat Date: Tue, 9 Dec 2025 13:26:08 +0530 Subject: [PATCH 7/9] fix: Replace python-rocksdb with rocksdict to fix CI build --- python/ranker/engine.py | 18 +++++++++++++----- python/ranker/requirements.txt | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/python/ranker/engine.py b/python/ranker/engine.py index f5f89b2..54ab263 100644 --- a/python/ranker/engine.py +++ b/python/ranker/engine.py @@ -4,13 +4,13 @@ import numpy as np from collections import defaultdict -# Try to import rocksdb, fallback to mock if failed (e.g. build issues) +# Try to import rocksdict, fallback to mock if failed try: - import rocksdb + from rocksdict import Rdict, Options, AccessType ROCKSDB_AVAILABLE = True except ImportError: ROCKSDB_AVAILABLE = False - print("WARNING: python-rocksdb not available. Using Mock Index.") + print("WARNING: rocksdict not available. Using Mock Index.") class Ranker: def __init__(self): @@ -41,9 +41,8 @@ def __init__(self): if ROCKSDB_AVAILABLE: try: - opts = rocksdb.Options() # We only need read access - self.index_db = rocksdb.DB(rocksdb_path, opts, read_only=True) + self.index_db = Rdict(rocksdb_path, options=Options(), access_type=AccessType.read_only()) print(f"Opened RocksDB at {rocksdb_path}") except Exception as e: print(f"Failed to open RocksDB: {e}") @@ -235,6 +234,15 @@ def search(self, query, k=10): def close(self): """Closes the database connection.""" + if self.index_db: + try: + self.index_db.close() + print("Closed RocksDB connection") + except Exception as e: + print(f"Error closing RocksDB connection: {e}") + finally: + self.index_db = None + if self.db_conn: try: self.db_conn.close() diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt index ebb53d0..77e6c3c 100644 --- a/python/ranker/requirements.txt +++ b/python/ranker/requirements.txt @@ -1,5 +1,5 @@ Cython<3.0 flask -git+https://github.com/twmht/python-rocksdb.git +rocksdict psycopg2-binary numpy \ No newline at end of file From 83ccbd609a5e5e3e0a5d080b37d5b10f2fc1d757 Mon Sep 17 00:00:00 2001 From: Digvijay Singh Rawat Date: Tue, 9 Dec 2025 13:30:00 +0530 Subject: [PATCH 8/9] feat: Add example environment file and update docker-compose to use environment variables for database configuration --- .env.example | 3 +++ README.md | 23 +++++++++++++++++++++++ docker-compose.yml | 18 +++++++++--------- 3 files changed, 35 insertions(+), 9 deletions(-) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..ca2d73a --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +DB_USER=admin +DB_PASS=change_me_in_production +DB_NAME=search_engine diff --git a/README.md b/README.md index 5d06fde..d6c3111 100644 --- a/README.md +++ b/README.md @@ -1 +1,24 @@ ![CodeRabbit Pull Request Reviews](https://img.shields.io/coderabbit/prs/github/Digvijay-x1/Search-Engine?utm_source=oss&utm_medium=github&utm_campaign=Digvijay-x1%2FSearch-Engine&labelColor=171717&color=FF570A&link=https%3A%2F%2Fcoderabbit.ai&label=CodeRabbit+Reviews) + +## Setup + +### Environment Variables + +This project uses environment variables for configuration, including database credentials. + +1. Copy the example environment file: + ```bash + cp .env.example .env + ``` +2. Edit `.env` and set your own secure passwords and configuration: + ```ini + DB_USER=admin + DB_PASS=your_secure_password + DB_NAME=search_engine + ``` + +### Running with Docker + +```bash +docker-compose up --build +``` diff --git a/docker-compose.yml b/docker-compose.yml index 44769fb..276c2ea 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,9 +13,9 @@ services: postgres_service: image: postgres:17.2-alpine3.21 environment: - POSTGRES_USER: admin - POSTGRES_PASSWORD: password123 - POSTGRES_DB: search_engine + POSTGRES_USER: ${DB_USER} + POSTGRES_PASSWORD: ${DB_PASS} + POSTGRES_DB: ${DB_NAME} volumes: - ./data/init.sql:/docker-entrypoint-initdb.d/init.sql # Runs on first startup - pg_data:/var/lib/postgresql/data # Persistence @@ -33,9 +33,9 @@ services: environment: - FLASK_ENV=development - DB_HOST=postgres_service - - DB_NAME=search_engine - - DB_USER=admin - - DB_PASS=password123 + - DB_NAME=${DB_NAME} + - DB_USER=${DB_USER} + - DB_PASS=${DB_PASS} networks: - search_net @@ -70,9 +70,9 @@ services: - ./data/crawled_pages:/shared_data environment: - DB_HOST=postgres_service - - DB_NAME=search_engine - - DB_USER=admin - - DB_PASS=password123 + - DB_NAME=${DB_NAME} + - DB_USER=${DB_USER} + - DB_PASS=${DB_PASS} depends_on: - redis_service - postgres_service From 01f26835032ed640500851298aa9b8e9bd8f82cc Mon Sep 17 00:00:00 2001 From: Digvijay Singh Rawat Date: Tue, 9 Dec 2025 13:34:26 +0530 Subject: [PATCH 9/9] fix: Update SQL query to use 'doc_length' instead of 'length' for document retrieval --- python/ranker/engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ranker/engine.py b/python/ranker/engine.py index 54ab263..ba275f9 100644 --- a/python/ranker/engine.py +++ b/python/ranker/engine.py @@ -96,10 +96,10 @@ def _get_doc_lengths(self, doc_ids): # Use tuple(doc_ids) for SQL IN clause # Handle single item tuple correctly if len(doc_ids) == 1: - query = "SELECT id, length FROM documents WHERE id = %s" + query = "SELECT id, doc_length FROM documents WHERE id = %s" params = (doc_ids[0],) else: - query = "SELECT id, length FROM documents WHERE id IN %s" + query = "SELECT id, doc_length FROM documents WHERE id IN %s" params = (tuple(doc_ids),) cur.execute(query, params)