Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ jobs:
python-version: '3.9'
cache: 'pip'

- name: Install System Dependencies
run: |
sudo apt-get update
sudo apt-get install -y librocksdb-dev zlib1g-dev libbz2-dev liblz4-dev libsnappy-dev libzstd-dev

- name: Install Dependencies
run: pip install -r requirements.txt

Expand Down
2 changes: 1 addition & 1 deletion API/config/routes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@
get "/search", to: "search#index"

# Defines the root path route ("/")
# root "posts#index"
root "search#index"
end
2 changes: 1 addition & 1 deletion cpp/indexer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:latest
FROM ubuntu:22.04

RUN apt-get update && apt-get install -y \
build-essential \
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ services:
- "5000:5000"
volumes:
- ./python/ranker:/app # Hot-reloading for Python
- ./data/crawled_pages:/shared_data
environment:
- FLASK_ENV=development
networks:
Expand Down
28 changes: 25 additions & 3 deletions python/ranker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,28 @@
FROM python:3.9-slim
FROM ubuntu:22.04

# Install system dependencies
# python3-dev is needed for headers
RUN apt-get update && apt-get install -y \
python3 \
python3-pip \
python3-dev \
git \
build-essential \
librocksdb-dev \
libpq-dev \
zlib1g-dev \
libbz2-dev \
liblz4-dev \
libsnappy-dev \
libzstd-dev \
&& rm -rf /var/lib/apt/lists/*

# Create a symlink for python if needed, though python3 is standard
RUN ln -s /usr/bin/python3 /usr/bin/python

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip3 install --no-cache-dir "Cython<3"
RUN pip3 install --no-cache-dir -r requirements.txt
COPY . .
CMD ["python", "app.py"]
CMD ["python3", "app.py"]
39 changes: 31 additions & 8 deletions python/ranker/app.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,46 @@
from flask import Flask, jsonify, request
from engine import Ranker
import time

app = Flask(__name__)

# Mock Data (The "Database")
MOCK_INDEX = {
"computer": [{"id": 1, "title": "History of Computers"}, {"id": 2, "title": "Computer Science 101"}],
"cats": [{"id": 3, "title": "Funny Cats"}, {"id": 4, "title": "Cat Care"}]
}
# Initialize Ranker (Global Singleton)
ranker = None
try:
ranker = Ranker()
except Exception as e:
print(f"Failed to initialize Ranker: {e}")

@app.route('/health')
def health():
return jsonify({"status": "healthy", "service": "ranker"})
status = "healthy" if ranker else "degraded"
return jsonify({"status": status, "service": "ranker"})

@app.route('/search')
def search():
global ranker
if not ranker:
# Fallback: retry Ranker construction here if the module-level initialization failed
try:
ranker = Ranker()
except Exception as e:
return jsonify({"error": f"Ranker not initialized: {str(e)}"}), 500

query = request.args.get('q', '').lower()
print(f"Received query: {query}")
results = MOCK_INDEX.get(query, [])
return jsonify({"query": query, "results": results})

start_time = time.time()
results = ranker.search(query)
duration_ms = (time.time() - start_time) * 1000

return jsonify({
"query": query,
"results": results,
"meta": {
"count": len(results),
"latency_ms": round(duration_ms, 2)
}
})

if __name__ == '__main__':
# host='0.0.0.0' is CRITICAL for Docker networking
Expand Down
154 changes: 154 additions & 0 deletions python/ranker/engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import os
import psycopg2
import numpy as np
from collections import defaultdict

# Optional dependency: python-rocksdb may be missing (e.g. the native build
# failed). Record the outcome in ROCKSDB_AVAILABLE so callers can fall back
# to the mock index instead of crashing at import time.
try:
    import rocksdb
except ImportError:
    ROCKSDB_AVAILABLE = False
    print("WARNING: python-rocksdb not available. Using Mock Index.")
else:
    ROCKSDB_AVAILABLE = True

class Ranker:
    """Simplified BM25 ranker over an inverted index with Postgres metadata.

    The inverted index is read from RocksDB when the binding is available and
    the database can be opened; otherwise a small in-memory mock index is
    used. Document metadata (URL) and corpus statistics come from the
    ``documents`` table in Postgres; when Postgres is unreachable, fixed
    defaults and mock URLs are used instead.
    """

    # BM25 tuning constants.
    K1 = 1.5
    B = 0.75

    # Fallbacks used when Postgres is unavailable or reports no data.
    DEFAULT_AVGDL = 100.0
    DEFAULT_TOTAL_DOCS = 1000
    # Placeholder per-document length until real lengths are read from the DB.
    DEFAULT_DOC_LEN = 100

    def __init__(self):
        """Connect to Postgres, open the index, and load corpus statistics.

        Connection failures are reported on stdout and leave the ranker in a
        degraded (mock / default-statistics) mode rather than raising.
        """
        # 1. Connect to Postgres (Metadata)
        # NOTE(review): the default credentials below are development
        # placeholders; production deployments should always set DB_* env vars.
        try:
            self.db_conn = psycopg2.connect(
                host=os.environ.get("DB_HOST", "postgres_service"),
                database=os.environ.get("DB_NAME", "search_engine"),
                user=os.environ.get("DB_USER", "admin"),
                password=os.environ.get("DB_PASS", "password123"),
            )
            # Read-only workload: autocommit keeps a single failed SELECT from
            # leaving the shared connection in an aborted transaction, and
            # avoids holding an idle transaction open between requests.
            self.db_conn.autocommit = True
            print("Connected to Postgres")
        except Exception as e:
            print(f"Failed to connect to Postgres: {e}")
            self.db_conn = None

        # 2. Open RocksDB (Inverted Index) - Read Only
        rocksdb_path = os.environ.get("ROCKSDB_PATH", "/shared_data/search_index.db")
        self.index_db = None

        if ROCKSDB_AVAILABLE:
            try:
                opts = rocksdb.Options()
                # We only need read access
                self.index_db = rocksdb.DB(rocksdb_path, opts, read_only=True)
                print(f"Opened RocksDB at {rocksdb_path}")
            except Exception as e:
                print(f"Failed to open RocksDB: {e}")

        # Mock Index for fallback (token -> comma-separated doc ids)
        self.mock_index = {
            "computer": "1,2",
            "cats": "3,4",
        }

        # 3. Load Global Stats (avgdl, N)
        self.avgdl = self._calculate_avgdl()
        self.total_docs = self._count_documents()
        print(f"Ranker initialized. AvgDL: {self.avgdl}")

    def _calculate_avgdl(self):
        """Return the average document length, or DEFAULT_AVGDL if unknown.

        The default also covers an empty table or a zero average, so the
        BM25 length normalisation never divides by zero.
        """
        if not self.db_conn:
            return self.DEFAULT_AVGDL
        try:
            with self.db_conn.cursor() as cur:
                cur.execute("SELECT AVG(doc_length) FROM documents")
                row = cur.fetchone()
        except Exception as e:
            print(f"Error calculating avgdl: {e}")
            return self.DEFAULT_AVGDL
        avg = row[0] if row else None
        return float(avg) if avg and avg > 0 else self.DEFAULT_AVGDL

    def _count_documents(self):
        """Return the number of rows in ``documents``, or DEFAULT_TOTAL_DOCS."""
        if not self.db_conn:
            return self.DEFAULT_TOTAL_DOCS
        try:
            with self.db_conn.cursor() as cur:
                cur.execute("SELECT COUNT(*) FROM documents")
                row = cur.fetchone()
        except Exception as e:
            print(f"Error counting documents: {e}")
            return self.DEFAULT_TOTAL_DOCS
        count = row[0] if row else 0
        return int(count) if count else self.DEFAULT_TOTAL_DOCS

    def _lookup_postings(self, token):
        """Return the raw posting-list string for *token*, or None if absent."""
        if not self.index_db:
            # Fallback to mock
            return self.mock_index.get(token)
        try:
            val = self.index_db.get(token.encode('utf-8'))
        except Exception as e:
            print(f"Error fetching token {token}: {e}")
            return None
        if not val:
            return None
        return val.decode('utf-8') if isinstance(val, bytes) else val

    @staticmethod
    def _parse_doc_ids(postings):
        """Parse ``"id1,id2,..."`` into ints, skipping blank or garbled entries."""
        doc_ids = []
        for piece in postings.split(','):
            piece = piece.strip()
            if not piece:
                continue  # tolerate trailing / doubled commas
            try:
                doc_ids.append(int(piece))
            except ValueError:
                print(f"Skipping malformed doc id in posting list: {piece!r}")
        return doc_ids

    def _fetch_urls(self, doc_ids):
        """Return ``{doc_id: url}`` for the given ids using a single query."""
        with self.db_conn.cursor() as cur:
            # psycopg2 adapts a Python list to a SQL ARRAY, so ANY(%s) works.
            cur.execute(
                "SELECT id, url FROM documents WHERE id = ANY(%s)",
                (list(doc_ids),),
            )
            return dict(cur.fetchall())

    def search(self, query, k=10):
        """
        Performs BM25 search for the given query.

        The index currently stores only document ids, so every posting is
        scored with TF=1 and a placeholder document length.

        Args:
            query: Free-text query; tokenised by lower-casing and splitting
                on whitespace.
            k: Maximum number of results to return. Non-positive values
                yield an empty list.

        Returns:
            Up to k dicts ``{'id', 'url', 'score', 'title'}`` ordered by
            descending score. Documents with no metadata row are omitted
            when Postgres is connected; mock URLs are used when it is not.
        """
        tokens = query.lower().split()  # Simple tokenization
        if not tokens or k <= 0:
            return []

        k1 = self.K1
        b = self.B
        tf = 1  # Simplified: the index does not carry term frequencies yet
        doc_len = self.DEFAULT_DOC_LEN
        # Loop-invariant while tf and doc_len are placeholders.
        denominator = tf + k1 * (1 - b + b * (doc_len / self.avgdl))

        # Accumulate scores: doc_id -> score
        scores = defaultdict(float)

        for token in tokens:
            postings_str = self._lookup_postings(token)
            if not postings_str:
                continue

            doc_ids = self._parse_doc_ids(postings_str)
            if not doc_ids:
                continue

            # IDF(q_i) = log( (N - n(q_i) + 0.5) / (n(q_i) + 0.5) + 1 )
            n_qi = len(doc_ids)
            # Clamp N so a stale document count can never make the IDF negative.
            N = max(self.total_docs, n_qi)
            idf = np.log((N - n_qi + 0.5) / (n_qi + 0.5) + 1)

            term_score = idf * tf * (k1 + 1) / denominator
            for doc_id in doc_ids:
                scores[doc_id] += term_score

        # Sort by score (stable, so ties keep first-seen order)
        sorted_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:k]
        if not sorted_docs:
            return []

        # Fetch Metadata for top results
        results = []
        if self.db_conn:
            try:
                urls = self._fetch_urls([doc_id for doc_id, _ in sorted_docs])
            except Exception as e:
                print(f"Error fetching metadata: {e}")
                urls = {}
            for doc_id, score in sorted_docs:
                url = urls.get(doc_id)
                if url is None:
                    continue
                results.append({
                    "id": doc_id,
                    "url": url,
                    "score": float(score),
                    "title": url,  # Use URL as title for now
                })
        else:
            # Fallback if DB is down
            for doc_id, score in sorted_docs:
                results.append({
                    "id": doc_id,
                    "url": f"http://mock-url.com/{doc_id}",
                    "score": float(score),
                    "title": f"Mock Document {doc_id}",
                })

        return results
6 changes: 5 additions & 1 deletion python/ranker/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
flask==3.0.0
Cython<3.0
flask
git+https://github.com/twmht/python-rocksdb.git
psycopg2-binary
numpy
Loading