Module vedyut.api
+FastAPI application for Vedyut
+Sub-modules
+-
+
vedyut.api.main
+-
++
FastAPI application for Vedyut Sanskrit NLP API
+
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0c4f02..1931344 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -95,7 +95,7 @@ jobs: run: uv python install 3.12 - name: Install dependencies - run: uv sync + run: uv sync --all-extras - name: Run ruff (format check) run: uv run ruff format --check . @@ -122,14 +122,11 @@ jobs: - name: Install Rust uses: dtolnay/rust-toolchain@stable - - name: Build Rust workspace (PyO3 needs Python) - run: cargo build --release - working-directory: ./rust - env: - PYO3_PYTHON: python3.12 + - name: Build with Maturin + run: uvx maturin build --release - name: Check Python package - run: uv sync + run: uv sync --all-extras security: name: Security Audit diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index cb9b0bb..8782210 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -28,11 +28,10 @@ jobs: run: uv python install 3.12 - name: Install dependencies - run: uv sync + run: uv sync --all-extras - name: Build Python docs run: | - uv run pip install pdoc3 uv run pdoc --html python/vedyut --output-dir docs/python - name: Deploy to GitHub Pages diff --git a/docs/python/vedyut/api/index.html b/docs/python/vedyut/api/index.html new file mode 100644 index 0000000..6701be9 --- /dev/null +++ b/docs/python/vedyut/api/index.html @@ -0,0 +1,79 @@ + + +
+ + + +vedyut.apiFastAPI application for Vedyut
+vedyut.api.mainFastAPI application for Vedyut Sanskrit NLP API
vedyut.api.mainFastAPI application for Vedyut Sanskrit NLP API
+
+async def analyze(req: AnalyzeRequest)
@app.post("/v1/analyze", response_model=AnalyzeResponse)
async def analyze(req: AnalyzeRequest):
    """
    Perform morphological analysis on a Sanskrit word

    Returns possible analyses with grammatical features

    The current body is a placeholder: it reports the input word as its own
    lemma in the nominative singular and does not consult ``req.scheme``.

    Args:
        req: Request carrying the word to analyze.

    Returns:
        AnalyzeResponse with the echoed word, the analyses and the time
        spent in the handler in milliseconds.

    Raises:
        HTTPException: Status 500 when the analysis fails unexpectedly.
    """
    # perf_counter is monotonic, so the measured latency cannot go negative
    # or jump when the wall clock is adjusted.
    start_time = time.perf_counter()

    try:
        # TODO: Call Rust core for actual analysis
        # Placeholder: return mock analysis
        analyses = [
            AnalysisResult(
                lemma=req.word,
                case="nominative",
                number="singular",
            )
        ]

        took_ms = (time.perf_counter() - start_time) * 1000

        return AnalyzeResponse(
            word=req.word,
            analyses=analyses,
            took_ms=took_ms,
        )
    except HTTPException:
        # A deliberate HTTP error must keep its own status code instead of
        # being re-wrapped as a 500 below.
        raise
    except Exception as e:
        # Chain the cause so the original traceback survives in the logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
+Perform morphological analysis on a Sanskrit word
+Returns possible analyses with grammatical features
+async def generate(req: GenerateRequest)
@app.post("/v1/generate", response_model=GenerateResponse)
async def generate(req: GenerateRequest):
    """
    Generate Sanskrit word forms from root + grammatical features

    Generates tiṅanta (verb) forms following Pāṇinian grammar

    The current body is a placeholder: the single returned "form" is the
    request fields joined with ``+``.

    Args:
        req: Request carrying dhatu, lakara, purusha and vacana.

    Returns:
        GenerateResponse with the forms, the echoed dhatu and the time spent
        in the handler in milliseconds.

    Raises:
        HTTPException: Status 500 when generation fails unexpectedly.
    """
    # perf_counter is monotonic, so the measured latency cannot go negative
    # or jump when the wall clock is adjusted.
    start_time = time.perf_counter()

    try:
        # TODO: Call Rust core for actual generation
        # Placeholder: return mock form
        forms = [f"{req.dhatu}+{req.lakara}+{req.purusha}+{req.vacana}"]

        took_ms = (time.perf_counter() - start_time) * 1000

        return GenerateResponse(
            forms=forms,
            dhatu=req.dhatu,
            took_ms=took_ms,
        )
    except HTTPException:
        # A deliberate HTTP error must keep its own status code instead of
        # being re-wrapped as a 500 below.
        raise
    except Exception as e:
        # Chain the cause so the original traceback survives in the logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
+Generate Sanskrit word forms from root + grammatical features
+Generates tiṅanta (verb) forms following Pāṇinian grammar
+async def health()
@app.get("/health")
async def health():
    """Health check endpoint

    Returns:
        A static payload naming the service and reporting status "ok".
    """
    payload = dict(status="ok", service="vedyut")
    return payload
+Health check endpoint
+async def metrics()
@app.get("/metrics")
async def metrics():
    """Basic API metrics (placeholder)

    Returns:
        A dict whose counters are all hard-coded to 0; nothing is measured.
    """
    counter_names = ("requests_total", "avg_latency_ms", "uptime_seconds")
    return dict.fromkeys(counter_names, 0)
+Basic API metrics (placeholder)
+async def root()
@app.get("/")
async def root():
    """Root endpoint with API information

    Returns:
        A dict with the API name, version and the docs/health paths.
    """
    info = {}
    info["name"] = "Vedyut Sanskrit NLP API"
    info["version"] = "0.1.0"
    info["docs"] = "/docs"
    info["health"] = "/health"
    return info
+Root endpoint with API information
+async def sanskritify_text(req: SanskritifyRequest)
@app.post("/v1/sanskritify", response_model=SanskritifyResponse)
async def sanskritify_text(req: SanskritifyRequest):
    """
    Make text in any Indian language more like refined Sanskrit

    Transforms modern colloquial text to use Sanskrit-style vocabulary,
    grammar patterns, and formal register.

    Supports ALL Indian scripts: Devanagari, Tamil, Telugu, Malayalam,
    Kannada, Bengali, Gujarati, Gurmukhi, etc.

    The current body is a placeholder: it wraps the input text in a
    ``[Sanskritified: ...]`` marker and does not consult
    ``req.preserve_meaning``.

    Args:
        req: Request carrying the text, script and refinement level.

    Returns:
        SanskritifyResponse with the original and refined text, the echoed
        script and level, and the handler time in milliseconds.

    Raises:
        HTTPException: Status 500 when the transformation fails unexpectedly.
    """
    # perf_counter is monotonic, so the measured latency cannot go negative
    # or jump when the wall clock is adjusted.
    start_time = time.perf_counter()

    try:
        # TODO: Call Rust core for actual sanskritification
        # Placeholder transformation
        refined = f"[Sanskritified: {req.text}]"

        took_ms = (time.perf_counter() - start_time) * 1000

        return SanskritifyResponse(
            original=req.text,
            refined=refined,
            script=req.script,
            level=req.level,
            took_ms=took_ms,
        )
    except HTTPException:
        # A deliberate HTTP error must keep its own status code instead of
        # being re-wrapped as a 500 below.
        raise
    except Exception as e:
        # Chain the cause so the original traceback survives in the logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
+Make text in any Indian language more like refined Sanskrit
+Transforms modern colloquial text to use Sanskrit-style vocabulary, grammar patterns, and formal register.
+Supports ALL Indian scripts: Devanagari, Tamil, Telugu, Malayalam, Kannada, Bengali, Gujarati, Gurmukhi, etc.
+async def segment(req: SegmentRequest)
@app.post("/v1/segment", response_model=SegmentResponse)
async def segment(req: SegmentRequest):
    """
    Segment Sanskrit text into words

    Returns multiple possible segmentations ranked by likelihood

    The current body is a placeholder: the only candidate is a whitespace
    split of the text, and ``req.scheme`` is not consulted.

    Args:
        req: Request carrying the text and the maximum number of
            segmentation options to return.

    Returns:
        SegmentResponse with at most ``req.max_splits`` segmentations and
        the handler time in milliseconds.

    Raises:
        HTTPException: Status 500 when segmentation fails unexpectedly.
    """
    # perf_counter is monotonic, so the measured latency cannot go negative
    # or jump when the wall clock is adjusted.
    start_time = time.perf_counter()

    try:
        # TODO: Call Rust core for actual segmentation
        # Placeholder: return mock segmentation
        candidates = [
            req.text.split(),  # Simple space split as placeholder
        ]
        # Honour the requested cap; a non-positive cap yields no options
        # rather than slicing from the end of the list.
        segments = candidates[: max(req.max_splits, 0)]

        took_ms = (time.perf_counter() - start_time) * 1000

        return SegmentResponse(
            segments=segments,
            took_ms=took_ms,
        )
    except HTTPException:
        # A deliberate HTTP error must keep its own status code instead of
        # being re-wrapped as a 500 below.
        raise
    except Exception as e:
        # Chain the cause so the original traceback survives in the logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
+Segment Sanskrit text into words
+Returns multiple possible segmentations ranked by likelihood
+async def transliterate(req: TransliterateRequest)
@app.post("/v1/transliterate", response_model=TransliterateResponse)
async def transliterate(req: TransliterateRequest):
    """
    Transliterate Sanskrit text between different scripts

    Supported schemes: devanagari, iast, slp1, hk (harvard-kyoto), itrans

    The current body is a placeholder: ``result`` is a ``[TODO: ...]`` marker
    string describing the requested conversion, not converted text.

    Args:
        req: Request carrying the text plus source and target schemes.

    Returns:
        TransliterateResponse with the result, the echoed schemes and the
        handler time in milliseconds.

    Raises:
        HTTPException: Status 500 when transliteration fails unexpectedly.
    """
    # perf_counter is monotonic, so the measured latency cannot go negative
    # or jump when the wall clock is adjusted.
    start_time = time.perf_counter()

    try:
        # TODO: Call Rust core for actual transliteration
        result = f"[TODO: Transliterate '{req.text}' from {req.from_scheme} to {req.to_scheme}]"

        took_ms = (time.perf_counter() - start_time) * 1000

        return TransliterateResponse(
            result=result,
            from_scheme=req.from_scheme,
            to_scheme=req.to_scheme,
            took_ms=took_ms,
        )
    except HTTPException:
        # A deliberate HTTP error must keep its own status code instead of
        # being re-wrapped as a 500 below.
        raise
    except Exception as e:
        # Chain the cause so the original traceback survives in the logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
+Transliterate Sanskrit text between different scripts
+Supported schemes: devanagari, iast, slp1, hk (harvard-kyoto), itrans
+class AnalysisResult
+(**data: Any)
class AnalysisResult(BaseModel):
    """Morphological analysis result

    Only ``lemma`` is mandatory; each grammatical feature is an optional
    string that defaults to ``None`` when it is not supplied.
    """

    # Required: the lemma reported for the analysed word.
    lemma: str

    # Optional grammatical features.
    case: Optional[str] = None
    number: Optional[str] = None
    gender: Optional[str] = None
    person: Optional[str] = None
    tense: Optional[str] = None
+Morphological analysis result
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var case : str | Nonevar gender : str | Nonevar lemma : strvar model_configvar number : str | Nonevar person : str | Nonevar tense : str | None
+class AnalyzeRequest
+(**data: Any)
class AnalyzeRequest(BaseModel):
    """Request model for morphological analysis

    ``word`` is required; ``scheme`` defaults to "devanagari".
    """

    word: str = Field(
        default=...,
        description="Sanskrit word to analyze",
    )
    scheme: str = Field(
        default="devanagari",
        description="Input script scheme",
    )
+Request model for morphological analysis
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var model_configvar scheme : strvar word : str
+class AnalyzeResponse
+(**data: Any)
class AnalyzeResponse(BaseModel):
    """Response model for analysis

    All three fields are required.
    """

    # The word the analyses refer to.
    word: str
    # One entry per analysis of the word.
    analyses: List[AnalysisResult]
    # Handler time in milliseconds.
    took_ms: float
+Response model for analysis
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var analyses : List[AnalysisResult]var model_configvar took_ms : floatvar word : str
+class GenerateRequest
+(**data: Any)
class GenerateRequest(BaseModel):
    """Request model for word generation

    Every field is a required string.
    """

    dhatu: str = Field(
        default=...,
        description="Verb root (dhatu)",
    )
    lakara: str = Field(
        default=...,
        description="Tense/mood (lakara)",
    )
    purusha: str = Field(
        default=...,
        description="Person (prathama, madhyama, uttama)",
    )
    vacana: str = Field(
        default=...,
        description="Number (eka, dvi, bahu)",
    )
+Request model for word generation
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var dhatu : strvar lakara : strvar model_configvar purusha : strvar vacana : str
+class GenerateResponse
+(**data: Any)
class GenerateResponse(BaseModel):
    """Response model for generation

    All three fields are required.
    """

    # Generated word forms.
    forms: List[str]
    # The verb root the forms were generated from.
    dhatu: str
    # Handler time in milliseconds.
    took_ms: float
+Response model for generation
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var dhatu : strvar forms : List[str]var model_configvar took_ms : float
+class SanskritifyRequest
+(**data: Any)
class SanskritifyRequest(BaseModel):
    """Request model for sanskritification

    Only ``text`` is required; the other fields carry defaults.
    """

    text: str = Field(
        default=...,
        description="Text to sanskritify (any Indian language)",
    )
    script: str = Field(
        default="devanagari",
        description="Script for input/output",
    )
    level: str = Field(
        default="medium",
        description="Refinement level: light, medium, high, classical",
    )
    preserve_meaning: bool = Field(
        default=True,
        description="Preserve original meaning",
    )
+Request model for sanskritification
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var level : strvar model_configvar preserve_meaning : boolvar script : strvar text : str
+class SanskritifyResponse
+(**data: Any)
class SanskritifyResponse(BaseModel):
    """Response model for sanskritification

    All fields are required.
    """

    # Input text as received.
    original: str
    # Transformed text.
    refined: str
    # Script and refinement level used for the transformation.
    script: str
    level: str
    # Handler time in milliseconds.
    took_ms: float
+Response model for sanskritification
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var level : strvar model_configvar original : strvar refined : strvar script : strvar took_ms : float
+class SegmentRequest
+(**data: Any)
class SegmentRequest(BaseModel):
    """Request model for segmentation

    Only ``text`` is required; ``max_splits`` defaults to 10 and ``scheme``
    to "devanagari".
    """

    text: str = Field(
        default=...,
        description="Sanskrit text to segment",
    )
    max_splits: int = Field(
        default=10,
        description="Maximum number of segmentation options",
    )
    scheme: str = Field(
        default="devanagari",
        description="Input script scheme",
    )
+Request model for segmentation
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var max_splits : intvar model_configvar scheme : strvar text : str
+class SegmentResponse
+(**data: Any)
class SegmentResponse(BaseModel):
    """Response model for segmentation

    Both fields are required.
    """

    # Each inner list is one segmentation, given as its words.
    segments: List[List[str]]
    # Handler time in milliseconds.
    took_ms: float
+Response model for segmentation
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var model_configvar segments : List[List[str]]var took_ms : float
+class TransliterateRequest
+(**data: Any)
class TransliterateRequest(BaseModel):
    """Request model for transliteration

    Every field is a required string.
    """

    text: str = Field(
        default=...,
        description="Text to transliterate",
    )
    from_scheme: str = Field(
        default=...,
        description="Source script (iast, slp1, devanagari, etc.)",
    )
    to_scheme: str = Field(
        default=...,
        description="Target script (iast, slp1, devanagari, etc.)",
    )
+Request model for transliteration
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var from_scheme : strvar model_configvar text : strvar to_scheme : str
+class TransliterateResponse
+(**data: Any)
class TransliterateResponse(BaseModel):
    """Response model for transliteration

    All fields are required.
    """

    # Transliteration output.
    result: str
    # Source and target schemes of the conversion.
    from_scheme: str
    to_scheme: str
    # Handler time in milliseconds.
    took_ms: float
+Response model for transliteration
+Create a new model by parsing and validating input data from keyword arguments.
+Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be
+validated to form a valid model.
self is explicitly positional-only to allow self as a field name.
var from_scheme : strvar model_configvar result : strvar to_scheme : strvar took_ms : floatvedyutVedyut - High-performance Sanskrit NLP Toolkit
+A next-generation Sanskrit NLP toolkit combining Rust performance +with Python ease-of-use.
+Script is a first-class parameter throughout the API.
+vedyut.apiFastAPI application for Vedyut
vedyut.llmLLM integration for Sanskrit NLP with grammar treatise RAG
+def analyze(word: str,
script: Script = Script.DEVANAGARI) ‑> List[Dict[str, Any]]
def analyze(
    word: str,
    script: Script = Script.DEVANAGARI,
) -> List[Dict[str, Any]]:
    """
    Analyze morphological features of a Sanskrit word.

    Script is explicitly specified (default: Devanagari).

    When the Rust extension is unavailable the function performs no real
    analysis: it returns a single dict echoing the word and script value.

    Args:
        word: Sanskrit word to analyze
        script: Input script (first-class parameter)

    Returns:
        List of possible analyses with grammatical features

    Examples:
        >>> analyze("रामः", Script.DEVANAGARI)
        [{'stem': 'राम', 'case': 'nominative', 'number': 'singular', ...}]
    """
    if not RUST_AVAILABLE:
        # Fallback if Rust not available
        echoed = {"word": word, "script": script.value}
        return [echoed]

    return _rust_analyze(word, script.value)
+Analyze morphological features of a Sanskrit word.
+Script is explicitly specified (default: Devanagari).
+Args: word – Sanskrit word to analyze; script – input script (first-class parameter). Returns: list of possible analyses with grammatical features.
+>>> analyze("रामः", Script.DEVANAGARI)
+[{'stem': 'राम', 'case': 'nominative', 'number': 'singular', ...}]
+
+def generate_verb(dhatu: str,
lakara: str,
purusha: str,
vacana: str,
output_script: Script = Script.DEVANAGARI) ‑> List[str]
def generate_verb(
    dhatu: str,
    lakara: str,
    purusha: str,
    vacana: str,
    output_script: Script = Script.DEVANAGARI,
) -> List[str]:
    """
    Generate Sanskrit verb forms from root + grammatical features.

    Output script is explicitly specified (default: Devanagari).

    This is currently a placeholder: it returns one string made of the four
    grammatical arguments joined with ``+`` and does not use
    ``output_script``.

    Args:
        dhatu: Verb root
        lakara: Tense/mood (lat, lit, lut, etc.)
        purusha: Person (prathama, madhyama, uttama)
        vacana: Number (eka, dvi, bahu)
        output_script: Output script (first-class parameter!)

    Returns:
        List of generated forms

    Examples:
        >>> generate_verb("भू", "lat", "prathama", "eka", Script.DEVANAGARI)
        ['भवति']

        >>> generate_verb("bhū", "lat", "prathama", "eka", Script.IAST)
        ['bhavati']
    """
    # TODO: Call Rust core when built (generate_verb from ._core, passing
    # output_script.value as the last argument).

    # Placeholder
    joined = "{}+{}+{}+{}".format(dhatu, lakara, purusha, vacana)
    return [joined]
+Generate Sanskrit verb forms from root + grammatical features.
+Output script is explicitly specified (default: Devanagari).
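+Args: dhatu – verb root; lakara – tense/mood (lat, lit, lut, etc.); purusha – person (prathama, madhyama, uttama); vacana – number (eka, dvi, bahu); output_script – output script (first-class parameter). Returns: list of generated forms.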
+>>> generate_verb("भू", "lat", "prathama", "eka", Script.DEVANAGARI)
+['भवति']
+
+>>> generate_verb("bhū", "lat", "prathama", "eka", Script.IAST)
+['bhavati']
+
+def list_scripts() ‑> List[Script]
def list_scripts() -> List[Script]:
    """
    Get all supported scripts.

    Returns:
        List of all Script enum values, in definition order
    """
    return [*Script]
+Get all supported scripts.
+List of all Script enum values
+def sanskritify(text: str,
script: Script = Script.DEVANAGARI,
level: str = 'medium',
preserve_meaning: bool = True,
replace_urdu_arabic: bool = True,
use_llm_fallback: bool = True,
llm_api_key: str | None = None) ‑> str
def sanskritify(
    text: str,
    script: Script = Script.DEVANAGARI,
    level: str = "medium",
    preserve_meaning: bool = True,
    replace_urdu_arabic: bool = True,
    use_llm_fallback: bool = True,
    llm_api_key: Optional[str] = None,
) -> str:
    """
    Make text in any Indian language more like refined Sanskrit.

    Transforms modern colloquial text to use Sanskrit-style vocabulary,
    grammar patterns, and formal register. Works with ALL scripts!

    **NEW**: Automatically replaces Urdu/Arabic/Persian words with Sanskrit equivalents.
    Uses LLM fallback for words not in vocabulary database.

    NOTE(review): ``use_llm_fallback`` and ``llm_api_key`` are accepted but
    this function neither forwards them to the Rust call nor uses them in
    the fallback. Without the Rust extension the result is only a bracketed
    marker string, not transformed text.

    Args:
        text: Text to sanskritify
        script: Script for input/output (first-class parameter!)
        level: Refinement level ("light", "medium", "high", "classical")
        preserve_meaning: Preserve original meaning vs. prioritize form
        replace_urdu_arabic: Replace Urdu/Arabic/Persian words with Sanskrit (default: True)
        use_llm_fallback: Use LLM for words not in vocabulary (default: True)
        llm_api_key: API key for LLM provider (OpenAI, Anthropic, etc.)

    Returns:
        Sanskritified text

    Examples:
        >>> # Basic sanskritification
        >>> sanskritify("hello friend", Script.DEVANAGARI)
        'नमस्ते मित्र'

        >>> # Works with any Indian script
        >>> sanskritify("hello friend", Script.TAMIL)
        'நமஸ்தே மித்ர'

        >>> # Replace Urdu/Arabic words automatically
        >>> sanskritify("duniya mein kitab", Script.DEVANAGARI)
        'जगत् में पुस्तक'

        >>> # High refinement with LLM fallback
        >>> sanskritify("salaam duniya", Script.DEVANAGARI,
        ...             level="high", use_llm_fallback=True)
        'नमस्कार विश्व'
    """
    if not RUST_AVAILABLE:
        # Fallback if Rust not available
        return f"[Sanskritify '{text}' in {script.value} at {level} level]"

    rust_args = (text, script.value, level, preserve_meaning, replace_urdu_arabic)
    return _rust_sanskritify(*rust_args)
+Make text in any Indian language more like refined Sanskrit.
+Transforms modern colloquial text to use Sanskrit-style vocabulary, +grammar patterns, and formal register. Works with ALL scripts!
+NEW: Automatically replaces Urdu/Arabic/Persian words with Sanskrit equivalents. +Uses LLM fallback for words not in vocabulary database.
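+Args: text – text to sanskritify; script – script for input/output; level – refinement level ("light", "medium", "high", "classical"); preserve_meaning – preserve original meaning vs. prioritize form; replace_urdu_arabic – replace Urdu/Arabic/Persian words with Sanskrit (default: True); use_llm_fallback – use LLM for words not in vocabulary (default: True); llm_api_key – API key for LLM provider. Returns: sanskritified text.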
+>>> # Basic sanskritification
+>>> sanskritify("hello friend", Script.DEVANAGARI)
+'नमस्ते मित्र'
+
+>>> # Works with any Indian script
+>>> sanskritify("hello friend", Script.TAMIL)
+'நமஸ்தே மித்ர'
+
+>>> # Replace Urdu/Arabic words automatically
+>>> sanskritify("duniya mein kitab", Script.DEVANAGARI)
+'जगत् में पुस्तक'
+
+>>> # High refinement with LLM fallback
+>>> sanskritify("salaam duniya", Script.DEVANAGARI,
+... level="high", use_llm_fallback=True)
+'नमस्कार विश्व'
+
+def segment(text: str,
script: Script = Script.DEVANAGARI,
max_results: int = 10) ‑> List[List[str]]
def segment(
    text: str,
    script: Script = Script.DEVANAGARI,
    max_results: int = 10,
) -> List[List[str]]:
    """
    Segment Sanskrit text into words.

    Script is explicitly specified (default: Devanagari).

    Without the Rust extension the only candidate is a whitespace split of
    ``text``; it is returned unless ``max_results`` is zero or negative, in
    which case the result is empty.

    Args:
        text: Sanskrit text to segment
        script: Input script (first-class parameter with sensible default)
        max_results: Maximum number of segmentations to return

    Returns:
        List of possible segmentations, each as a list of words

    Examples:
        >>> segment("धर्मक्षेत्रे कुरुक्षेत्रे", Script.DEVANAGARI)
        [['धर्मक्षेत्रे', 'कुरुक्षेत्रे']]

        >>> segment("dharmakṣetre kurukṣetre", Script.IAST)
        [['dharmakṣetre', 'kurukṣetre']]
    """
    if RUST_AVAILABLE:
        return _rust_segment(text, script.value, max_results)

    # Fallback to simple split if Rust not available. Apply the same cap the
    # Rust path receives; clamping at 0 avoids slicing from the end.
    candidates = [text.split()]
    return candidates[: max(max_results, 0)]
+Segment Sanskrit text into words.
+Script is explicitly specified (default: Devanagari).
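+Args: text – Sanskrit text to segment; script – input script (first-class parameter with sensible default); max_results – maximum number of segmentations to return. Returns: list of possible segmentations, each as a list of words.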
+>>> segment("धर्मक्षेत्रे कुरुक्षेत्रे", Script.DEVANAGARI)
+[['धर्मक्षेत्रे', 'कुरुक्षेत्रे']]
+
+>>> segment("dharmakṣetre kurukṣetre", Script.IAST)
+[['dharmakṣetre', 'kurukṣetre']]
+
+def transliterate(text: str,
from_script: Script,
to_script: Script) ‑> str
def transliterate(text: str, from_script: Script, to_script: Script) -> str:
    """
    Transliterate Sanskrit text between scripts.

    Script is a **first-class parameter** - explicit and required.

    Without the Rust extension no conversion happens: the text is returned
    unchanged when both scripts are equal, otherwise a bracketed marker
    string describing the request is returned.

    Args:
        text: Text to transliterate
        from_script: Source script (first-class parameter!)
        to_script: Target script (first-class parameter!)

    Returns:
        Transliterated text

    Examples:
        >>> transliterate("namaste", Script.IAST, Script.DEVANAGARI)
        'नमस्ते'

        >>> transliterate("namaste", Script.IAST, Script.TAMIL)
        'நமஸ்தே'

        >>> transliterate("namaste", Script.IAST, Script.TELUGU)
        'నమస్తే'
    """
    if RUST_AVAILABLE:
        source_name = from_script.value
        target_name = to_script.value
        return _rust_transliterate(text, source_name, target_name)

    # Fallback to placeholder if Rust not available
    if from_script != to_script:
        return f"[Transliterate '{text}' from {from_script.value} to {to_script.value}]"
    return text
+Transliterate Sanskrit text between scripts.
+Script is a first-class parameter - explicit and required.
+Args: text – text to transliterate; from_script – source script; to_script – target script. Returns: transliterated text.
+>>> transliterate("namaste", Script.IAST, Script.DEVANAGARI)
+'नमस्ते'
+
+>>> transliterate("namaste", Script.IAST, Script.TAMIL)
+'நமஸ்தே'
+
+>>> transliterate("namaste", Script.IAST, Script.TELUGU)
+'నమస్తే'
+
+class Script
+(*args, **kwds)
class Script(str, Enum):
    """
    Supported scripts for Sanskrit text.

    Script is a FIRST-CLASS parameter in vedyut, not buried in options.
    Members are also ``str`` instances, and each member's value is the
    lowercase scheme name.
    """

    # --- Romanization schemes ---
    IAST = "iast"
    SLP1 = "slp1"
    HARVARD_KYOTO = "harvard-kyoto"
    ITRANS = "itrans"
    ISO15919 = "iso15919"
    VELTHUIS = "velthuis"
    WX = "wx"

    # --- Brahmic scripts ---
    DEVANAGARI = "devanagari"
    TELUGU = "telugu"
    TAMIL = "tamil"
    KANNADA = "kannada"
    MALAYALAM = "malayalam"
    BENGALI = "bengali"
    GUJARATI = "gujarati"
    GURMUKHI = "gurmukhi"
    ODIA = "odia"
    ASSAMESE = "assamese"
    TIBETAN = "tibetan"
    SINHALA = "sinhala"
    BURMESE = "burmese"
    THAI = "thai"
    GRANTHA = "grantha"
+Supported scripts for Sanskrit text.
+Script is a FIRST-CLASS parameter in vedyut, not buried in options. +Every function that deals with script-specific text takes Script as +an explicit, required parameter.
var ASSAMESEvar BENGALIvar BURMESEvar DEVANAGARIvar GRANTHAvar GUJARATIvar GURMUKHIvar HARVARD_KYOTOvar IASTvar ISO15919var ITRANSvar KANNADAvar MALAYALAMvar ODIAvar SINHALAvar SLP1var TAMILvar TELUGUvar THAIvar TIBETANvar VELTHUISvar WXvedyut.llm.clientUnified LLM client with swappable backends via LiteLLM
+
+def quick_complete(prompt: str, model: str | None = None) ‑> str
def quick_complete(prompt: str, model: Optional[str] = None) -> str:
    """Quick one-off completion (not for production)

    Builds a fresh LLMClient on every call and sends the prompt as a single
    user message.

    Args:
        prompt: User prompt
        model: Optional model override

    Returns:
        Response text
    """
    user_message = {"role": "user", "content": prompt}
    return LLMClient(model=model).complete([user_message])
+Quick one-off completion (not for production)
+Args: prompt – user prompt; model – optional model override. Returns: response text.
+class LLMClient
+(model: str | None = None,
embedding_model: str | None = None,
temperature: float = 0.7,
max_tokens: int | None = None,
api_key: str | None = None)
class LLMClient:
    """Unified LLM client supporting 100+ providers via LiteLLM

    Supported models:
    - OpenAI: gpt-4o, gpt-4-turbo, gpt-3.5-turbo
    - Anthropic: claude-3-5-sonnet-20241022, claude-3-opus
    - Google: gemini/gemini-1.5-pro, gemini/gemini-1.5-flash
    - Azure, AWS Bedrock, Ollama, etc.

    Configuration via environment variables:
    - VEDYUT_LLM_MODEL: Model name (default: gpt-4o)
    - VEDYUT_EMBEDDING_MODEL: Embedding model name
      (default: text-embedding-3-large)
    - OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, etc.
    """

    DEFAULT_MODEL = "gpt-4o"
    DEFAULT_EMBEDDING_MODEL = "text-embedding-3-large"

    def __init__(
        self,
        model: Optional[str] = None,
        embedding_model: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: Optional[int] = None,
        api_key: Optional[str] = None,
    ):
        """Initialize LLM client

        Args:
            model: Model name (e.g., "gpt-4o", "claude-3-5-sonnet-20241022")
            embedding_model: Model for embeddings
            temperature: Sampling temperature (0.0-1.0)
            max_tokens: Max tokens in response
            api_key: Optional API key (or use env vars)
        """
        self.model = model or os.getenv("VEDYUT_LLM_MODEL", self.DEFAULT_MODEL)
        self.embedding_model = embedding_model or os.getenv(
            "VEDYUT_EMBEDDING_MODEL", self.DEFAULT_EMBEDDING_MODEL
        )
        self.temperature = temperature
        self.max_tokens = max_tokens

        # Keep the key on the instance and pass it with each request.
        # Assigning litellm.api_key would change the key for every other
        # client and LiteLLM call in the process. With no key given, LiteLLM
        # auto-detects keys from env (OPENAI_API_KEY, etc.).
        self.api_key = api_key

    def _request_params(
        self,
        overrides: Dict[str, Any],
        reserved: tuple = (),
    ) -> Dict[str, Any]:
        """Merge per-call keyword overrides with this client's defaults.

        Args:
            overrides: Keyword arguments supplied by the caller.
            reserved: Keys the calling method sets itself; they are dropped
                from ``overrides`` so they cannot be passed twice.

        Returns:
            Keyword arguments ready to be forwarded to LiteLLM.
        """
        params = {k: v for k, v in overrides.items() if k not in reserved}
        params.setdefault("temperature", self.temperature)
        params.setdefault("max_tokens", self.max_tokens)
        if self.api_key:
            # An explicit per-call api_key still wins over the client's key.
            params.setdefault("api_key", self.api_key)
        return params

    def complete(
        self,
        messages: List[Dict[str, str]],
        **kwargs
    ) -> str:
        """Complete a chat conversation

        Args:
            messages: List of {"role": "user/assistant/system", "content": "..."}
            **kwargs: Additional args passed to LiteLLM (temperature, max_tokens, etc.)

        Returns:
            Response text
        """
        response = completion(
            model=self.model,
            messages=messages,
            **self._request_params(kwargs)
        )
        return response.choices[0].message.content

    def complete_with_json(
        self,
        messages: List[Dict[str, str]],
        **kwargs
    ) -> Dict[str, Any]:
        """Complete with structured JSON response

        Args:
            messages: Chat messages
            **kwargs: Additional args; a caller-supplied ``response_format``
                is ignored because JSON mode is always requested

        Returns:
            Parsed JSON response as dict
        """
        response = completion(
            model=self.model,
            messages=messages,
            response_format={"type": "json_object"},
            **self._request_params(kwargs, reserved=("response_format",))
        )

        import json
        content = response.choices[0].message.content
        return json.loads(content)

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for texts

        Args:
            texts: List of text strings to embed (a bare string is treated
                as a one-element list)

        Returns:
            List of embedding vectors
        """
        if isinstance(texts, str):
            texts = [texts]

        credentials = {"api_key": self.api_key} if self.api_key else {}
        response = embedding(
            model=self.embedding_model,
            input=texts,
            **credentials
        )
        return [item["embedding"] for item in response.data]

    def embed_single(self, text: str) -> List[float]:
        """Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding vector
        """
        return self.embed([text])[0]

    def stream(
        self,
        messages: List[Dict[str, str]],
        **kwargs
    ):
        """Stream completion response (for long responses)

        Args:
            messages: Chat messages
            **kwargs: Additional args; a caller-supplied ``stream`` is
                ignored because streaming is always enabled

        Yields:
            Response chunks (chunks with empty content are skipped)
        """
        response = completion(
            model=self.model,
            messages=messages,
            stream=True,
            **self._request_params(kwargs, reserved=("stream",))
        )

        for chunk in response:
            if chunk.choices[0].delta.content:
                yield chunk.choices[0].delta.content
+Unified LLM client supporting 100+ providers via LiteLLM
+Supported models: +- OpenAI: gpt-4o, gpt-4-turbo, gpt-3.5-turbo +- Anthropic: claude-3-5-sonnet-20241022, claude-3-opus +- Google: gemini/gemini-1.5-pro, gemini/gemini-1.5-flash +- Azure, AWS Bedrock, Ollama, etc.
+Configuration via environment variables: +- VEDYUT_LLM_MODEL: Model name (default: gpt-4o) +- OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, etc.
+Initialize LLM client
+modelembedding_modeltemperaturemax_tokensapi_keyvar DEFAULT_EMBEDDING_MODELvar DEFAULT_MODEL
+def complete(self, messages: List[Dict[str, str]], **kwargs) ‑> str
def complete(
    self,
    messages: List[Dict[str, str]],
    **kwargs
) -> str:
    """Complete a chat conversation

    Per-call ``temperature`` and ``max_tokens`` take precedence over the
    values stored on the client.

    Args:
        messages: List of {"role": "user/assistant/system", "content": "..."}
        **kwargs: Additional args passed to LiteLLM (temperature, max_tokens, etc.)

    Returns:
        Response text
    """
    passthrough = dict(kwargs)
    temperature = passthrough.pop("temperature", self.temperature)
    max_tokens = passthrough.pop("max_tokens", self.max_tokens)

    reply = completion(
        model=self.model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        **passthrough
    )
    first_choice = reply.choices[0]
    return first_choice.message.content
+Complete a chat conversation
+messages**kwargsResponse text
+def complete_with_json(self, messages: List[Dict[str, str]], **kwargs) ‑> Dict[str, Any]
def complete_with_json(
    self,
    messages: List[Dict[str, str]],
    **kwargs
) -> Dict[str, Any]:
    """Complete with structured JSON response

    JSON mode is always requested; a ``response_format`` supplied by the
    caller is discarded.

    Args:
        messages: Chat messages
        **kwargs: Additional args

    Returns:
        Parsed JSON response as dict
    """
    import json

    passthrough = dict(kwargs)
    passthrough.pop("response_format", None)
    temperature = passthrough.pop("temperature", self.temperature)
    max_tokens = passthrough.pop("max_tokens", self.max_tokens)

    reply = completion(
        model=self.model,
        messages=messages,
        response_format={"type": "json_object"},
        temperature=temperature,
        max_tokens=max_tokens,
        **passthrough
    )

    raw_text = reply.choices[0].message.content
    return json.loads(raw_text)
+Complete with structured JSON response
+messages**kwargsParsed JSON response as dict
+def embed(self, texts: List[str]) ‑> List[List[float]]
def embed(self, texts: List[str]) -> List[List[float]]:
    """Generate embeddings for texts

    A bare string is accepted as well and is treated as a one-element list.

    Args:
        texts: List of text strings to embed

    Returns:
        List of embedding vectors
    """
    batch = [texts] if isinstance(texts, str) else texts
    result = embedding(model=self.embedding_model, input=batch)

    vectors = []
    for row in result.data:
        vectors.append(row["embedding"])
    return vectors
+Generate embeddings for texts
+textsList of embedding vectors
+def embed_single(self, text: str) ‑> List[float]
def embed_single(self, text: str) -> List[float]:
    """Generate embedding for a single text

    Delegates to ``embed`` with a one-element list and returns its first
    vector.

    Args:
        text: Text to embed

    Returns:
        Embedding vector
    """
    vectors = self.embed([text])
    return vectors[0]
+Generate embedding for a single text
+textEmbedding vector
+def stream(self, messages: List[Dict[str, str]], **kwargs)
def stream(
    self,
    messages: List[Dict[str, str]],
    **kwargs
):
    """Stream completion response (for long responses)

    Streaming is always enabled; a ``stream`` value supplied by the caller
    is discarded. Chunks whose content is empty are skipped.

    Args:
        messages: Chat messages
        **kwargs: Additional args

    Yields:
        Response chunks
    """
    passthrough = dict(kwargs)
    passthrough.pop("stream", None)
    temperature = passthrough.pop("temperature", self.temperature)
    max_tokens = passthrough.pop("max_tokens", self.max_tokens)

    chunks = completion(
        model=self.model,
        messages=messages,
        stream=True,
        temperature=temperature,
        max_tokens=max_tokens,
        **passthrough
    )

    for piece in chunks:
        text = piece.choices[0].delta.content
        if text:
            yield text
+Stream completion response (for long responses)
+messages**kwargsResponse chunks
vedyut.llm – LLM integration for Sanskrit NLP with grammar treatise RAG
+vedyut.llm.client – Unified LLM client with swappable backends via LiteLLM
vedyut.llm.rag – RAG (Retrieval-Augmented Generation) for Sanskrit grammar treatises …
vedyut.llm.tasks – Sanskrit-specific LLM tasks using RAG
+def disambiguate_segmentation(text: str,
candidates: List[List[str]],
llm: LLMClient | None = None,
rag: GrammarRAG | None = None) ‑> int
+def disambiguate_segmentation(
+ text: str,
+ candidates: List[List[str]],
+ llm: Optional[LLMClient] = None,
+ rag: Optional[GrammarRAG] = None,
+) -> int:
+ """Use LLM to choose best segmentation from candidates
+
+ Args:
+ text: Original Sanskrit text
+ candidates: List of possible segmentations (each a list of words)
+ llm: LLM client (created if None)
+ rag: Grammar RAG (optional, for rule-based context)
+
+ Returns:
+ Index of best candidate (0-indexed)
+
+ Example:
+ >>> text = "धर्मक्षेत्रे"
+ >>> candidates = [
+ ... ["धर्म", "क्षेत्रे"],
+ ... ["धर्मक्षेत्रे"],
+ ... ]
+ >>> best_idx = disambiguate_segmentation(text, candidates)
+ >>> print(candidates[best_idx])
+ """
+ if llm is None:
+ llm = LLMClient()
+
+ # Build context from sandhi rules if RAG available
+ context = ""
+ if rag:
+ results = rag.query(f"sandhi rules for: {text}", top_k=2, topic="sandhi")
+ if results:
+ context = "\n\nRelevant sandhi rules:\n" + "\n".join([
+ f"- {chunk.text[:200]}..." for chunk, _ in results
+ ])
+
+ candidates_text = "\n".join([
+ f"{i+1}. {' + '.join(seg)}" for i, seg in enumerate(candidates)
+ ])
+
+ prompt = f"""You are a Sanskrit grammar expert. Given a Sanskrit text and multiple possible segmentations, choose the most grammatically correct and semantically meaningful one.
+
+Text: {text}
+
+Possible segmentations:
+{candidates_text}
+{context}
+
+Respond with ONLY the number (1-{len(candidates)}) of the best segmentation.
+Number: """
+
+ response = llm.complete(
+ [{"role": "user", "content": prompt}],
+ temperature=0.3,
+ max_tokens=10
+ )
+
+ try:
+ number = int(response.strip().split()[0])
+ return max(0, min(number - 1, len(candidates) - 1))
+ except (ValueError, IndexError):
+ return 0 # Default to first candidate
+Use LLM to choose best segmentation from candidates
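+Args: text – Original Sanskrit text; candidates – List of possible segmentations (each a list of words); llm – LLM client (created if None); rag – Grammar RAG (optional, for rule-based context). Returns: Index of best candidate (0-indexed). Example: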
+>>> text = "धर्मक्षेत्रे"
+>>> candidates = [
+... ["धर्म", "क्षेत्रे"],
+... ["धर्मक्षेत्रे"],
+... ]
+>>> best_idx = disambiguate_segmentation(text, candidates)
+>>> print(candidates[best_idx])
+
+def explain_grammar(word: str,
analysis: Dict | None = None,
llm: LLMClient | None = None,
rag: GrammarRAG | None = None) ‑> str
+def explain_grammar(
+ word: str,
+ analysis: Optional[Dict] = None,
+ llm: Optional[LLMClient] = None,
+ rag: Optional[GrammarRAG] = None,
+) -> str:
+ """Generate natural language explanation of grammatical analysis
+
+ Args:
+ word: Sanskrit word
+ analysis: Grammatical analysis dict (lemma, case, number, etc.)
+ llm: LLM client
+ rag: Grammar RAG for rule references
+
+ Returns:
+ Beginner-friendly explanation
+ """
+ if llm is None:
+ llm = LLMClient()
+
+ analysis_text = ""
+ if analysis:
+ analysis_text = "\n".join([f"- {k}: {v}" for k, v in analysis.items()])
+
+ # Get relevant grammar rules if RAG available
+ context = ""
+ if rag and analysis:
+ query = f"grammar for {word} "
+ if "case" in analysis:
+ query += f"case {analysis['case']}"
+ if "tense" in analysis:
+ query += f"tense {analysis['tense']}"
+
+ results = rag.query(query, top_k=2)
+ if results:
+ context = "\n\nGrammar rules:\n" + "\n".join([
+ f"[{chunk.source}] {chunk.text[:150]}..." for chunk, _ in results
+ ])
+
+ prompt = f"""Explain the grammar of this Sanskrit word in simple, beginner-friendly terms:
+
+Word: {word}
+
+Grammatical analysis:
+{analysis_text}
+{context}
+
+Provide a clear explanation suitable for someone learning Sanskrit. Include:
+1. What the word means
+2. Its grammatical function (case, number, gender, tense, etc.)
+3. Why it has this form
+4. A simple example sentence
+
+EXPLANATION:
+"""
+
+ return llm.complete([{"role": "user", "content": prompt}], temperature=0.6)
+Generate natural language explanation of grammatical analysis
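+Args: word – Sanskrit word; analysis – Grammatical analysis dict (lemma, case, number, etc.); llm – LLM client; rag – Grammar RAG for rule references. Returns: Beginner-friendly explanation.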
+def generate_test_cases(function_description: str,
rag: GrammarRAG | None = None,
llm: LLMClient | None = None,
num_cases: int = 10) ‑> List[Dict[str, str]]
+def generate_test_cases(
+ function_description: str,
+ rag: Optional[GrammarRAG] = None,
+ llm: Optional[LLMClient] = None,
+ num_cases: int = 10,
+) -> List[Dict[str, str]]:
+ """Generate test cases for a Sanskrit NLP function
+
+ Args:
+ function_description: What the function does
+ rag: Grammar RAG for rule-based examples
+ llm: LLM client
+ num_cases: Number of test cases to generate
+
+ Returns:
+ List of {"input": "...", "expected": "...", "description": "..."} dicts
+ """
+ if llm is None:
+ llm = LLMClient()
+
+ # Get grammar context if available
+ context = ""
+ if rag:
+ results = rag.query(function_description, top_k=2)
+ if results:
+ context = "\n\nGrammar references:\n" + "\n".join([
+ f"{chunk.text[:200]}..." for chunk, _ in results
+ ])
+
+ prompt = f"""Generate {num_cases} diverse test cases for this Sanskrit NLP function:
+
+Function: {function_description}
+{context}
+
+For each test case, provide:
+1. Input (Sanskrit text or word)
+2. Expected output
+3. Brief description of what it tests
+
+Return as JSON array:
+[
+ {{
+ "input": "...",
+ "expected": "...",
+ "description": "..."
+ }},
+ ...
+]
+
+JSON:
+"""
+
+ try:
+ result = llm.complete_with_json([{"role": "user", "content": prompt}])
+ if isinstance(result, dict) and "test_cases" in result:
+ return result["test_cases"]
+ elif isinstance(result, list):
+ return result
+ else:
+ return []
+ except Exception as e:
+ print(f"Error generating test cases: {e}")
+ return []
+Generate test cases for a Sanskrit NLP function
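+Args: function_description – What the function does; rag – Grammar RAG for rule-based examples; llm – LLM client; num_cases – Number of test cases to generate. Returns: List of {"input": "…", "expected": "…", "description": "…"} dicts.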
+def suggest_implementation(rule_description: str,
rag: GrammarRAG,
language: str = 'rust',
include_tests: bool = True) ‑> str
+def suggest_implementation(
+ rule_description: str,
+ rag: GrammarRAG,
+ language: str = "rust",
+ include_tests: bool = True,
+) -> str:
+ """Generate code implementation suggestion from grammar rule
+
+ ⚠️ WARNING: LLM-generated code requires human review!
+ Use this as a starting point, not production code.
+
+ Args:
+ rule_description: Description of what to implement
+ rag: Grammar RAG (required for rule lookup)
+ language: Target programming language
+ include_tests: Generate test cases
+
+ Returns:
+ Generated code with comments
+ """
+ # Retrieve relevant grammar chunks
+ results = rag.query(rule_description, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+
+ if not context_chunks:
+ return f"# No relevant grammar rules found for: {rule_description}"
+
+ context_text = "\n\n".join([
+ f"[{chunk.source} {chunk.sutra_number or ''}]\n{chunk.text}"
+ for chunk in context_chunks
+ ])
+
+ test_instruction = ""
+ if include_tests:
+ test_instruction = "\n4. Test cases with examples"
+
+ prompt = f"""You are a Sanskrit NLP expert implementing Pāṇinian grammar rules in code.
+
+Grammar References:
+{context_text}
+
+Task: {rule_description}
+
+Generate clean, production-ready {language} code with:
+1. Clear function signature with type annotations
+2. Implementation following the grammar rules above
+3. Detailed comments explaining each step and referencing sūtras{test_instruction}
+
+⚠️ IMPORTANT:
+- Be precise with grammar rules
+- Handle edge cases
+- Note any ambiguities or limitations
+
+{language.upper()} CODE:
+"""
+
+ llm = rag.llm
+ return llm.complete(
+ [{"role": "user", "content": prompt}],
+ temperature=0.3,
+ max_tokens=2000
+ )
+Generate code implementation suggestion from grammar rule
+⚠️ WARNING: LLM-generated code requires human review! Use this as a starting point, not production code.
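+Args: rule_description – Description of what to implement; rag – Grammar RAG (required for rule lookup); language – Target programming language; include_tests – Generate test cases. Returns: Generated code with comments.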
+def translate_sanskrit(text: str,
target_lang: str = 'english',
llm: LLMClient | None = None,
with_explanation: bool = False) ‑> str
+def translate_sanskrit(
+ text: str,
+ target_lang: str = "english",
+ llm: Optional[LLMClient] = None,
+ with_explanation: bool = False,
+) -> str:
+ """Translate Sanskrit text to target language
+
+ Args:
+ text: Sanskrit text (Devanagari or transliterated)
+ target_lang: Target language (default: "english")
+ llm: LLM client
+ with_explanation: Include word-by-word breakdown
+
+ Returns:
+ Translation (and optional explanation)
+ """
+ if llm is None:
+ llm = LLMClient()
+
+ if with_explanation:
+ prompt = f"""Translate this Sanskrit text to {target_lang} with word-by-word explanation:
+
+Sanskrit: {text}
+
+Provide:
+1. Word-by-word breakdown with grammatical analysis
+2. Smooth {target_lang} translation
+
+FORMAT:
+Word-by-word:
+- word1 (grammatical info): meaning
+- word2 (grammatical info): meaning
+
+Translation: [full translation]
+"""
+ else:
+ prompt = f"Translate this Sanskrit text to {target_lang}: {text}"
+
+ return llm.complete([{"role": "user", "content": prompt}], temperature=0.5)
+Translate Sanskrit text to target language
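+Args: text – Sanskrit text (Devanagari or transliterated); target_lang – Target language (default: "english"); llm – LLM client; with_explanation – Include word-by-word breakdown. Returns: Translation (and optional explanation).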
+class GrammarRAG
+(data_dir: str = 'data/grammar',
llm_client: LLMClient | None = None,
index_file: str = 'grammar_index.json')
+class GrammarRAG:
+ """RAG system for Sanskrit grammar treatises
+
+ Usage:
+ rag = GrammarRAG(data_dir="data/grammar")
+ rag.load_texts() # Load grammar treatises
+ rag.build_index() # Generate embeddings
+
+ # Query for relevant rules
+ results = rag.query("How to form present tense verbs?", top_k=3)
+
+ # Use with LLM
+ code = rag.generate_code("Implement sandhi rule for 'a + i → e'")
+ """
+
+ def __init__(
+ self,
+ data_dir: str = "data/grammar",
+ llm_client: Optional[LLMClient] = None,
+ index_file: str = "grammar_index.json"
+ ):
+ """Initialize RAG system
+
+ Args:
+ data_dir: Directory containing grammar text files
+ llm_client: LLM client for embeddings and generation
+ index_file: File to save/load embedded chunks
+ """
+ self.data_dir = Path(data_dir)
+ self.llm = llm_client or LLMClient()
+ self.index_file = self.data_dir / index_file
+
+ self.chunks: List[GrammarChunk] = []
+ self.chunk_embeddings: Optional[np.ndarray] = None
+
+ def load_texts(self):
+ """Load grammar treatises from data directory
+
+ Expected structure:
+ data/grammar/
+ ashtadhyayi.txt # Sūtras in Sanskrit/SLP1
+ kashika.txt # Commentary in Sanskrit
+ kale_grammar.txt # English textbook
+ panini_intro.txt # Modern English explanations
+ custom_rules.json # Custom rule definitions
+ """
+ if not self.data_dir.exists():
+ print(f"Warning: Grammar data directory not found: {self.data_dir}")
+ print("Create it and add grammar texts to enable RAG functionality.")
+ return
+
+ # Load text files
+ for file_path in self.data_dir.glob("*.txt"):
+ self._load_text_file(file_path)
+
+ # Load structured JSON files
+ for file_path in self.data_dir.glob("*.json"):
+ self._load_json_file(file_path)
+
+ print(f"Loaded {len(self.chunks)} grammar chunks from {self.data_dir}")
+
+ def _load_text_file(self, file_path: Path):
+ """Load and chunk a text file"""
+ source = file_path.stem # e.g., "ashtadhyayi", "kale_grammar"
+ language = "sanskrit" if any(x in source for x in ["ashtadhyayi", "kashika"]) else "english"
+
+ with open(file_path, encoding="utf-8") as f:
+ content = f.read()
+
+ # Simple chunking by paragraphs (TODO: improve with sutra-aware chunking)
+ paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
+
+ for i, para in enumerate(paragraphs):
+ chunk = GrammarChunk(
+ id=f"{source}_{i}",
+ text=para,
+ source=source,
+ language=language,
+ sutra_number=self._extract_sutra_number(para),
+ topic=self._infer_topic(para),
+ )
+ self.chunks.append(chunk)
+
+ def _load_json_file(self, file_path: Path):
+ """Load structured grammar rules from JSON
+
+ Expected format:
+ [
+ {
+ "sutra": "1.1.1",
+ "sanskrit": "वृद्धिरादैच्",
+ "transliteration": "vṛddhir ādaic",
+ "english": "a, ai, au are called vṛddhi",
+ "explanation": "This defines the vṛddhi vowels...",
+ "topic": "sandhi"
+ },
+ ...
+ ]
+ """
+ with open(file_path, encoding="utf-8") as f:
+ data = json.load(f)
+
+ for i, rule in enumerate(data):
+ # Create chunks for Sanskrit and English versions
+ if "sanskrit" in rule:
+ chunk = GrammarChunk(
+ id=f"{file_path.stem}_{i}_sa",
+ text=f"{rule.get('sutra', '')}: {rule['sanskrit']}\n{rule.get('explanation', '')}",
+ source=file_path.stem,
+ sutra_number=rule.get("sutra"),
+ topic=rule.get("topic"),
+ language="sanskrit",
+ )
+ self.chunks.append(chunk)
+
+ if "english" in rule:
+ chunk = GrammarChunk(
+ id=f"{file_path.stem}_{i}_en",
+ text=f"{rule.get('sutra', '')}: {rule['english']}\n{rule.get('explanation', '')}",
+ source=file_path.stem,
+ sutra_number=rule.get("sutra"),
+ topic=rule.get("topic"),
+ language="english",
+ )
+ self.chunks.append(chunk)
+
+ def _extract_sutra_number(self, text: str) -> Optional[str]:
+ """Extract sūtra number from text (e.g., '1.1.1', '3.2.123')"""
+ import re
+ match = re.search(r'\b(\d+\.\d+\.\d+)\b', text[:100])
+ return match.group(1) if match else None
+
+ def _infer_topic(self, text: str) -> Optional[str]:
+ """Infer grammatical topic from text content"""
+ text_lower = text.lower()
+ if any(word in text_lower for word in ["sandhi", "सन्धि"]):
+ return "sandhi"
+ elif any(word in text_lower for word in ["lakara", "लकार", "tense", "वृत्ति"]):
+ return "lakara"
+ elif any(word in text_lower for word in ["dhatu", "धातु", "verb", "root"]):
+ return "dhatu"
+ elif any(word in text_lower for word in ["vibhakti", "विभक्ति", "case"]):
+ return "vibhakti"
+ elif any(word in text_lower for word in ["samasa", "समास", "compound"]):
+ return "samasa"
+ return None
+
+ def build_index(self, force_rebuild: bool = False):
+ """Generate embeddings for all chunks and build search index
+
+ Args:
+ force_rebuild: If True, rebuild even if index exists
+ """
+ # Try to load existing index
+ if not force_rebuild and self.index_file.exists():
+ self._load_index()
+ print(f"Loaded existing index from {self.index_file}")
+ return
+
+ if not self.chunks:
+ print("No chunks to index. Run load_texts() first.")
+ return
+
+ print(f"Generating embeddings for {len(self.chunks)} chunks...")
+ texts = [chunk.text for chunk in self.chunks]
+
+ # Generate embeddings in batches (API rate limits)
+ batch_size = 100
+ all_embeddings = []
+
+ for i in range(0, len(texts), batch_size):
+ batch = texts[i:i + batch_size]
+ embeddings = self.llm.embed(batch)
+ all_embeddings.extend(embeddings)
+ print(f" Embedded {min(i + batch_size, len(texts))}/{len(texts)}")
+
+ # Store embeddings in chunks
+ for chunk, embedding in zip(self.chunks, all_embeddings):
+ chunk.embedding = embedding
+
+ self.chunk_embeddings = np.array(all_embeddings)
+
+ # Save index
+ self._save_index()
+ print(f"Index saved to {self.index_file}")
+
+ def _save_index(self):
+ """Save chunks and embeddings to disk"""
+ self.data_dir.mkdir(parents=True, exist_ok=True)
+
+ data = {
+ "chunks": [asdict(chunk) for chunk in self.chunks],
+ "version": "1.0"
+ }
+
+ with open(self.index_file, "w", encoding="utf-8") as f:
+ json.dump(data, f, ensure_ascii=False, indent=2)
+
+ def _load_index(self):
+ """Load chunks and embeddings from disk"""
+ with open(self.index_file, encoding="utf-8") as f:
+ data = json.load(f)
+
+ self.chunks = [GrammarChunk(**chunk) for chunk in data["chunks"]]
+ self.chunk_embeddings = np.array([chunk.embedding for chunk in self.chunks])
+
+ def query(
+ self,
+ query_text: str,
+ top_k: int = 5,
+ topic: Optional[str] = None,
+ language: Optional[str] = None,
+ ) -> List[Tuple[GrammarChunk, float]]:
+ """Retrieve most relevant grammar chunks for a query
+
+ Args:
+ query_text: Natural language query (e.g., "How to form past tense?")
+ top_k: Number of results to return
+ topic: Filter by topic ("sandhi", "lakara", etc.)
+ language: Filter by language ("sanskrit" or "english")
+
+ Returns:
+ List of (chunk, similarity_score) tuples, sorted by relevance
+ """
+ if self.chunk_embeddings is None:
+ raise ValueError("Index not built. Run build_index() first.")
+
+ # Generate query embedding
+ query_embedding = self.llm.embed_single(query_text)
+ query_vec = np.array(query_embedding)
+
+ # Compute cosine similarity
+ similarities = np.dot(self.chunk_embeddings, query_vec) / (
+ np.linalg.norm(self.chunk_embeddings, axis=1) * np.linalg.norm(query_vec)
+ )
+
+ # Filter by topic/language if specified
+ filtered_indices = []
+ for i, chunk in enumerate(self.chunks):
+ if topic and chunk.topic != topic:
+ continue
+ if language and chunk.language != language:
+ continue
+ filtered_indices.append(i)
+
+ # Get top-k
+ if filtered_indices:
+ filtered_sims = [(i, similarities[i]) for i in filtered_indices]
+ top_indices = sorted(filtered_sims, key=lambda x: x[1], reverse=True)[:top_k]
+ else:
+ top_indices = [(i, similarities[i]) for i in np.argsort(similarities)[::-1][:top_k]]
+
+ results = [(self.chunks[i], float(score)) for i, score in top_indices]
+ return results
+
+ def generate_code(
+ self,
+ task_description: str,
+ context_chunks: Optional[List[GrammarChunk]] = None,
+ language: str = "rust",
+ ) -> str:
+ """Generate code implementation based on grammar rules
+
+ Args:
+ task_description: What to implement (e.g., "sandhi rule for a + i")
+ context_chunks: Relevant grammar chunks (auto-retrieved if None)
+ language: Target programming language
+
+ Returns:
+ Generated code with comments
+ """
+ # Retrieve relevant chunks if not provided
+ if context_chunks is None:
+ results = self.query(task_description, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+
+ # Build context from chunks
+ context_text = "\n\n".join([
+ f"[{chunk.source} {chunk.sutra_number or ''}]\n{chunk.text}"
+ for chunk in context_chunks
+ ])
+
+ prompt = f"""You are a Sanskrit NLP expert. Based on these Pāṇinian grammar rules, generate {language} code to implement the requested functionality.
+
+Grammar References:
+{context_text}
+
+Task: {task_description}
+
+Generate clean, well-commented {language} code. Include:
+1. Function signature with types
+2. Implementation logic
+3. Comments explaining the grammar rule
+4. Example usage in comments
+
+{language.upper()} CODE:
+"""
+
+ messages = [{"role": "user", "content": prompt}]
+ return self.llm.complete(messages, temperature=0.3)
+
+ def explain_rule(
+ self,
+ sutra_number: Optional[str] = None,
+ query: Optional[str] = None,
+ ) -> str:
+ """Get natural language explanation of a grammar rule
+
+ Args:
+ sutra_number: Specific sūtra (e.g., "1.1.1")
+ query: Natural language query (if sutra_number not provided)
+
+ Returns:
+ Plain English explanation
+ """
+ if sutra_number:
+ # Find chunks with this sutra
+ matching_chunks = [c for c in self.chunks if c.sutra_number == sutra_number]
+ if not matching_chunks:
+ return f"Sūtra {sutra_number} not found in loaded texts."
+ context_chunks = matching_chunks[:3]
+ elif query:
+ results = self.query(query, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+ else:
+ raise ValueError("Provide either sutra_number or query")
+
+ context_text = "\n\n".join([chunk.text for chunk in context_chunks])
+
+ prompt = f"""Explain this Pāṇinian grammar rule in simple, clear English.
+
+Grammar Text:
+{context_text}
+
+Provide:
+1. What the rule says
+2. When it applies
+3. A simple example
+4. Common mistakes
+
+EXPLANATION:
+"""
+
+ messages = [{"role": "user", "content": prompt}]
+ return self.llm.complete(messages, temperature=0.5)
+RAG system for Sanskrit grammar treatises
+rag = GrammarRAG(data_dir="data/grammar")
+rag.load_texts()   # Load grammar treatises
+rag.build_index()  # Generate embeddings
+results = rag.query("How to form present tense verbs?", top_k=3)
+code = rag.generate_code("Implement sandhi rule for 'a + i → e'")
+Initialize RAG system
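+Args: data_dir – Directory containing grammar text files; llm_client – LLM client for embeddings and generation; index_file – File to save/load embedded chunks.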
+def build_index(self, force_rebuild: bool = False)
+def build_index(self, force_rebuild: bool = False):
+ """Generate embeddings for all chunks and build search index
+
+ Args:
+ force_rebuild: If True, rebuild even if index exists
+ """
+ # Try to load existing index
+ if not force_rebuild and self.index_file.exists():
+ self._load_index()
+ print(f"Loaded existing index from {self.index_file}")
+ return
+
+ if not self.chunks:
+ print("No chunks to index. Run load_texts() first.")
+ return
+
+ print(f"Generating embeddings for {len(self.chunks)} chunks...")
+ texts = [chunk.text for chunk in self.chunks]
+
+ # Generate embeddings in batches (API rate limits)
+ batch_size = 100
+ all_embeddings = []
+
+ for i in range(0, len(texts), batch_size):
+ batch = texts[i:i + batch_size]
+ embeddings = self.llm.embed(batch)
+ all_embeddings.extend(embeddings)
+ print(f" Embedded {min(i + batch_size, len(texts))}/{len(texts)}")
+
+ # Store embeddings in chunks
+ for chunk, embedding in zip(self.chunks, all_embeddings):
+ chunk.embedding = embedding
+
+ self.chunk_embeddings = np.array(all_embeddings)
+
+ # Save index
+ self._save_index()
+ print(f"Index saved to {self.index_file}")
+Generate embeddings for all chunks and build search index
+Args: force_rebuild – If True, rebuild even if index exists.
+def explain_rule(self, sutra_number: str | None = None, query: str | None = None) ‑> str
+ def explain_rule(
+ self,
+ sutra_number: Optional[str] = None,
+ query: Optional[str] = None,
+ ) -> str:
+ """Get natural language explanation of a grammar rule
+
+ Args:
+ sutra_number: Specific sūtra (e.g., "1.1.1")
+ query: Natural language query (if sutra_number not provided)
+
+ Returns:
+ Plain English explanation
+ """
+ if sutra_number:
+ # Find chunks with this sutra
+ matching_chunks = [c for c in self.chunks if c.sutra_number == sutra_number]
+ if not matching_chunks:
+ return f"Sūtra {sutra_number} not found in loaded texts."
+ context_chunks = matching_chunks[:3]
+ elif query:
+ results = self.query(query, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+ else:
+ raise ValueError("Provide either sutra_number or query")
+
+ context_text = "\n\n".join([chunk.text for chunk in context_chunks])
+
+ prompt = f"""Explain this Pāṇinian grammar rule in simple, clear English.
+
+Grammar Text:
+{context_text}
+
+Provide:
+1. What the rule says
+2. When it applies
+3. A simple example
+4. Common mistakes
+
+EXPLANATION:
+"""
+
+ messages = [{"role": "user", "content": prompt}]
+ return self.llm.complete(messages, temperature=0.5)
+Get natural language explanation of a grammar rule
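+Args: sutra_number – Specific sūtra (e.g., "1.1.1"); query – Natural language query (if sutra_number not provided). Returns: Plain English explanation.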
+def generate_code(self,
task_description: str,
context_chunks: List[GrammarChunk] | None = None,
language: str = 'rust') ‑> str
+ def generate_code(
+ self,
+ task_description: str,
+ context_chunks: Optional[List[GrammarChunk]] = None,
+ language: str = "rust",
+ ) -> str:
+ """Generate code implementation based on grammar rules
+
+ Args:
+ task_description: What to implement (e.g., "sandhi rule for a + i")
+ context_chunks: Relevant grammar chunks (auto-retrieved if None)
+ language: Target programming language
+
+ Returns:
+ Generated code with comments
+ """
+ # Retrieve relevant chunks if not provided
+ if context_chunks is None:
+ results = self.query(task_description, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+
+ # Build context from chunks
+ context_text = "\n\n".join([
+ f"[{chunk.source} {chunk.sutra_number or ''}]\n{chunk.text}"
+ for chunk in context_chunks
+ ])
+
+ prompt = f"""You are a Sanskrit NLP expert. Based on these Pāṇinian grammar rules, generate {language} code to implement the requested functionality.
+
+Grammar References:
+{context_text}
+
+Task: {task_description}
+
+Generate clean, well-commented {language} code. Include:
+1. Function signature with types
+2. Implementation logic
+3. Comments explaining the grammar rule
+4. Example usage in comments
+
+{language.upper()} CODE:
+"""
+
+ messages = [{"role": "user", "content": prompt}]
+ return self.llm.complete(messages, temperature=0.3)
+Generate code implementation based on grammar rules
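+Args: task_description – What to implement (e.g., "sandhi rule for a + i"); context_chunks – Relevant grammar chunks (auto-retrieved if None); language – Target programming language. Returns: Generated code with comments.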
+def load_texts(self)
+def load_texts(self):
+ """Load grammar treatises from data directory
+
+ Expected structure:
+ data/grammar/
+ ashtadhyayi.txt # Sūtras in Sanskrit/SLP1
+ kashika.txt # Commentary in Sanskrit
+ kale_grammar.txt # English textbook
+ panini_intro.txt # Modern English explanations
+ custom_rules.json # Custom rule definitions
+ """
+ if not self.data_dir.exists():
+ print(f"Warning: Grammar data directory not found: {self.data_dir}")
+ print("Create it and add grammar texts to enable RAG functionality.")
+ return
+
+ # Load text files
+ for file_path in self.data_dir.glob("*.txt"):
+ self._load_text_file(file_path)
+
+ # Load structured JSON files
+ for file_path in self.data_dir.glob("*.json"):
+ self._load_json_file(file_path)
+
+ print(f"Loaded {len(self.chunks)} grammar chunks from {self.data_dir}")
+Load grammar treatises from data directory
+Expected structure:
+    data/grammar/
+        ashtadhyayi.txt      # Sūtras in Sanskrit/SLP1
+        kashika.txt          # Commentary in Sanskrit
+        kale_grammar.txt     # English textbook
+        panini_intro.txt     # Modern English explanations
+        custom_rules.json    # Custom rule definitions
+def query(self,
query_text: str,
top_k: int = 5,
topic: str | None = None,
language: str | None = None) ‑> List[Tuple[GrammarChunk, float]]
+def query(
+ self,
+ query_text: str,
+ top_k: int = 5,
+ topic: Optional[str] = None,
+ language: Optional[str] = None,
+) -> List[Tuple[GrammarChunk, float]]:
+ """Retrieve most relevant grammar chunks for a query
+
+ Args:
+ query_text: Natural language query (e.g., "How to form past tense?")
+ top_k: Number of results to return
+ topic: Filter by topic ("sandhi", "lakara", etc.)
+ language: Filter by language ("sanskrit" or "english")
+
+ Returns:
+ List of (chunk, similarity_score) tuples, sorted by relevance
+ """
+ if self.chunk_embeddings is None:
+ raise ValueError("Index not built. Run build_index() first.")
+
+ # Generate query embedding
+ query_embedding = self.llm.embed_single(query_text)
+ query_vec = np.array(query_embedding)
+
+ # Compute cosine similarity
+ similarities = np.dot(self.chunk_embeddings, query_vec) / (
+ np.linalg.norm(self.chunk_embeddings, axis=1) * np.linalg.norm(query_vec)
+ )
+
+ # Filter by topic/language if specified
+ filtered_indices = []
+ for i, chunk in enumerate(self.chunks):
+ if topic and chunk.topic != topic:
+ continue
+ if language and chunk.language != language:
+ continue
+ filtered_indices.append(i)
+
+ # Get top-k
+ if filtered_indices:
+ filtered_sims = [(i, similarities[i]) for i in filtered_indices]
+ top_indices = sorted(filtered_sims, key=lambda x: x[1], reverse=True)[:top_k]
+ else:
+ top_indices = [(i, similarities[i]) for i in np.argsort(similarities)[::-1][:top_k]]
+
+ results = [(self.chunks[i], float(score)) for i, score in top_indices]
+ return results
+Retrieve most relevant grammar chunks for a query
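+Args: query_text – Natural language query (e.g., "How to form past tense?"); top_k – Number of results to return; topic – Filter by topic ("sandhi", "lakara", etc.); language – Filter by language ("sanskrit" or "english"). Returns: List of (chunk, similarity_score) tuples, sorted by relevance.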
+class LLMClient
+(model: str | None = None,
embedding_model: str | None = None,
temperature: float = 0.7,
max_tokens: int | None = None,
api_key: str | None = None)
+class LLMClient:
+ """Unified LLM client supporting 100+ providers via LiteLLM
+
+ Supported models:
+ - OpenAI: gpt-4o, gpt-4-turbo, gpt-3.5-turbo
+ - Anthropic: claude-3-5-sonnet-20241022, claude-3-opus
+ - Google: gemini/gemini-1.5-pro, gemini/gemini-1.5-flash
+ - Azure, AWS Bedrock, Ollama, etc.
+
+ Configuration via environment variables:
+ - VEDYUT_LLM_MODEL: Model name (default: gpt-4o)
+ - OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, etc.
+ """
+
+ DEFAULT_MODEL = "gpt-4o"
+ DEFAULT_EMBEDDING_MODEL = "text-embedding-3-large"
+
+ def __init__(
+ self,
+ model: Optional[str] = None,
+ embedding_model: Optional[str] = None,
+ temperature: float = 0.7,
+ max_tokens: Optional[int] = None,
+ api_key: Optional[str] = None,
+ ):
+ """Initialize LLM client
+
+ Args:
+ model: Model name (e.g., "gpt-4o", "claude-3-5-sonnet-20241022")
+ embedding_model: Model for embeddings
+ temperature: Sampling temperature (0.0-1.0)
+ max_tokens: Max tokens in response
+ api_key: Optional API key (or use env vars)
+ """
+ self.model = model or os.getenv("VEDYUT_LLM_MODEL", self.DEFAULT_MODEL)
+ self.embedding_model = embedding_model or os.getenv(
+ "VEDYUT_EMBEDDING_MODEL", self.DEFAULT_EMBEDDING_MODEL
+ )
+ self.temperature = temperature
+ self.max_tokens = max_tokens
+
+ # LiteLLM auto-detects API keys from env (OPENAI_API_KEY, etc.)
+ if api_key:
+ litellm.api_key = api_key
+
+ def complete(
+ self,
+ messages: List[Dict[str, str]],
+ **kwargs
+ ) -> str:
+ """Complete a chat conversation
+
+ Args:
+ messages: List of {"role": "user/assistant/system", "content": "..."}
+ **kwargs: Additional args passed to LiteLLM (temperature, max_tokens, etc.)
+
+ Returns:
+ Response text
+ """
+ response = completion(
+ model=self.model,
+ messages=messages,
+ temperature=kwargs.get("temperature", self.temperature),
+ max_tokens=kwargs.get("max_tokens", self.max_tokens),
+ **{k: v for k, v in kwargs.items() if k not in ["temperature", "max_tokens"]}
+ )
+ return response.choices[0].message.content
+
+ def complete_with_json(
+ self,
+ messages: List[Dict[str, str]],
+ **kwargs
+ ) -> Dict[str, Any]:
+ """Complete with structured JSON response
+
+ Args:
+ messages: Chat messages
+ **kwargs: Additional args
+
+ Returns:
+ Parsed JSON response as dict
+ """
+ response = completion(
+ model=self.model,
+ messages=messages,
+ response_format={"type": "json_object"},
+ temperature=kwargs.get("temperature", self.temperature),
+ max_tokens=kwargs.get("max_tokens", self.max_tokens),
+ **{k: v for k, v in kwargs.items()
+ if k not in ["temperature", "max_tokens", "response_format"]}
+ )
+
+ import json
+ content = response.choices[0].message.content
+ return json.loads(content)
+
+ def embed(self, texts: List[str]) -> List[List[float]]:
+ """Generate embeddings for texts
+
+ Args:
+ texts: List of text strings to embed
+
+ Returns:
+ List of embedding vectors
+ """
+ if isinstance(texts, str):
+ texts = [texts]
+
+ response = embedding(
+ model=self.embedding_model,
+ input=texts
+ )
+ return [item["embedding"] for item in response.data]
+
+ def embed_single(self, text: str) -> List[float]:
+ """Generate embedding for a single text
+
+ Args:
+ text: Text to embed
+
+ Returns:
+ Embedding vector
+ """
+ return self.embed([text])[0]
+
+ def stream(
+ self,
+ messages: List[Dict[str, str]],
+ **kwargs
+ ):
+ """Stream completion response (for long responses)
+
+ Args:
+ messages: Chat messages
+ **kwargs: Additional args
+
+ Yields:
+ Response chunks
+ """
+ response = completion(
+ model=self.model,
+ messages=messages,
+ stream=True,
+ temperature=kwargs.get("temperature", self.temperature),
+ max_tokens=kwargs.get("max_tokens", self.max_tokens),
+ **{k: v for k, v in kwargs.items()
+ if k not in ["temperature", "max_tokens", "stream"]}
+ )
+
+ for chunk in response:
+ if chunk.choices[0].delta.content:
+ yield chunk.choices[0].delta.content
+Unified LLM client supporting 100+ providers via LiteLLM
+Supported models:
+- OpenAI: gpt-4o, gpt-4-turbo, gpt-3.5-turbo
+- Anthropic: claude-3-5-sonnet-20241022, claude-3-opus
+- Google: gemini/gemini-1.5-pro, gemini/gemini-1.5-flash
+- Azure, AWS Bedrock, Ollama, etc.
+Configuration via environment variables:
+- VEDYUT_LLM_MODEL: Model name (default: gpt-4o)
+- OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, etc.
+Initialize LLM client
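+Args: model – Model name (e.g., "gpt-4o", "claude-3-5-sonnet-20241022"); embedding_model – Model for embeddings; temperature – Sampling temperature (0.0-1.0); max_tokens – Max tokens in response; api_key – Optional API key (or use env vars).
+Class variables: DEFAULT_EMBEDDING_MODEL, DEFAULT_MODEL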
+def complete(self, messages: List[Dict[str, str]], **kwargs) ‑> str
+def complete(
+ self,
+ messages: List[Dict[str, str]],
+ **kwargs
+) -> str:
+ """Complete a chat conversation
+
+ Args:
+ messages: List of {"role": "user/assistant/system", "content": "..."}
+ **kwargs: Additional args passed to LiteLLM (temperature, max_tokens, etc.)
+
+ Returns:
+ Response text
+ """
+ response = completion(
+ model=self.model,
+ messages=messages,
+ temperature=kwargs.get("temperature", self.temperature),
+ max_tokens=kwargs.get("max_tokens", self.max_tokens),
+ **{k: v for k, v in kwargs.items() if k not in ["temperature", "max_tokens"]}
+ )
+ return response.choices[0].message.content
+Complete a chat conversation
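+Args: messages — list of {"role": "user/assistant/system", "content": "..."}; **kwargs — additional args passed to LiteLLM (temperature, max_tokens, etc.). Returns: Response text.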
+def complete_with_json(self, messages: List[Dict[str, str]], **kwargs) ‑> Dict[str, Any]
+def complete_with_json(
+ self,
+ messages: List[Dict[str, str]],
+ **kwargs
+) -> Dict[str, Any]:
+ """Complete with structured JSON response
+
+ Args:
+ messages: Chat messages
+ **kwargs: Additional args
+
+ Returns:
+ Parsed JSON response as dict
+ """
+ response = completion(
+ model=self.model,
+ messages=messages,
+ response_format={"type": "json_object"},
+ temperature=kwargs.get("temperature", self.temperature),
+ max_tokens=kwargs.get("max_tokens", self.max_tokens),
+ **{k: v for k, v in kwargs.items()
+ if k not in ["temperature", "max_tokens", "response_format"]}
+ )
+
+ import json
+ content = response.choices[0].message.content
+ return json.loads(content)
+Complete with structured JSON response
+Args: messages — chat messages; **kwargs — additional args. Returns: Parsed JSON response as dict.
+def embed(self, texts: List[str]) ‑> List[List[float]]
+def embed(self, texts: List[str]) -> List[List[float]]:
+ """Generate embeddings for texts
+
+ Args:
+ texts: List of text strings to embed
+
+ Returns:
+ List of embedding vectors
+ """
+ if isinstance(texts, str):
+ texts = [texts]
+
+ response = embedding(
+ model=self.embedding_model,
+ input=texts
+ )
+ return [item["embedding"] for item in response.data]
+Generate embeddings for texts
+Args: texts — list of text strings to embed. Returns: List of embedding vectors.
+def embed_single(self, text: str) ‑> List[float]
+def embed_single(self, text: str) -> List[float]:
+ """Generate embedding for a single text
+
+ Args:
+ text: Text to embed
+
+ Returns:
+ Embedding vector
+ """
+ return self.embed([text])[0]
+Generate embedding for a single text
+Args: text — text to embed. Returns: Embedding vector.
+def stream(self, messages: List[Dict[str, str]], **kwargs)
+def stream(
+ self,
+ messages: List[Dict[str, str]],
+ **kwargs
+):
+ """Stream completion response (for long responses)
+
+ Args:
+ messages: Chat messages
+ **kwargs: Additional args
+
+ Yields:
+ Response chunks
+ """
+ response = completion(
+ model=self.model,
+ messages=messages,
+ stream=True,
+ temperature=kwargs.get("temperature", self.temperature),
+ max_tokens=kwargs.get("max_tokens", self.max_tokens),
+ **{k: v for k, v in kwargs.items()
+ if k not in ["temperature", "max_tokens", "stream"]}
+ )
+
+ for chunk in response:
+ if chunk.choices[0].delta.content:
+ yield chunk.choices[0].delta.content
+Stream completion response (for long responses)
+Args: messages — chat messages; **kwargs — additional args. Yields: Response chunks.
vedyut.llm.ragRAG (Retrieval-Augmented Generation) for Sanskrit grammar treatises
+This module enables LLMs to reference Pāṇinian grammar texts: +- Aṣṭādhyāyī sūtras (Sanskrit) +- Kāśikā commentary (Sanskrit) +- English textbooks (Kale, Whitney, etc.) +- Modern explanations
+The LLM can then: +1. Retrieve relevant sūtras for a grammar question +2. Generate code based on grammar rules +3. Explain rules in natural language +4. Cross-reference multiple sources
+
+class GrammarChunk
+(id: str,
text: str,
source: str,
sutra_number: str | None = None,
topic: str | None = None,
language: str = 'sanskrit',
embedding: List[float] | None = None)
+@dataclass
+class GrammarChunk:
+    """A chunk of grammar text with metadata
+
+    Only ``id``, ``text`` and ``source`` are required; the remaining
+    fields are optional metadata with defaults.
+    """
+    # Identifier string for the chunk
+    id: str
+    text: str  # The actual content (sūtra + commentary)
+    source: str  # "ashtadhyayi", "kashika", "kale", etc.
+    sutra_number: Optional[str] = None  # e.g., "1.1.1", "3.2.123"
+    topic: Optional[str] = None  # e.g., "sandhi", "lakara", "dhatu"
+    language: str = "sanskrit"  # "sanskrit" or "english"
+    # Embedding vector for ``text``; None until one has been assigned
+    embedding: Optional[List[float]] = None
+A chunk of grammar text with metadata
 var embedding : List[float] | None; var id : str; var language : str; var source : str; var sutra_number : str | None; var text : str; var topic : str | None
+class GrammarRAG
+(data_dir: str = 'data/grammar',
llm_client: LLMClient | None = None,
index_file: str = 'grammar_index.json')
+class GrammarRAG:
+ """RAG system for Sanskrit grammar treatises
+
+ Usage:
+ rag = GrammarRAG(data_dir="data/grammar")
+ rag.load_texts() # Load grammar treatises
+ rag.build_index() # Generate embeddings
+
+ # Query for relevant rules
+ results = rag.query("How to form present tense verbs?", top_k=3)
+
+ # Use with LLM
+ code = rag.generate_code("Implement sandhi rule for 'a + i → e'")
+ """
+
+ def __init__(
+ self,
+ data_dir: str = "data/grammar",
+ llm_client: Optional[LLMClient] = None,
+ index_file: str = "grammar_index.json"
+ ):
+ """Initialize RAG system
+
+ Args:
+ data_dir: Directory containing grammar text files
+ llm_client: LLM client for embeddings and generation
+ index_file: File to save/load embedded chunks
+ """
+ self.data_dir = Path(data_dir)
+ self.llm = llm_client or LLMClient()
+ self.index_file = self.data_dir / index_file
+
+ self.chunks: List[GrammarChunk] = []
+ self.chunk_embeddings: Optional[np.ndarray] = None
+
+ def load_texts(self):
+ """Load grammar treatises from data directory
+
+ Expected structure:
+ data/grammar/
+ ashtadhyayi.txt # Sūtras in Sanskrit/SLP1
+ kashika.txt # Commentary in Sanskrit
+ kale_grammar.txt # English textbook
+ panini_intro.txt # Modern English explanations
+ custom_rules.json # Custom rule definitions
+ """
+ if not self.data_dir.exists():
+ print(f"Warning: Grammar data directory not found: {self.data_dir}")
+ print("Create it and add grammar texts to enable RAG functionality.")
+ return
+
+ # Load text files
+ for file_path in self.data_dir.glob("*.txt"):
+ self._load_text_file(file_path)
+
+ # Load structured JSON files
+ for file_path in self.data_dir.glob("*.json"):
+ self._load_json_file(file_path)
+
+ print(f"Loaded {len(self.chunks)} grammar chunks from {self.data_dir}")
+
+ def _load_text_file(self, file_path: Path):
+ """Load and chunk a text file"""
+ source = file_path.stem # e.g., "ashtadhyayi", "kale_grammar"
+ language = "sanskrit" if any(x in source for x in ["ashtadhyayi", "kashika"]) else "english"
+
+ with open(file_path, encoding="utf-8") as f:
+ content = f.read()
+
+ # Simple chunking by paragraphs (TODO: improve with sutra-aware chunking)
+ paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
+
+ for i, para in enumerate(paragraphs):
+ chunk = GrammarChunk(
+ id=f"{source}_{i}",
+ text=para,
+ source=source,
+ language=language,
+ sutra_number=self._extract_sutra_number(para),
+ topic=self._infer_topic(para),
+ )
+ self.chunks.append(chunk)
+
+ def _load_json_file(self, file_path: Path):
+ """Load structured grammar rules from JSON
+
+ Expected format:
+ [
+ {
+ "sutra": "1.1.1",
+ "sanskrit": "वृद्धिरादैच्",
+ "transliteration": "vṛddhir ādaic",
+ "english": "a, ai, au are called vṛddhi",
+ "explanation": "This defines the vṛddhi vowels...",
+ "topic": "sandhi"
+ },
+ ...
+ ]
+ """
+ with open(file_path, encoding="utf-8") as f:
+ data = json.load(f)
+
+ for i, rule in enumerate(data):
+ # Create chunks for Sanskrit and English versions
+ if "sanskrit" in rule:
+ chunk = GrammarChunk(
+ id=f"{file_path.stem}_{i}_sa",
+ text=f"{rule.get('sutra', '')}: {rule['sanskrit']}\n{rule.get('explanation', '')}",
+ source=file_path.stem,
+ sutra_number=rule.get("sutra"),
+ topic=rule.get("topic"),
+ language="sanskrit",
+ )
+ self.chunks.append(chunk)
+
+ if "english" in rule:
+ chunk = GrammarChunk(
+ id=f"{file_path.stem}_{i}_en",
+ text=f"{rule.get('sutra', '')}: {rule['english']}\n{rule.get('explanation', '')}",
+ source=file_path.stem,
+ sutra_number=rule.get("sutra"),
+ topic=rule.get("topic"),
+ language="english",
+ )
+ self.chunks.append(chunk)
+
+ def _extract_sutra_number(self, text: str) -> Optional[str]:
+ """Extract sūtra number from text (e.g., '1.1.1', '3.2.123')"""
+ import re
+ match = re.search(r'\b(\d+\.\d+\.\d+)\b', text[:100])
+ return match.group(1) if match else None
+
+ def _infer_topic(self, text: str) -> Optional[str]:
+ """Infer grammatical topic from text content"""
+ text_lower = text.lower()
+ if any(word in text_lower for word in ["sandhi", "सन्धि"]):
+ return "sandhi"
+ elif any(word in text_lower for word in ["lakara", "लकार", "tense", "वृत्ति"]):
+ return "lakara"
+ elif any(word in text_lower for word in ["dhatu", "धातु", "verb", "root"]):
+ return "dhatu"
+ elif any(word in text_lower for word in ["vibhakti", "विभक्ति", "case"]):
+ return "vibhakti"
+ elif any(word in text_lower for word in ["samasa", "समास", "compound"]):
+ return "samasa"
+ return None
+
+ def build_index(self, force_rebuild: bool = False):
+ """Generate embeddings for all chunks and build search index
+
+ Args:
+ force_rebuild: If True, rebuild even if index exists
+ """
+ # Try to load existing index
+ if not force_rebuild and self.index_file.exists():
+ self._load_index()
+ print(f"Loaded existing index from {self.index_file}")
+ return
+
+ if not self.chunks:
+ print("No chunks to index. Run load_texts() first.")
+ return
+
+ print(f"Generating embeddings for {len(self.chunks)} chunks...")
+ texts = [chunk.text for chunk in self.chunks]
+
+ # Generate embeddings in batches (API rate limits)
+ batch_size = 100
+ all_embeddings = []
+
+ for i in range(0, len(texts), batch_size):
+ batch = texts[i:i + batch_size]
+ embeddings = self.llm.embed(batch)
+ all_embeddings.extend(embeddings)
+ print(f" Embedded {min(i + batch_size, len(texts))}/{len(texts)}")
+
+ # Store embeddings in chunks
+ for chunk, embedding in zip(self.chunks, all_embeddings):
+ chunk.embedding = embedding
+
+ self.chunk_embeddings = np.array(all_embeddings)
+
+ # Save index
+ self._save_index()
+ print(f"Index saved to {self.index_file}")
+
+ def _save_index(self):
+ """Save chunks and embeddings to disk"""
+ self.data_dir.mkdir(parents=True, exist_ok=True)
+
+ data = {
+ "chunks": [asdict(chunk) for chunk in self.chunks],
+ "version": "1.0"
+ }
+
+ with open(self.index_file, "w", encoding="utf-8") as f:
+ json.dump(data, f, ensure_ascii=False, indent=2)
+
+ def _load_index(self):
+ """Load chunks and embeddings from disk"""
+ with open(self.index_file, encoding="utf-8") as f:
+ data = json.load(f)
+
+ self.chunks = [GrammarChunk(**chunk) for chunk in data["chunks"]]
+ self.chunk_embeddings = np.array([chunk.embedding for chunk in self.chunks])
+
+ def query(
+ self,
+ query_text: str,
+ top_k: int = 5,
+ topic: Optional[str] = None,
+ language: Optional[str] = None,
+ ) -> List[Tuple[GrammarChunk, float]]:
+ """Retrieve most relevant grammar chunks for a query
+
+ Args:
+ query_text: Natural language query (e.g., "How to form past tense?")
+ top_k: Number of results to return
+ topic: Filter by topic ("sandhi", "lakara", etc.)
+ language: Filter by language ("sanskrit" or "english")
+
+ Returns:
+ List of (chunk, similarity_score) tuples, sorted by relevance
+ """
+ if self.chunk_embeddings is None:
+ raise ValueError("Index not built. Run build_index() first.")
+
+ # Generate query embedding
+ query_embedding = self.llm.embed_single(query_text)
+ query_vec = np.array(query_embedding)
+
+ # Compute cosine similarity
+ similarities = np.dot(self.chunk_embeddings, query_vec) / (
+ np.linalg.norm(self.chunk_embeddings, axis=1) * np.linalg.norm(query_vec)
+ )
+
+ # Filter by topic/language if specified
+ filtered_indices = []
+ for i, chunk in enumerate(self.chunks):
+ if topic and chunk.topic != topic:
+ continue
+ if language and chunk.language != language:
+ continue
+ filtered_indices.append(i)
+
+ # Get top-k
+ if filtered_indices:
+ filtered_sims = [(i, similarities[i]) for i in filtered_indices]
+ top_indices = sorted(filtered_sims, key=lambda x: x[1], reverse=True)[:top_k]
+ else:
+ top_indices = [(i, similarities[i]) for i in np.argsort(similarities)[::-1][:top_k]]
+
+ results = [(self.chunks[i], float(score)) for i, score in top_indices]
+ return results
+
+ def generate_code(
+ self,
+ task_description: str,
+ context_chunks: Optional[List[GrammarChunk]] = None,
+ language: str = "rust",
+ ) -> str:
+ """Generate code implementation based on grammar rules
+
+ Args:
+ task_description: What to implement (e.g., "sandhi rule for a + i")
+ context_chunks: Relevant grammar chunks (auto-retrieved if None)
+ language: Target programming language
+
+ Returns:
+ Generated code with comments
+ """
+ # Retrieve relevant chunks if not provided
+ if context_chunks is None:
+ results = self.query(task_description, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+
+ # Build context from chunks
+ context_text = "\n\n".join([
+ f"[{chunk.source} {chunk.sutra_number or ''}]\n{chunk.text}"
+ for chunk in context_chunks
+ ])
+
+ prompt = f"""You are a Sanskrit NLP expert. Based on these Pāṇinian grammar rules, generate {language} code to implement the requested functionality.
+
+Grammar References:
+{context_text}
+
+Task: {task_description}
+
+Generate clean, well-commented {language} code. Include:
+1. Function signature with types
+2. Implementation logic
+3. Comments explaining the grammar rule
+4. Example usage in comments
+
+{language.upper()} CODE:
+"""
+
+ messages = [{"role": "user", "content": prompt}]
+ return self.llm.complete(messages, temperature=0.3)
+
+ def explain_rule(
+ self,
+ sutra_number: Optional[str] = None,
+ query: Optional[str] = None,
+ ) -> str:
+ """Get natural language explanation of a grammar rule
+
+ Args:
+ sutra_number: Specific sūtra (e.g., "1.1.1")
+ query: Natural language query (if sutra_number not provided)
+
+ Returns:
+ Plain English explanation
+ """
+ if sutra_number:
+ # Find chunks with this sutra
+ matching_chunks = [c for c in self.chunks if c.sutra_number == sutra_number]
+ if not matching_chunks:
+ return f"Sūtra {sutra_number} not found in loaded texts."
+ context_chunks = matching_chunks[:3]
+ elif query:
+ results = self.query(query, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+ else:
+ raise ValueError("Provide either sutra_number or query")
+
+ context_text = "\n\n".join([chunk.text for chunk in context_chunks])
+
+ prompt = f"""Explain this Pāṇinian grammar rule in simple, clear English.
+
+Grammar Text:
+{context_text}
+
+Provide:
+1. What the rule says
+2. When it applies
+3. A simple example
+4. Common mistakes
+
+EXPLANATION:
+"""
+
+ messages = [{"role": "user", "content": prompt}]
+ return self.llm.complete(messages, temperature=0.5)
+RAG system for Sanskrit grammar treatises
+rag = GrammarRAG(data_dir="data/grammar") +rag.load_texts() +# Load grammar treatises +rag.build_index() +# Generate embeddings
+results = rag.query("How to form present tense verbs?", top_k=3)
+code = rag.generate_code("Implement sandhi rule for 'a + i → e'")
+Initialize RAG system
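+Args: data_dir — directory containing grammar text files; llm_client — LLM client for embeddings and generation; index_file — file to save/load embedded chunks.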
+def build_index(self, force_rebuild: bool = False)
+def build_index(self, force_rebuild: bool = False):
+ """Generate embeddings for all chunks and build search index
+
+ Args:
+ force_rebuild: If True, rebuild even if index exists
+ """
+ # Try to load existing index
+ if not force_rebuild and self.index_file.exists():
+ self._load_index()
+ print(f"Loaded existing index from {self.index_file}")
+ return
+
+ if not self.chunks:
+ print("No chunks to index. Run load_texts() first.")
+ return
+
+ print(f"Generating embeddings for {len(self.chunks)} chunks...")
+ texts = [chunk.text for chunk in self.chunks]
+
+ # Generate embeddings in batches (API rate limits)
+ batch_size = 100
+ all_embeddings = []
+
+ for i in range(0, len(texts), batch_size):
+ batch = texts[i:i + batch_size]
+ embeddings = self.llm.embed(batch)
+ all_embeddings.extend(embeddings)
+ print(f" Embedded {min(i + batch_size, len(texts))}/{len(texts)}")
+
+ # Store embeddings in chunks
+ for chunk, embedding in zip(self.chunks, all_embeddings):
+ chunk.embedding = embedding
+
+ self.chunk_embeddings = np.array(all_embeddings)
+
+ # Save index
+ self._save_index()
+ print(f"Index saved to {self.index_file}")
+Generate embeddings for all chunks and build search index
+Args: force_rebuild — if True, rebuild even if index exists.
+def explain_rule(self, sutra_number: str | None = None, query: str | None = None) ‑> str
+ def explain_rule(
+ self,
+ sutra_number: Optional[str] = None,
+ query: Optional[str] = None,
+ ) -> str:
+ """Get natural language explanation of a grammar rule
+
+ Args:
+ sutra_number: Specific sūtra (e.g., "1.1.1")
+ query: Natural language query (if sutra_number not provided)
+
+ Returns:
+ Plain English explanation
+ """
+ if sutra_number:
+ # Find chunks with this sutra
+ matching_chunks = [c for c in self.chunks if c.sutra_number == sutra_number]
+ if not matching_chunks:
+ return f"Sūtra {sutra_number} not found in loaded texts."
+ context_chunks = matching_chunks[:3]
+ elif query:
+ results = self.query(query, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+ else:
+ raise ValueError("Provide either sutra_number or query")
+
+ context_text = "\n\n".join([chunk.text for chunk in context_chunks])
+
+ prompt = f"""Explain this Pāṇinian grammar rule in simple, clear English.
+
+Grammar Text:
+{context_text}
+
+Provide:
+1. What the rule says
+2. When it applies
+3. A simple example
+4. Common mistakes
+
+EXPLANATION:
+"""
+
+ messages = [{"role": "user", "content": prompt}]
+ return self.llm.complete(messages, temperature=0.5)
+Get natural language explanation of a grammar rule
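+Args: sutra_number — specific sūtra (e.g., "1.1.1"); query — natural language query (if sutra_number not provided). Returns: Plain English explanation.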
+def generate_code(self,
task_description: str,
context_chunks: List[GrammarChunk] | None = None,
language: str = 'rust') ‑> str
+ def generate_code(
+ self,
+ task_description: str,
+ context_chunks: Optional[List[GrammarChunk]] = None,
+ language: str = "rust",
+ ) -> str:
+ """Generate code implementation based on grammar rules
+
+ Args:
+ task_description: What to implement (e.g., "sandhi rule for a + i")
+ context_chunks: Relevant grammar chunks (auto-retrieved if None)
+ language: Target programming language
+
+ Returns:
+ Generated code with comments
+ """
+ # Retrieve relevant chunks if not provided
+ if context_chunks is None:
+ results = self.query(task_description, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+
+ # Build context from chunks
+ context_text = "\n\n".join([
+ f"[{chunk.source} {chunk.sutra_number or ''}]\n{chunk.text}"
+ for chunk in context_chunks
+ ])
+
+ prompt = f"""You are a Sanskrit NLP expert. Based on these Pāṇinian grammar rules, generate {language} code to implement the requested functionality.
+
+Grammar References:
+{context_text}
+
+Task: {task_description}
+
+Generate clean, well-commented {language} code. Include:
+1. Function signature with types
+2. Implementation logic
+3. Comments explaining the grammar rule
+4. Example usage in comments
+
+{language.upper()} CODE:
+"""
+
+ messages = [{"role": "user", "content": prompt}]
+ return self.llm.complete(messages, temperature=0.3)
+Generate code implementation based on grammar rules
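+Args: task_description — what to implement (e.g., "sandhi rule for a + i"); context_chunks — relevant grammar chunks (auto-retrieved if None); language — target programming language. Returns: Generated code with comments.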
+def load_texts(self)
+def load_texts(self):
+ """Load grammar treatises from data directory
+
+ Expected structure:
+ data/grammar/
+ ashtadhyayi.txt # Sūtras in Sanskrit/SLP1
+ kashika.txt # Commentary in Sanskrit
+ kale_grammar.txt # English textbook
+ panini_intro.txt # Modern English explanations
+ custom_rules.json # Custom rule definitions
+ """
+ if not self.data_dir.exists():
+ print(f"Warning: Grammar data directory not found: {self.data_dir}")
+ print("Create it and add grammar texts to enable RAG functionality.")
+ return
+
+ # Load text files
+ for file_path in self.data_dir.glob("*.txt"):
+ self._load_text_file(file_path)
+
+ # Load structured JSON files
+ for file_path in self.data_dir.glob("*.json"):
+ self._load_json_file(file_path)
+
+ print(f"Loaded {len(self.chunks)} grammar chunks from {self.data_dir}")
+Load grammar treatises from data directory
+Expected structure: +data/grammar/ +ashtadhyayi.txt +# Sūtras in Sanskrit/SLP1 +kashika.txt +# Commentary in Sanskrit +kale_grammar.txt +# English textbook +panini_intro.txt +# Modern English explanations +custom_rules.json +# Custom rule definitions
+def query(self,
query_text: str,
top_k: int = 5,
topic: str | None = None,
language: str | None = None) ‑> List[Tuple[GrammarChunk, float]]
+def query(
+ self,
+ query_text: str,
+ top_k: int = 5,
+ topic: Optional[str] = None,
+ language: Optional[str] = None,
+) -> List[Tuple[GrammarChunk, float]]:
+ """Retrieve most relevant grammar chunks for a query
+
+ Args:
+ query_text: Natural language query (e.g., "How to form past tense?")
+ top_k: Number of results to return
+ topic: Filter by topic ("sandhi", "lakara", etc.)
+ language: Filter by language ("sanskrit" or "english")
+
+ Returns:
+ List of (chunk, similarity_score) tuples, sorted by relevance
+ """
+ if self.chunk_embeddings is None:
+ raise ValueError("Index not built. Run build_index() first.")
+
+ # Generate query embedding
+ query_embedding = self.llm.embed_single(query_text)
+ query_vec = np.array(query_embedding)
+
+ # Compute cosine similarity
+ similarities = np.dot(self.chunk_embeddings, query_vec) / (
+ np.linalg.norm(self.chunk_embeddings, axis=1) * np.linalg.norm(query_vec)
+ )
+
+ # Filter by topic/language if specified
+ filtered_indices = []
+ for i, chunk in enumerate(self.chunks):
+ if topic and chunk.topic != topic:
+ continue
+ if language and chunk.language != language:
+ continue
+ filtered_indices.append(i)
+
+ # Get top-k
+ if filtered_indices:
+ filtered_sims = [(i, similarities[i]) for i in filtered_indices]
+ top_indices = sorted(filtered_sims, key=lambda x: x[1], reverse=True)[:top_k]
+ else:
+ top_indices = [(i, similarities[i]) for i in np.argsort(similarities)[::-1][:top_k]]
+
+ results = [(self.chunks[i], float(score)) for i, score in top_indices]
+ return results
+Retrieve most relevant grammar chunks for a query
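+Args: query_text — natural language query (e.g., "How to form past tense?"); top_k — number of results to return; topic — filter by topic ("sandhi", "lakara", etc.); language — filter by language ("sanskrit" or "english"). Returns: List of (chunk, similarity_score) tuples, sorted by relevance.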
vedyut.llm.tasksSanskrit-specific LLM tasks using RAG
+
+def disambiguate_segmentation(text: str,
candidates: List[List[str]],
llm: LLMClient | None = None,
rag: GrammarRAG | None = None) ‑> int
+def disambiguate_segmentation(
+ text: str,
+ candidates: List[List[str]],
+ llm: Optional[LLMClient] = None,
+ rag: Optional[GrammarRAG] = None,
+) -> int:
+ """Use LLM to choose best segmentation from candidates
+
+ Args:
+ text: Original Sanskrit text
+ candidates: List of possible segmentations (each a list of words)
+ llm: LLM client (created if None)
+ rag: Grammar RAG (optional, for rule-based context)
+
+ Returns:
+ Index of best candidate (0-indexed)
+
+ Example:
+ >>> text = "धर्मक्षेत्रे"
+ >>> candidates = [
+ ... ["धर्म", "क्षेत्रे"],
+ ... ["धर्मक्षेत्रे"],
+ ... ]
+ >>> best_idx = disambiguate_segmentation(text, candidates)
+ >>> print(candidates[best_idx])
+ """
+ if llm is None:
+ llm = LLMClient()
+
+ # Build context from sandhi rules if RAG available
+ context = ""
+ if rag:
+ results = rag.query(f"sandhi rules for: {text}", top_k=2, topic="sandhi")
+ if results:
+ context = "\n\nRelevant sandhi rules:\n" + "\n".join([
+ f"- {chunk.text[:200]}..." for chunk, _ in results
+ ])
+
+ candidates_text = "\n".join([
+ f"{i+1}. {' + '.join(seg)}" for i, seg in enumerate(candidates)
+ ])
+
+ prompt = f"""You are a Sanskrit grammar expert. Given a Sanskrit text and multiple possible segmentations, choose the most grammatically correct and semantically meaningful one.
+
+Text: {text}
+
+Possible segmentations:
+{candidates_text}
+{context}
+
+Respond with ONLY the number (1-{len(candidates)}) of the best segmentation.
+Number: """
+
+ response = llm.complete(
+ [{"role": "user", "content": prompt}],
+ temperature=0.3,
+ max_tokens=10
+ )
+
+ try:
+ number = int(response.strip().split()[0])
+ return max(0, min(number - 1, len(candidates) - 1))
+ except (ValueError, IndexError):
+ return 0 # Default to first candidate
+Use LLM to choose best segmentation from candidates
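+Args: text — original Sanskrit text; candidates — list of possible segmentations (each a list of words); llm — LLM client (created if None); rag — Grammar RAG (optional, for rule-based context). Returns: Index of best candidate (0-indexed).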
+>>> text = "धर्मक्षेत्रे"
+>>> candidates = [
+... ["धर्म", "क्षेत्रे"],
+... ["धर्मक्षेत्रे"],
+... ]
+>>> best_idx = disambiguate_segmentation(text, candidates)
+>>> print(candidates[best_idx])
+
+def explain_grammar(word: str,
analysis: Dict | None = None,
llm: LLMClient | None = None,
rag: GrammarRAG | None = None) ‑> str
+def explain_grammar(
+ word: str,
+ analysis: Optional[Dict] = None,
+ llm: Optional[LLMClient] = None,
+ rag: Optional[GrammarRAG] = None,
+) -> str:
+ """Generate natural language explanation of grammatical analysis
+
+ Args:
+ word: Sanskrit word
+ analysis: Grammatical analysis dict (lemma, case, number, etc.)
+ llm: LLM client
+ rag: Grammar RAG for rule references
+
+ Returns:
+ Beginner-friendly explanation
+ """
+ if llm is None:
+ llm = LLMClient()
+
+ analysis_text = ""
+ if analysis:
+ analysis_text = "\n".join([f"- {k}: {v}" for k, v in analysis.items()])
+
+ # Get relevant grammar rules if RAG available
+ context = ""
+ if rag and analysis:
+ query = f"grammar for {word} "
+ if "case" in analysis:
+ query += f"case {analysis['case']}"
+ if "tense" in analysis:
+ query += f"tense {analysis['tense']}"
+
+ results = rag.query(query, top_k=2)
+ if results:
+ context = "\n\nGrammar rules:\n" + "\n".join([
+ f"[{chunk.source}] {chunk.text[:150]}..." for chunk, _ in results
+ ])
+
+ prompt = f"""Explain the grammar of this Sanskrit word in simple, beginner-friendly terms:
+
+Word: {word}
+
+Grammatical analysis:
+{analysis_text}
+{context}
+
+Provide a clear explanation suitable for someone learning Sanskrit. Include:
+1. What the word means
+2. Its grammatical function (case, number, gender, tense, etc.)
+3. Why it has this form
+4. A simple example sentence
+
+EXPLANATION:
+"""
+
+ return llm.complete([{"role": "user", "content": prompt}], temperature=0.6)
+Generate natural language explanation of grammatical analysis
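+Args: word — Sanskrit word; analysis — grammatical analysis dict (lemma, case, number, etc.); llm — LLM client; rag — Grammar RAG for rule references. Returns: Beginner-friendly explanation.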
+def generate_test_cases(function_description: str,
rag: GrammarRAG | None = None,
llm: LLMClient | None = None,
num_cases: int = 10) ‑> List[Dict[str, str]]
+def generate_test_cases(
+ function_description: str,
+ rag: Optional[GrammarRAG] = None,
+ llm: Optional[LLMClient] = None,
+ num_cases: int = 10,
+) -> List[Dict[str, str]]:
+ """Generate test cases for a Sanskrit NLP function
+
+ Args:
+ function_description: What the function does
+ rag: Grammar RAG for rule-based examples
+ llm: LLM client
+ num_cases: Number of test cases to generate
+
+ Returns:
+ List of {"input": "...", "expected": "...", "description": "..."} dicts
+ """
+ if llm is None:
+ llm = LLMClient()
+
+ # Get grammar context if available
+ context = ""
+ if rag:
+ results = rag.query(function_description, top_k=2)
+ if results:
+ context = "\n\nGrammar references:\n" + "\n".join([
+ f"{chunk.text[:200]}..." for chunk, _ in results
+ ])
+
+ prompt = f"""Generate {num_cases} diverse test cases for this Sanskrit NLP function:
+
+Function: {function_description}
+{context}
+
+For each test case, provide:
+1. Input (Sanskrit text or word)
+2. Expected output
+3. Brief description of what it tests
+
+Return as JSON array:
+[
+ {{
+ "input": "...",
+ "expected": "...",
+ "description": "..."
+ }},
+ ...
+]
+
+JSON:
+"""
+
+ try:
+ result = llm.complete_with_json([{"role": "user", "content": prompt}])
+ if isinstance(result, dict) and "test_cases" in result:
+ return result["test_cases"]
+ elif isinstance(result, list):
+ return result
+ else:
+ return []
+ except Exception as e:
+ print(f"Error generating test cases: {e}")
+ return []
+Generate test cases for a Sanskrit NLP function
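+Args: function_description — what the function does; rag — Grammar RAG for rule-based examples; llm — LLM client; num_cases — number of test cases to generate. Returns: List of {"input": "…", "expected": "…", "description": "…"} dicts.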
+def suggest_implementation(rule_description: str,
rag: GrammarRAG,
language: str = 'rust',
include_tests: bool = True) ‑> str
+def suggest_implementation(
+ rule_description: str,
+ rag: GrammarRAG,
+ language: str = "rust",
+ include_tests: bool = True,
+) -> str:
+ """Generate code implementation suggestion from grammar rule
+
+ ⚠️ WARNING: LLM-generated code requires human review!
+ Use this as a starting point, not production code.
+
+ Args:
+ rule_description: Description of what to implement
+ rag: Grammar RAG (required for rule lookup)
+ language: Target programming language
+ include_tests: Generate test cases
+
+ Returns:
+ Generated code with comments
+ """
+ # Retrieve relevant grammar chunks
+ results = rag.query(rule_description, top_k=3)
+ context_chunks = [chunk for chunk, _ in results]
+
+ if not context_chunks:
+ return f"# No relevant grammar rules found for: {rule_description}"
+
+ context_text = "\n\n".join([
+ f"[{chunk.source} {chunk.sutra_number or ''}]\n{chunk.text}"
+ for chunk in context_chunks
+ ])
+
+ test_instruction = ""
+ if include_tests:
+ test_instruction = "\n4. Test cases with examples"
+
+ prompt = f"""You are a Sanskrit NLP expert implementing Pāṇinian grammar rules in code.
+
+Grammar References:
+{context_text}
+
+Task: {rule_description}
+
+Generate clean, production-ready {language} code with:
+1. Clear function signature with type annotations
+2. Implementation following the grammar rules above
+3. Detailed comments explaining each step and referencing sūtras{test_instruction}
+
+⚠️ IMPORTANT:
+- Be precise with grammar rules
+- Handle edge cases
+- Note any ambiguities or limitations
+
+{language.upper()} CODE:
+"""
+
+ llm = rag.llm
+ return llm.complete(
+ [{"role": "user", "content": prompt}],
+ temperature=0.3,
+ max_tokens=2000
+ )
+Generate code implementation suggestion from grammar rule
+⚠️ WARNING: LLM-generated code requires human review! +Use this as a starting point, not production code.
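+Args: rule_description — description of what to implement; rag — Grammar RAG (required for rule lookup); language — target programming language; include_tests — generate test cases. Returns: Generated code with comments.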
+def translate_sanskrit(text: str,
target_lang: str = 'english',
llm: LLMClient | None = None,
with_explanation: bool = False) ‑> str
+def translate_sanskrit(
+ text: str,
+ target_lang: str = "english",
+ llm: Optional[LLMClient] = None,
+ with_explanation: bool = False,
+) -> str:
+ """Translate Sanskrit text to target language
+
+ Args:
+ text: Sanskrit text (Devanagari or transliterated)
+ target_lang: Target language (default: "english")
+ llm: LLM client
+ with_explanation: Include word-by-word breakdown
+
+ Returns:
+ Translation (and optional explanation)
+ """
+ if llm is None:
+ llm = LLMClient()
+
+ if with_explanation:
+ prompt = f"""Translate this Sanskrit text to {target_lang} with word-by-word explanation:
+
+Sanskrit: {text}
+
+Provide:
+1. Word-by-word breakdown with grammatical analysis
+2. Smooth {target_lang} translation
+
+FORMAT:
+Word-by-word:
+- word1 (grammatical info): meaning
+- word2 (grammatical info): meaning
+
+Translation: [full translation]
+"""
+ else:
+ prompt = f"Translate this Sanskrit text to {target_lang}: {text}"
+
+ return llm.complete([{"role": "user", "content": prompt}], temperature=0.5)
+Translate Sanskrit text to target language
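+Args: text — Sanskrit text (Devanagari or transliterated); target_lang — target language (default: "english"); llm — LLM client; with_explanation — include word-by-word breakdown. Returns: Translation (and optional explanation).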
+def validate_rule_implementation(code: str,
rule_description: str,
rag: GrammarRAG,
language: str = 'rust') ‑> Dict[str, ]
+def validate_rule_implementation(
+ code: str,
+ rule_description: str,
+ rag: GrammarRAG,
+ language: str = "rust",
+) -> Dict[str, any]:
+ """Validate that code correctly implements a grammar rule
+
+ ⚠️ WARNING: This is a heuristic check, not formal verification!
+ Always test with actual Sanskrit data.
+
+ Args:
+ code: Code to validate
+ rule_description: What it should implement
+ rag: Grammar RAG for rule lookup
+ language: Programming language
+
+ Returns:
+ {
+ "is_valid": bool,
+ "confidence": float (0-1),
+ "issues": List[str],
+ "suggestions": List[str]
+ }
+ """
+ # Retrieve grammar rules
+ results = rag.query(rule_description, top_k=2)
+ context_text = "\n\n".join([chunk.text for chunk, _ in results])
+
+ prompt = f"""Review this {language} code implementing a Pāṇinian grammar rule.
+
+Grammar Rule:
+{context_text}
+
+Implementation:
+```{language}
+{code}
+```
+
+Task: {rule_description}
+
+Analyze if the code correctly implements the grammar rule. Return JSON:
+{{
+ "is_valid": true/false,
+ "confidence": 0.0-1.0,
+ "issues": ["issue 1", "issue 2", ...],
+ "suggestions": ["suggestion 1", "suggestion 2", ...]
+}}
+
+JSON:
+"""
+
+ try:
+ return rag.llm.complete_with_json([{"role": "user", "content": prompt}])
+ except Exception as e:
+ return {
+ "is_valid": False,
+ "confidence": 0.0,
+ "issues": [f"Validation failed: {e}"],
+ "suggestions": []
+ }
+Validate that code correctly implements a grammar rule
+⚠️ WARNING: This is a heuristic check, not formal verification! +Always test with actual Sanskrit data.
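+Args: code — code to validate; rule_description — what it should implement; rag — Grammar RAG for rule lookup; language — programming language. Returns: {"is_valid": bool, "confidence": float (0-1), "issues": List[str], "suggestions": List[str]}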