From efa313addcd2864e9c045262f2e2f080497e3b62 Mon Sep 17 00:00:00 2001 From: "Md. Kishor Morol" Date: Sat, 6 Jun 2026 19:04:59 -0400 Subject: [PATCH 1/2] feat: cap site to 1000/section + split HF dataset by source Website: - Cap each browse section (arXiv / conference / journal) to 1,000 papers (3,000 total). _queryPapers now clamps the reported count to SECTION_CAP and trims rows past the cap, so pagination stops at 1,000 per section. The full corpus stays available via the API and the HF dataset; the static JSON was already capped at 1,000. HF dataset: - Split the `papers` config by source into `arxiv`, `conference`, and `journal` splits (uploaded as papers_{name}.jsonl), alongside the existing combined `train` split (kept for backward compatibility). Users can now `load_dataset(repo, "papers", split="journal")`. - Card updated: per-source counts in stats, file table, and usage examples. Co-Authored-By: Claude Opus 4.8 --- site/assets/js/railway-api.js | 25 +++++++++++---- src/storage/hf_dataset.py | 58 +++++++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 12 deletions(-) diff --git a/site/assets/js/railway-api.js b/site/assets/js/railway-api.js index 9a70a68..7bb7a6e 100644 --- a/site/assets/js/railway-api.js +++ b/site/assets/js/railway-api.js @@ -11,6 +11,11 @@ const RS_API = 'https://researchscope-production.up.railway.app'; +// Public site shows at most this many papers per section (arXiv / conference / +// journal) — 3 000 total. The full corpus stays available via the API and the +// Hugging Face dataset; this just bounds what the browse pages paginate through. +const SECTION_CAP = 1000; + // ── Auth state ──────────────────────────────────────────────────────────────── const _auth = { @@ -67,25 +72,33 @@ async function _queryPapers({ else if (source === 'conference') params.set('source_type', 'conference'); else if (source === 'journal') params.set('source_type', 'journal'); - // 1. Try Railway + const start = (page - 1) * pageSize; + + // 1. 
Try Railway — clamp the reported count and trim rows past the cap so the + // browse page paginates through at most SECTION_CAP papers for this section. try { const json = await _apiFetch(`/papers?${params}`); - if (json && Array.isArray(json.results)) - return { data: json.results, count: json.total ?? 0, error: null }; + if (json && Array.isArray(json.results)) { + const count = Math.min(json.total ?? 0, SECTION_CAP); + let data = json.results; + if (start >= SECTION_CAP) data = []; + else if (start + data.length > SECTION_CAP) data = data.slice(0, SECTION_CAP - start); + return { data, count, error: null }; + } } catch (e) { console.warn('[railway] queryPapers failed, falling back to static JSON:', e.message); } - // 2. Last resort — static JSON + // 2. Last resort — static JSON (already capped at 1 000 by the generator) try { const res = await fetch('data/papers.json'); const all = await res.json(); - const start = (page - 1) * pageSize; const filtered = search ? all.filter(p => (p.title||'').toLowerCase().includes(search.toLowerCase()) || (p.abstract||'').toLowerCase().includes(search.toLowerCase())) : all; - return { data: filtered.slice(start, start + pageSize), count: filtered.length, error: null }; + const count = Math.min(filtered.length, SECTION_CAP); + return { data: filtered.slice(start, Math.min(start + pageSize, SECTION_CAP)), count, error: null }; } catch (e) { console.warn('[static] papers.json failed:', e.message); } diff --git a/src/storage/hf_dataset.py b/src/storage/hf_dataset.py index fa7cbf3..ddc8c73 100644 --- a/src/storage/hf_dataset.py +++ b/src/storage/hf_dataset.py @@ -91,6 +91,17 @@ def _to_raw(paper: dict) -> dict: return out +def _bucket(row: dict) -> str: + """Classify a paper into arxiv / conference / journal for per-source splits.""" + st = str(row.get("source_type") or "").lower() + if st == "journal": + return "journal" + if st in ("conference", "workshop"): + return "conference" + # preprint / unknown → treat as arXiv (source is 
arxiv/openalex preprints) + return "arxiv" + + def _to_instruct_rows(paper: dict) -> list[dict]: rows = [] inp = _input_text(paper) @@ -191,7 +202,7 @@ def push(papers: list[dict] | None = None) -> bool: today = datetime.now(timezone.utc).date() - # ── Raw split ───────────────────────────────────────────────────────────── + # ── Raw split — combined `train` + per-source arxiv/conference/journal ────── raw_rows = [_to_raw(p) for p in papers if p.get("title")] log.info("[hf] pushing %d raw paper records …", len(raw_rows)) _upload_with_retry( @@ -201,6 +212,21 @@ def push(papers: list[dict] | None = None) -> bool: commit_message=f"update papers.jsonl ({len(raw_rows):,} papers) [{today}]", ) + # Per-source splits so users can load just arXiv, conference, or journal + # papers — `load_dataset(repo, "papers", split="journal")`. The combined + # `train` split above stays for backward compatibility. + buckets: dict[str, list[dict]] = {"arxiv": [], "conference": [], "journal": []} + for row in raw_rows: + buckets[_bucket(row)].append(row) + for name, rows in buckets.items(): + log.info("[hf] %s split → %d papers", name, len(rows)) + _upload_with_retry( + api, + path_or_fileobj=_jsonl_bytes(rows), + path_in_repo=f"data/papers_{name}.jsonl", + commit_message=f"update papers_{name}.jsonl ({len(rows):,} papers) [{today}]", + ) + # ── Instruction split ───────────────────────────────────────────────────── instruct_rows = [] for p in papers: @@ -214,7 +240,8 @@ def push(papers: list[dict] | None = None) -> bool: ) # ── Dataset card ────────────────────────────────────────────────────────── - _push_card(api, len(raw_rows), len(instruct_rows)) + _push_card(api, len(raw_rows), len(instruct_rows), + {k: len(v) for k, v in buckets.items()}) log.info("[hf] push complete → https://huggingface.co/datasets/%s", _REPO_ID) return True @@ -299,7 +326,12 @@ def _ensure_sections_config(api: Any) -> None: log.info("[hf] registered sections config in dataset card.") -def _push_card(api: Any, 
n_papers: int, n_instruct: int) -> None: +def _push_card(api: Any, n_papers: int, n_instruct: int, + by_source: dict[str, int] | None = None) -> None: + by_source = by_source or {} + n_arxiv = by_source.get("arxiv", 0) + n_conf = by_source.get("conference", 0) + n_journal = by_source.get("journal", 0) card = f"""--- license: cc-by-4.0 language: @@ -323,6 +355,12 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None: data_files: - split: train path: data/papers.jsonl + - split: arxiv + path: data/papers_arxiv.jsonl + - split: conference + path: data/papers_conference.jsonl + - split: journal + path: data/papers_journal.jsonl - config_name: instruct data_files: - split: train @@ -341,7 +379,7 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None: ## Stats -- **{n_papers:,}** papers (raw metadata) +- **{n_papers:,}** papers (raw metadata) — **{n_arxiv:,}** arXiv · **{n_conf:,}** conference · **{n_journal:,}** journal - **{n_instruct:,}** instruction-tuning rows - Sources: arXiv, OpenAlex, ACL Anthology, OpenReview, PMLR, CVF, Semantic Scholar - Venues: NeurIPS, ICML, ICLR, ACL, EMNLP, CVPR, AAAI, IJCAI, JMLR, TMLR, TACL, TPAMI, NMI and more @@ -350,7 +388,10 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None: | File | Description | |------|-------------| -| `data/papers.jsonl` | Raw paper metadata — title, abstract, authors, venue, year, tags, scores | +| `data/papers.jsonl` | Raw paper metadata — title, abstract, authors, venue, year, tags, scores (all sources combined) | +| `data/papers_arxiv.jsonl` | arXiv / preprint papers only | +| `data/papers_conference.jsonl` | Conference papers only (NeurIPS, ICML, ICLR, ACL, CVPR, …) | +| `data/papers_journal.jsonl` | Journal papers only (JMLR, TPAMI, NMI, TACL, …) | | `data/instruct.jsonl` | Instruction-tuning pairs — summarize, key contribution, why it matters, plain English | | `data/sections.jsonl` | Per-section fine-tuning rows for A* papers — real body text of `abstract`, 
`introduction`, `related_work`, `method`, `experiments`, `results`, `conclusion`. Filter by the `section` field to train a per-section writing agent. | @@ -359,9 +400,14 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None: ```python from datasets import load_dataset -# Raw papers +# All papers (combined) papers = load_dataset("kishormorol/researchscope-papers", "papers", split="train") +# Just one source — arXiv, conference, or journal papers +arxiv = load_dataset("kishormorol/researchscope-papers", "papers", split="arxiv") +conference = load_dataset("kishormorol/researchscope-papers", "papers", split="conference") +journal = load_dataset("kishormorol/researchscope-papers", "papers", split="journal") + # Instruction tuning instruct = load_dataset("kishormorol/researchscope-papers", "instruct", split="train") From 59d674b127f416da975a6ecae216c65fc25d4db3 Mon Sep 17 00:00:00 2001 From: "Md. Kishor Morol" Date: Sat, 6 Jun 2026 19:08:57 -0400 Subject: [PATCH 2/2] =?UTF-8?q?fix(site):=20correct=20stale=2083K=20paper?= =?UTF-8?q?=20count=20=E2=86=92=20live/100K+?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The homepage hero claimed "83,000+ papers" — long stale (the corpus is now 100K+; the Railway API holds ~165K). Fixes: - Hero count is now dynamic: loadStats() injects the live total from stats.json, rounded down to a clean "N,000+", so it never goes stale again (static fallback "100,000+" before JS loads). - Replace remaining hardcoded "83K+"/"83,000+" copy with "100,000+" across README, the index feature card, and the sign-in/register pages. 
Co-Authored-By: Claude Opus 4.8 --- README.md | 10 +++++----- site/assets/js/app.js | 6 ++++++ site/index.html | 4 ++-- site/register.html | 2 +- site/signin.html | 2 +- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index b961e29..b580800 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # ResearchScope -**CS Research Intelligence Platform — 83,000+ papers, scored, ranked, and searchable.** +**CS Research Intelligence Platform — 100,000+ papers, scored, ranked, and searchable.** Stop skimming paper lists. ResearchScope scores papers by impact, surfaces research gaps, recommends venues, and tracks who's driving the frontier — updated daily. @@ -40,7 +40,7 @@ The frontend is a static site on GitHub Pages backed by a **FastAPI REST API** o |---|---| | **Jun 2026** | **OpenReview Acceptance Tiers** — oral/spotlight/poster signals captured for ICLR, NeurIPS, ICML & COLM; oral/spotlight boost paper scores and show as badges. Coverage extended through ICLR 2026, NeurIPS 2025, ICML 2025 | | **Jun 2026** | **Journal Recommender** — paste title + abstract to match against 20 Q1 journals (JMLR, TPAMI, Nature MI, CSUR…) with impact factor, review timeline, and open access info | -| **Jun 2026** | **FastAPI Backend on Railway** — full REST API with JWT auth, favourites, PostgreSQL full-text search (83K+ papers). User accounts synced across devices | +| **Jun 2026** | **FastAPI Backend on Railway** — full REST API with JWT auth, favourites, PostgreSQL full-text search (100K+ papers). 
User accounts synced across devices | | **Jun 2026** | **OpenAlex Integration** — 250M+ work catalogue added as a data source, covering ML/NLP/CV/IR concept groups | | **Jun 2026** | **HuggingFace Training Dataset** — `kishormorol/researchscope-papers` auto-pushed after every pipeline run: raw metadata JSONL + instruction-tuning pairs | | **Jun 2026** | **20 Q1 Journals** — JMLR, TMLR, TACL, TPAMI, IJCV, AIJ, TNNLS, Nature MI, CSUR, TIP, MLJ, TKDE, DAMI, NN, PR, CL, IPM, JACM, NatComms, TOIS | @@ -54,11 +54,11 @@ The frontend is a static site on GitHub Pages backed by a **FastAPI REST API** o | Feature | Description | |---|---| -| 📄 **83K+ papers** | Scored by recency, venue rank, acceptance tier (oral/spotlight), novelty, author prestige, and citation quality | +| 📄 **100K+ papers** | Scored by recency, venue rank, acceptance tier (oral/spotlight), novelty, author prestige, and citation quality | | 🎓 **A* Conference coverage** | NeurIPS, ICML, ICLR, CVPR, ACL, EMNLP, AAAI, IJCAI, CHI, SIGIR, WWW, KDD and more | | 📖 **20 Q1 Journals** | JMLR, TMLR, TACL, TPAMI, Nature MI, and 15 more — with IF, review time, OA status | | 🎯 **Venue Recommenders** | Conference + Journal recommenders: paste abstract → ranked matches with expectations | -| 🔍 **Full-text search** | PostgreSQL `tsvector` search across 83K papers via Railway API | +| 🔍 **Full-text search** | PostgreSQL `tsvector` search across 100K+ papers via Railway API | | 👤 **User accounts** | JWT auth, favourites synced across devices via Railway backend | | 🕳 **Research gaps** | 3-layer extraction: explicit, pattern-detected, and starter ideas | | 👩‍🔬 **Author intelligence** | 5,000+ researchers ranked by momentum score | @@ -134,7 +134,7 @@ The paper dataset is published on HuggingFace and auto-updated after every pipel ```python from datasets import load_dataset -# 83K+ raw paper records (pretraining / RAG) +# 100K+ raw paper records (pretraining / RAG) papers = load_dataset("kishormorol/researchscope-papers", 
data_files="data/papers.jsonl", split="train") diff --git a/site/assets/js/app.js b/site/assets/js/app.js index 925c6d2..0380847 100644 --- a/site/assets/js/app.js +++ b/site/assets/js/app.js @@ -204,6 +204,12 @@ async function loadStats() { const el = document.getElementById(id); if (el) el.textContent = (val ?? 0).toLocaleString(); } + // Hero tagline count — rounded down to a clean "N,000+" so it never goes stale. + const heroEl = document.getElementById('hero-paper-count'); + if (heroEl && stats.total_papers) { + const rounded = Math.floor(stats.total_papers / 1000) * 1000; + heroEl.textContent = rounded.toLocaleString() + '+'; + } const genEl = document.getElementById('stat-generated'); if (genEl && stats.generated_at) { genEl.textContent = 'Updated ' + new Date(stats.generated_at).toLocaleDateString('en-US', { month:'short', day:'numeric', year:'numeric' }); diff --git a/site/index.html b/site/index.html index 94950cc..895d9f2 100644 --- a/site/index.html +++ b/site/index.html @@ -137,7 +137,7 @@

- 83,000+ papers scored by impact · Conferences & journals ranked · + 100,000+ papers scored by impact · Conferences & journals ranked · Research gaps surfaced · Find where to submit your next paper

@@ -208,7 +208,7 @@

📄

Paper Browser

-

83K+ papers scored by impact, novelty, and venue rank. Filter by topic, year, difficulty, or source.

+

Papers scored by impact, novelty, and venue rank. Filter by topic, year, difficulty, or source.

diff --git a/site/register.html b/site/register.html index 47e7361..15e927f 100644 --- a/site/register.html +++ b/site/register.html @@ -123,7 +123,7 @@

⭐Save & sync favourite papers
  • - 🔍Full-text search across 83K+ papers + 🔍Full-text search across 100K+ papers
  • 🎓Conference & journal recommender diff --git a/site/signin.html b/site/signin.html index 63b0c42..e62c25f 100644 --- a/site/signin.html +++ b/site/signin.html @@ -122,7 +122,7 @@

    ⭐Save & sync favourite papers

  • - 🔍Full-text search across 83K+ papers + 🔍Full-text search across 100K+ papers
  • 🎓Conference & journal recommender